文章目录
- 饼图
- 垂直条形图
- 水平条形图
- 堆叠条形图
- 水平交错条形图
- pandas模块之垂直或水平条形图
- pandas模块之水平交错条形图
- seaborn模块之垂直或水平条形图
- seaborn模块之水平交错条形图
- matplotlib模块之直方图
- pandas模块之直方图和核密度图
- seaborn模块之分组的直方图和核密度图
- 单个箱线图
- 分组箱线图
- 小提琴图
- 单条折线图
- 多条折线图
- pandas模块之单组散点图
- seaborn模块之分组散点图
- 气泡图
- 热力图
- 词云分词
饼图
import matplotlib.pyplot as plt
# Education-level shares of defaulting users and their display labels.
edu = [0.2515, 0.3724, 0.3336, 0.0368, 0.0057]
labels = ['中专', '大专', '本科', '硕士', '其他']

# Select a font that can render the Chinese labels, and keep the minus
# sign displayable with that font.
plt.rcParams.update({
    'font.sans-serif': ['SimHei'],
    'axes.unicode_minus': False,
})

# Minimal pie: slice labels plus percentage text with one decimal place.
plt.pie(edu, labels=labels, autopct='%.1f%%')
plt.title('失信用户的教育水平分布')
plt.show()
# Styled version of the education pie chart (reuses `edu` and `labels`).
explode = [0, 0.1, 0, 0, 0]  # pull out only the second slice
colors = ['#9999ff', '#ff9999', '#7777aa', '#2442aa', '#dd5555']

# Font configuration for Chinese text and the minus sign.
plt.rcParams.update({
    'font.sans-serif': ['SimHei'],
    'axes.unicode_minus': False,
})

# Equal aspect ratio keeps the pie circular.
plt.axes(aspect='equal')

pie_options = dict(
    explode=explode,
    labels=labels,
    colors=colors,
    autopct='%.1f%%',       # percentage text, one decimal place
    pctdistance=0.8,        # percentage text position, relative to radius
    labeldistance=1.1,      # label position, relative to radius
    startangle=180,         # angle at which the first slice starts
    radius=1.2,
    counterclock=False,     # lay slices out clockwise
    wedgeprops={'linewidth': 1.5, 'edgecolor': 'green'},
    textprops={'fontsize': 10, 'color': 'black'},
)
plt.pie(edu, **pie_options)
plt.title('失信用户的受教育水平分布')
plt.show()
import pandas as pd
# The same education pie chart, drawn through the pandas plotting accessor.
data1 = pd.Series(
    data=[0.2515, 0.3724, 0.3336, 0.0368, 0.0057],
    index=['中专', '大专', '本科', '硕士', '其他'],
    name='',  # empty name — presumably so no series name shows up as an axis label
)

# Equal aspect ratio keeps the pie circular.
plt.axes(aspect='equal')
data1.plot.pie(
    autopct='%.1f%%',
    radius=1,
    startangle=180,
    counterclock=False,
    title='失信用户的受教育水平分布',
    wedgeprops={'linewidth': 1.5, 'edgecolor': 'green'},
    textprops={'fontsize': 10, 'color': 'black'},
)
plt.show()



垂直条形图
# Vertical bar chart of GDP per province, one bar per spreadsheet row.
GDP = pd.read_excel(
    r'E:\PyCharmProject\venv\StataAnalysis_Files\从零开始学Python--数据分析与挖掘\第6章 Python数据可视化\Province GDP 2017.xlsx')
plt.style.use('ggplot')
plt.bar(x=range(GDP.shape[0]),
        height=GDP.GDP,
        tick_label=GDP.Province,
        color='steelblue',
        )
plt.ylabel('GDP(万亿)')
plt.title('2017年度6个省份GDP分布')
# Annotate each bar with its value (one decimal), slightly above the bar top.
for x, y in enumerate(GDP.GDP):
    plt.text(x, y + 0.1, '%s' % round(y, 1), ha='center')
plt.show()

水平条形图
# Horizontal bar chart of GDP per province.
# Sort in place so the bars are ordered by value; later sections reuse the
# sorted `GDP` frame.
GDP.sort_values(by='GDP', inplace=True)
plt.barh(y=range(GDP.shape[0]),
         width=GDP.GDP,
         tick_label=GDP.Province,
         color='steelblue',
         )
plt.xlabel('GDP(万亿)')
plt.title('2017年度6个省份GDP分布')
# Annotate each bar with its value (one decimal), just right of the bar end.
for y, x in enumerate(GDP.GDP):
    plt.text(x + 0.1, y, '%s' % round(x, 1), va='center')
plt.show()

堆叠条形图
# Stacked bar chart: quarterly output of the three industry sectors.
Industry_GDP = pd.read_excel(
    r'E:\PyCharmProject\venv\StataAnalysis_Files\从零开始学Python--数据分析与挖掘\第6章 Python数据可视化\Industry_GDP.xlsx')
Quarters = Industry_GDP.Quarter.unique()

# NOTE(review): the value column is accessed as `GPD`, not `GDP` — confirm
# this matches the spreadsheet header before "correcting" it.
# Each sector's series is re-indexed to 0..n-1 so that adding the series for
# the `bottom` offsets aligns by quarter position rather than by original row.
Industry1 = Industry_GDP.GPD[Industry_GDP.Industry_Type == '第一产业']
Industry1.index = range(len(Quarters))
Industry2 = Industry_GDP.GPD[Industry_GDP.Industry_Type == '第二产业']
Industry2.index = range(len(Quarters))
Industry3 = Industry_GDP.GPD[Industry_GDP.Industry_Type == '第三产业']
Industry3.index = range(len(Quarters))  # keep consistent with Industry1/Industry2

# Each layer starts where the previous layers end.
plt.bar(x=range(len(Quarters)), height=Industry1, color='steelblue', label='第一产业', tick_label=Quarters)
plt.bar(x=range(len(Quarters)), height=Industry2, bottom=Industry1, color='green', label='第二产业')
plt.bar(x=range(len(Quarters)), height=Industry3, bottom=Industry1 + Industry2, color='red', label='第三产业')
plt.ylabel('生成总值(亿)')
plt.title('2017年各季度三产业总值')
plt.legend()
plt.show()

水平交错条形图
import numpy as np
# Side-by-side bars comparing 2016 and 2017 counts per city.
HuRun = pd.read_excel(
    r'E:\PyCharmProject\venv\StataAnalysis_Files\从零开始学Python--数据分析与挖掘\第5章 Python数据处理工具--Pandas\HuRun.xlsx')
Cities = HuRun.City.unique()
# Assumes both yearly slices list the cities in the same order as `Cities`
# — TODO confirm against the spreadsheet.
Counts2016 = HuRun.Counts[HuRun.Year == 2016]
Counts2017 = HuRun.Counts[HuRun.Year == 2017]

bar_width = 0.4
# Left edge group positions; derived from the data instead of a fixed count.
positions = np.arange(len(Cities))
plt.bar(x=positions, height=Counts2016, label='2016', color='steelblue', width=bar_width)
# The 2017 bars are shifted right by one bar width so the pairs sit side by side.
plt.bar(x=positions + bar_width, height=Counts2017, label='2017', color='indianred', width=bar_width)
# Centre each tick between its two bars.
plt.xticks(positions + bar_width / 2, Cities)
plt.ylabel('亿万资产家庭数')
plt.title('近两年5个城市亿万资产家庭数比较')
plt.legend()
plt.show()

pandas模块之垂直或水平条形图
# Vertical GDP bar chart drawn through the pandas plotting interface.
GDP.GDP.plot(kind='bar', width=0.8, rot=0, color='steelblue', title='2017年度6个省份GDP分布')
plt.ylabel('GDP(万亿)')
# Replace the default tick labels with the province names.
plt.xticks(range(len(GDP.Province)),
           GDP.Province
           )
# Annotate each bar with its value (one decimal), slightly above the bar top.
for x, y in enumerate(GDP.GDP):
    plt.text(x - 0.1, y + 0.2, '%s' % round(y, 1), va='center')
plt.show()

pandas模块之水平交错条形图
# Reshape to one row per city with a column per year, then order the cities
# by their 2016 value, largest first.
HuRun_reshape = (
    HuRun
    .pivot_table(index='City', columns='Year', values='Counts')
    .reset_index()
    .sort_values(by=2016, ascending=False)
)

# Grouped bars: one pair (2016, 2017) per city.
HuRun_reshape.plot.bar(
    x='City',
    y=[2016, 2017],
    color=['steelblue', 'indianred'],
    rot=0,
    width=0.8,
    title='近两年5个城市亿万资产家庭数比较',
)
plt.ylabel('亿万资产家庭数')
plt.xlabel('')
plt.show()

seaborn模块之垂直或水平条形图
import seaborn as sns
# Horizontal GDP bar chart drawn with seaborn.
sns.barplot(y='Province',
            x='GDP',
            data=GDP,
            color='steelblue',
            orient='horizontal'
            )
plt.xlabel('GDP(万亿)')
plt.ylabel('')
plt.title('2017年度6个省份GDP分布')
# Annotate each bar with its value (one decimal) at the bar end.
for y, x in enumerate(GDP.GDP):
    plt.text(x, y, '%s' % round(x, 1), va='center')
plt.show()

seaborn模块之水平交错条形图
import pandas as pd
# Grouped bars: Age per passenger class, split by sex.
Titanic = pd.read_csv(
    r'E:\PyCharmProject\venv\StataAnalysis_Files\从零开始学Python--数据分析与挖掘\第6章 Python数据可视化\titanic_train.csv')

# NOTE(review): `errcolor` / `errwidth` appear to be deprecated in recent
# seaborn releases in favour of `err_kws` — confirm against the installed version.
barplot_style = {
    'palette': 'RdBu',
    'errcolor': 'blue',
    'errwidth': 2,
    'saturation': 1,
    'capsize': 0.05,
}
sns.barplot(data=Titanic, x='Pclass', y='Age', hue='Sex', **barplot_style)
plt.title('各船舱等级中男女乘客的年龄差异')
plt.show()

matplotlib模块之直方图
# Check for missing ages. The result is discarded here, so it is only
# informative when the script is run interactively.
Titanic.Age.isnull().any()
# Drop rows without an age before plotting.
Titanic = Titanic.dropna(subset=['Age'])

# Histogram of passenger ages in 20 bins.
plt.hist(Titanic.Age, bins=20, color='steelblue', edgecolor='black')
plt.xlabel('年龄')
plt.ylabel('频数')
plt.title('乘客年龄分布')
plt.show()

pandas模块之直方图和核密度图
# Density-normalised histogram with a kernel density estimate drawn on top,
# both through the pandas plotting accessor.
Titanic.Age.plot.hist(
    bins=20,
    color='steelblue',
    edgecolor='black',
    density=True,  # normalise so the histogram is comparable with the KDE curve
    label='直方图',
)
Titanic.Age.plot.kde(color='red', label='核密度图')
plt.xlabel('年龄')
plt.ylabel('核密度值')
plt.title('乘客年龄分布')
plt.legend()
plt.show()

seaborn模块之分组的直方图和核密度图
# Age distributions by sex: overlaid histograms first, then overlaid KDE curves.
# NOTE(review): `sns.distplot` is deprecated in newer seaborn releases —
# confirm it is still available in the installed version.
Age_Male = Titanic.Age[Titanic.Sex == 'male']
Age_Female = Titanic.Age[Titanic.Sex == 'female']

# Figure 1: histograms only (kde disabled), one colour per sex.
for ages, colour, tag in ((Age_Male, 'steelblue', '男性'),
                          (Age_Female, 'purple', '女性')):
    sns.distplot(ages, bins=20, kde=False, hist_kws={'color': colour}, label=tag)
plt.title('男女乘客的年龄直方图')
plt.legend()
plt.show()

# Figure 2: density curves only (hist disabled), told apart by colour and line style.
for ages, colour, dash, tag in ((Age_Male, 'red', '-', '男性'),
                                (Age_Female, 'black', '--', '女性')):
    sns.distplot(ages, hist=False, kde_kws={'color': colour, 'linestyle': dash},
                 norm_hist=True, label=tag)
plt.title('男女乘客的年龄核密度图')
plt.legend()
plt.show()


单个箱线图
# Single boxplot of the unit price column.
Sec_Buildings = pd.read_excel(
    r'E:\PyCharmProject\venv\StataAnalysis_Files\从零开始学Python--数据分析与挖掘\第6章 Python数据可视化\sec_buildings.xlsx')

box_style = dict(
    patch_artist=True,   # filled box, so that `facecolor` takes effect
    showmeans=True,      # draw the mean marker as well as the median line
    boxprops={'color': 'black', 'facecolor': 'steelblue'},
    flierprops={'marker': 'o', 'markerfacecolor': 'red', 'markersize': 3},
    meanprops={'marker': 'D', 'markerfacecolor': 'indianred', 'markersize': 4},
    medianprops={'linestyle': '--', 'color': 'orange'},
    labels=[''],         # blank tick label for the single box
)
plt.boxplot(Sec_Buildings.price_unit, **box_style)
plt.title('二手房单价分布的箱线图')
plt.show()

分组箱线图
# Boxplots of unit price per region, regions ordered by mean price (highest
# first). Drawn once with matplotlib and once with seaborn.
group_region = Sec_Buildings.groupby('region')
# The string name 'mean' selects the same groupby mean as passing np.mean,
# without relying on pandas' handling of NumPy callables.
avg_price = group_region.aggregate({'price_unit': 'mean'}).sort_values('price_unit', ascending=False)

# One price series per region, in the same order as `avg_price.index`.
region_price = [
    Sec_Buildings.price_unit[Sec_Buildings.region == region]
    for region in avg_price.index
]

# matplotlib version: one box per region.
plt.boxplot(x=region_price,
            patch_artist=True,
            labels=avg_price.index,
            showmeans=True,
            boxprops={'color': 'black', 'facecolor': 'steelblue'},
            flierprops={'marker': 'o', 'markerfacecolor': 'red', 'markersize': 3},
            meanprops={'marker': 'D', 'markerfacecolor': 'indianred', 'markersize': 4},
            medianprops={'linestyle': '--', 'color': 'orange'}
            )
plt.ylabel('单价(元)')
plt.title('不同行政区域的二手房单价对比')
plt.show()

# seaborn version: grouping is done by seaborn, `order` fixes the box order.
sns.boxplot(x='region', y='price_unit', data=Sec_Buildings,
            order=avg_price.index, showmeans=True, color='steelblue',
            flierprops={'marker': 'o', 'markerfacecolor': 'red', 'markersize': 3},
            meanprops={'marker': 'D', 'markerfacecolor': 'indianred', 'markersize': 4},
            medianprops={'linestyle': '--', 'color': 'orange'}
            )
plt.xlabel('')
plt.ylabel('单价(元)')
plt.title('不同行政区域的二手房单价对比')
plt.show()


小提琴图
# Split violin plot of the bill amount per day, one half per sex.
tips = pd.read_csv(
    r'E:\PyCharmProject\venv\StataAnalysis_Files\从零开始学Python--数据分析与挖掘\第6章 Python数据可视化\tips.csv')

# NOTE(review): `scale` appears to be deprecated in recent seaborn releases
# in favour of `density_norm` — confirm against the installed version.
violin_options = {
    'order': ['Thur', 'Fri', 'Sat', 'Sun'],
    'scale': 'count',
    'split': True,       # both hue levels share one violin, one side each
    'palette': 'RdBu',
}
sns.violinplot(data=tips, x="total_bill", y="day", hue="sex", **violin_options)
plt.title('每天不同性别客户的消费额情况')
plt.legend(loc='upper center', ncol=2)
plt.show()

单条折线图
# Line chart of the Counts column over Date, with a marker on every point.
wechat = pd.read_excel(
    r'E:\PyCharmProject\venv\StataAnalysis_Files\从零开始学Python--数据分析与挖掘\第6章 Python数据可视化\wechat.xlsx')

line_style = dict(
    linestyle='-',
    linewidth=2,
    color='steelblue',
    marker='o',
    markersize=6,
    markeredgecolor='black',
    markerfacecolor='brown',
)
plt.plot(wechat.Date, wechat.Counts, **line_style)
plt.ylabel('人数')
plt.title('每天微信文章阅读人数趋势')
plt.show()

多条折线图
import matplotlib as mpl
# Two series on one date axis: Counts (solid) and Times (dashed).
for column, dash, colour, tag in (('Counts', '-', 'steelblue', '阅读人数'),
                                  ('Times', '--', 'indianred', '阅读人次')):
    plt.plot(wechat.Date, wechat[column], linestyle=dash, color=colour, label=tag)

# Tidy the x axis: month-day tick labels, one major tick every 7 axis units
# (presumably days, assuming Date holds datetimes — TODO confirm).
ax = plt.gca()
date_format = mpl.dates.DateFormatter("%m-%d")
xlocator = mpl.ticker.MultipleLocator(7)
ax.xaxis.set_major_formatter(date_format)
ax.xaxis.set_major_locator(xlocator)
plt.xticks(rotation=45)  # rotate the tick labels by 45 degrees

plt.ylabel('人数')
plt.title('每天微信文章阅读人数与人次趋势')
plt.legend()
plt.show()
# One line per year of the pivoted `high` values, indexed by month.
weather = pd.read_excel(
    r'E:\PyCharmProject\venv\StataAnalysis_Files\从零开始学Python--数据分析与挖掘\第6章 Python数据可视化\weather.xlsx')

# Rows are months, columns are years; pivot_table aggregates with its
# default function (mean).
data = weather.pivot_table(index='month', columns='year', values='high')

# A distinct line style per column, applied in column order.
data.plot.line(style=['-', '--', ':'])
plt.xlabel('月份')
plt.ylabel('气温')
plt.title('每月平均最高气温波动趋势')
plt.show()


pandas模块之单组散点图
# Petal width vs. petal length, drawn with matplotlib and then with pandas.
iris = pd.read_csv(
    r'E:\PyCharmProject\venv\StataAnalysis_Files\从零开始学Python--数据分析与挖掘\第6章 Python数据可视化\iris.csv')

# matplotlib version.
plt.scatter(iris['Petal_Width'], iris['Petal_Length'], color='steelblue')
plt.xlabel('花瓣宽度')
plt.ylabel('花瓣长度')
plt.title('鸢尾花的花瓣宽度与长度关系')
plt.show()

# pandas version: columns are selected by name, the title is passed directly.
iris.plot.scatter(x='Petal_Width', y='Petal_Length', title='鸢尾花的花瓣宽度与长度关系')
plt.xlabel('花瓣宽度')
plt.ylabel('花瓣长度')
plt.show()

seaborn模块之分组散点图
# Scatter plot with a fitted regression line per Species group.
lmplot_options = {
    'hue': 'Species',      # one colour and one fit per species
    'legend_out': False,   # keep the legend inside the plotting area
    'truncate': True,      # limit each fitted line to its group's data range
}
sns.lmplot(data=iris, x='Petal_Width', y='Petal_Length', **lmplot_options)
plt.xlabel('花瓣宽度')
plt.ylabel('花瓣长度')
plt.title('鸢尾花的花瓣宽度与长度关系')
plt.show()

气泡图
# Bubble chart: Sales vs. Profit, bubble area driven by the rescaled profit ratio.
Prod_Category = pd.read_excel(
    r'E:\PyCharmProject\venv\StataAnalysis_Files\从零开始学Python--数据分析与挖掘\第6章 Python数据可视化\SuperMarket.xlsx')

# Min-max rescale Profit_Ratio; the 0.001 offset is added after the division,
# so the smallest ratio still gets a non-zero marker size.
ratio_min = Prod_Category.Profit_Ratio.min()
range_diff = Prod_Category.Profit_Ratio.max() - ratio_min
Prod_Category['std_ratio'] = (Prod_Category.Profit_Ratio - ratio_min) / range_diff + 0.001

# One scatter call per category so each gets its own colour and legend entry.
for category, colour in (('办公用品', 'steelblue'),
                         ('技术产品', 'indianred'),
                         ('家具产品', 'black')):
    in_category = Prod_Category.Category == category
    plt.scatter(x=Prod_Category.Sales[in_category],
                y=Prod_Category.Profit[in_category],
                s=Prod_Category.std_ratio[in_category] * 1000,
                color=colour, label=category, alpha=0.6)

plt.xlabel('销售额')
plt.ylabel('利润')
plt.title('销售额、利润及利润率的气泡图')
plt.legend()
plt.show()

热力图
# Heatmap of summed Sales per (month, year) cell.
Sales = pd.read_excel(
    r'E:\PyCharmProject\venv\StataAnalysis_Files\从零开始学Python--数据分析与挖掘\第6章 Python数据可视化\Sales.xlsx')

# Derive the calendar year and month from the Date column.
order_dates = Sales['Date'].dt
Sales['year'] = order_dates.year
Sales['month'] = order_dates.month

# Rows are months, columns are years, cells are the sum of Sales.
Summary = Sales.pivot_table(index='month', columns='year', values='Sales', aggfunc=np.sum)

sns.heatmap(
    Summary,
    cmap='PuBuGn',
    linewidths=0.1,  # thin separator lines between cells
    annot=True,      # write the value into each cell
    fmt='.1e',       # scientific notation, one decimal
)
plt.title('每年各月份销售总额热力图')
plt.show()
# Four-panel figure on a 2x3 grid built from the Prod_Trade spreadsheet:
# a pie (top-left), a monthly line (top-middle), a boxplot spanning both rows
# (right column) and a distribution plot spanning two columns (bottom row).
Prod_Trade = pd.read_excel(
r'E:\PyCharmProject\venv\StataAnalysis_Files\从零开始学Python--数据分析与挖掘\第6章 Python数据可视化\Prod_Trade.xlsx')
# Derive year and month from the Date column for filtering and grouping.
Prod_Trade['year'] = Prod_Trade.Date.dt.year
Prod_Trade['month'] = Prod_Trade.Date.dt.month
plt.figure(figsize=(12, 6))
# Panel 1 (row 0, col 0): share of each Order_Class among 2012 rows.
ax1 = plt.subplot2grid(shape=(2, 3), loc=(0, 0))
Class_Counts = Prod_Trade.Order_Class[Prod_Trade.year == 2012].value_counts()
Class_Percent = Class_Counts / Class_Counts.sum()
# Equal aspect ratio keeps the pie circular.
ax1.set_aspect(aspect='equal')
ax1.pie(x=Class_Percent.values, labels=Class_Percent.index, autopct='%.1f%%')
ax1.set_title('各等级订单比例')
# Panel 2 (row 0, col 1): summed Sales per month for 2012.
ax2 = plt.subplot2grid(shape=(2, 3), loc=(0, 1))
Month_Sales = Prod_Trade[Prod_Trade.year == 2012].groupby(by='month').aggregate({'Sales': np.sum})
Month_Sales.plot(title='2012年各月销售趋势', ax=ax2, legend=False)
ax2.set_xlabel('')
# Panel 3 (col 2, spanning both rows): Trans_Cost per Transport value, all years.
ax3 = plt.subplot2grid(shape=(2, 3), loc=(0, 2), rowspan=2)
sns.boxplot(x='Transport', y='Trans_Cost', data=Prod_Trade, ax=ax3)
ax3.set_title('各运输方式成本分布')
ax3.set_xlabel('')
ax3.set_ylabel('运输成本')
# Panel 4 (row 1, spanning cols 0-1): distribution of 2012 Sales values.
# NOTE(review): sns.distplot is deprecated in newer seaborn releases — confirm availability.
ax4 = plt.subplot2grid(shape=(2, 3), loc=(1, 0), colspan=2)
sns.distplot(Prod_Trade.Sales[Prod_Trade.year == 2012], bins=40, norm_hist=True, ax=ax4,
hist_kws={'color': 'steelblue'}, kde_kws=({'linestyle': '--', 'color': 'red'}))
ax4.set_title('2012年客单价分布图')
ax4.set_xlabel('销售额')
# Extra vertical and horizontal spacing between the panels.
plt.subplots_adjust(hspace=0.6, wspace=0.3)
plt.show()


词云分词
import jieba
import wordcloud
# Read the whole input text; the context manager closes the file handle
# as soon as the content has been read.
with open(r'text.txt', encoding='utf-8') as text_file:
    text = text_file.read()


def split_words(text):
    """Segment *text* with jieba and save a word-cloud image of the result.

    The tokens produced by ``jieba.cut`` are joined with commas, rendered
    with ``wordcloud.WordCloud`` and written to ``词云分词.png`` in the
    current working directory. The font file ``.\\simhei.ttf`` is passed as
    ``font_path`` and is expected to exist relative to the working directory.

    Args:
        text: The raw text to segment.

    Returns:
        None. The image file is the only output.
    """
    cut_text = jieba.cut(text)
    string = ','.join(cut_text)
    # Words excluded from the cloud.
    stop_words = ['我们', '你们',]
    word_cloud = wordcloud.WordCloud(
        font_path=r'.\simhei.ttf',
        background_color='white',
        width=500,
        height=350,
        max_font_size=100,
        min_font_size=10,
        stopwords=stop_words,
        scale=15,
    )
    word_cloud.generate(string)
    word_cloud.to_file(r'词云分词.png')


split_words(text=text)