利用Python进行描述统计分析时,用到numpy库/scipy库
1. 中心位置:均值、中位数(分位数)、众数
2. 发散程度:最值、极差、方差、标准差、变异系数
最大最小值
极差(ptp(data)):最大值-最小值,作为样本观测数据变异程度大小的一个简单度量
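方差(var(data))/标准差(std(data)):描述样本观测数据变异程度的大小。注意方差有两种定义:式1 $s^2=\frac{1}{n-1}\sum_{i=1}^{n}(x_i-\bar{x})^2$ 为样本方差(用样本方差估计总体方差时除以n-1,pandas包中求var/std默认除以n-1,即ddof=1);式2 $\sigma^2=\frac{1}{n}\sum_{i=1}^{n}(x_i-\bar{x})^2$ 为数据本身的方差(numpy中var/std默认除以n,即ddof=0)。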
变异系数(std(data)/mean(data)):标准差与平均值的比值,用于衡量数据的相对变异程度。
3. 偏差程度:z-分数、偏度、峰度
4. 相关程度:相关系数、协方差
本案例探究BMI指数与身高、体重等因素的关系。(BMI是用体重(公斤)除以身高(米)的平方得出的数值,是衡量人体胖瘦程度以及是否健康的常用标准)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import brfss
%config InlineBackend.figure_format='retina' #设置图像清晰度
# Load the BRFSS survey data through the project-local ``brfss`` helper.
df = brfss.ReadBrfss()

# Keep only the BMI and income columns and drop rows with a missing value
# in either of them.
bmi_income = df.loc[:, ['bmi', 'income']].dropna()
bmi_income.head(3)
>>>
> bmi income
0 40.18 3.0
1 25.09 1.0
3 28.19 8.0
# Basic structure of the cleaned frame and the frequency of each income level.
bmi_income.info()
bmi_income['income'].value_counts()

# Income level 8 is treated as the "rich" group; every other level is
# treated as the "ordinary" group.
bmi_rich = bmi_income.loc[bmi_income['income'] == 8, 'bmi']
bmi_ord = bmi_income.loc[bmi_income['income'] != 8, 'bmi']
bmi_rich.describe()
>>>
>count 110259.000000
mean 27.450733
std 5.900353
min 12.050000
25% 23.690000
50% 26.570000
75% 30.040000
max 97.650000
Name: bmi, dtype: float64
# Summary statistics (count, mean, std, min, quartiles, max) of BMI for the
# ordinary-income group.
bmi_ord.describe()
>count 232833.000000
mean 28.537320
std 6.971436
min 12.020000
25% 24.030000
50% 27.370000
75% 31.620000
max 97.650000
Name: bmi, dtype: float64
中心趋势
# Arithmetic mean of BMI for each income group.
mean_rich, mean_ord = bmi_rich.mean(), bmi_ord.mean()
print('BMI mean of rich people: %.2f' % (mean_rich,))
print('BMI mean of ordinary people: %.2f' % (mean_ord,))
>>>
>BMI mean of rich people: 27.45
BMI mean of ordinary people: 28.54
# Median of BMI for each income group.
median_rich, median_ord = bmi_rich.median(), bmi_ord.median()
print('BMI median of rich people: %.2f' % (median_rich,))
print('BMI median of ordinary people: %.2f' % (median_ord,))
>>>
>BMI median of rich people: 26.57
BMI median of ordinary people: 27.37
# Most frequent BMI value in each group (the first one if there are ties),
# together with the number of observations equal to it.
mode_rich = bmi_rich.mode().iat[0]
mode_ord = bmi_ord.mode().iat[0]
mode_count_rich = (bmi_rich == mode_rich).sum()
mode_count_ord = (bmi_ord == mode_ord).sum()
print('BMI mode of rich people: %.2f(counts %d)' % (mode_rich, mode_count_rich))
print('BMI mode of ordinary people: %.2f(counts %d)' % (mode_ord, mode_count_ord))
>>>
>BMI mode of rich people: 26.63(counts 1246)
BMI mode of ordinary people: 26.63(counts 2766)
# Difference between the group means (rich minus ordinary).
print('mean difference(rich-ordinary): %.2f' % (mean_rich-mean_ord))
>>>
>mean difference(rich-ordinary): -1.09
# Two stacked panels sharing the same x-range so the groups can be compared.
fig = plt.figure(figsize=(12, 8))

# Top panel: BMI histogram of the rich group.
p1 = fig.add_subplot(2, 1, 1)
p1.hist(bmi_rich, bins=50, rwidth=0.9)
p1.set_xlabel('BMI')
p1.set_xlim((0, 80))
p1.set_ylabel('Counts')
p1.set_title('BMI histogram of rich people')

# Bottom panel: BMI histogram of the ordinary group.
p2 = fig.add_subplot(2, 1, 2)
p2.hist(bmi_ord, bins=50, rwidth=0.9)
p2.set_xlabel('BMI')
p2.set_xlim((0, 80))
p2.set_ylabel('Counts')
p2.set_title('BMI histogram of ordinary people')
# Overlay both groups as density-normalised histograms so that their shapes
# can be compared despite the different sample sizes.
# ``density=True`` replaces the ``normed=True`` keyword, which has been
# removed from matplotlib's ``hist``.
plt.hist(bmi_rich, bins=50, range=(10, 60), density=True, label='rich',
         alpha=0.4, color='r')
plt.hist(bmi_ord, bins=50, range=(10, 60), density=True, label='ord',
         alpha=0.4, color='b')
plt.legend()
plt.xlabel('BMI')
plt.ylabel('probability density')
plt.title('BMI histogram')
# Find the modal interval: the unit-wide bin holding the most observations.
bin_edge = np.arange(10, 60, 1)
counts, bins = np.histogram(bmi_rich, bins=bin_edge)
peak = counts.argmax()  # index of the fullest bin
mode_left, mode_right = bins[peak], bins[peak + 1]
mode_middle = 0.5 * (mode_left + mode_right)
print('mode range(%.2f,%.2f)' % (mode_left, mode_right))
print('median:%.2f' % (median_rich,))
print('mean:%.2f' % (mean_rich,))

# Skewness of the rich group's BMI distribution.
print('skewness:%.2f' % bmi_rich.skew())

# Mark mean, median and the centre of the modal bin on top of the histogram.
plt.axvline(x=mean_rich, color='r', linewidth=1, label='mean')
plt.axvline(x=median_rich, color='g', linewidth=1, label='median')
plt.axvline(x=mode_middle, color='b', linewidth=1, label='mode')
plt.legend(loc='best')
plt.hist(bmi_rich, bins=bin_edge, rwidth=0.9, alpha=0.5)
plt.xlabel('BMI')
plt.ylabel('Counts')
plt.title('BMI distribution of rich people')
# Distribution of income levels, with its skewness shown in the legend.
print('skewness:%.2f' % bmi_income['income'].skew())
bins = np.arange(1, 10)
plt.hist(
    bmi_income['income'],
    bins=bins,
    rwidth=0.9,
    align='left',
    alpha=0.5,
    label='skewness:%.2f' % bmi_income['income'].skew(),
)
plt.legend()
plt.xlabel('income level')
plt.ylabel('counts')
plt.title('income distribution')
def ecdf(data):
    """Compute the empirical cumulative distribution function of ``data``.

    Args:
        data: Array-like of numeric observations (intended for
            one-dimensional input).

    Returns:
        A tuple ``(x, y)``: ``x`` is ``data`` sorted in ascending order and
        ``y[i] = (i + 1) / n`` is the ECDF height assigned to the i-th sorted
        value, where ``n`` is the number of observations.
    """
    x = np.sort(data)
    y = np.arange(1, len(x) + 1) / len(x)
    return (x, y)
def plot_ecdf(data, xlabel=None, ylabel='ECDF', label=None):
    """Plot the ECDF of ``data`` as unconnected points on the current axes.

    Args:
        data: Observations passed on to ``ecdf``.
        xlabel: Text for the x-axis label. When ``None`` the existing label
            is left untouched, so an unlabeled call does not overwrite a
            label set by an earlier call.
        ylabel: Text for the y-axis label.
        label: Legend entry for this series, or ``None`` for no entry.
    """
    x, y = ecdf(data)
    plt.plot(x, y, marker='.', markersize=3, linestyle='none', label=label)
    # Draw the legend only when at least one labelled artist exists; this
    # avoids an empty legend (and the matplotlib warning that goes with it).
    handles, _ = plt.gca().get_legend_handles_labels()
    if handles:
        plt.legend(markerscale=4)
    if xlabel is not None:
        plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.margins(0.01)
# Overlay the ECDFs of both income groups; the x-axis label is passed only
# on the second call.
plot_ecdf(bmi_rich,label='rich')
plot_ecdf(bmi_ord,xlabel='BMI',label='ord')
# Quartiles of the rich group's BMI and the interquartile range (IQR),
# printed together with the minimum and maximum.
q1, q2, q3 = (bmi_rich.quantile(level) for level in (0.25, 0.5, 0.75))
IQR = q3 - q1
print('min: ', bmi_rich.min())
print('25%: ', q1)
print('50%: ', q2)
print('75%: ', q3)
print('IQR: %.2f' % (IQR,))
print('max: ', bmi_rich.max())
>>>
>min: 12.05
25%: 23.69
50%: 26.57
75%: 30.04
IQR: 6.35
max: 97.65
# Label each row as 'rich' (income level 8) or 'ord' (any other level), then
# compare the BMI distribution of the two groups with a box plot.
bmi_income['income_level'] = bmi_income['income'].eq(8).map({True: 'rich', False: 'ord'})
sns.boxplot(x='income_level', y='bmi', data=bmi_income, palette='muted')
# Variance and standard deviation of BMI for each income group, computed
# with the Series methods' default settings.
var_rich, std_rich = bmi_rich.var(), bmi_rich.std()
var_ord, std_ord = bmi_ord.var(), bmi_ord.std()
print('For rich people: Variance=%.2f,Standard deviance=%.2f' % (var_rich, std_rich))
print('For ordinary people: Variance=%.2f,Standard deviance=%.2f' % (var_ord, std_ord))
>>>
>For rich people: Variance=34.81,Standard deviance=5.90
For ordinary people: Variance=48.60,Standard deviance=6.97
def cohen_d(data1, data2):
    """Return Cohen's d effect size between two samples.

    The difference of the sample means is divided by the pooled standard
    deviation, which is built from the unbiased (``ddof=1``) variances.

    Args:
        data1: First sample (pandas Series, NumPy array or plain sequence).
        data2: Second sample, same kinds of input as ``data1``.

    Returns:
        ``(mean(data1) - mean(data2)) / pooled_std`` as a float.
    """
    n1 = len(data1)
    n2 = len(data2)
    # np.mean accepts plain sequences as well as Series/ndarray inputs.
    x1 = np.mean(data1)
    x2 = np.mean(data2)
    var1 = np.var(data1, ddof=1)  # ddof=1: unbiased sample variance
    var2 = np.var(data2, ddof=1)
    # Pooled standard deviation, weighting each variance by its degrees of
    # freedom.
    sp = np.sqrt(((n1 - 1) * var1 + (n2 - 1) * var2) / (n1 + n2 - 2))
    return (x1 - x2) / sp
# Report Cohen's d between the rich and ordinary BMI samples.
# NOTE(review): the format string starts with a stray apostrophe, which is
# echoed in the printed output; confirm whether it is intentional.
print("'Cohen's d:%.3f"%cohen_d(bmi_rich,bmi_ord))
>>>
'Cohen's d:-0.163
相关性
# Keep the rows where height, weight and BMI are all present.
df2 = df[['height', 'weight', 'bmi']].dropna()
# The original line read ``height=.height`` (a syntax error); the column is
# taken from ``df2`` like its siblings below.
height = df2.height
weight = df2.weight
bmi = df2.bmi
np.cov(height, weight)  # 2x2 covariance matrix of height and weight
>>>
>array([[1.12563400e-02, 1.08190764e+00],
[1.08190764e+00, 4.67153513e+02]])
# Correlation coefficient between height and weight (the off-diagonal entry
# of the 2x2 correlation matrix).
np.corrcoef(height, weight)[0, 1]
>>>0.4718041740847708
# Scatter plot of weight against height; a scatter plot is the usual first
# look at how two variables are related. The low alpha keeps dense regions
# readable.
plt.plot(height, weight, linestyle='none', marker='.', alpha=0.05)
plt.title('correlation of weight and height')
plt.xlabel('height(m)')
plt.ylabel('weight(kg)')
plt.show()
# Correlation coefficient between weight and BMI.
corr_weightBMI = np.corrcoef(weight, bmi)[0, 1]
corr_weightBMI

# Scatter plot of BMI against weight to inspect the relationship visually.
plt.plot(weight, bmi, linestyle='none', marker='.', alpha=0.05)
plt.title('correlation of weight and bmi')
plt.xlabel('weight(kg)')
plt.ylabel('bmi')
plt.show()