第七次尝试

第七课:案例分析

import pandas as pd
import numpy as np
import scipy.stats
import matplotlib.pyplot as plt
import matplotlib
matplotlib.style.use('ggplot')  

%matplotlib inline
%config InlineBackend.figure_format = 'retina'
ls data 
 驱动器 C 中的卷没有标签。
 卷的序列号是 46F0-2618

 C:\Users\Eva's\Desktop\python课件\第七课材料\第七课材料\codes\data 的目录

2017/09/14  21:51              .
2017/09/14  21:51              ..
2017/09/03  23:30             6,148 .DS_Store
2017/09/03  23:30            14,142 data_75_12.csv
2017/09/03  23:30             3,930 evolution.csv
2017/09/03  23:30             8,568 fortis_heritability.csv
               4 个文件         32,788 字节
               2 个目录  6,175,817,728 可用字节

随时间的演化

数据导入和清洗

evolution = pd.read_csv('data/evolution.csv')
evolution.head()










































































Year Species Beak length Beak depth Beak width CI Beak length CI Beak depth CI Beak width
0 1973 fortis 10.76 9.48 8.69 0.097 0.13 0.081
1 1974 fortis 10.72 9.42 8.66 0.144 0.17 0.112
2 1975 fortis 10.57 9.19 8.55 0.075 0.084 0.057
3 1976 fortis 10.64 9.23 8.58 0.048 0.053 0.039
4 1977 fortis 10.73 9.35 8.63 0.085 0.092 0.066

evolution.info()

RangeIndex: 83 entries, 0 to 82
Data columns (total 8 columns):
Year              82 non-null object
Species           81 non-null object
Beak length       81 non-null object
Beak depth        80 non-null float64
Beak width        80 non-null float64
CI Beak length    80 non-null object
CI Beak depth     80 non-null object
CI Beak width     80 non-null object
dtypes: float64(2), object(6)
memory usage: 5.3+ KB
evolution = evolution.dropna()
evolution.info()

Int64Index: 80 entries, 0 to 82
Data columns (total 8 columns):
Year              80 non-null object
Species           80 non-null object
Beak length       80 non-null object
Beak depth        80 non-null float64
Beak width        80 non-null float64
CI Beak length    80 non-null object
CI Beak depth     80 non-null object
CI Beak width     80 non-null object
dtypes: float64(2), object(6)
memory usage: 5.6+ KB
# 'Beak length' is parsed strictly: any non-numeric entry raises here.
evolution['Beak length'] = pd.to_numeric(evolution['Beak length'])

# The confidence-interval columns are parsed leniently: entries that cannot be
# read as numbers become NaN instead of raising.
for ci_column in ('CI Beak length', 'CI Beak depth', 'CI Beak width'):
    evolution[ci_column] = pd.to_numeric(evolution[ci_column], errors='coerce')

# Turn the year column into datetime values.
evolution['Year'] = pd.to_datetime(evolution['Year'])
evolution.info()

Int64Index: 80 entries, 0 to 82
Data columns (total 8 columns):
Year              80 non-null datetime64[ns]
Species           80 non-null object
Beak length       80 non-null float64
Beak depth        80 non-null float64
Beak width        80 non-null float64
CI Beak length    79 non-null float64
CI Beak depth     79 non-null float64
CI Beak width     79 non-null float64
dtypes: datetime64[ns](1), float64(6), object(1)
memory usage: 5.6+ KB

数据探索

evolution.Species.value_counts() 
scandens    40
fortis      40
Name: Species, dtype: int64

fortis = evolution[evolution.Species == 'fortis']     
scandens = evolution[evolution.Species == 'scandens'] 
fortis.plot(x='Year', y = ['Beak length', 'Beak depth', 'Beak width'])

第七次尝试_第1张图片
output_17_1.png
scandens.plot(x='Year', y = ['Beak length', 'Beak depth', 'Beak width'])

第七次尝试_第2张图片
output_18_1.png
scandens.plot(x='Year', y = ['Beak length', 'Beak depth', 'Beak width'], subplots=True, figsize=(10, 6))
array([,
       ,
       ], dtype=object)
第七次尝试_第3张图片
output_19_1.png
fortis.plot(x='Year', y ='Beak depth', yerr='CI Beak depth', marker='o', figsize=(10,5), color='DarkBlue')

第七次尝试_第4张图片
output_20_1.png
scandens.plot(x='Year', y='Beak depth', yerr='CI Beak depth', marker='o', figsize=(10,5), color='DarkBlue')

第七次尝试_第5张图片
output_21_1.png

1975年和2012年数据的比较

数据探索

data = pd.read_csv('data/data_75_12.csv')
data.head()


















































species length depth year
0 fortis 9.4 8.0 1975
1 fortis 9.2 8.3 1975
2 fortis 9.5 7.5 1975
3 fortis 9.5 8.0 1975
4 fortis 11.5 9.9 1975

data.info()

RangeIndex: 651 entries, 0 to 650
Data columns (total 4 columns):
species    651 non-null object
length     651 non-null float64
depth      651 non-null float64
year       651 non-null int64
dtypes: float64(2), int64(1), object(1)
memory usage: 20.4+ KB
data.groupby(['species', 'year']).count() 










































length depth
species year
fortis 1975 316 316
2012 121 121
scandens 1975 87 87
2012 127 127

data.groupby(['species', 'year']).mean() 










































length depth
species year
fortis 1975 10.565190 9.171646
2012 10.517355 8.605372
scandens 1975 14.120920 8.960000
2012 13.421024 9.186220


fortis2 = data[data.species == 'fortis']
scandens2 = data[data.species == 'scandens']

fortis2.boxplot(by='year', figsize = (10,6))
array([,
       ], dtype=object)
第七次尝试_第6张图片
output_30_1.png
scandens2.boxplot(by='year', figsize = (10,6))
array([,
       ], dtype=object)
第七次尝试_第7张图片
output_31_1.png

中地雀鸟喙深度和长度的变化


fortis1975 = fortis2[fortis2.year == 1975]
fortis2012 = fortis2[fortis2.year == 2012]
fortis1975.depth.mean()
9.171645569620255
fortis2012.depth.mean()
8.605371900826446

鸟喙深度95%置信区间

def mean_ci(data, confidence=0.95):
    """Return a normal-approximation confidence interval for the sample mean.

    The interval is ``mean +/- z * s / sqrt(n)``, where ``s`` is the sample
    standard deviation (``ddof=1``), ``n`` is ``len(data)`` and ``z`` is the
    standard-normal critical value for the requested two-sided level.

    Args:
        data: One-dimensional sample of numeric observations.
        confidence: Two-sided confidence level, strictly between 0 and 1.
            Defaults to 0.95, the level this function was originally fixed to.

    Returns:
        A ``(lower, upper)`` tuple bounding the mean.

    Raises:
        ValueError: If ``confidence`` is not strictly between 0 and 1.
    """
    if not 0.0 < confidence < 1.0:
        raise ValueError("confidence must be strictly between 0 and 1")

    sample_size = len(data)
    # ddof=1 selects the (n - 1)-denominator sample standard deviation.
    std = np.std(data, ddof=1)
    se = std / np.sqrt(sample_size)
    point_estimate = np.mean(data)

    # Half of the excluded probability sits in each tail; isf maps that
    # upper-tail probability to the critical value (about 1.96 for 95%).
    z_score = scipy.stats.norm.isf((1.0 - confidence) / 2.0)
    margin = z_score * se

    return (point_estimate - margin, point_estimate + margin)

mean_ci(fortis1975.depth)
(9.0903471711839057, 9.2529439680566039)

mean_ci(fortis2012.depth)
(8.4748436937850755, 8.7359001078678169)

鸟喙深度差值的95%置信区间

diff_mean = fortis2012.depth.mean() - fortis1975.depth.mean()
diff_mean
-0.5662736687938086
def draw_bs_reps(data, func, size=1):
    """Compute bootstrap replicates of a one-sample statistic.

    Each replicate is ``func`` evaluated on a resample of ``data`` drawn with
    replacement and having the same length as ``data``.

    Args:
        data: One-dimensional sample to resample from.
        func: Callable that reduces a resample to a single number.
        size: Number of replicates to generate.

    Returns:
        A float NumPy array of length ``size`` holding the replicates.
    """
    replicates = np.empty(size)
    for slot in range(size):
        # np.random.choice samples with replacement by default.
        resample = np.random.choice(data, size=len(data))
        replicates[slot] = func(resample)
    return replicates

# Bootstrap the mean beak depth of each year's sample, 10,000 replicates
# apiece; 1975 is drawn first and 2012 second.
bs1975, bs2012 = (
    draw_bs_reps(birds.depth, np.mean, 10000) for birds in (fortis1975, fortis2012)
)

# Replicates of the 2012-minus-1975 difference in means.
bs_diff = bs2012 - bs1975

# Central 95% percentile interval of the bootstrapped difference.
conf_int = np.percentile(bs_diff, [2.5, 97.5])

# Report the observed difference (diff_mean, computed earlier) with the interval.
print('均值的差: ', diff_mean, 'mm')
print('95% 置信区间:', conf_int, 'mm')
均值的差:  -0.5662736687938086 mm
95% 置信区间: [-0.71819592 -0.41113047] mm

鸟喙长度差值的95%置信区间


# Observed change in mean beak length, 2012 minus 1975.
diff_mean = fortis2012['length'].mean() - fortis1975['length'].mean()

# Bootstrap the mean beak length of each year's sample (10,000 replicates each).
bs1975 = draw_bs_reps(fortis1975['length'], np.mean, size=10000)
bs2012 = draw_bs_reps(fortis2012['length'], np.mean, size=10000)

# Replicates of the difference in means and their central 95% percentile interval.
bs_diff = bs2012 - bs1975
conf_int = np.percentile(bs_diff, [2.5, 97.5])

print('均值的差: ', diff_mean, 'mm')
print('95% 置信区间:', conf_int, 'mm')
均值的差:  -0.047834501516897276 mm
95% 置信区间: [-0.209585   0.1162503] mm

假设检验判断鸟喙深度和长度是否有改变(t分布)


scipy.stats.ttest_ind(fortis2012.depth, fortis1975.depth)
Ttest_indResult(statistic=-7.196495054344143, pvalue=2.727142579713897e-12)

原假设为1975年和2012年中地雀鸟喙深度均值相同,取$\alpha = 0.01$, 因为 $pvalue < \alpha$, 所以拒绝原假设,得到结论是:2012年和1975年相比,中地雀的鸟喙深度发生了变化。

scipy.stats.ttest_ind(fortis2012.length, fortis1975.length)
Ttest_indResult(statistic=-0.62821616808278724, pvalue=0.53019202035616386)

取$\alpha = 0.01$, 因为 $pvalue > \alpha$, 所以无法拒绝原假设,即不能证明鸟喙长度发生了改变。

中地雀鸟喙形状的变化


ax = fortis1975.plot.scatter(x='length', y='depth', color='Blue', label='1975')
fortis2012.plot.scatter(x='length', y='depth', color='Red', label='2012', ax=ax)
plt.legend(loc='upper left')

第七次尝试_第8张图片
output_52_1.png

np.corrcoef(fortis1975.length, fortis1975.depth)[0,1]
0.82123033856315231

np.corrcoef(fortis2012.length, fortis2012.depth)[0,1]
0.72342938117020117

ratio1975 = fortis1975.length / fortis1975.depth
ratio2012 = fortis2012.length / fortis2012.depth

print(np.mean(ratio1975))
print(np.mean(ratio2012))
1.154557328563076
1.2250642338241673

scipy.stats.ttest_ind(ratio2012, ratio1975)
Ttest_indResult(statistic=11.221158991741978, pvalue=7.7605361953834971e-26)

结论:鸟喙长度与深度之比的均值由约1.15增大到约1.23,且 t 检验的 $pvalue$ 远小于 $\alpha = 0.01$,所以2012年相比1975年,鸟喙形状发生了改变。

遗传数据分析

数据探索


herit = pd.read_csv('data/fortis_heritability.csv')
herit.head()


















































species mid_offspr male_parent female_parent
0 fortis 10.70 10.90 9.3
1 fortis 9.78 10.70 8.4
2 fortis 9.48 10.70 8.1
3 fortis 9.60 10.70 9.8
4 fortis 10.27 9.85 10.4

herit.info()

RangeIndex: 413 entries, 0 to 412
Data columns (total 4 columns):
species          413 non-null object
mid_offspr       413 non-null float64
male_parent      413 non-null float64
female_parent    413 non-null float64
dtypes: float64(3), object(1)
memory usage: 13.0+ KB

np.corrcoef(herit.mid_offspr, herit.male_parent)[0,1]
0.52168902179704335

np.corrcoef(herit.mid_offspr, herit.female_parent)[0,1]
0.58770631611267576

herit['mid_parent'] = (herit.male_parent + herit.female_parent) / 2
herit.head()
























































species mid_offspr male_parent female_parent mid_parent
0 fortis 10.70 10.90 9.3 10.100
1 fortis 9.78 10.70 8.4 9.550
2 fortis 9.48 10.70 8.1 9.400
3 fortis 9.60 10.70 9.8 10.250
4 fortis 10.27 9.85 10.4 10.125


np.corrcoef(herit.mid_offspr, herit.mid_parent)[0,1]
0.72834123955184848

herit.plot.scatter(x='mid_parent', y='mid_offspr')

第七次尝试_第9张图片
output_68_1.png

推断是否有显著的遗传相关性

def draw_bs_pairs(x, y, func, size=1):
    """Compute pairs-bootstrap replicates of a two-sample statistic.

    For every replicate, ``len(x)`` positions are drawn with replacement and
    the same positions are taken from both ``x`` and ``y``, so each
    observation stays paired with its partner.

    Args:
        x: First member of each pair (sequence, NumPy array or pandas Series).
        y: Second member of each pair, aligned with ``x`` by position.
        func: Callable ``func(x_sample, y_sample)`` returning a single number.
        size: Number of replicates to generate.

    Returns:
        A float NumPy array of length ``size`` holding the replicates.
    """
    # Work on positional NumPy arrays. Indexing a pandas Series with ``[]``
    # is label-based, so a Series whose index is not 0..n-1 (for example
    # after filtering) would raise or select the wrong rows, and plain lists
    # cannot be indexed with an index array at all.
    x_values = np.asarray(x)
    y_values = np.asarray(y)

    # Positions to resample from.
    inds = np.arange(len(x_values))

    bs_replicates = np.empty(size)

    for i in range(size):
        bs_inds = np.random.choice(inds, len(inds))
        bs_replicates[i] = func(x_values[bs_inds], y_values[bs_inds])

    return bs_replicates

def pearson_r(x, y):
    """Return the Pearson correlation coefficient between ``x`` and ``y``."""
    # np.corrcoef returns the 2x2 correlation matrix; the off-diagonal entry
    # is the correlation between the two inputs.
    return np.corrcoef(x, y)[0, 1]

bs_replicates = draw_bs_pairs(herit.mid_offspr, herit.mid_parent, pearson_r, 1000)


conf_int = np.percentile(bs_replicates, [2.5, 97.5])


print(conf_int)
[ 0.67140317  0.77933128]

皮尔逊相关系数的计算公式:

$$ \rho = \frac{cov(x,y)}{\sigma_x \sigma_y} $$

但是,遗传率衡量的是子女性状随父母性状变化的程度,归一化时应该只使用父母一代的方差,而不涉及子女一代的方差,所以要修改该公式(其中 $x$ 为父母的均值,$y$ 为子女的均值):

$$ h^2 = \frac{cov(x,y)}{\sigma_x \sigma_x} = \frac{cov(x,y)}{var_x}$$


def heritability(parents, offspring):
    """Return the parent-offspring covariance divided by the parental variance.

    Args:
        parents: Trait values of the parents.
        offspring: Trait values of the offspring, paired with ``parents``.

    Returns:
        ``cov(parents, offspring) / var(parents)``.
    """
    cov = np.cov(parents, offspring)
    parent_variance = cov[0, 0]
    parent_offspring_cov = cov[0, 1]
    return parent_offspring_cov / parent_variance
herit_fortis = heritability(herit.mid_parent, herit.mid_offspr)
print(herit_fortis)
0.722905191144
# heritability() takes (parents, offspring) and draw_bs_pairs() forwards its
# first two arguments to the statistic in that order, so the parents must be
# passed first. With the arguments swapped the covariance would be divided by
# the offspring variance instead of the parental variance.
# NOTE: any interval recorded from a run with the swapped order must be regenerated.
bs_replicates = draw_bs_pairs(herit.mid_parent, herit.mid_offspr, heritability, 1000)

# Central 95% percentile interval of the bootstrapped heritability.
conf_int = np.percentile(bs_replicates, [2.5, 97.5])
print(conf_int)
[ 0.65984576  0.8041422 ]
##完全没有懂。



你可能感兴趣的:(第七次尝试)