import pandas as pd
from scipy import stats
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Load the cleaned iris dataset: a comma-separated file whose first row
# holds the column names.
file = r'./iris-clean.csv'
df = pd.read_csv(
    filepath_or_buffer=file,
    header=0,
    sep=',',
)
# Bare expression: echoes the frame when evaluated as the last statement of
# an interactive/notebook cell (presumably how this file was produced).
df
| | sepal.length | sepal.width | petal.length | petal.width | variety |
---|---|---|---|---|---|
0 | 5.1 | 3.5 | 1.4 | 0.2 | Setosa |
1 | 4.9 | 3.0 | 1.4 | 0.2 | Setosa |
2 | 4.7 | 3.2 | 1.3 | 0.2 | Setosa |
3 | 4.6 | 3.1 | 1.5 | 0.2 | Setosa |
4 | 5.0 | 3.6 | 1.4 | 0.2 | Setosa |
... | ... | ... | ... | ... | ... |
145 | 6.7 | 3.0 | 5.2 | 2.3 | Virginica |
146 | 6.3 | 2.5 | 5.0 | 1.9 | Virginica |
147 | 6.5 | 3.0 | 5.2 | 2.0 | Virginica |
148 | 6.2 | 3.4 | 5.4 | 2.3 | Virginica |
149 | 5.9 | 3.0 | 5.1 | 1.8 | Virginica |
150 rows × 5 columns
# Pull the four numeric measurement columns out of the frame, one Series per
# column, in the order: sepal length, sepal width, petal length, petal width.
sepal_len, sepal_width, petal_len, petal_width = (
    df[column]
    for column in ('sepal.length', 'sepal.width', 'petal.length', 'petal.width')
)
# Plot the distribution of each of the four measurement columns to eyeball
# how close each one is to a bell shape before testing normality.
# NOTE(review): sns.distplot is deprecated in seaborn >= 0.11 in favour of
# histplot/displot -- confirm the installed seaborn version before migrating.
sns.distplot(sepal_len)
sns.distplot(sepal_width)
sns.distplot(petal_len)
sns.distplot(petal_width)
# One-sample Kolmogorov-Smirnov test (not a chi-square test) of sepal length
# against a normal distribution whose mean and standard deviation are taken
# from the sample itself. A p-value above 0.05 means normality is not
# rejected at the 5% level.
# NOTE(review): the parameters are estimated from the same data being
# tested, so the reported p-value is presumably only approximate -- confirm
# whether a Lilliefors-type correction is wanted.
sepal_len = np.array(sepal_len)
stats.kstest(
    sepal_len,
    'norm',
    args=(np.mean(sepal_len), np.std(sepal_len)),
)
KstestResult(statistic=0.08945440179507252, pvalue=0.1706852358415618)
# Kolmogorov-Smirnov test of sepal width against a normal distribution
# parameterised by the sample mean and standard deviation. The recorded
# p-value is above 0.05, so normality is not rejected for this column.
sepal_width = np.array(sepal_width)
stats.kstest(
    sepal_width,
    'norm',
    args=(np.mean(sepal_width), np.std(sepal_width)),
)
KstestResult(statistic=0.10583307189330171, pvalue=0.0644929989865557)
# Kolmogorov-Smirnov test of petal length against a normal distribution
# parameterised by the sample mean and standard deviation. The recorded
# p-value is far below 0.05, so normality is rejected for this column.
petal_len = np.array(petal_len)
stats.kstest(
    petal_len,
    'norm',
    args=(np.mean(petal_len), np.std(petal_len)),
)
KstestResult(statistic=0.19894200836859716, pvalue=1.1160726642867047e-05)
# Kolmogorov-Smirnov test of petal width against a normal distribution
# parameterised by the sample mean and standard deviation. The recorded
# p-value is below 0.05, so normality is rejected for this column.
petal_width = np.array(petal_width)
stats.kstest(
    petal_width,
    'norm',
    args=(np.mean(petal_width), np.std(petal_width)),
)
KstestResult(statistic=0.1736414962992952, pvalue=0.0002000375783506314)