"""
Created on Fri Oct 4 20:41:20 2019
@author: shenlong
"""
import os
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
"""
代码说明:
programmer_1: 制作箱线图
data.boxplot-->数据转为箱线图的字典格式
plt.annotate-->绘图
programmer_2: 计算数据
range-->极差
var-->方差
dis-->四分距
programmer_3: 画出盈利图(比例和数值)
programmer_4: 计算成对相关性
data.corr()-->dataframe中相互之间的相关性
data.corr()[u'百合酱蒸凤爪'] -->dataframe某一项与其他项的相关性
"""
def programmer_1(file_name):
    """Draw a box plot of the sales workbook and label every outlier.

    Args:
        file_name: Path of an Excel file that contains a u'日期' column,
            which is used as the index.

    The outlier ("flier") markers of the first box are read back from the
    artists returned by ``DataFrame.boxplot`` and each one is annotated with
    its value. The figure is shown with ``plt.show()``; nothing is returned.
    """
    sales = pd.read_excel(file_name, index_col=u'日期')
    plt.figure()
    artists = sales.boxplot(return_type='dict')
    first_fliers = artists['fliers'][0]
    xs = first_fliers.get_xdata()
    # Sorted so that consecutive labels can be compared with their neighbour.
    ys = np.sort(first_fliers.get_ydata())
    for idx, x_pos in enumerate(xs):
        if idx == 0:
            # No previous outlier to compare with: use a fixed pseudo-gap,
            # which yields a horizontal offset of roughly 0.08.
            gap = -78 / 3
        else:
            gap = ys[idx] - ys[idx - 1]
        # The closer two outliers are, the further left the label is pushed,
        # which keeps neighbouring labels from overlapping.
        label_x = x_pos + 0.05 - 0.8 / gap
        plt.annotate(ys[idx], xy=(x_pos, ys[idx]), xytext=(label_x, ys[idx]))
    plt.show()
def programmer_2(file_name):
    """Print summary statistics of the u'销量' column after removing outliers.

    Args:
        file_name: Path of an Excel file that contains a u'日期' column
            (used as the index) and a u'销量' column.

    Rows whose u'销量' value is not strictly between 400 and 5000 are dropped.
    The ``describe()`` output for the column is then extended with:

    * ``range`` - max minus min
    * ``var``   - coefficient of variation (std divided by mean)
    * ``dis``   - interquartile range (75% minus 25%)

    The resulting Series is printed; nothing is returned.
    """
    data = pd.read_excel(file_name, index_col=u'日期')
    sales = data[u'销量']
    # Both comparisons must be parenthesized: `&` binds tighter than `<`, so
    # `(sales > 400) & sales < 5000` would be evaluated as
    # `((sales > 400) & sales) < 5000` and never apply the upper bound.
    data = data[(sales > 400) & (sales < 5000)]
    statistics = data.describe()[u'销量']
    statistics['range'] = statistics['max'] - statistics['min']
    statistics['var'] = statistics['std'] / statistics['mean']
    statistics['dis'] = statistics['75%'] - statistics['25%']
    print(statistics)
def programmer_3(file_name):
    """Draw a Pareto-style chart of dish profit.

    Args:
        file_name: Path of an Excel file that contains a u'菜品名' column
            (used as the index) and a u'盈利' column.

    Bars show the profit of each dish in descending order; a red line on a
    secondary axis shows the cumulative share of total profit. The point at
    position 6 (the seventh dish) is annotated with its cumulative share.
    The figure is shown with ``plt.show()``; nothing is returned.
    """
    data = pd.read_excel(file_name, index_col=u'菜品名')
    # sort_values() returns a new Series, so the result has to be kept;
    # calling it without assignment leaves the data unsorted.
    profit = data[u'盈利'].sort_values(ascending=False)
    plt.figure()
    profit.plot(kind='bar')
    plt.ylabel(u'盈利(元)')
    cumulative_share = profit.cumsum() / profit.sum()
    cumulative_share.plot(color='r', secondary_y=True, style='-o', linewidth=2)
    # The index holds dish names, so the seventh entry must be taken by
    # position (.iloc) rather than with an integer key.
    # NOTE(review): position 6 is hard-coded; this raises IndexError when the
    # workbook has fewer than seven dishes.
    marked = cumulative_share.iloc[6]
    plt.annotate(
        format(marked, '.4%'),
        xy=(6, marked),
        xytext=(6 * 0.9, marked * 0.9),
        arrowprops=dict(arrowstyle="->", connectionstyle="arc3,rad=.2"))
    plt.ylabel(u'盈利(比例)')
    plt.show()
def programmer_4(file_name):
    """Compute correlations between the dish columns of a sales workbook.

    Args:
        file_name: Path of an Excel file that contains a u'日期' column
            (used as the index) plus the u'百合酱蒸凤爪' and u'翡翠蒸香茜饺'
            columns.

    Three correlations are evaluated: the full pairwise matrix, one dish
    against all others, and one dish against a single other dish. None of the
    results is stored, printed or returned.
    """
    target = u'百合酱蒸凤爪'
    other = u'翡翠蒸香茜饺'
    dishes = pd.read_excel(file_name, index_col=u'日期')
    # Full pairwise correlation matrix.
    dishes.corr()
    # Correlation of the target dish with every column.
    dishes.corr()[target]
    # Correlation between exactly two dishes.
    dishes[target].corr(dishes[other])
if __name__ == "__main__":
    # Script entry point: only records the current working directory; none of
    # the programmer_* functions is invoked here.
    path = os.getcwd()
import pandas as pd
import numpy as np
'''
#餐饮数据
catering_sale='catering_sale.xls'#将文件路径命名
#读取数据,指定日期为索引列
df=pd.read_excel(catering_sale,index_col=u'日期')
#首先可以先使用describe()函数查看数据的基本情况:
df.describe()
#检测异常值的方法可以使用箱型图:
import matplotlib.pyplot as plt
# 用来正常显示中文标签
# plt.rcParams['dont.sans-serif'] = ['SimHei']
plt.rcParams['font.sans-serif'] = ['SimHei'] # 用来正常显示中文标签
# # 用来正常显示负号
plt.rcParams['axes.unicode_minus'] = False
plt.figure()
'''
'''
画箱型图,这里画箱型图有两种方法:
1. 一种是直接调用DataFrame的boxplot();
2. 另一种是调用Series或者DataFrame的plot()方法,并用kind参数指定箱型图(box);
'''
'''
p = df.boxplot(return_type='dict')
# 'fliers' is the dict key under which the outlier markers are stored
x = p['fliers'][0].get_xdata()
y = p['fliers'][0].get_ydata()
y.sort()
for i in range(len(x)):
if i > 0:
plt.annotate(y[i],xy = (x[i],y[i]), xytext = (x[i]+0.05-0.8/(y[i]-y[i-1]),y[i]))
else:
plt.annotate(y[i],xy = (x[i],y[i]),xytext = (x[i]+0.08,y[i]))
plt.show()
'''
'''
上下的两个标线表示的是上界和下界(四分位点),超过上下界的值就是异常,但是那其中
几个散点离上下界比较近,所以可以把865,4060.3,4065.2归为正常值,将22,51,60,
6607.4,9106.44归为异常值;
'''
'''
#找到异常值
'''
'''数据特征分析
对于定量数据可以通过绘制频率分布表,绘制频率分布直方图,茎叶图的方式进行直观的分析;
对于定性分类的数据,可以使用饼图和条形图的方式来查看显示分布情况;
统计量分析
极差反映了最大值和最小值的分布情况;
标准差用来度量数据偏离均值的程度;
变异系数度量标准差相对于均值的离中趋势;
四分位数间距表示上下四分位数之差,越大表示变异程度越大;
'''
'''
# 过滤异常数据
df = df[(df[u'销量'] > 400)&(df[u'销量'] < 5000)]
statistics = df.describe()
print('过滤掉异常数据之后的数据情况:\n',statistics)
# 极差
statistics.loc['range'] = statistics.loc['max'] - statistics.loc['min']
print('极差是:\n',statistics.loc['range'])
# 变异系数
statistics.loc['var'] = statistics.loc['std'] / statistics.loc['mean']
print('变异系数是:\n',statistics.loc['var'])
# 四分位数间距
statistics.loc['dis'] = statistics.loc['75%'] - statistics.loc['25%']
print('四分位数间距:\n',statistics.loc['dis'])
#贡献度分析
# 初始化参数
dish_profit = 'catering_dish_profit.xls'
df = pd.read_excel(dish_profit,index_col = u'菜品名')
df = df[u'盈利'].copy()
# 表示按降序排列
df.sort_values(ascending=False)
# 导入图像库
import matplotlib.pyplot as plt
plt.rcParams['font.sans-serif'] = ['SimHei'] # 用来正常显示中文标签
plt.rcParams['axes.unicode_minus'] = False #用来正常显示负号
plt.figure()
# 显示直方图
df.plot(kind='bar')
plt.ylabel(u'盈利(元)')
p = 1.0 * df.cumsum()/df.sum()
p.plot(color = 'r',secondary_y=True,style='-o',linewidth=2)
# 添加注释,即85%处的标记,这里包括了指定箭头样式
plt.annotate(format(p[6],'.4%'),xy=(6,p[6]),xytext=(6*0.9,p[6]*0.9),arrowprops=dict(arrowstyle="->",connectionstyle="arc3,rad=.2"))
plt.show()
# 相关性分析
catering_sale = 'catering_sale_all.xls'
# 读取数据,指定'日期'列为索引列
data = pd.read_excel(catering_sale,index_col = u'日期')
# 相关系数矩阵,即给出了任意两款菜式之间的相关系数
data.corr()
# Show only the correlation coefficients between "百合酱蒸凤爪" and the other dishes
data.corr()[u'百合酱蒸凤爪']
# 计算两者的相关系数
data[u'百合酱蒸凤爪'].corr(data[u'翡翠蒸香茜饺'])
#计算两个列向量的相关系数:
# 计算两个列的相关系数
# 生成样本D,一行为1-7,一行为2-8
'''
# Build a two-row sample frame: row 0 holds 1..7 and row 1 holds 2..8.
D = pd.DataFrame([range(1,8),range(2,9)])
print(D)
# Pull each row out as a Series so the two rows can be correlated.
S1 = D.loc[0]
print(S1)
S2 = D.loc[1]
print(S2)
# Correlation between the two rows using the pearson, kendall and spearman methods.
print(S1.corr(S2,method='pearson'))
print(S1.corr(S2,method='kendall'))
print(S1.corr(S2,method='spearman'))
# Covariance matrix of a 6x5 frame of random normal samples, followed by the
# covariance between its first two columns. No seed is set in this section.
D = pd.DataFrame(np.random.randn(6,5))
print(D.cov())
print('第一列和第二列的协方差:\n',D[0].cov(D[1]))
# Fresh random frame: per-column skewness and kurtosis.
D = pd.DataFrame(np.random.randn(6,5))
print('偏度:\n',D.skew())
print('峰度:\n',D.kurt())
# D is rebound to the Series 0..19 and its running total is printed.
D = pd.Series(range(0,20))
print(D)
print('前n项和:\n',D.cumsum(0))
import matplotlib.pyplot as plt
# Use the SimHei font so Chinese labels render, and keep the minus sign displayable.
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
plt.figure(figsize = (7,5))
import numpy as np
# 50 evenly spaced samples of sin(x) between 0 and 2*pi.
x = np.linspace(0, 2*np.pi, 50)
y = np.sin(x)
# Format string 'bp--': blue, pentagon markers, dashed line.
plt.plot(x, y, 'bp--')
# NOTE(review): plot() above is given no label=, so this legend has no entries - confirm intent.
plt.legend()
plt.title('6不6')
plt.xlabel('游戏人生')
plt.ylabel('努力加油')
plt.show()
import matplotlib.pyplot as plt
# Pie chart demo: four labelled slices with matching sizes and colours.
labels = 'Frogs','Hogs','Dogs','Logs'
sizes = [15, 30, 45, 10]
colors = ['yellowgreen', 'gold', 'lightskyblue', 'lightcoral']
# Non-zero entries offset the 2nd and 4th slices from the centre.
explode = (0,0.09,0,0.09)
plt.pie(sizes,explode=explode, labels=labels, colors=colors, autopct='%1.1f%%',
shadow=True, startangle=90)
# Equal axis scaling so the pie is drawn as a circle.
plt.axis('equal')
plt.title('西瓜圆不圆')
plt.show()
import matplotlib.pyplot as plt
import numpy as np
# Histogram demo: seed the global NumPy RNG so the sample is reproducible.
# NOTE(review): np.random.seed() returns None, so `seed` only ever holds None.
seed=np.random.seed(10)
x = np.random.randn(1000)
# 1000 normal samples drawn into 10 bins.
plt.hist(x, 10)
plt.show()
'''
绘制箱型图的两种方法:
1. 直接调用DataFrame的boxplot()方法;
2. 调用Series或者DataFrame的plot()方法,并用kind参数指定箱型图(box);
'''
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# Box plot demo using plot(kind='box') instead of DataFrame.boxplot().
# NOTE(review): np.random.seed() returns None, so `seed_` only ever holds None.
seed_=np.random.seed(100)
x = np.random.randn(1000)
# Two columns after the transpose: the sample and the same sample shifted by 1.
D = pd.DataFrame([x,x+1]).T
D.plot(kind = 'box')
plt.title('人生苦短,我用python')
plt.show()
import matplotlib.pyplot as plt
# Use the SimHei font so Chinese labels render, and keep the minus sign displayable.
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
import numpy as np
import pandas as pd
# exp(0) .. exp(19): plotted first on a linear y axis, then on a logarithmic one.
x = pd.Series(np.exp(np.arange(20)))
x.plot(label = u'原始数据图',legend = True)
plt.show()
x.plot(logy = True, label = u'对数数据图',legend = True)
plt.show()
import matplotlib.pyplot as plt
# Use the SimHei font so Chinese labels render, and keep the minus sign displayable.
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
import numpy as np
import pandas as pd
# Seeded so the random error values are the same on every run.
np.random.seed(11)
error = np.random.randn(10)
y = pd.Series(np.sin(np.arange(10)))
# sin(0..9) plotted with the random values passed as y error bars.
y.plot(yerr = error)
plt.show()
'''
数据清洗
缺失值处理
缺失值处理一般采用:均值/中位数/众数插补,使用固定值,最近邻值插补,回归法,插值法
拉格朗日插值:
'''
import pandas as pd
from scipy.interpolate import lagrange
import xlwt
# Source workbook with the raw sales figures and the destination for the cleaned copy.
inputfile = 'catering_sale.xls'
outputfile = './sales.xls'
data = pd.read_excel(inputfile)
# Blank out sales values outside [400, 5000] so they count as missing values.
# A single .loc assignment is used instead of chained indexing
# (data[col][mask] = ...), which may write to a temporary copy and leave
# `data` unchanged.
data.loc[(data[u'销量'] < 400) | (data[u'销量'] > 5000), u'销量'] = None
def ployinterp_column(s, n, k=5):
    """Estimate the value of ``s`` at label ``n`` by Lagrange interpolation.

    Args:
        s: Series to interpolate. The integers ``n - k .. n + k`` are used as
            index labels (assumes a default 0..len-1 integer index - confirm
            against the caller).
        n: Label of the entry to estimate.
        k: Number of neighbours taken on each side of ``n`` (default 5).

    Returns:
        The Lagrange polynomial through the non-null neighbours, evaluated
        at ``n``.
    """
    neighbours = list(range(n - k, n)) + list(range(n + 1, n + 1 + k))
    # reindex() yields NaN for labels that fall outside the index (windows
    # near either end of the series); s[neighbours] would raise KeyError there.
    y = s.reindex(neighbours)
    # Only real observations may define the polynomial.
    y = y[y.notnull()]
    return lagrange(y.index, list(y))(n)
# Fill every missing cell, column by column, with a Lagrange estimate built
# from its neighbours, then write the cleaned table out.
for i in data.columns:
    # The null mask is taken once per column; filling one row never changes
    # whether a later row was missing.
    for j in data.index[data[i].isnull()]:
        # .loc writes into `data` itself; the chained form data[i][j] = ...
        # may assign to a temporary copy. data[i] is re-read on each pass so
        # that values filled earlier take part in later estimates.
        data.loc[j, i] = ployinterp_column(data[i], j)
data.to_excel(outputfile)
import pandas as pd
import numpy as np
# Normalization demo: the workbook is read without a header row.
# The three expressions below are evaluated but their results are not stored.
datafile = 'normalization_data.xls'
data = pd.read_excel(datafile,header = None)
# Min-max scaling: (x - min) / (max - min) per column.
(data - data.min())/(data.max() - data.min())
# Z-score standardization: (x - mean) / std per column.
(data - data.mean())/data.std()
# Decimal scaling: divide by 10 raised to ceil(log10(largest absolute value)) per column.
data/10**np.ceil(np.log10(data.abs().max()))
def cluster_plot(d, k):
    """Plot the values of each discretization bin on its own horizontal row.

    Args:
        d: Bin label (0 .. k-1) assigned to each value; compared against each
            bin number to select that bin's members.
        k: Number of bins.

    Returns:
        The ``matplotlib.pyplot`` module, so that the caller can chain
        ``.show()``.

    The plotted values come from the module-level ``data`` object, not from
    an argument.
    """
    import matplotlib.pyplot as plt
    plt.rcParams['font.sans-serif'] = ['SimHei']
    plt.rcParams['axes.unicode_minus'] = False
    plt.figure(figsize=(8, 3))
    for label in range(k):
        in_bin = d == label
        # One marker per member, all drawn at height `label`.
        heights = [label] * len(d[in_bin])
        plt.plot(data[in_bin], heights, 'o')
    plt.ylim(-0.5, k - 0.5)
    return plt
import pandas as pd
# Discretize one numeric column into k bins with three different strategies
# and plot each result.
datafile = './data/discretization_data.xls'
data = pd.read_excel(datafile)
data = data[u'肝气郁结证型系数'].copy()
k = 4
# 1) Equal-width binning.
d1 = pd.cut(data, k, labels= range(k))
# 2) Equal-frequency binning: the bin edges are the 0%, 25%, ..., 100%
# quantiles, which are rows 4 .. 4+k of the describe() output.
w = [1.0*i/k for i in range(k+1)]
w = data.describe(percentiles = w).iloc[4:4+k+1]
# describe() is indexed by strings ('0%', '25%', ...), so the first edge is
# addressed by position. It is shrunk slightly so that the minimum value is
# not left outside the first bin.
w.iloc[0] = w.iloc[0] * (1-1e-10)
d2 = pd.cut(data, w, labels = range(k))
# 3) Clustering-based binning: the edges are the midpoints between adjacent
# sorted one-dimensional K-Means centres.
from sklearn.cluster import KMeans
# n_jobs is no longer accepted by KMeans in current scikit-learn releases and
# only controlled parallelism, so it is omitted.
kmodel = KMeans(n_clusters = k)
kmodel.fit(data.values.reshape((len(data), 1)))
c = pd.DataFrame(kmodel.cluster_centers_).sort_values(by=0)
w = c.rolling(2).mean().iloc[1:]
# Outer edges: 0 and the data maximum.
w = [0] + list(w[0]) + [data.max()]
d3 = pd.cut(data, w, labels = range(k))
cluster_plot(d1, k).show()
cluster_plot(d2, k).show()
cluster_plot(d3, k).show()
import pandas as pd
# Feature construction: derive a new column from two existing ones.
inputfile = './data/electricity_data.xls'
outputfile = './electricity_data.xls'
data = pd.read_excel(inputfile)
# New column u'线损率' = (u'供入电量' - u'供出电量') / u'供入电量'.
data[u'线损率'] = (data[u'供入电量'] - data[u'供出电量'])/data[u'供入电量']
# Written without the index column.
data.to_excel(outputfile, index = False)
# PCA demo: the workbook is read without a header row.
inputfile = './data/principal_component.xls'
outputfile = './dimention_reducted.xls'
data = pd.read_excel(inputfile, header = None)
from sklearn.decomposition import PCA
# First fit with the default number of components; the two attribute reads
# below are evaluated but their values are not stored.
pca = PCA()
pca.fit(data)
pca.components_
pca.explained_variance_ratio_
# Refit with 3 components, project the data and save the projection.
pca = PCA(3)
pca.fit(data)
low_d = pca.transform(data)
pd.DataFrame(low_d).to_excel(outputfile)
# Map the projected data back to the original feature space (result discarded).
pca.inverse_transform(low_d)
# Distinct values of a Series, via the pandas method and via NumPy.
# Neither result is stored or printed.
D = pd.Series([1,1,2,3,5])
D.unique()
np.unique(D)