Python 做数据分析有完整的工具链条.
往深, 可以实现 Deep Learning 的项目(Scikitlearn/Tensorflow)
往浅, 也可以实现表格(二维矩阵)的处理
本次即使用较浅的部分,处理表格(虽然只有一张, 但很长)
数据科学最小工具链
python | numpy | pandas | Matplotlib(/bokeh) | Scikit-Learn(/tensorflow) |
---|---|---|---|---|
list | array | matrix | plot | |
. | . | index,column,column,column | . | |
l=[,] | a=[[,] | index, | ||
. | [,]] | index, | ||
l | np.xxx | pd.dataframe(np.xxx) | plot.xxx.(pd.dataframe(np.xxx),x,x) | |
NumPy’s Structured Arrays | Pandas df operates like a structured array | Visualization with Matplotlib |
matplotlib 和 bokeh 选哪个?
- Jupyter 常见可视化框架的选择
- 我希望导出 html并且可交互, 故选择 bokeh.
- 内部显示的话, matplotlib 也很顺手, 有时会先用它展示, 再重新用 bokeh 写一次.
以需求为底, 逐步拆解到实现.
du_start,du_end,act_type,wday,wn4y
1491088707.0,1491088708.0,05.Career,6,13
1491088708.0,1491088865.0,07.social,6,13
- 转换后格式如下:
import pandas as pd
# Load the time log: one row per recorded activity span with the columns
# du_start, du_end, act_type, wday and wn4y.
df = pd.read_csv('/Users/liguanghe/atl4dama/src/_rp4lgh/df_isocalendar4lgh.csv')
df.head(3)  # preview the first rows; tail() previews the last rows and describe() prints summary statistics
# NOTE(review): df3 is only built further down, so this preview works only
# when the notebook cells are executed out of order.
df3[44:45]
# Length of every span (presumably seconds, since it is divided by 60/60
# later to get hours), computed column-wise rather than row by row.
df['du_time'] = df['du_end'] - df['du_start']
df.tail(1)
import numpy as np
# Activity labels in order of first appearance; each becomes a column of df3.
UniqueAct = df.act_type.unique()

# Hours per (week, activity) from a single grouped sum, instead of filtering
# the whole frame once for every activity/week pair.  Pairs without any
# record are filled with 0.0, which is what summing an empty selection gives.
hours_by_week = (
    (df.groupby(['wn4y', 'act_type'])['du_time'].sum() / 60 / 60)
    .round(1)
    .unstack('act_type', fill_value=0.0)
)

# One row per index label 0..51; column 0 simply repeats that label.
# Assignment aligns on the index, so rows whose wn4y is not in 0..51 are
# dropped and labels absent from the data stay NaN.
# NOTE(review): week numbers of 52 or more would be lost here - confirm.
ls = list(range(52))
df3 = pd.DataFrame(ls)
for act in UniqueAct:
    df3[act] = hours_by_week[act]
df3.tail(1)
有了按类型、按周汇总用时的矩阵 df3, 可根据自己希望的分类选择类型, 组建新表. 下面是数据中出现的全部类型:
df3.columns
Index([ 0, '05.Career',
'07.social', '07.social:networking',
'09.HealthFun:sport', '12.sleep:noonsleep',
'04.StudyGrow:reading', '11.traffic',
'04.StudyGrow:writer', '04.StudyGrow:ComputerThinking',
'09.HealthFun:fun', '08.familylife:washingbeauty',
'12.sleep', '08.familylife:families',
'08.familylife:dinner', '08.familylife:generalAffair',
'04.StudyGrow', '04.StudyGrow:law',
'08.familylife:finance', '09.HealthFun',
'09.HealthFun:fantasy'],
dtype='object')
# Sleep breakdown: ":" keeps every index label (all rows), the list names the
# columns to keep.
sleep_columns = ['12.sleep', '12.sleep:noonsleep']
sl = df3.loc[:, sleep_columns]
sl[44:45]
下面分别为 live, healthfun, input,output 的分类表格
# Per-category detail tables: one column per activity label.
fa = df3.loc[:, ['08.familylife:washingbeauty',
                 '08.familylife:generalAffair',
                 '08.familylife:dinner',
                 '08.familylife:families',
                 '08.familylife:finance']]
hf = df3.loc[:, ['09.HealthFun:sport',
                 '09.HealthFun:fun',
                 '09.HealthFun',
                 '09.HealthFun:fantasy']]
ip = df3.loc[:, ['04.StudyGrow:reading',
                 '07.social',
                 '07.social:networking']]
op = df3.loc[:, ['04.StudyGrow:ComputerThinking',
                 '04.StudyGrow:writer',
                 '05.Career',
                 '04.StudyGrow:law',
                 '04.StudyGrow']]

# Category totals per row.  Whole columns are added at once instead of
# calling a lambda for every row; the operands are kept in the same order,
# and a NaN in any operand still yields NaN for that row.
df3['Sleep'] = df3['12.sleep:noonsleep'] + df3['12.sleep']
df3['Live'] = (
    df3['08.familylife:washingbeauty']
    + df3['08.familylife:families']
    + df3['08.familylife:dinner']
    + df3['08.familylife:generalAffair']
    + df3['08.familylife:finance']
)
df3['HealthFun'] = (
    df3['09.HealthFun:sport']
    + df3['09.HealthFun']
    + df3['09.HealthFun:fantasy']
    + df3['09.HealthFun:fun']
)
df3['Input'] = (
    df3['04.StudyGrow:reading']
    + df3['07.social']
    + df3['07.social:networking']
)
df3['Output'] = (
    df3['04.StudyGrow:ComputerThinking']
    + df3['04.StudyGrow:writer']
    + df3['05.Career']
    + df3['04.StudyGrow:law']
    + df3['04.StudyGrow']
)

# Summary table holding only the five category totals.
ti = df3.loc[:, ['Output', 'Input', 'Sleep', 'Live', 'HealthFun']]
ti[44:45]
开篇分析了可视化框架的选择, 虽然本报告使用 bokeh, 但因 Matplotlib 是经典的 jupyter notebook 可视化框架, 这里快速展示一下, 其他可视化框架的原理都跟这差不多.
import matplotlib.pyplot as plt
# ti is the summary table; draw it as a bar chart with the columns of each
# row stacked on top of one another.  Binding the result keeps the notebook
# from echoing the returned object.
ax = ti.plot(kind='bar', stacked=True)
plt.show()  # display the figure
主要是其可导出为 .html, 这样可直接发布到公网, 无需再另行排版或上传图片取得链接等等.
且 bokeh 与 jupyter notebook 对接良好, 图片可直接在notebook 里预览.
# Two output modes are kept side by side; only the notebook one is active.
#from bokeh.io import show, output_file # alternative: render the figures into a standalone HTML page
from bokeh.io import output_notebook, show # render the figures inline in the Jupyter notebook (.ipynb)
output_notebook()
# Target file for the HTML mode (disabled together with the import above).
#output_file('/Users/liguanghe/liguanghe.github.io/source/_posts/TimeReport.html', title = 'Time Report')
bokeh 也可以直接处理 矩阵, 比 matplotlib 多一步
from bokeh.models import ColumnDataSource
# Further down the frames are wrapped as source3 = ColumnDataSource(ti), source1 = ColumnDataSource(op), etc.
# NOTE(review): the statements below look like excerpts of the full plotting
# code that follows; figure, value and source3 are not imported or defined
# until after this point, so they fail if run top to bottom.
pt = figure(title='5 types')
pt.vbar_stack(ti.columns, x = ti.index,width = 0.9,color=Spectral5,source = source3,
legend=[value(x) for x in ti.columns])
pt.legend.location = "top_left"
from bokeh.palettes import GnBu5,Greens3,Spectral5,Oranges5,Reds4
# NOTE(review): the next line is an unterminated fragment (closing bracket missing).
legend=[value(x) for x in ti.columns
from bokeh.plotting import figure
from bokeh.palettes import GnBu5,Greens3,Spectral5,Oranges5,Reds4
from bokeh.core.properties import value
WIDTH = 500   # used as width= by the Div text panels later in the file
HEIGHT = 300  # height of the per-category figures


def _stacked_bars(frame, source, palette, **figure_kwargs):
    """Return a figure with the columns of `frame` drawn as stacked bars.

    frame         -- DataFrame; one stack layer per column, one bar per index label
    source        -- ColumnDataSource built from `frame`
    palette       -- one colour per column of `frame`
    figure_kwargs -- forwarded unchanged to figure() (e.g. title, height)
    """
    plot = figure(**figure_kwargs)
    plot.vbar_stack(frame.columns, x=frame.index, width=0.9, color=palette,
                    source=source,
                    legend=[value(x) for x in frame.columns])
    plot.legend.location = "top_left"
    return plot


# Overview of the five category totals.
source3 = ColumnDataSource(ti)
pt = _stacked_bars(ti, source3, Spectral5, title='5 types')

# Output detail.
source1 = ColumnDataSource(op)
po = _stacked_bars(op, source1, GnBu5, height=HEIGHT)

# Input detail.
source2 = ColumnDataSource(ip)
pi = _stacked_bars(ip, source2, Greens3, height=HEIGHT, title='Input')

# Sleep detail; two hand-picked colours for the two sleep columns.
colors = ['#ffffcc','#FDE724' ]
source4 = ColumnDataSource(sl)
ps = _stacked_bars(sl, source4, colors, height=HEIGHT, title='sleep')

# Live detail.
source5 = ColumnDataSource(fa)
pf = _stacked_bars(fa, source5, Oranges5, height=HEIGHT, title='live')

# HealthFun detail.
source6 = ColumnDataSource(hf)
ph = _stacked_bars(hf, source6, Reds4, height=HEIGHT, title='healthfun')
Div(text=''' ''')
中. 可赋值长度和宽度
from bokeh.layouts import widgetbox
from bokeh.models.widgets import Div
# Text panels of the report, one Div per section.
# NOTE(review): WIDTH and cvsf must already be defined when this runs; neither
# is assigned by an active statement in the visible part of the file.

# t0: report heading and general explanation.
t0 = Div(text="""
Time Report
Each Nature week has 168 hours(24h*7d=168h).
Hours in this form are a little more or less.
One reason is some time may not be recorded,the other reason
is an event time may cross two weeks.
Each bar in the plot nearly touchs 168(y),
this shows I record all time-spent every week.
5 big types
Label daily action to 5 big types, like the plot shows.
""",width=WIDTH, height=200)
from lgh7SumFrequency import frequency
# t1: Output section; appends the frequency() sentence for two activity labels.
t1 = Div(text='''Output ↑
'''
+frequency(cvsf,'04.StudyGrow:ComputerThinking')+'''\n
'''
+frequency(cvsf,'04.StudyGrow:writer')+'''\n04.StudyGrow shows curious.
''',width=WIDTH, height=100)
# t2: Input section; appends the frequency() sentence for reading.
t2 = Div(text='''Input ↑
Reading without output is input.
Meaningly social is in this part. Others belong to live:familes.\n
'''
+frequency(cvsf,'04.StudyGrow:reading'),width=WIDTH, height=100)
# t3: Sleep section (static text only).
t3 = Div(text='''Sleep ↓
sleep < 59h
''',width=WIDTH, height=100)
# t4: Live section; appends the frequency() sentence for washingbeauty.
t4 = Div(text='''Live
08.familylife:familes includes other social.
'''
+frequency(cvsf,'08.familylife:washingbeauty'),width=WIDTH, height=100)
# t5: HealthFun section; appends the frequency() sentence for sport.
t5 = Div(text='''HealthFun
'''+frequency(cvsf,'09.HealthFun:sport')+'''\n09.Healthfun shows curious.
09.Healthfun:fun should be down.
''',width=WIDTH, height=100,)
from bokeh.io import save  # save() is called below but was never imported
from bokeh.layouts import gridplot
# One grid row per element: each text panel is followed by its figure.
grid = gridplot([[widgetbox(t0)],[pt],[t1], [po],[t2],[pi],[t3],[ps],[t4],[pf],[t5],[ph]])
save(grid)
show(grid)
即可生成 HTML, 输出成果见time report
import pandas as pd
import numpy as np
#import matplotlib.pyplot as plt
# Rows labelled '09.HealthFun:sport', re-indexed by wn4y; the only column
# left is wday.
u = (
    df.loc[:, ['act_type', 'wday', 'wn4y']]
    .set_index('act_type')
    .loc['09.HealthFun:sport']
    .set_index('wn4y')
)
weeks = u.index.unique()

# Long table with one row per (week, day 0..6): 'you' is 0 when that day
# occurs among the week's wday values and 1 when it does not.
yo = []
we = []
da = []
for week in weeks:
    days_seen = u.loc[week].values  # looked up once per week, not once per day
    for day in range(7):
        yo.append(0 if day in days_seen else 1)
        we.append(week)
        da.append(day)
d = {'you': yo,
     'week': we,
     'day': da}
y = pd.DataFrame(d).set_index('week')

# Wide table: rows are days 0..6, one column per week (column 0 repeats the
# day number).
ls = list(range(7))
yu = pd.DataFrame(ls)
for week in weeks:
    yu[week] = y.loc[week].set_index('day')
(yu.T).loc[30:33]
for we in u.index.unique():
yu[we] = (y.loc[we]).set_index('day')
''' sum = 0 / sum = n, n is not sport last week'''
if (yu.sum(axis = 0)<=0)[-1:].values == True:
return stype + ' everyday.'
else:
return str(yu.sum(axis = 0).iloc[-1]) + " days didn't "+ stype +' last week.'
frequency(cvsf,'04.StudyGrow:ComputerThinking')
import sys
import pandas as pd
import numpy as np
def frequency(cvsf, stype):
    """Report on how many of the days 0-6 an activity is missing in a week.

    Args:
        cvsf: Path (or anything ``pandas.read_csv`` accepts) of a CSV file
            with at least the columns ``act_type``, ``wday`` and ``wn4y``.
        stype: Exact ``act_type`` label to look up.

    Returns:
        ``"<stype> everyday."`` when every day 0..6 occurs for that label in
        the week examined, otherwise ``"<stype> <n> days off last week."``
        where ``n`` is the number of missing days.  The week examined is the
        last distinct ``wn4y`` value, in file order, among the rows carrying
        ``stype``.  When no row carries ``stype`` a hint is printed and
        ``None`` is returned.
    """
    df = pd.read_csv(cvsf)

    # Compare the column values themselves.  Searching the printed form of
    # the column would miss labels hidden by pandas' display truncation on
    # long files and would accept mere substrings of a label.
    rows = df.loc[df['act_type'] == stype, ['wday', 'wn4y']]
    if rows.empty:
        print('please input one act_type in your data')
        return None

    # unique() keeps order of first appearance, so the last entry is the
    # most recently introduced week for this label.
    last_week = rows['wn4y'].unique()[-1]
    days_done = set(rows.loc[rows['wn4y'] == last_week, 'wday'])
    days_off = sum(1 for day in range(7) if day not in days_done)

    if days_off == 0:
        return stype + ' everyday.'
    return stype + ' ' + str(days_off) + " days off last week."
if __name__ == '__main__':
    # Command-line use: <script> <csv_path> <act_type>
    # __version__ is not defined in the visible part of this file; fall back
    # to a placeholder instead of failing with NameError.
    print(globals().get('__version__', 'unknown'))
    if len(sys.argv) != 3:
        sys.exit('usage: ' + sys.argv[0] + ' <csv_path> <act_type>')
    _csv = sys.argv[1]
    _ty = sys.argv[2]
    _report = frequency(_csv, _ty)
    # frequency() returns None after printing its own hint for an unknown
    # label; otherwise show the sentence it produced.
    if _report is not None:
        print(_report)