python dataframe 的高级操作
- 1、groupby 与 apply的结合使用
- 1)对dataframe进行分组,并将分组结果合并(某列多行变一行)
- 2) 对dataframe进行分组,并将分组结果合并后排序或将list转tuple,以及将某一值映射(贴标签)
- 2、groupby 与 pivot_table的区别与联系
- 3、堆叠柱状图及百分比堆叠柱状图
1、groupby 与 apply的结合使用
- groupby:主要用于分组聚合,可结合统计函数(mean()、min()、sum()、count()、、、)一起使用,跟聚合函数agg()类 似;
- apply:可以利用函数包括自定义函数迭代计算,可以结合lambda使用
1)对dataframe进行分组,并将分组结果合并(某列多行变一行)
df = pd.DataFrame(data=[[1,2,'book1'], [1, 2, 'book2'], [4, 5, 'book3']], columns=['ID', 'Day', 'title'])
print(df)
df.groupby(['ID', 'Day'])['title'].apply(list)

2) 对dataframe进行分组,并将分组结果合并后排序或将list转tuple,以及将某一值映射(贴标签)
test = pd.DataFrame(data=[['1',['物理', '历史', '数学']], ['2', ['历史', '物理', '数学']], ['3', ['历史', '语文', '数学']]], columns=['id', 'subject'])
print(test)
test['subject'].apply(lambda str : str.sort(), str)
print(test)
#test['km'] = test['km'].apply(lambda str : tuple(str))
test['subject_str'] = test['subject'].apply(lambda str : str[0] + '_' + str[1] + '_' + str[2])
print(test)
mapping = {'历史_语文_数学': 'a', '历史_数学_物理':'b', '历史_数学_语文': 'c'}
test['label'] = test['subject_str'].replace(mapping)
print(test)

2、groupby 与 pivot_table的区别与联系
df = pd.DataFrame({"A": ["foo", "foo", "foo", "foo", "foo","bar", "bar", "bar", "bar"],"B": ["one", "one", "one", "two", "two", "one", "one", "two", "two"],"C": ["small", "large", "large", "small","small", "large", "small", "small","large"],"D": [1, 2, 2, 3, 3, 4, 5, 6, 7], "E": [2, 4, 5, 5, 6, 6, 8, 9, 9]})
print(df)
print('-----------------------')
print('------pivot_table效果---数值sum--')
print(pd.pivot_table(df, values='D', index=['A'],columns=['C'], aggfunc=np.sum))
print('----行数len--')
print(pd.pivot_table(df, values='C', index=['A'],columns=['B'], aggfunc=len))
print('----字符串sum--')
print(pd.pivot_table(df, values='C', index=['A'],columns=['B'], aggfunc=np.sum))
print('-------groupby效果--------')
print(df.groupby(by=['A', 'C'])['D'].sum())
print("------groupby 展平效果-----")
temp = df.groupby(by=['A', 'C'])['D'].sum()
temp = temp.reset_index(level = ['A','C']).rename(columns={'D':'D_SUM'})
print(temp)
print('-----------------------')
print('-----pivot_table效果--数值sum----')
print(pd.pivot_table(df, values='D', index=['A','B'],columns=['C'], aggfunc=np.sum))
print('-----pivot_table效果并填充空值----数值sum---')
print(pd.pivot_table(df, values='D', index=['A','B'],columns=['C'], aggfunc=np.sum, fill_value=0))
print('------groupby效果-----')
print(df.groupby(by=['A', 'B', 'C'])['D'].sum())


3、堆叠柱状图及百分比堆叠柱状图
import matplotlib.pyplot as plt
graph_name = ['primId','adId','siteId','slotId','contentId','netType','age','gender','city',
'province','phoneType','carrier','billId','creativeType','intertype','firstClass','spreadAppId']
for x in graph_name:
graph_data = new_train_9.groupBy(x, 'label').agg({'label': 'count'}).toPandas()
graph_data = pd.pivot_table(graph_data, values='count(label)', index=[x],columns=['label'], aggfunc=np.sum, fill_value=0)
print(graph_data.head())
graph_data[[0,1]].plot(kind='bar', stacked=True, figsize=((16, 10)))
plt.show()

import numpy as np
import matplotlib.pyplot as plt
people = ('G1','G2','G3','G4','G5','G6','G7','G8')
segments = 4
# multi-dimensional data
data = [[ 3.40022085, 7.70632498, 6.4097905, 10.51648577, 7.5330039,
7.1123587, 12.77792868, 3.44773477],
[ 11.24811149, 5.03778215, 6.65808464, 12.32220677, 7.45964195,
6.79685302, 7.24578743, 3.69371847],
[ 3.94253354, 4.74763549, 11.73529246, 4.6465543, 12.9952182,
4.63832778, 11.16849999, 8.56883433],
[ 4.24409799, 12.71746612, 11.3772169, 9.00514257, 10.47084185,
10.97567589, 3.98287652, 8.80552122]]
percentages = (np.random.randint(5,20, (len(people), segments)))
print(percentages)
print(percentages[1,0])
y_pos = np.arange(len(people))
fig = plt.figure(figsize=(10,8))
ax = fig.add_subplot(111)
colors ='rgwm'
patch_handles = []
# left alignment of data starts at zero
left = np.zeros(len(people))
for i, d in enumerate(data):
patch_handles.append(ax.barh(y_pos, d,
color=colors[i%len(colors)], align='center',
left=left))
left += d
# search all of the bar segments and annotate
for j in range(len(patch_handles)):
print(j)
for i, patch in enumerate(patch_handles[j].get_children()):
# print(i)
bl = patch.get_xy()
x = 0.5*patch.get_width() + bl[0]
y = 0.5*patch.get_height() + bl[1]
ax.text(x,y, "%d%%" % (percentages[i,j]), ha='center')
ax.set_yticks(y_pos)
ax.set_yticklabels(people)
ax.set_xlabel('Scores')
plt.show()

import matplotlib.pyplot as plt
graph_name = ['primId','adId','siteId','slotId','contentId','netType','age','gender','city',
'province','phoneType','carrier','billId','creativeType','intertype','firstClass','spreadAppId']
for x in graph_name:
graph_data = new_train_9.groupBy(x, 'label').agg({'label': 'count'}).toPandas()
graph_data = pd.pivot_table(graph_data, values='count(label)', index=[x],columns=['label'], aggfunc=np.sum, fill_value=0)
print(graph_data.head())
# Create a figure with a single subplot
f, ax = plt.subplots(1, figsize=((16, 10)))
# Create the total ad counts for each participant
totals = [i+j for i,j in zip(graph_data[0], graph_data[1])]
# Create the percentage of the total ad counts the ad-colicks counts value for each participant was
unclick = [i / j * 100 for i,j in zip(graph_data[0], totals)]
click = [i / j * 100 for i,j in zip(graph_data[1], totals)]
ax.bar(graph_data.index,
# using unclick(0) data
unclick,
# labeled
label='0',
# with border color
edgecolor='white'
)
ax.bar(graph_data.index,
# using unclick(0) data
click,
# labeled
label='1',
# with unclick
bottom=unclick,
# with border color
edgecolor='white'
)
ax.set_ylabel("Percentage")
ax.set_xlabel(x)
plt.show()

import matplotlib.pyplot as plt
graph_name = ['primId','adId','siteId','slotId','contentId','netType','age','gender','city',
'province','phoneType','carrier','billId','creativeType','intertype','firstClass','spreadAppId']
for name in graph_name:
graph_data = new_train_9.groupBy(name, 'label').agg({'label': 'count'}).toPandas()
graph_data = pd.pivot_table(graph_data, values='count(label)', index=[name],columns=['label'], aggfunc=np.sum, fill_value=0)
print(graph_data.head())
# Create a figure with a single subplot
f, ax = plt.subplots(1, figsize=((16, 10)))
# Create the total ad counts for each participant
totals = [i+j for i,j in zip(graph_data[0], graph_data[1])]
# Create the percentage of the total ad counts the ad-colicks counts value for each participant was
unclick = [i / j * 100 for i,j in zip(graph_data[0], totals)]
click = [i / j * 100 for i,j in zip(graph_data[1], totals)]
patch_handles = []
patch_handles.append(ax.bar(graph_data.index,
# using unclick(0) data
unclick,
# labeled
label='0',
# with border color
edgecolor='white'))
patch_handles.append(ax.bar(graph_data.index,
# using unclick(0) data
click,
# labeled
label='1',
# with unclick
bottom=unclick,
# with border color
edgecolor='white'
))
percentages = []
percentages.append(unclick)
percentages.append(click)
# search all of the bar segments and annotate
for j in range(len(patch_handles)):
for i, patch in enumerate(patch_handles[j].get_children()):
bl = patch.get_xy()
x = 0.5*patch.get_width() + bl[0]
y = 0.5*patch.get_height() + bl[1]
ax.text(x,y, "%d%%" % (percentages[j][i]), ha='center')
ax.set_ylabel("Percentage")
ax.set_xlabel(name)
plt.show()
