python dataframe 的高级操作

python dataframe 的高级操作

  • 1、groupby 与 apply的结合使用
            • 1)对dataframe进行分组,并将分组结果合并(某列多行变一行)
            • 2) 对dataframe进行分组,并将分组结果合并后排序或将list转tuple,以及将某一值映射(贴标签)
  • 2、groupby 与 pivot_table的区别与联系
  • 3、堆叠柱状图及百分比堆叠柱状图

1、groupby 与 apply的结合使用

  • groupby:主要用于分组聚合,可结合统计函数(mean()、min()、sum()、count() 等)一起使用,用法与聚合函数 agg() 类似;
  • apply:对每个元素(或每个分组)逐一调用给定的函数进行计算,该函数可以是内置函数、自定义函数,也可以结合 lambda 表达式使用。
1)对dataframe进行分组,并将分组结果合并(某列多行变一行)
import pandas as pd

# Demo: collapse several rows of one column into a single row per group.
# The first two rows share the same (ID, Day) key, the third has its own.
df = pd.DataFrame(
    data=[[1, 2, 'book1'], [1, 2, 'book2'], [4, 5, 'book3']],
    columns=['ID', 'Day', 'title'],
)
print(df)

# For every (ID, Day) group, gather the 'title' values into one Python list.
# The result is bound to a name and printed explicitly: a bare expression is
# only echoed in a REPL/notebook and would be silently dropped in a script.
titles_by_key = df.groupby(['ID', 'Day'])['title'].apply(list)
print(titles_by_key)

python dataframe 的高级操作_第1张图片

2) 对dataframe进行分组,并将分组结果合并后排序或将list转tuple,以及将某一值映射(贴标签)
import pandas as pd

# Demo: normalise a list-valued column (sort it), flatten it to a string key,
# then map that key to a label.
test = pd.DataFrame(
    data=[
        ['1', ['物理', '历史', '数学']],
        ['2', ['历史', '物理', '数学']],
        ['3', ['历史', '语文', '数学']],
    ],
    columns=['id', 'subject'],
)
print(test)

# Sort every subject list so that the same set of subjects always yields the
# same ordering. sorted() returns a new list, so the result is assigned back
# instead of relying on list.sort() mutating the cells as a side effect.
test['subject'] = test['subject'].apply(lambda subjects: sorted(subjects))
print(test)

# To turn each list into a (hashable) tuple instead, use:
# test['subject'] = test['subject'].apply(lambda subjects: tuple(subjects))

# Join the sorted subjects into a single '_'-separated key; str.join works
# for any number of subjects, not just exactly three.
test['subject_str'] = test['subject'].apply(lambda subjects: '_'.join(subjects))
print(test)

# Keys are matched against the joined string built from the SORTED list.
# NOTE(review): '历史_语文_数学' is not in sorted order, so it can never match
# a value produced above; it is kept unchanged from the original mapping.
mapping = {'历史_语文_数学': 'a', '历史_数学_物理':'b', '历史_数学_语文': 'c'}
# replace() leaves values without a mapping entry untouched.
test['label'] = test['subject_str'].replace(mapping)
print(test)

python dataframe 的高级操作_第2张图片

2、groupby 与 pivot_table的区别与联系

import pandas as pd

# Demo: the same aggregation expressed with pivot_table (wide result, one
# column per value of `columns`) and with groupby (long result, MultiIndex).
# Aggregations are passed by name ('sum') rather than as np.sum: pandas maps
# both to the same groupby sum, but the callable form emits a FutureWarning
# on pandas 2.x and would need numpy imported here.
df = pd.DataFrame({
    "A": ["foo", "foo", "foo", "foo", "foo", "bar", "bar", "bar", "bar"],
    "B": ["one", "one", "one", "two", "two", "one", "one", "two", "two"],
    "C": ["small", "large", "large", "small", "small", "large", "small", "small", "large"],
    "D": [1, 2, 2, 3, 3, 4, 5, 6, 7],
    "E": [2, 4, 5, 5, 6, 6, 8, 9, 9],
})
print(df)
print('-----------------------')
print('------pivot_table效果---数值sum--')
print(pd.pivot_table(df, values='D', index=['A'], columns=['C'], aggfunc='sum'))
print('----行数len--')
# len counts the rows that fall into each (A, B) cell.
print(pd.pivot_table(df, values='C', index=['A'], columns=['B'], aggfunc=len))
print('----字符串sum--')
# Summing a string column concatenates the strings of each cell.
print(pd.pivot_table(df, values='C', index=['A'], columns=['B'], aggfunc='sum'))
print('-------groupby效果--------')
print(df.groupby(by=['A', 'C'])['D'].sum())
print("------groupby 展平效果-----")
temp = df.groupby(by=['A', 'C'])['D'].sum()
# Move the group keys out of the index back into ordinary columns and give
# the aggregated column an explicit name.
temp = temp.reset_index(level=['A', 'C']).rename(columns={'D': 'D_SUM'})
print(temp)
print('-----------------------')
print('-----pivot_table效果--数值sum----')
print(pd.pivot_table(df, values='D', index=['A', 'B'], columns=['C'], aggfunc='sum'))
print('-----pivot_table效果并填充空值----数值sum---')
# fill_value replaces the NaN of (A, B, C) combinations that have no rows.
print(pd.pivot_table(df, values='D', index=['A', 'B'], columns=['C'], aggfunc='sum', fill_value=0))
print('------groupby效果-----')
print(df.groupby(by=['A', 'B', 'C'])['D'].sum())

python dataframe 的高级操作_第3张图片
python dataframe 的高级操作_第4张图片

3、堆叠柱状图及百分比堆叠柱状图

import matplotlib.pyplot as plt
import pandas as pd

# Draw one stacked bar chart per feature: for every value of the feature,
# stack the number of rows with label 0 and label 1.
graph_name = ['primId','adId','siteId','slotId','contentId','netType','age','gender','city',
              'province','phoneType','carrier','billId','creativeType','intertype','firstClass','spreadAppId']
for name in graph_name:
    # NOTE(review): new_train_9 is not defined in this snippet; the
    # groupBy/agg/toPandas chain suggests a Spark DataFrame -- confirm.
    graph_data = new_train_9.groupBy(name, 'label').agg({'label': 'count'}).toPandas()
    # Reshape to one row per feature value and one column per label value.
    # 'sum' is passed by name: np.sum as aggfunc warns on pandas 2.x and
    # numpy is not imported at this point of the file.
    graph_data = pd.pivot_table(graph_data, values='count(label)', index=[name], columns=['label'], aggfunc='sum', fill_value=0)
    print(graph_data.head())
    # Columns 0 and 1 are the two label values; stacked=True piles them up.
    graph_data[[0, 1]].plot(kind='bar', stacked=True, figsize=(16, 10))
    plt.show()

python dataframe 的高级操作_第5张图片

import numpy as np
import matplotlib.pyplot as plt

# Horizontal stacked bar chart: one bar per person, one coloured segment per
# row of `data`, each segment annotated with its share of the person's total.
people = ('G1','G2','G3','G4','G5','G6','G7','G8')

# multi-dimensional data: one row per segment, one column per person
data = np.array([[  3.40022085,   7.70632498,   6.4097905,   10.51648577,   7.5330039,
    7.1123587,   12.77792868,   3.44773477],
 [ 11.24811149,   5.03778215,   6.65808464,  12.32220677,   7.45964195,
    6.79685302,   7.24578743,   3.69371847],
 [  3.94253354,   4.74763549,  11.73529246,   4.6465543,   12.9952182,
    4.63832778,  11.16849999,   8.56883433],
 [  4.24409799,  12.71746612,  11.3772169,    9.00514257,  10.47084185,
   10.97567589,   3.98287652,   8.80552122]])
segments = len(data)

# Percentage of each segment within its person's total, laid out as
# (person, segment). The labels are derived from the plotted data instead of
# random integers so that they describe the bars and are reproducible.
percentages = (data / data.sum(axis=0) * 100).T
print(percentages)
y_pos = np.arange(len(people))

fig = plt.figure(figsize=(10,8))
ax = fig.add_subplot(111)

colors ='rgwm'
patch_handles = []
# left alignment of data starts at zero
left = np.zeros(len(people))
for i, d in enumerate(data):
    patch_handles.append(ax.barh(y_pos, d,
      color=colors[i%len(colors)], align='center',
      left=left))
    # the next segment starts where this one ends
    left += d

# annotate every bar segment at its centre; each container returned by barh
# holds one rectangle per person, in the same order as `people`
for j, container in enumerate(patch_handles):
    for i, patch in enumerate(container):
        bl = patch.get_xy()
        x = 0.5*patch.get_width() + bl[0]
        y = 0.5*patch.get_height() + bl[1]
        ax.text(x, y, "%d%%" % (percentages[i,j]), ha='center', va='center')

ax.set_yticks(y_pos)
ax.set_yticklabels(people)
ax.set_xlabel('Scores')
plt.show()

python dataframe 的高级操作_第6张图片

import matplotlib.pyplot as plt
import pandas as pd

# Draw one 100%-stacked bar chart per feature: for every value of the
# feature, show which percentage of its rows has label 0 and label 1.
graph_name = ['primId','adId','siteId','slotId','contentId','netType','age','gender','city',
              'province','phoneType','carrier','billId','creativeType','intertype','firstClass','spreadAppId']
for name in graph_name:
    # NOTE(review): new_train_9 is not defined in this snippet; the
    # groupBy/agg/toPandas chain suggests a Spark DataFrame -- confirm.
    graph_data = new_train_9.groupBy(name, 'label').agg({'label': 'count'}).toPandas()
    # One row per feature value, one column per label value. 'sum' is passed
    # by name because np.sum as aggfunc warns on pandas 2.x.
    graph_data = pd.pivot_table(graph_data, values='count(label)', index=[name], columns=['label'], aggfunc='sum', fill_value=0)
    print(graph_data.head())

    # Create a figure with a single subplot
    fig, ax = plt.subplots(1, figsize=(16, 10))
    # Total row count per feature value (label 0 + label 1)
    totals = graph_data[0] + graph_data[1]
    # Share of each label within that total, in percent (element-wise)
    unclick = graph_data[0] / totals * 100
    click = graph_data[1] / totals * 100
    # label-0 share drawn from the baseline
    ax.bar(graph_data.index, unclick, label='0', edgecolor='white')
    # label-1 share stacked on top of the label-0 share
    ax.bar(graph_data.index, click, label='1', bottom=unclick, edgecolor='white')
    ax.set_ylabel("Percentage")
    ax.set_xlabel(name)
    # the label= arguments above are only visible once a legend is drawn
    ax.legend()
    plt.show()

python dataframe 的高级操作_第7张图片

import matplotlib.pyplot as plt
import pandas as pd

# Draw one 100%-stacked bar chart per feature and write the percentage
# inside every bar segment.
graph_name = ['primId','adId','siteId','slotId','contentId','netType','age','gender','city',
              'province','phoneType','carrier','billId','creativeType','intertype','firstClass','spreadAppId']
for name in graph_name:
    # NOTE(review): new_train_9 is not defined in this snippet; the
    # groupBy/agg/toPandas chain suggests a Spark DataFrame -- confirm.
    graph_data = new_train_9.groupBy(name, 'label').agg({'label': 'count'}).toPandas()
    # One row per feature value, one column per label value. 'sum' is passed
    # by name because np.sum as aggfunc warns on pandas 2.x.
    graph_data = pd.pivot_table(graph_data, values='count(label)', index=[name], columns=['label'], aggfunc='sum', fill_value=0)
    print(graph_data.head())

    # Create a figure with a single subplot
    fig, ax = plt.subplots(1, figsize=(16, 10))
    # Total row count per feature value (label 0 + label 1)
    totals = graph_data[0] + graph_data[1]
    # Share of each label within that total, in percent. Converted to plain
    # arrays so they are paired with the bars by position, not by index label.
    unclick = (graph_data[0] / totals * 100).to_numpy()
    click = (graph_data[1] / totals * 100).to_numpy()

    patch_handles = [
        # label-0 share drawn from the baseline
        ax.bar(graph_data.index, unclick, label='0', edgecolor='white'),
        # label-1 share stacked on top of the label-0 share
        ax.bar(graph_data.index, click, label='1', bottom=unclick, edgecolor='white'),
    ]
    # same order as patch_handles: percentages[j] belongs to patch_handles[j]
    percentages = [unclick, click]

    # annotate every bar segment at its centre; each container returned by
    # ax.bar holds one rectangle per feature value, in plotting order
    for bars, values in zip(patch_handles, percentages):
        for patch, value in zip(bars, values):
            bl = patch.get_xy()
            center_x = 0.5*patch.get_width() + bl[0]
            center_y = 0.5*patch.get_height() + bl[1]
            ax.text(center_x, center_y, "%d%%" % value, ha='center', va='center')

    ax.set_ylabel("Percentage")
    ax.set_xlabel(name)
    # the label= arguments above are only visible once a legend is drawn
    ax.legend()
    plt.show()

python dataframe 的高级操作_第8张图片

你可能感兴趣的:(python dataframe 的高级操作)