今天总结一下 Python(pandas)数据聚合中的一些小例子:一方面是自己复习,另一方面也希望对小码们有所帮助。
import pandas as pd
import numpy as np
# Example 1: a Series whose repeated index labels serve as the grouping key.
ser = pd.Series(
    [12, 8, 20, 14, 6, 10],
    index=pd.Index(
        ['white', 'white', 'red', 'red', 'black', 'black'],
        name='color',
    ),
)
print("原数据:")
print(ser)

# Average the values that share the same 'color' index label.
ser1 = ser.groupby(level='color').mean()
print("根据color索引将各组数据平均聚合:")
print(ser1)
import pandas as pd
# Example 2: a 4x4 frame with a two-level row index named ('obj', 'id').
frame = pd.DataFrame(
    [[3, 5, 7, 9], [11, 13, 15, 17], [2, 4, 6, 8], [10, 12, 14, 16]],
    index=pd.MultiIndex.from_arrays(
        [['odd', 'odd', 'even', 'even'], ['small', 'big', 'small', 'big']],
        names=['obj', 'id'],
    ),
)
print("增加索引名称后:")
print(frame)

# Column means per value of the outer index level 'obj'.
frame1 = frame.groupby(level='obj').mean()
print("根据obj聚合后:")
print(frame1)

# Column means per value of the inner index level 'id'.
frame2 = frame.groupby(level='id').mean()
print("根据id聚合后:")
print(frame2)
import pandas as pd
# Example 3: same data, now with named columns, grouped by both index levels.
frame = pd.DataFrame(
    [[3, 5, 7, 9], [11, 13, 15, 17], [2, 4, 6, 8], [10, 12, 14, 16]],
    index=[['odd', 'odd', 'even', 'even'], ['small', 'big', 'small', 'big']],
    columns=['a', 'b', 'c', 'd'],
)
frame.index.names = ['obj', 'id']
print("增加行索引和列索引后后的数据:")
print(frame)

group_levels = ['obj', 'id']

# Single column: the result is a Series indexed by (obj, id).
print("a列根据obj和id进行分组:")
frame1 = frame['a'].groupby(level=group_levels).mean()
print(frame1)

# Two columns: the result is a DataFrame indexed by (obj, id).
print("a,b列根据obj和id进行分组:")
frame2 = frame[['a', 'b']].groupby(level=group_levels).mean()
print(frame2)
import pandas as pd
# Example 4: iterate over the groups produced by groupby.
frame = pd.DataFrame(
    [[3, 5, 7, 9], [11, 13, 15, 17], [2, 4, 6, 8], [10, 12, 14, 16]],
    index=[['odd', 'odd', 'even', 'even'], ['small', 'big', 'small', 'big']],
)
# Printed before the index/column labels are assigned, so this shows the
# default (unnamed) labels.
print(frame)
frame.index.names = ['obj', 'id']
frame.columns = ['a', 'b', 'c', 'd']

# Iterating a groupby yields (group key, sub-frame) pairs; the loop body must
# be indented for the snippet to compile.
for name, group in frame.groupby('obj'):
    print("------------------" + name + "------------------")
    print(group)
# The original ended with quit(), a helper injected by the `site` module for
# interactive sessions. It is dropped so that execution simply falls through.
运行结果:
注:按照 obj 分组后,obj 的每个不同取值各自作为一组,依次输出。
此处附上分位数的相关知识:前辈的解答很详细,大家可参考该文来理解分位数的概念及其计算方法(链接:“快点我,我是分位数”)。
import pandas as pd
import numpy as np
# Example 5: per-group quantile on ten random integers drawn from [0, 30).
data = np.random.randint(low=0, high=30, size=10)
ser = pd.Series(data, index=pd.Index(list('aaabbbbccc'), name='id'))
print(ser)

group = ser.groupby(level='id')
# The quantile is evaluated at probability 0.6.
ser1 = group.quantile(q=0.6)
print("--------分组后各组的分位数(概率点为0.6)----------")
print(ser1)
通常情况下,聚合过程中可以直接使用 pandas 内置的聚合函数,当然也可以使用自定义的聚合函数 ☟ ☟ ☟ ☟ ☟
import pandas as pd
import numpy as np
# Example 6: a custom aggregation function.
def mydef(series):
    """Return the range (maximum minus minimum) of ``series``.

    Args:
        series: Any object exposing ``max()`` and ``min()`` methods whose
            results can be subtracted, e.g. a ``pandas.Series``.

    Returns:
        ``series.max() - series.min()``.
    """
    return series.max() - series.min()
# Demo frame: two-level row index ('obj', 'id') and columns a-d.
frame = pd.DataFrame(
    [[3, 5, 7, 9], [11, 13, 15, 17], [2, 4, 6, 8], [10, 12, 14, 16]],
    index=pd.MultiIndex.from_arrays(
        [['odd', 'odd', 'even', 'even'], ['small', 'big', 'small', 'big']],
        names=['obj', 'id'],
    ),
    columns=['a', 'b', 'c', 'd'],
)
print(frame)

# Group the rows by the 'obj' index level.
group = frame.groupby(level='obj')
print("--------调用自定义的聚合函数----------")
# Apply the user-defined reducer to columns 'a' and 'b' of every group.
selected = group[['a', 'b']]
frame1 = selected.agg(mydef)
print(frame1)
import pandas as pd
import numpy as np
# Example 7: aggregated sum vs. transform (group sum broadcast back to rows).
frame = pd.DataFrame(
    [[3, 5, 7, 9], [11, 13, 15, 17], [2, 4, 6, 8], [10, 12, 14, 16]],
    index=[['odd', 'odd', 'even', 'even'], ['small', 'big', 'small', 'big']],
)
frame.index.names = ['obj', 'id']
frame.columns = ['a', 'b', 'c', 'd']
group = frame.groupby('obj')
print(frame)

# One row per group: the per-group sums, with a prefix added to column names.
print("--------调用sum()函数将分组后的数据求和----------")
frame1 = group[['a', 'b']].sum().add_prefix('test_')
print(frame1)

# transform keeps the original row index and repeats each group's sum on every
# row of that group. The string alias "sum" is used instead of np.sum: since
# pandas 2.1, passing np.sum here emits a FutureWarning because pandas swaps
# the callable for its own "sum" implementation. The numbers are identical,
# and the printed label is left untouched so the output does not change.
print("--------使用transform函数调用numpy的sum()函数将分组后的数据求和----------")
frame2 = group[['a', 'b']].transform("sum").add_prefix('test1_')
print(frame2)
import pandas as pd
import numpy as np
# Example 8: aggregated sum vs. the same sum computed through apply().
frame = pd.DataFrame(
    [[3, 5, 7, 9], [11, 13, 15, 17], [2, 4, 6, 8], [10, 12, 14, 16]],
    index=[['odd', 'odd', 'even', 'even'], ['small', 'big', 'small', 'big']],
)
frame.index.names = ['obj', 'id']
frame.columns = ['a', 'b', 'c', 'd']
group = frame.groupby('obj')
print(frame)

# One row per group: the per-group sums, with a prefix added to column names.
frame1 = group[['a', 'b']].sum().add_prefix('test_')
print("--------调用sum()函数将分组后的数据求和----------")
print(frame1)

# apply() hands each group's sub-frame to the callable. The column-wise sum is
# requested explicitly with axis=0: np.sum(sub_frame) goes through
# DataFrame.sum(axis=None), whose column-wise behaviour is deprecated since
# pandas 2.1 and is slated to become a scalar reduction over both axes.
frame2 = group[['a', 'b']].apply(lambda sub: sub.sum(axis=0)).add_prefix('test1_')
print("--------调用apply()函数将分组后的数据求和----------")
print(frame2)
运行结果:
文中若有疏漏,还请小码们指出,不胜感激。祝大家学习愉快♡