第一章 Pandas快速入门

《深入浅出Pandas》第一章代码
数据来源:https://www.gairuo.com/file/data/dataset/team.xlsx

import numpy as np
import pandas as pd
# Directory that holds the downloaded dataset. This absolute Windows path is
# specific to the author's machine and has to be adjusted before running.
file_path = 'E:\\Data_python\\anconda_code\\Dive_into_Pandas\\data_files\\'
# File name of the Excel workbook (team.xlsx, see the data source noted above).
team_path = 'team.xlsx'
# Plain string concatenation is enough here only because file_path already
# ends with a path separator.
team_df = pd.read_excel(file_path + team_path)
team_df.head(5)
name team Q1 Q2 Q3 Q4
0 Liver E 89 21 24 64
1 Arry C 36 37 37 57
2 Ack A 57 60 18 84
3 Eorge C 93 96 71 78
4 Oah D 65 49 61 86
team_df.tail(5)
name team Q1 Q2 Q3 Q4
95 Gabriel C 48 59 87 74
96 Austin7 C 21 31 30 43
97 Lincoln4 C 98 93 1 20
98 Eli E 11 74 58 91
99 Ben E 21 43 41 74
team_df.sample(6)
name team Q1 Q2 Q3 Q4
33 Adam C 90 32 47 39
54 Ollie3 C 10 76 30 36
82 Finn E 4 1 55 32
85 Liam B 2 80 24 25
58 Lewis B 4 34 77 28
27 Finley D 62 73 84 68
team_df.shape
(100, 6)
team_df.info()

RangeIndex: 100 entries, 0 to 99
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   name    100 non-null    object
 1   team    100 non-null    object
 2   Q1      100 non-null    int64 
 3   Q2      100 non-null    int64 
 4   Q3      100 non-null    int64 
 5   Q4      100 non-null    int64 
dtypes: int64(4), object(2)
memory usage: 4.8+ KB
team_df.describe()
Q1 Q2 Q3 Q4
count 100.000000 100.000000 100.000000 100.000000
mean 49.200000 52.550000 52.670000 52.780000
std 29.962603 29.845181 26.543677 27.818524
min 1.000000 1.000000 1.000000 2.000000
25% 19.500000 26.750000 29.500000 29.500000
50% 51.500000 49.500000 55.000000 53.000000
75% 74.250000 77.750000 76.250000 75.250000
max 98.000000 99.000000 99.000000 99.000000
team_df.dtypes
name    object
team    object
Q1       int64
Q2       int64
Q3       int64
Q4       int64
dtype: object
team_df.axes
[RangeIndex(start=0, stop=100, step=1),
 Index(['name', 'team', 'Q1', 'Q2', 'Q3', 'Q4'], dtype='object')]
team_df.columns
Index(['name', 'team', 'Q1', 'Q2', 'Q3', 'Q4'], dtype='object')
# Build the index: use the 'name' column of the team data as the row index.
# inplace=True mutates team_df itself; 'name' is moved out of the columns, so
# running this line a second time fails because the column no longer exists.
team_df.set_index('name', inplace=True)
team_df
team Q1 Q2 Q3 Q4
name
Liver E 89 21 24 64
Arry C 36 37 37 57
Ack A 57 60 18 84
Eorge C 93 96 71 78
Oah D 65 49 61 86
... ... ... ... ... ...
Gabriel C 48 59 87 74
Austin7 C 21 31 30 43
Lincoln4 C 98 93 1 20
Eli E 11 74 58 91
Ben E 21 43 41 74

100 rows × 5 columns

team_df['Q1']
name
Liver       89
Arry        36
Ack         57
Eorge       93
Oah         65
            ..
Gabriel     48
Austin7     21
Lincoln4    98
Eli         11
Ben         21
Name: Q1, Length: 100, dtype: int64
team_df.Q1
name
Liver       89
Arry        36
Ack         57
Eorge       93
Oah         65
            ..
Gabriel     48
Austin7     21
Lincoln4    98
Eli         11
Ben         21
Name: Q1, Length: 100, dtype: int64
team_df[['Q1','Q3']]
Q1 Q3
name
Liver 89 24
Arry 36 37
Ack 57 18
Eorge 93 71
Oah 65 61
... ... ...
Gabriel 48 87
Austin7 21 30
Lincoln4 98 1
Eli 11 58
Ben 21 41

100 rows × 2 columns

team_df.loc[:,['Q1','Q2']]
Q1 Q2
name
Liver 89 21
Arry 36 37
Ack 57 60
Eorge 93 96
Oah 65 49
... ... ...
Gabriel 48 59
Austin7 21 31
Lincoln4 98 93
Eli 11 74
Ben 21 43

100 rows × 2 columns

# 获取行
team_df[team_df.index == 'Liver']  #输出name==Liver的这行                                                                                                          
team Q1 Q2 Q3 Q4
name
Liver E 89 21 24 64
team_df[0:3]
team Q1 Q2 Q3 Q4
name
Liver E 89 21 24 64
Arry C 36 37 37 57
Ack A 57 60 18 84
team_df[0:10:2]
team Q1 Q2 Q3 Q4
name
Liver E 89 21 24 64
Ack A 57 60 18 84
Oah D 65 49 61 86
Acob B 61 95 94 8
Reddie D 64 93 57 72
team_df.iloc[:10, :]
team Q1 Q2 Q3 Q4
name
Liver E 89 21 24 64
Arry C 36 37 37 57
Ack A 57 60 18 84
Eorge C 93 96 71 78
Oah D 65 49 61 86
Harlie C 24 13 87 43
Acob B 61 95 94 8
Lfie A 9 10 99 37
Reddie D 64 93 57 72
Oscar A 77 9 26 67
# Select specific rows and columns by label
team_df.loc['Ben', 'Q1':'Q4'] # only Ben's scores for the four quarters
team_df.loc['Eorge':'Alexander', 'team':'Q4'] # a row range by label; both endpoints are included
team Q1 Q2 Q3 Q4
name
Eorge C 93 96 71 78
Oah D 65 49 61 86
Harlie C 24 13 87 43
Acob B 61 95 94 8
Lfie A 9 10 99 37
Reddie D 64 93 57 72
Oscar A 77 9 26 67
Leo B 17 4 33 79
Logan B 9 89 35 65
Archie C 83 89 59 68
Theo C 51 86 87 27
Thomas B 80 48 56 41
James E 48 77 52 11
Joshua A 63 4 80 30
Henry A 91 15 75 17
William C 80 68 3 26
Max E 97 75 41 3
Lucas A 60 41 77 62
Ethan D 79 45 89 88
Arthur A 44 53 42 40
Mason D 80 96 26 49
Isaac E 74 23 28 65
Harrison B 89 13 18 75
Teddy E 71 91 21 48
Finley D 62 73 84 68
Daniel C 50 50 72 61
Riley E 35 26 59 83
Edward B 57 38 86 87
Joseph E 67 87 87 93
Alexander C 91 76 26 79
# Conditional selection
# Single condition
team_df[team_df.team == 'C']
team_df[team_df.Q1 > 90]
# Match on the index label; the label must be spelled exactly as it appears in
# the data ('Oscar'), otherwise the comparison is False everywhere and the
# result is an empty frame.
team_df[team_df.index == 'Oscar']
team Q1 Q2 Q3 Q4
name
Oscar A 77 9 26 67
# 多条件
team_df[(team_df.Q1 > 90) & (team_df.team == 'C')] # and 关系
team Q1 Q2 Q3 Q4
name
Eorge C 93 96 71 78
Alexander C 91 76 26 79
Lincoln4 C 98 93 1 20
# Chained filtering: the callable receives the already-filtered frame, so the
# Q1 mask is computed on exactly the rows it is applied to instead of relying
# on index alignment with a mask built from the full team_df.
team_df[team_df['team'] == 'C'].loc[lambda d: d.Q1 > 90]
team Q1 Q2 Q3 Q4
name
Eorge C 93 96 71 78
Alexander C 91 76 26 79
Lincoln4 C 98 93 1 20
排序
team_df.sort_values(by='Q1') # 按Q1列排序
team Q1 Q2 Q3 Q4
name
Sebastian C 1 14 68 48
Harley B 2 99 12 13
Liam B 2 80 24 25
Lewis B 4 34 77 28
Finn E 4 1 55 32
... ... ... ... ... ...
Eorge C 93 96 71 78
Aaron A 96 75 55 8
Elijah B 97 89 15 46
Max E 97 75 41 3
Lincoln4 C 98 93 1 20

100 rows × 5 columns

team_df.sort_values(by='Q3', ascending=False) # 降序
team Q1 Q2 Q3 Q4
name
David B 21 47 99 2
Lfie A 9 10 99 37
Luke6 D 15 97 95 99
Acob B 61 95 94 8
Blake A 78 23 93 9
... ... ... ... ... ...
Harley B 2 99 12 13
Jake3 C 69 23 11 40
Jayden6 D 64 21 10 21
William C 80 68 3 26
Lincoln4 C 98 93 1 20

100 rows × 5 columns

team_df.sort_values(['team','Q2'], ascending=[True, False]) # team升序,Q2降序
team Q1 Q2 Q3 Q4
name
Dylan A 86 87 65 20
Nathan A 87 77 62 13
Aaron A 96 75 55 8
Stanley A 69 71 39 97
Ack A 57 60 18 84
... ... ... ... ... ...
Liver E 89 21 24 64
Rory9 E 8 12 58 27
Jackson5 E 6 10 15 33
Roman E 73 1 25 44
Finn E 4 1 55 32

100 rows × 5 columns

分组聚合
# 按团队分组求和
team_df.groupby('team').sum()
Q1 Q2 Q3 Q4
team
A 1066 639 875 783
B 975 1218 1202 1136
C 1056 1194 1068 1127
D 860 1191 1241 1199
E 963 1013 881 1033
team_df.groupby('team').mean() # 分组求平均
Q1 Q2 Q3 Q4
team
A 62.705882 37.588235 51.470588 46.058824
B 44.318182 55.363636 54.636364 51.636364
C 48.000000 54.272727 48.545455 51.227273
D 45.263158 62.684211 65.315789 63.105263
E 48.150000 50.650000 44.050000 51.650000
# Apply a different aggregation to each column.
# Aggregations are given by name: passing the builtins sum/max is deprecated
# in recent pandas releases, while the string names yield the same result.
team_df.groupby('team').agg({'Q1': 'sum',  # total
                             'Q2': 'count', # number of rows
                             'Q3': 'mean', # average
                             'Q4': 'max'}) # maximum
Q1 Q2 Q3 Q4
team
A 1066 17 51.470588 97
B 975 22 54.636364 99
C 1056 22 48.545455 98
D 860 19 65.315789 99
E 963 20 44.050000 98
数据转换
# 转置
team_df.groupby('team').sum()
Q1 Q2 Q3 Q4
team
A 1066 639 875 783
B 975 1218 1202 1136
C 1056 1194 1068 1127
D 860 1191 1241 1199
E 963 1013 881 1033
team_df.groupby('team').sum().T  # 二维数组可认为行转列
team A B C D E
Q1 1066 975 1056 860 963
Q2 639 1218 1194 1191 1013
Q3 875 1202 1068 1241 881
Q4 783 1136 1127 1199 1033
team_df.groupby('team').sum().stack()  # 分组透视到行
team    
A     Q1    1066
      Q2     639
      Q3     875
      Q4     783
B     Q1     975
      Q2    1218
      Q3    1202
      Q4    1136
C     Q1    1056
      Q2    1194
      Q3    1068
      Q4    1127
D     Q1     860
      Q2    1191
      Q3    1241
      Q4    1199
E     Q1     963
      Q2    1013
      Q3     881
      Q4    1033
dtype: int64
team_df.groupby('team').sum().unstack() # 分组后透视到列
    team
Q1  A       1066
    B        975
    C       1056
    D        860
    E        963
Q2  A        639
    B       1218
    C       1194
    D       1191
    E       1013
Q3  A        875
    B       1202
    C       1068
    D       1241
    E        881
Q4  A        783
    B       1136
    C       1127
    D       1199
    E       1033
dtype: int64
添加列
# 添加一个固定值的列
team_df['grade'] = 'Grade 1'
team_df.head(5)
team Q1 Q2 Q3 Q4 grade
name
Liver E 89 21 24 64 Grade 1
Arry C 36 37 37 57 Grade 1
Ack A 57 60 18 84 Grade 1
Eorge C 93 96 71 78 Grade 1
Oah D 65 49 61 86 Grade 1
# 添加汇总列
team_df['total'] = team_df['Q1'] + team_df['Q2'] + team_df['Q3'] + team_df['Q4']
team_df.head(5)
team Q1 Q2 Q3 Q4 grade total
name
Liver E 89 21 24 64 Grade 1 198
Arry C 36 37 37 57 Grade 1 167
Ack A 57 60 18 84 Grade 1 219
Eorge C 93 96 71 78 Grade 1 338
Oah D 65 49 61 86 Grade 1 261
# 将计算结果赋给新列
team_df['total1'] = team_df.loc[:,'Q1':'Q4'].apply(lambda x : sum(x), axis=1)
team_df['total2'] = team_df.sum(axis=1) # 把所有为数字的列相加
team_df['avg'] = team_df.total/4 # 添加平均值列
team_df
team Q1 Q2 Q3 Q4 grade total total1 total2 avg
name
Liver E 89 21 24 64 Grade 1 198 198 1188 49.50
Arry C 36 37 37 57 Grade 1 167 167 1002 41.75
Ack A 57 60 18 84 Grade 1 219 219 1314 54.75
Eorge C 93 96 71 78 Grade 1 338 338 2028 84.50
Oah D 65 49 61 86 Grade 1 261 261 1566 65.25
... ... ... ... ... ... ... ... ... ... ...
Gabriel C 48 59 87 74 Grade 1 268 268 1608 67.00
Austin7 C 21 31 30 43 Grade 1 125 125 750 31.25
Lincoln4 C 98 93 1 20 Grade 1 212 212 1272 53.00
Eli E 11 74 58 91 Grade 1 234 234 1404 58.50
Ben E 21 43 41 74 Grade 1 179 179 1074 44.75

100 rows × 10 columns



你可能感兴趣的:(深入浅出Pandas学习代码,python,pandas)