Pandas 基础
Pandas中的一维数组:Series
data = pd.Series([0.25, 0.5, 0.75, 1.0])
>>> data
0 0.25
1 0.50
2 0.75
3 1.00
dtype: float64
>>> data.values
array([0.25, 0.5 , 0.75, 1. ])
>>> data.index
RangeIndex(start=0, stop=4, step=1)
Index
- Series和Numpy中的一维向量之间的核心区别是:Index
data = pd.Series([0.25, 0.5, 0.75, 1.0],
index=['a', 'b', 'c', 'd'])
>>>data
a 0.25
b 0.50
c 0.75
d 1.00
dtype: float64
>>>data['b']
0.5
字典与Series
- 由于index的存在,Series可以看做是一个字典(Dictionary)
>>>population_dict = {'California': 38332521,
'Texas': 26448193,
'New York': 19651127,
'Florida': 19552860,
'Illinois': 12882135}
>>>population = pd.Series(population_dict)
>>>population
California 38332521
Florida 19552860
Illinois 12882135
New York 19651127
Texas 26448193
dtype: int64
>>>population['California':'Illinois']
California 38332521
Florida 19552860
Illinois 12882135
dtype: int64
创建Series
>>>pd.Series([2, 4, 6])
0 2
1 4
2 6
dtype: int64
>>>pd.Series(5, index=[100, 200, 300])
100 5
200 5
300 5
dtype: int64
>>>pd.Series({2:'a', 1:'b', 3:'c'})
1 b
2 a
3 c
dtype: object
>>>pd.Series({2:'a', 1:'b', 3:'c'}, index=[3, 2])
3 c
2 a
dtype: object
Pandas 中的 DataFrame
population_dict = {'California': 38332521,
'Texas': 26448193,
'New York': 19651127,
'Florida': 19552860,
'Illinois': 12882135}
population = pd.Series(population_dict)
area_dict = {'California': 423967,
'Texas': 695662,
'New York': 141297,
'Florida': 170312,
'Illinois': 149995}
area = pd.Series(area_dict)
states = pd.DataFrame({'population': population, 'area': area})
states:
              area  population
California  423967    38332521
Florida     170312    19552860
Illinois    149995    12882135
New York    141297    19651127
Texas       695662    26448193
>>> states.index
Index(['California', 'Florida', 'Illinois', 'New York', 'Texas'], dtype='object')
>>>states.columns
Index(['area', 'population'], dtype='object')
>>>states['area']
California 423967
Florida 170312
Illinois 149995
New York 141297
Texas 695662
Name: area, dtype: int64
创建 DataFrame
pd.DataFrame(population, columns=['population'])
population |
California |
Florida |
Illinois |
New York |
Texas |
- 通过多个字典(此时,每个字典可以理解成是在描述一个样本)
data = [{'a': i, 'b': 2 * i} for i in range(3)]
pd.DataFrame(data)
pd.DataFrame([{'a': 1, 'b': 2}, {'b': 3, 'c': 4}])
- 通过多个Series
>>>pd.DataFrame({'population': population, 'area': area})
              area  population
California  423967    38332521
Florida     170312    19552860
Illinois    149995    12882135
New York    141297    19651127
Texas       695662    26448193
- 通过Numpy二维数组
import numpy as np
pd.DataFrame(np.random.rand(3, 2),
columns=['foo', 'bar'],
index=['a', 'b', 'c'])
foo |
bar |
a |
0.529692 |
b |
0.391235 |
c |
0.440382 |
- 通过Numpy中的structured array
A = np.zeros(3, dtype=[('A', 'i8'), ('B', 'f8')])
>>> A
array([(0, 0.), (0, 0.), (0, 0.)], dtype=[('A', '<i8'), ('B', '<f8')])
>>>pd.DataFrame(A)
Index
>>>ind = pd.Index([2, 3, 5, 7, 11])
>>>ind
Int64Index([2, 3, 5, 7, 11], dtype='int64')
>>>ind[::2]
Int64Index([2, 5, 11], dtype='int64')
>>>ind.shape
(5,)
>>>ind.ndim
1
>>>ind.dtype
dtype('int64')
>>>ind[1] = 0
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-41-906a9fa1424c> in <module>()
----> 1 ind[1] = 0
~/anaconda/lib/python3.6/site-packages/pandas/core/indexes/base.py in __setitem__(self, key, value)
1722
1723 def __setitem__(self, key, value):
-> 1724 raise TypeError("Index does not support mutable operations")
1725
1726 def __getitem__(self, key):
TypeError: Index does not support mutable operations
>>>indA = pd.Index([1, 3, 5, 7, 9])
>>>indB = pd.Index([2, 3, 5, 7, 11])
>>>indA & indB
Int64Index([3, 5, 7], dtype='int64')
>>>indA | indB
Int64Index([1, 2, 3, 5, 7, 9, 11], dtype='int64')
>>>indA ^ indB
Int64Index([1, 2, 9, 11], dtype='int64')
Pandas 中的数据索引和选择
Series 中的数据选择
字典式的数据选择
import pandas as pd
>>>data = pd.Series([0.25, 0.5, 0.75, 1.0], index=['a', 'b', 'c', 'd'])
>>>data
a 0.25
b 0.50
c 0.75
d 1.00
dtype: float64
>>>'a' in data
True
>>>data.keys()
Index(['a', 'b', 'c', 'd'], dtype='object')
>>>data.items()
<zip at 0x110f11e48>
>>>list(data.items())
[('a', 0.25), ('b', 0.5), ('c', 0.75), ('d', 1.0)]
像字典操作一样在Series中添加数据
>>>data['e'] = 1.25
a 0.25
b 0.50
c 0.75
d 1.00
e 1.25
dtype: float64
一维数组式的数据选择
>>>data['a': 'c']
a 0.25
b 0.50
c 0.75
dtype: float64
>>>data[1: 3]
b 0.50
c 0.75
dtype: float64
>>>data[(data > 0.3) & (data < 0.8)]
b 0.50
c 0.75
dtype: float64
>>>data[['a', 'e']]
a 0.25
e 1.25
dtype: float64
数据的选择:`loc` 和 `iloc`
>>>data = pd.Series(['a', 'b', 'c'], index=[1, 3, 5])
1 a
3 b
5 c
dtype: object
>>>data[1]
'a'
>>> data[1:3]
3 b
5 c
dtype: object
>>>data.loc[1]
'a'
>>>data.loc[1:3]
1 a
3 b
dtype: object
>>> data.iloc[1]
'b'
>>>data.iloc[1:3]
3 b
5 c
dtype: object
DataFrame 中的数据选择
字典式的数据选择
area = pd.Series({'California': 423967, 'Texas': 695662,
'New York': 141297, 'Florida': 170312,
'Illinois': 149995})
pop = pd.Series({'California': 38332521, 'Texas': 26448193,
'New York': 19651127, 'Florida': 19552860,
'Illinois': 12882135})
data = pd.DataFrame({'area':area, 'pop':pop})
area |
pop |
California |
423967 |
Florida |
170312 |
Illinois |
149995 |
New York |
141297 |
Texas |
695662 |
>>>data['area']
California 423967
Florida 170312
Illinois 149995
New York 141297
Texas 695662
Name: area, dtype: int64
>>>data.area
California 423967
Florida 170312
Illinois 149995
New York 141297
Texas 695662
Name: area, dtype: int64
>>>data.area is data['area']
True
>>> data.pop is data['pop']
False
>>>type(data.pop)
method
使用字典的方式为 DataFrame 添加一列数据
>>> data['density'] = data['pop'] / data['area']
area |
pop |
density |
California |
423967 |
38332521 |
Florida |
170312 |
19552860 |
Illinois |
149995 |
12882135 |
New York |
141297 |
19651127 |
Texas |
695662 |
26448193 |
二维数组式的数据选择
>>> data.values
array([[4.23967000e+05, 3.83325210e+07, 9.04139261e+01],
[1.70312000e+05, 1.95528600e+07, 1.14806121e+02],
[1.49995000e+05, 1.28821350e+07, 8.58837628e+01],
[1.41297000e+05, 1.96511270e+07, 1.39076746e+02],
[6.95662000e+05, 2.64481930e+07, 3.80187404e+01]])
>>> data.T
California |
Florida |
Illinois |
New York |
Texas |
area |
4.239670e+05 |
1.703120e+05 |
1.499950e+05 |
1.412970e+05 |
pop |
3.833252e+07 |
1.955286e+07 |
1.288214e+07 |
1.965113e+07 |
density |
9.041393e+01 |
1.148061e+02 |
8.588376e+01 |
1.390767e+02 |
>>> data.values[0]
array([4.23967000e+05, 3.83325210e+07, 9.04139261e+01])
>>>data['area']
California 423967
Florida 170312
Illinois 149995
New York 141297
Texas 695662
Name: area, dtype: int64
使用 iloc 和 loc
>>> data.iloc[:3, :2]
area |
pop |
California |
423967 |
Florida |
170312 |
Illinois |
149995 |
>>>data.loc[:'Illinois', :'pop']
area |
pop |
California |
423967 |
Florida |
170312 |
Illinois |
149995 |
data.loc[data.density > 100, ['pop', 'density']]
pop |
density |
Florida |
19552860 |
New York |
19651127 |
>>> data.iloc[0, 2] = 90
area |
pop |
density |
California |
423967 |
38332521 |
Florida |
170312 |
19552860 |
Illinois |
149995 |
12882135 |
New York |
141297 |
19651127 |
Texas |
695662 |
26448193 |
注意 切片的选择方式都是选择行
data['Florida': 'Illinois']
area |
pop |
density |
Florida |
170312 |
19552860 |
Illinois |
149995 |
12882135 |
data[1:3]
area |
pop |
density |
Florida |
170312 |
19552860 |
Illinois |
149995 |
12882135 |
不过为了清晰起见,还是建议使用 loc 或者 iloc
data.loc['Florida': 'Illinois']
area |
pop |
density |
Florida |
170312 |
19552860 |
Illinois |
149995 |
12882135 |
data.iloc[1:3]
area |
pop |
density |
Florida |
170312 |
19552860 |
Illinois |
149995 |
12882135 |
在DataFrame中使用masking的方式,也是基于行做选择的
data[data.density > 100]
area |
pop |
density |
Florida |
170312 |
19552860 |
New York |
141297 |
19651127 |
Pandas 中的数据操作
Universal Functions
import pandas as pd
import numpy as np
rng = np.random.RandomState(42)
series = pd.Series(rng.randint(0, 10, 4))
>>> series
0 6
1 3
2 7
3 4
dtype: int64
df = pd.DataFrame(rng.randint(0, 10, (3, 4)), columns=['A', 'B', 'C', 'D'])
>>> df
A |
B |
C |
D |
0 |
6 |
9 |
2 |
1 |
7 |
4 |
3 |
2 |
7 |
2 |
5 |
>>> np.exp(series)
0 403.428793
1 20.085537
2 1096.633158
3 54.598150
dtype: float64
>>>np.sin(df * np.pi / 4)
A |
B |
C |
D |
0 |
-1.000000 |
7.071068e-01 |
1.000000 |
1 |
-0.707107 |
1.224647e-16 |
0.707107 |
2 |
-0.707107 |
1.000000e+00 |
-0.707107 |
Index 的对齐
Series 中的 Index 对齐
area = pd.Series({'Alaska': 1723337, 'Texas': 695662,'California': 423967}, name='area')
population = pd.Series({'California': 38332521, 'Texas': 26448193,'New York': 19651127}, name='population')
>>> population / area
California 90.413926
New York NaN
Texas 38.018740
dtype: float64
>>> area.index | population.index
Index(['Alaska', 'California', 'New York', 'Texas'], dtype='object')
>>>A = pd.Series([2, 4, 6], index=[0, 1, 2])
>>>B = pd.Series([1, 3, 5], index=[1, 2, 3])
>>>A + B
0 NaN
1 5.0
2 9.0
3 NaN
dtype: float64
>>>A.add(B, fill_value=0)
0 2.0
1 5.0
2 9.0
3 5.0
dtype: float64
DataFrame 中的 Index 对齐
>>>A = pd.DataFrame(rng.randint(0, 20, (2, 2)), columns=list('AB'))
>>>A
>>>B = pd.DataFrame(rng.randint(0, 10, (3, 3)), columns=list('BAC'))
>>>B
>>>A + B
A |
B |
C |
0 |
1.0 |
15.0 |
1 |
13.0 |
6.0 |
2 |
NaN |
NaN |
>>>A.stack()
0 A 1
B 11
1 A 5
B 1
dtype: int64
>>>fill = A.stack().mean()
>>>A.add(B, fill_value=fill)
0 A 1
B 11
1 A 5
B 1
dtype: int64
A |
B |
C |
0 |
1.0 |
15.0 |
1 |
13.0 |
6.0 |
2 |
6.5 |
13.5 |
DataFrame 和 Series 之间的运算
- 类似 Numpy 中二维数组和一维数组的运算(广播)
>>>A = rng.randint(10, size=(3, 4))
>>>A
array([[3, 8, 2, 4],
[2, 6, 4, 8],
[6, 1, 3, 8]])
>>>A-A[0]
array([[ 0, 0, 0, 0],
[-1, -2, 2, 4],
[ 3, -7, 1, 4]])
>>> df = pd.DataFrame(A, columns=list('QRST'))
>>> df
Q |
R |
S |
T |
0 |
3 |
8 |
2 |
1 |
2 |
6 |
4 |
2 |
6 |
1 |
3 |
>>>df - df.iloc[0]
Q |
R |
S |
T |
0 |
0 |
0 |
0 |
1 |
-1 |
-2 |
2 |
2 |
3 |
-7 |
1 |
- 按列运算
>>>df.subtract(df['R'], axis=0)
Q |
R |
S |
T |
0 |
-5 |
0 |
-6 |
1 |
-4 |
0 |
-2 |
2 |
5 |
0 |
2 |
- 对部分数据进行运算
>>>halfrow = df.iloc[0, ::2]
>>>halfrow
Q 3
S 2
Name: 0, dtype: int64
>>>df - halfrow
Q |
R |
S |
T |
0 |
0.0 |
NaN |
0.0 |
1 |
-1.0 |
NaN |
2.0 |
2 |
3.0 |
NaN |
1.0 |
处理缺失值
缺失值的表示
None:Python的表示法
import numpy as np
import pandas as pd
>>>vals1 = np.array([1, None, 3, 4])
>>>vals1
array([1, None, 3, 4], dtype=object)
>>>vals1.sum()
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-2-30a3fc8c6726> in <module>()
----> 1 vals1.sum()
~/anaconda/lib/python3.6/site-packages/numpy/core/_methods.py in _sum(a, axis, dtype, out, keepdims)
30
31 def _sum(a, axis=None, dtype=None, out=None, keepdims=False):
---> 32 return umr_sum(a, axis, dtype, out, keepdims)
33
34 def _prod(a, axis=None, dtype=None, out=None, keepdims=False):
TypeError: unsupported operand type(s) for +: 'int' and 'NoneType'
NaN
>>> vals2 = np.array([1, np.nan, 3, 4])
>>> vals2.dtype
dtype('float64')
>>>vals2.sum()
nan
>>>1 + np.nan
nan
>>> np.nansum(vals2)
9.0
Pandas 中的 NaN 和 None
>>> pd.Series([1, np.nan, 2, None])
0 1.0
1 NaN
2 2.0
3 NaN
dtype: float64
>>> x = pd.Series(range(2), dtype=int)
>>> x
0 0
1 1
dtype: int64
>>>x[0] = None
>>>x
0 NaN
1 1.0
dtype: float64
缺失值的操作
判断缺失值:isnull
>>> data = pd.Series([1, np.nan, 'hello', None])
>>> data.isnull()
0 False
1 True
2 False
3 True
dtype: bool
>>> data[data.notnull()]
0 1
2 hello
dtype: object
忽略缺失值:dropna()
>>> data.dropna()
0 1
2 hello
dtype: object
>>> df = pd.DataFrame([[1, np.nan, 2],
[2, 3, 5],
[np.nan, 4, 6]])
>>> df
0 |
1 |
2 |
0 |
1.0 |
NaN |
1 |
2.0 |
3.0 |
2 |
NaN |
4.0 |
df.dropna()
df.dropna(axis='columns')
>>> df[3] = np.nan
>>> df
0 |
1 |
2 |
3 |
0 |
1.0 |
NaN |
2 |
1 |
2.0 |
3.0 |
5 |
2 |
NaN |
4.0 |
6 |
df.dropna(axis='columns', how='all')
0 |
1 |
2 |
0 |
1.0 |
NaN |
1 |
2.0 |
3.0 |
2 |
NaN |
4.0 |
df.dropna(axis='rows', thresh=3)
填补缺失值:fillna()
>>> data = pd.Series([1, np.nan, 2, None, 3], index=list('abcde'))
>>> data
a 1.0
b NaN
c 2.0
d NaN
e 3.0
dtype: float64
>>> data.fillna(0)
a 1.0
b 0.0
c 2.0
d 0.0
e 3.0
dtype: float64
>>> data.fillna(method='ffill')
a 1.0
b 1.0
c 2.0
d 2.0
e 3.0
dtype: float64
>>>data.fillna(method='bfill')
a 1.0
b 2.0
c 2.0
d 3.0
e 3.0
dtype: float64
>>> df
0 |
1 |
2 |
3 |
0 |
1.0 |
NaN |
2 |
1 |
2.0 |
3.0 |
5 |
2 |
NaN |
4.0 |
6 |
df.fillna(method='ffill', axis=1)
0 |
1 |
2 |
3 |
0 |
1.0 |
1.0 |
2.0 |
1 |
2.0 |
3.0 |
5.0 |
2 |
NaN |
4.0 |
6.0 |
Concat 和 Append
回顾:numpy的矩阵连接(Concatenation)
import numpy as np
x = [1, 2, 3]
y = [4, 5, 6]
z = [7, 8, 9]
>>> np.concatenate([x, y, z])
array([1, 2, 3, 4, 5, 6, 7, 8, 9])
>>> x = [[1, 2], [3, 4]]
>>> np.concatenate([x, x], axis=1)
array([[1, 2, 1, 2],
[3, 4, 3, 4]])
Pandas的数据连接
import pandas as pd
>>> ser1 = pd.Series(['A', 'B', 'C'], index=[1, 2, 3])
>>> ser2 = pd.Series(['D', 'E', 'F'], index=[4, 5, 6])
>>> pd.concat([ser1, ser2])
1 A
2 B
3 C
4 D
5 E
6 F
dtype: object
def make_df(cols, ind):
    """Quickly build a small demo DataFrame.

    Each cell holds the string formed by concatenating its column label
    and its row label, e.g. column 'A' and row 0 give 'A0'.

    Parameters
    ----------
    cols : iterable
        Column labels; iterating a string such as 'ABC' yields one
        column per character.
    ind : iterable
        Row labels, also used as the DataFrame index.

    Returns
    -------
    pandas.DataFrame
        Frame with one column per element of ``cols`` and one row per
        element of ``ind``.
    """
    # One list of "<column><row>" strings per column label.
    data = {c: [str(c) + str(i) for i in ind] for c in cols}
    return pd.DataFrame(data, index=ind)
>>>make_df('ABC', range(3))
    A   B   C
0  A0  B0  C0
1  A1  B1  C1
2  A2  B2  C2
>>> df1 = make_df('AB', [1, 2])
>>> df1
>>> df2 = make_df('AB', [3, 4])
>>> df2
>>> pd.concat([df1, df2])
>>>df3 = make_df('AB', [0, 1])
>>>df3
>>> df4 = make_df('CD', [0, 1])
>>> df4
pd.concat([df3, df4], axis=1)
A |
B |
C |
D |
0 |
A0 |
B0 |
C0 |
1 |
A1 |
B1 |
C1 |
处理相同索引
>>> df_x = make_df('AB', [0, 1])
>>> df_x
>>> df_y = make_df('AB', [0, 1])
>>> df_y
pd.concat([df_x, df_y])
try:
pd.concat([df_x, df_y], verify_integrity=True)
except ValueError as e:
print("ValueError:", e)
ValueError: Indexes have overlapping values: [0, 1]
忽略相同索引
pd.concat([df_x, df_y], ignore_index=True)
制作多层索引
pd.concat([df_x, df_y], keys=['x', 'y'])
|
A |
B |
x |
0 |
A0 |
1 |
A1 |
B1 |
y |
0 |
A0 |
1 |
A1 |
B1 |
join
>>> df5 = make_df('ABC', [1, 2])
>>> df5
>>>df6 = make_df('BCD', [3, 4])
>>>df6
>>> pd.concat([df5, df6])
A |
B |
C |
D |
1 |
A1 |
B1 |
C1 |
2 |
A2 |
B2 |
C2 |
3 |
NaN |
B3 |
C3 |
4 |
NaN |
B4 |
C4 |
pd.concat([df5, df6], join='inner')
pd.concat([df5, df6], join_axes=[df5.columns])
A |
B |
C |
1 |
A1 |
B1 |
2 |
A2 |
B2 |
3 |
NaN |
B3 |
4 |
NaN |
B4 |
append()
df1.append(df2)
注意:和Python中的list的append方法不同,pandas中的append没有修改原有的数据!
Merge 和 Join
Join
一对一 Join
import pandas as pd
>>> df1 = pd.DataFrame({'employee': ['Bob', 'Jake', 'Lisa', 'Sue'],
'group': ['Accounting', 'Engineering', 'Engineering', 'HR']})
>>> df1
employee |
group |
0 |
Bob |
1 |
Jake |
2 |
Lisa |
3 |
Sue |
>>> df2 = pd.DataFrame({'employee': ['Lisa', 'Bob', 'Jake', 'Sue'],
'hire_date': [2004, 2008, 2012, 2014]})
>>> df2
employee |
hire_date |
0 |
Lisa |
1 |
Bob |
2 |
Jake |
3 |
Sue |
>>> df3 = pd.merge(df1, df2)
>>> df3
employee |
group |
hire_date |
0 |
Bob |
Accounting |
1 |
Jake |
Engineering |
2 |
Lisa |
Engineering |
3 |
Sue |
HR |
多对一 Join
>>>df4 = pd.DataFrame({'group': ['Accounting', 'Engineering', 'HR'],
'supervisor': ['Carly', 'Guido', 'Steve']})
>>>df4
group |
supervisor |
0 |
Accounting |
1 |
Engineering |
2 |
HR |
pd.merge(df3, df4)
employee |
group |
hire_date |
supervisor |
0 |
Bob |
Accounting |
2008 |
1 |
Jake |
Engineering |
2012 |
2 |
Lisa |
Engineering |
2004 |
3 |
Sue |
HR |
2014 |
多对多 Join
>>> df5 = pd.DataFrame({'group':['Accounting', 'Accounting','Engineering',
'Engineering', 'HR', 'HR'],
'skills': ['math', 'spreadsheets', 'coding',
'linux', 'spreadsheets', 'organization']})
>>> df5
group |
skills |
0 |
Accounting |
1 |
Accounting |
2 |
Engineering |
3 |
Engineering |
4 |
HR |
5 |
HR |
>>> pd.merge(df1, df5)
employee |
group |
skills |
0 |
Bob |
Accounting |
1 |
Bob |
Accounting |
2 |
Jake |
Engineering |
3 |
Jake |
Engineering |
4 |
Lisa |
Engineering |
5 |
Lisa |
Engineering |
6 |
Sue |
HR |
7 |
Sue |
HR |
Merge 中的参数
on
>>> df1
employee |
group |
0 |
Bob |
1 |
Jake |
2 |
Lisa |
3 |
Sue |
>>> df2
employee |
hire_date |
0 |
Lisa |
1 |
Bob |
2 |
Jake |
3 |
Sue |
pd.merge(df1, df2, on='employee')
employee |
group |
hire_date |
0 |
Bob |
Accounting |
1 |
Jake |
Engineering |
2 |
Lisa |
Engineering |
3 |
Sue |
HR |
left_on 和 right_on
>>> df3 = pd.DataFrame({'name': ['Bob', 'Jake', 'Lisa', 'Sue'],
'salary': [70000, 80000, 120000, 90000]})
>>> df3
name |
salary |
0 |
Bob |
1 |
Jake |
2 |
Lisa |
3 |
Sue |
pd.merge(df1, df3, left_on="employee", right_on="name")
employee |
group |
name |
salary |
0 |
Bob |
Accounting |
Bob |
1 |
Jake |
Engineering |
Jake |
2 |
Lisa |
Engineering |
Lisa |
3 |
Sue |
HR |
Sue |
>>> pd.merge(df1, df3, left_on="employee", right_on="name").drop('name', axis=1)
employee |
group |
salary |
0 |
Bob |
Accounting |
1 |
Jake |
Engineering |
2 |
Lisa |
Engineering |
3 |
Sue |
HR |
left_index 和 right_index
>>> df1a = df1.set_index('employee')
>>>df1a
group |
employee |
Bob |
Jake |
Lisa |
Sue |
>>> df2a = df2.set_index('employee')
>>> df2a
hire_date |
employee |
Lisa |
Bob |
Jake |
Sue |
>>> pd.merge(df1a, df2a, left_index=True, right_index=True)
group |
hire_date |
employee |
|
Bob |
Accounting |
Jake |
Engineering |
Lisa |
Engineering |
Sue |
HR |
>>> df1a.join(df2a)
group |
hire_date |
employee |
|
Bob |
Accounting |
Jake |
Engineering |
Lisa |
Engineering |
Sue |
HR |
>>> pd.merge(df1a, df3, left_index=True, right_on='name')
group |
name |
salary |
0 |
Accounting |
Bob |
1 |
Engineering |
Jake |
2 |
Engineering |
Lisa |
3 |
HR |
Sue |
集合操作
>>> df6 = pd.DataFrame({'name': ['Peter', 'Paul', 'Mary'],
'food': ['fish', 'beans', 'bread']},
columns=['name', 'food'])
>>> df6
name |
food |
0 |
Peter |
1 |
Paul |
2 |
Mary |
>>> df7 = pd.DataFrame({'name': ['Mary', 'Joseph'],
'drink': ['wine', 'beer']},
columns=['name', 'drink'])
>>> df7
name |
drink |
0 |
Mary |
1 |
Joseph |
>>> pd.merge(df6, df7)
name |
food |
drink |
0 |
Mary |
bread |
>>> pd.merge(df6, df7, how='inner')
name |
food |
drink |
0 |
Mary |
bread |
>>> pd.merge(df6, df7, how='outer')
name |
food |
drink |
0 |
Peter |
fish |
1 |
Paul |
beans |
2 |
Mary |
bread |
3 |
Joseph |
NaN |
>>> pd.merge(df6, df7, how='left')
name |
food |
drink |
0 |
Peter |
fish |
1 |
Paul |
beans |
2 |
Mary |
bread |
pd.merge(df6, df7, how='right')
name |
food |
drink |
0 |
Mary |
bread |
1 |
Joseph |
NaN |
处理相同的列
>>> df8 = pd.DataFrame({'name': ['Bob', 'Jake', 'Lisa', 'Sue'],
'rank': [1, 2, 3, 4]})
>>> df8
name |
rank |
0 |
Bob |
1 |
Jake |
2 |
Lisa |
3 |
Sue |
>>> df9 = pd.DataFrame({'name': ['Bob', 'Jake', 'Lisa', 'Sue'],
'rank': [3, 1, 4, 2]})
>>>df9
name |
rank |
0 |
Bob |
1 |
Jake |
2 |
Lisa |
3 |
Sue |
pd.merge(df8, df9, on="name")
name |
rank_x |
rank_y |
0 |
Bob |
1 |
1 |
Jake |
2 |
2 |
Lisa |
3 |
3 |
Sue |
4 |
pd.merge(df8, df9, on="name", suffixes=["_L", "_R"])
name |
rank_L |
rank_R |
0 |
Bob |
1 |
1 |
Jake |
2 |
2 |
Lisa |
3 |
3 |
Sue |
4 |
表格操作
### pd.read_csv
header : 默认header=0,第一行数据作为列名。 header=None 不作列名。
skip_blank_lines=True:忽略空行
skiprows:忽略的行数
nrows:需要读取的行数
names:为每一列起列名
python
Names = ['a','asd','b','c','d','e']
dt = pd.read_excel("uci.xls", sheet_name=0,header=None,
usecols=[0,1,2,3,4,5], names=Names )
index_col:用作行索引的列编号或列名
prefix='XX':为列名添加前缀名
usecols:仅用某几列。usecols=[0,1,2,3]或usecols=['第一列名','第二列名']
dtype:每列数据的数据类型。用字典。dtype={'a': np.float64, 'b': np.int32}
na_values:指定哪些额外的取值在读入时应被识别为缺失值(NA/NaN)
pd.read_excel
pd.read_excel(io, sheetname=0,header=0,skiprows=None,index_col=None,names=None,
parse_cols=None,date_parser=None,na_values=None,thousands=None,
convert_float=True,has_index_names=None,converters=None,dtype=None,
true_values=None,false_values=None,engine=None,squeeze=False,**kwds)
Names = ['a','b','c','d','e']
dt = pd.read_excel("uci.xls", sheet_name=0,header=None,
usecols=[0,1,2,3,4,5], names=Names,
index_col = 0,
dtype={'a': np.float64,'b': np.int32} )
df_out.to_excel
df_out.to_excel('tmp.xlsx')