import numpy as np
import pandas as pd
from pandas import Series,DataFrame
from numpy import nan as NA
pd.merge():DataFrame 与 DataFrame 的合并
# Left-hand frame for the merge examples: a 'key' column with repeated
# labels and a numeric 'data1' column holding 0, 10, ..., 60.
df1 = DataFrame(dict(key=list('aabbabc'), data1=np.arange(0, 70, 10)))
df1
key data1
0 a 0
1 a 10
2 b 20
3 b 30
4 a 40
5 b 50
6 c 60
# Right-hand frame for the merge examples: one row per key, with a numeric
# 'data2' column holding 10, 20, 30.
df2 = DataFrame(dict(key=list('abd'), data2=np.arange(10, 40, 10)))
df2
key data2
0 a 10
1 b 20
2 d 30
# pd.merge performs an inner join by default: only keys present in both
# frames are kept, and rows without a match are dropped.
pd.merge(left=df1, right=df2)
key data1 data2
0 a 0 10
1 a 10 10
2 a 40 10
3 b 20 20
4 b 30 20
5 b 50 20
# By default the columns sharing a name in both frames are used as the join
# key; in general it is better to state the key explicitly with on=...
pd.merge(left=df1, right=df2, on='key')
key data1 data2
0 a 0 10
1 a 10 10
2 a 40 10
3 b 20 20
4 b 30 20
5 b 50 20
# Two frames whose join column is named differently on each side
# ('Lkey' on the left, 'Rkey' on the right).
df3 = DataFrame(dict(Lkey=list('aabbabc'), data1=np.arange(0, 70, 10)))
df4 = DataFrame(dict(Rkey=list('abd'), data2=np.arange(10, 40, 10)))
print(df3, df4, sep='\n')
Lkey data1
0 a 0
1 a 10
2 b 20
3 b 30
4 a 40
5 b 50
6 c 60
Rkey data2
0 a 10
1 b 20
2 d 30
# When the frames share no column name, name the key column of each side
# separately with left_on=... and right_on=...
pd.merge(left=df3, right=df4, left_on='Lkey', right_on='Rkey')
Lkey data1 Rkey data2
0 a 0 a 10
1 a 10 a 10
2 a 40 a 10
3 b 20 b 20
4 b 30 b 20
5 b 50 b 20
# The default how='inner' takes the intersection of the keys;
# how='outer' takes their union, filling unmatched cells with NaN.
pd.merge(left=df1, right=df2, how='outer')
key data1 data2
0 a 0.0 10.0
1 a 10.0 10.0
2 a 40.0 10.0
3 b 20.0 20.0
4 b 30.0 20.0
5 b 50.0 20.0
6 c 60.0 NaN
7 d NaN 30.0
# how='left' keeps every row of the left frame, how='right' every row of
# the right frame -- the same idea as LEFT JOIN / RIGHT JOIN in MySQL.
pd.merge(left=df3, right=df4, how='left', left_on='Lkey', right_on='Rkey')
Lkey data1 Rkey data2
0 a 0 a 10.0
1 a 10 a 10.0
2 b 20 b 20.0
3 b 30 b 20.0
4 a 40 a 10.0
5 b 50 b 20.0
6 c 60 NaN NaN
# Sample data for a merge on two key columns ('key1', 'key2').
df_left = DataFrame(dict(
    key1=['foo', 'foo', 'foo', 'bar', 'bar'],
    key2=['one', 'two', 'one', 'one', 'two'],
    Lvalue=[10, 20, 30, 40, 50],
))
df_right = DataFrame(dict(
    key1=['bar', 'foo', 'foo'],
    key2=['one', 'one', 'two'],
    Rvalue=[6, 7, 8],
))
print(df_left, df_right, sep='\n')
key1 key2 Lvalue
0 foo one 10
1 foo two 20
2 foo one 30
3 bar one 40
4 bar two 50
key1 key2 Rvalue
0 bar one 6
1 foo one 7
2 foo two 8
# Merge on several columns at once by passing a list to on=...
pd.merge(left=df_left, right=df_right, how='outer', on=['key1', 'key2'])
key1 key2 Lvalue Rvalue
0 foo one 10 7.0
1 foo one 30 7.0
2 foo two 20 8.0
3 bar one 40 6.0
4 bar two 50 NaN
# Case where the two frames have no column name in common at all:
# the left frame uses 'key1'/'key2', the right one 'key3'/'key4'.
df_5 = DataFrame(dict(
    key1=['foo', 'foo', 'foo', 'bar', 'bar'],
    key2=['one', 'two', 'one', 'one', 'two'],
    Lvalue=[10, 20, 30, 40, 50],
))
df_6 = DataFrame(dict(
    key3=['bar', 'foo', 'foo'],
    key4=['one', 'one', 'two'],
    Rvalue=[6, 7, 8],
))
print(df_5, df_6, sep='\n')
key1 key2 Lvalue
0 foo one 10
1 foo two 20
2 foo one 30
3 bar one 40
4 bar two 50
key3 key4 Rvalue
0 bar one 6
1 foo one 7
2 foo two 8
# Pair the key columns positionally: key1<->key3 and key2<->key4.
pd.merge(
    left=df_5,
    right=df_6,
    how='outer',
    left_on=['key1', 'key2'],
    right_on=['key3', 'key4'],
)
key1 key2 Lvalue key3 key4 Rvalue
0 foo one 10 foo one 7.0
1 foo one 30 foo one 7.0
2 foo two 20 foo two 8.0
3 bar one 40 bar one 6.0
4 bar two 50 NaN NaN NaN
pd.concat(objs, axis=0, join='outer'):Series 与 Series 的拼接
objs:可迭代对象(iterable),例如由多个 Series 组成的列表
# Three Series with pairwise-disjoint index labels, used by the concat
# examples.
s1 = Series(data=[0, 10], index=list('ab'))
s2 = Series(data=[20, 30, 40], index=list('cde'))
s3 = Series(data=[5, 6], index=list('fg'))
print(s1, s2, s3, sep='\n')
a 0
b 10
dtype: int64
c 20
d 30
e 40
dtype: int64
f 5
g 6
dtype: int64
# With no axis given, the Series are stacked end to end.
pd.concat(objs=[s1, s2])
a 0
b 10
c 20
d 30
e 40
dtype: int64
# axis=0 spelled out: same result as the call without an axis argument.
pd.concat(objs=[s1, s2], axis=0)
a 0
b 10
c 20
d 30
e 40
dtype: int64
# axis=1 places each Series in its own column, aligned on the union of the
# index labels; positions a Series does not cover become NaN.
pd.concat(objs=[s1, s2, s3], axis=1)
0 1 2
a 0.0 NaN NaN
b 10.0 NaN NaN
c NaN 20.0 NaN
d NaN 30.0 NaN
e NaN 40.0 NaN
f NaN NaN 5.0
g NaN NaN 6.0
# join='inner' keeps only index labels common to every input; s1, s2 and s3
# share none, so the resulting frame has columns but no rows.
pd.concat(objs=[s1, s2, s3], axis=1, join='inner')
0 1 2
# Stack all three Series end to end along the index.
pd.concat(objs=[s1, s2, s3], axis=0)
a 0
b 10
c 20
d 30
e 40
f 5
g 6
dtype: int64
# keys=[...] tags each input with a label; the labels become the outer
# level of a MultiIndex on the result.
s4 = pd.concat(objs=[s1, s2, s3], keys=['one', 'two', 'three'], axis=0)
s4
one a 0
b 10
two c 20
d 30
e 40
three f 5
g 6
dtype: int64
# Inspect the index: a MultiIndex of (key label, original label) pairs.
s4.index
MultiIndex([( 'one', 'a'),
( 'one', 'b'),
( 'two', 'c'),
( 'two', 'd'),
( 'two', 'e'),
('three', 'f'),
('three', 'g')],
)
# Selecting by the outer label alone returns the sub-Series tagged 'one'.
s4.loc['one']
a 0
b 10
dtype: int64
# Positional access: the first element, independent of the index labels.
s4.iloc[0]
0
# Giving both index levels selects a single scalar value.
s4.loc['one','a']
0
# ignore_index=True discards the original labels and renumbers 0..n-1.
pd.concat(objs=[s1, s2, s3], ignore_index=True)
0 0
1 10
2 20
3 30
4 40
5 5
6 6
dtype: int64
# names=[...] gives a name to each level of the MultiIndex built by keys=.
pd.concat(
    objs=[s1, s2, s3],
    axis=0,
    keys=['one', 'two', 'three'],
    names=['num', 'letter'],
)
num letter
one a 0
b 10
two c 20
d 30
e 40
three f 5
g 6
dtype: int64
combine_first 的合并规则(left.combine_first(right)):
1、 以左边的非 NaN 值为主
2、 如果左边的值是 NaN 而右边不是,则用右边的值
3、 左边索引上不存在的标签,用右边的值补上
# Left operand for combine_first: a Series with gaps (NaN) at 'f', 'd', 'c'.
s6 = Series(
    data=[NA, 2, NA, 4, 5, NA],
    index=['f', 'e', 'd', 'b', 'a', 'c'],
)
s6
f NaN
e 2.0
d NaN
b 4.0
a 5.0
c NaN
dtype: float64
# Right operand for combine_first: fully populated, indexed by f, e, b, g.
s7 = Series(data=[10, 20, 30, 40], index=['f', 'e', 'b', 'g'])
s7
f 10
e 20
b 30
g 40
dtype: int64
# Fill the gaps of s6 from s7: s6's non-NaN values win, NaN slots and
# labels missing from s6 are taken from s7 where s7 has a value.
s6.combine_first(other=s7)
a 5.0
b 4.0
c NaN
d NaN
e 2.0
f 10.0
g 40.0
dtype: float64
# Two frames with NaN gaps for the DataFrame.combine_first example; df8 has
# one extra row and lacks column 'c'.
df7 = DataFrame(dict(
    a=[1, NA, 5, NA],
    b=[NA, 2, NA, 6],
    c=[2, 4, 6, 8],
))
df8 = DataFrame(dict(
    a=[50, NA, 40, NA, NA],
    b=[40, 50, 60, 70, 80],
))
print(df7, df8, sep='\n')
a b c
0 1.0 NaN 2
1 NaN 2.0 4
2 5.0 NaN 6
3 NaN 6.0 8
a b
0 50.0 40
1 NaN 50
2 40.0 60
3 NaN 70
4 NaN 80
# Cell-wise version of the same rule: df7's values win, and its NaN cells
# and missing rows are filled from df8 where df8 has a value.
df7.combine_first(other=df8)
a b c
0 1.0 40.0 2.0
1 NaN 2.0 4.0
2 5.0 60.0 6.0
3 NaN 6.0 8.0
4 NaN 80.0 NaN
# Two equivalent ways of picking the larger of a and b.
a = 3
b = 2

# Conditional expression form.
c = a if a > b else b

# Equivalent if/else statement form (the bodies must be indented).
if a > b:
    c = a
else:
    c = b
3
# First 2x2 frame for the element-wise combine examples.
df5 = DataFrame(data=[[1, 20], [10, 3]])
df5
0 1
0 1 20
1 10 3
# Second 2x2 frame for the element-wise combine examples.
df6 = DataFrame(data=[[12, 2], [4, 30]])
df6
0 1
0 12 2
1 4 30
# At each position keep the larger of the two frames' values.
df5.combine(other=df6, func=np.maximum)
0 1
0 12 20
1 10 30
# At each position keep the smaller of the two frames' values
# (np.minimum, the counterpart of the np.maximum example).
df5.combine(other=df6, func=np.minimum)
0 1
0 1 2
1 4 3