08.pandas 高级 - 合并数据集

目录

    • 4.1 合并数据集
      • 如果没有相同的列
      • 多列 合并
      • xxx.combine_first()
      • xxx.combine(yyy, func)

import numpy as np
import pandas as pd
from pandas import Series,DataFrame
from numpy import nan as NA

4.1 合并数据集

pd.merge() dataframe和dataframe

# 创建2个dataframe用来连接
df1 = DataFrame(
    {'key':list('aabbabc') ,'data1':np.arange(7)*10}
)
df1
key	data1
0	a	0
1	a	10
2	b	20
3	b	30
4	a	40
5	b	50
6	c	60
df2 = DataFrame(
    {'key':list('abd') ,'data2':np.arange(1,4)*10}
)
df2
key	data2
0	a	10
1	b	20
2	d	30
# pd.merge 采用 inner 连接,取交集。没有交集的舍弃
pd.merge(df1,df2)
key	data1	data2
0	a	0	10
1	a	10	10
2	a	40	10
3	b	20	20
4	b	30	20
5	b	50	20
# 默认是以相同的列名称 为 键 进行合并, 一般情况下 建议 用 on='' 指定一下
pd.merge(df1,df2,on='key')
key	data1	data2
0	a	0	10
1	a	10	10
2	a	40	10
3	b	20	20
4	b	30	20
5	b	50	20

如果没有相同的列

df3 = DataFrame(
    { 'Lkey':list('aabbabc') , 'data1':np.arange(7)*10}
)
df4 = DataFrame(
    {'Rkey':list('abd') ,'data2':np.arange(1,4)*10}  
)
print(df3)
print(df4)
  Lkey  data1
0    a      0
1    a     10
2    b     20
3    b     30
4    a     40
5    b     50
6    c     60
  Rkey  data2
0    a     10
1    b     20
2    d     30
#  没有相同的列 , 可以用 left_on= ''   right_on=' '
pd.merge(df3,df4 ,left_on='Lkey',right_on='Rkey')
Lkey	data1	Rkey	data2
0	a	0	a	10
1	a	10	a	10
2	a	40	a	10
3	b	20	b	20
4	b	30	b	20
5	b	50	b	20
#  默认参数 how = 'inner' 取交集, how='outer' 取并集
pd.merge(df1,df2,how='outer')
key	data1	data2
0	a	0.0	10.0
1	a	10.0	10.0
2	a	40.0	10.0
3	b	20.0	20.0
4	b	30.0	20.0
5	b	50.0	20.0
6	c	60.0	NaN
7	d	NaN	30.0
# how='left' 以左表为主, how='right' 以右表为主, 类似 MySQL 的左连接与右连接
pd.merge(df3,df4,left_on='Lkey',right_on='Rkey',how='left')
Lkey	data1	Rkey	data2
0	a	0	a	10.0
1	a	10	a	10.0
2	b	20	b	20.0
3	b	30	b	20.0
4	a	40	a	10.0
5	b	50	b	20.0
6	c	60	NaN	NaN

多列 合并

# 创建数据
df_left = DataFrame({
    'key1':'foo/foo/foo/bar/bar'.split('/'),
    'key2':'one/two/one/one/two'.split('/'),
    'Lvalue':[10,20,30,40,50]
})
df_right = DataFrame({
    'key1':'bar/foo/foo'.split('/'),
    'key2':'one/one/two'.split('/'),
    'Rvalue': [6,7,8]
})
print(df_left)
print(df_right)
  key1 key2  Lvalue
0  foo  one      10
1  foo  two      20
2  foo  one      30
3  bar  one      40
4  bar  two      50
  key1 key2  Rvalue
0  bar  one       6
1  foo  one       7
2  foo  two       8
# 多列
pd.merge(df_left,df_right , on=['key1','key2'] , how='outer' )
key1	key2	Lvalue	Rvalue
0	foo	one	10	7.0
1	foo	one	30	7.0
2	foo	two	20	8.0
3	bar	one	40	6.0
4	bar	two	50	NaN
# 如果完全没有相同列
# 创建数据
df_5 = DataFrame({
    'key1':'foo/foo/foo/bar/bar'.split('/'),
    'key2':'one/two/one/one/two'.split('/'),
    'Lvalue':[10,20,30,40,50]
})
df_6 = DataFrame({
    'key3':'bar/foo/foo'.split('/'),
    'key4':'one/one/two'.split('/'),
    'Rvalue': [6,7,8]
})
print(df_5)
print(df_6)
  key1 key2  Lvalue
0  foo  one      10
1  foo  two      20
2  foo  one      30
3  bar  one      40
4  bar  two      50
  key3 key4  Rvalue
0  bar  one       6
1  foo  one       7
2  foo  two       8
pd.merge(df_5,df_6,left_on=['key1','key2'], right_on=['key3','key4'], how='outer')
key1	key2	Lvalue	key3	key4	Rvalue
0	foo	one	10	foo	one	7.0
1	foo	one	30	foo	one	7.0
2	foo	two	20	foo	two	8.0
3	bar	one	40	bar	one	6.0
4	bar	two	50	NaN	NaN	NaN
pd.concat( obj ,axis=0 ,join='outer' ) series 和 series
obj : iterable 可迭代的
# 创建 series 用来连接
s1 = Series( [0,10] ,index=['a','b'] )
s2 = Series( [20,30,40] ,index=['c','d','e'] )
s3 = Series( [5,6] ,index=['f','g'] )
print(s1)
print(s2)
print(s3)
a     0
b    10
dtype: int64
c    20
d    30
e    40
dtype: int64
f    5
g    6
dtype: int64
pd.concat([s1,s2]) 
a     0
b    10
c    20
d    30
e    40
dtype: int64
pd.concat([s1,s2] ,axis=0)  
a     0
b    10
c    20
d    30
e    40
dtype: int64
pd.concat([s1,s2,s3],axis=1)
0	1	2
a	0.0	NaN	NaN
b	10.0	NaN	NaN
c	NaN	20.0	NaN
d	NaN	30.0	NaN
e	NaN	40.0	NaN
f	NaN	NaN	5.0
g	NaN	NaN	6.0
pd.concat([s1,s2,s3],axis=1 ,join='inner')
0	1	2
pd.concat([s1,s2,s3],axis=0)
a     0
b    10
c    20
d    30
e    40
f     5
g     6
dtype: int64
# 可以 使用 keys = [] ,将原参与合并的数据进行标记。 标记后会形成多层索引
s4 = pd.concat([s1,s2,s3],axis=0 , keys=['one','two','three'])
s4
one    a     0
       b    10
two    c    20
       d    30
       e    40
three  f     5
       g     6
dtype: int64
s4.index
MultiIndex([(  'one', 'a'),
            (  'one', 'b'),
            (  'two', 'c'),
            (  'two', 'd'),
            (  'two', 'e'),
            ('three', 'f'),
            ('three', 'g')],
           )
s4.loc['one']
a     0
b    10
dtype: int64
s4.iloc[0]
0
s4.loc['one','a']
0
pd.concat([s1,s2,s3] ,ignore_index=True)
0     0
1    10
2    20
3    30
4    40
5     5
6     6
dtype: int64
pd.concat([s1,s2,s3],axis=0 , keys=['one','two','three'] ,names=['num','letter'])
num    letter
one    a          0
       b         10
two    c         20
       d         30
       e         40
three  f          5
       g          6
dtype: int64

xxx.combine_first()

1、 以左边 非 nan 为主
2、 如果左边的值是nan,右边不是,用右边的
3、 只出现在右边的索引(左边没有该索引),用右边的值

# 创建数据
s6 = Series([NA, 2 ,NA ,4,5, NA] ,index=list('fedbac'))
s6
f    NaN
e    2.0
d    NaN
b    4.0
a    5.0
c    NaN
dtype: float64
s7 = Series([10,20,30,40] ,index=list('febg'))
s7
f    10
e    20
b    30
g    40
dtype: int64
s6.combine_first(s7)
a     5.0
b     4.0
c     NaN
d     NaN
e     2.0
f    10.0
g    40.0
dtype: float64
df7 = DataFrame({
    'a':[1,NA,5,NA],
    'b':[NA,2,NA,6],
    'c':[2,4,6,8]
})
df8 = DataFrame({
    'a':[50,NA,40,NA,NA],
    'b':[40,50,60,70,80]
})
print(df7)
print(df8)
     a    b  c
0  1.0  NaN  2
1  NaN  2.0  4
2  5.0  NaN  6
3  NaN  6.0  8
      a   b
0  50.0  40
1   NaN  50
2  40.0  60
3   NaN  70
4   NaN  80
df7.combine_first(df8)
a	b	c
0	1.0	40.0	2.0
1	NaN	2.0	4.0
2	5.0	60.0	6.0
3	NaN	6.0	8.0
4	NaN	80.0	NaN
a = 3
b = 2
c = a if a>b else b
if a>b:
    c=a
else:
    c=b
3

xxx.combine(yyy, func)

df5 = DataFrame([[1,20],[10,3]])
df5
0	1
0	1	20
1	10	3
df6 = DataFrame([[12,2],[4,30]])
df6
0	1
0	12	2
1	4	30
# 保留合并位置 的 大的
df5.combine(df6,func=np.maximum)
0	1
0	12	20
1	10	30
# 保留合并位置 的 小的
df5.combine(df6,func=np.minimum)
0	1
0	1	2
1	4	3

你可能感兴趣的:(numpy,python,索引)