创建DataFrame对象
创建DataFrame对象的数据可以为列表,数组和字典
import pandas as pd
import numpy as np
# Build a 2x3 DataFrame from a nested list, naming the columns and the row
# index explicitly.
a = [[1, 2, 3], [4, 5, 6]]  # two-dimensional matrix
df2 = pd.DataFrame(a, columns=['col1', 'col2', 'col3'], index=['a', 'b'])
print(df2)
结果:
col1 col2 col3
a 1 2 3
b 4 5 6
读取
excel格式
# Read one worksheet from an Excel workbook; `sheet_name` selects the sheet.
# (The older `sheetname` spelling is no longer accepted by current pandas.)
data = pd.read_excel('四创-渠道.xlsx', sheet_name='单篇图文')
保存成csv格式
不要index和header的保存
# Write the frame to CSV without the index column and without the header row.
data_train.to_csv('temp/data_train.csv', encoding='utf8', index=False, header=False)
一、增
列连接(横向:变宽):merge
前提:两个DataFrame要有同名的键列(此处为key),merge默认按同名列做连接
# Column-wise join: both frames share the 'key' column, and merge matches
# rows on it, producing one row per key with data1 and data2 side by side.
df1 = pd.DataFrame({'key': ['a', 'b', 'c'], 'data1': [1, 2, 3]})
df2 = pd.DataFrame({'key': ['a', 'b', 'c'], 'data2': [4, 5, 6]})
# Naming the join column explicitly gives the same result as the default
# (all shared column names) but makes the intent visible.
df3 = pd.merge(df1, df2, on='key')
结果:
df1:
key data1
0 a 1
1 b 2
2 c 3
df2:
key data2
0 a 4
1 b 5
2 c 6
df3
key data1 data2
0 a 1 4
1 b 2 5
2 c 3 6
行连接(纵向:变长):concat
前提:header要一致
import pandas as pd
# Row-wise concatenation of two frames that have the same columns.
df1 = pd.DataFrame({'key': ['a', 'b', 'c'], 'data': [1, 2, 3]})
df2 = pd.DataFrame({'key': ['d', 'e', 'f'], 'data': [4, 5, 6]})
# ignore_index=True discards the original row labels and renumbers the
# result 0..n-1, so the rows coming from df2 get labels 3, 4, 5.
df3 = pd.concat([df1, df2], ignore_index=True)
结果:
df1:
key data
0 a 1
1 b 2
2 c 3
df2:
key data
0 d 4
1 e 5
2 f 6
df3:
key data
0 a 1
1 b 2
2 c 3
3 d 4
4 e 5
5 f 6
根据索引合并
# Merge two frames on their row index instead of on a key column.
a = [[1, 2, 3], [4, 5, 6]]  # two-dimensional matrix
df1 = pd.DataFrame(a, columns=['col4', 'col5', 'col6'])
df2 = pd.DataFrame(a, columns=['col1', 'col2', 'col3'])
# left_index/right_index=True make the row labels of both sides the join key.
df = pd.merge(df1, df2, left_index=True, right_index=True)
DataFrame对象的合并
# Two single-column frames whose row labels only partly overlap
# ('e' exists only in df_a, 'g' only in dfb), used to demonstrate join().
df_a = pd.DataFrame(['wang', 'jing', 'hui', 'is', 'a', 'master'],
                    columns=['col6'], index=['a', 'b', 'c', 'd', 'e', 'f'])
dfb = pd.DataFrame([1, 2, 4, 5, 6, 7],
                   columns=['col1'], index=['a', 'b', 'c', 'd', 'f', 'g'])
df_a
col6
a wang
b jing
c hui
d is
e a
f master
dfb
col1
a 1
b 2
c 4
d 5
f 6
g 7
# join() defaults to a left join on the index: every row of dfb is kept and
# labels missing from df_a yield NaN.
print(dfb.join(df_a))
col1 col6
a 1 wang
b 2 jing
c 4 hui
d 5 is
f 6 master
g 7 NaN
# The `how` argument selects the join type.
print(dfb.join(df_a, how='inner'))  # only labels present in both frames
print(dfb.join(df_a, how='outer'))  # union of the labels of both frames
用某几列计算生成新列
方法一:
import pandas as pd
# Derive a new column from existing ones with element-wise column arithmetic.
df = pd.DataFrame({'key': ['a', 'b', 'c'], 'data1': [1, 2, 3], 'data2': [4, 5, 6]})
print(df)
df['data3'] = df['data1'] + df['data2']
print(df)
方法二:
import pandas as pd
import math
def testme(x):
print(x[‘data1’],x[‘data2’])
return x[‘data1’] + x[‘data2’]
df=pd.DataFrame({‘key’:[‘a’,‘b’,‘c’],‘data1’:[1,2,3],‘data2’:[4,5,6]})
print(df)
df[‘data3’]=df.apply(testme, axis=1)
print(df)
结果:
data1 data2 key
0 1 4 a
1 2 5 b
2 3 6 c
data1 data2 key data3
0 1 4 a 5
1 2 5 b 7
2 3 6 c 9
列扩充
# Starting frame for the column-extension example: 2 rows, 3 columns.
df2 = pd.DataFrame([[1, 2, 3], [4, 5, 6]],
                   columns=['col1', 'col2', 'col3'], index=['a', 'b'])
print(df2)
结果:
col1 col2 col3
a 1 2 3
b 4 5 6
# Assigning a list to a new column label appends that column; the list
# length must equal the number of rows.
df2['col4'] = ['cnn', 'rnn']
print(df2)
结果:
col1 col2 col3 col4
a 1 2 3 cnn
b 4 5 6 rnn
二、删
行删除:
DF.drop(['row-label'], axis=0) #axis=0时按行索引标签删除行
删除某列指定的几行
eg:删除日期在data_dele中的数据
data_dele = ['2018-09-13', '2018-09-14', '2018-09-15', '2018-09-16']
data = data[~data['日期'].isin(data_dele)]
列删除:
DF.drop(['column-name'], axis=1, inplace=False) #inplace默认为False,表示不在原本的DF上修改,而是返回新的DF;True表示在原本的DF上修改
del DF['column-name'] #直接改变原来的DF
删除后出现的重复值:df['city'].drop_duplicates()
删除先出现的重复值:df['city'].drop_duplicates(keep='last')
获取唯一的dataframe: data.drop_duplicates(['日期'])
import pandas as pd
import math
# Remove a column: drop() with axis=1 returns a new frame without 'data2',
# which is rebound to df here.
df = pd.DataFrame({'key': ['a', 'b', 'c'], 'data1': [1, 2, 3], 'data2': [4, 5, 6]})
print(df)
df = df.drop(['data2'], axis=1)
print(df)
结果:
data1 data2 key
0 1 4 a
1 2 5 b
2 3 6 c
data1 key
0 1 a
1 2 b
2 3 c
# Strip the trailing '%' from each string in column 'ld', then convert the
# column to float.
load['ld'] = load['ld'].str.rstrip('%').astype('float')