目录
1. 创建数据
1.1 创建Series
1.2 创建DataFrame
2. Series
2.1 类似于ndarray的Series
2.2 布尔子集:Series
2.3 操作自动对齐和向量化
2.3.1 同长度向量
2.3.2 向量和整数运算
2.3.3 不同长度向量间的运算
2.3.4 带有共同索引标签的向量
3. DataFrame
3.1 布尔子集
3.2 操作自动对齐和向量化
4. 更改Series和DataFrame
4.1 添加列
4.2 更改列
4.3 删除值
import pandas as pd
# Build a Series with hand-picked index labels: the `index` argument takes a
# Python list whose entries label the values one-to-one.
s = pd.Series(
    data=['Wes Mckinney', 'Creator of Pandas'],
    index=['Person', 'Who'],
)
print(s)
# Build a DataFrame from a dict: every key becomes a column name and every
# list supplies that column's values. No index is given, so the rows get the
# default integer labels.
scientists = pd.DataFrame(
    data={
        'Name': ['Rosaline Franklin', 'william Gosset'],
        'Occupation': ['Chemist', 'Statistician'],
        'Born': ['1920-07-25', '1876-06-13'],
        'Died': ['1958-04-16', '1937-10-16'],
        'Age': [37, 61],
    },
)
print(scientists)
# Create the `scientists` DataFrame again, this time with row index labels:
# `index` supplies the row labels and `columns` fixes the column order.
scientists = pd.DataFrame(
    data={
        'Occupation': ['Chemist', 'Statistician'],
        'Born': ['1920-07-25', '1876-06-13'],
        'Died': ['1958-04-16', '1937-10-16'],
        'Age': [37, 61],
    },
    index=['Rosaline Franklin', 'william Gosset'],
    columns=['Occupation', 'Born', 'Died', 'Age'],
)
print(scientists)

# Select a row by its index label; a single row comes back as a Series whose
# index is the DataFrame's column names.
# NOTE: despite the variable name, 'william Gosset' is the second row.
first_row = scientists.loc['william Gosset']
print(type(first_row))
print(first_row)

# Two attributes of a Series: its index labels and its underlying values.
print(first_row.index)
print(first_row.values)

# A Series method: keys() returns the same labels as the `index` attribute.
print(first_row.keys())

# Get the first index label through the attribute. The `[0]` is required;
# printing `first_row.index` alone shows the whole index, not its first entry.
print(first_row.index[0])

# Get the first index label through the method.
print(first_row.keys()[0])
# Load the scientists data set; the path is relative, so the CSV file must be
# in the current working directory.
scientists = pd.read_csv('scientists.csv')

# Pull out the Age column; selecting one column with [] yields a Series.
ages = scientists['Age']
print(ages)

# Basic descriptive statistics, each available as a Series method.
print(ages.mean())
print(ages.min())
print(ages.max())
print(ages.std())
# mean, min, max and std are also methods of numpy.ndarray.
ages = scientists['Age']
print(ages)

# Several summary statistics in a single call.
print(ages.describe())

# Mean of all ages.
print(ages.mean())

# Boolean subsetting: keep only the ages greater than the mean.
print(ages[ages > ages.mean()])

# Operations between two vectors of the same length are element-wise.
print(ages + ages)
print(ages * ages)

# An operation with a scalar is applied to every element.
print(ages + 100)
print(ages * 2)

# Vectors of different lengths are aligned on index labels; labels that are
# missing from one of the operands produce NaN in the result.
print(ages + pd.Series([1, 100]))
# Reverse the order of the ages by sorting the index in descending order.
rev_ages = ages.sort_index(ascending=False)
print(rev_ages)

# Reference output, to show that the next operation aligns on index labels.
print(ages * 2)

# Adding the reversed vector gives every label the same value as in ages * 2:
# elements are matched by index label, not by position.
print(ages + rev_ages)
# Use a boolean vector to select rows: scientists older than the mean age.
print(scientists[scientists['Age'] > scientists['Age'].mean()])

# Pass a boolean vector of 8 values to .loc; the 3 rows flagged True are
# returned. (This assumes the data set has exactly 8 rows.)
print(scientists.loc[[True, False, True, False, True, False, False, False]])

# Split the DataFrame into its first four rows and the remainder.
first_half = scientists[:4]
second_half = scientists[4:]
print(first_half)
print(second_half)

# A scalar operation is broadcast to every cell of the DataFrame
# (presumably numbers are doubled and strings are repeated - verify on output).
print(scientists * 2)

# Inspect the dtypes of the two date columns before converting them.
print(scientists['Born'].dtype)
print(scientists['Died'].dtype)
# The Born and Died columns have dtype object, which indicates strings.
# Parse the Born column into datetime values. Despite the `_dataframe` suffix,
# pd.to_datetime returns a Series here, because a Series is passed in.
born_dataframe = pd.to_datetime(scientists['Born'], format='%Y-%m-%d')
print(born_dataframe)

# Same conversion for the Died column.
died_dataframe = pd.to_datetime(scientists['Died'], format='%Y-%m-%d')
print(died_dataframe)

# Attach the parsed dates to the DataFrame as two new columns.
scientists['born_dt'] = born_dataframe
scientists['died_dt'] = died_dataframe
print(scientists.head())
import random

# Show the Age column before it is scrambled.
print(scientists['Age'])

# Seed the generator so the shuffle is reproducible.
random.seed(42)

# Shuffle a plain list of the values and assign it back to the column.
# Calling random.shuffle directly on scientists['Age'] mutates the Series
# returned by the column lookup in place; that is chained assignment, which
# triggers SettingWithCopy warnings and, under pandas Copy-on-Write, leaves
# the DataFrame itself unshuffled. The list-based shuffle applies the same
# permutation for the same seed.
shuffled_ages = scientists['Age'].tolist()
random.shuffle(shuffled_ages)
scientists['Age'] = shuffled_ages
print(scientists['Age'])

# Alternative: shuffle with pandas itself. Sampling as many values as the
# column holds returns all of them in random order, and `random_state` makes
# that order reproducible. reset_index(drop=True) discards the old index
# labels instead of inserting them as a column; without it the assignment
# would realign on those labels and undo the shuffle.
scientists['Age'] = (
    scientists['Age']
    .sample(len(scientists['Age']), random_state=24)
    .reset_index(drop=True)
)
print(scientists['Age'])
# Subtract the birth date from the death date to get the lifespan; the
# result is a timedelta column (a duration expressed in days, hours, ...).
scientists['age_days_dt'] = scientists['died_dt'] - scientists['born_dt']
print(scientists)

# Convert the lifespan to whole years.
# astype('timedelta64[Y]') is rejected by pandas >= 2.0, which only supports
# the 's', 'ms', 'us' and 'ns' timedelta resolutions. Instead, floor-divide
# the day count by the mean Gregorian year length (365.2425 days); the result
# is a float column of completed years.
scientists['age_days_dt'] = scientists['age_days_dt'].dt.days // 365.2425
print(scientists)
# Columns before the drop.
print(scientists.columns)

# Remove the scrambled Age column. `columns=` targets column labels (the same
# as passing the labels with axis=1). drop() returns a new DataFrame and
# leaves `scientists` itself untouched.
scientists_dropped = scientists.drop(columns=['Age'])

# Columns after the drop.
print(scientists_dropped.columns)