import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
# Read lianjia1.csv and lianjia2.csv into a list of DataFrames, one per file.
datalist = []
for i in range(1, 3):
    path = 'lianjia{}.csv'.format(i)
    try:
        # GBK is tried first.
        data = pd.read_csv(path, encoding='gbk')
    except UnicodeDecodeError:
        # Only a decoding failure justifies the UTF-8 retry. Any other error
        # (missing file, malformed CSV, ...) is a real problem and must
        # surface instead of being masked by a second, equally failing read.
        data = pd.read_csv(path, encoding='utf-8')
    datalist.append(data)
将列表中所有数据框合并为一个数据框
# Stack the per-file frames into one. ignore_index=True gives the result a
# fresh 0..n-1 index; otherwise every file contributes its own 0..k labels and
# the duplicated labels break later index-aligned operations such as
# DataFrame.join, which would match each duplicated label several times.
data = pd.concat(datalist, ignore_index=True)
查看描述统计和字段格式信息
# Descriptive statistics of the merged frame (pandas summarises the numeric
# columns by default).
data.describe()
# Column names, non-null counts and dtypes of the merged frame.
data.info()
查看数据框前5行
# Preview the first five rows of the merged frame.
data.head(n=5)
# Narrow the frame to the four columns the rest of the analysis uses:
# cjdanjia (transaction unit price), cjxiaoqu, cjlouceng and bankuai.
data = data.loc[:, ['cjdanjia', 'cjxiaoqu', 'cjlouceng', 'bankuai']]
查看提取后的前5行
# Preview the first five rows of the narrowed frame.
data.head(n=5)
# Number of missing values in each column.
data.isna().sum()
查看有缺失值的具体行
# Show the rows whose transaction unit price (cjdanjia) is missing.
data.loc[data['cjdanjia'].isna()]
删除含有缺失值的行(dropna 默认 how = 'any',只要某行有一个字段缺失就删除该行);如果加入 how = 'all',则只删除所有字段全为空的行
# Drop, in place, every row that has at least one missing value. how='any' is
# the pandas default and is spelled out here because how='all' would instead
# drop only the rows whose fields are all empty.
data.dropna(axis=0, how='any', inplace=True)
# Number of rows that are exact duplicates of an earlier row.
data.duplicated().sum()
查看是否有除bankuai外的三个字段重复的行
# Number of rows that repeat an earlier row on the three fields other than
# bankuai.
data.duplicated(subset=['cjdanjia', 'cjxiaoqu', 'cjlouceng'], keep='first').sum()
查看重复值
# Show the rows that are exact duplicates of an earlier row.
data.loc[data.duplicated(keep='first')]
将数据集按照板块排序。要先排序之后再删除重复值:sort_values 默认把板块为空的行排在最后,而 drop_duplicates 默认保留每组重复行中的第一行;如果不先按照板块排序,保留下来的可能是板块为空的那一行,板块不为空的行反而被删除,会丢失信息
# Sort in place by bankuai, ascending, with missing values placed last, so a
# subsequent keep-first de-duplication prefers rows that carry a bankuai value.
# NOTE(review): the earlier default dropna() already removes rows with a
# missing bankuai - confirm whether this ordering still matters.
data.sort_values('bankuai', ascending=True, na_position='last', inplace=True)
删除重复值,查看排序并删重之后的数据是什么样子的
# Remove, in place, rows that repeat an earlier row on price, community and
# floor description; the first occurrence in the current order is kept.
data.drop_duplicates(subset=['cjdanjia', 'cjxiaoqu', 'cjlouceng'], keep='first', inplace=True)
# Preview the sorted, de-duplicated frame.
data.head(n=5)
# Number of price strings that do not contain the '元/平' unit suffix.
data['cjdanjia'].str.contains('元/平').eq(False).sum()
将成交单价列中的字符'元/平'去掉,转为float,以万元为单位,保留两位小数
# Strip the '元/平' suffix from the price strings, cast to float32, divide each
# value by 10000 and round to two decimals, replacing the cjdanjia column.
data = data.assign(
    cjdanjia=lambda frame: np.round(
        frame['cjdanjia']
        .str.replace('元/平', '')
        .astype(np.float32)
        .map(lambda yuan: yuan / 10000),
        2,
    )
)
# Smallest converted unit price, used to spot implausibly low values.
data['cjdanjia'].min()
剔除成交单价不高于0.5万元的数据(只保留成交单价大于0.5万元的记录)
# Keep only the rows whose unit price is strictly greater than 0.5.
data = data.loc[data['cjdanjia'].gt(0.5)]
# Bin edges for the unit price, in the same unit as cjdanjia.
bins = [0, 1, 3, 5, 7, 10, 15]
# Bar chart of how many rows fall into each price bin. value_counts() orders
# the bars by frequency (most common first), not by price range.
pd.cut(data['cjdanjia'], bins=bins).value_counts().plot.bar(rot=20)
# Number of cjlouceng strings that do not split into exactly three
# '/'-separated parts.
data['cjlouceng'].str.split('/').map(len).ne(3).sum()
将朝向、楼层分别提取出来,各存为单独的一列字段
# First '/'-separated part of cjlouceng becomes the chaoxiang (orientation)
# column.
data = data.assign(
    chaoxiang=lambda frame: frame['cjlouceng'].map(lambda text: text.split('/')[0])
)
# Second '/'-separated part of cjlouceng becomes the louceng (floor) column.
data = data.assign(
    louceng=lambda frame: frame['cjlouceng'].map(lambda text: text.split('/')[1])
)
返回列唯一值
# Distinct values found in the louceng (floor) column.
data['louceng'].unique()
将楼层列的字符串做独热编码(get_dummies):每个楼层类别生成一列指示变量,并拼接到原数据框中
# One-hot encode the louceng (floor) column and append the indicator columns.
# The dummies come from `data` itself, so their rows are already in the same
# order with the same index; a column-wise concat glues them on as they are.
# DataFrame.join would instead match on index labels, and the labels repeated
# across the concatenated source files would be matched several times each,
# multiplying rows. verify_integrity=True keeps join's behaviour of raising
# ValueError when a dummy column name collides with an existing column.
data = pd.concat([data, pd.get_dummies(data.louceng)], axis=1, verify_integrity=True)
# Preview the frame with the new indicator columns.
data.head()