首先奉上爬虫demo,如果有直接需要数据的请评论留言,会分享。
import os
import re
import time
import requests
from pymongo import MongoClient
from info import rent_type, city_info
class Rent(object):
    """Crawl Lianjia rental listings and store them in MongoDB.

    For every rent type, city and district in the imported configuration the
    crawler first collects the district's business circles (bizcircles) from
    the mobile site and then pages through the app API one bizcircle at a
    time. Crawling per bizcircle is deliberate: a per-district query yields
    at most 2000 listings.
    """

    PAGE_SIZE = 30       # listings requested per API page
    MAX_RETRIES = 3      # consecutive failed attempts tolerated for one page
    RETRY_DELAY = 1      # seconds to wait before retrying a failed request
    BC_FILE = 'bc_list.txt'

    # Non-greedy character classes so that two attributes on one line cannot
    # be swallowed into a single capture.
    _PA_BC = re.compile(r'data-type="bizcircle" data-key="([^"]*)" class="oneline ">')
    _PA_LON = re.compile(r"longitude: '([^']*)',")
    _PA_LAT = re.compile(r"latitude: '([^']*)'")
    # At least one digit is required; ``\d*`` could match an empty string.
    _PA_DISTANCE = re.compile(r"(\d+)米")

    def __init__(self):
        """Load the crawl configuration and connect to MongoDB.

        The connection is configured through the MONGODB_HOST, MONGODB_PORT
        and MONGODB_DATABASE environment variables, falling back to
        127.0.0.1, 27017 and "Lianjia".
        """
        self.rent_type = rent_type
        self.city_info = city_info
        host = os.environ.get('MONGODB_HOST', '127.0.0.1')
        port = os.environ.get('MONGODB_PORT', '27017')
        mongo_url = 'mongodb://{}:{}'.format(host, port)
        mongo_db = os.environ.get('MONGODB_DATABASE', 'Lianjia')
        client = MongoClient(mongo_url)
        self.db = client[mongo_db]
        # The mobile-site URL is the de-duplication key.
        self.db['zufang'].create_index('m_url', unique=True)

    def get_data(self):
        """Crawl every rent type / city / district / bizcircle combination.

        :return: None
        """
        for ty, type_code in self.rent_type.items():
            for city, info in self.city_info.items():
                city_id, city_py, districts = info[0], info[1], info[2]
                for dist, dist_py in districts.items():
                    bc_list = self._fetch_bizcircles(city_py, dist_py)
                    if bc_list is None:
                        continue
                    # Round-trip through the text file so the bizcircle list
                    # of the district being crawled can be inspected on disk.
                    self._write_bc(bc_list)
                    for bc_name in self._read_bc():
                        item = {'city': city, 'type': ty, 'dist': dist}
                        self._crawl_bizcircle(city_id, bc_name, type_code, item)

    def _fetch_bizcircles(self, city_py, dist_py):
        """Return the bizcircle keys of one district, or None if the page cannot be fetched."""
        url = 'https://m.lianjia.com/chuzu/{}/zufang/{}/'.format(city_py, dist_py)
        try:
            res_bc = requests.get(url, timeout=10)
        except requests.RequestException:
            print('链接访问不成功,正在重试!')
            return None
        return self._PA_BC.findall(res_bc.text)

    def _crawl_bizcircle(self, city_id, bc_name, type_code, item):
        """Page through the listing API for one bizcircle and store every page.

        A page that keeps failing is abandoned after MAX_RETRIES consecutive
        attempts instead of being retried forever.
        """
        idx = 0
        failures = 0
        while True:
            url = ('https://app.api.lianjia.com/Rentplat/v1/house/list?city_id={}&condition={}'
                   '/rt{}&limit={}&offset={}&request_ts={}&scene=list').format(
                       city_id, bc_name, type_code, self.PAGE_SIZE,
                       idx * self.PAGE_SIZE, int(time.time()))
            try:
                res = requests.get(url=url, timeout=10)
                payload = res.json()['data']
                records = payload['list']
                total = payload['total'] or 0
            except (requests.RequestException, ValueError, KeyError, TypeError):
                failures += 1
                if failures >= self.MAX_RETRIES:
                    print('重试{}次仍不成功,跳过{}!'.format(self.MAX_RETRIES, bc_name))
                    return
                print('链接访问不成功,正在重试!')
                time.sleep(self.RETRY_DELAY)
                continue
            failures = 0
            print('成功爬取{}市{}-{}的{}第{}页数据!'.format(
                item['city'], item['dist'], bc_name, item['type'], idx + 1))
            self._parse_record(records, item)
            idx += 1
            if total / self.PAGE_SIZE <= idx:
                return

    def _parse_record(self, data, item):
        """Turn the listings of one API page into documents and upsert them.

        :param data: list of listing dicts from the API response (may be empty or None)
        :param item: dict with the shared ``city`` / ``type`` / ``dist`` fields
        :return: None
        """
        if not data:
            return
        for rec in data:
            # Work on a copy so values of one listing never leak into the next.
            doc = dict(item)
            doc['bedroom_num'] = rec.get('frame_bedroom_num')
            doc['hall_num'] = rec.get('frame_hall_num')
            doc['bathroom_num'] = rec.get('frame_bathroom_num')
            doc['rent_area'] = rec.get('rent_area')
            doc['house_title'] = rec.get('house_title')
            doc['resblock_name'] = rec.get('resblock_name')
            doc['bizcircle_name'] = rec.get('bizcircle_name')
            doc['layout'] = rec.get('layout')
            doc['rent_price_listing'] = rec.get('rent_price_listing')
            doc['house_tag'] = self._parse_house_tags(rec.get('house_tags'))
            doc['frame_orientation'] = rec.get('frame_orientation')
            doc['m_url'] = rec.get('m_url')
            doc['rent_price_unit'] = rec.get('rent_price_unit')
            doc.update(self._fetch_location(doc['m_url']))
            self.db['zufang'].update_one({'m_url': doc['m_url']}, {'$set': doc}, upsert=True)
            print('成功保存数据:{}!'.format(doc))

    @classmethod
    def _fetch_location(cls, m_url):
        """Scrape longitude, latitude and the first "<digits>米" distance from a detail page.

        All three values are None when the page cannot be fetched or when
        either coordinate is missing; ``distance`` alone may be None when no
        distance text is found.
        """
        location = {'longitude': None, 'latitude': None, 'distance': None}
        if not m_url:
            return location
        try:
            page = requests.get(m_url, timeout=5).text
        except requests.RequestException:
            return location
        lon = cls._PA_LON.search(page)
        lat = cls._PA_LAT.search(page)
        if lon is None or lat is None:
            return location
        distance = cls._PA_DISTANCE.search(page)
        location['longitude'] = lon.group(1)
        location['latitude'] = lat.group(1)
        location['distance'] = distance.group(1) if distance else None
        return location

    @staticmethod
    def _parse_house_tags(house_tag):
        """Join the ``name`` of every tag into one space-separated string.

        :param house_tag: list of tag dicts (the ``house_tags`` field), may be None
        :return: the joined names, or None when there are no tags
        """
        if not house_tag:
            return None
        return ' '.join(tag.get('name') or '' for tag in house_tag).strip()

    @staticmethod
    def _write_bc(bc_list):
        """Write the bizcircle keys to BC_FILE, one per line, replacing its content.

        :param bc_list: list of bizcircle keys
        :return: None
        """
        with open(Rent.BC_FILE, 'w', encoding='utf-8') as f:
            f.writelines(bc + '\n' for bc in bc_list)

    @staticmethod
    def _read_bc():
        """Read the bizcircle keys back from BC_FILE.

        :return: list of bizcircle keys with surrounding whitespace stripped
        """
        with open(Rent.BC_FILE, 'r', encoding='utf-8') as f:
            return [bc.strip() for bc in f]
if __name__ == '__main__':
    # Script entry point: build the crawler and run one full crawl.
    Rent().get_data()
其中的info.py文件
# Rent type label -> numeric code. The crawler places the code after "/rt"
# in the ``condition`` parameter of the listing API URL.
rent_type = {'整租': 200600000001, '合租': 200600000002}
# City name -> [city_id, city abbreviation, {district name: district pinyin}].
# The crawler sends city_id to the listing API and uses the abbreviation and
# the district pinyin to build the m.lianjia.com district URL.
city_info = {'北京': [110000, 'bj', {'东城': 'dongcheng', '西城': 'xicheng', '朝阳': 'chaoyang', '海淀': 'haidian',
'丰台': 'fengtai', '石景山': 'shijingshan', '通州': 'tongzhou', '昌平': 'changping',
'大兴': 'daxing', '亦庄开发区': 'yizhuangkaifaqu', '顺义': 'shunyi', '房山': 'fangshan',
'门头沟': 'mentougou', '平谷': 'pinggu', '怀柔': 'huairou', '密云': 'miyun',
'延庆': 'yanqing'}],
'上海': [310000, 'sh', {'静安': 'jingan', '徐汇': 'xuhui', '黄浦': 'huangpu', '长宁': 'changning',
'普陀': 'putuo', '浦东': 'pudong', '宝山': 'baoshan', '闸北': 'zhabei',
'虹口': 'hongkou','杨浦': 'yangpu', '闵行': 'minhang', '金山': 'jinshan',
'嘉定': 'jiading','崇明': 'chongming', '奉贤': 'fengxian', '松江': 'songjiang',
'青浦': 'qingpu'}],
'广州': [440100, 'gz', {'天河': 'tianhe', '越秀': 'yuexiu', '荔湾': 'liwan', '海珠': 'haizhu', '番禺': 'panyu',
'白云': 'baiyun', '黄埔': 'huangpu', '从化': 'conghua', '增城': 'zengcheng',
'花都': 'huadu', '南沙': 'nansha'}],
'深圳': [440300, 'sz', {'罗湖区': 'luohuqu', '福田区': 'futianqu', '南山区': 'nanshanqu',
'盐田区': 'yantianqu', '宝安区': 'baoanqu', '龙岗区': 'longgangqu',
'龙华区': 'longhuaqu', '光明区': 'guangmingqu', '坪山区': 'pingshanqu',
'大鹏新区': 'dapengxinqu'}]}
_id 唯一ID
bathroom_num 卫生间数量
bedroom_num 卧室数量
bizcircle_name 商圈名称
city 城市
dist 区
distance 距最近地铁站的距离(米)
frame_orientation 房屋朝向
hall_num 大厅数量
house_tag 房屋标签
house_title 房屋名称
latitude 纬度
layout 布局类型
longitude 经度
m_url 网站来源
rent_area 出租面积
rent_price_listing 价格
rent_price_unit 出租价格单位
resblock_name 小区名称
type 出租类型
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pylab import mpl
# Plot presets
mpl.rcParams['font.sans-serif'] = ['SimHei']  # font with CJK glyphs so Chinese labels render in seaborn
plt.style.use('ggplot')
plt.rc('figure', figsize=(10, 10))  # enlarge the default figure size
# Keep the minus sign '-' from showing up as a box in saved images.
# The original chained assignment also stored False in rcParams["figure.dpi"],
# which is not a meaningful DPI; the figure DPI is now left at its default.
mpl.rcParams['axes.unicode_minus'] = False
data = pd.read_csv('data_sample.csv')
print(data.info())
# Sampling step -- already done for this dataset, kept for reference only:
# draw 3000 rows per city and save them as a csv file.
#   data_sample = pd.concat([data[data['city'] == city].sample(3000) for city in ['北京', '上海', '广州', '深圳']])
#   data_sample.to_csv('data_sample.csv', index=False)

# Data cleaning, column by column.
# 1. Drop the "_id" column.
data = data.drop('_id', axis=1)
# 2, 3, 5. Print the distinct bathroom / bedroom / hall counts to look for
# outliers. Author's findings: listings with many bathrooms are shared flats,
# listings with 10+ bedrooms are dedicated shared rentals, and hall_num has
# no outliers.
for banner, column in (
        ('通过浴室检验异常值:', 'bathroom_num'),
        ('通过卧室检验异常值:', 'bedroom_num'),
        ('通过大厅检验异常值:', 'hall_num'),
):
    print(banner)
    print(data[column].unique())
    print('\n')
# 4. frame_orientation: the values are too messy, so cleaning is postponed
# until the column is needed (the result of this call is discarded).
data['frame_orientation'].unique()
# 6. rent_area: some values are ranges such as "23-25" square metres, which
# cannot be converted to a number directly; the mean of the range is used.
def get_aver(data):
    """Convert an area/price cell to a number, averaging "low-high" ranges.

    :param data: a number, a numeric string such as "30" or "23.5", a range
        string such as "23-25", or a missing value (None / NaN)
    :return: NaN for a missing value; the mean of the two bounds (float) for
        a range; otherwise an int for whole numbers and a float for values
        with a fractional part
    :raises ValueError: if the text cannot be parsed as a number or range
    """
    # Missing cells would make int()/float() fail, so pass them through as NaN.
    if pd.isna(data):
        return np.nan
    if isinstance(data, str):
        text = data.strip()
        if '-' in text:
            low, high = text.split('-', 1)
            return (float(low) + float(high)) / 2
        data = text
    # Parse through float so decimal values are not rejected or truncated.
    value = float(data)
    return int(value) if value.is_integer() else value
# 6. (continued) Normalise rent_area, then list suspiciously small areas.
data['rent_area'] = data['rent_area'].apply(get_aver)
print('通过面积检验异常值:')
tiny_area = data['rent_area'] < 5
print(data[tiny_area])  # author's finding: no outliers
print('\n')
# 7. rent_price_unit: every rent is quoted per month, so after printing the
# distinct values the column is dropped.
print(data['rent_price_unit'].unique())
data = data.drop('rent_price_unit', axis=1)
print('\n')
# 8. rent_price_listing: prices can be ranges too, so they get the same
# treatment as rent_area.
data['rent_price_listing'] = data['rent_price_listing'].apply(get_aver)
# Key step: cast the count and price columns to int in one go.
int_columns = ['bathroom_num', 'bedroom_num', 'hall_num', 'rent_price_listing']
data[int_columns] = data[int_columns].astype(int)
# 'distance', 'latitude' and 'longitude' contain None and are handled separately.
def to_int(data):
    """Convert a scalar cell to int, mapping missing values to NaN.

    :param data: a number, a numeric string, or a missing value (None / NaN)
    :return: ``int(data)``, or NaN when the value is missing (NaN is a float
        and cannot itself be converted to int)
    """
    # A scalar has no .isnull() method; pd.isna works on scalars.
    if pd.isna(data):
        return np.nan
    return int(data)
def to_float(data):
    """Convert a cell to float; None and the empty string become NaN."""
    is_blank = data is None or data == ''
    return np.nan if is_blank else float(data)
# Convert the three columns that may hold missing values to float.
for float_column in ('distance', 'latitude', 'longitude'):
    data[float_column] = data[float_column].apply(to_float)
print('\n')
print('数据清洗结束,查看数据:')
print(data.info())
问题:
各城市的租房分布怎么样?
城市各区域的房价分布怎么样?
距离地铁口远近与租金有什么关系?
房屋大小对价格的影响如何?
租个人房源好还是公寓好?
精装和简装对房子价格的影响
北方集中供暖对价格的影响
北上广深租房时都看重什么?
def get_city_zf_loc(city, city_short, col=None, data=data):
    """Prepare one city's listing coordinates and print how many rows have them.

    :param city: city name as stored in the ``city`` column
    :param city_short: abbreviation used to build the optional export file name
    :param col: columns to keep; defaults to longitude, latitude and dist
    :param data: listing frame (defaults to the module-level frame as it
        existed when this function was defined)
    :return: None -- the prepared frame is only counted, not returned or saved
    """
    # A None sentinel avoids sharing one mutable default list across calls.
    if col is None:
        col = ['longitude', 'latitude', 'dist']
    file_name = 'data_' + city_short + '_latlon.csv'
    # Copy explicitly so the column assignments below never touch `data`.
    data_latlon = data.loc[data['city'] == city, list(col)].dropna(subset=['latitude', 'longitude']).copy()
    data_latlon['longitude'] = data_latlon['longitude'].astype(str)
    data_latlon['latitude'] = data_latlon['latitude'].astype(str)
    # "longitude,latitude" in a single column.
    data_latlon['latlon'] = data_latlon['longitude'].str.cat(data_latlon['latitude'], sep=',')
    # data_latlon.to_csv(file_name, index=False)  # optional per-city export for finer analysis
    print(city+'的数据一共有{}条'.format(data_latlon.shape[0]))
# Columns passed below: longitude, latitude, district.
get_city_zf_loc('北京', 'bj', ['longitude', 'latitude', 'dist'])
get_city_zf_loc('上海', 'sh', ['longitude', 'latitude', 'dist'])
get_city_zf_loc('广州', 'gz', ['longitude', 'latitude', 'dist'])
get_city_zf_loc('深圳', 'sz', ['longitude', 'latitude', 'dist'])
# Listings per district, counted on rows that have coordinates. The city mask
# is built from the already-filtered frame so that mask and frame share one
# index (the original indexed the filtered frame with a mask built on `data`).
located = data.dropna(subset=['latitude', 'longitude'])
# Beijing districts
fig = plt.figure(dpi=300)
located.loc[located['city'] == '北京', 'dist'].value_counts(ascending=True).plot.barh()
plt.show()
# Shanghai districts
fig = plt.figure(dpi=300)
located.loc[located['city'] == '上海', 'dist'].value_counts(ascending=True).plot.barh()
plt.show()
# The charts for the other two cities are not drawn here.
# Distribution of the rent per square metre for the two cities.
data['aver_price'] = data['rent_price_listing'] / data['rent_area']  # rent per square metre
sns.distplot((data[data['city'] == '北京']['aver_price']), bins=100, label='Bei Jing')
plt.legend()
plt.show()
data['aver_price'] = data['rent_price_listing'] / data['rent_area']
sns.distplot((data[data['city'] == '上海']['aver_price']), bins=100, label='Shang Hai')
plt.legend()
plt.show()
# Rents per square metre are mostly within 250 yuan/m2/month, so only that
# part of the data is selected for the heat map.
def get_city_zf_aver_price(city, city_short, col=None, data=data):
    """Prepare one city's coordinates with rent per square metre and print the row count.

    Only rows with ``aver_price`` <= 250 and with both coordinates are kept.

    :param city: city name as stored in the ``city`` column
    :param city_short: abbreviation used to build the optional export file name
    :param col: columns to keep; defaults to longitude, latitude and aver_price
    :param data: listing frame (defaults to the module-level frame as it
        existed when this function was defined)
    :return: None -- the prepared frame is only counted, not returned or saved
    """
    # A None sentinel avoids sharing one mutable default list across calls.
    if col is None:
        col = ['longitude', 'latitude', 'aver_price']
    file_name = 'data_' + city_short + '_aver_price.csv'
    selected = (data['city'] == city) & (data['aver_price'] <= 250)
    # Copy explicitly so the column assignments below never touch `data`.
    data_latlon = data.loc[selected, list(col)].dropna(subset=['latitude', 'longitude']).copy()
    data_latlon['longitude'] = data_latlon['longitude'].astype(str)
    data_latlon['latitude'] = data_latlon['latitude'].astype(str)
    # Join longitude and latitude into one comma-separated column.
    data_latlon['latlon'] = data_latlon['longitude'].str.cat(data_latlon['latitude'], sep=',')
    # data_latlon.to_csv(file_name, index=False)  # export intentionally disabled
    print(city+'的数据一共有{}条'.format(data_latlon.shape[0]))
get_city_zf_aver_price('北京', 'bj')
get_city_zf_aver_price('上海', 'sh')
get_city_zf_aver_price('广州', 'gz')
get_city_zf_aver_price('深圳', 'sz')
# The 50 most expensive bizcircles overall, counted per city.
bc_mean_price = data.groupby(['city', 'bizcircle_name'])['aver_price'].mean()
priciest_50 = bc_mean_price.nlargest(50).reset_index()
bc_top50 = priciest_50['city'].value_counts()
print('最贵的top50:')
print(bc_top50)
from pyecharts import Bar
bar = Bar("每平米平均租金前50的北上广深商圈数量", width=400)
bar.add(
    "", bc_top50.index, bc_top50.values,
    is_stack=True,
    xaxis_label_textsize=16,
    yaxis_label_textsize=16,
    is_label_show=True,
)
bar.render('top50.html')
# Which bizcircles are the most expensive in each city?
def get_top10_bc(city, data=data):
    """Render a bar chart of the city's 10 bizcircles with the highest mean rent per square metre.

    Rows with an empty bizcircle name are ignored. The chart is written to
    "<city>.html".
    """
    in_city = data['city'] == city
    has_bizcircle = data['bizcircle_name'] != ''
    mean_price = data[in_city & has_bizcircle].groupby('bizcircle_name')['aver_price'].mean()
    top10_bc = mean_price.nlargest(10)
    bar = Bar(city+"市每平米平均租金Top10的商圈", width=600)
    bar.add(
        "", top10_bc.index, np.round(top10_bc.values, 0),
        is_stack=True,
        xaxis_label_textsize=16,
        yaxis_label_textsize=16,
        xaxis_rotate=30,
        is_label_show=True,
    )
    bar.render('{}.html'.format(city))
# One Top-10 chart per city.
for top10_city in ('北京', '上海', '广州', '深圳'):
    get_top10_bc(top10_city)
from scipy import stats
mpl.rcParams['font.sans-serif'] = ['SimHei']  # font with CJK glyphs so Chinese labels render in seaborn
# Rent per square metre.
data['aver_price'] = data['rent_price_listing'] / data['rent_area']
def distance_price_relation(city, data=data):
    """Plot rent per square metre against subway distance for one city.

    Rows with ``aver_price`` above 350 or without a distance are left out.
    The Pearson correlation is computed explicitly and written onto the
    joint axes, because newer seaborn releases no longer accept the
    ``stat_func`` argument of ``jointplot``.

    :param city: city name as stored in the ``city`` column
    :param data: listing frame (defaults to the module-level frame as it
        existed when this function was defined)
    :return: the seaborn JointGrid
    """
    subset = data[(data['city'] == city) & (data['aver_price'] <= 350)].dropna(subset=['distance'])
    g = sns.jointplot(x="distance", y="aver_price", data=subset, kind="reg")
    # A correlation needs at least two points.
    if len(subset) >= 2:
        r_value, p_value = stats.pearsonr(subset['distance'], subset['aver_price'])
        g.ax_joint.annotate('pearsonr = {:.2g}; p = {:.2g}'.format(r_value, p_value),
                            xy=(0.97, 0.97), xycoords='axes fraction',
                            ha='right', va='top')
    g.fig.set_dpi(100)
    g.ax_joint.set_xlabel('最近地铁距离', fontweight='bold')
    g.ax_joint.set_ylabel('每平米租金', fontweight='bold')
    plt.show()
    return g
# Only the Beijing chart is drawn.
distance_price_relation('北京')
# Cut the subway distance into 100-wide bins from 0 to 1200.
bins = list(range(0, 1300, 100))
data['bin'] = pd.cut(data['distance'].dropna(), bins)
# Mean rent per square metre of every distance bin, one series per city.
bin_bj, bin_sh, bin_gz, bin_sz = (
    data[data['city'] == city_name].groupby('bin')['aver_price'].mean()
    for city_name in ('北京', '上海', '广州', '深圳')
)
# Only the Beijing series is printed.
print(bin_bj)
from pyecharts import Line
line = Line("距离地铁远近跟每平米租金均价的关系")
for city, bin_data in {'北京': bin_bj, '上海': bin_sh, '广州': bin_gz, '深圳': bin_sz}.items():
    line.add(city, bin_data.index, bin_data.values,
             legend_text_size=18, xaxis_label_textsize=14, yaxis_label_textsize=18,
             xaxis_rotate=20, yaxis_min=8, legend_top=30)
# Render once, after every city has been added, to a dedicated file. The
# original wrote to '<city>.html', which overwrote the per-city Top-10
# bizcircle charts rendered under the same names.
line.render('distance_price.html')
data['aver_price'] = data['rent_price_listing'] / data['rent_area']
# Area vs. price.
# TODO: look for a simpler way to draw this.
def area_price_relation(city, data=data):
    """Draw rent per square metre against rented area for one city.

    Only listings smaller than 150 are plotted; no confidence band is drawn.
    Returns the matplotlib axes produced by seaborn.
    """
    plt.figure(dpi=100)
    city_small = data[(data['city'] == city) & (data['rent_area'] < 150)]
    g = sns.lineplot(x="rent_area", y="aver_price", data=city_small, ci=None)
    g.set_xlabel('面积', fontweight='bold')
    g.set_ylabel('每平米均价', fontweight='bold')
    plt.show()
    return g
area_price_relation('北京')
# Derive is_dep ("is an apartment") from house_title and house_tag.
# na=False treats a missing title/tag as "no match"; adding the two raw
# results turned a missing tag into False even when the title matched.
data['is_dep'] = (data['house_title'].str.contains('公寓', na=False)
                  | data['house_tag'].str.contains('公寓', na=False))
# Share of apartments among each city's listings.
for city in ['北京', '上海', '广州', '深圳']:
    print(city+'的公寓占总房源量比重为:{}%。'.format(
        np.round(data[data['city'] == city]['is_dep'].mean()*100, 2)))
print('看一下广州,面积在0到60的,价格大于100的房源中,公寓的比例:')
ret = data[(data['city'] == '广州') & (data['rent_area'] > 0) & (data['rent_area'] < 60) &
           (data['aver_price'] > 100)]['is_dep'].mean()
print(ret)
# na=False treats a missing title/tag as "no match" instead of letting a
# missing tag hide a matching title.
data['is_dep'] = (data['house_title'].str.contains('公寓', na=False)
                  | data['house_tag'].str.contains('公寓', na=False))
data['aver_price'] = data['rent_price_listing'] / data['rent_area']
# Mean rent per square metre in Guangzhou and Shenzhen, apartments vs. the rest.
is_dep = data[(data['city'].isin(['广州', '深圳'])) &
              (data['is_dep'] == 1)].groupby('city')['aver_price'].mean()
not_dep = data[(data['city'].isin(['广州', '深圳'])) &
               (data['is_dep'] == 0)].groupby('city')['aver_price'].mean()
from pyecharts import Bar
bar = Bar("个人房源和公寓的每平米租金差别", width=600)
bar.add("个人房源", not_dep.index, np.round(not_dep.values, 0),
        legend_text_size=18, xaxis_label_textsize=14, yaxis_label_textsize=18,
        yaxis_min=8, legend_top=30, is_label_show=True)
bar.add("公寓", is_dep.index, np.round(is_dep.values, 0),
        legend_text_size=18, xaxis_label_textsize=14, yaxis_label_textsize=18,
        yaxis_min=8, legend_top=30, is_label_show=True)
# Explicit file name: render() without a path writes the shared default
# file, which the later charts in this script overwrite.
bar.render('dep_vs_personal.html')
from pyecharts import Bar
# na=False treats a missing title/tag as "no match" instead of letting a
# missing tag hide a matching title.
data['is_dep'] = (data['house_title'].str.contains('公寓', na=False)
                  | data['house_tag'].str.contains('公寓', na=False))
data['aver_price'] = data['rent_price_listing'] / data['rent_area']
# Rows without a tag stay NaN here, so they fall into neither group below.
data['decorated'] = data[data['house_tag'].notna()]['house_tag'].str.contains('精装')
decorated = data[data['decorated'] == 1].groupby('city')['aver_price'].mean()
not_decorated = data[data['decorated'] == 0].groupby('city')['aver_price'].mean()
bar = Bar("各城市精装和简装的每平米租金差别", width=600)
bar.add("精装(刷过墙)", decorated.index, np.round(decorated.values, 0),
        legend_text_size=18, xaxis_label_textsize=14, yaxis_label_textsize=18,
        yaxis_min=8, legend_top=30, is_label_show=True)
bar.add("简装(破房子)", not_decorated.index, np.round(not_decorated.values, 0),
        legend_text_size=18, xaxis_label_textsize=14, yaxis_label_textsize=18,
        yaxis_min=8, legend_top=30, is_label_show=True)
# Explicit file name: render() without a path writes the shared default
# file, which the later charts in this script overwrite.
bar.render('decorated.html')
# Mean rent per square metre in Guangzhou and Shenzhen, split by decoration
# and listing type.
in_gz_sz = data['city'].isin(['广州', '深圳'])
is_dec_dep = data[(data['decorated'] == 1) &
                  (data['is_dep'] == 1) &
                  in_gz_sz].groupby('city')['aver_price'].mean()
is_dec_not_dep = data[(data['decorated'] == 1) &
                      (data['is_dep'] == 0) &
                      in_gz_sz].groupby('city')['aver_price'].mean()
# Not decorated and not an apartment (plotted as "简装个人房源").
not_dec_dep = data[(data['decorated'] == 0) &
                   (data['is_dep'] == 0) &
                   in_gz_sz].groupby('city')['aver_price'].mean()
bar = Bar("各城市装修和房源类型的每平米租金差别", width=600)
bar.add("精装公寓", is_dec_dep.index, np.round(is_dec_dep.values, 0),
        legend_text_size=18, xaxis_label_textsize=14, yaxis_label_textsize=18,
        yaxis_min=8, legend_top=30, is_label_show=True)
bar.add("精装个人房源", is_dec_not_dep.index, np.round(is_dec_not_dep.values, 0),
        legend_text_size=18, xaxis_label_textsize=14, yaxis_label_textsize=18,
        yaxis_min=8, legend_top=30, is_label_show=True)
bar.add("简装个人房源", not_dec_dep.index, np.round(not_dec_dep.values, 0),
        legend_text_size=18, xaxis_label_textsize=14, yaxis_label_textsize=18,
        yaxis_min=8, legend_top=30, is_label_show=True)
# Explicit file name: render() without a path writes the shared default
# file, which the later chart in this script overwrites.
bar.render('decorated_by_type.html')
# Flag listings whose tags mention central heating, then compare the mean
# rent per square metre of both groups in Beijing.
data['ct_heating'] = data['house_tag'].str.contains('集中供暖')
beijing = data[data['city'] == '北京']
ret = beijing.groupby('ct_heating')['aver_price'].mean()
print(ret)
def layout_top3(city, data, file_name='beijing.html'):
    """Render a bar chart of the three most frequent layouts in one city.

    :param city: city name as stored in the ``city`` column
    :param data: listing frame with ``city`` and ``layout`` columns
    :param file_name: output HTML file; the default keeps the previously
        hard-coded name, pass another name when charting a different city
    :return: the pyecharts Bar object
    """
    layout_data = data[data['city'] == city]['layout'].value_counts().nlargest(3)
    bar = Bar(city+"最受欢迎的户型", width=600)
    bar.add("", layout_data.index, layout_data.values,
            legend_text_size=18, xaxis_label_textsize=14, yaxis_label_textsize=18,
            yaxis_min=8, legend_top=30, is_label_show=True)
    bar.render(file_name)
    return bar
layout_top3('北京', data)
# Build a word cloud from the tags of Beijing listings.
from pyecharts import WordCloud
beijing_tags = data[data['city']=='北京'].dropna(subset=['house_tag'])['house_tag']
# Every house_tag cell is a space-separated list of tags.
bj_tag = [tag for tag_text in beijing_tags for tag in tag_text.split(' ')]
ciyun = pd.Series(bj_tag).value_counts()
name = ciyun.index
value = ciyun.values
wordcloud = WordCloud(width=500, height=500)
wordcloud.add("", name, value, word_size_range=[20, 100])
wordcloud.render('ciyun.html')
没太看懂这块的想法
# Per-city reference price, keyed by city so the pairing no longer depends
# on the sort order of the groupby result. The values keep the pairing the
# original positional list had with the sorted city index.
# NOTE(review): presumably the average sale price per square metre -- confirm the source.
house_price = pd.Series({'上海': 57036, '北京': 62779, '广州': 32039, '深圳': 56758})
by_city = data.groupby('city')
# Total listed rent divided by total rented area, per city.
rent_per_area = by_city['rent_price_listing'].sum() / by_city['rent_area'].sum()
# Reference price / rent per area / 12; cities without a reference price are dropped.
zs_ratio = (house_price.reindex(rent_per_area.index) / rent_per_area / 12).dropna()
print(zs_ratio)
bar = Bar("各城市房屋租售比(租多少年可以在该城市买下一套房)", width=450)
bar.add("", zs_ratio.index, np.round(zs_ratio.values, 0),
        legend_text_size=18, xaxis_label_textsize=14, yaxis_label_textsize=18,
        yaxis_min=8, legend_top=30, is_label_show=True)
bar.render()