爬取网站:https://gz.lianjia.com/ershoufang/pg{页码}/
爬取链家二手房信息,并将数据存入 MongoDB,代码如下:
import requests
from fake_useragent import UserAgent
from lxml import etree
import time
import random
from pymongo import MongoClient
class LianjiaSpider(object):
    """Scrape Lianjia second-hand housing list pages and store them in MongoDB.

    Each list page is downloaded, every listing on it is turned into a dict,
    and the page's listings are inserted as one document into the
    ``class_one`` collection of the ``student_de`` database.
    """

    # Suffix carried by the build-year field, e.g. "2000年建".
    _YEAR_SUFFIX = '年建'
    # Keys for the first five "|"-separated houseInfo fields, in order.
    _LEADING_KEYS = ('model', 'area', 'direction', 'perfect', 'floor')
    # Seconds to wait on the HTTP request before giving up instead of hanging.
    _REQUEST_TIMEOUT = 10

    def __init__(self):
        # List-page URL template; "{}" is filled with the page number in run().
        self.url = 'https://gz.lianjia.com/ershoufang/pg{}/'

    def get_html(self, url):
        """Download *url* with a random User-Agent and return the body as text.

        Args:
            url: Fully formatted page URL.

        Returns:
            The response body decoded as UTF-8.

        Raises:
            requests.exceptions.RequestException: on connection failure or
                when the request exceeds the timeout.
            UnicodeDecodeError: if the body is not valid UTF-8.
        """
        headers = {'User-Agent': UserAgent().random}
        response = requests.get(
            url=url, headers=headers, timeout=self._REQUEST_TIMEOUT)
        return response.content.decode('utf-8')

    @staticmethod
    def _first_text(node, path):
        """Return the stripped first result of XPath *path* under *node*, or None."""
        found = node.xpath(path)
        return found[0].strip() if found else None

    @classmethod
    def _parse_house_info(cls, info_text):
        """Split a "|"-separated houseInfo string into named fields.

        The sample recorded with the original code is
        ``2室1厅 | 51.99平米 | 西 | 精装 | 中楼层(共18层) | 2000年建 | 塔楼``.
        The first five fields are positional. Any further field is treated as
        the build year when it ends with "年建" (the suffix is removed) and as
        the building type otherwise, so a record lacking either one is still
        labelled correctly.

        Args:
            info_text: Raw houseInfo text, or None when the node is missing.

        Returns:
            A dict with keys ``model``, ``area``, ``direction``, ``perfect``,
            ``floor``, ``time`` and ``type``; absent fields map to None.
        """
        fields = [part.strip() for part in (info_text or '').split('|')]
        info = dict.fromkeys(cls._LEADING_KEYS)
        for key, value in zip(cls._LEADING_KEYS, fields):
            info[key] = value or None
        info['time'] = None
        info['type'] = None
        for extra in fields[len(cls._LEADING_KEYS):]:
            if extra.endswith(cls._YEAR_SUFFIX):
                info['time'] = extra[:-len(cls._YEAR_SUFFIX)]
            elif extra:
                info['type'] = extra
        return info

    @classmethod
    def _parse_listing(cls, li):
        """Build the dict for one listing ``<li>`` node.

        A fresh dict is created per listing; reusing a single dict across
        listings would make every stored entry equal to the last one parsed.
        """
        item = {
            'name': cls._first_text(
                li, './/div[@class="positionInfo"]/a[1]/text()'),
        }
        item.update(cls._parse_house_info(
            cls._first_text(li, './/div[@class="houseInfo"]/text()')))
        # District, total price and unit price.
        item['address'] = cls._first_text(
            li, './/div[@class="positionInfo"]/a[2]/text()')
        item['total price'] = cls._first_text(
            li, './/div[@class="totalPrice"]/span/text()')
        item['unit price'] = cls._first_text(
            li, './/div[@class="unitPrice"]/span/text()')
        return item

    def parse_html(self, url):
        """Fetch one list page and extract all listings on it.

        Args:
            url: Fully formatted page URL.

        Returns:
            A dict with ``url`` (the page that was scraped), ``描述`` (a fixed
            description string) and ``信息`` (list of per-listing dicts).
        """
        html = self.get_html(url)
        parse_obj = etree.HTML(html)
        li_list = parse_obj.xpath('//ul[@class="sellListContent"]/li')
        house_list = [self._parse_listing(li) for li in li_list]
        return {
            # Record the concrete page URL, not the "{}" template, so stored
            # documents can be told apart by page.
            'url': url,
            '描述': '链家二手房信息',
            '信息': house_list,
        }

    def save_data(self, data):
        """Insert *data* as one document into ``student_de.class_one``.

        Args:
            data: The page dict produced by parse_html().
        """
        client = MongoClient('mongodb://localhost:27017')
        try:
            # insert_one replaces Collection.insert, which PyMongo 4 removed.
            client['student_de']['class_one'].insert_one(data)
        finally:
            client.close()

    def run(self):
        """Scrape pages 1 and 2, saving each, with a random 1-3 s pause after each."""
        for pg in range(1, 3):
            url = self.url.format(pg)
            data = self.parse_html(url)
            self.save_data(data)
            time.sleep(random.randint(1, 3))
if __name__ == '__main__':
    # Start the crawl only when this file is executed as a script.
    spider = LianjiaSpider()
    spider.run()
执行成功,结果如下图: