用python爬取链家二手房信息,并把数据存入mongodb

爬取网站:https://gz.lianjia.com/ershoufang/pg

爬取链家二手房信息,并将数据存入 MongoDB,代码如下:

import requests
from fake_useragent import UserAgent
from lxml import etree
import time
import random

from pymongo import MongoClient

class LianjiaSpider(object):
    """Scraper for Lianjia (Guangzhou) second-hand housing listings.

    Fetches listing pages, parses each house entry with XPath, and stores
    one document per page into a local MongoDB instance.
    """

    def __init__(self):
        # Page-number placeholder is filled in by run().
        self.url = 'https://gz.lianjia.com/ershoufang/pg{}/'

    def get_html(self, url):
        """Fetch *url* with a random User-Agent and return decoded HTML text."""
        ua = UserAgent()
        headers = {'User-Agent': ua.random}
        # timeout added so a stalled server cannot hang the crawl forever
        response = requests.get(url=url, headers=headers, timeout=10)
        return response.content.decode('utf-8')

    def parse_html(self, url):
        """Parse one listing page and return a dict describing all houses on it.

        Returns a dict with keys 'url', '描述' and '信息' (list of per-house
        dicts) — same structure the original stored into MongoDB.
        """
        html = self.get_html(url)
        parse_obj = etree.HTML(html)

        li_list = parse_obj.xpath('//ul[@class="sellListContent"]/li')

        house_list = []
        for li in li_list:
            # BUGFIX: the dict must be created per listing. Previously a
            # single dict was reused across iterations, so house_list held
            # N references to the SAME object (every row = last house).
            item = {}
            # First <a> in positionInfo is the estate/listing name.
            item['name'] = li.xpath('.//div[@class="positionInfo"]/a[1]/text()')[0].strip()
            # houseInfo layout, e.g.:
            # 2室1厅 | 51.99平米 | 西 | 精装 | 中楼层(共18层) | 2000年建 | 塔楼
            info_list = li.xpath('.//div[@class="houseInfo"]/text()')[0].split('|')
            item['model'] = info_list[0].strip()
            item['area'] = info_list[1].strip()
            item['direction'] = info_list[2].strip()
            item['perfect'] = info_list[3].strip()
            item['floor'] = info_list[4].strip()
            item['time'] = info_list[5].strip()[:-2]   # drop trailing "年建"
            # NOTE(review): when len(info_list) == 6 this re-reads index 5,
            # the same element already stored as 'time'. It looks intended
            # for the 7-field layout shown above — confirm against the live
            # markup before changing; behavior kept as-is.
            item['type'] = info_list[-1].strip() if len(info_list) == 6 else None
            # District, total price (万) and unit price.
            item['address'] = li.xpath('.//div[@class="positionInfo"]/a[2]/text()')[0].strip()
            item['total price'] = li.xpath('.//div[@class="totalPrice"]/span/text()')[0].strip()
            item['unit price'] = li.xpath('.//div[@class="unitPrice"]/span/text()')[0].strip()
            house_list.append(item)

        lianjia_info = dict()
        lianjia_info['url'] = self.url
        lianjia_info['描述'] = '链家二手房信息'
        lianjia_info['信息'] = house_list
        return lianjia_info

    def save_data(self, data):
        """Insert one page document into MongoDB (db: student_de, coll: class_one)."""
        client = MongoClient('mongodb://localhost:27017')
        try:
            db = client['student_de']  # this is a *database*, not a collection
            # insert() was removed in pymongo 4 — insert_one is the
            # supported single-document API.
            db['class_one'].insert_one(data)
        finally:
            client.close()  # previously leaked a connection per page

    def run(self):
        """Crawl pages 1..2, parsing and persisting each, with a polite delay."""
        for pg in range(1, 3):
            url = self.url.format(pg)
            data = self.parse_html(url)
            self.save_data(data)
            # Random pause to avoid hammering the site / tripping rate limits.
            time.sleep(random.randint(1, 3))


if __name__ == '__main__':
    # Script entry point: build the spider and crawl all configured pages.
    LianjiaSpider().run()

执行成功,结果如下图:

(图 1:程序运行结果截图)

你可能感兴趣的:(爬虫总结和详解)