Python Scrapy: Scraping Lianjia Second-Hand Housing Listings

Continuing the Scrapy refresher, let's scrape Lianjia's second-hand housing listings. The main Scrapy features involved are:

  • CrawlSpider

  • Rule

  • LinkExtractor

  • MySQL data storage

  • Downloading the listing images

  • A quick look at the Lianjia second-hand listings site
    [screenshot: listing index page]

  • The detail page
    [screenshots: detail page fields]

  • Pagination (at most 100 pages can be scraped)
    [screenshot: pagination controls]

  • The approach:

  • First collect the links to the listing detail pages

  • Visit each detail page and extract the key fields

  • Locate the image links

  • Store the data in the database

  • Download the images to local disk

  • Handle pagination

  • All in all, Lianjia has essentially no anti-scraping measures; adding request headers is enough.

  • Straight to the code

  • The spider file

# -*- coding: utf-8 -*-
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from lianjia.items import LianjiaItem


class LjCrawlerSpider(CrawlSpider):
    name = 'lj_crawler'
    start_urls = ['https://qd.lianjia.com/ershoufang/']

    # Crawl rules
    rules = (
        # detail-page links
        Rule(LinkExtractor(restrict_xpaths="//ul[@class='sellListContent']/li/div[@class='info clear']/div[@class='title']/a"), follow=True, callback="process_item"),
        # pagination links
        Rule(LinkExtractor(restrict_xpaths="//div[@class='pagination_group_a']/a"), follow=True),
    )


    def process_item(self, response):
        item = LianjiaItem()
        # Extract the key fields from the detail page
        item['title'] = response.css('title::text').extract_first()
        item['price'] = response.css('div.overview div.content > div.price > span.total::text').extract_first()
        item['unit_price'] = response.css('div.overview div.content > div.price span.unitPriceValue::text').extract_first()
        item['community_name'] = response.css('div.overview div.content > div.aroundInfo > div.communityName > a::text').extract_first()
        item['region'] = response.css('div.areaName span.info a::text').extract()
        item['linkman'] = response.xpath('//div[@class="brokerInfoText fr"]/div[@class="brokerName"]/a/text()').extract_first()
        item['linktel'] = response.xpath('//div[@class="brokerInfoText fr"]/div[@class="phone"]/text()').extract()
        item['type'] = response.css('#introduction div.base ul > li:first-child::text').extract_first()
        item['construction_area'] = response.css('#introduction div.base ul > li:nth-child(3)::text').extract_first()
        item['actual_area'] = response.css('#introduction div.base ul > li:nth-child(5)::text').extract_first()
        item['orientation'] = response.css('#introduction div.base ul > li:nth-child(7)::text').extract_first()
        item['decoration'] = response.css('#introduction div.base ul > li:nth-child(9)::text').extract_first()
        item['floor'] = response.css('#introduction div.base ul > li:nth-child(2)::text').extract_first()
        item['elevator'] = response.css('#introduction div.base ul > li:nth-child(12)::text').extract_first()
        item['property'] = response.css('#introduction div.base ul > li:nth-child(13)::text').extract_first()
        item['house_years'] = response.css('#introduction div.transaction li:nth-child(5) span:nth-child(2)::text').extract_first()
        item['mortgage'] = response.css('#introduction div.transaction li:nth-child(7) span:nth-child(2)::text').extract_first(default='').strip()
        item['purposes'] = response.css('#introduction div.transaction ul > li:nth-child(4) span:nth-child(2)::text').extract_first()
        item['release_date'] = response.css('#introduction div.transaction ul > li:first-child span:nth-child(2)::text').extract_first()
        item['image_urls'] = response.css('div.content-wrapper img::attr(src)').extract()
        item['from_url'] = response.url
        yield item
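
  • Before launching the full crawl, the selectors can be sanity-checked interactively; a quick sketch using scrapy shell (the XPath is copied from the detail-page rule above):

# In a terminal: scrapy shell https://qd.lianjia.com/ershoufang/
# Inside the shell, try the link extractor and a sample selector:
from scrapy.linkextractors import LinkExtractor

le = LinkExtractor(restrict_xpaths="//ul[@class='sellListContent']/li/div[@class='info clear']/div[@class='title']/a")
print([link.url for link in le.extract_links(response)][:5])  # first few detail-page links
print(response.css('title::text').extract_first())            # page title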

  • The items.py file
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class LianjiaItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    # title
    title = scrapy.Field()
    # total price
    price = scrapy.Field()
    # price per square meter
    unit_price = scrapy.Field()
    # community name
    community_name = scrapy.Field()
    # district
    region = scrapy.Field()
    # contact person
    linkman = scrapy.Field()
    # contact phone
    linktel = scrapy.Field()
    # layout
    type = scrapy.Field()
    # built-up area
    construction_area = scrapy.Field()
    # usable area
    actual_area = scrapy.Field()
    # orientation
    orientation = scrapy.Field()
    # decoration status
    decoration = scrapy.Field()
    # floor
    floor = scrapy.Field()
    # elevator
    elevator = scrapy.Field()
    # property-right term
    property = scrapy.Field()
    # years since purchase
    house_years = scrapy.Field()
    # mortgage status
    mortgage = scrapy.Field()
    # intended use
    purposes = scrapy.Field()
    # listing date
    release_date = scrapy.Field()
    # photo URLs
    image_urls = scrapy.Field()
    # listing URL
    from_url = scrapy.Field()

  • The pipelines.py file
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import hashlib
import pymysql
from scrapy.exceptions import DropItem
from urllib.request import urlretrieve
from scrapy.utils.python import to_bytes
import os


class LianjiaPipeline(object):

    def __init__(self, settings):
        self.host = settings.get('HOST')
        self.port = settings.get('PORT')
        self.user = settings.get('USER')
        self.passwd = settings.get('PASSWD')
        self.db = settings.get('DB')
        self.charset = settings.get('CHARSET')
        self.table = settings.get('TABLE')
        self.settings = settings

    @classmethod
    def from_crawler(cls, crawler):

        return cls(crawler.settings)

    def open_spider(self, spider):
        self.conn = pymysql.connect(host=self.host, port=self.port, user=self.user, passwd=self.passwd, db=self.db, charset=self.charset)

        self.cursor = self.conn.cursor()

    def close_spider(self, spider):
        self.cursor.close()
        self.conn.close()

    def save_data(self, item):
        '''
        Store one item in the MySQL table.
        :param item:
        :return:
        '''
        keys = ', '.join(item.keys())
        values = ', '.join(['%s'] * len(item.keys()))
        insert_sql = "insert into `{}` ({}) values ({})".format(self.table, keys, values)
        try:
            self.cursor.execute(insert_sql, tuple(item.values()))
            self.conn.commit()
        except Exception as e:
            print(e.args)
            self.conn.rollback()

    def select_data(self, item):
        '''
        Deduplicate: check whether this listing URL is already stored.
        :param item:
        :return:
        '''
        value = item.get('from_url')
        select_sql = "select * from `{}` where from_url=%s".format(self.table)
        try:
            self.cursor.execute(select_sql, (value,))
            res = self.cursor.fetchall()
            return bool(res)
        except Exception as e:
            print(e.args)
            return False

    def process_item(self, item, spider):
        # Flatten the list fields into plain strings before storage.
        item['linktel'] = '-'.join(item['linktel'])
        item['region'] = '/'.join(item['region'])
        item['image_urls'] = ','.join(item['image_urls'])
        if not self.select_data(item):
            self.save_data(item)
        return item

class ImageDownloadPipeline(object):

    def __init__(self, settings):
        self.imagepath = settings.get('IMAGES_STORE')

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler.settings)

    def process_item(self, item, spider):
        '''
        Download the listing photos into a per-listing folder.
        :param item:
        :param spider:
        :return:
        '''
        image_urls = [url for url in item['image_urls'].split(',') if url]
        if not image_urls:
            raise DropItem('No images found!')

        # One folder per listing, named by the house id in the URL.
        house_id = item['from_url'].split('/')[-1].replace('.html', '')
        file_path = '%s/%s' % (self.imagepath, house_id)
        if not os.path.exists(file_path):
            os.makedirs(file_path)

        for image in image_urls:
            # Name each file by the SHA1 hash of its URL.
            image_guid = hashlib.sha1(to_bytes(image)).hexdigest()
            image_path = '%s/%s.jpg' % (file_path, image_guid)
            # Skip images that have already been downloaded.
            if not os.path.exists(image_path):
                urlretrieve(image, image_path)
        return item
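
  • The pipeline assumes the target table already exists in MySQL. A one-off helper to create it is sketched below; the column names follow the item fields, but the column types are my assumptions, not part of the original post:

# create_table.py -- one-off sketch to create the target table.
import pymysql

conn = pymysql.connect(host='127.0.0.1', port=3306, user='root',
                       passwd='123456', db='mycrawler', charset='utf8')
ddl = """
CREATE TABLE IF NOT EXISTS lianjia (
    id INT AUTO_INCREMENT PRIMARY KEY,
    title VARCHAR(255), price VARCHAR(32), unit_price VARCHAR(32),
    community_name VARCHAR(128), region VARCHAR(128),
    linkman VARCHAR(64), linktel VARCHAR(64),
    type VARCHAR(64), construction_area VARCHAR(64), actual_area VARCHAR(64),
    orientation VARCHAR(64), decoration VARCHAR(64), floor VARCHAR(64),
    elevator VARCHAR(32), property VARCHAR(64), house_years VARCHAR(64),
    mortgage VARCHAR(255), purposes VARCHAR(64), release_date VARCHAR(64),
    image_urls TEXT, from_url VARCHAR(255)
) DEFAULT CHARSET=utf8;
"""
with conn.cursor() as cursor:
    cursor.execute(ddl)
conn.commit()
conn.close()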

  • The settings.py file
# -*- coding: utf-8 -*-

# Scrapy settings for lianjia project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'lianjia'

SPIDER_MODULES = ['lianjia.spiders']
NEWSPIDER_MODULE = 'lianjia.spiders'

HOST = '127.0.0.1'
PORT = 3306
USER = 'root'
PASSWD = '123456'
DB = 'mycrawler'
CHARSET = 'utf8'
TABLE = 'lianjia'

IMAGES_STORE = 'C:/Users/wang/Desktop/lianjia/lianjia/images'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'lianjia (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36'
}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'lianjia.middlewares.LianjiaSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'lianjia.middlewares.LianjiaDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
   'lianjia.pipelines.LianjiaPipeline': 300,
   'lianjia.pipelines.ImageDownloadPipeline': 400,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

  • The run.py file
# -*- coding: utf-8 -*-
from scrapy import cmdline


cmdline.execute("scrapy crawl lj_crawler".split())
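
  • Equivalently, the crawl can be started from the project root on the command line with: scrapy crawl lj_crawler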
  • Results
    [screenshots: crawler output and the populated MySQL table]

Summary:

  • For downloading the images I originally wanted to use Scrapy's built-in ImagesPipeline, but I couldn't figure out how to store the images in separate directories: IMAGES_STORE seems to accept only one fixed path. How can a folder be generated dynamically per house id? If anyone knows how the pipeline methods should be overridden, please share; one possible direction is sketched below.
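
  • One possible direction (an untested sketch): subclass ImagesPipeline and override get_media_requests and file_path, deriving the sub-folder from the listing URL. The house_id meta key below is my own naming.

# pipelines.py (sketch) -- per-listing folders with Scrapy's ImagesPipeline.
# Assumes this class replaces ImageDownloadPipeline in ITEM_PIPELINES and
# that IMAGES_STORE is set as above.
import hashlib

from scrapy import Request
from scrapy.pipelines.images import ImagesPipeline
from scrapy.utils.python import to_bytes


class HouseImagesPipeline(ImagesPipeline):

    def get_media_requests(self, item, info):
        # Derive a folder name from the listing URL and attach it to
        # every image request via meta.
        house_id = item['from_url'].split('/')[-1].replace('.html', '')
        for url in item['image_urls'].split(','):
            yield Request(url, meta={'house_id': house_id})

    def file_path(self, request, response=None, info=None):
        # Save to <IMAGES_STORE>/<house_id>/<sha1-of-url>.jpg
        image_guid = hashlib.sha1(to_bytes(request.url)).hexdigest()
        return '{}/{}.jpg'.format(request.meta['house_id'], image_guid)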
  • This scrapes listings for Qingdao; to cover the whole country, start_urls can be adjusted accordingly, as sketched after this item.
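
  • For example, start_urls could be generated from Lianjia's city subdomains (a sketch; the city codes below are just examples):

# Cover several cities by generating start_urls from city subdomains
# (example codes: qd = Qingdao, bj = Beijing, sh = Shanghai).
cities = ['qd', 'bj', 'sh']
start_urls = ['https://{}.lianjia.com/ershoufang/'.format(city) for city in cities]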
