Continuing the Scrapy refresher, this time by scraping Lianjia second-hand housing listings. The Scrapy pieces involved are:
CrawlSpider
Rule
LinkExtractor
MySQL storage
Downloading the listing photos
Approach:
Grab the links to the listing detail pages
Open each detail page and extract the key fields
Collect the image links
Write the data to the database
Download the images to local disk
Follow the pagination
After some back and forth it turns out Lianjia has hardly any anti-scraping in place; adding request headers (the DEFAULT_REQUEST_HEADERS in settings.py below) is enough.
Straight to the code.
spider file
# -*- coding: utf-8 -*-
from scrapy.spiders import CrawlSpider,Rule
from scrapy.linkextractors import LinkExtractor
from lianjia.items import LianjiaItem
class LjCrawlerSpider(CrawlSpider):
name = 'lj_crawler'
start_urls = ['https://qd.lianjia.com/ershoufang/']
    # crawl rules
    rules = (
        # listing detail page links
        Rule(LinkExtractor(restrict_xpaths="//ul[@class='sellListContent']/li/div[@class='info clear']/div[@class='title']/a"), follow=True, callback="process_item"),
        # pagination links
        Rule(LinkExtractor(restrict_xpaths="//div[@class='pagination_group_a']/a"), follow=True),
    )
def process_item(self, response):
item = LianjiaItem()
        # extract the key fields
item['title'] = response.css('title::text').extract_first()
item['price'] = response.css('div.overview div.content > div.price > span.total::text').extract_first()
item['unit_price'] = response.css('div.overview div.content > div.price span.unitPriceValue::text').extract_first()
item['community_name'] = response.css('div.overview div.content > div.aroundInfo > div.communityName > a::text').extract_first()
item['region'] = response.css('div.areaName span.info a::text').extract()
item['linkman'] = response.xpath('//div[@class="brokerInfoText fr"]/div[@class="brokerName"]/a/text()').extract_first()
item['linktel'] = response.xpath('//div[@class="brokerInfoText fr"]/div[@class="phone"]/text()').extract()
item['type'] = response.css('#introduction div.base ul > li:first-child::text').extract_first()
item['construction_area'] = response.css('#introduction div.base ul > li:nth-child(3)::text').extract_first()
item['actual_area'] = response.css('#introduction div.base ul > li:nth-child(5)::text').extract_first()
item['orientation'] = response.css('#introduction div.base ul > li:nth-child(7)::text').extract_first()
item['decoration'] = response.css('#introduction div.base ul > li:nth-child(9)::text').extract_first()
item['floor'] = response.css('#introduction div.base ul > li:nth-child(2)::text').extract_first()
item['elevator'] = response.css('#introduction div.base ul > li:nth-child(12)::text').extract_first()
item['property'] = response.css('#introduction div.base ul > li:nth-child(13)::text').extract_first()
item['house_years'] = response.css('#introduction div.transaction li:nth-child(5) span:nth-child(2)::text').extract_first()
        item['mortgage'] = response.css('#introduction div.transaction li:nth-child(7) span:nth-child(2)::text').extract_first(default='').strip()
item['purposes'] = response.css('#introduction div.transaction ul > li:nth-child(4) span:nth-child(2)::text').extract_first()
item['release_date'] = response.css('#introduction div.transaction ul > li:first-child span:nth-child(2)::text').extract_first()
item['image_urls'] = response.css('div.content-wrapper img::attr(src)').extract()
item['from_url'] = response.url
yield item
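The CSS and XPath selectors above are tied to Lianjia's page structure at the time of writing, so it is worth sanity-checking a few of them in scrapy shell before launching the full crawl. A quick session (the detail URL is only a placeholder, use any real listing URL):

scrapy shell "https://qd.lianjia.com/ershoufang/<listing-id>.html"
>>> response.css('div.overview div.content > div.price > span.total::text').extract_first()
>>> response.css('div.areaName span.info a::text').extract()
>>> response.xpath('//div[@class="brokerInfoText fr"]/div[@class="brokerName"]/a/text()').extract_first()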
items file
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class LianjiaItem(scrapy.Item):
# define the fields for your item here like:
# name = scrapy.Field()
    # title
    title = scrapy.Field()
    # total price
    price = scrapy.Field()
    # unit price
    unit_price = scrapy.Field()
    # community name
    community_name = scrapy.Field()
    # district
    region = scrapy.Field()
    # contact person
    linkman = scrapy.Field()
    # contact phone
    linktel = scrapy.Field()
    # layout / house type
    type = scrapy.Field()
    # construction area
    construction_area = scrapy.Field()
    # actual usable area
    actual_area = scrapy.Field()
    # orientation
    orientation = scrapy.Field()
    # decoration
    decoration = scrapy.Field()
    # floor
    floor = scrapy.Field()
    # elevator
    elevator = scrapy.Field()
    # property ownership term
    property = scrapy.Field()
    # years of ownership
    house_years = scrapy.Field()
    # mortgage status
    mortgage = scrapy.Field()
    # house usage
    purposes = scrapy.Field()
    # listing date
    release_date = scrapy.Field()
    # house photos
    image_urls = scrapy.Field()
    # listing URL
    from_url = scrapy.Field()
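LianjiaItem behaves like a dict, which is what lets the MySQL pipeline below build its INSERT statement dynamically from whatever fields were filled in. A rough sketch of that expansion with a trimmed-down item:

item = {'title': 'some title', 'price': '120', 'from_url': 'https://example.invalid/123.html'}
keys = ', '.join(item.keys())            # -> "title, price, from_url"
values = ', '.join(['%s'] * len(item))   # -> "%s, %s, %s"
sql = "insert into `lianjia`({}) values({})".format(keys, values)
# passing tuple(item.values()) to cursor.execute lets pymysql handle the quoting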
pipelines file
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import hashlib
import pymysql
from scrapy.exceptions import DropItem
from urllib.request import urlretrieve
from scrapy.utils.python import to_bytes
import os
class LianjiaPipeline(object):
def __init__(self, settings):
self.host = settings.get('HOST')
self.port = settings.get('PORT')
self.user = settings.get('USER')
self.passwd = settings.get('PASSWD')
self.db = settings.get('DB')
self.charset = settings.get('CHARSET')
self.table = settings.get('TABLE')
self.settings = settings
@classmethod
def from_crawler(cls, crawler):
return cls(crawler.settings)
def open_spider(self, spider):
self.conn = pymysql.connect(host=self.host, port=self.port, user=self.user, passwd=self.passwd, db=self.db, charset=self.charset)
        self.cursor = self.conn.cursor()
def close_spider(self, spider):
        self.cursor.close()
self.conn.close()
def save_data(self, item):
'''
        Store the item in the MySQL table
:param item:
:return:
'''
keys = ', '.join(item.keys())
values = ', '.join(['%s'] * len(item.keys()))
insert_sql = "insert into `{}`({})values({})".format(self.table, keys, values)
try:
            self.cursor.execute(insert_sql, tuple(item.values()))
self.conn.commit()
except Exception as e:
print(e.args)
self.conn.rollback()
def select_data(self, item):
'''
        Duplicate check by from_url
:param item:
:return:
'''
value = item.get('from_url')
        select_sql = "select * from `{}` where from_url=%s".format(self.table)
try:
            self.cursor.execute(select_sql, (value,))
            res = self.cursor.fetchall()
if res:
return True
else:
return False
except Exception as e:
print(e.args)
return False
def process_item(self, item, spider):
item['linktel'] = '-'.join(item['linktel'])
item['region'] = '/'.join(item['region'])
item['image_urls'] = ','.join(item['image_urls'])
if not self.select_data(item):
self.save_data(item)
return item
class ImageDownloadPipeline(object):
def __init__(self, settings):
self.imagepath = settings.get('IMAGES_STORE')
@classmethod
def from_crawler(cls, crawler):
return cls(crawler.settings)
def process_item(self, item, spider):
'''
        Download the listing photos to local disk
:param item:
:param spider:
:return:
'''
for image in item['image_urls'].split(','):
            # name each image after the sha1 hash of its URL
image_guid = hashlib.sha1(to_bytes(image)).hexdigest()
image_name = '%s.jpg' % (image_guid)
house_id = item['from_url'].split('/')[-1].replace('.html','')
file_path = '%s/%s'%(self.imagepath, house_id)
if not os.path.exists(file_path):
os.makedirs(file_path)
image_path = '%s/%s/%s'%(self.imagepath, house_id, image_name)
if not os.path.exists(image_path):
urlretrieve(image, image_path)
            else:
                # this listing's photos already exist locally; drop the item
                raise DropItem('It exists!')
        return item
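The pipeline assumes the lianjia table already exists in the mycrawler database. A rough one-off script to create it (the column types are my own guesses, kept as plain text fields for simplicity; the UNIQUE index on from_url would also let MySQL enforce the dedup that select_data does by hand):

import pymysql

ddl = """
create table if not exists `lianjia` (
    id int auto_increment primary key,
    title varchar(255), price varchar(32), unit_price varchar(32),
    community_name varchar(128), region varchar(128),
    linkman varchar(64), linktel varchar(64),
    type varchar(64), construction_area varchar(64), actual_area varchar(64),
    orientation varchar(64), decoration varchar(64), floor varchar(64),
    elevator varchar(32), property varchar(64), house_years varchar(64),
    mortgage varchar(255), purposes varchar(64), release_date varchar(64),
    image_urls text, from_url varchar(255),
    unique key uq_from_url (from_url)
) default charset=utf8;
"""

conn = pymysql.connect(host='127.0.0.1', port=3306, user='root',
                       passwd='123456', db='mycrawler', charset='utf8')
try:
    with conn.cursor() as cursor:
        cursor.execute(ddl)
    conn.commit()
finally:
    conn.close()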
settings file
# -*- coding: utf-8 -*-
# Scrapy settings for lianjia project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://doc.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'lianjia'
SPIDER_MODULES = ['lianjia.spiders']
NEWSPIDER_MODULE = 'lianjia.spiders'
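# MySQL connection info and the image save directory; the pipelines read these via crawler.settings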
HOST = '127.0.0.1'
PORT = 3306
USER = 'root'
PASSWD = '123456'
DB = 'mycrawler'
CHARSET = 'UTF8'
TABLE = 'lianjia'
IMAGES_STORE = 'C:/Users/wang/Desktop/lianjia/lianjia/images'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'lianjia (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'zh-CN,zh;q=0.9',
'Cache-Control': 'max-age=0',
'Connection': 'keep-alive',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36'
}
# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'lianjia.middlewares.LianjiaSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'lianjia.middlewares.LianjiaDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
'lianjia.pipelines.LianjiaPipeline': 300,
'lianjia.pipelines.ImageDownloadPipeline': 400,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
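For reference, Scrapy's built-in ImagesPipeline could replace the hand-rolled ImageDownloadPipeline above. A minimal sketch of the settings change: it needs Pillow installed, reads the image_urls field and IMAGES_STORE on its own, and would have to run before LianjiaPipeline joins image_urls into a string (an images field on the item is also customary for the download results):

ITEM_PIPELINES = {
    'scrapy.pipelines.images.ImagesPipeline': 200,  # before the MySQL pipeline, while image_urls is still a list
    'lianjia.pipelines.LianjiaPipeline': 300,
}
IMAGES_STORE = 'C:/Users/wang/Desktop/lianjia/lianjia/images'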
run file
# -*- coding: utf-8 -*-
from scrapy import cmdline
cmdline.execute("scrapy crawl lj_crawler".split())
Summary:
CrawlSpider with Rule and LinkExtractor handles following the detail pages and the pagination, the extracted fields are written to MySQL by a pipeline, and each listing's photos are saved to a local folder named after the listing id. The spider can also be started directly with scrapy crawl lj_crawler from the project root instead of using the run file.