Reference blog:
Installation:
Linux: pip3 install scrapy
Windows:
Shell debugging (install ipython via pip first):
scrapy shell "http://www.baidu.com"
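Inside the shell, selectors can be tested interactively against the fetched response, for example:

response.xpath('//title/text()').extract_first()   # page title via XPath
response.css('title::text').get()                  # same thing via a CSS selector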
Create a project:
scrapy startproject first_obj
Directory structure:
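For reference, this is roughly what scrapy startproject first_obj generates (middlewares.py only appears in newer Scrapy versions):

first_obj/
    scrapy.cfg            # deploy configuration
    first_obj/            # the project's Python package
        __init__.py
        items.py          # Item definitions
        middlewares.py    # spider / downloader middlewares
        pipelines.py      # item pipelines
        settings.py       # project settings
        spiders/          # spiders go here
            __init__.py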
Create a spider:
cd first_obj
scrapy genspider baidu baidu.com
Run a spider:
scrapy crawl baidu [--nolog] [-o baidu.json/baidu.csv]
Other commands:
scrapy list                          # list the spiders in the current project
scrapy fetch <url>                   # download a URL with the Scrapy downloader and print the result
scrapy view <url>                    # open a URL in the browser as Scrapy "sees" it
scrapy settings --get SETTING_NAME   # print the value of a setting
scrapy version                       # print the Scrapy version
scrapy bench                         # run a quick local benchmark
Configuration file:
settings.py:
ROBOTSTXT_OBEY: whether to obey the target site's robots.txt rules
CONCURRENT_REQUESTS: the maximum number of concurrent requests Scrapy performs
All setting names must be uppercase.
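A minimal sketch of how these look in settings.py (the values below are only illustrative):

ROBOTSTXT_OBEY = False      # ignore robots.txt
CONCURRENT_REQUESTS = 32    # Scrapy's default is 16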
Selector example (inside a spider's parse(self, response) method, extracting items and following pagination links):

from scrapy.selector import Selector
from scrapy.http import Request

hxs = Selector(response=response)
img_list = hxs.xpath("//div[@class='item']")
for item in img_list:
    title = item.xpath("./div[@class='news-content']/div[@class='part2']/@share-title").extract()[0]
    url = item.xpath("./div[@class='news-pic']/img/@original").extract_first().strip('//')

page_list = hxs.xpath(r'//a[re:test(@href, "/all/hot/recent/\d+")]/@href').extract()
for page in page_list:
    # the hrefs are relative, so build an absolute URL before requesting
    yield Request(url=response.urljoin(page), callback=self.parse)
Full spider example (spiders/chouti.py):

import scrapy
from scrapy.selector import Selector
from scrapy.http import Request
from ..items import ChouTiItem


class ChoutiSpider(scrapy.Spider):
    name = 'chouti'
    allowed_domains = ['dig.chouti.com']
    start_urls = ['https://dig.chouti.com/']

    def parse(self, response):
        hxs = Selector(response=response)
        img_list = hxs.xpath("//div[@class='item']")
        for item in img_list:
            title = item.xpath("./div[@class='news-content']/div[@class='part2']/@share-title").extract()[0]
            url = item.xpath("./div[@class='news-pic']/img/@original").extract_first().strip('//')
            obj = ChouTiItem(title=title, url=url)
            yield obj
Item definition (items.py):

import scrapy


class ChouTiItem(scrapy.Item):
    title = scrapy.Field()
    url = scrapy.Field()
Item pipelines (pipelines.py): a custom pipeline only needs to implement process_item:

from scrapy.exceptions import DropItem


class SavePipeline(object):
    def __init__(self, v):
        self.value = v
        self.file = open('chouti.txt', 'a+')

    def process_item(self, item, spider):
        # do the work / persist the item here
        # returning the item means later pipelines keep processing it
        self.file.write(str(item) + '\n')
        return item
        # raise DropItem() instead to discard the item,
        # so that no later pipeline processes it

    @classmethod
    def from_crawler(cls, crawler):
        """
        Called on initialization to create the pipeline object.
        :param crawler:
        :return:
        """
        val = crawler.settings.get('SIX')
        return cls(val)

    def open_spider(self, spider):
        """
        Called when the spider starts.
        :param spider:
        :return:
        """
        print('spider opened')

    def close_spider(self, spider):
        """
        Called when the spider closes.
        :param spider:
        :return:
        """
        print('spider closed')
# The integer assigned to each pipeline determines the order in which they run:
# items pass through pipelines from the lowest number to the highest, and the
# numbers are conventionally kept in the 0-1000 range.
# Once raise DropItem() is hit, the item is not passed to any later pipeline.
ITEM_PIPELINES = {
    'fone.pipelines.SavePipeline': 300,
}
If several spiders share one pipeline, branch on spider.name inside process_item:

def process_item(self, item, spider):
    if spider.name == 'chouti':
        pass
URL deduplication: Scrapy's default dupe filter is configured in settings.py:

DUPEFILTER_CLASS = 'scrapy.dupefilters.RFPDupeFilter'
DUPEFILTER_DEBUG = False
# Directory where the visited-request fingerprints are stored, e.g. /root/ ; the final path is /root/requests.seen
JOBDIR = ""
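To keep the fingerprint file (and the scheduler queue) between runs, the job directory can also be passed on the command line; the job_info/chouti path below is only illustrative:

scrapy crawl chouti -s JOBDIR=job_info/chouti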
A custom dupe filter only needs to implement the methods below. 1) Define the filter class (e.g. in fone/rfp.py):

class RepeatUrl:
    def __init__(self):
        self.visited_url = set()

    @classmethod
    def from_settings(cls, settings):
        """
        Called on initialization.
        :param settings:
        :return:
        """
        return cls()

    def request_seen(self, request):
        """
        Check whether the current request has already been seen.
        :param request:
        :return: True if it has been visited; False if not
        """
        if request.url in self.visited_url:
            return True
        self.visited_url.add(request.url)
        return False

    def open(self):
        """
        Called when crawling starts.
        :return:
        """
        print('open replication')

    def close(self, reason):
        """
        Called when crawling ends.
        :param reason:
        :return:
        """
        print('close replication')

    def log(self, request, spider):
        """
        Log a duplicate request.
        :param request:
        :param spider:
        :return:
        """
        print('repeat', request.url)
2) Point settings.py at it:
DUPEFILTER_CLASS = 'fone.rfp.RepeatUrl'
Custom extensions: register callbacks on Scrapy signals (extensions.py):

from scrapy import signals


class MyExtension(object):
    def __init__(self, value):
        self.value = value

    @classmethod
    def from_crawler(cls, crawler):
        val = crawler.settings.getint('SIX')
        ext = cls(val)
        crawler.signals.connect(ext.spider_opened, signal=signals.spider_opened)
        crawler.signals.connect(ext.spider_closed, signal=signals.spider_closed)
        return ext

    def spider_opened(self, spider):
        print('open')

    def spider_closed(self, spider):
        print('close')

Enable it in settings.py:
EXTENSIONS = {
    'fone.extensions.MyExtension': 100,
}
Spider middleware (SpiderMiddleware): the class methods, and the order Scrapy calls them in, are shown in the sketch below.
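A minimal sketch of the standard SpiderMiddleware hooks (the class name SpiderMiddleware1 and the module path are illustrative):

class SpiderMiddleware1(object):
    def process_spider_input(self, response, spider):
        """
        Called for each response passing through the spider middleware,
        before it reaches the spider. Should return None or raise an exception.
        """
        return None

    def process_spider_output(self, response, result, spider):
        """
        Called with the results the spider returns after processing the response.
        Must return an iterable of Request and/or Item objects.
        """
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        """
        Called when the spider or process_spider_input() raises an exception.
        Should return None or an iterable of Request/Item objects.
        """
        pass

    def process_start_requests(self, start_requests, spider):
        """
        Called with the spider's start requests; works like
        process_spider_output but has no associated response.
        """
        for r in start_requests:
            yield r

Enable it in settings.py:
SPIDER_MIDDLEWARES = {
    'fone.middlewares.SpiderMiddleware1': 100,
}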
Downloader middleware (DownloaderMiddleware), example code and notes. Each hook may return None, a Request object, or a Response object, with the effects described in the docstrings below:

class DownMiddleware1(object):
    def process_request(self, request, spider):
        """
        Called for every request that needs to be downloaded, passing through
        each downloader middleware's process_request.
        :param request:
        :param spider:
        :return:
            None: continue with the following middlewares and download as usual
            Response object: stop calling process_request and start calling process_response
            Request object: stop the middleware chain and send the Request back to the scheduler
            raise IgnoreRequest: stop calling process_request and start calling process_exception
        """
        pass

    def process_response(self, request, response, spider):
        """
        Called with the response on its way back from the downloader.
        :param request:
        :param response:
        :param spider:
        :return:
            Response object: handed on to the other middlewares' process_response
            Request object: the middleware chain stops and the request is rescheduled for download
            raise IgnoreRequest: Request.errback is called
        """
        print('response1')
        return response

    def process_exception(self, request, exception, spider):
        """
        Called when the download handler or a downloader middleware's
        process_request() raises an exception.
        :param request:
        :param exception:
        :param spider:
        :return:
            None: keep passing the exception to the following middlewares
            Response object: stop calling the remaining process_exception methods
            Request object: the middleware chain stops and the request is rescheduled for download
        """
        return None
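To enable the downloader middleware above, register it in settings.py; the fone.middlewares module path below is an assumption following this project's naming:

DOWNLOADER_MIDDLEWARES = {
    'fone.middlewares.DownMiddleware1': 543,
}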
Custom command: e.g. a crawlall command that runs every spider in the project:

from scrapy.commands import ScrapyCommand


class Command(ScrapyCommand):
    requires_project = True

    def syntax(self):
        return '[options]'

    def short_desc(self):
        return 'Runs all of the spiders'

    def run(self, args, opts):
        spider_list = self.crawler_process.spider_loader.list()
        for name in spider_list:
            self.crawler_process.crawl(name, **opts.__dict__)
        self.crawler_process.start()
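For Scrapy to pick up the custom command, it has to live in a commands package declared via COMMANDS_MODULE; a sketch assuming the file above is saved as fone/commands/crawlall.py (the fone/commands path is an assumption):

# fone/commands/__init__.py must exist (it can be empty)

# settings.py
COMMANDS_MODULE = 'fone.commands'

All spiders can then be run with:
scrapy crawlall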