1. wangyi.py
import scrapy
from selenium import webdriver
'''
在scrapy中使用selenium的编码流程:
1.在spider的构造方法中创建一个浏览器对象(作为当前spider的一个属性)
2.重写spider的一个方法closed(self,spider),在该方法中执行浏览器关闭的操作
3.在下载中间件的process_response方法中,通过spider参数获取浏览器对象
4.在中间件的process_response中定制基于浏览器自动化的操作代码(获取动态加载出来的页面源码数据)
5.实例化一个响应对象,且将page_source返回的页面源码封装到该对象中
6.返回该新的响应对象
'''
class TestSpider(scrapy.Spider):
name = 'wangyi'
start_urls = ['http://war.163.com/']
def __init__(self):
self.browser = webdriver.Chrome()
def parse(self, response):
div_list = response.xpath('//div[@class="data_row news_article clearfix "]')
for div in div_list:
title = div.xpath('.//div[@class="news_title"]/h3/a/text()').extract_first()
print(title)
def closed(self, spider):
print('bro has been closed')
self.browser.quit()
2. middlewares.py
from scrapy import signals
from scrapy.http import HtmlResponse
from time import sleep
class WangyiproDownloaderMiddleware(object):
@classmethod
def from_crawler(cls, crawler):
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def process_request(self, request, spider):
return None
def process_response(self, request, response, spider):
print('即将返回一个新的响应对象!!!')
bro = spider.bro
bro.get(url=request.url)
sleep(3)
page_text = bro.page_source
sleep(3)
return HtmlResponse(url=spider.bro.current_url,body=page_text,encoding='utf-8',request=request)
def process_exception(self, request, exception, spider):
pass
def spider_opened(self, spider):
spider.logger.info('Spider opened: %s' % spider.name)
3. 在中间件判断是否需要使用selenium来爬取内容的版本
from scrapy.http import HtmlResponse
def process_response(self, request, response, spider):
if request.url in['http://news.163.com/domestic/','http://news.163.com/world/','http://news.163.com/air/','http://war.163.com/']:
spider.bro.get(url=request.url)
js = 'window.scrollTo(0,document.body.scrollHeight)'
spider.bro.execute_script(js)
time.sleep(2)
page_text = spider.bro.page_source
return HtmlResponse(url=spider.bro.current_url,body=page_text,encoding='utf-8',request=request)
else:
return response