1 Install the required dependencies
pip install selenium
pip install scrapy
pip install pymysql
PhantomJS download: http://phantomjs.org/download.html
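To confirm the environment works before writing any project code, the minimal sketch below drives PhantomJS through Selenium; the executable_path is a placeholder, point it at wherever you unpacked PhantomJS.
# Minimal environment check: render one page with PhantomJS via Selenium.
# executable_path is a placeholder for your own PhantomJS location.
from selenium import webdriver

browser = webdriver.PhantomJS(executable_path='./phantomjs/bin/phantomjs')
browser.get('https://www.bilibili.com/')
print(browser.title)  # page title after JavaScript has run
browser.quit()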
2 Create the Scrapy project
scrapy startproject Bilibili
cd Bilibili
scrapy genspider bilibilispider bilibili.com
Project layout:
scrapy.cfg    project configuration
items.py      data model (Item) definitions
pipelines.py  item processing and persistence
settings.py   project settings
spiders/      spider directory
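After genspider, the generated layout typically looks like this (the spider file name follows the genspider argument used above):
Bilibili/
    scrapy.cfg
    Bilibili/
        __init__.py
        items.py
        middlewares.py
        pipelines.py
        settings.py
        spiders/
            __init__.py
            bilibilispider.py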
3 Write the configuration file settings.py
Here we add a simple anti-blocking measure by choosing a random browser User-Agent, enable logging to a file, and lightly customize the default settings.
# -*- coding: utf-8 -*-
import random
BOT_NAME = 'Bilibili'
SPIDER_MODULES = ['Bilibili.spiders']
NEWSPIDER_MODULE = 'Bilibili.spiders'
# Database configuration
DBKWARGS={'db':'ABVideos','user':'root', 'passwd':'root',
'host':'localhost','use_unicode':True, 'charset':'utf8'}
# DOWNLOAD_HANDLERS = {'S3':None}
# Log file
LOG_FILE = "BVideosScrapy.log"
# Browser request headers. Many sites check the client's headers;
# Douban, for example, checks the User-Agent on every request and returns 403 otherwise. Enable this if needed.
DEFAULT_REQUEST_HEADERS = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Language': 'en',
}
# Downloader middleware settings
DOWNLOADER_MIDDLEWARES = {
'Bilibili.middlewares.UserAgentMiddleware': 400,
'Bilibili.middlewares.bilibiliDownloaderMiddleware': 401,
# uncomment the line below to route requests through Crawlera
# 'scrapy_crawlera.CrawleraMiddleware': 600
}
# Crawlera settings (these only take effect when the CrawleraMiddleware above is enabled)
CRAWLERA_PRESERVE_DELAY = True
CRAWLERA_ENABLED = True
CRAWLERA_USER = 'a357e516e11440a9846f32067f6d9cb6'
CRAWLERA_PASS = ''
# Browser User-Agent pool; a User-Agent must be set
USER_AGENT_LIST=[
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
"Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
"Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1" \
"Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11", \
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6", \
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6", \
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1", \
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5", \
"Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5", \
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3", \
"Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3", \
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3", \
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3", \
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3", \
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3", \
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3", \
"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3", \
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3", \
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24", \
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
]
ua = random.choice(USER_AGENT_LIST)
if ua:
    USER_AGENT = ua
else:
    USER_AGENT = "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
print("User-Agent setting: " + USER_AGENT)
# Whether to obey the robots.txt protocol
ROBOTSTXT_OBEY = False
# Maximum number of concurrent requests
CONCURRENT_REQUESTS = 4
# Download delay in seconds: how long the downloader waits before fetching the next page from the same site.
# Use it to throttle the crawl and reduce load on the target server; fractional values are supported.
# It interacts with RANDOMIZE_DOWNLOAD_DELAY (enabled by default): instead of a fixed wait, Scrapy waits
# a random value between 0.5 * DOWNLOAD_DELAY and 1.5 * DOWNLOAD_DELAY between requests.
# DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
CONCURRENT_REQUESTS_PER_DOMAIN = 8
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
# Cookie switch; disabling cookies is recommended
COOKIES_ENABLED = False
ITEM_PIPELINES = {
'Bilibili.pipelines.bilibiliPipeline': 300,
}
# Initial download delay used by AutoThrottle, in seconds
# (AutoThrottle settings only take effect when AUTOTHROTTLE_ENABLED = True)
AUTOTHROTTLE_START_DELAY = 5
DOWNLOAD_TIMEOUT = 600
# Maximum delay AutoThrottle may apply under high latency
AUTOTHROTTLE_MAX_DELAY = 180
# Whether to enable the local HTTP cache; when enabled, cached responses are served first, which speeds up repeated crawls.
# These two settings are worth enabling every time; turning them on by hand in every new project is tedious, so ideally they would be enabled automatically when the project is created.
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
# Maximum amount of memory (in MB) Scrapy may use before shutting down (only if MEMUSAGE_ENABLED is True); 0 means no limit.
MEMUSAGE_LIMIT_MB = 8168
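To double-check that these values are actually picked up, Scrapy can print any effective setting from inside the project directory, for example:
scrapy settings --get USER_AGENT
scrapy settings --get CONCURRENT_REQUESTS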
4 Write items.py
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class bilibiliItem(scrapy.Item):
    # rank number
    vnum = scrapy.Field()
    # title
    vtitle = scrapy.Field()
    # video ID
    vid = scrapy.Field()
    # cover image URL
    vpic = scrapy.Field()
    # overall score
    vpts = scrapy.Field()
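An Item is filled and read like a dictionary; a quick sketch using the fields defined above (the values are placeholders, not real data):
# Quick sketch: scrapy.Item instances behave like dicts.
from Bilibili.items import bilibiliItem

item = bilibiliItem()
item['vnum'] = '1'             # placeholder value
item['vtitle'] = 'some title'  # placeholder value
print(dict(item))              # {'vnum': '1', 'vtitle': 'some title'}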
5 Write the middleware (middlewares.py)
Here we call PhantomJS: the page is loaded and rendered in PhantomJS first, and only the finished HTML is handed back to Scrapy.
# -*- coding: utf-8 -*-
# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
import random
from selenium import webdriver
from scrapy import signals
from scrapy.http import HtmlResponse
class bilibiliDownloaderMiddleware(object):
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the downloader middleware does not modify the
# passed objects.
@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def process_request(self, request, spider):
        # load the PhantomJS driver
        browser = webdriver.PhantomJS(executable_path='D:\\Workspaces\\python\\crawler\\phantomjs-2.1.1-windows\\phantomjs-2.1.1-windows\\bin\\phantomjs.exe')
        browser.get(request.url)  # load the page
        data = browser.page_source  # grab the rendered HTML
        data = data.encode('utf-8')
        browser.quit()
        return HtmlResponse(request.url, body=data, encoding='utf-8', request=request)
def process_response(self, request, response, spider):
# Called with the response returned from the downloader.
# Must either;
# - return a Response object
# - return a Request object
# - or raise IgnoreRequest
return response
def process_exception(self, request, exception, spider):
# Called when a download handler or a process_request()
# (from other downloader middleware) raises an exception.
# Must either:
# - return None: continue processing this exception
# - return a Response object: stops process_exception() chain
# - return a Request object: stops process_exception() chain
pass
def spider_opened(self, spider):
spider.logger.info('Spider opened: %s' % spider.name)
class UserAgentMiddleware(object):
def __init__(self, user_agent=''):
self.user_agent = user_agent
    def process_request(self, request, spider):
        ua = random.choice(self.user_agent_list)
        if ua:
            # show the User-Agent being used for this request
            print("********Current UserAgent:%s************" % ua)
            # record it on the request headers
            request.headers.setdefault('User-Agent', ua)
    user_agent_list = [
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 "
"(KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
"Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 "
"(KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 "
"(KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 "
"(KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 "
"(KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 "
"(KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
"Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 "
"(KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
"(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 "
"(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 "
"(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
"(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
"(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
"(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
"(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 "
"(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
"(KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 "
"(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 "
"(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
]
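Note: PhantomJS is no longer maintained and recent Selenium releases have removed webdriver.PhantomJS. If the middleware above fails for that reason, a headless-Chrome variant of process_request is a possible substitute; this is only a sketch and assumes a chromedriver matching your Chrome version is on PATH.
# Sketch: the same idea as bilibiliDownloaderMiddleware, but with headless Chrome.
from selenium import webdriver
from scrapy.http import HtmlResponse

class ChromeDownloaderMiddleware(object):
    def process_request(self, request, spider):
        options = webdriver.ChromeOptions()
        options.add_argument('--headless')
        browser = webdriver.Chrome(options=options)
        try:
            browser.get(request.url)    # load and render the page
            data = browser.page_source  # grab the rendered HTML
        finally:
            browser.quit()
        return HtmlResponse(request.url, body=data, encoding='utf-8', request=request)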
6 Write the spider (spiders/bilibilispider.py)
We use XPath to pull out the information we need; for an introduction to XPath syntax see: http://www.w3school.com.cn/xpath/index.asp
# -*- coding: utf-8 -*-
import json
import scrapy
from Bilibili.items import bilibiliItem
class BilibilispiderSpider(scrapy.Spider):
name = 'Bilibili'
allowed_domains = ['bilibili.com']
start_urls = ['https://www.bilibili.com/ranking#!/all/0/0/7/']
    def parse(self, response):
        items = []
        ul = response.xpath('//*[@id="rank_list"]')
        if not ul:
            # extraction failed, write a log entry
            self.log("----------------------%s" % response.url)
        else:
            # extraction succeeded, write a log entry
            self.log("++++++++++++++++++++++%s" % response.url)
            # get the list of li elements under the ranking ul
            lis = ul[0].xpath('li')
            for bilibili in lis:
                bilibi_item = bilibiliItem()
                try:
                    # rank number
                    s_vnum = bilibili.xpath('div/div[1]/text()').extract()[0]
                    bilibi_item['vnum'] = s_vnum
                    # title
                    s_title = bilibili.xpath('div/div[2]/div/a/div/text()').extract()[0]
                    bilibi_item['vtitle'] = s_title
                    # video ID
                    s_id = bilibili.xpath('div/div[2]/i/@aid').extract()[0]
                    bilibi_item['vid'] = s_id
                    # cover image URL
                    s_pic = bilibili.xpath('div/div[2]/a/div/div/img/@data-img').extract()[0]
                    bilibi_item['vpic'] = s_pic
                    # overall score
                    s_pts = bilibili.xpath('div/div[2]/div/div[2]/div/text()').extract()[0]
                    bilibi_item['vpts'] = s_pts
                    # print(bilibi_item)
                except IndexError as e:
                    # if a field is missing and extraction fails, log it
                    self.log("!!!!!!!!!!!" + str(e))
                items.append(bilibi_item)
        return items
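To try an XPath expression in isolation before running the whole spider, scrapy.Selector can be fed a hand-written HTML snippet; the markup below is illustrative only, not Bilibili's real page structure.
# Illustrative only: exercising an XPath expression against made-up HTML.
from scrapy import Selector

html = '<ul id="rank_list"><li><div><div class="num">1</div></div></li></ul>'
sel = Selector(text=html)
print(sel.xpath('//*[@id="rank_list"]/li/div/div[1]/text()').extract())  # ['1']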
7 Write the pipeline (pipelines.py)
The pipeline writes each scraped item into MySQL using the DBKWARGS defined in settings.py.
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import pymysql
class bilibiliPipeline(object):
    def process_item(self, item, spider):
        DBKWARGS = spider.settings.get('DBKWARGS')
        con = pymysql.connect(**DBKWARGS)
        cur = con.cursor()
        sql = ("insert into "
               "T_test (vnum, vtitle, vid, vpic, vpts, inTime) "
               "values (%s, %s, %s, %s, %s, now())")
        lis = (item['vnum'],
               item['vtitle'], item['vid'], item['vpic'], item['vpts'])
        try:
            cur.execute(sql, lis)
        except Exception as e:
            print("Insert error:", e)
            con.rollback()
        else:
            con.commit()
        finally:
            # close the cursor and connection whether or not the insert succeeded
            cur.close()
            con.close()
        return item
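The pipeline assumes a T_test table already exists in the ABVideos database. A one-off helper to create it is sketched below; the column types are assumptions and should be adjusted to the real data:
# One-off helper that creates the T_test table written to by bilibiliPipeline.
# Column names follow the INSERT above; the types are assumptions, adjust as needed.
import pymysql

DDL = '''
CREATE TABLE IF NOT EXISTS T_test (
    id     INT AUTO_INCREMENT PRIMARY KEY,
    vnum   VARCHAR(16),
    vtitle VARCHAR(255),
    vid    VARCHAR(32),
    vpic   VARCHAR(512),
    vpts   VARCHAR(32),
    inTime DATETIME
)
'''

con = pymysql.connect(db='ABVideos', user='root', passwd='root',
                      host='localhost', charset='utf8')
try:
    with con.cursor() as cur:
        cur.execute(DDL)
    con.commit()
finally:
    con.close()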
8 Run the spider:
scrapy crawl Bilibili
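The spider can also be launched from a plain Python script instead of the CLI; a sketch using Scrapy's CrawlerProcess (run it from the project root so the settings module resolves):
# Alternative to "scrapy crawl": start the spider from a Python script.
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())
process.crawl('Bilibili')  # spider name as defined in the spider class
process.start()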
9 The results are as follows: