Python3 + Selenium + PhantomJS + Scrapy: crawling Bilibili's Ajax-rendered ranking page

1 Install the required dependencies

pip install selenium
pip install scrapy
pip install pymysql

PhantomJS download: http://phantomjs.org/download.html
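
After unpacking PhantomJS, it is worth confirming that Selenium can actually drive it before wiring it into Scrapy. A minimal sanity check (the path below is a placeholder; point executable_path at your own phantomjs binary, or omit the argument if phantomjs is on your PATH):

from selenium import webdriver

# Placeholder path: replace with the location of your phantomjs / phantomjs.exe binary
browser = webdriver.PhantomJS(executable_path='/path/to/phantomjs')
browser.get('https://www.bilibili.com/ranking')
print(browser.title)  # prints the page title once PhantomJS has rendered it
browser.quit()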


2 Create the Scrapy project

scrapy startproject Bilibili
cd Bilibili
scrapy genspider bilibili bilibili.com
 Directory layout:

        scrapy.cfg    project configuration
        items.py      Item definitions (data models)
        pipelines.py  data processing and persistence
        settings.py   project settings
        spiders/      spider directory


3 Edit the settings file (settings.py)

Here we add a simple anti-blocking measure (a randomly chosen User-Agent), enable logging to a file, and make a few small adjustments to the default settings.

# -*- coding: utf-8 -*-

import random
BOT_NAME = 'Bilibili'
SPIDER_MODULES = ['Bilibili.spiders']
NEWSPIDER_MODULE = 'Bilibili.spiders'

# Database connection settings (read by the pipeline)
DBKWARGS = {'db': 'ABVideos', 'user': 'root', 'passwd': 'root',
            'host': 'localhost', 'use_unicode': True, 'charset': 'utf8'}

# DOWNLOAD_HANDLERS = {'s3': None}
# Log file
LOG_FILE = "BVideosScrapy.log"

# Default request headers. Many sites check the client's headers;
# Douban, for example, checks the User-Agent on every request and returns 403 otherwise.
DEFAULT_REQUEST_HEADERS = {
 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
 'Accept-Language': 'en',
}
# Downloader middleware settings
DOWNLOADER_MIDDLEWARES = {
    'Bilibili.middlewares.UserAgentMiddleware': 400,
    'Bilibili.middlewares.bilibiliDownloaderMiddleware': 401,
    # Optional proxy middleware
    # 'scrapy_crawlera.CrawleraMiddleware': 600,
}
# Crawlera settings (only relevant if the CrawleraMiddleware above is enabled)
CRAWLERA_PRESERVE_DELAY = True
CRAWLERA_ENABLED = True
CRAWLERA_USER = 'a357e516e11440a9846f32067f6d9cb6'
CRAWLERA_PASS = ''

# User-Agent pool; a User-Agent header is required
USER_AGENT_LIST=[
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
    "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1" \
    "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11", \
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6", \
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6", \
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1", \
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5", \
    "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5", \
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3", \
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3", \
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3", \
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3", \
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3", \
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3", \
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3", \
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3", \
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3", \
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24", \
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"

]
ua = random.choice(USER_AGENT_LIST)
if ua:
    USER_AGENT = ua
else:
    USER_AGENT = "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
print("User-Agent setting: " + USER_AGENT)

# Whether to obey robots.txt
ROBOTSTXT_OBEY = False

# Maximum number of concurrent requests handled by the downloader
CONCURRENT_REQUESTS = 4

# Download delay in seconds: how long the downloader waits before fetching the next page from
# the same site. Use it to throttle the crawl and go easy on the server; fractions are allowed.
# It interacts with RANDOMIZE_DOWNLOAD_DELAY (enabled by default): instead of a fixed interval,
# Scrapy waits a random value between 0.5 * DOWNLOAD_DELAY and 1.5 * DOWNLOAD_DELAY.
# DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
CONCURRENT_REQUESTS_PER_DOMAIN = 8
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default); recommended for this crawl
COOKIES_ENABLED = False

ITEM_PIPELINES = {
     'Bilibili.pipelines.bilibiliPipeline': 300,
 }

# AutoThrottle: initial download delay (only takes effect if AUTOTHROTTLE_ENABLED = True)
AUTOTHROTTLE_START_DELAY = 5
DOWNLOAD_TIMEOUT = 600
# AutoThrottle: maximum download delay under high latency
AUTOTHROTTLE_MAX_DELAY = 180

# HTTP cache: if enabled, responses are served from the local cache first, which speeds up
# repeated runs during development.
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

# Maximum memory (in MB) allowed before Scrapy shuts itself down (only if MEMUSAGE_ENABLED
# is True); 0 means no limit.
MEMUSAGE_LIMIT_MB = 8168


4 Define the items (items.py)

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy

class bilibiliItem(scrapy.Item):
    # ranking number
    vnum = scrapy.Field()
    # title
    vtitle = scrapy.Field()
    # video ID
    vid = scrapy.Field()
    # cover image URL
    vpic = scrapy.Field()
    # overall score
    vpts = scrapy.Field()

5 Write the middlewares (middlewares.py)

Here we plug in PhantomJS: the downloader middleware has PhantomJS load the page (so the Ajax content gets rendered) and then hands the rendered HTML back to Scrapy.

# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
import random
from selenium import webdriver
from scrapy import signals
from scrapy.http import HtmlResponse


class bilibiliDownloaderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Launch the PhantomJS driver (adjust executable_path to your local install)
        browser = webdriver.PhantomJS(executable_path='D:\\Workspaces\\python\\crawler\\phantomjs-2.1.1-windows\\phantomjs-2.1.1-windows\\bin\\phantomjs.exe')
        browser.get(request.url)  # load the page; PhantomJS executes the Ajax requests
        data = browser.page_source  # grab the rendered HTML
        data = data.encode('utf-8')
        browser.quit()
        # Hand the rendered page back to Scrapy instead of letting the default downloader fetch it
        return HtmlResponse(request.url, body=data, encoding='utf-8', request=request)

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.

        # Must either;
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class UserAgentMiddleware(object):
    def __init__(self, user_agent=''):
        self.user_agent = user_agent

    def process_request(self, request, spider):
        ua = random.choice(self.user_agent_list)
        if ua:
            # show the User-Agent chosen for this request
            print("********Current UserAgent: %s************" % ua)

            # record it in the request headers
            request.headers.setdefault('User-Agent', ua)

    user_agent_list = [ \
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 "
        "(KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
        "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 "
        "(KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 "
        "(KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 "
        "(KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 "
        "(KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 "
        "(KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 "
        "(KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 "
        "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 "
        "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
    ]
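
Note that process_request above starts a new PhantomJS process for every single request, which is slow. A common optimization (a sketch, not part of the original code; the class name is illustrative and it assumes phantomjs is on your PATH) is to create one browser when the middleware is built and shut it down when the spider closes:

# Sketch: reuse a single PhantomJS instance for the whole crawl
from selenium import webdriver
from scrapy import signals
from scrapy.http import HtmlResponse


class SinglePhantomJSMiddleware(object):
    def __init__(self):
        # one browser process shared by every request
        self.browser = webdriver.PhantomJS()

    @classmethod
    def from_crawler(cls, crawler):
        s = cls()
        crawler.signals.connect(s.spider_closed, signal=signals.spider_closed)
        return s

    def process_request(self, request, spider):
        self.browser.get(request.url)
        body = self.browser.page_source.encode('utf-8')
        return HtmlResponse(request.url, body=body, encoding='utf-8', request=request)

    def spider_closed(self, spider):
        # shut PhantomJS down when the crawl ends
        self.browser.quit()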

6 Write the spider (bilibili.py)

We use XPath to extract the fields we need; for details on XPath syntax see: http://www.w3school.com.cn/xpath/index.asp


# -*- coding: utf-8 -*-
import scrapy

from Bilibili.items import bilibiliItem

class BilibilispiderSpider(scrapy.Spider):
    name = 'Bilibili'
    allowed_domains = ['bilibili.com']
    start_urls = ['https://www.bilibili.com/ranking#!/all/0/0/7/']
    def parse(self, response):
        ul = response.xpath('//*[@id="rank_list"]')
        if not ul:
            # Extraction failed; log the URL
            self.log("----------------------%s" % response.url)
        else:
            # Extraction succeeded; log the URL
            self.log("++++++++++++++++++++++%s" % response.url)
            # Collect the <li> nodes under the ranking list
            lis = ul[0].xpath('li')
            items = []
            for bilibili in lis:
                bilibi_item = bilibiliItem()
                try:
                    # ranking number
                    s_vnum = bilibili.xpath('div/div[1]/text()').extract()[0]
                    bilibi_item['vnum'] = s_vnum
                    # title
                    s_title = bilibili.xpath('div/div[2]/div/a/div/text()').extract()[0]
                    bilibi_item['vtitle'] = s_title
                    # video ID
                    s_id = bilibili.xpath('div/div[2]/i/@aid').extract()[0]
                    bilibi_item['vid'] = s_id
                    # cover image URL
                    s_pic = bilibili.xpath('div/div[2]/a/div/div/img/@data-img').extract()
                    bilibi_item['vpic'] = s_pic
                    # overall score
                    s_pts = bilibili.xpath('div/div[2]/div/div[2]/div/text()').extract()[0]
                    bilibi_item['vpts'] = s_pts
                    # print(bilibi_item)
                except IndexError as e:
                    # If a field is missing, log the error (the partially filled item is still appended)
                    self.log("!!!!!!!!!!!" + str(e))
                items.append(bilibi_item)

            return items
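
To check the XPath expressions without running the whole crawl, one option (a sketch using the same selectors as the spider, against the PhantomJS-rendered HTML) is:

# Sketch: test the XPath expressions against PhantomJS-rendered HTML
from selenium import webdriver
from scrapy.selector import Selector

browser = webdriver.PhantomJS()  # assumes phantomjs is on your PATH
browser.get('https://www.bilibili.com/ranking#!/all/0/0/7/')
html = browser.page_source
browser.quit()

sel = Selector(text=html)
for li in sel.xpath('//*[@id="rank_list"]/li'):
    print(li.xpath('div/div[1]/text()').extract_first(),
          li.xpath('div/div[2]/div/a/div/text()').extract_first())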

7 Write the pipeline (pipelines.py) to store the data in MySQL. The target table must exist beforehand (a setup sketch follows the code below), and the connection parameters (DBKWARGS) come from settings.py.

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import pymysql

class bilibiliPipeline(object):
    def process_item(self, item, spider):
        # Opens a fresh connection per item: simple, and acceptable for a short ranking list
        DBKWARGS = spider.settings.get('DBKWARGS')
        con = pymysql.connect(**DBKWARGS)
        cur = con.cursor()
        sql = "insert into " \
              "T_test (vnum, vtitle, vid, vpic, vpts, inTime) " \
              "VALUES (%s, %s, %s, %s, %s, now())"
        lis = (item['vnum'],
               item['vtitle'],item['vid'],item['vpic'],item['vpts'])
        try:
            cur.execute(sql,lis)
        except Exception as e:
            print ("Insert error:", e)
            con.rollback()
        else:
            con.commit()
        cur.close()
        con.close()
        return item
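
The pipeline inserts into a table called T_test, which has to exist before the crawl runs. A possible one-off setup script (the column names match the insert statement above; the column types and lengths are assumptions):

# Sketch: create the T_test table used by the pipeline (types/lengths are assumptions)
import pymysql

con = pymysql.connect(db='ABVideos', user='root', passwd='root',
                      host='localhost', charset='utf8')
cur = con.cursor()
cur.execute("""
    CREATE TABLE IF NOT EXISTS T_test (
        vnum   VARCHAR(16),
        vtitle VARCHAR(255),
        vid    VARCHAR(32),
        vpic   VARCHAR(512),
        vpts   VARCHAR(32),
        inTime DATETIME
    ) DEFAULT CHARSET=utf8
""")
con.commit()
cur.close()
con.close()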
8 Run the spider (from the project directory):

scrapy crawl Bilibili
9 Results:

(Figure 1: screenshot of the crawl output.)

