手撸爬虫爬取爱奇艺视频信息(1)

# 爬取爱奇艺华语院线电影和美国院线电影
import random
import time
import urllib.request
import urllib.response
import pymysql
from lxml import etree
import schedule
from io import BytesIO
import requests

# Pool of legacy desktop browser User-Agent strings.
# NOTE(review): nothing in the visible script reads this list; presumably it is
# meant for picking a random User-Agent per request -- confirm before removing.
useragent = [
    # Internet Explorer 6-9
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)",
    "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)",
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)",
    # Firefox 4 (macOS / Windows)
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
    "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
    # Opera 11 (macOS / Windows)
    "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11",
    "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
    # MSIE 7.0 on Windows NT 5.1, with and without browser-shell suffixes
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)",
]

# Request headers sent with every page of the Chinese-language listing.
# NOTE(review): "method", "authority", "path" and "scheme" look like HTTP/2
# pseudo-headers copied from browser dev tools without their leading colon;
# here they are sent as ordinary header fields, and "path" always names page 1.
# Confirm whether the site needs them at all.
headers1 = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/75.0.3770.80 Safari/537.36"
    ),
    "method": "GET",
    "authority": "list.iqiyi.com",
    "path": "/www/1/1-27815------------11-1-1-iqiyi--.html",
    "scheme": "https",
    "accept": ",".join([
        "text/html",
        "application/xhtml+xml",
        "application/xml;q=0.9",
        "image/webp",
        "image/apng",
        "*/*;q=0.8",
        "application/signed-exchange;v=b3",
    ]),
    "accept-language": "zh-CN,zh;q=0.9",
    "cache-control": "max-age=0",
    "upgrade-insecure-requests": "1",
    # Hard-coded browser session cookie, one name=value pair per entry.
    "cookie": "; ".join([
        "QC005=b4b984320692d3b4314ab4e223adcd2d",
        "QC173=0",
        "Hm_lvt_53b7374a63c37483e5dd97d78d9bb36e=1585879290",
        "QC175=%7B%22upd%22%3Atrue%2C%22ct%22%3A%22%22%7D",
        "QC007=DIRECT",
        "QC006=cxi755tl8psf5cdjn3675ae7",
        "QC008=1585886016.1585886016.1585886016.1",
        "nu=0",
        "T00404=a8a63ad6d7b7e4dceba700e91b9dbbea",
        "Hm_lpvt_53b7374a63c37483e5dd97d78d9bb36e=1585887255",
        "QC010=70960532",
        "IMS=IggQABj_up30BSokCiAxY2VkMzc1ZDE5OWJhMGFjZjc1NjBiZDJmODY5MzdiZBAAciQKIDFjZWQzNzVkMTk5YmEwYWNmNzU2MGJkMmY4NjkzN2JkEAA",
        "__dfp=a1c2f37ae6eb3d4d62ac566e461536f7169314d9d24b23dbb85ca81e8138c8394d@1587175290494@1585879291494",
    ]),
}
# Request headers sent with every page of the USA listing; identical to the
# Chinese-listing headers apart from the session cookie.
# NOTE(review): "method", "authority", "path" and "scheme" look like HTTP/2
# pseudo-headers copied from browser dev tools without their leading colon;
# "path" still names the Chinese listing's page 1. Confirm whether they are
# needed at all.
headers2 = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/75.0.3770.80 Safari/537.36"
    ),
    "method": "GET",
    "authority": "list.iqiyi.com",
    "path": "/www/1/1-27815------------11-1-1-iqiyi--.html",
    "scheme": "https",
    "accept": ",".join([
        "text/html",
        "application/xhtml+xml",
        "application/xml;q=0.9",
        "image/webp",
        "image/apng",
        "*/*;q=0.8",
        "application/signed-exchange;v=b3",
    ]),
    "accept-language": "zh-CN,zh;q=0.9",
    "cache-control": "max-age=0",
    "upgrade-insecure-requests": "1",
    # Hard-coded browser session cookie, one name=value pair per entry.
    "cookie": "; ".join([
        "QC005=b4b984320692d3b4314ab4e223adcd2d",
        "QC173=0",
        "Hm_lvt_53b7374a63c37483e5dd97d78d9bb36e=1585879290",
        "QC175=%7B%22upd%22%3Atrue%2C%22ct%22%3A%22%22%7D",
        "QC007=DIRECT",
        "QC006=cxi755tl8psf5cdjn3675ae7",
        "QC008=1585886016.1585886016.1585886016.1",
        "nu=0",
        "T00404=a8a63ad6d7b7e4dceba700e91b9dbbea",
        "IMS=IggQABj_up30BSokCiAyMjMwMzVkODNlODEzNzZlMTk0ODg1N2ZkNzFkZTZjYRAAciQKIDIyMzAzNWQ4M2U4MTM3NmUxOTQ4ODU3ZmQ3MWRlNmNhEAA",
        "QP001=1",
        "QP0013=",
        "QC010=33145914",
        "Hm_lpvt_53b7374a63c37483e5dd97d78d9bb36e=1585888116",
        "__dfp=a1c2f37ae6eb3d4d62ac566e461536f7169314d9d24b23dbb85ca81e8138c8394d@1587175290494@1585879291494",
    ]),
}

# 根目录 //ul[@class='qy-mod-ul']//li[@class='qy-mod-li']//div[@class='qy-list-img vertical']
# 取播放地址 //div[@class='list-content']//ul[@class='qy-mod-ul']//li[@class='qy-mod-li']//div[@class='qy-mod-link-wrap']/a/@href
# 取电影名称 //div[@class='list-content']//ul[@class='qy-mod-ul']//li[@class='qy-mod-li']//a[@class='link-txt']/@title
# 取电影评分 //div[@class='list-content']//ul[@class='qy-mod-ul']//li[@class='qy-mod-li']//p/span[@class='text-score']//text()
# 取电影海报 //div[@class='list-content']//ul[@class='qy-mod-ul']//li[@class='qy-mod-li']//a[@class='qy-mod-link']//div[@class='qy-mod-cover']/@style
print(
    "---华语华语华语华语华语华语华语华语华语华语华语华语华语华语华语华语华语华语华语华语华语华语----")

# Open the database connection.
# PyMySQL 1.0+ accepts connection parameters by keyword only, so the
# host / user / password / database placeholders are passed by name (same order
# as the positional call they replace).
db = pymysql.connect(
    host="*****",
    user="*****",
    password="*****",
    database="******",
    port=3306,
    charset='utf8',
)

# Cursor shared by every INSERT issued further down in the script.
cursor = db.cursor()

# Crawl the Chinese-language cinema listing and store one row per movie
# (name, score, play URL) in the iqiyiVideo table with flag "chinese".
# The page number fills the {} slot of the URL template; the listing has 19 pages.
chinese_url_template = "https://list.iqiyi.com/www/1/1-27815------------11-{}-1-iqiyi--.html"
chinese_item_xpath = "//div[@class='list-content']//ul[@class='qy-mod-ul']//li[@class='qy-mod-li']"
# Placeholders let the driver escape the scraped text: titles may contain
# quotes, which broke (and could inject into) the previously string-built SQL.
chinese_insert_sql = (
    "INSERT INTO iqiyiVideo (name, score, videourl, flag) "
    "VALUES (%s, %s, %s, %s)"
)

for page in range(1, 20):
    page_request = urllib.request.Request(
        url=chinese_url_template.format(page), headers=headers1)
    # Close the HTTP response as soon as the body has been read.
    with urllib.request.urlopen(page_request) as page_response:
        page_tree = etree.HTML(page_response.read().decode('utf-8'))

    for item in page_tree.xpath(chinese_item_xpath):
        hrefs = item.xpath(".//div[@class='qy-mod-link-wrap']/a/@href")  # play URL
        titles = item.xpath(".//a[@class='link-txt']/@title")  # movie name
        scores = item.xpath(".//p/span[@class='text-score']//text()")  # rating

        # Entries without a link or a title cannot be stored meaningfully;
        # skip them instead of crashing on an empty XPath result.
        if not hrefs or not titles:
            continue

        video_name = str(titles[0])
        video_url = "http:" + str(hrefs[0])
        # A missing rating is stored as NULL rather than aborting the crawl.
        video_score = str(scores[0]) if scores else None

        print(video_name)
        print(video_url)
        print(video_score)

        try:
            cursor.execute(
                chinese_insert_sql,
                (video_name, video_score, video_url, "chinese"))
            db.commit()
        except pymysql.MySQLError as err:
            # Report the failure and roll back, then continue with the next movie.
            print("insert failed for {!r}: {}".format(video_name, err))
            db.rollback()

print(
    "---美国美国美国美国美国美国美国美国美国美国美国美国美国美国美国美国美国美国----")

# Crawl the USA cinema listing and store one row per movie (name, score,
# play URL) in the iqiyiVideo table with flag "usa".
# The page number fills the {} slot of the URL template.
usa_url_template = "https://list.iqiyi.com/www/1/2-27815------------11-{}-1-iqiyi--.html"
usa_item_xpath = "//div[@class='list-content']//ul[@class='qy-mod-ul']//li[@class='qy-mod-li']"
# Placeholders let the driver escape the scraped text: titles may contain
# quotes, which broke (and could inject into) the previously string-built SQL.
usa_insert_sql = (
    "INSERT INTO iqiyiVideo (name, score, videourl, flag) "
    "VALUES (%s, %s, %s, %s)"
)

try:
    # The listing is documented as having 13 pages, so the exclusive upper
    # bound is 14; the earlier range(1, 13) stopped after page 12.
    for page in range(1, 14):
        page_request = urllib.request.Request(
            url=usa_url_template.format(page), headers=headers2)
        # Close the HTTP response as soon as the body has been read.
        with urllib.request.urlopen(page_request) as page_response:
            page_tree = etree.HTML(page_response.read().decode('utf-8'))

        for item in page_tree.xpath(usa_item_xpath):
            hrefs = item.xpath(".//div[@class='qy-mod-link-wrap']/a/@href")  # play URL
            titles = item.xpath(".//a[@class='link-txt']/@title")  # movie name
            scores = item.xpath(".//p/span[@class='text-score']//text()")  # rating

            # Entries without a link or a title cannot be stored meaningfully;
            # skip them instead of crashing on an empty XPath result.
            if not hrefs or not titles:
                continue

            video_name = str(titles[0])
            video_url = "http:" + str(hrefs[0])
            # A missing rating is stored as NULL rather than aborting the crawl.
            video_score = str(scores[0]) if scores else None

            try:
                # Values are bound in column order (name, score, videourl, flag);
                # the earlier code passed URL, name and score shifted by one column.
                cursor.execute(
                    usa_insert_sql,
                    (video_name, video_score, video_url, "usa"))
                db.commit()
            except pymysql.MySQLError as err:
                # Report the failure and roll back, then continue with the next movie.
                print("insert failed for {!r}: {}".format(video_name, err))
                db.rollback()

            print(video_name)
            print(video_url)
            print(video_score)
finally:
    # Release the cursor and the connection even if a page request fails.
    cursor.close()
    db.close()

本文只爬取了视频名称、播放地址和评分;电影海报暂时获取不到,因为海报图片是由 JS 动态加载的。那么怎样才能爬取更多信息呢?下节揭晓。

你可能感兴趣的:(python,python)