爬虫实战之下载梨视频

  • 基本原理:
    • 主要利用BeautifulSoup进行提取,用urlretrieve进行视频下载
    • 代码如下:
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# @Time    : 2020/06/06 21:00:01
# @File    : 梨视频下载.py
# @Software: Vsc

import requests
import re
from lxml import etree
from bs4 import BeautifulSoup
from pyquery import PyQuery as pq
# from urllib.request import urlretrieve
import urllib
import sys
# Python 2 only: force the interpreter-wide default encoding to UTF-8 so that
# implicit str <-> unicode conversions (e.g. formatting scraped text into the
# non-ASCII byte-string literals used in this script) do not fall back to ASCII.
# Python 3 has no builtin reload() and already uses UTF-8 as its default
# encoding, so the hack is skipped there instead of crashing with NameError.
if sys.version_info[0] == 2:
    reload(sys)  # re-exposes sys.setdefaultencoding, which is removed at startup
    sys.setdefaultencoding('utf-8')

# 思路:
# 1、请求第一页,获取html文件
# headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36'}
# response = requests.get(url, headers=headers)
# if response.status_code == 200:
#     print(response.text)
# else:
#     print('None')

# 2、获取视频url:
# base_url = 'https://www.pearvideo.com/'
# 正则
# vervideo_bd = re.findall(r'class="vervideo-bd">\s*<a href="(.*?)"', response.text)

# xpath
# html = etree.HTML(response.text)
# vervideo_bd = html.xpath('//div[@class="vervideo-bd"]/a/@href')

# beautifulsoup
# soup = BeautifulSoup(response.text, 'lxml')
# vervideo_bds = soup.find_all('a', class_='vervideo-lilink actplay')
# for vervideo_bd in vervideo_bds:
#     print(vervideo_bd.get('href'))   

# pyquery
# html = pq(response.text)
# vervideo_bds = html.find('.vervideo-lilink')
# for vervideo_bd in vervideo_bds:
#     print(vervideo_bd.get('href'))

# 3、请求视频url,获取播放地址
# 4、下载播放地址

def download_video(url, save_dir='/Users/amoryan/Desktop/python_test/实战/20200606/梨视频'):
    """Download every video linked from a pearvideo.com listing page.

    The listing page is fetched and parsed with BeautifulSoup; each
    ``<a class="vervideo-lilink actplay">`` anchor is followed to its detail
    page, from which the title and the ``srcUrl="..."`` value are extracted
    with regular expressions. The file at ``srcUrl`` is then saved as
    ``<save_dir>/<title>.mp4``.

    Args:
        url: URL of the listing page (e.g. a category page).
        save_dir: Directory the ``.mp4`` files are written to. It is created
            if it does not exist. Defaults to the path the original script
            hard-coded.

    Detail pages on which no title or no ``srcUrl`` can be found are skipped
    with a printed notice rather than aborting the whole run.
    """
    import os
    # urlretrieve lives in different modules on Python 2 and Python 3.
    try:
        from urllib.request import urlretrieve
    except ImportError:
        from urllib import urlretrieve

    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36'}
    response = requests.get(url, headers=headers)
    text = response.content.decode('utf-8')
    soup = BeautifulSoup(text, 'lxml')
    vervideo_bds = soup.find_all('a', class_='vervideo-lilink actplay')
    base_url = 'https://www.pearvideo.com/'

    if not os.path.isdir(save_dir):
        os.makedirs(save_dir)

    for vervideo_bd in vervideo_bds:
        href = vervideo_bd.get('href')
        if not href:
            # An anchor without href cannot be followed.
            continue
        # Assumes href is relative to the site root, as the original code did.
        v_url = base_url + href
        detail_html = requests.get(v_url, headers=headers).text

        # The capture must be terminated by the next '<'; a trailing lazy
        # group with nothing after it always matches the empty string.
        titles = re.findall(r'class="video-tt">(.*?)<', detail_html)
        sources = re.findall(r'srcUrl="(.*?)"', detail_html)
        if not titles or not sources:
            print('跳过%s' % v_url)
            continue
        title = titles[0]
        download_url = sources[0]

        # Strip characters that are path separators or invalid in file names;
        # fall back to the link itself if nothing usable is left.
        safe_title = re.sub(r'[\\/:*?"<>|]', '_', title).strip()
        if not safe_title:
            safe_title = href.replace('/', '_')

        # Download the video.
        path = os.path.join(save_dir, '%s.mp4' % safe_title)
        print('正在下载%s'% title)
        urlretrieve(download_url, path)
        
def main():
    """Entry point: download the videos listed on the category_8 page."""
    download_video('https://www.pearvideo.com/category_8')

# Run the crawler only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()

你可能感兴趣的:(爬虫实战)