Python爬取电影信息

利用火狐浏览器模拟浏览,爬取页面源码。

爬取电影天堂的影片

进行多页爬取

1、导入模块

from bs4 import BeautifulSoup
import re
import requests
from selenium import webdriver
import time

2、构建浏览器爬取多页代码 

# Launch a fresh Firefox instance for each listing page, load it, wait for
# the JavaScript-rendered content, then capture the page source.
for i in range(1, 4):#scrape pages 1 through 3
    firefox = webdriver.Firefox()
    # NOTE(review): '%s' is filled with the page number, producing
    # p1o6.shtml, p2o6.shtml, ... — confirm against the site's pagination.
    firefox.get('http://vip.1905.com/list/t_1/p%so6.shtml' % i)
    time.sleep(10)  # fixed wait for dynamic content; no explicit wait used
    html = firefox.page_source
    firefox.quit()

3、解析页面 

    # Parse the captured source; each 'borderBox' element is one movie card.
    soup = BeautifulSoup(html, 'lxml')

    for soups in soup.find_all(class_='borderBox'):
        # Iterating a Tag yields its children, so each inner loop prints the
        # children of the matched element one by one.
        for scores in soups.find_all(class_='fr score'):
            for score in scores:
                print("得分:", score)
        for titles in soups.find_all(class_='name'):
            for title in titles:
                print("剧名:", title)
        for years in soups.find_all(class_='hidden year'):
            for year in years:
                print("时间:", year)
        for zhu_actors in soups.find_all(class_="hidden actor"):
            for zhu_actor in zhu_actors:
                print("主演:", zhu_actor)
        for pages in soups.find_all(class_='hidden descr'):
            for page in pages:
                print("简介:", page)
        for urls in soups.find_all(class_='hidden url'):
            for url in urls:
                print("网址:", url)

4、保存解析信息 

# Append the most recently seen field values to file1.txt.
# NOTE(review): this writes whatever score/title/year/... were bound last by
# the loops above — it raises NameError if a field was never matched at all.
with open('file1.txt', 'a+', encoding='utf-8')as f:
            f.write("得分:"+score+'\n'+"标题:"+title+'\n'+"时间:"+year+'\n' +
                    "主演:" +zhu_actor+'\n'+"简介:"+page+'\n'+"网址:"+url+'\n')
            f.write('='*50+'\n')

5、提取图片并保存 

    # Extract poster image URLs from the raw HTML and save each file.
    # NOTE(review): the pattern is empty — the original regex was evidently
    # lost when this post was published.  As written, findall yields only
    # empty strings, so the downloads below cannot work.  TODO: restore it.
    patter = re.compile('')
    imgs = re.findall(patter, html)
    print(imgs)
    for img in imgs:
        # matches are presumably protocol-relative ('//...') URLs — verify
        url = 'http:' + img
        print(img)
        tupian = img.split('/')[-1]  # file name = last path segment
        res = requests.get(url)
        # 'ab' appends, so re-running concatenates onto existing files
        with open('D:\爬虫\Video' + '\\' + tupian, 'ab')as f:
            f.write(res.content)

完整代码如下: 

from bs4 import BeautifulSoup
import re
import requests
from selenium import webdriver
import time

# Complete script: scrape movie listings (pages 1-3) from vip.1905.com with a
# real Firefox session, print and append every movie card's fields to
# file1.txt, then download each poster image referenced in the page source.
import os

for i in range(1, 4):  # scrape pages 1 through 3
    firefox = webdriver.Firefox()
    try:
        # '%s' is filled with the page number -> p1o6.shtml, p2o6.shtml, ...
        firefox.get('http://vip.1905.com/list/t_1/p%so6.shtml' % i)
        time.sleep(10)  # fixed wait for the JS-rendered content to load
        html = firefox.page_source
    finally:
        # BUGFIX: always close the browser, even when get()/page_source
        # raises, so a failed iteration does not leak a Firefox process.
        firefox.quit()

    soup = BeautifulSoup(html, 'lxml')

    for soups in soup.find_all(class_='borderBox'):
        # BUGFIX: pre-bind every field so the file write below cannot raise
        # NameError when a card is missing one of the sub-elements.
        score = title = year = zhu_actor = page = url = ''
        for scores in soups.find_all(class_='fr score'):
            for score in scores:
                print("得分:", score)
        for titles in soups.find_all(class_='name'):
            for title in titles:
                print("剧名:", title)
        for years in soups.find_all(class_='hidden year'):
            for year in years:
                print("时间:", year)
        for zhu_actors in soups.find_all(class_="hidden actor"):
            for zhu_actor in zhu_actors:
                print("主演:", zhu_actor)
        for pages in soups.find_all(class_='hidden descr'):
            for page in pages:
                print("简介:", page)
        for urls in soups.find_all(class_='hidden url'):
            for url in urls:
                print("网址:", url)

        # Append this card's fields; str() guards against a child being a
        # Tag rather than plain text (str + Tag would raise TypeError).
        with open('file1.txt', 'a+', encoding='utf-8') as f:
            f.write("得分:" + str(score) + '\n' + "标题:" + str(title) + '\n' +
                    "时间:" + str(year) + '\n' + "主演:" + str(zhu_actor) + '\n' +
                    "简介:" + str(page) + '\n' + "网址:" + str(url) + '\n')
            f.write('=' * 50 + '\n')

    # BUGFIX: the published code had re.compile(''), which matches the empty
    # string at every position and can never yield an image URL.  This
    # pattern captures protocol-relative <img src="//..."> links instead —
    # TODO(review): confirm it matches the site's current markup.
    patter = re.compile(r'<img[^>]+src="(//[^">]+\.(?:jpg|jpeg|png|gif))"')
    imgs = re.findall(patter, html)
    print(imgs)
    save_dir = r'D:\爬虫\Video'  # raw string: avoids invalid-escape warnings
    os.makedirs(save_dir, exist_ok=True)  # BUGFIX: ensure the dir exists
    for img in imgs:
        url = 'http:' + img  # matches are protocol-relative URLs
        print(img)
        tupian = img.split('/')[-1]  # file name = last path segment
        res = requests.get(url, timeout=30)  # BUGFIX: don't hang forever
        # 'ab' appends, so re-running concatenates onto existing files
        with open(os.path.join(save_dir, tupian), 'ab') as f:
            f.write(res.content)

以上即为利用火狐浏览器模拟浏览并爬取页面源码的完整过程。

运行代码出现下面的图示: 

Python爬取电影信息_第1张图片

你可能感兴趣的:(Python爬虫开发,爬虫开发)