requests+re 爬取猫眼电影榜单

python3

 

requests+re 爬取猫眼电影榜单_第1张图片

 

 

import re
import json
import requests
from multiprocessing import Pool
from requests.exceptions import RequestException

def spider_page(url, timeout=10):
    """Download ``url`` and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests.get`` may block indefinitely on a stalled
            connection.

    Returns:
        The response text when the server answers with HTTP 200; ``None``
        for any other status code or when the request raises a
        ``RequestException`` (which includes timeouts).
    """
    # Keep the try body minimal: only the network call can raise here.
    try:
        response = requests.get(url, timeout=timeout)
    except RequestException:
        return None
    if response.status_code == 200:
        return response.text
    return None

# Matches one <dd> film entry on a board page; re.S lets '.' span newlines.
# Capture groups, in order: rank, poster URL, title, cast text,
# release-time text, integer part of the score, fractional part of the score.
# NOTE(review): the literal tags '<dd>', '</p>' and '</dd>' were missing from
# the published pattern (apparently swallowed as HTML markup) and have been
# restored here -- confirm against the live page source.
_MOVIE_PATTERN = re.compile(
    r'<dd>.*?board-index.*?>(\d+).*?data-src="(.*?)".*?name">.*?'
    r'title="(.*?)".*?star">(.*?)<.*?releasetime">(.*?)</p>.*?'
    r'integer">(.*?)<.*?fraction">(.*?)<.*?</dd>',
    re.S,
)


def parse_page(html):
    """Yield one dict per film entry found in a board page.

    Args:
        html: Page source as a string. ``None`` or an empty string (for
            example after a failed download) yields nothing.

    Yields:
        Dicts with the string-valued keys ``index``, ``photo_url``,
        ``name``, ``actors``, ``time`` and ``score``.
    """
    if not html:
        return
    for item in _MOVIE_PATTERN.findall(html):
        yield {
            'index': item[0],
            'photo_url': item[1],
            'name': item[2],
            # Drop the 3-character label in front of the cast list
            # (presumably "主演：" -- verify against the page).
            'actors': item[3].strip()[3:],
            # Drop the 5-character label in front of the release date
            # (presumably "上映时间：" -- verify against the page).
            'time': item[4].strip()[5:],
            # The score is split over two elements: integer part + fraction.
            'score': item[5] + item[6],
        }


def write_file(text):
    """Append ``text`` to ``result.txt`` as one JSON document per line.

    Args:
        text: Any JSON-serialisable object (``main`` passes film dicts).
    """
    with open('result.txt', 'a', encoding='utf-8') as f:
        # ensure_ascii=False keeps Chinese characters readable in the file
        # instead of emitting \uXXXX escapes.
        f.write(json.dumps(text, ensure_ascii=False) + '\n')


def main(num):
    """Scrape the board page at offset ``num`` and save its films.

    Args:
        num: Value of the ``offset`` query parameter selecting the page.
    """
    # Pages differ only in the offset suffix of the URL.
    url = 'https://maoyan.com/board/4?offset=' + str(num)
    html = spider_page(url)
    for item in parse_page(html):
        write_file(item)


if __name__ == '__main__':
    # Per the original author's page analysis the board has 10 pages,
    # reached with offsets 0, 10, ..., 90.
    for i in range(10):
        main(i * 10)
    # Alternative from the original post: fetch the pages in parallel with
    # a multiprocessing pool.
    # pool = Pool()
    # pool.map(main, [i * 10 for i in range(10)])

 

你可能感兴趣的:(python学习)