Python3 requests 多进程（multiprocessing.Pool）抓取猫眼电影 Top100 并保存到文件

#coding=utf-8
import json
import re
from multiprocessing import Pool
import requests
from requests.exceptions import RequestException
from fake_useragent import UserAgent


# Shared UserAgent instance; get_page() reads ua.chrome from it to fill the
# User-Agent request header.
ua=UserAgent()

def get_page(url, timeout=10):
    """Download *url* and return the response body as text.

    The request carries a Chrome User-Agent string taken from the
    module-level ``ua`` object.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the caller forever.

    Returns:
        The response text when the status code is 200. ``None`` for any other
        status code, or when ``requests`` raises a ``RequestException``
        (which includes timeouts).
    """
    try:
        headers = {'User-Agent': ua.chrome}
        response = requests.get(url, headers=headers, timeout=timeout)
    except RequestException:
        return None
    if response.status_code != 200:
        return None
    return response.text

# Text shown for one board entry (markup removed):
#   1 霸王别姬
#   霸王别姬
#   主演:张国荣,张丰毅,巩俐
#   上映时间:1993-01-01
#   9.5

# One match per <dd> element. Capture groups, in order: rank, poster URL,
# title, cast line, release line, integer part of the score, fractional part
# of the score.
# NOTE(review): the tag structure is reconstructed from the fields consumed in
# parse_page() -- verify it against the live page markup.
_ITEM_PATTERN = re.compile(
    r'<dd>.*?board-index-\d+">(.*?)</i>'
    r'.*?data-src="(.*?)"'
    r'.*?class="name">.*?<a.*?>(.*?)</a>'
    r'.*?class="star">(.*?)</p>'
    r'.*?class="releasetime">(.*?)</p>'
    r'.*?class="integer">(.*?)</i>'
    r'.*?class="fraction">(.*?)</i>.*?</dd>',
    re.S,
)

# Everything up to and including the first ASCII or full-width colon.
_LABEL_PATTERN = re.compile(r'^[^:：]*[:：]')


def _strip_label(text):
    """Return *text* trimmed, with a leading ``label:`` prefix removed if present."""
    # Cutting at the colon avoids fixed-width slices, which clip the value
    # whenever the label length differs from the hard-coded offset.
    return _LABEL_PATTERN.sub('', text.strip(), count=1).strip()


def parse_page(html):
    """Yield one dict per film entry found in *html*.

    Args:
        html: Markup of a board page.

    Yields:
        Dicts with the string keys ``index``, ``image``, ``title``, ``star``,
        ``time`` and ``score``. ``star`` and ``time`` have their leading
        label removed; ``score`` is the integer and fractional parts joined.
        Nothing is yielded when no entry matches.
    """
    for rank, image, title, star, release, integer, fraction in \
            _ITEM_PATTERN.findall(html):
        yield {
            'index': rank,
            'image': image,
            'title': title,
            'star': _strip_label(star),
            'time': _strip_label(release),
            'score': integer + fraction,
        }


def write_to_file(content):
    """Append *content* to ``res.txt`` using UTF-8 encoding."""
    with open("res.txt", "a", encoding="utf-8") as f:
        f.write(content)


def main(offset):
    """Fetch one board page, print each entry and append it to the result file.

    Args:
        offset: Value placed in the ``offset`` query parameter of the URL.
    """
    url = "https://maoyan.com/board/4?offset=" + str(offset)
    html = get_page(url)
    if html is None:
        # Download failed or returned a non-200 status; nothing to parse.
        return
    for item in parse_page(html):
        print(item)
        # ensure_ascii=False keeps non-ASCII text readable in the output file.
        write_to_file(json.dumps(item, ensure_ascii=False) + "\n")


if __name__ == "__main__":
    # One task per page: offsets 0, 10, ..., 90. map() blocks until every
    # task has finished, so leaving the with-block afterwards is safe.
    with Pool() as pool:
        pool.map(main, [i * 10 for i in range(10)])
User-Agent 请求头必须要加，否则服务器会返回 403
ensure_ascii=False 保证中文按原样写入文件，而不是被转义成 \uXXXX 形式

原文: https://rumenz.com/rumenbiji/python3-requests-multiprocessing.html

你可能感兴趣的:(多线程,python爬虫,python)