requests爬猫眼电影 -- 记录

#!/usr/bin/env python
#-*- coding:utf-8 -*-
import requests
import re
from requests.exceptions import RequestException
from json import dumps

def get_one_page(url, timeout=10):
    """Download one page and return its body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout, ``requests.get`` can block indefinitely on a
            stalled connection.

    Returns:
        The response text when the server answers with HTTP 200, otherwise
        ``None`` (non-200 status or any ``RequestException``, including a
        timeout).
    """
    try:
        response = requests.get(url, timeout=timeout)
    except RequestException:
        return None
    if response.status_code == 200:
        return response.text
    return None

# One board entry, from its opening <dd> to its closing </dd>.
# NOTE(review): the closing tags after each capture group (</i>, </a>, </p>,
# </dd>) and the leading <dd> were lost when this snippet was extracted from
# a web page; they are reconstructed here so that the lazy groups have
# something to stop at. Confirm against the live page markup.
_MOVIE_PATTERN = re.compile(
    r'<dd>.*?"board-index.*?>(.*?)</i>'
    r'.*?data-src="(.*?)".*?/>'
    r'.*?name"><a.*?>(.*?)</a>'
    r'.*?star">(.*?)</p>'
    r'.*?releasetime">(.*?)</p>'
    r'.*?integer">(.*?)</i>'
    r'.*?fraction">(.*?)</i>.*?</dd>',
    re.S,  # entries span several lines, so "." must also match newlines
)


def parse_one_page(html):
    """Yield one dict per movie entry found in a board page.

    Args:
        html: Page source as a string. ``None`` or an empty string (for
            example after a failed download) yields nothing.

    Yields:
        Dicts with the keys ``index``, ``picurl``, ``title``, ``actor``,
        ``times`` and ``score``. ``score`` is the integer and fraction
        parts joined back together.
    """
    if not html:
        return
    for item in _MOVIE_PATTERN.findall(html):
        yield {
            'index': item[0],
            'picurl': item[1],
            'title': item[2],
            # The cast paragraph is padded with newlines and indentation.
            'actor': item[3].replace('\n', '').strip(),
            'times': item[4],
            'score': item[5] + item[6],
        }


def write_to_file(content):
    """Append ``content`` to movie.txt as one JSON document per line."""
    with open('movie.txt', 'a', encoding='utf8') as fp:
        # ensure_ascii=False keeps non-ASCII text readable in the file.
        fp.write(dumps(content, ensure_ascii=False) + '\n')


def main(offset):
    """Fetch the board page starting at ``offset`` and store its entries."""
    url = "http://maoyan.com/board/4?offset=%s" % offset
    html = get_one_page(url)
    if html is None:
        # Download failed; there is nothing to parse for this page.
        return
    for item in parse_one_page(html):
        write_to_file(item)


if __name__ == '__main__':
    # Ten pages, with offsets 0, 10, ..., 90.
    for i in range(10):
        main(i * 10)

你可能感兴趣的:(python爬虫,python)