爬虫 利用正则表达式爬取猫眼电影

# coding=utf-8
import requests
import re
import json
from io import open
#from multiprocessing import Pool
from requests.exceptions import RequestException
# Minimal User-Agent header so Maoyan does not reject the request as a bot.
headers = {'User-Agent':'Mozilla/5.0 '}
def get_one_page(url):
    """Download *url* and return the response body as text.

    Returns None when the request fails at the network level or the
    server answers with a non-200 status.
    """
    try:
        # Timeout keeps one hung connection from stalling the whole crawl;
        # print() form works on both Python 2 and 3 (original used py2 `print url`).
        res = requests.get(url, headers=headers, timeout=10)
        if res.status_code == 200:
            print(url)  # progress indicator
            return res.text
        return None
    except RequestException:
        # Network failure: signal "no page" to the caller instead of raising.
        return None

# Pre-compiled pattern extracting one movie from a <dd> entry on the board page.
# Groups: (1) rank  (2) poster url  (3) title  (4) star line  (5) release line
#         (6) integer part of score  (7) fraction part of score.
# NOTE(review): reconstructed from the garbled source, whose HTML tags were
# stripped by the blog scraper — verify against a live board page.
MOVIE_RE = re.compile(
    r'<dd>.*?board-index.*?>(\d+)</i>'
    r'.*?data-src="(.*?)"'
    r'.*?name">.*?>(.*?)</a>'
    r'.*?star">(.*?)</p>'
    r'.*?releasetime">(.*?)</p>'
    r'.*?integer">(.*?)</i>'
    r'.*?fraction">(.*?)</i>',
    re.S)


def parse_one_page(html):
    """Parse one Maoyan board page into a list of movie dicts.

    Each dict carries the keys index/image/title/actor/time/score.
    Returns an empty list when *html* is falsy (e.g. the download failed
    and get_one_page returned None) instead of crashing.
    """
    content = []
    if not html:
        return content
    for m in MOVIE_RE.finditer(html):
        content.append({
            'index': m.group(1),
            'image': m.group(2),
            'title': m.group(3).strip(),
            # [3:] drops the "主演:" prefix, as in the original
            'actor': m.group(4).strip()[3:],
            # [5:] drops the "上映时间:" prefix, as in the original
            'time': m.group(5).strip()[5:],
            'score': m.group(6) + m.group(7),
        })
    return content


def write_to_file(content):
    """Append *content* to res.txt as one JSON line.

    ensure_ascii=False keeps the Chinese titles human-readable; the
    trailing newline separates successive records (the original wrote
    them back-to-back, and also redundantly closed the file inside
    the `with` block).
    """
    with open('res.txt', 'a', encoding='utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + '\n')


def main():
    """Crawl all 10 pages (offset 0..90) of the Maoyan TOP100 board."""
    for i in range(0, 10):
        url = 'http://maoyan.com/board/4?offset=' + str(i * 10)
        html = get_one_page(url)
        content = parse_one_page(html)
        print(content)
        write_to_file(content)


if __name__ == '__main__':
    main()
爬虫 利用正则表达式爬取猫眼电影_第1张图片

你可能感兴趣的:(python爬虫)