requests 抓取网站

 1 import requests
 2 from requests.exceptions import RequestException
 3 import re
 4 import json
 5 
 6 def get_one_page(url):
 7     try:
 8         headers = {
 9             'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/'
10                 + '535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1'
11         }
12         response = requests.get(url, headers=headers)
13         response.encoding = 'gb2312'
14         if response.status_code == 200:
15             return response.text
16         return None
17     except RequestException:
18         return None
19 
def parse_one_page(html):
    """Yield one dict per anime entry found in the listing page *html*.

    Each yielded dict has the keys ``image`` (cover URL), ``title``,
    ``type`` (genre text) and ``introduction``.

    NOTE(review): the original regex was destroyed when this article was
    extracted from HTML -- the literal tags inside the pattern were
    stripped, leaving only fragments.  The pattern below is a
    best-effort reconstruction that captures the same four groups in
    the same order; the exact tag scaffolding must be re-verified
    against the live page markup before use.
    """
    pattern = re.compile(
        '<li>.*?target.*?src="(.*?)".*?'  # group 1: cover image URL
        'title.*?>(.*?)<.*?'              # group 2: title text
        '>(.*?)<.*?'                      # group 3: type / genre
        '>(.*?)<.*?'                      # group 4: introduction
        '</li>', re.S)
    items = re.findall(pattern, html)
    for item in items:
        yield {
            'image': item[0],
            'title': item[1],
            'type': item[2],
            'introduction': item[3]
        }

def write_to_file(content):
    """Append *content* as one JSON line to Yinghua.json (UTF-8)."""
    with open('Yinghua.json', 'a', encoding='utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + '\n')

def main(page):
    """Scrape listing page *page* and persist every entry it contains."""
    # 12 entries per page -> first global serial number on this page.
    num = (page - 1) * 12 + 1
    # URL was redacted ('×××') when the article was published; the real
    # site URL must be restored here.
    url = '×××page=' + str(page) + '×××'
    html = get_one_page(url)
    if html is None:
        # Fetch failed: skip this page instead of crashing inside
        # re.findall(pattern, None) as the original code would.
        return
    for item in parse_one_page(html):
        print(num)
        print(str(item) + '\n')
        item = str(num) + str(item)
        write_to_file(item)
        num += 1

if __name__ == '__main__':
    # Crawl listing pages 1..397.
    for i in range(1, 398):
        main(page=i)

     

    你可能感兴趣的:(requests 抓取网站)