Scraping image galleries from Toutiao (今日头条)
import json
import os
import re
from hashlib import md5
from multiprocessing import Pool

import requests
from bs4 import BeautifulSoup

KEYWORD = '美女,清纯'  # search keyword
GROUP_START = 0
GROUP_END = 20


# Search by keyword and return the raw JSON text of one result page
def get_page_index(url, offset, keyword, code='utf-8'):
    params = {
        'autoload': 'true',
        'count': 20,
        'cur_tab': 3,
        'format': 'json',
        'keyword': keyword,
        'offset': offset,
    }
    try:
        r = requests.get(url, params=params)
        print(r.status_code)
        r.raise_for_status()
        r.encoding = code
        return r.text
    except requests.RequestException:
        return None


# Parse the JSON returned by the keyword search and yield each article URL
def parser_page_index(html):
    data = json.loads(html)
    # Make sure the JSON is non-empty and actually contains a 'data' key
    if data and 'data' in data.keys():
        for item in data.get('data'):
            yield item.get('article_url')


# Fetch the detail page of one gallery
def get_page_detail(url, code='utf-8'):
    try:
        r = requests.get(url)
        r.raise_for_status()
        r.encoding = code
        return r.text
    except requests.RequestException:
        return None


# Parse the gallery title and every image URL from the detail page
def parser_page_detail(html, url):
    soup = BeautifulSoup(html, 'html.parser')
    title = soup.select('title')[0].get_text()
    print(title)
    # re.S lets '.' also match newlines
    images_pattern = re.compile('var gallery = (.*?);', re.S)
    result = re.search(images_pattern, html)
    if result:
        data = json.loads(result.group(1))
        if data and 'sub_images' in data.keys():
            sub_images = data.get('sub_images')
            images = [item.get('url') for item in sub_images]
            for image in images:
                download_image(image)
            return {
                'title': title,
                'url': url,
                'images': images
            }


# Download a single image
def download_image(url):
    print('Downloading', url)
    try:
        r = requests.get(url)
        r.raise_for_status()
        save_image(r.content)
    except requests.RequestException:
        return None


def save_image(content):
    # Name the file by the MD5 of its content so duplicates are not saved twice
    file_path = '{0}/{1}.{2}'.format(os.getcwd(), md5(content).hexdigest(), 'jpg')
    print(file_path)
    if not os.path.exists(file_path):
        with open(file_path, 'wb') as f:
            f.write(content)


def main(offset):
    url = 'http://www.toutiao.com/search_content/?'
    html = get_page_index(url, offset, KEYWORD)
    if html:
        for article_url in parser_page_index(html):
            detail_html = get_page_detail(article_url)
            if detail_html:
                parser_page_detail(detail_html, article_url)


if __name__ == '__main__':
    pool = Pool()
    groups = [x * 20 for x in range(GROUP_START, GROUP_END + 1)]
    pool.map(main, groups)
    pool.close()
    pool.join()
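The trickiest step above is pulling the gallery JSON out of the page's inline JavaScript. Below is a minimal, self-contained sketch of that regex-plus-json.loads step, using a made-up HTML snippet rather than real Toutiao markup, just to show how the extraction works:

import json
import re

# Hypothetical snippet shaped like the 'var gallery = {...};' the parser expects;
# the real page markup may differ.
snippet = 'var gallery = {"sub_images": [{"url": "http://example.com/a.jpg"}]};'

match = re.search(re.compile('var gallery = (.*?);', re.S), snippet)
if match:
    gallery = json.loads(match.group(1))
    print([img.get('url') for img in gallery.get('sub_images', [])])
    # prints: ['http://example.com/a.jpg']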