Python抓取今日头条图片

#TouTiao.py

from _md5 import md5
import pymongo
import requests
from urllib.parse import urlencode
from bs4 import BeautifulSoup
from requests.exceptions import RequestException
import json
import re
from TouTiaoConfig import *
import os
from multiprocessing import Pool

# Create the MongoDB client without connecting yet (connect=False): this
# module's functions run inside multiprocessing.Pool workers, and a PyMongo
# client that is already connected must not be carried across a fork.
# With connect=False the connection is opened on the first operation,
# i.e. separately inside each worker process.
client = pymongo.MongoClient(MONGO_URL, MONGO_Port, connect=False)
db = client[MONGO_DB]

# HTTP headers sent with the index-page and detail-page requests.
# Replace the placeholder with your own User-Agent (e.g. the one used by
# your system's default browser).
headers = {"User-Agent": "your_user-agent"}

def get_page_index(offset, keyword, timeout=10):
    """Fetch one page of Toutiao search results as raw response text.

    Args:
        offset: Result offset to request (the script passes multiples of 20).
        keyword: Search keyword.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response body as text when the server answers with status 200;
        None on any other status or when the request itself fails.
    """
    query = {
        'autoload': 'true',
        'count': '20',
        'cur_tab': '1',
        'format': 'json',
        'from': 'search_tab',
        'keyword': keyword,
        'offset': offset,
    }
    # urlencode escapes spaces/special characters and joins the key=value
    # pairs with '&' so the query can be appended to the URL safely.
    url = "https://www.toutiao.com/search_content/?" + urlencode(query)
    try:
        # requests applies no timeout by default; without one a stalled
        # connection would block this worker indefinitely.
        response = requests.get(url, headers=headers, timeout=timeout)
    except RequestException:
        print("请求索引页出错", url)
        return None
    if response.status_code == 200:
        return response.text
    return None

def parse_page_index(html):
    """Yield the article URL of each entry in a search-index JSON payload.

    Args:
        html: JSON text of the index response, or None/empty when the
            request failed.

    Yields:
        The 'article_url' value of every dict entry in the payload's 'data'
        list. Entries without that key yield None, so callers should skip
        None values.

    A missing, empty, or malformed payload yields nothing instead of raising.
    """
    if not html:
        return
    try:
        payload = json.loads(html)
    except ValueError:  # json.JSONDecodeError is a ValueError subclass
        return
    if not isinstance(payload, dict):
        return
    entries = payload.get('data')
    # 'data' may be absent or null in the response; only a list is usable.
    if not isinstance(entries, list):
        return
    for item in entries:
        if isinstance(item, dict):
            yield item.get('article_url')

def get_page_detail(url, timeout=10):
    """Fetch an article detail page.

    Args:
        url: Address of the article page.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The page body as text when the server answers with status 200;
        None on any other status or when the request itself fails.
    """
    try:
        # requests applies no timeout by default; without one a stalled
        # connection would block this worker indefinitely.
        response = requests.get(url, headers=headers, timeout=timeout)
    except RequestException:
        print("请求详情页出错", url)
        return None
    if response.status_code == 200:
        return response.text
    return None
    
def parse_page_detail(html2, url):
    """Extract the title and image URLs from a detail page and download the images.

    Two page layouts are handled:
      * gallery pages, where the image list is embedded as
        ``gallery: JSON.parse("...")``;
      * article pages, where image URLs are pulled out of the
        ``articleInfo ... content:`` field.

    Args:
        html2: HTML text of the detail page.
        url: Address the page was fetched from (stored in the result).

    Returns:
        A dict with keys 'url', 'title' and 'images' (list of image URLs),
        or None when no image data could be extracted.
    """
    soup = BeautifulSoup(html2, 'lxml')
    title_tags = soup.select('title')
    # A page without a <title> tag gets an empty title instead of an IndexError.
    title = title_tags[0].get_text() if title_tags else ''

    # Raw strings keep the regex escapes (\( and \)) away from Python's own
    # string-escape processing.
    gallery_match = re.search(r'gallery: JSON.parse\("(.*?)"\),', html2, re.S)
    if gallery_match:
        # The captured text is JSON embedded in a quoted string: first turn
        # every backslash(+optional char)+quote sequence into a bare quote,
        # then drop all remaining backslashes.
        payload = re.sub(r'(\\.?)"', '"', gallery_match.group(1))
        payload = payload.replace('\\', '')
        try:
            data = json.loads(payload)
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            print("解析图集数据出错", url)
            return None
        if not (isinstance(data, dict) and 'sub_images' in data):
            return None
        # 'sub_images' may be null; treat that as an empty gallery.
        images = [item.get('url') for item in data.get('sub_images') or []]
    else:
        article_match = re.search(r'articleInfo:.*?content:(.*?),', html2, re.S)
        if not article_match:
            return None
        images = re.findall(r'3D;"(http.*?)"', article_match.group(1))

    for image in images:
        download_image(image)
    return {
        'url': url,
        'title': title,
        'images': images,
    }
 
def save_to_mongo(result):
    """Insert one parsed-article record into the configured MongoDB collection.

    Args:
        result: Dict describing the article (as built by the detail parser).

    Returns:
        True when the document was inserted, False when MongoDB reported
        an error.
    """
    try:
        # insert_one replaces Collection.insert, which is deprecated in
        # PyMongo 3 and removed in PyMongo 4.
        db[MONGO_TABLE].insert_one(result)
    except pymongo.errors.PyMongoError as exc:
        print("存储到MongoDB失败", exc)
        return False
    print("存储到MongoDB成功", result)
    return True

def download_image(url, timeout=10):
    """Download one image and hand its bytes to save_image.

    Args:
        url: Address of the image.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        Always None; failures are reported on stdout and otherwise ignored
        so that one bad image does not abort the crawl.
    """
    print('正在下载', url)
    try:
        # requests applies no timeout by default; without one a stalled
        # connection would block this worker indefinitely.
        response = requests.get(url, timeout=timeout)
    except RequestException:
        print("请求图片出错", url)
        return None
    if response.status_code == 200:
        save_image(response.content)  # .content is the raw binary body
    return None
    
def save_image(content):
    """Write image bytes to ``<script dir>/image/toutiao/<md5 of content>.jpg``.

    The file name is the MD5 hex digest of the content, so different images
    never collide on a name and an image that was already saved is not
    written again.

    Args:
        content: Raw image bytes.
    """
    import hashlib  # public MD5 API; `_md5` is a private CPython module

    file_dir = os.path.join(
        os.path.dirname(os.path.realpath(__file__)), "image", "toutiao"
    )
    # makedirs also creates the intermediate "image" directory, and
    # exist_ok avoids a failure when another pool worker creates the
    # directory at the same time.
    os.makedirs(file_dir, exist_ok=True)
    file_path = os.path.join(file_dir, hashlib.md5(content).hexdigest() + '.jpg')
    if not os.path.exists(file_path):
        with open(file_path, 'wb') as f:
            f.write(content)

def main(offset):
    """Crawl one page of search results: fetch, parse, download and store.

    Args:
        offset: Search-result offset of the index page to process.
    """
    html = get_page_index(offset, KEYWORD)
    if not html:
        # The index request failed, so there is nothing to parse.
        return
    for url in parse_page_index(html):
        if not url:
            # Index entries without an article URL are skipped.
            continue
        html2 = get_page_detail(url)
        if not html2:
            continue
        result = parse_page_detail(html2, url)
        if result:
            save_to_mongo(result)
    
if __name__ == '__main__':
    # One task per index page; offsets are multiples of 20, matching the
    # 'count' of 20 results requested per page.
    groups = [x * 20 for x in range(GROUP_START, GROUP_END + 1)]
    # The context manager shuts the worker processes down once map() has
    # returned, instead of leaving the pool open until interpreter exit.
    with Pool() as pool:
        pool.map(main, groups)
# TouTiaoConfig.py
# MongoDB connection settings.

MONGO_URL = "localhost"
MONGO_Port = 27017
MONGO_DB = "toutiao"
MONGO_TABLE = "toutiao"

# Inclusive range of index-page numbers to crawl; the crawler multiplies
# each number by 20 to obtain the search offset.
GROUP_START = 0
GROUP_END = 9

# Search keyword passed to the index request.
KEYWORD = "街拍"

获取图片源:

在 TouTiaoConfig.py 中设置 KEYWORD 变量,即可按该关键词搜索并下载相应的图片,例如:KEYWORD='街拍'。
Python抓取今日头条图片_第1张图片

结果:
Python抓取今日头条图片_第2张图片

Python抓取今日头条图片_第3张图片

你可能感兴趣的:(python)