Python 爬虫:爬取今日头条搜索结果(图片及标题)

""""

进入头条首页,在右边输入关键字,进入搜索页面,主要爬取搜索的到的图片以及图片的标题

""""

""""

python版本:python3.6.5

""""

# Read the search keyword and the last page to crawl from the user;
# crawling always starts from the first page.
# NOTE(review): end_page is read before keyword here but the prompts make
# the order clear to the user.

end_page = int(input('请输入结束页面:'))
keyword = input('请输入查找关键字:')

# Build the request and fetch the search-results JSON

def get_url():
    """Fetch the first page of Toutiao search results as raw JSON text.

    Uses the module-level ``keyword`` as the search term.

    Returns:
        The response body (a JSON string) on success, or ``None`` when
        the request fails or the server answers with an error status.
    """
    url = 'https://www.toutiao.com/search_content/?'
    # Query parameters of the search-content API; offset is a 0-based
    # item index, so the first page starts at 0 (the original '1'
    # silently skipped the first result).
    params = {
        'offset': '0',
        'format': 'json',
        'keyword': keyword,
        'autoload': 'true',
        'count': '20',
        'cur_tab': '1',
        'from': 'search_tab'
    }
    # Browser-like headers: the endpoint rejects obvious non-browser clients.
    headers = {
        'referer': 'https://www.toutiao.com/search/?keyword=%E8%A1%97%E6%8B%8D',
        'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/63.0.3239.132 Safari/537.36',
        'x-requested-with': 'XMLHttpRequest'
    }
    try:
        # timeout keeps a dead connection from hanging the script forever.
        response = requests.get(url, params=params, headers=headers, timeout=10)
        response.raise_for_status()
        return response.text
    except requests.RequestException:
        # Network/HTTP failure: signal "no data" instead of crashing
        # (a bare except here previously hid even programming errors).
        return None

# Parse the wanted fields out of the Toutiao response

def get_toutiao(toutiao_json):
    """Extract titles and image URLs from a Toutiao search JSON string.

    Args:
        toutiao_json: raw JSON text with a top-level 'data' list.

    Returns:
        A list of dicts shaped ``{'title': str, 'image': [url, ...]}``.
        Entries without a title are skipped; an entry whose
        'image_list' is missing yields an empty 'image' list.
    """
    json_toutiao = json.loads(toutiao_json)
    # 'data' may be absent or null in an error response.
    data_list = json_toutiao.get('data') or []
    items = []
    for data in data_list:
        # Skip ads / entries that carry no title.
        if data.get('title'):
            title = data.get('title')
            # 'image_list' can be absent (None); default to an empty list
            # so the loop below does not raise TypeError.
            images = data.get('image_list') or []
            url_list = []
            for image in images:
                # The API returns protocol-relative URLs ('//p...');
                # prefix the scheme so they are directly usable.
                url = 'https:' + image['url']
                url_list.append(url)
            dict1 = {
                'title': title,
                'image': url_list
            }
            items.append(dict1)
    return items

# Persist the extracted items to disk

def save_content(items):
    """Append each item to '<keyword>.txt', one JSON object per line.

    Opening the file once (instead of once per item) and terminating each
    record with a newline keeps the output parseable as JSON Lines; the
    original concatenated objects back-to-back, which no parser can read.
    """
    # File is named after the module-level search keyword.
    filename = keyword + '.txt'
    # Append mode so repeated runs accumulate results.
    with open(filename, 'a', encoding='utf-8') as f:
        for item in items:
            # ensure_ascii=False keeps Chinese titles human-readable.
            f.write(json.dumps(item, ensure_ascii=False) + '\n')

# The final, assembled version of the code follows

import json

import pymysql
import requests

# Read the crawl range and search term up front; pages 1..end_page are fetched.
end_page = int(input('请输入结束页面:'))
keyword = input('请输入查找关键字:')


def get_url():
    """Fetch pages 1..end_page of Toutiao search results.

    The original version returned inside the loop, so only the first
    request was ever made, and ``range(end_page)`` started at 0, making
    the first offset ``(0 - 1) * 20 == -20``. This version requests each
    page (20 items per page), merges their 'data' lists, and still hands
    the caller a single JSON string of the shape ``{"data": [...]}`` so
    the downstream parser is unchanged.

    Returns:
        A JSON string with the merged 'data' list, or ``None`` when no
        page could be fetched.
    """
    url = 'https://www.toutiao.com/search_content/?'
    # Browser-like headers: the endpoint rejects obvious non-browser clients.
    headers = {
        'referer': 'https://www.toutiao.com/search/?keyword=%E8%A1%97%E6%8B%8D',
        'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/63.0.3239.132 Safari/537.36',
        'x-requested-with': 'XMLHttpRequest'
    }
    all_data = []
    for page in range(1, end_page + 1):
        params = {
            'offset': (page - 1) * 20,  # page 1 -> 0, page 2 -> 20, ...
            'format': 'json',
            'keyword': keyword,
            'autoload': 'true',
            'count': '20',
            'cur_tab': '1',
            'from': 'search_tab'
        }
        try:
            response = requests.get(url, params=params, headers=headers, timeout=10)
            response.raise_for_status()
            page_json = response.json()
        except (requests.RequestException, ValueError):
            # A failed or non-JSON page is skipped; keep crawling the rest.
            continue
        all_data.extend(page_json.get('data') or [])
    if not all_data:
        return None
    return json.dumps({'data': all_data}, ensure_ascii=False)


def get_toutiao(toutiao_json):
    """Extract titles and image URLs from a Toutiao search JSON string.

    Args:
        toutiao_json: raw JSON text with a top-level 'data' list.

    Returns:
        A list of dicts shaped ``{'title': str, 'image': [url, ...]}``.
        Entries without a title are skipped; an entry whose
        'image_list' is missing yields an empty 'image' list.
    """
    json_toutiao = json.loads(toutiao_json)
    # 'data' may be absent or null in an error response.
    data_list = json_toutiao.get('data') or []
    items = []
    for data in data_list:
        # Skip ads / entries that carry no title.
        if data.get('title'):
            title = data.get('title')
            # 'image_list' can be absent (None); default to an empty list
            # so the loop below does not raise TypeError.
            images = data.get('image_list') or []
            url_list = []
            for image in images:
                # The API returns protocol-relative URLs ('//p...');
                # prefix the scheme so they are directly usable.
                url = 'https:' + image['url']
                url_list.append(url)
            dict1 = {
                'title': title,
                'image': url_list
            }
            items.append(dict1)
    return items


def save_content(items):
    """Append each item to '<keyword>.txt', one JSON object per line.

    Opening the file once (instead of once per item) and terminating each
    record with a newline keeps the output parseable as JSON Lines; the
    original concatenated objects back-to-back, which no parser can read.
    """
    filename = keyword + '.txt'
    with open(filename, 'a', encoding='utf-8') as f:
        for item in items:
            # ensure_ascii=False keeps Chinese titles human-readable.
            f.write(json.dumps(item, ensure_ascii=False) + '\n')


def main():
    """Crawl, parse, and persist Toutiao search results."""
    toutiao_json = get_url()
    # get_url() returns None on network failure; feeding None into
    # json.loads would raise TypeError, so bail out early.
    if toutiao_json is None:
        return
    items = get_toutiao(toutiao_json)
    save_content(items)


if __name__ == '__main__':
    main()

 

你可能感兴趣的:(爬虫,python)