学了 2 天的 Python,自己写了个简单的爬虫,可是爬虫有什么用呢?

# -*- coding: UTF-8 -*-
import requests
import pandas
import   re
import json
from bs4 import BeautifulSoup
import  openpyxl





def parseLinkedNews(url):
    """Fetch one page of the news-list API and scrape every linked article.

    The API is expected to answer with JSONP, i.e. the JSON document wrapped
    in a callback call such as ``newsloadercallback({...});``.  The wrapper
    is removed before the payload is decoded.  A bare JSON body (no callback
    wrapper) is accepted as well.

    Args:
        url: Fully formatted list-API URL for a single page.

    Returns:
        A list holding one ``getNewsDetail`` result per entry of
        ``result.data`` in the decoded payload, in response order.

    Raises:
        requests.RequestException: The request failed or timed out.
        ValueError: The unwrapped body is not valid JSON.
        KeyError: The payload lacks ``result``/``data`` or an entry has
            no ``url``.
    """
    # A timeout keeps a stalled connection from hanging the crawl forever.
    res = requests.get(url, timeout=10)
    res.encoding = 'utf-8'
    payload = res.text.strip()

    # str.lstrip/str.rstrip remove a *set of characters*, not a literal
    # prefix/suffix, so they cannot be used to peel off the callback name
    # reliably.  Slice between the first '(' and the last ')' instead.
    if not payload.startswith(('{', '[')):
        open_paren = payload.find('(')
        close_paren = payload.rfind(')')
        if open_paren != -1 and close_paren > open_paren:
            payload = payload[open_paren + 1:close_paren]

    jd = json.loads(payload)
    return [getNewsDetail(ent['url']) for ent in jd['result']['data']]





def getNewsDetail(url):
    """Download a single news article page and extract its main fields.

    Args:
        url: Address of the article page.

    Returns:
        A dict with the keys, in this order:
            ``title``   - text of the ``#artibodyTitle`` element,
            ``article`` - text of every ``#artibody p`` paragraph except the
                          last one, each followed by a newline,
            ``time``    - first child node of the ``.time-source`` element
                          (the publication time),
            ``source``  - text of the link inside ``.time-source span``,
            ``url``     - the ``url`` argument, unchanged.

    Raises:
        requests.RequestException: The request failed or timed out.
        IndexError: The page lacks one of the expected elements.
    """
    # The parser name belongs to BeautifulSoup only.  The second positional
    # parameter of requests.get is ``params``, so passing 'html.parser' there
    # appended a bogus query string to every article URL.
    res = requests.get(url, timeout=10)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')

    result = {}
    result['title'] = soup.select('#artibodyTitle')[0].text  # headline

    # The last <p> is deliberately skipped (presumably a trailing
    # editor/credit line - confirm against a live page).
    paragraphs = soup.select('#artibody p')[:-1]
    result['article'] = ''.join(p.text + '\n' for p in paragraphs)  # body text

    result['time'] = soup.select('.time-source')[0].contents[0]  # publication time
    result['source'] = soup.select('.time-source span a')[0].text  # news source
    result['url'] = url  # article link
    return result







if __name__ == '__main__':
    # List-API URL template; ``{}`` is replaced by the page number.
    url = ('http://api.roll.news.sina.com.cn/zt_list?channel=news&cat_1=gnxw&cat_2==gdxw1||=gatxw||=zs-pl||'
           '=mtjj&level==1||=2&show_ext=1&show_all=1'
           '&show_num=22&tag=1&format=json&page={}&callback=newsloadercallback&_=1512199560142')

    # Collect the article records of pages 1 through 9.
    news_total = []
    for i in range(1, 10):
        news_total.extend(parseLinkedNews(url.format(i)))

    # Build and export the table inside the guard: at module level these
    # statements ran on import too, where ``news_total`` is undefined.
    df = pandas.DataFrame(news_total)
    df.to_excel('C:/news.xlsx')

你可能感兴趣的:(人工智能)