python 爬虫 selenium 爬取今日头条首页新闻

由于利用 XHR 请求拿 json 的请求参数有些复杂,所以尝试了下用 selenium

import time

import pymongo
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.common.by import By


def get_page():
    """Launch a headless Chrome and open the Toutiao homepage.

    Returns:
        The live WebDriver on success, or None when the page load times out.
    """
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    browser = webdriver.Chrome(options=options)
    try:
        browser.get('https://www.toutiao.com/')
        return browser
    except TimeoutException:
        # Quit the driver so a failed load does not leak a Chrome process
        # (the original dropped the reference and returned None implicitly).
        browser.quit()
        print('time out')
        return None

def more_news(page_number, browser):
    """Scroll to the bottom of the page repeatedly so the infinite feed
    loads more articles.

    Args:
        page_number: Number of scroll iterations to perform.
        browser: A live WebDriver positioned on the Toutiao homepage.

    Returns:
        The same browser object, after scrolling.
    """
    for i in range(page_number):
        # The original `while ... break` was a disguised `if`:
        # log progress once every 20 iterations.
        if (i + 1) % 20 == 0:
            print('第' + str(i + 1) + '条查找成功')
        # Give the page time to load newly fetched items before scrolling again.
        time.sleep(2)
        browser.execute_script('window.scrollTo(0, document.body.scrollHeight)')
    return browser

def page_parse(browser):
    """Extract title/href from each feed article link and persist each one
    to MongoDB via save_to_mongo.

    Args:
        browser: A live WebDriver whose current page is the Toutiao feed.

    Returns:
        None (also returned early when no article links are found).
    """
    # Selenium 4 removed find_elements_by_xpath; use find_elements(By.XPATH, ...).
    # Note: the plural find_elements never raises NoSuchElementException —
    # it returns [] when nothing matches — so the old except branch was dead.
    articles = browser.find_elements(
        By.XPATH,
        '/html/body/div/div[2]/div[2]/div[2]/ul/li//a[@class="link"]',
    )
    if not articles:
        return None

    for article in articles:
        save_to_mongo({
            'title': article.text,
            'href': article.get_attribute('href'),
        })

def save_to_mongo(result):
    """Insert one article document into the local ``test.Toutiao`` collection.

    Args:
        result: Mapping with 'title' and 'href' keys describing one article.
    """
    # Use the client as a context manager so the connection is closed after
    # the insert; the original leaked one open client per call.
    with pymongo.MongoClient(host='localhost', port=27017) as client:
        client.test.Toutiao.insert_one(result)

def main():
    """Crawl the Toutiao homepage feed and store article titles in MongoDB."""
    browser = get_page()
    if browser is None:
        # Page load timed out — get_page returned None; nothing to parse.
        # (The original crashed here with AttributeError.)
        return
    try:
        more_news(page_number=100, browser=browser)
        page_parse(browser)
        print('存储完成')
    finally:
        # Always shut the driver down so Chrome does not outlive the script.
        browser.quit()

if __name__ == '__main__':
    main()

只存储了文章的标题,以后也许还会看看别的

你可能感兴趣的:(爬虫)