用爬虫抓取自己 CSDN 博客的点赞数

查看自己每篇文章的点赞数、浏览量等数据。

# 翻页,获取全部文章链接 — paginate through the blog list and collect every article URL.
import requests, re, math

# Hoisted: the same UA header is needed for every request (CSDN blocks the default one).
HEADERS = {'User-Agent': 'Opera/8.0 (Windows NT 5.1; U; en)'}

url = 'https://me.csdn.net/yellow_python'
r = requests.get(url, headers=HEADERS).text
# The profile page shows the total count as e.g. "123 原创".
# Fail loudly (instead of AttributeError on .group) if the page layout changed.
m = re.search(r'(\d+)\s+原创', r)
if m is None:
    raise RuntimeError('article count not found - CSDN page layout may have changed')
articles = int(m.group(1))
pages = math.ceil(articles / 20)  # the list view shows 20 articles per page

article_urls = []
for page in range(1, pages + 1):
    page_url = 'https://blog.csdn.net/Yellow_python/article/list/%d' % page
    rp = requests.get(page_url, headers=HEADERS).text
    # NOTE(review): the original extraction pattern was destroyed when the post was
    # copied; match the canonical article-detail URLs instead — confirm against live HTML.
    article_urls.extend(re.findall(
        r'https://blog\.csdn\.net/Yellow_python/article/details/\d+', rp))
# Each link appears several times per page; dedupe while keeping first-seen order.
article_urls = list(dict.fromkeys(article_urls))
print(len(article_urls), article_urls)

# 解析文章 — visit every article with a headless Firefox and scrape its stats.
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.firefox.options import Options

firefox_option = Options()
firefox_option.add_argument('-headless')  # set_headless() is deprecated; use the CLI flag
driver = webdriver.Firefox(options=firefox_option)  # firefox_options= keyword is deprecated
wait = WebDriverWait(driver, 9)  # explicit-wait handle, 9 s timeout
try:
    for article_url in article_urls:
        driver.get(article_url)
        # The original selector spelled out the whole DOM path; h1.title-article is the
        # stable part of it. The other two selectors are kept verbatim — they are
        # position-dependent and should be verified against the current page markup.
        title = driver.find_element_by_css_selector('h1.title-article').text
        approval = driver.find_element_by_css_selector('.long-height > p:nth-child(4)').text
        pv = driver.find_element_by_css_selector('.read-count').text
        print(approval, pv, title, article_url, sep=' | ')
finally:
    # quit() (not close()) shuts down the whole browser process, so no orphaned
    # headless Firefox instances are left behind even when scraping fails mid-loop.
    driver.quit()

注意:程序运行后,要清理关闭失败的无头浏览器

你可能感兴趣的:(爬虫)