selenium爬取新浪微博

逻辑很简单,先登录之后,搜索一个关键字,爬取内容

注意

  1. 使用mongodb数据库,通过update_one配合upsert按微博文本去重(去重来自以text为键的upsert,而非update_one本身)。
  2. tqdm添加进度条
  3. 无头浏览器
  4. 不加载图片
  5. 使用scrapy的selector解析页面

from selenium import webdriver
from scrapy import Selector
import time
import pymongo
from tqdm import tqdm


# Connect to MongoDB on the default local port.
client = pymongo.MongoClient('localhost', 27017)
# Use the database named "weibo".
db = client.weibo

chrome_options = webdriver.ChromeOptions()
# Disable image loading to speed up page fetches.
prefs = {"profile.managed_default_content_settings.images": 2}
chrome_options.add_experimental_option("prefs", prefs)
# Run Chrome headless (no visible browser window).
chrome_options.add_argument('--headless')
browser = webdriver.Chrome(options=chrome_options)

# Open the Weibo mobile login page and give it time to render.
browser.get('https://passport.weibo.cn/signin/login')
time.sleep(3)

# Log in to Sina Weibo with the (placeholder) credentials.
# NOTE(review): find_element_by_css_selector was removed in Selenium 4;
# this script assumes Selenium 3.x — confirm the pinned version.
browser.find_element_by_css_selector(
    '#loginName').send_keys('[email protected]')
browser.find_element_by_css_selector('#loginPassword').send_keys('sdklfs')
browser.find_element_by_css_selector('#loginAction').click()

# Search for a keyword (containerid query on m.weibo.cn, URL-encoded).
keyword = 'AI'
base_search_url = 'https://m.weibo.cn/search?containerid=100103type%3D1%26q%3D'
browser.get(base_search_url + keyword)

time.sleep(3)
# JavaScript that scrolls to the bottom of the page, triggering the
# infinite-scroll loader to append the next batch of results.
scroll_js = "window.scrollTo(0,document.body.scrollHeight);var lenOfPage = document.body.scrollHeight;return lenOfPage;"
# Scroll 20 times by default (~20 pages of results); adjust as needed.
for i in tqdm(range(20)):
    time.sleep(1)
    browser.execute_script(scroll_js)

# Parse the accumulated page source with scrapy's Selector.
selector = Selector(text=browser.page_source)
cards = selector.css('div.card-main')


def handle_column(ss):
    """Normalize a scraped text field.

    Returns '' for a missing/empty value, '0' when the field is only a
    bare count label (赞/评论/转发, i.e. like/comment/forward with no
    number attached), and the stripped text otherwise.
    """
    if not ss:
        return ''
    cleaned = ss.strip()
    return '0' if cleaned in ('赞', '评论', '转发') else cleaned


for card in cards:
    # Pull the raw fields out of each weibo card with CSS selectors.
    author = card.css('header > div > div > a > h3::text').get()
    created_at = card.css('.time::text').get()
    source = card.css('.from::text').get()
    text = card.css('.weibo-text::text').get()
    forward = card.css('.m-font.m-font-forward+h4::text').get()
    comment = card.css('.m-font.m-font-comment+h4::text').get()
    like = card.css('.m-icon.m-icon-like+h4::text').get()

    res_dict = {
        'author': handle_column(author),
        'created_at': handle_column(created_at),
        'source': handle_column(source),
        'text': handle_column(text),
        'forward': handle_column(forward),
        'comment': handle_column(comment),
        'like': handle_column(like),
    }

    # Upsert keyed on the post text so re-runs update existing records
    # instead of duplicating them. Reuse the already-normalized value
    # rather than calling handle_column(text) a second time, and make
    # the upsert flag an explicit keyword for readability.
    db.weibo.update_one({'text': res_dict['text']}, {'$set': res_dict},
                        upsert=True)

# quit() shuts down the browser AND the chromedriver process;
# close() would only close the window and leave the driver running.
browser.quit()

你可能感兴趣的:(python)