Eastmoney Guba (东方财富股吧) Comment Scraper and Sentiment Analysis

Contents

      • Scraper Setup
      • Scraper
      • Sentiment Analysis

Scraper Setup

This post uses selenium and bs4 for the scraping and stores the result in a JSON file. The required libraries are:

from selenium import webdriver
from bs4 import BeautifulSoup
from selenium.webdriver.chrome.options import Options
import json

Scraper

First, scrape the basic information of each post, such as the comment count, view count, and post link. Some news posts are published by the official account and have nothing to do with the stock being scraped, so they need to be filtered out.

def get_list(stock, x, y):
    # One list per field; the article/comment fields are filled in later from get_comment's results
    DFCF_dict = {
        'view_num': [], 'comment_num': [], 'title': [], 'poster': [],
        'latest_time': [], 'sub_url': [], 'article': [], 'article_like': [],
        'comment': [], 'comment_time': [], 'sub_comment': [], 'sub_comment_time': []}
    print('Scraping pages {} to {}'.format(x, y - 1))
    for i in range(x, y):
        print('Scraping page {}'.format(i))
        chrome_options = Options()
        chrome_options.add_argument('--headless')
        browser = webdriver.Chrome(options=chrome_options)

        url = 'http://guba.eastmoney.com/list,{}_{}.html'.format(stock, i)
        url2 = 'http://guba.eastmoney.com'
        browser.get(url)
        html_source = browser.page_source
        soup = BeautifulSoup(html_source, "html.parser")
        view_num = soup.find_all('span', class_='l1 a1')
        comment_num = soup.find_all('span', class_='l2 a2')
        title = soup.find_all('span', class_='l3 a3')
        poster = soup.find_all('span', class_='l4 a4')
        time = soup.find_all('span', class_='l5 a5')
        # Skip the first element of each column (the table header), hence the [1:] slices
        for element in view_num[1:]:
            DFCF_dict['view_num'].append(element.text)
        for element in comment_num[1:]:
            DFCF_dict['comment_num'].append(element.text)
        page_hrefs = []
        for element in title[1:]:
            a = element.find('a')
            page_hrefs.append(a.get('href'))
            DFCF_dict['title'].append(a.get('title'))
        for element in poster[1:]:
            DFCF_dict['poster'].append(element.text)
        for element in time[1:]:
            DFCF_dict['latest_time'].append(element.text)
        for href in page_hrefs:
            # Ordinary stock posts look like /news,<stock code>,<post id>.html;
            # official news posts get an empty sub_url so they can be skipped later
            if href[:5] == '/news' and href[7].isdigit():
                DFCF_dict['sub_url'].append(url2 + href)
            else:
                DFCF_dict['sub_url'].append('')
        browser.close()
    print('Finished scraping')
    return DFCF_dict
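
As a quick check, get_list can be run on its own; a minimal sketch, using the same stock code (600000) and page range as the final script further below:

# Scrape list pages 1-2 of the 600000 board and peek at the result
posts = get_list(600000, 1, 3)
print(len(posts['title']), 'posts collected')
print(posts['title'][0], posts['sub_url'][0])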

The next function scrapes the information inside each post, such as the comments, and can follow the post's pagination.

def get_comment(sub_url):
    comment_list = []
    comment_time_list = []
    sub_comment_list = []
    sub_comment_time_list = []
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    browser = webdriver.Chrome(options=chrome_options)

    browser.get(sub_url)
    html_source = browser.page_source
    soup = BeautifulSoup(html_source, "html.parser")

    post_time = soup.find('div', class_="zwfbtime")
    article = soup.find('div', class_="stockcodec .xeditor").text
    article_like = soup.find_all('div', id='like_wrap')[0].text
    comment = soup.find_all('div', class_='full_text')
    comment_time = soup.find_all('div', class_='publish_time')
    sub_comment = soup.find_all('span', class_="l2_full_text")
    sub_comment_time = soup.find_all('span', class_='time fl')

    page_num = soup.find_all('span', class_='sumpage')
    if len(page_num) == 0:
        page_num = 1
    else:
        page_num = int(page_num[0].text)

    print('This post has {} pages:'.format(page_num))
    for element in comment:
        comment_list.append(element.text)
    for element in comment_time:
        comment_time_list.append(element.text)
    for element in sub_comment:
        sub_comment_list.append(element.text)
    for element in sub_comment_time:
        sub_comment_time_list.append(element.text)
    print('Scraping page 1')
    browser.close()

    if page_num > 1:
        for i in range(2, page_num + 1):

            print('Scraping page {}'.format(i))
            # Page n of a post simply has "_n" inserted before ".html"
            new_url = sub_url[:-5] + '_{}'.format(i) + sub_url[-5:]
            browser = webdriver.Chrome(options=chrome_options)
            browser.get(new_url)
            html_source = browser.page_source
            soup = BeautifulSoup(html_source, "html.parser")

            comment = soup.find_all('div', class_='full_text')
            comment_time = soup.find_all('div', class_='publish_time')[4:]
            sub_comment = soup.find_all('span', class_="l2_full_text")
            sub_comment_time = soup.find_all('span', class_='time fl')[4:]

            for element in comment:
                comment_list.append(element.text)
            for element in comment_time:
                comment_time_list.append(element.text)
            for element in sub_comment:
                sub_comment_list.append(element.text)
            for element in sub_comment_time:
                sub_comment_time_list.append(element.text)

            browser.close()
    return article, article_like, comment_list, comment_time_list, sub_comment_list, sub_comment_time_list
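
If needed, get_comment can also be run on a single post; a minimal sketch with a hypothetical post URL (real URLs come from the sub_url list returned by get_list):

# Hypothetical post URL, for illustration only
url = 'http://guba.eastmoney.com/news,600000,123456789.html'
article, article_like, comments, comment_times, sub_comments, sub_comment_times = get_comment(url)
print(len(comments), 'top-level comments scraped')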

The final scraper program looks like this:

if __name__ == '__main__':

    DFCF_dict = get_list(600000, 1, 3)
    for url in DFCF_dict['sub_url']:
        if url != '':
            article,article_like,comment_list,comment_time_list,sub_comment_list,sub_comment_time_list = get_comment(url)

            DFCF_dict['article'].append(article)
            DFCF_dict['article_like'].append(article_like)
            DFCF_dict['comment'].append(comment_list)
            DFCF_dict['comment_time'].append(comment_time_list)
            DFCF_dict['sub_comment'].append(sub_comment_list)
            DFCF_dict['sub_comment_time'].append(sub_comment_time_list)

    # ensure_ascii=False keeps the Chinese text readable in the JSON file
    DFCF_json = json.dumps(DFCF_dict, ensure_ascii=False, sort_keys=False, indent=4, separators=(',', ': '))
    with open('data.json', 'w', encoding='utf-8') as f:
        f.write(DFCF_json)

Sentiment Analysis

Use snownlp to analyze the comments.

import json
import pandas as pd
from snownlp import SnowNLP
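
The saved comments need to be read back before the analysis; a minimal sketch, assuming the data.json file written by the scraper above:

# Read back the data saved by the scraper
with open('data.json', 'r', encoding='utf-8') as f:
    info_data = json.load(f)

# Quick sanity check of SnowNLP: sentiments is a positivity score between 0 and 1
print(SnowNLP('今天涨停,开心').sentiments)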

My computer cannot handle scraping very much data, so this is only a simple version.

all_comment = []
all_comment_time = []
# Flatten the per-post comment lists into one list
for comment in info_data['comment']:
    all_comment.extend(comment)

for comment in info_data['sub_comment']:
    all_comment.extend(comment)

# Strip the first four characters (a text label) from each top-level comment timestamp
for time_list in info_data['comment_time']:
    for t in time_list:
        all_comment_time.append(t[4:])

for time_list in info_data['sub_comment_time']:
    all_comment_time.extend(time_list)

df1 = pd.DataFrame([all_comment_time, all_comment])
df1 = df1.T
df1.columns = ['date', 'comment']
df1['date'] = pd.to_datetime(df1['date'])
df1.set_index(['date'], inplace=True)
# SnowNLP's sentiments is a positivity score in [0, 1]; scale it to [0, 100]
df1['sentiment'] = df1['comment'].apply(lambda text: SnowNLP(text).sentiments * 100)

df1['sentiment'].plot.bar()

(Figure: bar chart of the sentiment score for each comment)
