B站弹幕爬虫小试

运行脚本后输入视频的 CID 号，即可抓取该视频的弹幕、进行情感分析，并将结果导出为 Excel 文件。

import requests
import xml.etree.ElementTree as ET
import pandas as pd
from snownlp import SnowNLP

def get_bilibili_danmaku(cid, timeout=10):
    """Download the raw danmaku (bullet-comment) XML for a Bilibili video.

    Args:
        cid: The video's cid; it is interpolated into the comment
            endpoint URL.
        timeout: Seconds to wait for the server before giving up, passed
            to ``requests.get``.

    Returns:
        The response body as ``bytes`` when the server answers with
        HTTP 200, otherwise ``None`` (after printing a diagnostic).
    """
    url = f"https://comment.bilibili.com/{cid}.xml"
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # Network-level failures (DNS, refused connection, timeout) are
        # reported like a bad status so callers only have to check for None.
        print(f"Error accessing the API: {exc}")
        return None

    if response.status_code != 200:
        print(f"Error accessing the API. Status Code: {response.status_code}")
        return None
    return response.content

def parse_danmaku(xml_content):
    """Extract danmaku text and in-video timestamps from the comment XML.

    Every ``<d>`` element is read; the first comma-separated field of its
    ``p`` attribute is taken as the offset into the video in seconds.
    Elements whose ``p`` attribute is missing, or whose first field is not
    a finite number, are skipped instead of aborting the whole parse.

    Args:
        xml_content: The XML document as ``str`` or ``bytes``.

    Returns:
        A list of dicts with the keys ``'弹幕'`` (the element text, ``None``
        when the element is empty) and ``'视频中时间'`` (the offset formatted
        as ``"{hours}h {minutes}min {seconds}s"``).

    Raises:
        xml.etree.ElementTree.ParseError: If ``xml_content`` is not
            well-formed XML.
    """
    root = ET.fromstring(xml_content)
    danmaku_list = []
    for d in root.iter('d'):
        # str.split() always returns at least one item, so a length check
        # cannot detect a missing time; validate the value itself instead.
        offset_field = d.attrib.get('p', '').split(',')[0]
        try:
            video_time = int(float(offset_field))
        except (ValueError, OverflowError):
            # Empty / non-numeric field, or nan / inf: no usable timestamp.
            continue

        total_minutes, seconds = divmod(video_time, 60)
        hours, minutes = divmod(total_minutes, 60)

        danmaku_list.append({
            '弹幕': d.text,
            '视频中时间': f"{hours}h {minutes}min {seconds}s",
        })
    return danmaku_list

def perform_sentiment_analysis(danmaku_list):
    """Label each danmaku, in place, with a coarse sentiment class.

    The ``'弹幕'`` text of every entry is scored with ``SnowNLP(...).sentiments``
    and the score is bucketed into ``'情感分析'``:

    * ``1``  -- score above 0.6 (positive)
    * ``-1`` -- score below 0.4 (negative)
    * ``0``  -- anything in between, or entries with no text (neutral)

    Args:
        danmaku_list: List of dicts, each holding the danmaku text under
            the key ``'弹幕'``.

    Returns:
        The same list object, with ``'情感分析'`` added to every entry.
    """
    for danmaku in danmaku_list:
        text = danmaku['弹幕']

        # An empty <d> element yields None (or '') as its text. There is
        # nothing to score, so label it neutral rather than handing an
        # unusable value to SnowNLP and aborting the whole run.
        if not text:
            danmaku['情感分析'] = 0
            continue

        sentiment = SnowNLP(text).sentiments

        if sentiment > 0.6:
            danmaku['情感分析'] = 1  # Positive
        elif sentiment < 0.4:
            danmaku['情感分析'] = -1  # Negative
        else:
            danmaku['情感分析'] = 0  # Neutral

    return danmaku_list

def save_danmaku_to_excel(danmaku_list, filename):
    """Write the danmaku records to an Excel workbook and report the count.

    Args:
        danmaku_list: Records to export; they are turned into a
            ``pandas.DataFrame`` before writing.
        filename: Destination path of the workbook (written through the
            ``openpyxl`` engine, without the index column).
    """
    frame = pd.DataFrame(danmaku_list)
    frame.to_excel(filename, engine='openpyxl', index=False)

    saved_count = len(danmaku_list)
    print(f"保存了 {saved_count} 条弹幕到 {filename}")

if __name__ == '__main__':
    # Script entry point: ask for a cid, download its danmaku, label the
    # sentiment of each one and export everything to an Excel file.
    #
    # The cid ends up in both the request URL and the output filename, so
    # strip stray whitespace from the prompt and reject anything that is
    # not purely numeric before using it.
    cid = input('输入视频的cid:').strip()
    if not cid.isdigit():
        print("cid 必须是纯数字。")
    else:
        danmaku_xml = get_bilibili_danmaku(cid)
        if danmaku_xml:
            danmaku_list = parse_danmaku(danmaku_xml)
            danmaku_list = perform_sentiment_analysis(danmaku_list)
            filename = f"{cid}_弹幕_with_sentiment.xlsx"
            save_danmaku_to_excel(danmaku_list, filename)
        else:
            print("无法获取弹幕。")

你可能感兴趣的:(爬虫,windows,python)