Crawling the Weibo hot-search list + crawling comments by keyword

Fetching first-level comments (with pagination)

import requests
import csv

f = open('5.3微博热搜top50.csv', mode='a', encoding='utf-8', newline='')
csv_write = csv.writer(f)
csv_write.writerow(['id', 'screen_name', 'text_raw', 'create_at', 'like_counts', 'total_number'])

"""param = {
    "type":"24",
    "interval_id":"100:90",
    "action":"",
    "start":0,
    "limit":20
}"""

headers = {
    "Cookie":"SCF=ApR9UyRcpl7-0VlQ_Sj2v4Qh0m1qMI21-cotmFhdiGgbKrb-VrN665r_qQRNsa8mz36pbk68asD38D07fZyRJFI.; SINAGLOBAL=6964292461763.401.1742461264689; XSRF-TOKEN=9mP7YO-5pwF8OdLaDhVK9FR6; ALF=1748852561; SUB=_2A25FEaIBDeRhGeFH61IZ-S7IyjSIHXVmbrvJrDV8PUJbkNB-LVbGkW1NeFFhOzS7EJ4MBWGdQ88y-XXzOss4GeQF; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WFui4P6ohlOWRineSqz-QUk5JpX5KMhUgL.FoM4eh5R1K5XeKn2dJLoIE9y9EH8SC-RxF-4SEH8SCHWxCHWBbH8SC-RxFHFxbH8SC-RxFHFxntt; _s_tentry=weibo.com; Apache=5372934081314.751.1746260665311; ULV=1746260665386:12:1:1:5372934081314.751.1746260665311:1742461264705; WBPSESS=FanyJc4XtWDeu4BHypru5GvHwF1l9C2eaOC9N36OYtspya1f480h9J_E1SozZ3-GB4UY8rnBfm-1uzIerN76XU0e7wHEHqYzy0NQTPb8JBo_V5AabiGnKlYWJl48cGwtlHDZZpFKnlMsT0OoyxZgvQ==",
    "Referer":"https://weibo.com/2810373291/Pq79az77K",
    "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36 SLBrowser/9.0.6.2081 SLBChan/111 SLBVPV/64-bit"
}

def get_next(page_param="count=10"):
    # The first request uses count=10; later requests pass max_id=... taken from the previous response
    url = ("https://weibo.com/ajax/statuses/buildComments"
           f"?is_reload=1&id=5162162638553693&is_show_bulletin=2&is_mix=0&{page_param}"
           "&uid=7872307803&fetch_level=0&locale=zh-CN")
    resp = requests.get(url, headers=headers)
    json_data = resp.json()
    data_list = json_data['data']
    max_id = json_data['max_id']
    if not data_list:
        return

    for data in data_list:
        text_raw = data['text_raw']
        comment_id = data['id']
        create_at = data['created_at']
        like_counts = data['like_counts']
        total_number = data['total_number']
        screen_name = data['user']['screen_name']
        print(comment_id, screen_name, text_raw, create_at, like_counts, total_number)
        csv_write.writerow([comment_id, screen_name, text_raw, create_at, like_counts, total_number])

    # Check the cursor only after writing the current page, so the last page is not dropped
    if max_id == 0:
        print("Reached the last page, stopping")
        return
    get_next('max_id=' + str(max_id))

get_next()
f.close()
#https://weibo.com/ajax/statuses/buildComments?is_reload=1&id=5162162638553693&is_show_bulletin=2&is_mix=0&count=10&uid=7872307803&fetch_level=0&locale=zh-CN
#https://weibo.com/ajax/statuses/buildComments?flow=0&is_reload=1&id=5162162638553693&is_show_bulletin=2&is_mix=0&max_id=156613293222632&count=20&uid=7872307803&fetch_level=0&locale=zh-CN
#https://weibo.com/ajax/statuses/buildComments?flow=0&is_reload=1&id=5162162638553693&is_show_bulletin=2&is_mix=0&max_id=142044763021595&count=20&uid=7872307803&fetch_level=0&locale=zh-CN
#https://weibo.com/ajax/statuses/buildComments?flow=0&is_reload=1&id=5162162638553693&is_show_bulletin=2&is_mix=0&max_id=140670373952922&count=20&uid=7872307803&fetch_level=0&locale=zh-CN
#https://weibo.com/ajax/statuses/buildComments?flow=0&is_reload=1&id=5162162638553693&is_show_bulletin=2&is_mix=0&max_id=140258056794599&count=20&uid=7872307803&fetch_level=0&locale=zh-CN
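
The commented URLs above show the pagination pattern: each response carries a max_id value, which is passed back as the max_id parameter of the next request until the API returns max_id = 0. As a rough alternative to the recursive get_next above, here is a minimal iterative sketch of the same loop; it uses the same endpoint, headers and placeholder id/uid, but the function name and generator style are mine, not part of the original script:

def crawl_all_comments(weibo_id, uid, headers):
    # Follow the max_id chain until the API reports max_id == 0
    base_url = "https://weibo.com/ajax/statuses/buildComments"
    max_id = 0
    while True:
        params = {"is_reload": 1, "id": weibo_id, "is_show_bulletin": 2,
                  "is_mix": 0, "count": 20, "uid": uid,
                  "fetch_level": 0, "locale": "zh-CN"}
        if max_id:
            params["max_id"] = max_id
        payload = requests.get(base_url, headers=headers, params=params).json()
        for comment in payload.get("data", []):
            yield comment  # caller decides whether to print or write to CSV
        max_id = payload.get("max_id", 0)
        if max_id == 0:
            break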

Fetching the top 50 hot searches

import requests
import csv
from bs4 import BeautifulSoup

f = open('5.3微博热搜top50.csv', mode='a', encoding='utf-8', newline='')
csv_write = csv.writer(f)
csv_write.writerow(['title', 'url', 'hotness'])

def get_news():
    news = []
    url = 'https://s.weibo.com/top/summary/'
    headers = {
        "Cookie":"SCF=ApR9UyRcpl7-0VlQ_Sj2v4Qh0m1qMI21-cotmFhdiGgbKrb-VrN665r_qQRNsa8mz36pbk68asD38D07fZyRJFI.; SINAGLOBAL=6964292461763.401.1742461264689; XSRF-TOKEN=9mP7YO-5pwF8OdLaDhVK9FR6; ALF=1748852561; SUB=_2A25FEaIBDeRhGeFH61IZ-S7IyjSIHXVmbrvJrDV8PUJbkNB-LVbGkW1NeFFhOzS7EJ4MBWGdQ88y-XXzOss4GeQF; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WFui4P6ohlOWRineSqz-QUk5JpX5KMhUgL.FoM4eh5R1K5XeKn2dJLoIE9y9EH8SC-RxF-4SEH8SCHWxCHWBbH8SC-RxFHFxbH8SC-RxFHFxntt; _s_tentry=weibo.com; Apache=5372934081314.751.1746260665311; ULV=1746260665386:12:1:1:5372934081314.751.1746260665311:1742461264705; WBPSESS=FanyJc4XtWDeu4BHypru5GvHwF1l9C2eaOC9N36OYtspya1f480h9J_E1SozZ3-G7gRBm5EqTBkJmBP0BK_GYcYj4UuZXIELLeStoElW2asfbpZlcXs9Zb8VnFWiP21nCaP8CkaY7H64yqThIc17Bg==; UOR=,,link.csdn.net",
        "Referer" : "https://s.weibo.com/",
        "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36 SLBrowser/9.0.6.2081 SLBChan/111 SLBVPV/64-bit"
    }
    resp = requests.get(url, headers=headers)
    soup = BeautifulSoup(resp.text, 'lxml')
    # print(soup)  # uncomment to inspect the raw page

    # Parse the page; the first <a> is the pinned entry, which has no hotness <span>, so skip it
    url_titles = soup.select('#pl_top_realtimehot > table > tbody > tr > td.td-02 > a')
    url_hotness = soup.select('#pl_top_realtimehot > table > tbody > tr > td.td-02 > span')

    for i in range(len(url_titles) - 1):
        hot_news = {}
        hot_news['title'] = url_titles[i + 1].get_text()
        hot_news['url'] = "https://s.weibo.com" + url_titles[i + 1]['href']
        hot_news['hotness'] = url_hotness[i].get_text()
        news.append(hot_news)
        csv_write.writerow([hot_news['title'], hot_news['url'], hot_news['hotness']])

    return news

news = get_news()
f.close()
print(news)
# #pl_top_realtimehot > table > tbody > tr > td.td-02          cell that holds each entry
# #pl_top_realtimehot > table > tbody > tr > td.td-02 > a      title and link
# #pl_top_realtimehot > table > tbody > tr > td.td-02 > span   hotness

#https://s.weibo.com/top/summary?cate=entrank
#https://s.weibo.com/top/summary?cate=realtimehot
#https://s.weibo.com/top/summary?cate=recommend
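
Judging from the commented URLs above, the same summary page takes a cate query parameter that switches between the real-time (realtimehot), entertainment (entrank) and recommended (recommend) rankings. A small sketch of passing it through, reusing the headers dict from get_news; the function name is mine and only the cate values visible in those URLs are assumed to work:

def get_ranking_page(cate='realtimehot', headers=None):
    # cate values seen above: realtimehot, entrank, recommend
    resp = requests.get('https://s.weibo.com/top/summary',
                        headers=headers, params={'cate': cate})
    return BeautifulSoup(resp.text, 'lxml')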

Crawling first-level and second-level comments plus the hot-search list

import requests
import csv
import time
from bs4 import BeautifulSoup


class WeiboSpider:
    def __init__(self):
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
            "Cookie": "SCF=ApR9UyRcpl7-0VlQ_Sj2v4Qh0m1qMI21-cotmFhdiGgbKrb-VrN665r_qQRNsa8mz36pbk68asD38D07fZyRJFI.; SINAGLOBAL=6964292461763.401.1742461264689; ALF=1748852561; SUB=_2A25FEaIBDeRhGeFH61IZ-S7IyjSIHXVmbrvJrDV8PUJbkNB-LVbGkW1NeFFhOzS7EJ4MBWGdQ88y-XXzOss4GeQF; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WFui4P6ohlOWRineSqz-QUk5JpX5KMhUgL.FoM4eh5R1K5XeKn2dJLoIE9y9EH8SC-RxF-4SEH8SCHWxCHWBbH8SC-RxFHFxbH8SC-RxFHFxntt; UOR=,,link.csdn.net; XSRF-TOKEN=zI4FVs-b5Jo4_Nx7DsOw9YKY; _s_tentry=weibo.com; Apache=9442437661723.48.1746334453943; ULV=1746334453970:13:2:1:9442437661723.48.1746334453943:1746260665386; WBPSESS=FanyJc4XtWDeu4BHypru5GvHwF1l9C2eaOC9N36OYtspya1f480h9J_E1SozZ3-G7gRBm5EqTBkJmBP0BK_GYZgcFehyJ9JdxLw6cMeO3Siwm1ppe2Ed4BNFsRn1Sa6FDLYtdDpyRHVUyriIzX9DlQ=="  # 请替换有效Cookie
        }
        self.base_url = "https://weibo.com/ajax/statuses/buildComments"

    def save_to_csv(self, data, filename, mode='a'):
        """保存数据到CSV文件"""
        with open(filename, mode, encoding='utf-8', newline='') as f:
            writer = csv.writer(f)
            writer.writerow(data)

    def fetch_hot_search(self, filename='weibo_hot.csv'):
        """Fetch the Weibo hot-search Top 50"""
        url = 'https://s.weibo.com/top/summary/'

        # Initialise the CSV file with a header row
        self.save_to_csv(['rank', 'title', 'url', 'hotness'], filename, mode='w')

        try:
            resp = requests.get(url, headers=self.headers)
            soup = BeautifulSoup(resp.text, 'lxml')

            # Parse the hot-search rows
            items = soup.select('#pl_top_realtimehot table tr')[1:51]  # take the first 50 entries

            for index, item in enumerate(items, 1):
                title_tag = item.select_one('td.td-02 a')
                hot_tag = item.select_one('td.td-02 span')

                if title_tag is None:  # skip rows without a link (e.g. ads)
                    continue

                title = title_tag.get_text(strip=True)
                link = "https://s.weibo.com" + title_tag['href']
                hot = hot_tag.get_text(strip=True) if hot_tag else '0'

                self.save_to_csv([index, title, link, hot], filename)
                print(f'Saved hot search: {title}')

        except Exception as e:
            print(f'Failed to fetch hot searches: {e}')

    def fetch_comments(self, weibo_id, uid, filename='comments.csv'):
        """Fetch first-level comments of a post (with pagination)"""
        # Initialise the CSV file with a single header row
        self.save_to_csv(['comment_id', 'screen_name', 'text', 'created_at', 'likes', 'level'], filename, mode='w')

        max_id = 0
        page = 1

        while True:
            params = {
                "is_reload": 1,
                "id": weibo_id,
                "is_show_bulletin": 2,
                "is_mix": 0,
                "max_id": max_id,
                "count": 20,
                "uid": uid,
                "fetch_level": 0,
                "locale": "zh-CN"
            }

            try:
                resp = requests.get(self.base_url, headers=self.headers, params=params)
                data = resp.json()

                if data.get('ok') != 1:
                    print(f'Request failed: {data.get("msg")}')
                    break

                comments = data.get('data', [])
                if not comments:
                    print('No more comments')
                    break

                # Save each comment as one CSV row
                for comment in comments:
                    row = [
                        comment.get('id'),
                        comment.get('user', {}).get('screen_name'),
                        comment.get('text_raw'),
                        comment.get('created_at'),
                        comment.get('like_counts'),
                        1
                    ]
                    self.save_to_csv(row, filename)
                    print(f'Saved comment: {row[2]}')

                    # Also fetch the second-level replies under this comment
                    cid = comment.get('id')
                    self.fetch_second_comment(cid, uid, filename)

                # Update the pagination cursor
                max_id = data.get('max_id', 0)

                if max_id == 0:
                    break

                print(f'Page {page} finished, fetching the next page...')
                page += 1
                time.sleep(2)  # throttle requests

            except Exception as e:
                print(f'Request failed: {str(e)}')
                break

    def fetch_second_comment(self, rootid, uid, filename='comments.csv'):
        """Fetch second-level replies under one first-level comment (appends to the same file)"""

        max_id = 0
        page = 1
        while True:
            params = {
                "is_reload": 1,
                "id": rootid,
                "is_show_bulletin": 2,
                "is_mix": 1,
                "fetch_level":1,
                "max_id": max_id,
                "count": 20,
                "uid": uid,
                "locale": "zh-CN"
            }
            try:
                resp = requests.get(self.base_url, headers=self.headers, params=params)
                data = resp.json()

                if data.get('ok') != 1:
                    print(f'Request failed: {data.get("msg")}')
                    break

                comments = data.get('data', [])
                if not comments:
                    print('No more replies')
                    break

                # Save each reply as one CSV row
                for comment in comments:
                    row = [
                        comment.get('id'),
                        comment.get('user', {}).get('screen_name'),
                        comment.get('text_raw'),
                        comment.get('created_at'),
                        comment.get('like_counts'),
                        2
                    ]
                    self.save_to_csv(row, filename)
                    print(f'Saved reply: {row[2]}')

                # Update the pagination cursor
                max_id = data.get('max_id', 0)

                if max_id == 0:
                    break

                print(f'Reply page {page} finished, fetching the next page...')
                page += 1
                time.sleep(2)  # throttle requests

            except Exception as e:
                print(f'Request failed: {str(e)}')
                break


if __name__ == '__main__':
    spider = WeiboSpider()

    # Example usage
    # 1. Fetch the hot-search list
    spider.fetch_hot_search()

    # 2. Fetch first- and second-level comments of one post
    spider.fetch_comments(
         weibo_id="5162273186778546",
         uid="5143998400",
         filename="comments.csv"
    )
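
The weibo_id and uid values passed to fetch_comments come from the buildComments request that the comment panel sends, which you can read off the browser's developer tools (the example URLs in the first script show the same id/uid pair). A minimal sketch of reusing the spider over several posts; the (weibo_id, uid) pairs are placeholders you would replace with real ones:

spider = WeiboSpider()
posts = [
    ("5162273186778546", "5143998400"),  # placeholder (weibo_id, uid) pairs
]
for weibo_id, uid in posts:
    spider.fetch_comments(weibo_id=weibo_id, uid=uid,
                          filename=f"comments_{weibo_id}.csv")  # one CSV per post
    time.sleep(3)  # pause between posts to stay gentle on the API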
