Fetching first-level comments (with pagination)
import requests
import csv
f = open('5.3微博热搜top50.csv', mode='a', encoding='utf-8', newline='')
csv_write = csv.writer(f)
csv_write.writerow(['id', 'screen_name', 'text_raw', 'create_at', 'like_counts', 'total_number'])
"""param = {
"type":"24",
"interval_id":"100:90",
"action":"",
"start":0,
"limit":20
}"""
headers = {
"Cookie":"SCF=ApR9UyRcpl7-0VlQ_Sj2v4Qh0m1qMI21-cotmFhdiGgbKrb-VrN665r_qQRNsa8mz36pbk68asD38D07fZyRJFI.; SINAGLOBAL=6964292461763.401.1742461264689; XSRF-TOKEN=9mP7YO-5pwF8OdLaDhVK9FR6; ALF=1748852561; SUB=_2A25FEaIBDeRhGeFH61IZ-S7IyjSIHXVmbrvJrDV8PUJbkNB-LVbGkW1NeFFhOzS7EJ4MBWGdQ88y-XXzOss4GeQF; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WFui4P6ohlOWRineSqz-QUk5JpX5KMhUgL.FoM4eh5R1K5XeKn2dJLoIE9y9EH8SC-RxF-4SEH8SCHWxCHWBbH8SC-RxFHFxbH8SC-RxFHFxntt; _s_tentry=weibo.com; Apache=5372934081314.751.1746260665311; ULV=1746260665386:12:1:1:5372934081314.751.1746260665311:1742461264705; WBPSESS=FanyJc4XtWDeu4BHypru5GvHwF1l9C2eaOC9N36OYtspya1f480h9J_E1SozZ3-GB4UY8rnBfm-1uzIerN76XU0e7wHEHqYzy0NQTPb8JBo_V5AabiGnKlYWJl48cGwtlHDZZpFKnlMsT0OoyxZgvQ==",
"Referer":"https://weibo.com/2810373291/Pq79az77K",
"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36 SLBrowser/9.0.6.2081 SLBChan/111 SLBVPV/64-bit"
}
def get_next(next_param="count=10"):
    url = f"https://weibo.com/ajax/statuses/buildComments?is_reload=1&id=5162162638553693&is_show_bulletin=2&is_mix=0&{next_param}&uid=7872307803&fetch_level=0&locale=zh-CN"
    resp = requests.get(url, headers=headers)
    json_data = resp.json()
    data_list = json_data['data']
    max_id = json_data['max_id']
    if not data_list:
        return
    # write the current page first, so the last page is not dropped when max_id is 0
    for data in data_list:
        text_raw = data['text_raw']
        comment_id = data['id']
        create_at = data['created_at']
        like_counts = data['like_counts']
        total_number = data['total_number']
        screen_name = data['user']['screen_name']
        print(comment_id, screen_name, text_raw, create_at, like_counts, total_number)
        csv_write.writerow([comment_id, screen_name, text_raw, create_at, like_counts, total_number])
    if max_id == 0:
        print("已到达终页,终止循环")
        return
    # max_id returned by this response drives the next page
    max_str = 'max_id=' + str(max_id)
    get_next(max_str)

get_next()
f.close()
#https://weibo.com/ajax/statuses/buildComments?is_reload=1&id=5162162638553693&is_show_bulletin=2&is_mix=0&count=10&uid=7872307803&fetch_level=0&locale=zh-CN
#https://weibo.com/ajax/statuses/buildComments?flow=0&is_reload=1&id=5162162638553693&is_show_bulletin=2&is_mix=0&max_id=156613293222632&count=20&uid=7872307803&fetch_level=0&locale=zh-CN
#https://weibo.com/ajax/statuses/buildComments?flow=0&is_reload=1&id=5162162638553693&is_show_bulletin=2&is_mix=0&max_id=142044763021595&count=20&uid=7872307803&fetch_level=0&locale=zh-CN
#https://weibo.com/ajax/statuses/buildComments?flow=0&is_reload=1&id=5162162638553693&is_show_bulletin=2&is_mix=0&max_id=140670373952922&count=20&uid=7872307803&fetch_level=0&locale=zh-CN
#https://weibo.com/ajax/statuses/buildComments?flow=0&is_reload=1&id=5162162638553693&is_show_bulletin=2&is_mix=0&max_id=140258056794599&count=20&uid=7872307803&fetch_level=0&locale=zh-CN
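The request URLs recorded above show the paging handshake: the first request carries only count=10, and each response returns a max_id that is sent back as the max_id parameter of the next request until it comes back as 0. Below is a minimal iterative sketch of the same loop (not the original code): it assumes the endpoint, id and uid from the URLs above and reuses the headers dict defined earlier, and it passes parameters via requests' params argument instead of string concatenation.
def iter_comment_pages(weibo_id, uid, headers, count=10):
    """Yield one page of first-level comments at a time, following max_id until it is 0."""
    base = "https://weibo.com/ajax/statuses/buildComments"
    max_id = 0
    while True:
        params = {
            "is_reload": 1, "id": weibo_id, "is_show_bulletin": 2, "is_mix": 0,
            "count": count, "uid": uid, "fetch_level": 0, "locale": "zh-CN",
        }
        if max_id:                       # only later pages carry max_id
            params["max_id"] = max_id
        data = requests.get(base, headers=headers, params=params).json()
        comments = data.get("data", [])
        if not comments:
            break
        yield comments
        max_id = data.get("max_id", 0)   # 0 marks the last page
        if max_id == 0:
            break

# example: for page in iter_comment_pages("5162162638553693", "7872307803", headers): print(len(page))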
Fetching the hot-search Top 50
import requests
import csv
from bs4 import BeautifulSoup
f = open('5.3微博热搜top50.csv', mode='a', encoding='utf-8', newline='')
csv_write = csv.writer(f)
csv_write.writerow(['title', 'url', 'hotness'])
def get_news():
    news = []
    url = 'https://s.weibo.com/top/summary/'
    headers = {
        "Cookie": "SCF=ApR9UyRcpl7-0VlQ_Sj2v4Qh0m1qMI21-cotmFhdiGgbKrb-VrN665r_qQRNsa8mz36pbk68asD38D07fZyRJFI.; SINAGLOBAL=6964292461763.401.1742461264689; XSRF-TOKEN=9mP7YO-5pwF8OdLaDhVK9FR6; ALF=1748852561; SUB=_2A25FEaIBDeRhGeFH61IZ-S7IyjSIHXVmbrvJrDV8PUJbkNB-LVbGkW1NeFFhOzS7EJ4MBWGdQ88y-XXzOss4GeQF; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WFui4P6ohlOWRineSqz-QUk5JpX5KMhUgL.FoM4eh5R1K5XeKn2dJLoIE9y9EH8SC-RxF-4SEH8SCHWxCHWBbH8SC-RxFHFxbH8SC-RxFHFxntt; _s_tentry=weibo.com; Apache=5372934081314.751.1746260665311; ULV=1746260665386:12:1:1:5372934081314.751.1746260665311:1742461264705; WBPSESS=FanyJc4XtWDeu4BHypru5GvHwF1l9C2eaOC9N36OYtspya1f480h9J_E1SozZ3-G7gRBm5EqTBkJmBP0BK_GYcYj4UuZXIELLeStoElW2asfbpZlcXs9Zb8VnFWiP21nCaP8CkaY7H64yqThIc17Bg==; UOR=,,link.csdn.net",
        "Referer": "https://s.weibo.com/",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36 SLBrowser/9.0.6.2081 SLBChan/111 SLBVPV/64-bit"
    }
    resp = requests.get(url, headers=headers)
    soup = BeautifulSoup(resp.text, 'lxml')
    # print(soup)  # debug: dump the raw page
    # parse the page
    url_titles = soup.select('#pl_top_realtimehot > table > tbody > tr > td.td-02 > a')
    url_hotness = soup.select('#pl_top_realtimehot > table > tbody > tr > td.td-02 > span')
    # the first <a> is the pinned entry and has no <span>, so titles/links are offset by one
    for i in range(len(url_titles) - 1):
        hot_news = {}
        hot_news['title'] = url_titles[i + 1].get_text()
        hot_news['url'] = "https://s.weibo.com" + url_titles[i + 1]['href']
        hot_news['hotness'] = url_hotness[i].get_text()
        news.append(hot_news)
        csv_write.writerow([hot_news['title'], hot_news['url'], hot_news['hotness']])
    return news

news = get_news()
print(news)
f.close()
# #pl_top_realtimehot > table > tbody > tr > td.td-02
# #pl_top_realtimehot > table > tbody > tr > td.td-02 > a      title and link
# #pl_top_realtimehot > table > tbody > tr > td.td-02 > span   hotness
#https://s.weibo.com/top/summary?cate=entrank
#https://s.weibo.com/top/summary?cate=realtimehot
#https://s.weibo.com/top/summary?cate=recommend
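The cate values noted above suggest the summary page serves different boards (realtimehot, entrank, recommend) through a query parameter. Here is a minimal sketch of fetching one board with the same selector; it assumes the parameter behaves as in those URLs, reuses the requests/BeautifulSoup imports from above, and takes the Cookie/User-Agent dict from get_news() as an argument.
def get_board(cate, headers):
    """Fetch one board of the hot-search summary page; cate is assumed to take the values seen above."""
    resp = requests.get("https://s.weibo.com/top/summary", headers=headers, params={"cate": cate})
    soup = BeautifulSoup(resp.text, 'lxml')
    return [{'title': a.get_text(), 'url': "https://s.weibo.com" + a['href']}
            for a in soup.select('#pl_top_realtimehot > table > tbody > tr > td.td-02 > a')]

# example: rows = get_board("entrank", headers)  # pass the same Cookie/User-Agent dict used in get_news()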
Crawling first- and second-level comments as well as hot-search content
import requests
import csv
import time
from bs4 import BeautifulSoup
import json
class WeiboSpider:
    def __init__(self):
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
            "Cookie": "SCF=ApR9UyRcpl7-0VlQ_Sj2v4Qh0m1qMI21-cotmFhdiGgbKrb-VrN665r_qQRNsa8mz36pbk68asD38D07fZyRJFI.; SINAGLOBAL=6964292461763.401.1742461264689; ALF=1748852561; SUB=_2A25FEaIBDeRhGeFH61IZ-S7IyjSIHXVmbrvJrDV8PUJbkNB-LVbGkW1NeFFhOzS7EJ4MBWGdQ88y-XXzOss4GeQF; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WFui4P6ohlOWRineSqz-QUk5JpX5KMhUgL.FoM4eh5R1K5XeKn2dJLoIE9y9EH8SC-RxF-4SEH8SCHWxCHWBbH8SC-RxFHFxbH8SC-RxFHFxntt; UOR=,,link.csdn.net; XSRF-TOKEN=zI4FVs-b5Jo4_Nx7DsOw9YKY; _s_tentry=weibo.com; Apache=9442437661723.48.1746334453943; ULV=1746334453970:13:2:1:9442437661723.48.1746334453943:1746260665386; WBPSESS=FanyJc4XtWDeu4BHypru5GvHwF1l9C2eaOC9N36OYtspya1f480h9J_E1SozZ3-GB4UY8rnBfm-1uzIerN76XU0e7wHEHqYzy0NQTPb8JBo_V5AabiGnKlYWJl48cGwtlHDZZpFKnlMsT0OoyxZgvQ=="  # replace with a valid Cookie
        }
        self.base_url = "https://weibo.com/ajax/statuses/buildComments"

    def save_to_csv(self, data, filename, mode='a'):
        """Append one row of data to a CSV file."""
        with open(filename, mode, encoding='utf-8', newline='') as f:
            writer = csv.writer(f)
            writer.writerow(data)
    def fetch_hot_search(self, filename='weibo_hot.csv'):
        """Fetch the Weibo hot-search Top 50."""
        url = 'https://s.weibo.com/top/summary/'
        # initialise the CSV file with a header row
        self.save_to_csv(['排名', '标题', '链接', '热度'], filename, mode='w')
        try:
            resp = requests.get(url, headers=self.headers)
            soup = BeautifulSoup(resp.text, 'lxml')
            # parse the hot-search rows: skip the header row, keep the first 50
            items = soup.select('#pl_top_realtimehot table tr')[1:51]
            for index, item in enumerate(items, 1):
                title_tag = item.select_one('td.td-02 a')
                hot_tag = item.select_one('td.td-02 span')
                if title_tag is None:  # skip rows without a link
                    continue
                title = title_tag.get_text(strip=True)
                link = "https://s.weibo.com" + title_tag['href']
                hot = hot_tag.get_text(strip=True) if hot_tag else '0'
                self.save_to_csv([index, title, link, hot], filename)
                print(f'已保存热搜:{title}')
        except Exception as e:
            print(f'获取热搜失败:{str(e)}')
    def fetch_comments(self, weibo_id, uid, filename='comments.csv'):
        """Fetch first-level comments of a post (with pagination)."""
        # write the CSV header row
        self.save_to_csv(['评论ID', '用户昵称', '内容', '时间', '点赞数', '评论等级'], filename, mode='a')
        max_id = 0
        page = 1
        while True:
            params = {
                "is_reload": 1,
                "id": weibo_id,
                "is_show_bulletin": 2,
                "is_mix": 0,
                "max_id": max_id,
                "count": 20,
                "uid": uid,
                "fetch_level": 0,
                "locale": "zh-CN"
            }
            try:
                resp = requests.get(self.base_url, headers=self.headers, params=params)
                data = resp.json()
                if data.get('ok') != 1:
                    print(f'请求失败:{data.get("msg")}')
                    break
                comments = data.get('data', [])
                if not comments:
                    print('没有更多评论')
                    break
                # save this page of comments
                for comment in comments:
                    row = [
                        comment.get('id'),
                        comment.get('user', {}).get('screen_name'),
                        comment.get('text_raw'),
                        comment.get('created_at'),
                        comment.get('like_counts'),
                        1  # level-1 comment
                    ]
                    self.save_to_csv(row, filename)
                    print(f'已保存评论:{row[2]}')
                    # fetch the replies (level-2 comments) under this comment
                    cid = comment.get('id')
                    self.fetch_second_comment(cid, uid)
                # update the pagination cursor
                max_id = data.get('max_id', 0)
                if max_id == 0:
                    break
                print(f'第{page}页采集完成,即将采集下一页...')
                page += 1
                time.sleep(2)  # throttle requests
            except Exception as e:
                print(f'请求失败:{str(e)}')
                break
    def fetch_second_comment(self, rootid, uid, filename='comments.csv'):
        """Fetch second-level comments (replies) under a root comment."""
        # the CSV header row is already written by fetch_comments, so it is not rewritten here
        max_id = 0
        page = 1
        while True:
            params = {
                "is_reload": 1,
                "id": rootid,
                "is_show_bulletin": 2,
                "is_mix": 1,
                "fetch_level": 1,
                "max_id": max_id,
                "count": 20,
                "uid": uid,
                "locale": "zh-CN"
            }
            try:
                resp = requests.get(self.base_url, headers=self.headers, params=params)
                data = resp.json()
                if data.get('ok') != 1:
                    print(f'请求失败:{data.get("msg")}')
                    break
                comments = data.get('data', [])
                if not comments:
                    print('没有更多评论')
                    break
                # save this page of replies
                for comment in comments:
                    row = [
                        comment.get('id'),
                        comment.get('user', {}).get('screen_name'),
                        comment.get('text_raw'),
                        comment.get('created_at'),
                        comment.get('like_counts'),
                        2  # level-2 comment
                    ]
                    self.save_to_csv(row, filename)
                    print(f'已保存评论:{row[2]}')
                # update the pagination cursor
                max_id = data.get('max_id', 0)
                if max_id == 0:
                    break
                print(f'第{page}页采集完成,即将采集下一页...')
                page += 1
                time.sleep(2)  # throttle requests
            except Exception as e:
                print(f'请求失败:{str(e)}')
                break
if __name__ == '__main__':
    spider = WeiboSpider()
    # example usage
    # 1. fetch the hot-search Top 50
    spider.fetch_hot_search()
    # 2. fetch level-1 and level-2 comments of a post
    spider.fetch_comments(
        weibo_id="5162273186778546",
        uid="5143998400",
        filename="comments.csv"
    )
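A quick way to sanity-check a run is to read the first rows back from the result files. The small sketch below assumes the default file names weibo_hot.csv and comments.csv used above and reuses the csv import from this script.
def preview_csv(filename, limit=5):
    """Print the first few rows of a CSV file written by the spider."""
    with open(filename, encoding='utf-8', newline='') as f:
        for i, row in enumerate(csv.reader(f)):
            if i >= limit:
                break
            print(row)

# preview_csv('weibo_hot.csv')
# preview_csv('comments.csv')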