In today's data-driven world, web crawlers have become an important way to gather information from the internet. Bilibili (B站), the largest danmaku video site in China, publishes a ranking list whose data reflects rich user behaviour and content trends. However, the Bilibili pages are rendered dynamically, so the plain requests library struggles to fetch the data directly. This article walks through a complete workflow for scraping the Bilibili ranking list with Selenium and an explicit-wait strategy; a quick illustration of the problem follows.
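As a hedged sketch (not part of the original workflow): fetching the page with requests returns only the initial HTML, so the rendered ranking entries are usually missing from it.

```python
import requests

# Fetch the static HTML; the dynamically rendered list items are typically absent,
# which is why a real browser driven by Selenium is used in the rest of this article.
resp = requests.get(
    "https://www.bilibili.com/v/popular/rank/all",
    headers={"User-Agent": "Mozilla/5.0"},
    timeout=10,
)
print(resp.status_code)
print("rank-item" in resp.text)  # usually False for JS-rendered lists
```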
```bash
# Create a virtual environment (optional)
python -m venv bilibili_spider
source bilibili_spider/bin/activate   # Linux/Mac
# bilibili_spider\Scripts\activate    # Windows

# Install the dependencies
pip install selenium webdriver-manager pandas
```
Key libraries:
- `selenium`: the core Web-automation library
- `webdriver-manager`: automatic browser-driver management
- `pandas`: data storage and processing

```python
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

# Automatically download and configure a matching ChromeDriver
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
```

Advantage: webdriver-manager resolves and caches a ChromeDriver that matches the locally installed Chrome, so there is no need to download or update the driver manually.
Open the Bilibili ranking page:
https://www.bilibili.com/v/popular/rank/all
Page characteristics (a quick verification sketch follows this list):
- Each ranking entry is rendered inside a `li.rank-item` element.
- Element locator: `#rank-list li.rank-item div.info a.title` selects the title link of each entry.
- Network monitoring: the DevTools Network panel shows the list being filled in by JavaScript after the initial HTML arrives, which is why plain requests cannot see it.
- Responsive design: the layout adapts to the viewport, so rely on the stable CSS selectors above rather than on element positions.
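A hedged sketch to verify both the driver setup and the selectors before building the full spider (the short fixed sleep is only for this check; explicit waits are introduced in the next section):

```python
import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
driver.get("https://www.bilibili.com/v/popular/rank/all")
time.sleep(3)  # crude wait, only for this sanity check
items = driver.find_elements(By.CSS_SELECTOR, "#rank-list li.rank-item")
print(f"matched {len(items)} rank items")  # the full list holds up to 100 entries
driver.quit()
```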
| Wait type | Characteristics | Typical use |
|---|---|---|
| Forced wait | `time.sleep()`, fixed duration | Quick debugging |
| Implicit wait | `driver.implicitly_wait()` | Global element loading |
| Explicit wait | `WebDriverWait` + condition | Precise control over specific elements |

A short code comparison of the three approaches follows the table.
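To make the table concrete, a brief sketch of what each approach looks like in code (it assumes a driver created as in the setup step; the explicit wait is the approach used throughout the rest of this article):

```python
import time
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def demo_waits(driver):
    # Forced wait: always blocks for the full duration, even if the page is ready sooner
    time.sleep(5)

    # Implicit wait: a global setting that applies to every subsequent find_element call
    driver.implicitly_wait(10)

    # Explicit wait: blocks only until this specific condition is met (or times out)
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.ID, "rank-list"))
    )
```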
An explicit-wait helper used throughout this article:

```python
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

def explicit_wait(driver, selector, timeout=10):
    """
    Custom explicit-wait helper.
    :param driver: WebDriver instance
    :param selector: CSS selector
    :param timeout: maximum wait time in seconds
    :return: the located WebElement, or None on timeout
    """
    try:
        element = WebDriverWait(driver, timeout).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, selector))
        )
        return element
    except TimeoutException as e:
        print(f"Timed out waiting for element: {selector}, error: {e}")
        return None
```
Key parameters of WebDriverWait:
- `timeout`: maximum wait time (5-15 seconds is usually enough)
- `poll_frequency`: polling interval (0.5 seconds by default)
- `ignored_exceptions`: exception types to ignore while polling

Commonly used expected conditions (a usage sketch follows this list):
- `EC.presence_of_element_located`: the element exists in the DOM (it may still be invisible)
- `EC.visibility_of_element_located`: the element is present and visible
- `EC.element_to_be_clickable`: the element is visible and enabled
- `EC.text_to_be_present_in_element`: the given text has appeared inside the element
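A short usage sketch of these conditions, assuming the driver and the explicit_wait helper defined above (the selector is the title-link selector introduced earlier):

```python
# Wait until the ranking list exists in the DOM
rank_list = explicit_wait(driver, "#rank-list", timeout=15)

# Wait until the first title link is clickable before reading or clicking it
first_title = WebDriverWait(driver, 10).until(
    EC.element_to_be_clickable(
        (By.CSS_SELECTOR, "#rank-list li.rank-item div.info a.title")
    )
)
print(first_title.text)
```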
Parsing a single ranking entry:

```python
def parse_video_item(item):
    """
    Parse one video entry.
    :param item: WebElement for a single li.rank-item
    :return: dict with the extracted fields, or None on failure
    """
    try:
        return {
            "rank": item.find_element(By.CSS_SELECTOR, ".num").text,
            "title": item.find_element(By.CSS_SELECTOR, ".title").text,
            "link": item.find_element(By.CSS_SELECTOR, ".title").get_attribute("href"),
            "author": item.find_element(By.CSS_SELECTOR, ".up-name").text,
            "play_count": item.find_element(By.CSS_SELECTOR, ".play").text,
            "danmaku": item.find_element(By.CSS_SELECTOR, ".danmaku").text,
            "publish_time": item.find_element(By.CSS_SELECTOR, ".time").text.strip(),
        }
    except Exception as e:
        print(f"Failed to parse an entry: {e}")
        return None
```
Field reference (a combined usage sketch follows this list):
- `.num`: ranking position
- `.title`: video title; its `href` attribute gives the video link
- `.up-name`: uploader (UP主) name
- `.play`: play count
- `.danmaku`: danmaku (bullet comment) count
- `.time`: publish time
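Putting the helpers together, a minimal sketch (assuming the imports, driver, explicit_wait, and parse_video_item defined above) that waits for the list and then parses every entry currently rendered:

```python
driver.get("https://www.bilibili.com/v/popular/rank/all")

# Wait for the list container before querying individual entries
if explicit_wait(driver, "#rank-list", timeout=15):
    items = driver.find_elements(By.CSS_SELECTOR, "#rank-list li.rank-item")
    videos = [v for v in (parse_video_item(i) for i in items) if v]
    print(f"Parsed {len(videos)} entries")
```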
Simulate scrolling to load more content:

```python
import time

def scroll_to_load(driver, scroll_times=5):
    """
    Scroll to the bottom repeatedly to trigger lazy loading.
    :param driver: WebDriver instance
    :param scroll_times: number of scrolls
    """
    for _ in range(scroll_times):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(2)  # give the newly loaded items time to render
```
Notes: the fixed 2-second sleep is the simplest approach; waiting until the number of rendered entries actually increases is more reliable, as sketched below.
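A hedged sketch of that alternative, scrolling until the item count grows or a timeout is hit:

```python
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait

def scroll_until_count_grows(driver, scroll_times=5, timeout=10):
    """Scroll and wait until more li.rank-item elements appear (or the timeout hits)."""
    selector = "#rank-list li.rank-item"
    for _ in range(scroll_times):
        before = len(driver.find_elements(By.CSS_SELECTOR, selector))
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        try:
            WebDriverWait(driver, timeout).until(
                lambda d: len(d.find_elements(By.CSS_SELECTOR, selector)) > before
            )
        except TimeoutException:
            break  # no new items appeared; assume the list is fully loaded
```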
Data storage with pandas:

```python
import pandas as pd

def save_to_csv(data_list, filename="bilibili_rank.csv"):
    """
    Save the parsed data to a CSV file.
    :param data_list: list of parsed records
    :param filename: output file name
    """
    if not data_list:
        print("No data to save")
        return
    # Convert to a DataFrame
    df = pd.DataFrame(data_list)
    # Data cleaning (assumes counts are reported as 'xx万次观看' / 'xx条弹幕')
    df['play_count'] = df['play_count'].str.replace('万次观看', '').astype(float) * 10000
    df['danmaku'] = df['danmaku'].str.replace('条弹幕', '').astype(int)
    # Write the file
    df.to_csv(filename, index=False, encoding='utf-8-sig')
    print(f"Data saved to: {filename}")
```
Optimization points: writing the file with `utf-8-sig` encoding prevents garbled Chinese characters when the CSV is opened in Excel. The suffix stripping above assumes every count ends with '万次观看' / '条弹幕'; a more defensive cleaning helper is sketched below.
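A hedged sketch of such a helper; the exact on-page suffixes are assumptions, so adjust them to whatever the page actually shows:

```python
import re

def parse_count(text):
    """Convert strings like '123.4万次观看' or '9876' to an integer; '万' means 10,000."""
    text = re.sub(r"[^\d.万]", "", str(text))  # strip labels such as '次观看' / '条弹幕'
    if "万" in text:
        return int(float(text.replace("万", "")) * 10000)
    return int(float(text)) if text else 0

print(parse_count("123.4万次观看"))  # 1234000
print(parse_count("9876条弹幕"))     # 9876
```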
Anti-detection setup for the browser driver:

```python
from selenium.webdriver.chrome.options import Options

def init_driver_with_anti_anti():
    """
    Initialise a browser driver with basic anti-detection options.
    """
    chrome_options = Options()
    chrome_options.add_argument("--disable-blink-features=AutomationControlled")  # hide the automation flag
    chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36")
    # Random User-Agent (requires the fake_useragent package)
    # from fake_useragent import UserAgent
    # ua = UserAgent()
    # chrome_options.add_argument(f"user-agent={ua.random}")
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)
    # Close the cookie/announcement banner if it appears (run this after driver.get(...))
    try:
        close_btn = explicit_wait(driver, ".banner-link.international-home")
        if close_btn:
            close_btn.click()
    except Exception:
        pass
    return driver
```
Advanced options (a combined sketch follows this list):
- Proxy rotation: `chrome_options.add_argument(f'--proxy-server={proxy}')`
- Randomized delays between actions: `time.sleep(random.uniform(2, 5))`
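A hedged sketch that folds these options into the driver setup; the proxy address is a placeholder assumption, so omit that line if no proxy is available:

```python
import random
import time
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

proxy = "http://127.0.0.1:8080"  # placeholder proxy address
chrome_options = Options()
chrome_options.add_argument("--disable-blink-features=AutomationControlled")
chrome_options.add_argument(f"--proxy-server={proxy}")

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)
driver.get("https://www.bilibili.com/v/popular/rank/all")
time.sleep(random.uniform(2, 5))  # randomized pause before the next action
driver.quit()
```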
The complete spider, combining all of the steps above:

```python
import time
import random
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd


class BilibiliRankSpider:
    def __init__(self):
        self.driver = self.init_driver()
        self.base_url = "https://www.bilibili.com/v/popular/rank/all"
        self.data_list = []

    def init_driver(self):
        """Initialise the browser driver"""
        chrome_options = webdriver.ChromeOptions()
        chrome_options.add_argument("--disable-blink-features=AutomationControlled")
        chrome_options.add_argument("--start-maximized")
        driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)
        return driver

    def handle_login_popup(self):
        """Dismiss the login popup if it appears"""
        try:
            close_btn = WebDriverWait(self.driver, 5).until(
                EC.element_to_be_clickable((By.CSS_SELECTOR, ".banner-link.international-home"))
            )
            close_btn.click()
        except Exception:
            pass

    def scroll_load(self, times=3):
        """Scroll to load more content"""
        for _ in range(times):
            self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(random.uniform(1, 3))  # randomized wait

    def parse_data(self):
        """Parse the entries currently rendered on the page"""
        items = self.driver.find_elements(By.CSS_SELECTOR, "#rank-list li.rank-item")
        for item in items:
            try:
                data = {
                    "rank": item.find_element(By.CSS_SELECTOR, ".num").text,
                    "title": item.find_element(By.CSS_SELECTOR, ".title").text,
                    "link": item.find_element(By.CSS_SELECTOR, ".title").get_attribute("href"),
                    "author": item.find_element(By.CSS_SELECTOR, ".up-name").text,
                    "play_count": item.find_element(By.CSS_SELECTOR, ".play").text,
                    "danmaku": item.find_element(By.CSS_SELECTOR, ".danmaku").text,
                    "publish_time": item.find_element(By.CSS_SELECTOR, ".time").text.strip(),
                }
                self.data_list.append(data)
            except Exception as e:
                print(f"Failed to parse an entry: {e}")

    def save_data(self, filename="bilibili_rank.csv"):
        """Save the collected data to CSV"""
        if not self.data_list:
            print("No data collected")
            return
        df = pd.DataFrame(self.data_list)
        # parse_data() runs more than once, so drop entries that were parsed twice
        df = df.drop_duplicates(subset="link")
        df['play_count'] = df['play_count'].str.replace('万次观看', '').astype(float) * 10000
        df['danmaku'] = df['danmaku'].str.replace('条弹幕', '').astype(int)
        df.to_csv(filename, index=False, encoding='utf-8-sig')
        print(f"Saved {len(df)} records to {filename}")

    def run(self):
        """Main workflow"""
        try:
            self.driver.get(self.base_url)
            self.handle_login_popup()
            # Explicitly wait for the main list container
            WebDriverWait(self.driver, 10).until(
                EC.presence_of_element_located((By.ID, "rank-list"))
            )
            # Initial parse
            self.parse_data()
            # Scroll to load more, then parse again
            self.scroll_load(times=2)
            self.parse_data()
            # Persist the results
            self.save_data()
        except Exception as e:
            print(f"Spider run failed: {e}")
        finally:
            self.driver.quit()


if __name__ == "__main__":
    spider = BilibiliRankSpider()
    spider.run()
```
Retry on failure: the tenacity package can re-run a flaky step automatically:

```python
from tenacity import retry, stop_after_attempt, wait_random

@retry(stop=stop_after_attempt(3), wait=wait_random(min=1, max=3))
def robust_parse(self):
    # parsing logic; re-run up to 3 times with a random 1-3 s pause between attempts
    ...
```
Further hardening:
- Distributed crawling: split the work across multiple processes or machines.
- Dynamic proxy pool: fetch a fresh proxy from a proxy API before starting the driver:

```python
import requests

def get_proxy():
    # Fetch an available proxy from a proxy API (placeholder URL)
    resp = requests.get("http://proxy-api.com/get")
    return resp.json()['proxy']

chrome_options.add_argument(f'--proxy-server={get_proxy()}')
```
Locator tips: in DevTools you can right-click an element and choose "Copy selector" to get a CSS selector directly; an equivalent XPath locator is `//div[contains(@class,'rank-item')]//a[@class='title']`.
If only part of the list is captured, wait for an element further down the list before parsing:

```python
# Wait for a key element near the end of the list first
WebDriverWait(driver, 20).until(
    EC.presence_of_element_located((By.CSS_SELECTOR, ".rank-item:nth-child(20)"))
)
```
If logged-in data is needed, inject a session cookie (the full sequence is sketched below):

```python
driver.add_cookie({'name': 'SESSDATA', 'value': 'your_cookie_value'})
```
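Selenium only accepts add_cookie for the domain that is currently open, so a hedged sketch of the sequence (the cookie value is a placeholder you must copy from a logged-in browser yourself):

```python
# 1. Open the target domain first; add_cookie only works for the current domain
driver.get("https://www.bilibili.com")

# 2. Inject the login cookie (placeholder value)
driver.add_cookie({"name": "SESSDATA", "value": "your_cookie_value"})

# 3. Reload so the page is rendered with the logged-in session
driver.get("https://www.bilibili.com/v/popular/rank/all")
```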
Working through the Bilibili ranking page, this article covered the full workflow: environment setup, page-structure analysis, explicit waits for dynamically loaded content, data parsing and scroll loading, CSV storage with pandas, and basic anti-anti-crawling measures.

Further directions: automatic retries, distributed crawling, and dynamic proxy pools, as outlined above.

Notes: keep the crawl rate modest and use the collected data for learning and analysis only.