This post improves on the earlier blog post that used a crawler to fetch Douban movie information. The complete code is collected at the end of the article; grab it from there if that is all you need.
import requests                  # send HTTP requests
from bs4 import BeautifulSoup    # parse HTML pages
import time                      # delays between requests
import random                    # random numbers and random choices
import re                        # regular expressions, used to extract the reviewer count
import os                        # file and directory operations
from datetime import datetime    # timestamps
from itertools import cycle      # rotate through the proxies in turn
This block imports every standard-library and third-party module the script needs. cycle() builds an iterator that loops forever, so the proxies are handed out in rotation automatically.
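A quick sketch of how cycle() behaves (the two addresses below are made-up placeholders):
from itertools import cycle

pool = cycle(["http://proxy-a:8080", "http://proxy-b:8080"])  # placeholder proxies
print(next(pool))  # http://proxy-a:8080
print(next(pool))  # http://proxy-b:8080
print(next(pool))  # http://proxy-a:8080 again -- the iterator never runs out
This is exactly the pattern the scraper uses to hand a fresh proxy to every request attempt. In the script itself, the pool is built from the proxy list below.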
# Proxy IP list (placeholders; update regularly)
PROXIES = [
    "http://123.456.789.1:8080",
    "http://234.567.890.2:8080",
    "http://345.678.901.3:8080"
]
proxy_pool = cycle(PROXIES)
PROXIES stores the proxy IP addresses, and proxy_pool = cycle(PROXIES) turns them into a proxy pool that can be cycled through indefinitely.
# User-Agent rotation pool
USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36'
]
USER_AGENTS exists to dodge anti-scraping checks: by constantly switching the UA string, the requests look to Douban's servers as if they come from different users.
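For illustration, a small sketch of the rotation idea (the short strings here are stand-ins for the real browser UA strings above):
import random

ua_samples = ['UA-Windows', 'UA-macOS', 'UA-Linux']  # stand-ins for real UA strings
for _ in range(3):
    # every request builds its headers with a freshly chosen UA
    print({'User-Agent': random.choice(ua_samples)})
With those two pools in place, the main scraping function follows.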
def get_douban_movies(page=1, max_retries=3):
Scrapes the given page of the Douban Top 250 list; each page holds 25 movies.
url = f'https://movie.douban.com/top250?start={(page - 1) * 25}'
This builds the paginated URL of the Douban Top 250 list.
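The arithmetic is straightforward: 25 movies per page, so page n starts at offset (n - 1) * 25. A tiny check:
for page in (1, 2, 3):
    print(page, f'https://movie.douban.com/top250?start={(page - 1) * 25}')
# 1 https://movie.douban.com/top250?start=0
# 2 https://movie.douban.com/top250?start=25
# 3 https://movie.douban.com/top250?start=50
Next, the request headers are assembled: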
headers = {
    'User-Agent': random.choice(USER_AGENTS),
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
    'Referer': 'https://movie.douban.com/'
}
A UA is picked at random for every request to make anti-scraping detection harder.
for attempt in range(max_retries):
    proxy = next(proxy_pool)
    proxies = {"http": proxy, "https": proxy}
The proxies are rotated, and the request is attempted at most max_retries times (3 by default).
response = requests.get(url, headers=headers, proxies=proxies, timeout=10)
response.raise_for_status()
Requests the Douban Top 250 page; if the response comes back with an error status code (4xx/5xx), raise_for_status() throws an exception immediately.
soup = BeautifulSoup(response.text, 'html.parser')
movie_list = soup.select('div.item')
BeautifulSoup parses the HTML page, and div.item selects every movie card block.
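To see what select() and select_one() do, here is a self-contained sketch with made-up markup that mimics the structure of a Top 250 card:
from bs4 import BeautifulSoup

html = '''
<div class="item"><span class="title">肖申克的救赎</span><span class="rating_num">9.7</span></div>
<div class="item"><span class="title">霸王别姬</span><span class="rating_num">9.6</span></div>
'''  # made-up HTML, only for illustration
soup = BeautifulSoup(html, 'html.parser')
for card in soup.select('div.item'):            # one block per movie
    print(card.select_one('span.title').text,   # first matching element inside the card
          card.select_one('span.rating_num').text)
The real scraper does the same thing against the live page, extracting several fields per card: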
for movie in movie_list:
    title = movie.select_one('span.title').text if movie.select_one('span.title') else "未知标题"
    rating = movie.select_one('span.rating_num').text if movie.select_one('span.rating_num') else "暂无评分"
    people_text = movie.select_one('div.star').text if movie.select_one('div.star') else ""
    people_match = re.search(r'(\d+)人评价', people_text)
    people = people_match.group(1) if people_match else "未知"
    quote = movie.select_one('span.inq').text if movie.select_one('span.inq') else "暂无简介"
    movies.append({'标题': title, '评分': rating, '评价人数': people, '简介': quote})
For each movie card the following fields are extracted:
title: the movie title
rating: the movie's rating
people: the number of reviewers (pulled out with a regular expression)
quote: the one-line blurb
Each record is then packed into a dictionary and appended to the movies list.
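The reviewer count lives inside the text of the div.star block, so the regex simply grabs the digits right before "人评价". A small sketch with made-up text:
import re

people_text = "9.7 2964982人评价"   # made-up example of the text inside div.star
people_match = re.search(r'(\d+)人评价', people_text)
print(people_match.group(1) if people_match else "未知")   # -> 2964982
If anything goes wrong during the request, the except branch below takes over: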
except requests.RequestException as e:
    print(f"[警告] 第 {attempt + 1} 次尝试失败 ({proxy}),错误: {e}")
    time.sleep(random.uniform(2, 5))
If the request raises an exception, a warning is printed and the script waits 2 to 5 seconds before retrying. After max_retries failed attempts, the page is given up.
def save_to_file(movies, save_dir="E:/information"):
Writes all the scraped movie information to a .txt file.
os.makedirs(save_dir, exist_ok=True)
Creates the target directory automatically if it does not exist yet.
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
file_path = os.path.join(save_dir, f"豆瓣电影Top250_{timestamp}.txt")
Every run produces its own file named after the current time, so earlier results are never overwritten.
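A quick sketch of the resulting file name (the exact timestamp obviously changes every run):
from datetime import datetime
import os

save_dir = "E:/information"                        # same default directory as above
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
print(os.path.join(save_dir, f"豆瓣电影Top250_{timestamp}.txt"))
# prints something like 豆瓣电影Top250_20250101_120000.txt under save_dir
Finally, main() ties everything together: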
def main():
    print("开始爬取豆瓣电影 Top 250...")
    all_movies = []
    max_pages = 2
    for page in range(1, max_pages + 1):
max_pages is set to 2, so only the first two pages (50 movies) are scraped; change max_pages to adjust how many movies are fetched.
delay = random.uniform(2, 5)
print(f"等待 {delay:.2f} 秒后继续...")
time.sleep(delay)
A random wait is inserted between pages to mimic the access pattern of a normal user.
if __name__ == '__main__':
    main()
main() runs only when this Python file is executed directly, not when it is imported as a module.
2. Full Program Code
import requests
from bs4 import BeautifulSoup
import time
import random
import re
import os
from datetime import datetime
from itertools import cycle

# Proxy IP list (placeholders; update regularly)
PROXIES = [
    "http://123.456.789.1:8080",
    "http://234.567.890.2:8080",
    "http://345.678.901.3:8080"
]
proxy_pool = cycle(PROXIES)

# User-Agent rotation pool
USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36'
]

def get_douban_movies(page=1, max_retries=3):
    """Fetch one page of movie info from the Douban Top 250."""
    url = f'https://movie.douban.com/top250?start={(page - 1) * 25}'
    headers = {
        'User-Agent': random.choice(USER_AGENTS),
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
        'Referer': 'https://movie.douban.com/'
    }
    for attempt in range(max_retries):
        proxy = next(proxy_pool)
        proxies = {"http": proxy, "https": proxy}
        try:
            response = requests.get(url, headers=headers, proxies=proxies, timeout=10)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')
            movie_list = soup.select('div.item')
            movies = []
            for movie in movie_list:
                title = movie.select_one('span.title').text if movie.select_one('span.title') else "未知标题"
                rating = movie.select_one('span.rating_num').text if movie.select_one('span.rating_num') else "暂无评分"
                people_text = movie.select_one('div.star').text if movie.select_one('div.star') else ""
                people_match = re.search(r'(\d+)人评价', people_text)
                people = people_match.group(1) if people_match else "未知"
                quote = movie.select_one('span.inq').text if movie.select_one('span.inq') else "暂无简介"
                movies.append({'标题': title, '评分': rating, '评价人数': people, '简介': quote})
            return movies
        except requests.RequestException as e:
            print(f"[警告] 第 {attempt + 1} 次尝试失败 ({proxy}),错误: {e}")
            time.sleep(random.uniform(2, 5))
    print(f"[错误] 无法获取第 {page} 页数据,放弃。")
    return []

def save_to_file(movies, save_dir="E:/information"):
    """Save the scraped movie info to a timestamped text file."""
    os.makedirs(save_dir, exist_ok=True)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    file_path = os.path.join(save_dir, f"豆瓣电影Top250_{timestamp}.txt")
    try:
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write("豆瓣电影 Top 250 信息\n")
            f.write(f"爬取时间:{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
            f.write("="*50 + "\n\n")
            for i, movie in enumerate(movies, 1):
                f.write(f"No.{i}\n电影:{movie['标题']}\n评分:{movie['评分']}\n评价人数:{movie['评价人数']}\n简介:{movie['简介']}\n")
                f.write("-"*30 + "\n\n")
        print(f"[成功] 电影信息已保存至: {file_path}")
        return True
    except Exception as e:
        print(f"[错误] 文件保存失败: {e}")
        return False

def main():
    print("开始爬取豆瓣电影 Top 250...")
    all_movies = []
    max_pages = 2
    for page in range(1, max_pages + 1):
        print(f"\n正在爬取第 {page} 页...")
        movies = get_douban_movies(page)
        if not movies:
            print(f"[警告] 第 {page} 页数据为空,可能是被反爬限制或网页结构变化。")
            break
        all_movies.extend(movies)
        delay = random.uniform(2, 5)
        print(f"等待 {delay:.2f} 秒后继续...")
        time.sleep(delay)
    if all_movies:
        save_to_file(all_movies)

if __name__ == '__main__':
    main()