- Next: with BeautifulSoup we can look up li.next > a['href'] to find the next-page link, read its address, and join it onto the current URL.
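To see why that selector works, here is a minimal sketch run against a trimmed copy of the pager markup (the HTML string below is simplified for illustration, not the site's exact output):

from bs4 import BeautifulSoup

# Trimmed pager markup, simplified from what the demo site serves
html = """
<ul class="pager">
    <li class="next">
        <a href="/page/2/">Next &rarr;</a>
    </li>
</ul>
"""
soup = BeautifulSoup(html, "lxml")
next_link = soup.select_one("li.next > a")  # the <a> directly inside li.next
print(next_link["href"])  # /page/2/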
Core idea in pseudocode:

while True:
    1. Request the current page's URL
    2. Parse the HTML and extract the content you need
    3. Check whether a next-page link exists
       - If it does, join the new URL and continue the loop
       - If it does not, break out of the loop
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

def scrape_all_quotes(start_url):
    quotes = []
    url = start_url
    while url:
        print(f"Scraping: {url}")
        res = requests.get(url)
        soup = BeautifulSoup(res.text, 'lxml')
        # Extract every quote block on the current page
        for quote_block in soup.find_all("div", class_="quote"):
            quote_text = quote_block.find("span", class_="text").text.strip()
            author = quote_block.find("small", class_="author").text.strip()
            tags = [tag.text for tag in quote_block.find_all("a", class_="tag")]
            quotes.append({
                "quote": quote_text,
                "author": author,
                "tags": tags
            })
        # Look for the next-page link
        next_link = soup.select_one("li.next > a")
        if next_link:
            next_href = next_link['href']
            url = urljoin(url, next_href)  # join into an absolute URL
        else:
            url = None
    return quotes
if __name__ == "__main__":
    all_quotes = scrape_all_quotes("https://quotes.toscrape.com/")
    print(f"Scraped {len(all_quotes)} quotes in total")
    # Print the first 3 as a sample
    for quote in all_quotes[:3]:
        print(f"\n{quote['quote']}\n-- {quote['author']} | tags: {', '.join(quote['tags'])}")
- Modify the existing crawler so it scrapes the quote data from every page.
- Use len() to check how many items were scraped in total.
- Extra challenge: save all the data to a JSON file (using json.dump).
Practice code:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import json

def scrape_all_quotes(start_url):
    quotes = []
    url = start_url
    while url:
        print(f"Scraping: {url}")
        response = requests.get(url)
        soup = BeautifulSoup(response.text, "lxml")
        quote_blocks = soup.find_all("div", class_="quote")
        for block in quote_blocks:
            text = block.find("span", class_="text").text.strip()
            author = block.find("small", class_="author").text.strip()
            tags = [tag.text for tag in block.find_all("a", class_="tag")]
            quotes.append({
                "quote": text,
                "author": author,
                "tags": tags
            })
        # Find the next-page link
        next_link = soup.select_one("li.next > a")
        if next_link:
            next_href = next_link['href']
            url = urljoin(url, next_href)
        else:
            url = None
    return quotes

if __name__ == "__main__":
    start_url = "https://quotes.toscrape.com/"
    all_quotes = scrape_all_quotes(start_url)
    print(f"\nScraped {len(all_quotes)} quotes in total.\n")
    # Save the results to a JSON file
    output_file = "quotes.json"
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(all_quotes, f, ensure_ascii=False, indent=2)
    print(f"Data saved to file: {output_file}")
Run output:
Scraping: https://quotes.toscrape.com/
Scraping: https://quotes.toscrape.com/page/2/
Scraping: https://quotes.toscrape.com/page/3/
Scraping: https://quotes.toscrape.com/page/4/
Scraping: https://quotes.toscrape.com/page/5/
Scraping: https://quotes.toscrape.com/page/6/
Scraping: https://quotes.toscrape.com/page/7/
Scraping: https://quotes.toscrape.com/page/8/
Scraping: https://quotes.toscrape.com/page/9/
Scraping: https://quotes.toscrape.com/page/10/
Scraped 100 quotes in total.
Data saved to file: quotes.json
Contents of quotes.json:
[
{
"quote": "“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”",
"author": "Albert Einstein",
"tags": [
"change",
"deep-thoughts",
"thinking",
"world"
]
},
{
"quote": "“It is our choices, Harry, that show what we truly are, far more than our abilities.”",
"author": "J.K. Rowling",
"tags": [
"abilities",
"choices"
]
},
{
"quote": "“There are only two ways to live your life. One is as though nothing is a miracle. The other is as though everything is a miracle.”",
"author": "Albert Einstein",
"tags": [
"inspirational",
"life",
"live",
"miracle",
"miracles"
]
},
... (95 entries omitted)
{
"quote": "“A person's a person, no matter how small.”",
"author": "Dr. Seuss",
"tags": [
"inspirational"
]
},
{
"quote": "“... a mind needs books as a sword needs a whetstone, if it is to keep its edge.”",
"author": "George R.R. Martin",
"tags": [
"books",
"mind"
]
}
]
urljoin(base_url, relative_path) automatically resolves a relative path into an absolute URL, as the short example below shows.
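For instance, this is how the two common cases behave (urljoin comes from the standard library's urllib.parse):

from urllib.parse import urljoin

# A relative href is resolved against the page it was found on
print(urljoin("https://quotes.toscrape.com/", "/page/2/"))
# https://quotes.toscrape.com/page/2/

# An href that is already absolute is returned unchanged
print(urljoin("https://quotes.toscrape.com/page/2/", "https://example.com/"))
# https://example.com/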
Some sites paginate dynamically with JavaScript. Those need a browser-automation tool such as Selenium or Playwright, which we will cover later; a small preview sketch follows.
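As a preview only, here is a minimal sketch of the same follow-the-next-link loop driven by Playwright. It assumes Playwright is installed (pip install playwright, then playwright install) and targets what I understand to be the JavaScript-rendered variant of the demo site (https://quotes.toscrape.com/js/):

from playwright.sync_api import sync_playwright

with sync_playwright() as p:
    browser = p.chromium.launch()
    page = browser.new_page()
    # Assumption: /js/ is the JavaScript-rendered variant of the demo site
    page.goto("https://quotes.toscrape.com/js/")
    quotes = []
    while True:
        page.wait_for_selector("div.quote")  # wait until JS has rendered the quotes
        for block in page.query_selector_all("div.quote"):
            quotes.append(block.query_selector("span.text").inner_text())
        next_link = page.query_selector("li.next > a")
        if next_link is None:
            break  # no next-page link: we are done
        next_link.click()  # navigate to the next page
    browser.close()
    print(f"Scraped {len(quotes)} quotes")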