In today's era of information overload, web data has become the oil of the digital economy. Thanks to its concise syntax and rich ecosystem, Python is the language of choice for crawler development. This article walks through the full Python web-scraping stack, with more than twenty hands-on code examples to take you from beginner to proficient.
# Create a virtual environment with venv (Windows)
python -m venv myenv
myenv\Scripts\activate
# Install the core libraries
pip install requests beautifulsoup4 pandas
Environment notes: the commands above are for Windows; on macOS/Linux activate the environment with source myenv/bin/activate instead.
import requests

response = requests.get('https://api.example.com/data')
print(f"Status code: {response.status_code}")                    # 200 means success
print(f"Response headers: {response.headers['Content-Type']}")   # content type of the body
print(f"Cookies: {response.cookies}")                            # session state
HTTP essentials: check the status code before parsing, inspect Content-Type to decide how to decode the body, and carry cookies forward when a session must be maintained.
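A minimal sketch of these essentials in practice, using a Session for connection and cookie reuse, an explicit timeout, and error handling (the endpoint is the same placeholder as above):

import requests

session = requests.Session()                          # reuses TCP connections and cookies
session.headers.update({'User-Agent': 'Mozilla/5.0'})

try:
    resp = session.get('https://api.example.com/data', timeout=10)
    resp.raise_for_status()                           # raise on 4xx/5xx status codes
    body = resp.json() if 'json' in resp.headers.get('Content-Type', '') else resp.text
    print(body)
except requests.RequestException as exc:
    print(f"Request failed: {exc}")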
import requests
from bs4 import BeautifulSoup

url = 'https://books.toscrape.com/'
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')

books = []
for book in soup.select('article.product_pod'):
    title = book.h3.a['title']
    price = book.select_one('p.price_color').text
    books.append({'title': title, 'price': price})

print(f"Scraped {len(books)} books")
Code walkthrough: soup.select() matches every article.product_pod card, the title comes from the title attribute of the link inside h3, and the price is the text of p.price_color.
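As a follow-up sketch, the same selectors can walk the whole catalogue and persist the result with pandas; the entry URL and the pager selector li.next a are assumptions about the site's markup, not something the snippet above guarantees:

from urllib.parse import urljoin

import requests
import pandas as pd
from bs4 import BeautifulSoup

url = 'https://books.toscrape.com/catalogue/page-1.html'  # assumed entry point
books = []
while url:
    soup = BeautifulSoup(requests.get(url).text, 'html.parser')
    for book in soup.select('article.product_pod'):
        books.append({'title': book.h3.a['title'],
                      'price': book.select_one('p.price_color').text})
    next_link = soup.select_one('li.next a')              # assumed pager markup
    url = urljoin(url, next_link['href']) if next_link else None

pd.DataFrame(books).to_csv('books.csv', index=False)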
# Asynchronous requests example
import aiohttp
import asyncio

async def fetch(session, url):
    async with session.get(url) as response:
        return await response.text()

async def main(urls):
    async with aiohttp.ClientSession() as session:
        tasks = [fetch(session, url) for url in urls]
        return await asyncio.gather(*tasks)

# Fetch 10 pages concurrently
urls = [f'https://example.com/page/{i}' for i in range(1, 11)]
results = asyncio.run(main(urls))
Performance comparison: the coroutines fire all ten requests concurrently, so total wall-clock time is roughly that of the slowest response rather than the sum of all of them.
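A rough way to measure the difference yourself, reusing main() and urls from the snippet above (the numbers depend entirely on network conditions):

import time
import asyncio
import requests

# Sequential baseline
start = time.perf_counter()
for url in urls:
    requests.get(url, timeout=10)
print(f"Sequential: {time.perf_counter() - start:.2f}s")

# Concurrent version
start = time.perf_counter()
asyncio.run(main(urls))
print(f"Async: {time.perf_counter() - start:.2f}s")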
from lxml import etree

html = """
<h3 data-id="1001">Python Programming</h3>
<p class="price">¥59.00</p>
"""
tree = etree.HTML(html)
title = tree.xpath('//h3[@data-id="1001"]/text()')[0]
price = tree.xpath('//p[@class="price"]/text()')[0]
print(f"Title: {title}, Price: {price}")
XPath advantages: a single compact expression can address attributes, text nodes, and positional or fuzzy matches, which CSS selectors cannot always express.
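Two further expressions on the same tree illustrate the point, one pulling an attribute value and the other using a fuzzy class match:

book_id = tree.xpath('//h3/@data-id')[0]                        # attribute value -> '1001'
prices = tree.xpath('//p[contains(@class, "price")]/text()')    # fuzzy class match
print(book_id, prices)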
# MongoDB storage example
from pymongo import MongoClient

client = MongoClient('mongodb://localhost:27017/')
db = client['crawler_db']
collection = db['products']

data = {'title': 'Python book', 'price': 59.0}
result = collection.insert_one(data)
print(f"Inserted ID: {result.inserted_id}")
Database selection: schemaless stores such as MongoDB suit loosely structured scraped documents, while relational databases fit well-defined tabular records.
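For small, well-structured datasets the standard-library sqlite3 module is often enough; a minimal sketch of the relational alternative:

import sqlite3

conn = sqlite3.connect('crawler.db')
conn.execute('CREATE TABLE IF NOT EXISTS products (title TEXT, price REAL)')
conn.execute('INSERT INTO products VALUES (?, ?)', ('Python book', 59.0))
conn.commit()
conn.close()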
# Proxy usage example
proxies = {
    'http': 'http://10.10.1.10:3128',
    'https': 'http://10.10.1.10:1080',
}
response = requests.get('http://example.com',
                        proxies=proxies,
                        headers={'User-Agent': 'Mozilla/5.0'})
Anti-bot countermeasures: rotate proxies and User-Agent strings, and throttle request frequency so the traffic looks less like an automated crawler.
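A minimal sketch of two of these tactics, a randomized User-Agent plus jittered delays, wrapped around requests (the User-Agent strings are illustrative placeholders):

import random
import time
import requests

USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15',
]

def polite_get(url, proxies=None):
    """Fetch a URL with a random User-Agent and a short random pause."""
    headers = {'User-Agent': random.choice(USER_AGENTS)}
    resp = requests.get(url, headers=headers, proxies=proxies, timeout=10)
    time.sleep(random.uniform(1, 3))  # jittered delay between requests
    return resp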
# Playwright example
from playwright.sync_api import sync_playwright

with sync_playwright() as p:
    browser = p.chromium.launch(headless=False)
    page = browser.new_page()
    page.goto('https://dynamic-site.com')
    # Click "load more"
    page.click('button.load-more')
    # Grab the dynamically rendered content
    content = page.inner_html('.results')
    print(content)
    browser.close()
Comparison: unlike plain requests, a real browser executes JavaScript, so content that only appears after rendering or user interaction (infinite scroll, "load more" buttons) becomes scrapable, at the cost of speed and resources.
import time
import requests
from bs4 import BeautifulSoup
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime

def fetch_amazon_price(url):
    """Scrape the price from an Amazon product page."""
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
        'Accept-Language': 'en-US,en;q=0.9'
    }
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'lxml')
    # Parse the price, which Amazon splits into whole and fractional parts
    price_whole = soup.select_one('span.a-price-whole').text.strip('.').replace(',', '')
    price_fraction = soup.select_one('span.a-price-fraction').text
    return float(f"{price_whole}.{price_fraction}")

# Price history records
price_history = []

# Simple polling loop (one run every 6 hours)
while True:
    current_price = fetch_amazon_price('https://www.amazon.com/dp/B08N5WRWNW')
    price_history.append({
        'timestamp': datetime.now().isoformat(),
        'price': current_price
    })

    # Persist to CSV
    df = pd.DataFrame(price_history)
    df.to_csv('price_history.csv', index=False)

    # Plot the price trend
    plt.figure(figsize=(10, 6))
    plt.plot(pd.to_datetime(df['timestamp']), df['price'], marker='o')
    plt.title('Amazon Product Price Trend')
    plt.xlabel('Date')
    plt.ylabel('Price ($)')
    plt.grid(True)
    plt.savefig('price_trend.png')

    time.sleep(6 * 60 * 60)  # 6-hour interval
Code walkthrough: the custom User-Agent and Accept-Language headers help get past basic anti-bot checks, and time.sleep provides bare-bones scheduling; for production use, APScheduler is the recommended scheduler (a sketch follows). The same loop extends readily to tracking multiple products or other notification channels.
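A hedged sketch of that APScheduler variant, reusing fetch_amazon_price() from above in place of the while/sleep loop:

from datetime import datetime
from apscheduler.schedulers.blocking import BlockingScheduler

scheduler = BlockingScheduler()

@scheduler.scheduled_job('interval', hours=6)
def track_price():
    """One tracking run every six hours."""
    price = fetch_amazon_price('https://www.amazon.com/dp/B08N5WRWNW')
    print(f"{datetime.now().isoformat()} -> {price}")
    # persist to CSV and re-plot here, as in the loop above

scheduler.start()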
import pandas as pd
import numpy as np

# Load the raw scraped data
df = pd.read_csv('raw_data.csv')

# Data-cleaning pipeline
def clean_data(df):
    # Normalize the price format ("$1,234.00" -> 1234.0) before doing any math
    df['price'] = df['price'].replace(r'[\$,]', '', regex=True).astype(float)
    # Fill missing prices with the median
    df['price'] = df['price'].fillna(df['price'].median())
    # Drop duplicate records, keeping the most recent one
    df.drop_duplicates(subset=['product_id'], keep='last', inplace=True)
    # Add a derived feature
    df['price_per_ounce'] = df['price'] / df['weight_oz']
    # Filter out outliers
    df = df[(df['price'] > 0) & (df['price'] < 1000)]
    return df

# Apply the cleaning pipeline
cleaned_df = clean_data(df)
cleaned_df.to_csv('cleaned_data.csv', index=False)
Key steps: normalize formats first, then handle missing values, deduplicate, derive features, and finally filter outliers.
import os
from io import BytesIO

import requests
from PIL import Image

def download_images(keyword, num_images=10):
    """Download images from Bing Image Search."""
    subscription_key = "YOUR_BING_API_KEY"
    search_url = "https://api.bing.microsoft.com/v7.0/images/search"
    headers = {"Ocp-Apim-Subscription-Key": subscription_key}
    params = {"q": keyword, "count": num_images}

    response = requests.get(search_url, headers=headers, params=params)
    search_results = response.json()

    # Create the target directory
    save_dir = f"images/{keyword.replace(' ', '_')}"
    os.makedirs(save_dir, exist_ok=True)

    # Download and save each image
    for idx, result in enumerate(search_results['value']):
        try:
            img_data = requests.get(result['contentUrl'], timeout=10).content
            img = Image.open(BytesIO(img_data)).convert('RGB')  # ensure JPEG-compatible mode
            img.save(f"{save_dir}/image_{idx}.jpg")
        except Exception as e:
            print(f"Download failed: {result['contentUrl']} - {e}")

# Download images of sunflowers
download_images("sunflower", num_images=20)
Technical notes: the Bing Image Search API requires a subscription key, the count parameter controls how many results come back, and each download is wrapped in try/except so one broken URL does not abort the batch. Machine-learning use: the downloaded images can seed a labelled training set, as sketched below.
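A small sketch of turning the downloaded folder into model-ready input (the 224x224 target size is just a common CNN default, not something the article prescribes):

from pathlib import Path
from PIL import Image

def build_dataset(src_dir, dst_dir, size=(224, 224)):
    """Normalize downloaded images to a uniform size for training."""
    Path(dst_dir).mkdir(parents=True, exist_ok=True)
    for img_path in Path(src_dir).glob('*.jpg'):
        img = Image.open(img_path).convert('RGB')  # drop alpha/palette modes
        img.resize(size).save(Path(dst_dir) / img_path.name)

build_dataset('images/sunflower', 'dataset/sunflower')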
from datetime import datetime
from queue import Queue
import threading

from flask import Flask, jsonify, request

app = Flask(__name__)
task_queue = Queue()
results = {}

def crawler_worker():
    """Background crawler worker thread."""
    while True:
        task_id, url = task_queue.get()
        try:
            # Actual crawling logic goes here
            data = {"status": "success", "data": f"data scraped from {url}"}
            results[task_id] = data
        except Exception as e:
            results[task_id] = {"status": "error", "message": str(e)}
        task_queue.task_done()

# Start the worker thread
threading.Thread(target=crawler_worker, daemon=True).start()

@app.route('/api/crawl', methods=['POST'])
def start_crawl():
    """Submit a crawl task."""
    data = request.json
    task_id = datetime.now().strftime("%Y%m%d%H%M%S")
    task_queue.put((task_id, data['url']))
    return jsonify({"task_id": task_id}), 202

@app.route('/api/results/<task_id>')
def get_result(task_id):
    """Fetch the result of a task."""
    result = results.get(task_id, {})
    return jsonify(result)

if __name__ == '__main__':
    app.run(threaded=True, port=5000)
Architecture notes: a Queue implements the producer-consumer pattern, POST /api/crawl submits a crawl task, and GET /api/results/<task_id> returns its result. Example calls:
# Submit a task
curl -X POST -H "Content-Type: application/json" -d '{"url":"https://example.com"}' http://localhost:5000/api/crawl
# Query the result
curl http://localhost:5000/api/results/20230801123045
import smtplib
import requests
from email.mime.text import MIMEText
from apscheduler.schedulers.blocking import BlockingScheduler

def check_website(url):
    """Check whether a site is reachable."""
    try:
        response = requests.get(url, timeout=10)
        return response.status_code == 200
    except requests.RequestException:
        return False

def send_alert(email, url):
    """Send an alert e-mail."""
    msg = MIMEText(f"Website {url} is unreachable!")
    msg['Subject'] = 'Website monitoring alert'
    msg['From'] = '[email protected]'
    msg['To'] = email
    with smtplib.SMTP('smtp.example.com', 587) as server:
        server.starttls()  # port 587 expects STARTTLS before login
        server.login('user', 'password')
        server.send_message(msg)

# Monitoring configuration
MONITOR_LIST = [
    {'url': 'https://example.com', 'email': '[email protected]'},
    {'url': 'https://api.example.com', 'email': '[email protected]'}
]

def monitoring_job():
    """Run one round of checks."""
    for site in MONITOR_LIST:
        if not check_website(site['url']):
            send_alert(site['email'], site['url'])

# Create the scheduler
scheduler = BlockingScheduler()
scheduler.add_job(monitoring_job, 'interval', minutes=5)  # check every 5 minutes
scheduler.start()
System features: periodic availability checks with e-mail alerts when a site becomes unreachable; the check interval and monitored list are plain configuration. The same skeleton extends naturally to richer checks (latency, page content) and other alert channels.
# Redis task-queue producer
import json
import uuid

import redis

r = redis.Redis(host='redis-host', port=6379, db=0)

def submit_crawl_task(url):
    """Push a task onto the distributed queue."""
    task_id = str(uuid.uuid4())
    task_data = {
        'id': task_id,
        'url': url,
        'retry': 3,
        'priority': 'high'
    }
    r.lpush('crawl_queue', json.dumps(task_data))
    return task_id

# Celery distributed task consumer
from celery import Celery

app = Celery('crawler', broker='redis://redis-host:6379/0')

@app.task
def process_crawl_task(task_json):
    """Process one crawl task."""
    task_data = json.loads(task_json)
    try:
        # Actual crawling logic (fetch_data/store_to_db are project-specific helpers)
        result = fetch_data(task_data['url'])
        store_to_db(result)
        return {"status": "success"}
    except Exception as e:
        if task_data['retry'] > 0:
            raise process_crawl_task.retry(args=[task_json], countdown=60)
        return {"status": "failed", "error": str(e)}
Architecture components: Redis serves as the shared task queue and the Celery message broker, and any number of Celery workers act as consumers. The advantage is horizontal scalability: adding workers raises throughput without touching the producer, and failed tasks are retried automatically. A sketch of the glue between the Redis list and Celery follows.
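The snippet below is a hedged sketch of that glue: a small dispatcher that blocks on the Redis list and hands each task to the Celery task defined above (queue name and hosts as in the earlier snippets):

import redis

r = redis.Redis(host='redis-host', port=6379, db=0)

def dispatch_loop():
    """Pop tasks from the Redis list and enqueue them for Celery workers."""
    while True:
        _, task_json = r.brpop('crawl_queue')          # blocking pop
        process_crawl_task.delay(task_json.decode())   # hand off to Celery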
from urllib.robotparser import RobotFileParser
from urllib.parse import urlparse

def check_robots_permission(url, user_agent='*'):
    """Check whether robots.txt allows fetching this URL."""
    parsed = urlparse(url)
    base_url = f"{parsed.scheme}://{parsed.netloc}"
    rp = RobotFileParser()
    rp.set_url(f"{base_url}/robots.txt")
    rp.read()
    return rp.can_fetch(user_agent, parsed.path)

# Example check
url = 'https://example.com/admin/page.html'
if check_robots_permission(url, 'MyCrawler'):
    print("Access allowed")
else:
    print("Access disallowed")
Compliance notes: always consult robots.txt before crawling, respect its Disallow rules, and honor any Crawl-delay directive; a small sketch follows.
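A minimal sketch of honoring both rules with the same parser (the paths and user agent are illustrative):

import time
from urllib.robotparser import RobotFileParser

rp = RobotFileParser()
rp.set_url('https://example.com/robots.txt')
rp.read()

delay = rp.crawl_delay('MyCrawler') or 1  # fall back to 1 second if unspecified
for path in ['/catalog', '/news']:
    if rp.can_fetch('MyCrawler', path):
        # fetch https://example.com{path} here ...
        time.sleep(delay)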
AI-assisted parsing: large language models can generate extraction rules directly from raw HTML, as in the GPT-4 example below.

from openai import OpenAI

client = OpenAI(api_key="sk-...")

def generate_xpath(html_sample, target_element):
    """Ask GPT-4 to generate an XPath expression."""
    prompt = f"""Given the following HTML fragment, produce an XPath that extracts '{target_element}':
{html_sample[:2000]}
Return only the XPath expression, with no explanation."""
    response = client.chat.completions.create(
        model="gpt-4",
        messages=[{"role": "user", "content": prompt}]
    )
    return response.choices[0].message.content

# Usage example
html_sample = "<div class='product'><h3>Python Book</h3></div>"
xpath = generate_xpath(html_sample, "book title")
print(f"Generated XPath: {xpath}")  # e.g. //div[@class='product']/h3/text()
Technical significance: an LLM can infer selectors from raw HTML, which makes scrapers far more resilient to page-layout changes.
# AWS Lambda function example
import json

import boto3
from scraping_lib import scrape_website  # project-specific scraping helper

def lambda_handler(event, context):
    s3 = boto3.client('s3')

    # Run the crawl
    data = scrape_website('https://news.example.com/latest')

    # Store the raw result in S3
    s3.put_object(
        Bucket='news-data-bucket',
        Key=f'raw/{context.aws_request_id}.json',
        Body=json.dumps(data)
    )

    # Trigger the downstream processing pipeline
    lambda_client = boto3.client('lambda')
    lambda_client.invoke(
        FunctionName='data-processing',
        InvocationType='Event'
    )
    return {'statusCode': 200}
Architectural advantages: serverless functions scale on demand, incur cost only while running, and chain naturally into event-driven processing pipelines.
Python web scraping has grown from a simple data-collection tool into a comprehensive discipline that blends network programming, distributed computing, and artificial intelligence, and it demands a correspondingly broad skill set from developers.
As AI becomes more deeply integrated, future crawlers will learn and adapt on their own, coping intelligently with site redesigns and tuning their own scraping strategies. Meanwhile, privacy-preserving computation and federated learning are opening new paths to compliant data use. In this data-driven era, mastering web scraping opens doors to data science, business intelligence, and many other fields.
Note: all code in this article was tested on Python 3.8+.