The following is a detailed Python web crawler development tutorial covering core principles, implementation techniques, and best practices, organized into chapters for systematic study.
| Type | Characteristics | Typical Use Cases |
|---|---|---|
| General-purpose crawler | Large-scale crawling for search engines | Google/Baidu spiders |
| Focused crawler | Targeted crawling of a specific domain | Vertical-domain data collection |
| Incremental crawler | Fetches only updated content (see the sketch after this table) | News site monitoring |
| Deep web crawler | Handles pages that require login or form submission | Enterprise data collection |
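For example, an incremental crawler only re-processes a page when its content has changed. A minimal sketch of that idea, using a local JSON cache of content hashes (the file name and hashing scheme are assumptions, not part of the tutorial):

# Minimal incremental-crawl sketch: re-fetch a URL only when its content hash changes
import hashlib
import json
import requests

SEEN_FILE = 'seen_hashes.json'   # assumed local cache mapping url -> content hash

def fetch_if_changed(url, seen):
    body = requests.get(url, timeout=10).content
    digest = hashlib.sha256(body).hexdigest()
    if seen.get(url) == digest:
        return None              # unchanged since the last crawl, skip it
    seen[url] = digest
    return body                  # new or updated content, worth parsing

try:
    with open(SEEN_FILE, encoding='utf-8') as f:
        seen = json.load(f)
except FileNotFoundError:
    seen = {}
# ... call fetch_if_changed(url, seen) for each monitored URL ...
with open(SEEN_FILE, 'w', encoding='utf-8') as f:
    json.dump(seen, f)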
# Typical dependency libraries
import requests                 # HTTP requests
from bs4 import BeautifulSoup   # HTML parsing
import scrapy                   # Crawler framework
import selenium                 # Browser automation
import pandas as pd             # Data storage and analysis
# Request method examples
response = requests.get(url, headers=headers, params=params)
response = requests.post(url, data=form_data, cookies=cookies)
# Key response attributes
status_code = response.status_code
content = response.content   # raw bytes
text = response.text         # decoded text
headers = response.headers
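A common pitfall with response.text on Chinese sites is a wrongly guessed encoding; requests lets you override it before decoding. A small sketch (the URL is a placeholder):

# Align the encoding with what the body actually uses before reading response.text
import requests

response = requests.get('https://example.com/gbk-page')   # placeholder URL
response.encoding = response.apparent_encoding            # use the encoding detected from the content
text = response.text                                      # decoded with the corrected encoding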
| Technique | Pros | Cons | Suitable For |
|---|---|---|---|
| Regular expressions | Flexible and fast | Hard to maintain | Extracting simple structures |
| BeautifulSoup | Very easy to use | Moderate performance | Small/medium-scale parsing |
| lxml | Fast parsing | More involved installation | Large-scale data processing |
| XPath | Precise targeting | Steeper learning curve | Complex page structures |
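To make the comparison concrete, here is the same extraction done once with BeautifulSoup and once with lxml/XPath (the HTML snippet is made up for illustration):

# Same extraction with BeautifulSoup (CSS selectors) and lxml (XPath)
from bs4 import BeautifulSoup
from lxml import html

sample = '<ul><li class="item"><a href="/a">First</a></li><li class="item"><a href="/b">Second</a></li></ul>'

# BeautifulSoup: readable and forgiving, slower on large documents
soup = BeautifulSoup(sample, 'lxml')
bs_titles = [a.get_text() for a in soup.select('li.item > a')]

# lxml + XPath: faster, more precise targeting
tree = html.fromstring(sample)
xp_titles = tree.xpath('//li[@class="item"]/a/text()')

print(bs_titles, xp_titles)   # ['First', 'Second'] ['First', 'Second']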
import requests
from bs4 import BeautifulSoup
import csv
def basic_spider(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
        'Accept-Language': 'zh-CN,zh;q=0.9'
    }
    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        # Parse with the faster, more robust lxml parser
        soup = BeautifulSoup(response.content, 'lxml')
        # Example: extract all news headlines
        news_items = []
        for article in soup.select('div.article-list > article'):
            title = article.find('h2', class_='title').get_text(strip=True)
            link = article.find('a')['href']
            timestamp = article.find('time')['datetime']
            news_items.append({
                'title': title,
                'link': link,
                'timestamp': timestamp
            })
        # Save the data
        with open('news.csv', 'w', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=['title', 'link', 'timestamp'])
            writer.writeheader()
            writer.writerows(news_items)
        return news_items
    except requests.exceptions.RequestException as e:
        print(f"Request error: {e}")
    except Exception as e:
        print(f"Parsing error: {e}")
# Keep state across requests with a Session
session = requests.Session()
session.get(login_url)   # pick up the initial cookies (login_url: the site's login page)
# Handling CAPTCHAs
def handle_captcha(image_url):
    # Use an OCR library or a third-party recognition service
    pass
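Form-based login usually means POSTing the credentials with the same Session so that later requests carry the session cookie. A minimal sketch (the endpoint, field names, and success check are assumptions):

# Form login with a persistent Session (endpoint and field names are assumptions)
import requests

session = requests.Session()
session.get('https://example.com/login')                        # pick up CSRF/session cookies first
resp = session.post('https://example.com/login', data={
    'username': 'user',                                         # placeholder credentials
    'password': 'secret',
})
if resp.ok and 'logout' in resp.text.lower():                   # crude logged-in check (assumption)
    profile = session.get('https://example.com/profile')        # subsequent requests reuse the cookies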
# Handling dynamic pages (Selenium example)
from selenium.webdriver import Chrome
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By

options = Options()
options.add_argument('--headless')   # options.headless = True was removed in newer Selenium releases
driver = Chrome(options=options)
driver.get('https://dynamic-website.com')
dynamic_content = driver.find_element(By.CSS_SELECTOR, '.ajax-content').text
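Reading .ajax-content immediately after get() can fail if the AJAX request has not finished yet; an explicit wait is more reliable. A sketch with WebDriverWait, reusing the URL and selector from the example above:

# Wait for the AJAX-loaded element instead of reading it immediately
from selenium.webdriver import Chrome
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

options = Options()
options.add_argument('--headless')
driver = Chrome(options=options)
try:
    driver.get('https://dynamic-website.com')
    element = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, '.ajax-content'))
    )
    print(element.text)
finally:
    driver.quit()   # always release the browser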
# A simple IP proxy pool
import random

proxies = [
    'http://123.45.67.89:8080',
    'http://112.233.44.55:3128'
]

def get_with_proxy(url):
    proxy_addr = random.choice(proxies)
    # Apply the proxy to both HTTP and HTTPS traffic
    return requests.get(url, proxies={'http': proxy_addr, 'https': proxy_addr}, timeout=10)
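Free proxies fail constantly in practice, so a small retry loop that discards dead proxies is usually more robust than a single random pick. A sketch building on the proxies list above:

# Retry with rotating proxies, dropping the ones that fail
import random
import requests

def get_with_rotation(url, proxies, retries=3, timeout=10):
    pool = list(proxies)
    for _ in range(retries):
        if not pool:
            break
        proxy = random.choice(pool)
        try:
            return requests.get(url, proxies={'http': proxy, 'https': proxy}, timeout=timeout)
        except requests.exceptions.RequestException:
            pool.remove(proxy)   # this proxy looks dead, try another one
    raise RuntimeError(f'All proxies failed for {url}')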
# Randomize request headers
from fake_useragent import UserAgent
headers = {
'User-Agent': UserAgent().random,
'Referer': 'https://www.google.com/',
'Accept-Encoding': 'gzip, deflate, br'
}
scrapy startproject movie_project
cd movie_project
scrapy genspider douban_movie movie.douban.com
# settings.py configuration
CONCURRENT_REQUESTS = 16
DOWNLOAD_DELAY = 2
DEFAULT_REQUEST_HEADERS = {
'Accept': 'text/html,application/xhtml+xml',
'Accept-Language': 'zh-CN'
}
ITEM_PIPELINES = {
'movie_project.pipelines.MoviePipeline': 300,
}
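The spider below fills a DoubanMovieItem through an ItemLoader; items.py itself is not shown in this tutorial, so here is a minimal sketch of what it could look like (field names follow the loader calls, the processors are assumptions):

# items.py -- minimal sketch (not part of the original tutorial)
import scrapy
from itemloaders.processors import MapCompose, TakeFirst   # scrapy.loader.processors on older Scrapy

class DoubanMovieItem(scrapy.Item):
    # TakeFirst() keeps the first extracted value; MapCompose(str.strip) trims whitespace
    title = scrapy.Field(input_processor=MapCompose(str.strip), output_processor=TakeFirst())
    rating = scrapy.Field(output_processor=TakeFirst())
    quote = scrapy.Field(output_processor=TakeFirst())
    link = scrapy.Field(output_processor=TakeFirst())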
import scrapy
from scrapy.loader import ItemLoader
from movie_project.items import DoubanMovieItem
class DoubanMovieSpider(scrapy.Spider):
name = 'douban_movie'
allowed_domains = ['movie.douban.com']
start_urls = ['https://movie.douban.com/top250']
custom_settings = {
'FEED_FORMAT': 'csv',
'FEED_URI': 'douban_top250.csv'
}
def parse(self, response):
for movie in response.css('.item'):
loader = ItemLoader(item=DoubanMovieItem(), selector=movie)
loader.add_css('title', '.title::text')
loader.add_css('rating', '.rating_num::text')
loader.add_css('quote', '.inq::text')
loader.add_css('link', 'a::attr(href)')
yield loader.load_item()
next_page = response.css('.next a::attr(href)').get()
if next_page:
yield response.follow(next_page, self.parse)
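Run the spider from the project root; with the custom_settings above, the results are exported to douban_top250.csv:

scrapy crawl douban_movie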
# CSV storage
import csv

with open('data.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(['title', 'link'])   # header row (example columns)
    writer.writerows(rows)               # `rows`: an iterable of (title, link) tuples from the scrape
# MySQL storage
import pymysql

conn = pymysql.connect(host='localhost', user='root', database='scraping')   # add password=... as needed
cursor = conn.cursor()
cursor.execute('INSERT INTO news (title, link) VALUES (%s, %s)', (title, link))   # example table/columns
conn.commit()
# MongoDB storage
from pymongo import MongoClient

client = MongoClient('mongodb://localhost:27017/')
db = client['scraping_db']
db['news'].insert_many(news_items)   # example collection; news_items is a list of dicts
# pipelines.py
import pymongo
class MongoDBPipeline:
def __init__(self, mongo_uri, mongo_db):
self.mongo_uri = mongo_uri
self.mongo_db = mongo_db
@classmethod
def from_crawler(cls, crawler):
return cls(
mongo_uri=crawler.settings.get('MONGO_URI'),
mongo_db=crawler.settings.get('MONGO_DATABASE')
)
def open_spider(self, spider):
self.client = pymongo.MongoClient(self.mongo_uri)
self.db = self.client[self.mongo_db]
def close_spider(self, spider):
self.client.close()
def process_item(self, item, spider):
self.db[spider.name].insert_one(dict(item))
return item
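The pipeline reads MONGO_URI and MONGO_DATABASE from the project settings, which the earlier settings.py excerpt does not define. You would add something along these lines (the values and the module path are placeholders):

# settings.py additions for the MongoDB pipeline (values are placeholders)
MONGO_URI = 'mongodb://localhost:27017/'
MONGO_DATABASE = 'scraping_db'
ITEM_PIPELINES = {
    'movie_project.pipelines.MongoDBPipeline': 300,
}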
# Concurrent crawling with asyncio + aiohttp
import aiohttp
import asyncio

async def fetch(session, url):
    async with session.get(url) as response:
        return await response.text()

async def main(urls):
    async with aiohttp.ClientSession() as session:
        tasks = [fetch(session, url) for url in urls]
        return await asyncio.gather(*tasks)

# Example entry point (Python 3.7+); the URLs are placeholders
pages = asyncio.run(main(['https://example.com/page1', 'https://example.com/page2']))
# Distributed crawling with Scrapy-Redis
# settings.py
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
REDIS_URL = 'redis://localhost:6379'
# Start multiple crawler instances (on one or more machines)
scrapy runspider myspider.py
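With scrapy_redis, the spider typically inherits from RedisSpider and reads its start URLs from a shared Redis list instead of start_urls, so every instance pulls from the same queue. A minimal sketch (the spider name and redis_key are placeholders):

# myspider.py -- minimal scrapy_redis spider sketch (names are placeholders)
from scrapy_redis.spiders import RedisSpider

class MySpider(RedisSpider):
    name = 'myspider'
    redis_key = 'myspider:start_urls'   # Redis list the shared scheduler pops URLs from

    def parse(self, response):
        yield {'url': response.url, 'title': response.css('title::text').get()}

Seed the shared queue from any machine, e.g. redis-cli lpush myspider:start_urls https://example.com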
import pandas as pd
import matplotlib.pyplot as plt

# Load the scraped data
df = pd.read_csv('movie_data.csv')

# Example analysis: top 10 movies by rating
top10 = df.sort_values('rating', ascending=False).head(10)
plt.figure(figsize=(10, 6))
plt.barh(top10['title'], top10['rating'])
plt.title('Douban Movies: Top 10 Ratings')
plt.show()
This tutorial covers crawler development from the basics through advanced topics. Work through the chapters in order and practice as you go, and in real projects always comply with applicable laws and regulations as well as each website's terms of use.