This article explains in detail how to build an efficient Google Images crawler in Python: Selenium drives the browser, and asynchronous requests are combined with it to collect image data for keywords at scale. It covers a complete implementation, anti-anti-crawling strategies, performance-optimization techniques, and data-processing methods, so that developers can quickly build their own image datasets.

Keywords: Python crawler, Google Images, Selenium, asynchronous crawler, image collection, anti-anti-crawling strategies

In today's era of big data, images have become an essential resource for AI training, market analysis, and content creation. Google Images, the world's largest image search engine, indexes a vast number of high-quality pictures. Google does not, however, provide a public API for retrieving them, so collecting this data automatically requires a crawler.

Traditional crawling techniques run into several problems on a modern site like Google Images: results are rendered dynamically by JavaScript, more images are loaded lazily only as the page scrolls, and aggressive anti-bot measures quickly block naive scripted access.

This article shows how to build an efficient, stable Google Images crawler with a modern Python stack: one that can collect images for many keywords at scale, download them asynchronously, and cope with Google's anti-crawling measures.

We use the following combination of technologies: Selenium to drive a real browser, aiohttp for asynchronous downloads, Redis for the proxy pool and task queue, MongoDB for metadata storage, and Pillow for image validation and processing.
```bash
# Python 3.8+ is recommended
# Create a virtual environment
python -m venv google_images_env
source google_images_env/bin/activate   # Linux/Mac
google_images_env\Scripts\activate      # Windows

# Install dependencies
pip install selenium aiohttp beautifulsoup4 redis pymongo pillow
pip install webdriver-manager  # manages the browser driver automatically
```
```python
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager


def init_driver(headless=True):
    chrome_options = Options()
    if headless:
        chrome_options.add_argument("--headless")
    chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("--window-size=1920,1080")
    chrome_options.add_argument(
        "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    )
    # Download and configure the Chrome driver automatically
    driver = webdriver.Chrome(
        service=Service(ChromeDriverManager().install()),
        options=chrome_options
    )
    return driver
```
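As a quick sanity check before wiring the driver into the crawler, the following snippet (ours, not from the original article; the query string is an arbitrary example) loads the image-search page and prints the page title:

```python
# Minimal smoke test for init_driver(); the query "test" is just an example.
if __name__ == "__main__":
    driver = init_driver(headless=True)
    try:
        driver.get("https://www.google.com/search?q=test&tbm=isch")
        print(driver.title)  # expect something like "test - Google Search"
    finally:
        driver.quit()
```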
```text
Google Images Crawler Architecture

┌───────────────────────────────────────────────────────────────┐
│                        Main Controller                        │
└──────────────────────┬────────────────────┬───────────────────┘
                       │                    │
        ┌──────────────▼───────┐  ┌─────────▼───────────────┐
        │    Search Keyword    │  │    Image Downloader     │
        │       Handler        │  │         (Async)         │
        └──────────────┬───────┘  └─────────┬───────────────┘
                       │                    │
        ┌──────────────▼───────┐  ┌─────────▼───────────────┐
        │    Scroll & Parse    │  │    Proxy & Anti-Anti    │
        │         Page         │  │      Crawl Manager      │
        └──────────────┬───────┘  └─────────┬───────────────┘
                       │                    │
        ┌──────────────▼───────┐  ┌─────────▼───────────────┐
        │     Data Storage     │  │  Logging & Monitoring   │
        │    (MongoDB/File)    │  │                         │
        └──────────────────────┘  └─────────────────────────┘
```
```python
import os
import time
import logging
from urllib.parse import urlparse, unquote

from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC


class GoogleImagesCrawler:
    def __init__(self, keywords, output_dir='images', max_images=100, headless=True):
        self.keywords = keywords
        self.output_dir = output_dir
        self.max_images = max_images
        self.driver = init_driver(headless)
        self.logger = self._setup_logger()
        # Create the output directory
        os.makedirs(self.output_dir, exist_ok=True)

    def _setup_logger(self):
        logger = logging.getLogger('google_images_crawler')
        logger.setLevel(logging.INFO)
        formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
        # Console output
        ch = logging.StreamHandler()
        ch.setFormatter(formatter)
        logger.addHandler(ch)
        # File output
        fh = logging.FileHandler('google_images_crawler.log')
        fh.setFormatter(formatter)
        logger.addHandler(fh)
        return logger

    def search_keyword(self, keyword):
        """Run an image search for a keyword."""
        self.logger.info(f"Searching for keyword: {keyword}")
        search_url = f"https://www.google.com/search?q={keyword}&tbm=isch"
        self.driver.get(search_url)
        # Wait for the result grid to load
        WebDriverWait(self.driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "div[data-ri]"))
        )

    def scroll_to_bottom(self):
        """Scroll the page to load more images."""
        self.logger.info("Scrolling to load more images...")
        last_height = self.driver.execute_script("return document.body.scrollHeight")
        while True:
            # Scroll to the bottom
            self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(2)  # wait for new results to load
            # Compare the new scroll height with the previous one
            new_height = self.driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                # Try clicking the "Show more results" button
                try:
                    more_btn = self.driver.find_element(By.CSS_SELECTOR, ".mye4qd")
                    if more_btn.is_displayed():
                        more_btn.click()
                        time.sleep(2)
                        continue
                except Exception:
                    break
            last_height = new_height

    def extract_image_data(self):
        """Extract image metadata from the result page."""
        self.logger.info("Extracting image metadata...")
        image_elements = self.driver.find_elements(By.CSS_SELECTOR, "div[data-ri]")
        image_data = []
        for idx, img_div in enumerate(image_elements[:self.max_images]):
            try:
                # Click the thumbnail to open the preview pane
                img_div.click()
                time.sleep(0.5)
                # Wait for the full-size image to load
                WebDriverWait(self.driver, 5).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, "img.n3VNCb")))
                # Get the full-size image URL
                big_img = self.driver.find_element(By.CSS_SELECTOR, "img.n3VNCb")
                src = big_img.get_attribute("src")
                # Skip inline base64 data
                if not src or src.startswith("data:"):
                    continue
                # Collect the image info
                alt = big_img.get_attribute("alt") or f"image_{idx}"
                image_data.append({
                    "url": src,
                    "alt": alt,
                    "source_page": self.driver.current_url,
                    "keyword": self.keywords[0] if self.keywords else "",
                    "timestamp": time.time()
                })
            except Exception as e:
                self.logger.warning(f"Error extracting image {idx}: {str(e)}")
                continue
        return image_data

    def download_image(self, img_url, img_name, save_dir=None):
        """Download a single image."""
        save_dir = save_dir or self.output_dir
        os.makedirs(save_dir, exist_ok=True)
        try:
            # Derive a file extension from the URL
            parsed = urlparse(img_url)
            filename = unquote(parsed.path.split("/")[-1])
            ext = os.path.splitext(filename)[1]
            # If the URL has no extension, fall back to a default
            # (the Content-Type header would be the proper source)
            if not ext:
                ext = ".jpg"  # default assumption
            # Build the save path
            save_path = os.path.join(save_dir, f"{img_name}{ext}")
            # Download with requests
            import requests
            from PIL import Image
            from io import BytesIO
            headers = {
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
            }
            response = requests.get(img_url, headers=headers, stream=True)
            if response.status_code == 200:
                # Use PIL to verify the image is intact
                img = Image.open(BytesIO(response.content))
                img.save(save_path)
                self.logger.info(f"Downloaded: {save_path}")
                return save_path
            else:
                self.logger.warning(f"Failed to download {img_url}: HTTP {response.status_code}")
                return None
        except Exception as e:
            self.logger.error(f"Error downloading {img_url}: {str(e)}")
            return None

    def crawl(self):
        """Run the full crawl."""
        all_image_data = []
        for keyword in self.keywords:
            try:
                self.search_keyword(keyword)
                self.scroll_to_bottom()
                image_data = self.extract_image_data()
                # Download the images
                for idx, img_info in enumerate(image_data):
                    img_name = f"{keyword.replace(' ', '_')}_{idx}"
                    img_path = self.download_image(img_info["url"], img_name)
                    if img_path:
                        img_info["local_path"] = img_path
                        all_image_data.append(img_info)
            except Exception as e:
                self.logger.error(f"Error crawling keyword {keyword}: {str(e)}")
                continue
        return all_image_data

    def __del__(self):
        """Destructor: close the browser."""
        if hasattr(self, 'driver'):
            self.driver.quit()


# Usage example
if __name__ == "__main__":
    keywords = ["mountain landscape", "beach sunset"]
    crawler = GoogleImagesCrawler(keywords, max_images=50, headless=False)
    image_data = crawler.crawl()
    print(f"Total images downloaded: {len(image_data)}")
```
```python
import asyncio
import aiohttp
from concurrent.futures import ThreadPoolExecutor


class AsyncGoogleImagesCrawler(GoogleImagesCrawler):
    def __init__(self, *args, max_workers=5, **kwargs):
        super().__init__(*args, **kwargs)
        self.max_workers = max_workers
        self.session = None

    async def init_session(self):
        """Initialise the aiohttp session."""
        if self.session and not self.session.closed:
            return  # already initialised (e.g. via "async with")
        timeout = aiohttp.ClientTimeout(total=30)
        connector = aiohttp.TCPConnector(limit_per_host=10)
        self.session = aiohttp.ClientSession(
            timeout=timeout,
            connector=connector,
            headers={
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
            }
        )

    async def download_image_async(self, img_url, img_name, save_dir=None):
        """Download an image asynchronously."""
        save_dir = save_dir or self.output_dir
        os.makedirs(save_dir, exist_ok=True)
        try:
            # Derive a file extension from the URL
            parsed = urlparse(img_url)
            filename = unquote(parsed.path.split("/")[-1])
            ext = os.path.splitext(filename)[1]
            # Fall back to a default extension if the URL has none
            if not ext:
                ext = ".jpg"  # default assumption
            # Build the save path
            save_path = os.path.join(save_dir, f"{img_name}{ext}")
            # Skip the download if the file already exists
            if os.path.exists(save_path):
                self.logger.info(f"File exists, skipped: {save_path}")
                return save_path
            async with self.session.get(img_url) as response:
                if response.status == 200:
                    content = await response.read()
                    # Run the blocking image write in a thread pool
                    with ThreadPoolExecutor(max_workers=1) as executor:
                        loop = asyncio.get_event_loop()
                        await loop.run_in_executor(
                            executor,
                            self._save_image,
                            content,
                            save_path
                        )
                    self.logger.info(f"Downloaded: {save_path}")
                    return save_path
                else:
                    self.logger.warning(f"Failed to download {img_url}: HTTP {response.status}")
                    return None
        except Exception as e:
            self.logger.error(f"Error downloading {img_url}: {str(e)}")
            return None

    def _save_image(self, content, save_path):
        """Save the image (runs in a synchronous context)."""
        from PIL import Image
        from io import BytesIO
        try:
            img = Image.open(BytesIO(content))
            img.save(save_path)
        except Exception as e:
            self.logger.error(f"Error saving image {save_path}: {str(e)}")
            if os.path.exists(save_path):
                os.remove(save_path)

    async def crawl_async(self):
        """Run the crawl asynchronously."""
        await self.init_session()
        all_image_data = []
        try:
            for keyword in self.keywords:
                try:
                    # Run the synchronous Selenium steps in a thread pool
                    with ThreadPoolExecutor(max_workers=1) as executor:
                        loop = asyncio.get_event_loop()
                        await loop.run_in_executor(
                            executor,
                            self.search_keyword,
                            keyword
                        )
                        await loop.run_in_executor(
                            executor,
                            self.scroll_to_bottom
                        )
                        image_data = await loop.run_in_executor(
                            executor,
                            self.extract_image_data
                        )
                    # Download the images asynchronously
                    download_tasks = []
                    for idx, img_info in enumerate(image_data):
                        img_name = f"{keyword.replace(' ', '_')}_{idx}"
                        task = asyncio.create_task(
                            self.download_image_async(img_info["url"], img_name)
                        )
                        download_tasks.append((img_info, task))
                    # Wait for all download tasks to finish
                    for img_info, task in download_tasks:
                        img_path = await task
                        if img_path:
                            img_info["local_path"] = img_path
                            all_image_data.append(img_info)
                except Exception as e:
                    self.logger.error(f"Error crawling keyword {keyword}: {str(e)}")
                    continue
        finally:
            await self.session.close()
        return all_image_data

    async def __aenter__(self):
        await self.init_session()
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        if self.session:
            await self.session.close()
        self.driver.quit()


# Usage example
async def main():
    keywords = ["mountain landscape", "beach sunset"]
    async with AsyncGoogleImagesCrawler(keywords, max_images=50, headless=False) as crawler:
        image_data = await crawler.crawl_async()
        print(f"Total images downloaded: {len(image_data)}")


if __name__ == "__main__":
    asyncio.run(main())
```
```python
from fake_useragent import UserAgent
import random
import time


class AntiAntiCrawlMixin:
    def __init__(self, *args, **kwargs):
        # Cooperate with the other base classes in the MRO
        super().__init__(*args, **kwargs)
        self.ua = UserAgent()
        self.last_request_time = 0
        self.request_interval = random.uniform(2, 5)

    def random_sleep(self):
        """Random delay to mimic a human operator."""
        sleep_time = random.uniform(0.5, 3)
        time.sleep(sleep_time)

    def rotate_user_agent(self):
        """Switch to a random User-Agent."""
        new_ua = self.ua.random
        if hasattr(self, 'driver'):
            self.driver.execute_cdp_cmd(
                "Network.setUserAgentOverride",
                {"userAgent": new_ua}
            )
        return new_ua

    def simulate_human_behavior(self):
        """Simulate human browsing behaviour."""
        if hasattr(self, 'driver'):
            # Random mouse movements
            width = self.driver.execute_script("return window.innerWidth")
            height = self.driver.execute_script("return window.innerHeight")
            for _ in range(random.randint(2, 5)):
                x = random.randint(0, width)
                y = random.randint(0, height)
                self.driver.execute_script(
                    f"var el = document.elementFromPoint({x}, {y});"
                    "if (el) el.dispatchEvent(new MouseEvent('mousemove', { bubbles: true }));"
                )
                time.sleep(random.uniform(0.1, 0.5))
            # Random scrolling
            scroll_steps = random.randint(3, 10)
            for _ in range(scroll_steps):
                scroll_px = random.randint(200, 800)
                self.driver.execute_script(
                    f"window.scrollBy(0, {scroll_px})")
                time.sleep(random.uniform(0.2, 1))
        self.random_sleep()

    def handle_captcha(self):
        """Handle a captcha challenge."""
        try:
            # Check whether a captcha iframe is present
            captcha_frame = self.driver.find_elements(
                By.XPATH, "//iframe[contains(@src, 'captcha')]")
            if captcha_frame:
                self.logger.warning("Captcha detected! Please solve it manually.")
                input("Press Enter after solving the captcha...")
                return True
        except Exception:
            pass
        return False

    def use_proxy(self, proxy_url):
        """Restart the browser behind a proxy."""
        if hasattr(self, 'driver'):
            self.driver.quit()
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        chrome_options.add_argument(f"--proxy-server={proxy_url}")
        self.driver = webdriver.Chrome(
            service=Service(ChromeDriverManager().install()),
            options=chrome_options
        )
        self.logger.info(f"Using proxy: {proxy_url}")
```
```python
import logging

import redis


class ProxyManager:
    def __init__(self, redis_host='localhost', redis_port=6379):
        self.redis = redis.StrictRedis(
            host=redis_host, port=redis_port, decode_responses=True)
        self.proxy_key = "google_images:proxies"
        self.logger = logging.getLogger('proxy_manager')

    def add_proxy(self, proxy):
        """Add a proxy to the pool."""
        self.redis.sadd(self.proxy_key, proxy)

    def get_random_proxy(self):
        """Fetch a random proxy from the pool."""
        return self.redis.srandmember(self.proxy_key)

    def remove_proxy(self, proxy):
        """Remove a dead proxy."""
        self.redis.srem(self.proxy_key, proxy)

    def get_all_proxies(self):
        """Return all proxies in the pool."""
        return self.redis.smembers(self.proxy_key)

    def check_proxy_health(self, proxy, test_url="https://www.google.com"):
        """Check whether a proxy is usable."""
        import requests
        proxies = {
            "http": proxy,
            "https": proxy
        }
        try:
            response = requests.get(
                test_url,
                proxies=proxies,
                timeout=10
            )
            return response.status_code == 200
        except Exception:
            return False

    def health_check_all(self):
        """Health-check every proxy in the pool."""
        all_proxies = self.get_all_proxies()
        for proxy in all_proxies:
            if not self.check_proxy_health(proxy):
                self.remove_proxy(proxy)
                self.logger.warning(f"Removed bad proxy: {proxy}")
```
```python
import random

from ratelimit import limits, sleep_and_retry


class OptimizedCrawler(AsyncGoogleImagesCrawler):
    def __init__(self, *args, max_concurrent=10, **kwargs):
        super().__init__(*args, **kwargs)
        self.semaphore = asyncio.Semaphore(max_concurrent)

    # Caveat: ratelimit's decorators are synchronous -- when the limit is hit,
    # sleep_and_retry blocks the event loop with time.sleep() at the moment the
    # coroutine is created. See the asyncio-native sketch below for an alternative.
    @sleep_and_retry
    @limits(calls=20, period=60)  # at most 20 calls per minute
    async def limited_download(self, img_url, img_name, save_dir=None):
        """Download with rate limiting and bounded concurrency."""
        async with self.semaphore:
            return await self.download_image_async(img_url, img_name, save_dir)

    async def crawl_optimized(self):
        """Optimised crawl."""
        await self.init_session()
        all_image_data = []
        try:
            for keyword in self.keywords:
                try:
                    # Run the synchronous Selenium steps in a thread pool
                    with ThreadPoolExecutor(max_workers=1) as executor:
                        loop = asyncio.get_event_loop()
                        await loop.run_in_executor(
                            executor,
                            self.search_keyword,
                            keyword
                        )
                        await loop.run_in_executor(
                            executor,
                            self.scroll_to_bottom
                        )
                        image_data = await loop.run_in_executor(
                            executor,
                            self.extract_image_data
                        )
                    # Download the images in batches
                    batch_size = 10
                    for i in range(0, len(image_data), batch_size):
                        batch = image_data[i:i + batch_size]
                        # Download the current batch asynchronously
                        download_tasks = []
                        for idx, img_info in enumerate(batch, start=i):
                            img_name = f"{keyword.replace(' ', '_')}_{idx}"
                            task = asyncio.create_task(
                                self.limited_download(img_info["url"], img_name)
                            )
                            download_tasks.append((img_info, task))
                        # Wait for the current batch to finish
                        for img_info, task in download_tasks:
                            try:
                                img_path = await task
                                if img_path:
                                    img_info["local_path"] = img_path
                                    all_image_data.append(img_info)
                            except Exception as e:
                                self.logger.error(f"Download failed: {str(e)}")
                        # Delay between batches
                        await asyncio.sleep(random.uniform(5, 10))
                except Exception as e:
                    self.logger.error(f"Error crawling keyword {keyword}: {str(e)}")
                    continue
        finally:
            await self.session.close()
        return all_image_data
```
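Because the `ratelimit` decorators sleep synchronously, they stall the whole event loop whenever the quota is exhausted. A minimal asyncio-native alternative (our sketch, not part of the article; the class name `AsyncRateLimiter` is made up) spaces acquisitions out with `asyncio.sleep` instead:

```python
import asyncio
import time


class AsyncRateLimiter:
    """Allow at most `calls` acquisitions per `period` seconds without blocking the loop."""

    def __init__(self, calls=20, period=60.0):
        self.calls = calls
        self.period = period
        self.timestamps = []          # times of recent acquisitions
        self._lock = asyncio.Lock()

    async def acquire(self):
        async with self._lock:
            now = time.monotonic()
            # Drop timestamps that fell outside the sliding window
            self.timestamps = [t for t in self.timestamps if now - t < self.period]
            if len(self.timestamps) >= self.calls:
                # Sleep (yielding control) until the oldest call leaves the window
                await asyncio.sleep(self.period - (now - self.timestamps[0]))
                now = time.monotonic()
                self.timestamps = [t for t in self.timestamps if now - t < self.period]
            self.timestamps.append(time.monotonic())


# Usage inside a download coroutine (sketch):
#     limiter = AsyncRateLimiter(calls=20, period=60)
#     await limiter.acquire()
#     await self.download_image_async(url, name)
```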
```python
from diskcache import Cache


class CachedCrawler(OptimizedCrawler):
    def __init__(self, *args, cache_dir=".cache", **kwargs):
        super().__init__(*args, **kwargs)
        self.cache = Cache(cache_dir)

    async def cached_search(self, keyword):
        """Search with a disk cache in front."""
        cache_key = f"search:{keyword}"
        if cache_key in self.cache:
            self.logger.info(f"Using cached result for: {keyword}")
            return self.cache[cache_key]
        # Run the actual search
        with ThreadPoolExecutor(max_workers=1) as executor:
            loop = asyncio.get_event_loop()
            await loop.run_in_executor(
                executor,
                self.search_keyword,
                keyword
            )
            await loop.run_in_executor(
                executor,
                self.scroll_to_bottom
            )
            image_data = await loop.run_in_executor(
                executor,
                self.extract_image_data
            )
        # Cache the result (expires after one hour)
        self.cache.set(cache_key, image_data, expire=3600)
        return image_data

    async def crawl_cached(self):
        """Crawl using the search cache."""
        await self.init_session()
        all_image_data = []
        try:
            for keyword in self.keywords:
                try:
                    image_data = await self.cached_search(keyword)
                    # Download the images (with a local-file check)
                    download_tasks = []
                    for idx, img_info in enumerate(image_data):
                        img_name = f"{keyword.replace(' ', '_')}_{idx}"
                        save_path = os.path.join(self.output_dir, f"{img_name}.jpg")
                        # Skip the download if the file already exists locally
                        if os.path.exists(save_path):
                            img_info["local_path"] = save_path
                            all_image_data.append(img_info)
                            continue
                        # Otherwise create a download task
                        task = asyncio.create_task(
                            self.limited_download(img_info["url"], img_name)
                        )
                        download_tasks.append((img_info, task))
                    # Wait for the downloads to finish
                    for img_info, task in download_tasks:
                        try:
                            img_path = await task
                            if img_path:
                                img_info["local_path"] = img_path
                                all_image_data.append(img_info)
                        except Exception as e:
                            self.logger.error(f"Download failed: {str(e)}")
                except Exception as e:
                    self.logger.error(f"Error crawling keyword {keyword}: {str(e)}")
                    continue
        finally:
            await self.session.close()
            self.cache.close()
        return all_image_data
```
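A brief usage sketch (ours; the keyword is arbitrary): the first run of the script scrapes Google and writes the parsed metadata into `.cache/`; rerunning the same script within an hour is served from the disk cache and only downloads files that are still missing.

```python
# Hypothetical single run of CachedCrawler; rerun the script to exercise the cache.
async def run_cached():
    crawler = CachedCrawler(["city skyline"], max_images=30, cache_dir=".cache")
    images = await crawler.crawl_cached()
    print(f"Collected {len(images)} images")

asyncio.run(run_cached())
```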
```python
from pymongo import MongoClient
from datetime import datetime


class MongoStorage:
    def __init__(self, mongo_uri="mongodb://localhost:27017", db_name="google_images"):
        self.client = MongoClient(mongo_uri)
        self.db = self.client[db_name]
        self.collection = self.db["images"]

    def save_image_metadata(self, metadata):
        """Save image metadata."""
        # Add timestamps
        metadata["created_at"] = datetime.utcnow()
        metadata["updated_at"] = datetime.utcnow()
        # Check whether the URL is already stored
        existing = self.collection.find_one({"url": metadata["url"]})
        if existing:
            # Update the existing record
            self.collection.update_one(
                {"_id": existing["_id"]},
                {"$set": metadata}
            )
            return existing["_id"]
        else:
            # Insert a new record
            result = self.collection.insert_one(metadata)
            return result.inserted_id

    def get_images_by_keyword(self, keyword, limit=100):
        """Query images by keyword."""
        return list(self.collection.find(
            {"keyword": keyword},
            limit=limit
        ).sort("created_at", -1))

    def count_images(self, keyword=None):
        """Count the stored images."""
        query = {}
        if keyword:
            query["keyword"] = keyword
        return self.collection.count_documents(query)

    def close(self):
        """Close the connection."""
        self.client.close()


# Integrate storage into the crawler
class StorageEnabledCrawler(CachedCrawler):
    def __init__(self, *args, mongo_uri=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.storage = MongoStorage(mongo_uri) if mongo_uri else None

    async def crawl_with_storage(self):
        """Crawl and persist the metadata."""
        image_data = await self.crawl_cached()
        if self.storage:
            saved_ids = []
            for img_info in image_data:
                try:
                    doc_id = self.storage.save_image_metadata(img_info)
                    saved_ids.append(doc_id)
                except Exception as e:
                    self.logger.error(f"Failed to save metadata: {str(e)}")
            self.logger.info(f"Saved {len(saved_ids)} records to MongoDB")
        return image_data

    def __del__(self):
        """Destructor."""
        super().__del__()
        if hasattr(self, 'storage') and self.storage:
            self.storage.close()
```
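A usage sketch (ours; the connection string and keyword are placeholders) that runs a crawl with persistence and then queries the stored metadata back:

```python
# Hypothetical end-to-end run with MongoDB persistence.
async def run_with_storage():
    crawler = StorageEnabledCrawler(
        ["forest"], max_images=30, mongo_uri="mongodb://localhost:27017"
    )
    await crawler.crawl_with_storage()

asyncio.run(run_with_storage())

# Query the stored metadata afterwards
storage = MongoStorage("mongodb://localhost:27017")
print(storage.count_images("forest"), "images stored for 'forest'")
for doc in storage.get_images_by_keyword("forest", limit=5):
    print(doc["url"], "->", doc.get("local_path"))
storage.close()
```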
```python
import hashlib
from PIL import Image


class ImageProcessor:
    @staticmethod
    def calculate_image_hash(image_path, hash_size=16):
        """Compute a perceptual (average) hash of an image."""
        try:
            img = Image.open(image_path)
            # Convert to greyscale and shrink
            img = img.convert("L").resize(
                (hash_size, hash_size),
                Image.Resampling.LANCZOS
            )
            # Compute the mean pixel value
            pixels = list(img.getdata())
            avg = sum(pixels) / len(pixels)
            # Build the hash
            bits = "".join(['1' if pixel > avg else '0' for pixel in pixels])
            hex_hash = "{0:0{1}x}".format(int(bits, 2), len(bits) // 4)
            return hex_hash
        except Exception as e:
            print(f"Error calculating hash: {str(e)}")
            return None

    @staticmethod
    def find_duplicates(image_dir, threshold=5):
        """Find near-duplicate images."""
        hashes = {}
        duplicates = []
        for root, _, files in os.walk(image_dir):
            for filename in files:
                if filename.lower().endswith(('.png', '.jpg', '.jpeg')):
                    path = os.path.join(root, filename)
                    img_hash = ImageProcessor.calculate_image_hash(path)
                    if img_hash:
                        # Look for an existing hash that is close enough
                        found_duplicate = False
                        for existing_hash, existing_files in hashes.items():
                            # Hamming distance between the two hashes
                            distance = bin(int(img_hash, 16) ^ int(existing_hash, 16)).count('1')
                            if distance <= threshold:
                                existing_files.append(path)
                                duplicates.append(existing_files)
                                found_duplicate = True
                                break
                        if not found_duplicate:
                            hashes[img_hash] = [path]
        return duplicates

    @staticmethod
    def optimize_image(image_path, quality=85, max_size=(1920, 1080)):
        """Shrink and recompress an image."""
        try:
            img = Image.open(image_path)
            # Resize in place, preserving the aspect ratio
            img.thumbnail(max_size, Image.Resampling.LANCZOS)
            # Save the optimised image
            if image_path.lower().endswith('.jpg') or image_path.lower().endswith('.jpeg'):
                img.save(image_path, "JPEG", quality=quality, optimize=True)
            elif image_path.lower().endswith('.png'):
                img.save(image_path, "PNG", optimize=True)
            return True
        except Exception as e:
            print(f"Error optimizing image: {str(e)}")
            return False
```
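A usage sketch (ours) that deduplicates a download directory and then recompresses what remains; the directory name `images` matches the crawler's default `output_dir` but is otherwise an assumption:

```python
# Hypothetical post-processing pass over the download directory "images/".
duplicates = ImageProcessor.find_duplicates("images", threshold=5)
for group in duplicates:
    # Keep the first file of each near-duplicate group, delete the rest
    for path in group[1:]:
        if os.path.exists(path):
            os.remove(path)
            print(f"Removed duplicate: {path}")

# Recompress the remaining images
for root, _, files in os.walk("images"):
    for name in files:
        if name.lower().endswith(('.png', '.jpg', '.jpeg')):
            ImageProcessor.optimize_image(os.path.join(root, name))
```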
```python
import json

from redis import Redis
from rq import Queue


class DistributedCrawler:
    def __init__(self, redis_host='localhost', redis_port=6379):
        self.redis = Redis(host=redis_host, port=redis_port)
        self.task_queue = Queue('google_images', connection=self.redis)

    def enqueue_crawl_task(self, keywords, max_images=100):
        """Put a crawl task on the queue."""
        task_data = {
            "keywords": keywords,
            "max_images": max_images,
            "created_at": time.time()
        }
        self.task_queue.enqueue(
            self._execute_crawl_task,
            json.dumps(task_data),
            result_ttl=86400  # keep the result for 24 hours
        )

    @staticmethod
    def _execute_crawl_task(task_json):
        """Actually run a crawl task (executed by an RQ worker)."""
        task_data = json.loads(task_json)
        # Initialise the crawler
        crawler = AsyncGoogleImagesCrawler(
            keywords=task_data["keywords"],
            max_images=task_data["max_images"],
            headless=True
        )
        # Run the crawl
        image_data = asyncio.run(crawler.crawl_async())
        return {
            "status": "completed",
            "image_count": len(image_data),
            "keywords": task_data["keywords"],
            "completed_at": time.time()
        }

    def monitor_queue(self):
        """Monitor the state of the task queue."""
        while True:
            print(f"Queue status: {len(self.task_queue)} jobs pending")
            time.sleep(10)


# Worker implementation
def run_worker():
    """Start an RQ worker."""
    from rq import Worker
    redis_conn = Redis()
    worker = Worker(['google_images'], connection=redis_conn)
    worker.work()
```
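A usage sketch (ours) for the two sides of the queue: a producer process enqueues keyword batches, while one or more worker processes consume them. The `--producer` flag handling is purely illustrative:

```python
# Hypothetical entry point: run "python tasks.py --producer" once to enqueue
# work, and "python tasks.py" in one or more other processes to consume it.
import sys

if __name__ == "__main__":
    if "--producer" in sys.argv:
        dispatcher = DistributedCrawler(redis_host="localhost")
        dispatcher.enqueue_crawl_task(["mountain landscape"], max_images=50)
        dispatcher.enqueue_crawl_task(["beach sunset"], max_images=50)
        dispatcher.monitor_queue()   # blocks, printing pending-job counts
    else:
        run_worker()                 # blocks, processing queued crawl tasks
```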
```python
from apscheduler.schedulers.background import BackgroundScheduler


class ScheduledCrawler:
    def __init__(self):
        self.scheduler = BackgroundScheduler()
        self.distributed_crawler = DistributedCrawler()

    def add_daily_job(self, keywords, hour=3, minute=0):
        """Add a daily job."""
        self.scheduler.add_job(
            self.distributed_crawler.enqueue_crawl_task,
            'cron',
            hour=hour,
            minute=minute,
            args=[keywords]
        )

    def start(self):
        """Start the scheduler."""
        self.scheduler.start()
        try:
            while True:
                time.sleep(1)
        except (KeyboardInterrupt, SystemExit):
            self.scheduler.shutdown()


# Usage example
if __name__ == "__main__":
    keywords_groups = [
        ["mountain landscape", "forest"],
        ["beach sunset", "ocean waves"],
        ["city skyline", "urban architecture"]
    ]
    scheduler = ScheduledCrawler()
    # Give each keyword group a different run time
    for idx, keywords in enumerate(keywords_groups):
        hour = 3 + (idx * 2)  # 3am, 5am, 7am
        scheduler.add_daily_job(keywords, hour=hour)
    scheduler.start()
```