现在到了最激动人心的部分——将所有流式组件整合成一个完整的生产级系统!这就像指挥一个交响乐团,每个乐器都要在正确的时间演奏正确的音符,最终奏出美妙的和谐乐章。
# production_dialog_system.py - 生产级流式对话系统
import asyncio
import time
import threading
import logging
import json
import uuid
from typing import Dict, List, Any, Optional, Callable, Union
from dataclasses import dataclass, asdict, field
from datetime import datetime, timedelta
from pathlib import Path
import numpy as np
from collections import deque, defaultdict
import socket
import psutil
import gc
# 导入流式组件
from streaming_asr import StreamingASRProcessor, StreamingChunk
from streaming_llm import StreamingLLMEngine, StreamingResponse, StreamingRequest
from streaming_tts import StreamingTTSSynthesizer, TextChunk, AudioChunk
# 导入基础组件
from asr_module import ASREngine
from llm_module import LLMManager, ChatMessage
from tts_module import TTSManager
logger = logging.getLogger(__name__)
@dataclass
class ProductionConfig:
    """Tunable settings for a production deployment of the dialog system."""

    # --- System limits ---
    max_concurrent_sessions: int = 100
    session_timeout: int = 300  # 5 minutes
    max_audio_duration: float = 60.0  # longest accepted audio (presumably seconds)

    # --- Performance ---
    enable_gpu_acceleration: bool = True
    max_memory_usage_gb: float = 16.0
    max_cpu_usage_percent: float = 80.0

    # --- Quality ---
    min_confidence_threshold: float = 0.7
    max_response_tokens: int = 512
    enable_content_filtering: bool = True

    # --- Monitoring ---
    metrics_collection_interval: int = 10  # seconds
    health_check_interval: int = 30  # seconds
    log_level: str = "INFO"

    # --- Storage locations ---
    audio_storage_path: str = "./audio_storage"
    session_storage_path: str = "./session_storage"
    metrics_storage_path: str = "./metrics_storage"
@dataclass
class DialogSession:
    """State kept for one user conversation."""

    session_id: str
    user_id: Optional[str] = None
    created_at: datetime = field(default_factory=datetime.now)
    last_activity: datetime = field(default_factory=datetime.now)

    # Session state; current_state is one of: idle, listening, processing, speaking
    is_active: bool = True
    current_state: str = "idle"

    # One entry per completed dialog turn
    conversation_history: List[Dict[str, Any]] = field(default_factory=list)

    # Per-session performance figures
    total_turns: int = 0
    avg_response_time: float = 0.0
    response_times: List[float] = field(default_factory=list)

    # Free-form extra data attached to the session
    metadata: Dict[str, Any] = field(default_factory=dict)
class SystemMetrics:
    """Collects named metric time series and samples host resource usage."""

    # Samples older than this many seconds are pruned when a metric is recorded.
    RETENTION_SECONDS = 3600

    def __init__(self):
        # metric name -> deque of (timestamp, value), oldest first
        self.metrics = defaultdict(deque)
        # metric name -> most recently recorded value
        self.current_metrics = {}
        self.start_time = time.time()

    def record_metric(self, name: str, value: float, timestamp: Optional[float] = None):
        """Append one sample to the series ``name`` and prune old samples.

        Args:
            name: Metric name.
            value: Sample value.
            timestamp: Sample time in epoch seconds; defaults to now.
        """
        # Compare against None so an explicit timestamp of 0 is honoured.
        if timestamp is None:
            timestamp = time.time()

        series = self.metrics[name]
        series.append((timestamp, value))
        self.current_metrics[name] = value

        # Keep only the last hour, measured from the sample just recorded.
        cutoff_time = timestamp - self.RETENTION_SECONDS
        while series and series[0][0] < cutoff_time:
            series.popleft()

    def get_metric_stats(self, name: str) -> Dict[str, float]:
        """Return current/min/max/avg/count for ``name``, or ``{}`` if no data."""
        # .get() avoids creating an empty series for unknown names.
        series = self.metrics.get(name)
        if not series:
            return {}

        values = [value for _, value in series]
        return {
            "current": values[-1],
            "min": min(values),
            "max": max(values),
            "avg": sum(values) / len(values),
            "count": len(values)
        }

    def get_system_metrics(self) -> Dict[str, Any]:
        """Sample CPU, memory, disk, network and (if available) GPU usage.

        NOTE: ``psutil.cpu_percent(interval=1)`` blocks the calling thread for
        one second; async callers should run this method in an executor.
        """
        cpu_percent = psutil.cpu_percent(interval=1)
        memory = psutil.virtual_memory()
        network = psutil.net_io_counters()

        # GPU figures are optional and must never fail the whole sample.
        gpu_info = {}
        try:
            import torch
            if torch.cuda.is_available():
                gpu_info = {
                    "gpu_count": torch.cuda.device_count(),
                    "gpu_memory_allocated": torch.cuda.memory_allocated() / 1024**3,
                    "gpu_memory_reserved": torch.cuda.memory_reserved() / 1024**3
                }
        except ImportError:
            pass
        except Exception:
            logger.debug("GPU metrics unavailable", exc_info=True)

        now = time.time()
        return {
            "timestamp": now,
            "uptime": now - self.start_time,
            "cpu_percent": cpu_percent,
            "memory_percent": memory.percent,
            "memory_used_gb": memory.used / 1024**3,
            "memory_available_gb": memory.available / 1024**3,
            # psutil returns namedtuples; dict(namedtuple) raises TypeError,
            # so convert through _asdict().
            "disk_usage": dict(psutil.disk_usage('/')._asdict()),
            "network_io": dict(network._asdict()) if network is not None else {},
            **gpu_info
        }
class HealthChecker:
    """Evaluates component statistics and host resources against thresholds."""

    # Ranking used so the overall status can only be escalated, never lowered.
    _SEVERITY = {"healthy": 0, "warning": 1, "error": 2}

    # component name -> (statistic key, upper limit, issue message)
    _COMPONENT_LIMITS = {
        "asr": (("avg_processing_time", 2.0, "ASR处理时间过长"),),
        "llm": (
            ("avg_latency", 5.0, "LLM响应时间过长"),
            ("active_requests", 50, "LLM请求积压过多"),
        ),
        "tts": (("avg_synthesis_time", 3.0, "TTS合成时间过长"),),
    }

    def __init__(self, config: ProductionConfig):
        self.config = config
        self.health_status = self._new_status()

    @staticmethod
    def _new_status() -> Dict[str, Any]:
        """Return a fresh, all-healthy overall status record."""
        return {
            "overall": "healthy",
            "components": {},
            "last_check": time.time(),
            "issues": []
        }

    def _escalate(self, status: str) -> None:
        """Raise the overall status to ``status`` if it is more severe."""
        if self._SEVERITY[status] > self._SEVERITY[self.health_status["overall"]]:
            self.health_status["overall"] = status

    def check_component_health(self, component_name: str, component) -> Dict[str, Any]:
        """Check one component via its optional ``get_statistics()`` method.

        Returns a dict with ``status`` ("healthy", "warning" or "error"),
        ``last_check``, ``metrics`` and ``issues``. Any exception raised while
        reading or evaluating the statistics is reported as status "error".
        """
        health = {
            "status": "healthy",
            "last_check": time.time(),
            "metrics": {},
            "issues": []
        }
        try:
            if hasattr(component, 'get_statistics'):
                stats = component.get_statistics()
                health["metrics"] = stats
                for key, limit, message in self._COMPONENT_LIMITS.get(component_name, ()):
                    if stats.get(key, 0) > limit:
                        health["issues"].append(message)
                        health["status"] = "warning"
        except Exception as e:
            health["status"] = "error"
            health["issues"].append(f"健康检查失败: {str(e)}")
        return health

    def perform_health_check(self, components: Dict[str, Any]) -> Dict[str, Any]:
        """Check every component plus CPU/memory and return the overall status.

        NOTE: the CPU sample blocks the calling thread for one second.
        """
        self.health_status = self._new_status()

        for name, component in components.items():
            component_health = self.check_component_health(name, component)
            self.health_status["components"][name] = component_health
            self._escalate(component_health["status"])
            self.health_status["issues"].extend(component_health["issues"])

        # Only CPU and memory are needed here, so read them directly instead
        # of taking a full system sample (disk, network, GPU).
        cpu_percent = psutil.cpu_percent(interval=1)
        memory_used_gb = psutil.virtual_memory().used / 1024**3

        if cpu_percent > self.config.max_cpu_usage_percent:
            self.health_status["issues"].append(f"CPU使用率过高: {cpu_percent:.1f}%")
            # Escalate only: a resource warning must not mask a component error.
            self._escalate("warning")
        if memory_used_gb > self.config.max_memory_usage_gb:
            self.health_status["issues"].append(f"内存使用过高: {memory_used_gb:.1f}GB")
            self._escalate("warning")

        return self.health_status
class ProductionDialogSystem:
    """Production-grade streaming dialog system.

    Wires the streaming ASR, LLM and TTS components together, tracks dialog
    sessions, and runs background loops for metrics collection, health checks
    and expired-session cleanup.
    """

    # (component name, start method, stop method), in start-up order.
    _COMPONENT_LIFECYCLE = (
        ("asr", "start_streaming", "stop_streaming"),
        ("llm", "start_engine", "stop_engine"),
        ("tts", "start_synthesis", "stop_synthesis"),
    )

    def __init__(self, config: Optional[ProductionConfig] = None):
        """Create the system (components are built later by ``start_system``).

        Args:
            config: Production configuration; defaults to ``ProductionConfig()``.
        """
        self.config = config or ProductionConfig()

        # Fall back to INFO instead of crashing on an unknown level name.
        logging.basicConfig(
            level=getattr(logging, str(self.config.log_level).upper(), logging.INFO),
            format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
        )

        # Components, keyed by "asr" / "llm" / "tts"
        self.components = {}
        # Names of components whose start method has succeeded
        self._started_components = []
        self.is_running = False

        # Session registry: session_id -> DialogSession
        self.active_sessions = {}
        self.session_lock = threading.RLock()

        # Monitoring helpers
        self.metrics = SystemMetrics()
        self.health_checker = HealthChecker(self.config)

        # asyncio tasks created by _start_background_tasks
        self.background_tasks = []

        self._create_storage_directories()
        logger.info(" 生产级对话系统初始化完成")

    def _create_storage_directories(self):
        """Create the audio, session and metrics storage directories."""
        for directory in (
            self.config.audio_storage_path,
            self.config.session_storage_path,
            self.config.metrics_storage_path,
        ):
            Path(directory).mkdir(parents=True, exist_ok=True)

    async def initialize_components(self):
        """Build the ASR, LLM and TTS components; re-raises any failure."""
        try:
            logger.info(" 初始化系统组件...")

            logger.info(" 初始化ASR组件...")
            asr_engine = ASREngine(model_name="base", device="auto")
            self.components["asr"] = StreamingASRProcessor(asr_engine)

            logger.info(" 初始化LLM组件...")
            self.components["llm"] = StreamingLLMEngine(model_name="gpt2", device="auto")

            logger.info(" 初始化TTS组件...")
            tts_manager = TTSManager()
            self.components["tts"] = StreamingTTSSynthesizer(tts_manager)

            logger.info("✅ 所有组件初始化成功")
        except Exception as e:
            logger.error(f"❌ 组件初始化失败: {e}")
            raise

    async def start_system(self):
        """Initialise and start all components, then the background loops.

        On failure, whatever was already started is stopped again before the
        exception is re-raised.
        """
        if self.is_running:
            logger.warning("⚠️ 系统已在运行")
            return

        try:
            await self.initialize_components()

            for name, start_method, _ in self._COMPONENT_LIFECYCLE:
                getattr(self.components[name], start_method)()
                self._started_components.append(name)

            self.is_running = True
            await self._start_background_tasks()
            logger.info(" 生产对话系统已启动")
        except Exception as e:
            logger.error(f"❌ 系统启动失败: {e}")
            await self.stop_system()
            raise

    async def stop_system(self):
        """Stop background tasks and components, then persist open sessions.

        Also cleans up after a partially failed start-up (components started
        but ``is_running`` never set). Does nothing if there is nothing to stop.
        """
        was_running = self.is_running
        started = list(self._started_components)
        if not (was_running or started or self.background_tasks):
            return

        logger.info(" 正在停止系统...")
        self.is_running = False

        # Cancel the background loops and wait for them to actually finish.
        tasks, self.background_tasks = self.background_tasks, []
        for task in tasks:
            task.cancel()
        if tasks:
            await asyncio.gather(*tasks, return_exceptions=True)

        # Best-effort shutdown: one failing component must not prevent the
        # others from stopping or the sessions from being saved.
        for name, _, stop_method in self._COMPONENT_LIFECYCLE:
            if name not in self.components:
                continue
            if not (was_running or name in started):
                continue
            try:
                getattr(self.components[name], stop_method)()
            except Exception as e:
                logger.error(f"❌ 停止组件 {name} 失败: {e}")
        self._started_components = []

        await self._save_session_data()
        logger.info("✅ 系统已停止")

    async def _start_background_tasks(self):
        """Launch the metrics, health-check and session-cleanup loops."""
        for loop_factory in (
            self._metrics_collection_loop,
            self._health_check_loop,
            self._session_cleanup_loop,
        ):
            self.background_tasks.append(asyncio.create_task(loop_factory()))
        logger.info(" 后台任务已启动")

    async def _metrics_collection_loop(self):
        """Periodically record system, component and session metrics."""
        while self.is_running:
            try:
                # get_system_metrics() blocks for about a second, so keep it
                # off the event loop.
                system_metrics = await asyncio.get_running_loop().run_in_executor(
                    None, self.metrics.get_system_metrics
                )
                for metric_name, value in system_metrics.items():
                    if isinstance(value, (int, float)):
                        self.metrics.record_metric(f"system.{metric_name}", value)

                for component_name, component in self.components.items():
                    if hasattr(component, 'get_statistics'):
                        stats = component.get_statistics()
                        for stat_name, value in stats.items():
                            if isinstance(value, (int, float)):
                                self.metrics.record_metric(f"{component_name}.{stat_name}", value)

                with self.session_lock:
                    active_session_count = len(self.active_sessions)
                self.metrics.record_metric("sessions.active_count", active_session_count)

                await asyncio.sleep(self.config.metrics_collection_interval)
            except asyncio.CancelledError:
                raise
            except Exception as e:
                logger.error(f"❌ 指标收集错误: {e}")
                await asyncio.sleep(5)

    async def _health_check_loop(self):
        """Periodically run the health check and record its outcome."""
        health_scores = {"healthy": 1.0, "warning": 0.5}
        while self.is_running:
            try:
                # NOTE(review): perform_health_check runs synchronously on the
                # event loop; confirm component get_statistics() is
                # thread-safe before moving it to an executor.
                health_status = self.health_checker.perform_health_check(self.components)

                overall = health_status["overall"]
                self.metrics.record_metric("system.health_score", health_scores.get(overall, 0.0))
                self.metrics.record_metric("system.issue_count", len(health_status["issues"]))

                if overall == "error":
                    logger.error(f" 系统健康检查失败: {health_status['issues']}")
                elif health_status["issues"]:
                    logger.warning(f"⚠️ 系统健康警告: {health_status['issues']}")

                await asyncio.sleep(self.config.health_check_interval)
            except asyncio.CancelledError:
                raise
            except Exception as e:
                logger.error(f"❌ 健康检查错误: {e}")
                await asyncio.sleep(10)

    def _is_session_expired(self, session, now: datetime) -> bool:
        """Return True if ``session`` has been idle longer than the timeout."""
        # total_seconds() counts whole days too; .seconds would wrap at 24h.
        idle_seconds = (now - session.last_activity).total_seconds()
        return idle_seconds > self.config.session_timeout

    async def _session_cleanup_loop(self):
        """Once a minute, remove and persist sessions idle past the timeout."""
        while self.is_running:
            try:
                current_time = datetime.now()

                # Remove expired sessions under the lock, save them outside it.
                expired_sessions = []
                with self.session_lock:
                    expired_ids = [
                        session_id
                        for session_id, session in self.active_sessions.items()
                        if self._is_session_expired(session, current_time)
                    ]
                    for session_id in expired_ids:
                        expired_sessions.append((session_id, self.active_sessions.pop(session_id)))

                for session_id, session in expired_sessions:
                    logger.info(f"️ 清理过期会话: {session_id}")
                    await self._save_session(session)

                if expired_sessions:
                    self.metrics.record_metric("sessions.cleanup_count", len(expired_sessions))

                await asyncio.sleep(60)
            except asyncio.CancelledError:
                raise
            except Exception as e:
                logger.error(f"❌ 会话清理错误: {e}")
                await asyncio.sleep(60)

    async def create_session(self, user_id: Optional[str] = None) -> str:
        """Register a new session and return its generated id.

        NOTE(review): ``config.max_concurrent_sessions`` is not enforced here.
        """
        session_id = str(uuid.uuid4())
        session = DialogSession(
            session_id=session_id,
            user_id=user_id
        )
        with self.session_lock:
            self.active_sessions[session_id] = session
        logger.info(f" 创建新会话: {session_id}")
        return session_id

    async def process_audio_stream(
        self,
        session_id: str,
        audio_data: np.ndarray,
        callback: Optional[Callable] = None
    ) -> Dict[str, Any]:
        """Feed audio for a session into the ASR -> LLM -> TTS pipeline.

        The call returns as soon as the audio has been handed to the ASR
        component; the reply is produced later and delivered via ``callback``.

        Args:
            session_id: Id returned by ``create_session``.
            audio_data: Audio samples passed unchanged to the ASR component.
            callback: Optional callable invoked once per completed turn with a
                dict holding ``session_id``, ``user_input``,
                ``assistant_response`` and ``response_time``.

        Returns:
            ``{"session_id", "status": "processing", "timestamp"}``.

        Raises:
            RuntimeError: the system is not running.
            ValueError: ``session_id`` is unknown.
        """
        if not self.is_running:
            raise RuntimeError("系统未运行")

        with self.session_lock:
            if session_id not in self.active_sessions:
                raise ValueError(f"会话不存在: {session_id}")
            session = self.active_sessions[session_id]
            session.last_activity = datetime.now()
            session.current_state = "listening"

        loop = asyncio.get_running_loop()
        start_time = time.time()

        async def on_asr_result(text: str):
            """Run the LLM on a final transcript and stream tokens to TTS."""
            if not text.strip():
                return
            session.current_state = "processing"

            messages = [
                ChatMessage(role="system", content="你是一个有帮助的AI助手。"),
                ChatMessage(role="user", content=text)
            ]
            reply_tokens = []

            def on_llm_token(response: StreamingResponse):
                if not response.is_final:
                    reply_tokens.append(response.token)
                    # Stream each token straight into TTS synthesis.
                    self.components["tts"].add_text(response.token)
                    return

                full_response = "".join(reply_tokens)
                session.current_state = "speaking"
                session.conversation_history.append({
                    "user": text,
                    "assistant": full_response,
                    "timestamp": datetime.now().isoformat()
                })
                session.total_turns += 1

                response_time = time.time() - start_time
                session.response_times.append(response_time)
                session.avg_response_time = float(np.mean(session.response_times))

                self.metrics.record_metric("dialog.response_time", response_time)
                self.metrics.record_metric("dialog.total_turns", 1)

                if callback:
                    callback({
                        "session_id": session_id,
                        "user_input": text,
                        "assistant_response": full_response,
                        "response_time": response_time
                    })

            self.components["llm"].generate_stream(
                messages=messages,
                callback=on_llm_token
            )

        def report_pipeline_failure(future):
            """Surface errors from the scheduled coroutine instead of losing them."""
            if future.cancelled():
                return
            error = future.exception()
            if error is not None:
                logger.error(f"❌ 音频流处理失败: {error}")
                session.current_state = "error"

        def asr_callback(result):
            if result.get("type") == "final":
                # The ASR component may call back from a worker thread, where
                # asyncio.create_task() has no running loop; schedule onto the
                # caller's loop in a thread-safe way instead.
                pending = asyncio.run_coroutine_threadsafe(
                    on_asr_result(result["text"]), loop
                )
                pending.add_done_callback(report_pipeline_failure)

        try:
            asr = self.components["asr"]
            # Register before feeding audio so a fast final result is not missed.
            # NOTE(review): a callback is added on every call and never
            # removed; confirm whether the ASR component offers a way to
            # unregister or scope callbacks per session.
            asr.add_callback("on_final_result", asr_callback)
            asr.add_audio_chunk(audio_data)
        except Exception as e:
            logger.error(f"❌ 音频流处理失败: {e}")
            session.current_state = "error"
            raise

        # NOTE(review): state returns to "idle" here although the pipeline is
        # still running asynchronously; kept as-is for compatibility.
        session.current_state = "idle"
        return {
            "session_id": session_id,
            "status": "processing",
            "timestamp": time.time()
        }

    async def get_session_status(self, session_id: str) -> Dict[str, Any]:
        """Return a summary of one session.

        Raises:
            ValueError: ``session_id`` is unknown.
        """
        with self.session_lock:
            if session_id not in self.active_sessions:
                raise ValueError(f"会话不存在: {session_id}")
            session = self.active_sessions[session_id]
            return {
                "session_id": session_id,
                "user_id": session.user_id,
                "is_active": session.is_active,
                "current_state": session.current_state,
                "total_turns": session.total_turns,
                "avg_response_time": session.avg_response_time,
                "last_activity": session.last_activity.isoformat(),
                "conversation_length": len(session.conversation_history)
            }

    async def get_system_status(self) -> Dict[str, Any]:
        """Return run state, session counts, component stats, metrics and health."""
        status = {
            "is_running": self.is_running,
            "timestamp": datetime.now().isoformat(),
            "uptime": time.time() - self.metrics.start_time if self.is_running else 0
        }

        with self.session_lock:
            status["sessions"] = {
                "active_count": len(self.active_sessions),
                # Simplified: only currently active sessions are counted.
                "total_sessions": len(self.active_sessions),
            }

        status["components"] = {
            name: component.get_statistics()
            for name, component in self.components.items()
            if hasattr(component, 'get_statistics')
        }

        # The system sample blocks for about a second; keep it off the loop.
        status["system_metrics"] = await asyncio.get_running_loop().run_in_executor(
            None, self.metrics.get_system_metrics
        )

        # Result of the most recent health check
        status["health"] = self.health_checker.health_status

        performance_metrics = {}
        for metric_name in ["dialog.response_time", "system.cpu_percent", "system.memory_percent"]:
            stats = self.metrics.get_metric_stats(metric_name)
            if stats:
                performance_metrics[metric_name] = stats
        status["performance"] = performance_metrics

        return status

    async def _save_session(self, session: DialogSession):
        """Write one session to ``<session_storage_path>/<session_id>.json``.

        Failures are logged, not raised.
        """
        try:
            session_file = Path(self.config.session_storage_path) / f"{session.session_id}.json"
            session_data = asdict(session)
            # datetime objects are not JSON serialisable
            session_data["created_at"] = session.created_at.isoformat()
            session_data["last_activity"] = session.last_activity.isoformat()
            with open(session_file, 'w', encoding='utf-8') as f:
                json.dump(session_data, f, ensure_ascii=False, indent=2)
        except Exception as e:
            logger.error(f"❌ 保存会话数据失败: {e}")

    async def _save_session_data(self):
        """Persist every active session."""
        # Snapshot under the lock, write outside it.
        with self.session_lock:
            sessions = list(self.active_sessions.values())
        for session in sessions:
            await self._save_session(session)
# Usage example and smoke test
if __name__ == "__main__":
    print(" 生产级流式对话系统测试")
    print("=" * 50)

    async def main():
        """Start the system, push one synthetic audio clip through it and
        print the resulting system status."""
        demo_config = ProductionConfig(
            max_concurrent_sessions=10,
            session_timeout=300,
            metrics_collection_interval=5
        )
        dialog_system = ProductionDialogSystem(demo_config)

        try:
            print(" 启动系统...")
            await dialog_system.start_system()

            session_id = await dialog_system.create_session(user_id="test_user")
            print(f" 创建会话: {session_id}")

            # One second of random samples stands in for microphone input.
            print(" 模拟音频输入...")
            test_audio = np.random.randn(16000)

            result_callback_called = False

            def on_result(result):
                nonlocal result_callback_called
                result_callback_called = True
                print(f"✅ 对话结果: {result}")

            await dialog_system.process_audio_stream(
                session_id=session_id,
                audio_data=test_audio,
                callback=on_result
            )

            # Give the asynchronous pipeline time to finish.
            print("⏳ 等待处理完成...")
            await asyncio.sleep(10)

            status = await dialog_system.get_system_status()
            print(f"\n 系统状态:")
            print(f" 运行状态: {status['is_running']}")
            print(f" 运行时间: {status['uptime']:.1f}s")
            print(f" 活跃会话: {status['sessions']['active_count']}")
            print(f" 健康状态: {status['health']['overall']}")

            performance = status['performance']
            if performance:
                print(f"\n 性能指标:")
                for metric, stats in performance.items():
                    print(f" {metric}: 当前={stats.get('current', 0):.3f}, "
                          f"平均={stats.get('avg', 0):.3f}")

            print(f"\n 组件状态:")
            for component, component_status in status['components'].items():
                print(f" {component}: {len(component_status)} 个指标")

        except Exception as e:
            print(f"❌ 测试失败: {e}")
            import traceback
            traceback.print_exc()
        finally:
            print(" 停止系统...")
            await dialog_system.stop_system()

    # Run the demo
    asyncio.run(main())
    print(" 生产级流式对话系统测试完成!")
最后,让我们创建一个完整的性能测试和监控系统!这就像给我们的AI助手做全面体检,确保它在各种情况下都能稳定高效地工作。
# performance_testing.py - 性能测试与监控系统
import asyncio
import aiohttp
import time
import json
import logging
import threading
import multiprocessing
from typing import Dict, List, Any, Optional, Callable
from dataclasses import dataclass, field
from datetime import datetime, timedelta
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed
import psutil
import socket
import requests
from pathlib import Path
# 导入系统组件
from production_dialog_system import ProductionDialogSystem, ProductionConfig
logger = logging.getLogger(__name__)
@dataclass
class TestConfig:
    """Parameters for load tests, latency targets and resource monitoring."""

    # --- Load shape ---
    max_concurrent_users: int = 50
    duration_seconds: int = 300  # 5 minutes
    ramp_up_seconds: int = 60  # 1 minute

    # --- Latency ---
    target_response_time: float = 3.0  # target response time
    latency_percentiles: List[float] = field(
        default_factory=lambda: [50, 90, 95, 99]
    )

    # --- Synthetic audio ---
    audio_duration: float = 5.0  # length of each generated clip
    sample_rate: int = 16000

    # --- Result storage ---
    results_dir: str = "./test_results"
    save_detailed_logs: bool = True

    # --- Monitoring ---
    monitoring_interval: float = 1.0  # delay between monitor samples
    alert_thresholds: Dict[str, float] = field(
        default_factory=lambda: {
            "cpu_percent": 80.0,
            "memory_percent": 80.0,
            "response_time": 5.0,
            "error_rate": 0.05
        }
    )
@dataclass
class TestResult:
    """Outcome of one test run: counts, latency, throughput and resources."""

    test_id: str
    test_type: str
    start_time: datetime
    end_time: datetime
    duration: float

    # --- Request counts ---
    total_requests: int = 0
    successful_requests: int = 0
    failed_requests: int = 0
    error_rate: float = 0.0

    # --- Latency ---
    response_times: List[float] = field(default_factory=list)
    avg_response_time: float = 0.0
    min_response_time: float = 0.0
    max_response_time: float = 0.0
    percentiles: Dict[str, float] = field(default_factory=dict)

    # --- Throughput ---
    requests_per_second: float = 0.0

    # --- Host resources ---
    avg_cpu_usage: float = 0.0
    avg_memory_usage: float = 0.0
    max_cpu_usage: float = 0.0
    max_memory_usage: float = 0.0

    # --- Error records ---
    errors: List[Dict[str, Any]] = field(default_factory=list)

    # --- Raw monitoring series, keyed by metric name ---
    detailed_metrics: Dict[str, List[float]] = field(default_factory=dict)
class PerformanceMonitor:
    """Samples host (and optionally GPU) resource usage on a background thread."""

    _BASE_METRICS = (
        "timestamps",
        "cpu_percent",
        "memory_percent",
        "memory_used_gb",
        "network_bytes_sent",
        "network_bytes_recv",
        "disk_io_read",
        "disk_io_write",
    )
    _GPU_METRICS = (
        "gpu_memory_allocated",
        "gpu_memory_reserved",
        "gpu_utilization",
    )

    def __init__(self, config: TestConfig):
        self.config = config
        self.is_monitoring = False
        self.monitoring_thread = None

        # metric name -> list of samples; all lists grow in lock-step
        self.metrics = {name: [] for name in self._BASE_METRICS}

        # NVML device handle, created on first use
        self._nvml_handle = None

        # GPU series are only tracked when CUDA is available.
        self.gpu_available = False
        try:
            import torch
            if torch.cuda.is_available():
                self.gpu_available = True
                self.metrics.update({name: [] for name in self._GPU_METRICS})
        except ImportError:
            pass

    def start_monitoring(self):
        """Start the sampling thread; does nothing if already running."""
        if self.is_monitoring:
            return
        self.is_monitoring = True
        self.monitoring_thread = threading.Thread(
            target=self._monitoring_loop,
            daemon=True
        )
        self.monitoring_thread.start()
        logger.info(" 性能监控已启动")

    def stop_monitoring(self):
        """Ask the sampling thread to stop and wait up to 5 seconds for it."""
        self.is_monitoring = False
        if self.monitoring_thread:
            self.monitoring_thread.join(timeout=5)
        logger.info(" 性能监控已停止")

    def _monitoring_loop(self):
        """Sample until ``is_monitoring`` is cleared; errors are logged."""
        while self.is_monitoring:
            try:
                sample = self._collect_sample()
                self._record_sample(sample)
                self._check_alerts(sample["cpu_percent"], sample["memory_percent"])
                time.sleep(self.config.monitoring_interval)
            except Exception as e:
                logger.error(f"❌ 监控错误: {e}")
                time.sleep(1)

    def _collect_sample(self) -> Dict[str, float]:
        """Read one value for every tracked metric without touching the series."""
        memory = psutil.virtual_memory()
        network = psutil.net_io_counters()
        disk = psutil.disk_io_counters()  # may be None on some systems

        sample = {
            "timestamps": time.time(),
            "cpu_percent": psutil.cpu_percent(interval=None),
            "memory_percent": memory.percent,
            "memory_used_gb": memory.used / 1024**3,
            "network_bytes_sent": network.bytes_sent if network is not None else 0,
            "network_bytes_recv": network.bytes_recv if network is not None else 0,
            "disk_io_read": disk.read_bytes if disk is not None else 0,
            "disk_io_write": disk.write_bytes if disk is not None else 0,
        }
        if self.gpu_available:
            sample.update(self._collect_gpu_sample())
        return sample

    def _collect_gpu_sample(self) -> Dict[str, float]:
        """Read GPU memory/utilisation; falls back to zeros on any failure."""
        gpu = {name: 0 for name in self._GPU_METRICS}
        try:
            import torch
            gpu["gpu_memory_allocated"] = torch.cuda.memory_allocated() / 1024**3
            gpu["gpu_memory_reserved"] = torch.cuda.memory_reserved() / 1024**3
            gpu["gpu_utilization"] = self._read_gpu_utilization()
        except Exception as e:
            logger.warning(f"GPU监控错误: {e}")
        return gpu

    def _read_gpu_utilization(self):
        """Return GPU 0 utilisation via pynvml, or 0 if pynvml is not installed."""
        try:
            import pynvml
        except ImportError:
            return 0
        # Initialise NVML once and reuse the handle.
        if self._nvml_handle is None:
            pynvml.nvmlInit()
            self._nvml_handle = pynvml.nvmlDeviceGetHandleByIndex(0)
        return pynvml.nvmlDeviceGetUtilizationRates(self._nvml_handle).gpu

    def _record_sample(self, sample: Dict[str, float]) -> None:
        """Append one complete sample so all series stay the same length."""
        # Resolve every value first: a missing key fails before any append.
        values = [(series, sample[name]) for name, series in self.metrics.items()]
        for series, value in values:
            series.append(value)

    def _check_alerts(self, cpu_percent: float, memory_percent: float):
        """Log a warning when CPU or memory exceeds its configured threshold."""
        alerts = []
        if cpu_percent > self.config.alert_thresholds["cpu_percent"]:
            alerts.append(f"CPU使用率过高: {cpu_percent:.1f}%")
        if memory_percent > self.config.alert_thresholds["memory_percent"]:
            alerts.append(f"内存使用率过高: {memory_percent:.1f}%")
        if alerts:
            logger.warning(f" 性能告警: {', '.join(alerts)}")

    def get_summary_stats(self) -> Dict[str, Any]:
        """Return avg/min/max/std per metric plus duration and sample count.

        Returns ``{}`` when nothing has been sampled yet.
        """
        timestamps = self.metrics["timestamps"]
        if not timestamps:
            return {}

        stats = {}
        for metric_name, values in self.metrics.items():
            if metric_name == "timestamps" or not values:
                continue
            # Plain floats keep the summary JSON-serialisable.
            stats[metric_name] = {
                "avg": float(np.mean(values)),
                "min": float(np.min(values)),
                "max": float(np.max(values)),
                "std": float(np.std(values))
            }

        if len(timestamps) > 1:
            stats["monitoring_duration"] = timestamps[-1] - timestamps[0]
            stats["data_points"] = len(timestamps)
        return stats
class LoadTester:
    """Drives simulated concurrent users against a dialog system."""

    def __init__(self, config: TestConfig):
        self.config = config
        self.results = []
        self.test_session = None

    async def run_load_test(
        self,
        dialog_system: ProductionDialogSystem,
        test_name: str = "load_test"
    ) -> TestResult:
        """Run one load test and return its aggregated result.

        ``config.max_concurrent_users`` simulated sessions are started, spread
        evenly over ``config.ramp_up_seconds``. Resource usage is sampled for
        the duration of the test. Errors are recorded in the result rather
        than raised.
        """
        logger.info(f" 开始负载测试: {test_name}")

        test_result = TestResult(
            test_id=f"{test_name}_{int(time.time())}",
            test_type="load_test",
            start_time=datetime.now(),
            end_time=datetime.now(),
            duration=0.0
        )

        monitor = PerformanceMonitor(self.config)
        monitor.start_monitoring()

        try:
            start_time = time.time()

            # Stagger the start of each user so load ramps up gradually.
            user_count = self.config.max_concurrent_users
            tasks = [
                asyncio.create_task(
                    self._simulate_user_session(
                        dialog_system,
                        i,
                        (i / user_count) * self.config.ramp_up_seconds,
                    )
                )
                for i in range(user_count)
            ]
            results = await asyncio.gather(*tasks, return_exceptions=True)

            test_result.duration = time.time() - start_time
            test_result.end_time = datetime.now()
            self._aggregate_results(test_result, results)
        except Exception as e:
            logger.error(f"❌ 负载测试失败: {e}")
            test_result.errors.append({
                "error": str(e),
                "timestamp": time.time()
            })
        finally:
            monitor.stop_monitoring()
            resource_stats = monitor.get_summary_stats()
            if resource_stats:
                cpu_stats = resource_stats.get("cpu_percent", {})
                memory_stats = resource_stats.get("memory_percent", {})
                test_result.avg_cpu_usage = cpu_stats.get("avg", 0)
                test_result.avg_memory_usage = memory_stats.get("avg", 0)
                test_result.max_cpu_usage = cpu_stats.get("max", 0)
                test_result.max_memory_usage = memory_stats.get("max", 0)
                test_result.detailed_metrics = monitor.metrics

        logger.info(f"✅ 负载测试完成: {test_result.test_id}")
        return test_result

    def _aggregate_results(self, test_result, results) -> None:
        """Fold per-user outcomes into ``test_result``.

        Each item of ``results`` is either an exception or the dict returned
        by ``_simulate_user_session``. A dict with ``"success": False`` counts
        as a failure, not a success.
        """
        for i, result in enumerate(results):
            # gather(return_exceptions=True) may also yield CancelledError,
            # which is a BaseException on newer Python versions.
            if isinstance(result, BaseException):
                test_result.failed_requests += 1
                test_result.errors.append({
                    "user_id": i,
                    "error": str(result),
                    "timestamp": time.time()
                })
            elif not result.get("success", True):
                test_result.failed_requests += 1
                test_result.errors.append({
                    "user_id": result.get("user_id", i),
                    "error": result.get("error", "Unknown error"),
                    "timestamp": time.time()
                })
            else:
                test_result.successful_requests += 1
                response_time = result.get("response_time")
                if response_time is not None:
                    test_result.response_times.append(response_time)

        total = len(results)
        test_result.total_requests = total
        # Guard the ratios so an empty or instantaneous run cannot divide by zero.
        test_result.error_rate = test_result.failed_requests / total if total else 0.0

        if test_result.response_times:
            times = test_result.response_times
            test_result.avg_response_time = float(np.mean(times))
            test_result.min_response_time = float(np.min(times))
            test_result.max_response_time = float(np.max(times))
            for p in self.config.latency_percentiles:
                test_result.percentiles[f"p{p}"] = float(np.percentile(times, p))

        test_result.requests_per_second = (
            total / test_result.duration if test_result.duration > 0 else 0.0
        )

    async def _simulate_user_session(
        self,
        dialog_system: ProductionDialogSystem,
        user_id: int,
        delay: float
    ) -> Dict[str, Any]:
        """Simulate one user: create a session, send audio, await the reply.

        Returns a dict with ``success`` True plus timing data, or ``success``
        False plus an ``error`` message; exceptions are logged, not raised.
        """
        await asyncio.sleep(delay)
        try:
            session_id = await dialog_system.create_session(user_id=f"test_user_{user_id}")
            audio_data = self._generate_test_audio()
            start_time = time.time()

            result_received = False
            result_data = {}

            def on_result(result):
                nonlocal result_received, result_data
                result_received = True
                result_data = result

            await dialog_system.process_audio_stream(
                session_id=session_id,
                audio_data=audio_data,
                callback=on_result
            )

            # Poll for the callback (it may fire from another thread); give
            # up after 30 seconds of wall-clock time.
            deadline = time.monotonic() + 30
            while not result_received and time.monotonic() < deadline:
                await asyncio.sleep(0.1)
            if not result_received:
                raise TimeoutError(f"用户 {user_id} 响应超时")

            return {
                "user_id": user_id,
                "session_id": session_id,
                "response_time": time.time() - start_time,
                "success": True,
                "result": result_data
            }
        except Exception as e:
            logger.error(f"❌ 用户 {user_id} 会话失败: {e}")
            return {
                "user_id": user_id,
                "error": str(e),
                "success": False
            }

    def _generate_test_audio(self) -> np.ndarray:
        """Return a peak-normalised float32 test signal.

        The signal is four decaying sine tones plus Gaussian noise, lasting
        ``config.audio_duration`` seconds at ``config.sample_rate``. A
        non-positive length yields an empty array.
        """
        sample_rate = self.config.sample_rate
        sample_count = int(sample_rate * self.config.audio_duration)
        if sample_count <= 0:
            return np.zeros(0, dtype=np.float32)

        # Sample instants spaced exactly 1/sample_rate apart.
        t = np.arange(sample_count) / sample_rate

        signal = np.zeros_like(t)
        for freq in (220, 440, 880, 1760):
            signal += 0.25 * np.sin(2 * np.pi * freq * t) * np.exp(-t / 2)
        signal += 0.1 * np.random.randn(len(t))

        # Normalise to a peak of 1, skipping an all-zero signal.
        peak = np.max(np.abs(signal))
        if peak > 0:
            signal = signal / peak
        return signal.astype(np.float32)
class PerformanceReporter:
"""性能报告生成器"""
def __init__(self, config: TestConfig):
self.config = config
self.results_dir = Path(config.results_dir)
self.results_dir.mkdir(parents=True, exist_ok=True)
def save_test_result(self, test_result: TestResult):
"""保存测试结果"""
# 保存JSON格式的详细结果
result_file = self.results_dir / f"{test_result.test_id}.json"
# 准备可序列化的数据
result_data = {
"test_id": test_result.test_id,
"test_type": test_result.test_type,
"start_time": test_result.start_time.isoformat(),
"end_time": test_result.end_time.isoformat(),
"duration": test_result.duration,
"total_requests": test_result.total_requests,
"successful_requests": test_result.successful_requests,
"failed_requests": test_result.failed_requests,
"error_rate": test_result.error_rate,
"avg_response_time": test_result.avg_response_time,
"min_response_time": test_result.min_response_time,
"max_response_time": test_result.max_response_time,
"percentiles": test_result.percentiles,
"requests_per_second": test_result.requests_per_second,
"avg_cpu_usage": test_result.avg_cpu_usage,
"avg_memory_usage": test_result.avg_memory_usage,
"max_cpu_usage": test_result.max_cpu_usage,
"max_memory_usage": test_result.max_memory_usage,
"errors": test_result.errors
}
# 如果启用详细日志,保存响应时间和监控数据
if self.config.save_detailed_logs:
result_data["response_times"] = test_result.response_times
result_data["detailed_metrics"] = test_result.detailed_metrics
with open(result_file, 'w', encoding='utf-8') as f:
json.dump(result_data, f, indent=2, ensure_ascii=False)
logger.info(f" 测试结果已保存: {result_file}")
def generate_performance_report(self, test_result: TestResult) -> str:
"""生成性能报告"""
report = f"""
# 性能测试报告
## 测试概览
- **测试ID**: {test_result.test_id}
- **测试类型**: {test_result.test_type}
- **开始时间**: {test_result.start_time.strftime('%Y-%m-%d %H:%M:%S')}
- **结束时间**: {test_result.end_time.strftime('%Y-%m-%d %H:%M:%S')}
- **测试时长**: {test_result.duration:.2f} 秒
## 请求统计
- **总请求数**: {test_result.total_requests}
- **成功请求**: {test_result.successful_requests}
- **失败请求**: {test_result.failed_requests}
- **错误率**: {test_result.error_rate:.2%}
- **吞吐量**: {test_result.requests_per_second:.2f} 请求/秒
## 响应时间分析
- **平均响应时间**: {test_result.avg_response_time:.3f} 秒
- **最小响应时间**: {test_result.min_response_time:.3f} 秒
- **最大响应时间**: {test_result.max_response_time:.3f} 秒
### 响应时间百分位数
"""
for percentile, value in test_result.percentiles.items():
report += f"- **{percentile}**: {value:.3f} 秒\n"
report += f"""
## 系统资源使用
- **平均CPU使用率**: {test_result.avg_cpu_usage:.1f}%
- **最大CPU使用率**: {test_result.max_cpu_usage:.1f}%
- **平均内存使用率**: {test_result.avg_memory_usage:.1f}%
- **最大内存使用率**: {test_result.max_memory_usage:.1f}%
## 性能评估
"""
# 性能评估
if test_result.error_rate > 0.05:
report += "⚠️ **错误率过高**: 错误率超过5%,需要优化系统稳定性\n"
else:
report += "✅ **错误率正常**: 错误率在可接受范围内\n"
if test_result.avg_response_time > 3.0:
report += "⚠️ **响应时间过长**: 平均响应时间超过3秒,需要优化性能\n"
else:
report += "✅ **响应时间良好**: 平均响应时间在可接受范围内\n"
if test_result.max_cpu_usage > 80:
report += "⚠️ **CPU使用率过高**: 最大CPU使用率超过80%,可能出现性能瓶颈\n"
else:
report += "✅ **CPU使用率正常**: CPU使用率在正常范围内\n"
if test_result.max_memory_usage > 80:
report += "⚠️ **内存使用率过高**: 最大内存使用率超过80%,可能出现内存不足\n"
else:
report += "✅ **内存使用率正常**: 内存使用率在正常范围内\n"
# 错误详情
if test_result.errors:
report += f"\n## 错误详情\n"
for i, error in enumerate(test_result.errors[:10]): # 只显示前10个错误
report += f"- **错误 {i+1}**: {error.get('error', 'Unknown error')}\n"
if len(test_result.errors) > 10:
report += f"- ... 还有 {len(test_result.errors) - 10} 个错误\n"
return report
def save_performance_charts(self, test_result: TestResult):
    """Save the performance charts of a test run as PNG files.

    Writes ``response_times.png`` (when response times were recorded) and
    ``system_resources.png`` (when resource samples with timestamps exist)
    into ``<results_dir>/<test_id>_charts``. Does nothing when
    ``test_result.detailed_metrics`` is empty.

    Args:
        test_result: Result object providing ``test_id``,
            ``response_times`` and the ``detailed_metrics`` mapping.
    """
    metrics = test_result.detailed_metrics
    if not metrics:
        return

    # parents=True so a missing results directory does not abort charting.
    charts_dir = self.results_dir / f"{test_result.test_id}_charts"
    charts_dir.mkdir(parents=True, exist_ok=True)

    if test_result.response_times:
        self._plot_response_times(test_result.response_times, charts_dir)

    timestamps = metrics['timestamps'] if 'timestamps' in metrics else None
    if timestamps:
        self._plot_system_resources(metrics, timestamps, charts_dir)

    logger.info(f" 性能图表已保存: {charts_dir}")

def _plot_response_times(self, response_times, charts_dir):
    """Draw the response-time histogram and per-request trend side by side."""
    plt.figure(figsize=(12, 6))

    # Left: distribution of response times.
    plt.subplot(1, 2, 1)
    plt.hist(response_times, bins=50, alpha=0.7, edgecolor='black')
    plt.xlabel('响应时间 (秒)')
    plt.ylabel('频次')
    plt.title('响应时间分布')
    plt.grid(True, alpha=0.3)

    # Right: response time per request, in recording order.
    plt.subplot(1, 2, 2)
    plt.plot(response_times, marker='o', markersize=2)
    plt.xlabel('请求序号')
    plt.ylabel('响应时间 (秒)')
    plt.title('响应时间趋势')
    plt.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.savefig(charts_dir / 'response_times.png', dpi=300, bbox_inches='tight')
    plt.close()

def _plot_system_resources(self, metrics, timestamps, charts_dir):
    """Draw the 2x2 CPU / memory / network / GPU chart over relative time."""
    # Relative time since the first sample.
    # NOTE(review): the rate computation assumes numeric timestamps in
    # seconds (e.g. time.time()) - confirm against the metrics collector.
    start_time = timestamps[0]
    relative_times = [t - start_time for t in timestamps]

    plt.figure(figsize=(15, 10))

    # CPU usage.
    plt.subplot(2, 2, 1)
    if 'cpu_percent' in metrics:
        plt.plot(relative_times, metrics['cpu_percent'])
        plt.xlabel('时间 (秒)')
        plt.ylabel('CPU使用率 (%)')
        plt.title('CPU使用率')
        plt.grid(True, alpha=0.3)

    # Memory usage.
    plt.subplot(2, 2, 2)
    if 'memory_percent' in metrics:
        plt.plot(relative_times, metrics['memory_percent'])
        plt.xlabel('时间 (秒)')
        plt.ylabel('内存使用率 (%)')
        plt.title('内存使用率')
        plt.grid(True, alpha=0.3)

    # Network I/O: cumulative byte counters converted to KB/s.
    plt.subplot(2, 2, 3)
    if 'network_bytes_sent' in metrics and 'network_bytes_recv' in metrics:
        sent = np.array(metrics['network_bytes_sent'])
        recv = np.array(metrics['network_bytes_recv'])
        if len(sent) > 1:
            elapsed = np.diff(np.asarray(relative_times, dtype=float))
            # Two samples with the same timestamp would divide by zero;
            # keep only intervals in which time actually advanced.
            advancing = elapsed > 0
            if advancing.any():
                tick_times = np.asarray(relative_times[1:], dtype=float)[advancing]
                sent_rate = np.diff(sent)[advancing] / elapsed[advancing]
                recv_rate = np.diff(recv)[advancing] / elapsed[advancing]
                plt.plot(tick_times, sent_rate / 1024, label='发送')
                plt.plot(tick_times, recv_rate / 1024, label='接收')
                # Only request a legend when there is something to label.
                plt.legend()
        plt.xlabel('时间 (秒)')
        plt.ylabel('网络速率 (KB/s)')
        plt.title('网络I/O')
        plt.grid(True, alpha=0.3)

    # GPU memory, or a placeholder when no GPU samples were collected.
    plt.subplot(2, 2, 4)
    if 'gpu_memory_allocated' in metrics:
        plt.plot(relative_times, metrics['gpu_memory_allocated'], label='已分配')
        if 'gpu_memory_reserved' in metrics:
            plt.plot(relative_times, metrics['gpu_memory_reserved'], label='已预留')
        plt.xlabel('时间 (秒)')
        plt.ylabel('GPU内存 (GB)')
        plt.title('GPU内存使用')
        plt.legend()
        plt.grid(True, alpha=0.3)
    else:
        plt.text(0.5, 0.5, 'GPU不可用', ha='center', va='center',
                 transform=plt.gca().transAxes)
        plt.title('GPU使用率')

    plt.tight_layout()
    plt.savefig(charts_dir / 'system_resources.png', dpi=300, bbox_inches='tight')
    plt.close()
# 完整的性能测试主程序
async def run_comprehensive_performance_test():
    """Run the end-to-end load test and write its report and charts.

    Starts a ``ProductionDialogSystem``, drives it with ``LoadTester``,
    saves the raw result, a Markdown report and the charts through
    ``PerformanceReporter``, prints a summary, and always stops the
    dialog system afterwards. Failures are printed with a traceback
    rather than propagated.
    """
    print(" 综合性能测试开始")
    print("=" * 60)

    # Test configuration, scaled down for a test environment.
    test_config = TestConfig(
        max_concurrent_users=10,   # fewer concurrent users than production
        duration_seconds=120,      # 2-minute run
        ramp_up_seconds=30,        # 30-second ramp-up
        results_dir="./performance_results"
    )

    # Production system configuration.
    prod_config = ProductionConfig(
        max_concurrent_sessions=20,
        metrics_collection_interval=1
    )

    dialog_system = ProductionDialogSystem(prod_config)
    load_tester = LoadTester(test_config)
    reporter = PerformanceReporter(test_config)

    try:
        print(" 启动对话系统...")
        await dialog_system.start_system()

        # Give the system a moment to settle before applying load.
        await asyncio.sleep(5)

        print(" 开始负载测试...")
        test_result = await load_tester.run_load_test(dialog_system, "comprehensive_test")

        print(" 生成性能报告...")
        reporter.save_test_result(test_result)
        report_text = reporter.generate_performance_report(test_result)

        # Make sure the target directory exists before writing the report.
        report_dir = Path(test_config.results_dir)
        report_dir.mkdir(parents=True, exist_ok=True)
        report_file = report_dir / f"{test_result.test_id}_report.md"
        with open(report_file, 'w', encoding='utf-8') as f:
            f.write(report_text)

        print(" 生成性能图表...")
        reporter.save_performance_charts(test_result)

        # A run that issued no requests must not crash the summary.
        total_requests = test_result.total_requests
        if total_requests:
            success_percent = (test_result.successful_requests / total_requests) * 100
        else:
            success_percent = 0.0

        print(f"\n 测试结果摘要:")
        print(f" 测试ID: {test_result.test_id}")
        print(f" 总请求数: {total_requests}")
        print(f" 成功率: {success_percent:.1f}%")
        print(f" 平均响应时间: {test_result.avg_response_time:.3f}s")
        print(f" P95响应时间: {test_result.percentiles.get('p95', 0):.3f}s")
        print(f" 吞吐量: {test_result.requests_per_second:.2f} 请求/秒")
        print(f" 平均CPU使用率: {test_result.avg_cpu_usage:.1f}%")
        print(f" 平均内存使用率: {test_result.avg_memory_usage:.1f}%")
        if test_result.errors:
            print(f" 错误数: {len(test_result.errors)}")
        print(f"\n 详细报告: {report_file}")
    except Exception as e:
        # Top-level boundary of the script: report the failure and fall
        # through to the cleanup below.
        print(f"❌ 性能测试失败: {e}")
        import traceback
        traceback.print_exc()
    finally:
        print(" 停止对话系统...")
        await dialog_system.stop_system()
        print(" 综合性能测试完成!")
if __name__ == "__main__":
    # Script entry point: run the comprehensive performance test.
    asyncio.run(run_comprehensive_performance_test())
让我们通过一个完整的部署示例来展示如何在实际生产环境中部署和优化我们的流式对话系统。这就像把我们精心打造的AI助手从实验室搬到真实世界中!
# 端到端对话系统部署优化实践指南
## 部署架构最佳实践
### 1. 容器化部署方案
#### Docker配置文件
```dockerfile
# Dockerfile
FROM python:3.9-slim

# Install system dependencies.
# curl is needed by the container health check
# (`curl -f http://localhost:8000/health`); the slim base image does not ship it.
RUN apt-get update && apt-get install -y \
        curl \
        portaudio19-dev \
        ffmpeg \
        libsndfile1 \
    && rm -rf /var/lib/apt/lists/*

# Set the working directory
WORKDIR /app

# Install Python dependencies first so this layer is cached across code changes
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy the application code
COPY . .

# Expose the service port
EXPOSE 8000

# Start command
CMD ["python", "production_dialog_system.py"]
```
# docker-compose.yml
version: '3.8'

services:
  # The dialog system itself, built from the local Dockerfile.
  dialog-system:
    build: .
    ports:
      - "8000:8000"
    environment:
      - CUDA_VISIBLE_DEVICES=0
      - MODEL_CACHE_DIR=/models
      - LOG_LEVEL=INFO
    volumes:
      - ./models:/models
      - ./logs:/app/logs
      - ./data:/app/data
    deploy:
      resources:
        reservations:
          devices:
            # Reserve one NVIDIA GPU for the container.
            - driver: nvidia
              count: 1
              capabilities:
                - gpu
    restart: unless-stopped
    healthcheck:
      test:
        - "CMD"
        - "curl"
        - "-f"
        - "http://localhost:8000/health"
      interval: 30s
      timeout: 10s
      retries: 3

  # Redis, persisted in a named volume.
  redis:
    image: redis:7-alpine
    ports:
      - "6379:6379"
    volumes:
      - redis_data:/data
    restart: unless-stopped

  # nginx reverse proxy in front of the dialog system.
  nginx:
    image: nginx:alpine
    ports:
      - "80:80"
      - "443:443"
    volumes:
      - ./nginx.conf:/etc/nginx/nginx.conf
      - ./ssl:/etc/nginx/ssl
    depends_on:
      - dialog-system
    restart: unless-stopped

volumes:
  redis_data:
# k8s-deployment.yaml
# Deployment: three replicas of the dialog system, one GPU each.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: dialog-system
  labels:
    app: dialog-system
spec:
  replicas: 3
  selector:
    matchLabels:
      app: dialog-system
  template:
    metadata:
      labels:
        app: dialog-system
    spec:
      containers:
        - name: dialog-system
          image: dialog-system:latest
          ports:
            - containerPort: 8000
          env:
            - name: MODEL_CACHE_DIR
              value: "/models"
            - name: REDIS_URL
              value: "redis://redis-service:6379"
          resources:
            requests:
              memory: "4Gi"
              cpu: "2"
              nvidia.com/gpu: 1
            limits:
              memory: "8Gi"
              cpu: "4"
              nvidia.com/gpu: 1
          volumeMounts:
            - name: model-storage
              mountPath: /models
          # Restart the container when /health stops answering.
          livenessProbe:
            httpGet:
              path: /health
              port: 8000
            initialDelaySeconds: 30
            periodSeconds: 10
          # Only route traffic to the pod once /ready answers.
          readinessProbe:
            httpGet:
              path: /ready
              port: 8000
            initialDelaySeconds: 5
            periodSeconds: 5
      volumes:
        - name: model-storage
          persistentVolumeClaim:
            claimName: model-pvc
---
# Service: exposes the Deployment's pods on port 80 through a load balancer.
apiVersion: v1
kind: Service
metadata:
  name: dialog-system-service
spec:
  selector:
    app: dialog-system
  ports:
    - protocol: TCP
      port: 80
      targetPort: 8000
  type: LoadBalancer
优化技术 | 延迟改善 | 内存节省 | 精度损失 | 实现难度 | 推荐场景 |
---|---|---|---|---|---|
动态量化 | 20-30% | 25-50% | 最少 | 低 | 生产环境首选 |
静态量化 | 30-50% | 50-75% | 少量 | 中等 | 对精度要求不高 |
知识蒸馏 | 50-70% | 60-80% | 中等 | 高 | 有大量数据 |
剪枝优化 | 10-30% | 30-60% | 少量 | 中等 | 结构化场景 |
模型并行 | 40-60% | 负增长 | 无 | 极高 | 多GPU环境 |
# Multi-level cache design
class MultiLevelCache:
    """Three-tier read-through cache: in-process dict, Redis, then a third tier.

    Args:
        l2_client: Optional object exposing ``get(key)`` used as the second
            tier. Both blocking clients (``redis.Redis``) and async clients
            whose ``get`` returns an awaitable are supported. Defaults to
            ``redis.Redis()`` as before.
    """

    def __init__(self, l2_client=None):
        self.l1_cache = {}  # in-memory cache (fastest)
        # Redis cache (medium)
        self.l2_cache = l2_client if l2_client is not None else redis.Redis()
        self.l3_cache = {}  # disk-tier cache (slowest)

    async def get(self, key):
        """Return the cached value for ``key``, or ``None`` when no tier has it.

        A hit in the second tier is copied into the in-memory tier.
        """
        import inspect

        # L1: in-process dictionary.
        if key in self.l1_cache:
            return self.l1_cache[key]

        # L2: a blocking client's get() returns a plain value that cannot
        # be awaited, so run the call in the default executor (this also
        # keeps the event loop responsive) and await the result only if
        # the client turned out to be asynchronous.
        loop = asyncio.get_running_loop()
        value = await loop.run_in_executor(None, self.l2_cache.get, key)
        if inspect.isawaitable(value):
            value = await value

        # Compare against None so falsy cached values (b"", 0) still count as hits.
        if value is not None:
            self.l1_cache[key] = value
            return value

        # L3: slowest tier.
        return await self.get_from_disk(key)

    async def get_from_disk(self, key):
        """Look ``key`` up in the third tier; ``None`` when absent.

        Minimal implementation backed by ``l3_cache``; replace with real
        disk access when a storage backend is available.
        """
        return self.l3_cache.get(key)
# Concurrency parameters keyed by hardware profile.
# Each profile sets the session limit, ASR thread count, LLM batch size
# and TTS worker count.
OPTIMIZATION_CONFIGS = {
    "single_gpu": {
        "max_concurrent_sessions": 20,
        "asr_threads": 2,
        "llm_batch_size": 4,
        "tts_workers": 3
    },
    "multi_gpu": {
        "max_concurrent_sessions": 100,
        "asr_threads": 4,
        "llm_batch_size": 16,
        "tts_workers": 8
    },
    "cpu_only": {
        "max_concurrent_sessions": 10,
        "asr_threads": 4,
        "llm_batch_size": 1,
        "tts_workers": 2
    }
}
| 指标类别 | 关键指标 | 目标值 | 告警阈值 | 监控频率 |
|---|---|---|---|---|
| 延迟指标 | 端到端响应时间 | <2s | >3s | 实时 |
| | ASR处理延迟 | <0.5s | >1s | 实时 |
| | LLM推理延迟 | <1s | >2s | 实时 |
| | TTS合成延迟 | <0.3s | >0.5s | 实时 |
| 吞吐量指标 | 并发用户数 | 100+ | <50 | 1分钟 |
| | 请求处理速率 | 50 req/s | <20 req/s | 1分钟 |
| 质量指标 | ASR准确率 | >95% | <90% | 1小时 |
| | 对话成功率 | >98% | <95% | 1小时 |
| 资源指标 | GPU利用率 | 70-85% | >90% | 30秒 |
| | 内存使用率 | <80% | >85% | 30秒 |
| | CPU使用率 | <70% | >80% | 30秒 |
#!/bin/bash
# Performance diagnostics script: prints ASR queue, model and audio-quality
# information from the local service and suggests optimisations when the
# average ASR latency exceeds one second.

echo "=== ASR性能诊断 ==="

# Audio processing backlog
echo "音频处理队列长度:"
curl -s http://localhost:8000/metrics | grep asr_queue_size

# Model loading status
echo "ASR模型状态:"
curl -s http://localhost:8000/status | jq '.components.asr'

# Audio quality
echo "音频质量指标:"
curl -s http://localhost:8000/metrics | grep audio_quality

# Optimisation hints
echo "=== 优化建议 ==="
# Take the sample line only (not "# HELP"/"# TYPE" comment lines) and keep a
# single value.
asr_latency=$(curl -s http://localhost:8000/metrics | grep '^asr_avg_latency' | cut -d' ' -f2 | head -n 1)

# Inside [ ], ">" is an output redirection, not a comparison, and bash has no
# floating-point comparison at all; let awk compare the numbers instead.
if [ -n "$asr_latency" ] && awk -v latency="$asr_latency" 'BEGIN { exit !(latency > 1.0) }'; then
    echo "- 建议调整ASR模型大小或启用量化"
    echo "- 检查音频预处理参数"
    echo "- 考虑使用更快的ASR引擎"
fi
class ResourceManager:
    """Watch GPU and host memory and trigger cleanup above fixed thresholds."""

    def __init__(self):
        # Fractions (0-1) of capacity above which cleanup is triggered.
        self.gpu_memory_threshold = 0.85
        self.cpu_memory_threshold = 0.80

    @staticmethod
    def _usage_ratio(used, total):
        """Return ``used / total``, or 0.0 when ``total`` is not positive."""
        return used / total if total > 0 else 0.0

    def monitor_and_optimize(self):
        """Check current memory usage and clean up whichever side is over its threshold."""
        # GPU memory: compare allocated bytes with the device's capacity.
        # Dividing by max_memory_allocated() (the peak allocation so far)
        # would measure "current vs. peak", and divides by zero before the
        # first allocation.
        if torch.cuda.is_available():
            device = torch.cuda.current_device()
            capacity = torch.cuda.get_device_properties(device).total_memory
            gpu_usage = self._usage_ratio(torch.cuda.memory_allocated(device), capacity)
            if gpu_usage > self.gpu_memory_threshold:
                self.cleanup_gpu_memory()

        # Host memory: psutil reports a percentage, thresholds are fractions.
        memory_usage = psutil.virtual_memory().percent / 100
        if memory_usage > self.cpu_memory_threshold:
            self.cleanup_cpu_memory()

    def cleanup_gpu_memory(self):
        """Release cached GPU memory."""
        # Collect first so blocks held by unreachable tensors are freed
        # before the caching allocator hands its unused blocks back.
        gc.collect()
        torch.cuda.empty_cache()

    def cleanup_cpu_memory(self):
        """Drop the optional ``cache`` attribute's contents and run the garbage collector."""
        if hasattr(self, 'cache'):
            self.cache.clear()
        gc.collect()
# nginx load-balancing configuration (belongs inside the `http` context)

# Forward "Connection: upgrade" only when the client actually requested a
# protocol upgrade; plain HTTP requests get "close" instead of a bogus
# upgrade header.
map $http_upgrade $connection_upgrade {
    default upgrade;
    ''      close;
}

upstream dialog_backend {
    least_conn;
    server dialog-system-1:8000 weight=3 max_fails=3 fail_timeout=30s;
    server dialog-system-2:8000 weight=3 max_fails=3 fail_timeout=30s;
    server dialog-system-3:8000 weight=2 max_fails=3 fail_timeout=30s;
}

server {
    listen 80;
    server_name dialog.example.com;

    location / {
        proxy_pass http://dialog_backend;
        proxy_set_header Host $host;
        proxy_set_header X-Real-IP $remote_addr;
        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;

        # WebSocket support
        proxy_http_version 1.1;
        proxy_set_header Upgrade $http_upgrade;
        proxy_set_header Connection $connection_upgrade;

        # Timeouts. Read/send are matched to the application's 300 s session
        # timeout: with 30 s, a streaming connection that stays silent for
        # half a minute would be closed by the proxy mid-session.
        proxy_connect_timeout 30s;
        proxy_send_timeout 300s;
        proxy_read_timeout 300s;
    }

    # Health check
    location /health {
        access_log off;
        proxy_pass http://dialog_backend/health;
    }
}
class FailoverManager:
    """Track which service endpoint is in use and switch when it becomes unhealthy."""

    def __init__(self):
        self.primary_endpoints = ["http://primary:8000"]
        self.backup_endpoints = ["http://backup-1:8000", "http://backup-2:8000"]
        self.current_endpoint = self.primary_endpoints[0]
        self.health_check_interval = 30

    async def health_check(self, endpoint):
        """Return True when ``GET <endpoint>/health`` answers 200 within 5 seconds.

        Connection errors and timeouts count as unhealthy. Other
        exceptions, task cancellation in particular, propagate.
        """
        timeout = aiohttp.ClientTimeout(total=5)
        try:
            async with aiohttp.ClientSession(timeout=timeout) as session:
                async with session.get(f"{endpoint}/health") as response:
                    return response.status == 200
        except (aiohttp.ClientError, asyncio.TimeoutError):
            # A bare `except:` here would also swallow CancelledError and
            # make the checking task impossible to cancel.
            return False

    async def failover_if_needed(self):
        """Switch ``current_endpoint`` to a healthy endpoint if the current one is down.

        Candidates are tried in order: primaries first, then backups, so
        a system running on a backup returns to the primary when the
        backup fails. When no candidate is healthy the current endpoint
        is kept and an error is logged.
        """
        if await self.health_check(self.current_endpoint):
            return

        # Current endpoint is unavailable: look for a healthy alternative.
        candidates = [
            endpoint
            for endpoint in self.primary_endpoints + self.backup_endpoints
            if endpoint != self.current_endpoint
        ]
        for candidate in candidates:
            if await self.health_check(candidate):
                self.current_endpoint = candidate
                logger.warning(f"故障转移到: {candidate}")
                return

        logger.error(f"所有备用服务均不可用,保持当前端点: {self.current_endpoint}")
部署方案 | 硬件成本 | 运维成本 | 扩展性 | 适用场景 |
---|---|---|---|---|
单机部署 | 低 | 低 | 差 | 原型验证 |
云服务器 | 中等 | 中等 | 好 | 中小规模 |
专用GPU集群 | 高 | 高 | 优秀 | 大规模生产 |
混合云 | 中等 | 中等 | 优秀 | 弹性负载 |
# Reference deployment sizes: instance count, per-instance CPU / memory / GPU
# and an estimated monthly cost for each target scale.
DEPLOYMENT_CONFIGS = {
    "small_scale": {  # 100 concurrent users
        "instances": 2,
        "cpu_per_instance": 4,
        "memory_per_instance": "8GB",
        "gpu_per_instance": 1,
        "estimated_cost_per_month": "$800"
    },
    "medium_scale": {  # 1000 concurrent users
        "instances": 5,
        "cpu_per_instance": 8,
        "memory_per_instance": "16GB",
        "gpu_per_instance": 2,
        "estimated_cost_per_month": "$4000"
    },
    "large_scale": {  # 10000 concurrent users
        "instances": 20,
        "cpu_per_instance": 16,
        "memory_per_instance": "32GB",
        "gpu_per_instance": 4,
        "estimated_cost_per_month": "$20000"
    }
}
class SecurityManager:
    """Helpers for encrypting audio, anonymising user data and data retention."""

    def __init__(self):
        # Local import: `os` is not among the module-level imports.
        import os

        # Fernet key taken from the environment; None when it is not set.
        self.encryption_key = os.getenv('ENCRYPTION_KEY')
        self.data_retention_days = 30

    def encrypt_audio_data(self, audio_data):
        """Encrypt ``audio_data`` (bytes) with Fernet and return the token.

        Raises:
            ValueError: If no encryption key is configured.
        """
        # Fail with a clear message instead of an obscure error from inside
        # Fernet when ENCRYPTION_KEY was never set.
        if not self.encryption_key:
            raise ValueError("ENCRYPTION_KEY is not configured")

        from cryptography.fernet import Fernet
        f = Fernet(self.encryption_key)
        return f.encrypt(audio_data)

    def anonymize_user_data(self, user_data):
        """Return a shallow copy of ``user_data`` without identifying fields.

        The ``user_id``, ``phone`` and ``email`` keys are removed when
        present; the input mapping is left unchanged.
        """
        anonymized = user_data.copy()
        for sensitive_key in ('user_id', 'phone', 'email'):
            anonymized.pop(sensitive_key, None)
        return anonymized

    def auto_delete_expired_data(self):
        """Compute the retention cutoff for expired data.

        Returns:
            The cutoff ``datetime``: records older than this are past the
            retention period.
        """
        cutoff_date = datetime.now() - timedelta(days=self.data_retention_days)
        # TODO: implement the actual cleanup of records older than cutoff_date.
        return cutoff_date
class ABTestManager:
    """Create A/B experiments and assign users to variants deterministically."""

    def __init__(self):
        self.experiments = {}
        self.user_assignments = {}

    def create_experiment(self, name, variants, traffic_split):
        """Register an experiment.

        Args:
            name: Experiment identifier.
            variants: Variant descriptions, stored as given.
            traffic_split: Mapping of variant name to its share of
                traffic in percent (buckets 0-99 are distributed in the
                mapping's order).
        """
        self.experiments[name] = {
            "variants": variants,
            "traffic_split": traffic_split,
            "metrics": defaultdict(list)
        }

    def assign_user_to_variant(self, user_id, experiment_name):
        """Return the variant for ``user_id`` in ``experiment_name``.

        The assignment is a pure function of the two identifiers, so the
        same user gets the same variant in every process and after every
        restart. Unknown experiments, and buckets not covered by the
        traffic split, yield ``"control"``.
        """
        import hashlib

        if experiment_name not in self.experiments:
            return "control"

        # Built-in hash() on str is salted per process (PYTHONHASHSEED), so
        # it cannot provide a stable bucket; use a real digest instead.
        digest = hashlib.sha256(
            f"{user_id}_{experiment_name}".encode("utf-8")
        ).hexdigest()
        hash_value = int(digest, 16) % 100

        cumulative = 0
        for variant, percentage in self.experiments[experiment_name]["traffic_split"].items():
            cumulative += percentage
            if hash_value < cumulative:
                self.user_assignments[user_id] = variant
                return variant
        return "control"
#!/bin/bash
# Blue-green deployment script
echo "开始蓝绿部署..."

# 1. Build the new version; there is nothing to deploy if the build fails.
docker build -t dialog-system:v2 . || exit 1

# 2. Start the green environment
docker-compose -f docker-compose.green.yml up -d || exit 1

# 3. Health check
sleep 30
if curl -f http://green-env:8000/health; then
    echo "绿色环境健康检查通过"
else
    echo "绿色环境健康检查失败,回滚"
    docker-compose -f docker-compose.green.yml down
    exit 1
fi

# 4. Switch traffic
kubectl patch service dialog-system-service -p '{"spec":{"selector":{"version":"green"}}}'

# 5. Monitor the new version
sleep 60
# The URL is quoted so the shell does not treat "?" as a glob character, and
# jq -r strips the JSON quotes (Prometheus returns sample values as strings,
# which bc cannot parse while still quoted). A missing series yields an empty
# string instead of "null".
ERROR_RATE=$(curl -s 'http://monitoring:9090/api/v1/query?query=error_rate' \
    | jq -r '.data.result[0].value[1] // empty') || ERROR_RATE=""

# An unreadable error rate is treated like a bad one: never retire the blue
# environment without evidence that green is healthy.
if [ -z "$ERROR_RATE" ] || (( $(echo "$ERROR_RATE > 0.05" | bc -l) )); then
    echo "错误率过高,回滚到蓝色环境"
    kubectl patch service dialog-system-service -p '{"spec":{"selector":{"version":"blue"}}}'
    exit 1
fi

# 6. Shut down the blue environment
docker-compose -f docker-compose.blue.yml down
echo "蓝绿部署完成"
这个全面的部署优化指南涵盖了从容器化部署到生产环境监控的各个方面,为你的端到端对话系统提供了企业级(enterprise-grade)的部署和优化方案。通过这些实践,你可以构建一个高性能、高可用、可扩展的AI对话系统!
怎么样,今天的内容还满意吗?再次感谢朋友们的观看,关注GZH:凡人的AI工具箱,回复666,送您价值199的AI大礼包。最后,祝您早日实现财务自由,还请给个赞,谢谢!