多文档代理是一种先进的检索增强生成(RAG)架构,它采用分层设计来处理多个领域或文档集的查询。这种系统特别适用于处理大规模、多领域的知识库,能够提高检索精度和回答质量。
每个文档级查询引擎专注于一个特定领域或文档集:
协调代理在系统中扮演"指挥官"角色:
多文档代理系统是传统RAG的进化升级,通过分层设计有效处理大规模、多领域的知识库,提供更精确、更有条理的回答。
class DocumentProcessor:
    """Pipeline that turns a raw document into metadata-enriched chunks.

    Three stages run in order: cleaning, chunking, and metadata
    extraction. The chunking behaviour is driven by the strategy object
    supplied at construction time.
    """

    def __init__(self, chunking_strategy):
        # Strategy consulted by _chunk_document (e.g. fixed-size,
        # semantic, or hybrid splitting).
        self.chunking_strategy = chunking_strategy

    def process(self, document):
        """Run the full clean -> chunk -> annotate pipeline on *document*."""
        stage_output = self._clean_document(document)
        stage_output = self._chunk_document(stage_output)
        return self._extract_metadata(stage_output)

    def _clean_document(self, document):
        # Placeholder: strip unwanted formatting, normalise punctuation, etc.
        pass

    def _chunk_document(self, document):
        # Placeholder: split the document according to self.chunking_strategy
        # (fixed-size, semantic, or a hybrid approach).
        pass

    def _extract_metadata(self, chunks):
        # Placeholder: attach key information extracted from each chunk
        # as that chunk's metadata.
        pass
class IndexManager:
    """Manages per-domain vector indexes: creation, ingestion, and removal."""

    def __init__(self, vector_db_client, embedding_model):
        # Client used for every vector-database operation.
        self.vector_db = vector_db_client
        # Model that converts chunk text into embedding vectors.
        self.embedding_model = embedding_model

    def create_index(self, domain_name, schema):
        # Placeholder: create a fresh vector index for the given domain.
        pass

    def index_documents(self, domain_name, processed_chunks):
        """Embed *processed_chunks* and upsert them into the domain's index."""
        texts = [chunk.text for chunk in processed_chunks]
        vectors = self.embedding_model.embed_documents(texts)
        payload = []
        for chunk, vector in zip(processed_chunks, vectors):
            payload.append({
                'id': chunk.id,
                'embedding': vector,
                'text': chunk.text,
                'metadata': chunk.metadata,
            })
        self.vector_db.upsert(domain_name, payload)

    def update_index(self, domain_name, new_chunks):
        # Placeholder: refresh existing documents in the domain's index.
        pass

    def delete_from_index(self, domain_name, chunk_ids):
        # Placeholder: remove the given chunk ids from the domain's index.
        pass
class DocumentQueryEngine:
    """Retrieval and answer-generation engine scoped to a single domain."""

    def __init__(self, domain_name, vector_db_client, embedding_model, llm, retrieval_config):
        self.domain_name = domain_name
        self.vector_db = vector_db_client
        self.embedding_model = embedding_model
        self.llm = llm
        self.retrieval_config = retrieval_config
        # Domain-specific prompt template consumed by generate_answer.
        self.prompt_template = self._load_domain_prompt()

    def _load_domain_prompt(self):
        # Placeholder: load the prompt template tuned for this domain.
        pass

    def search(self, query, top_k=5):
        """Embed *query*, retrieve candidates from this domain, and post-process them."""
        vector = self.embedding_model.embed_query(query)
        # Domain-specific retrieval parameters; they may override top_k.
        params = dict(top_k=top_k)
        params.update(self.retrieval_config.get_search_params())
        hits = self.vector_db.search(
            collection_name=self.domain_name,
            query_vector=vector,
            **params,
        )
        return self._post_process_results(hits, query)

    def _post_process_results(self, results, query):
        """Re-score raw hits with domain-specific relevance and sort them."""
        scored = [
            {
                'content': hit['text'],
                'metadata': hit['metadata'],
                'relevance': self._calculate_domain_relevance(hit, query),
            }
            for hit in results
        ]
        # Highest domain-specific relevance first.
        return sorted(scored, key=lambda item: item['relevance'], reverse=True)

    def _calculate_domain_relevance(self, result, query):
        # Placeholder: score how relevant *result* is to *query* inside
        # this domain (may use a domain-specific relevance formula).
        pass

    def generate_answer(self, query, search_results):
        """Produce a domain-grounded answer for *query* from *search_results*."""
        context = self._format_context(search_results)
        prompt = self.prompt_template.format(query=query, context=context)
        return {
            'answer': self.llm.generate(prompt),
            'sources': [hit['metadata'] for hit in search_results],
            'domain': self.domain_name,
        }

    def _format_context(self, search_results):
        # Placeholder: render search results into an LLM-ready context string.
        pass
class CoordinatorAgent:
def __init__(self, router, query_engines, llm, config):
self.router = router
self.query_engines = query_engines # 各领域的文档查询引擎字典
self.llm = llm
self.config = config
async def process_query(self, query):