# langchain 使用向量数据库进行检索时, 使用重排序模型 (reranking on top of vector-store retrieval)

# Install required libraries.
# NOTE: the original line used IPython "!pip" magic, which is a SyntaxError in a
# plain .py file — run this in your shell (or a notebook cell) instead:
#   pip install langchain langchain-community cohere tiktoken faiss-cpu

from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import CohereRerank

# -------------------- 1. Load the document and split it --------------------
loader = TextLoader("文档路径")  # replace with your document path
documents = loader.load()

# Overlapping chunks preserve context that straddles a chunk boundary.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50,
)
texts = text_splitter.split_documents(documents)

# -------------------- 2. Build the vector store --------------------
# Requires OPENAI_API_KEY in the environment; a HuggingFace embedding model
# would also work here.
embeddings = OpenAIEmbeddings()
vectorstore = FAISS.from_documents(texts, embeddings)

# -------------------- 3. Configure the reranking model --------------------
# Option 1: Cohere reranker (needs an API key).
# CohereRerank is already imported at the top of the file — the original
# script imported it a second time here, which was redundant.
compressor = CohereRerank(
    cohere_api_key="your_cohere_key",  # replace with your Cohere key
    top_n=5,  # keep only the top-5 results after reranking
)

# Option 2: the open-source BAAI/bge-reranker, running locally.
# from langchain.retrievers.document_compressors import LLMChainFilter
# compressor = LLMChainFilter.from_llm(
#     llm=HuggingFacePipeline.from_model_id(
#         model_id="BAAI/bge-reranker-large",
#         task="text-generation",
#         device=0  # GPU acceleration
#     )
# )

# -------------------- 4. Compose the two-stage retriever --------------------
# Stage 1 over-fetches 20 candidates by vector similarity; stage 2 (the
# compressor) reranks them and keeps the best top_n.
base_retriever = vectorstore.as_retriever(search_kwargs={"k": 20})
compression_retriever = ContextualCompressionRetriever(
    base_retriever=base_retriever,
    base_compressor=compressor,
)

# -------------------- 5. Run a query --------------------
query = "你的查询问题"
compressed_docs = compression_retriever.invoke(query)

# Print results. Use .get() so a document missing the "relevance_score"
# metadata key (e.g. with a different compressor) does not raise KeyError.
print(f"重排序后Top {compressor.top_n}结果:")
for doc in compressed_docs:
    score = doc.metadata.get("relevance_score", float("nan"))
    print(f"分数: {score:.3f} | 内容: {doc.page_content[:100]}...")

# 你可能感兴趣的: (python, 深度学习, langchain, 重排序模型)