LlamaIndex and LangChain are both mature frameworks for RAG and agents. This article builds a RAG pipeline on top of LlamaIndex, using Alibaba's open-source Qwen model as the LLM. The goal is to connect Qwen2.5 to external data (documents, web pages, and so on) and use LlamaIndex together with Qwen2.5 to quickly deploy retrieval-augmented generation (RAG).
LlamaIndex (formerly GPT Index) is a powerful open-source data framework for building applications on top of large language models (LLMs).
Tongyi Qianwen (English: Qwen; read as "kùn") is a family of large-scale language and multimodal models developed by Alibaba's Qwen team. Qwen can perform natural language understanding, text generation, visual understanding, audio understanding, tool calling, role playing, agent tasks, and more. Both the language and multimodal models are pre-trained on large-scale, multilingual, multimodal data and post-trained on high-quality corpora to align with human preferences.
RAG (Retrieval-Augmented Generation) combines information retrieval with language generation, aiming to improve the accuracy, relevance, and usefulness of the text a language model produces.
Create the environment with conda and install dependencies with pip
We recommend Python 3.10 or later, PyTorch 2.3, and transformers>=4.37.0.
# create rag env
conda create -n rag python=3.10
conda activate rag
# install torch
pip install torch
pip install transformers -U
# install qwen
pip install accelerate
pip install tiktoken
pip install einops
pip install transformers_stream_generator==0.0.4
pip install scipy
# install llama-index
pip install llama-index
pip install llama-index-llms-huggingface
pip install llama-index-readers-web
pip install ipywidgets
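After installation, it can be worth confirming the key package versions before downloading any models. The check below only prints versions; the thresholds it refers to are simply the recommendations above.
import sys
import torch
import transformers
# Expect Python 3.10+, PyTorch 2.3, and transformers >= 4.37.0
print("Python:", sys.version.split()[0])
print("PyTorch:", torch.__version__)
print("Transformers:", transformers.__version__)
print("CUDA available:", torch.cuda.is_available())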
Set the parameters
Now we can configure the language model and the embedding model. Qwen2.5-Instruct supports conversations in many languages, including English and Chinese. You can use the bge-base-en-v1.5 model to retrieve English documents, or a Chinese model such as bge-base-zh-v1.5 for Chinese documents (see the sketch after the configuration code below). Depending on your compute resources, you can also choose bge-large or bge-small as the embedding model, and adjust the context window size or text chunk size. The Qwen2.5 model series supports a context window of up to 32K tokens (the 7B, 14B, 32B, and 72B models can be extended to 128K, but this requires additional configuration).
import torch
from llama_index.core import Settings
from llama_index.core.node_parser import SentenceSplitter
from llama_index.llms.huggingface import HuggingFaceLLM
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
# Set prompt template for generation (optional)
from llama_index.core import PromptTemplate
def completion_to_prompt(completion):
    return f"<|im_start|>system\n<|im_end|>\n<|im_start|>user\n{completion}<|im_end|>\n<|im_start|>assistant\n"

def messages_to_prompt(messages):
    prompt = ""
    for message in messages:
        if message.role == "system":
            prompt += f"<|im_start|>system\n{message.content}<|im_end|>\n"
        elif message.role == "user":
            prompt += f"<|im_start|>user\n{message.content}<|im_end|>\n"
        elif message.role == "assistant":
            prompt += f"<|im_start|>assistant\n{message.content}<|im_end|>\n"
    if not prompt.startswith("<|im_start|>system"):
        prompt = "<|im_start|>system\n" + prompt
    prompt = prompt + "<|im_start|>assistant\n"
    return prompt
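# (Optional) Quick check: render a sample user message with messages_to_prompt
# to see the ChatML string that will actually be fed to Qwen2.5.
# This snippet is only illustrative and can be removed.
from llama_index.core.llms import ChatMessage
print(messages_to_prompt([ChatMessage(role="user", content="What is RAG?")]))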
# Set Qwen2.5 as the language model and set generation config
Settings.llm = HuggingFaceLLM(
    model_name="Qwen/Qwen2.5-7B-Instruct",
    tokenizer_name="Qwen/Qwen2.5-7B-Instruct",
    context_window=30000,
    max_new_tokens=2000,
    generate_kwargs={"temperature": 0.7, "top_k": 50, "top_p": 0.95},
    messages_to_prompt=messages_to_prompt,
    completion_to_prompt=completion_to_prompt,
    device_map="auto",
)
# Set embedding model
Settings.embed_model = HuggingFaceEmbedding(
    model_name="BAAI/bge-base-en-v1.5"
)
# Set the size of the text chunk for retrieval
Settings.transformations = [SentenceSplitter(chunk_size=1024)]
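If your documents are mainly in Chinese, only the embedding model name needs to change, as mentioned above; the bge-small and bge-large variants work the same way. A minimal sketch:
# Swap in a Chinese embedding model for Chinese documents
Settings.embed_model = HuggingFaceEmbedding(
    model_name="BAAI/bge-base-zh-v1.5"
)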
Build the index
You can build an index from local documents:
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
documents = SimpleDirectoryReader("./document").load_data()
index = VectorStoreIndex.from_documents(
    documents,
    embed_model=Settings.embed_model,
    transformations=Settings.transformations,
)
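SimpleDirectoryReader loads every supported file directly under the given directory. If your corpus sits in subfolders or mixes file types, you can narrow what gets loaded; a small sketch, assuming the documents live under ./document (the listed extensions are just examples):
from llama_index.core import SimpleDirectoryReader
# Recursively load only the listed file types from ./document and its subfolders
documents = SimpleDirectoryReader(
    "./document",
    recursive=True,
    required_exts=[".txt", ".md", ".pdf"],
).load_data()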
You can also build an index from the content of a list of web pages:
from llama_index.readers.web import SimpleWebPageReader
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
documents = SimpleWebPageReader(html_to_text=True).load_data(
    ["web_address_1", "web_address_2", ...]
)
index = VectorStoreIndex.from_documents(
    documents,
    embed_model=Settings.embed_model,
    transformations=Settings.transformations,
)
Save and load the index
from llama_index.core import StorageContext, load_index_from_storage
# save index
index.storage_context.persist(persist_dir="save")
# load index
storage_context = StorageContext.from_defaults(persist_dir="save")
index = load_index_from_storage(storage_context)
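Note that the persisted index stores the documents and their embeddings, but not the global Settings: if you load it in a new process, you still need to configure the embedding model (and the LLM, for generation) before querying. A minimal sketch, assuming the same English embedding model as above:
from llama_index.core import Settings, StorageContext, load_index_from_storage
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
# Re-create the embedding model so queries are embedded the same way the index was built
Settings.embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-base-en-v1.5")
storage_context = StorageContext.from_defaults(persist_dir="save")
index = load_index_from_storage(storage_context)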
Retrieval-augmented generation (RAG)
You can now enter a query, and Qwen2.5 will answer based on the content of the indexed documents.
query_engine = index.as_query_engine()
your_query = ""
print(query_engine.query(your_query).response)
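If you want to retrieve more (or fewer) chunks, or check which chunks the answer was grounded on, the query engine exposes these directly. A minimal sketch (the top_k value and the query string are just examples):
# Retrieve the 3 most similar chunks instead of the default
query_engine = index.as_query_engine(similarity_top_k=3)
response = query_engine.query("your question here")
print(response.response)
# Inspect the retrieved chunks and their similarity scores
for node_with_score in response.source_nodes:
    print(node_with_score.score, node_with_score.node.get_content()[:100])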
The complete code is as follows:
import os
os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"  # optional: use a Hugging Face mirror for model downloads
import torch
from llama_index.core import Settings
from llama_index.core.node_parser import SentenceSplitter
from llama_index.llms.huggingface import HuggingFaceLLM
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.core import StorageContext, load_index_from_storage
# Set prompt template for generation (optional)
from llama_index.core import PromptTemplate
def completion_to_prompt(completion):
    return f"<|im_start|>system\n<|im_end|>\n<|im_start|>user\n{completion}<|im_end|>\n<|im_start|>assistant\n"

def messages_to_prompt(messages):
    prompt = ""
    for message in messages:
        if message.role == "system":
            prompt += f"<|im_start|>system\n{message.content}<|im_end|>\n"
        elif message.role == "user":
            prompt += f"<|im_start|>user\n{message.content}<|im_end|>\n"
        elif message.role == "assistant":
            prompt += f"<|im_start|>assistant\n{message.content}<|im_end|>\n"
    if not prompt.startswith("<|im_start|>system"):
        prompt = "<|im_start|>system\n" + prompt
    prompt = prompt + "<|im_start|>assistant\n"
    return prompt
# Set Qwen2.5 as the language model and set generation config
device = "cuda:2"  # change to your available device, e.g. "cuda:0" or "auto"
Settings.llm = HuggingFaceLLM(
    model_name="Qwen/Qwen2.5-7B-Instruct",
    tokenizer_name="Qwen/Qwen2.5-7B-Instruct",
    context_window=30000,
    max_new_tokens=2000,
    generate_kwargs={"temperature": 0.7, "top_k": 50, "top_p": 0.95},
    messages_to_prompt=messages_to_prompt,
    completion_to_prompt=completion_to_prompt,
    device_map=device,
)
# Set embedding model
Settings.embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-zh-v1.5")
# Set the size of the text chunk for retrieval
Settings.transformations = [SentenceSplitter(chunk_size=1024)]
print('----build index----')
# Build index
documents = SimpleDirectoryReader("./document").load_data()
index = VectorStoreIndex.from_documents(
    documents,
    embed_model=Settings.embed_model,
    transformations=Settings.transformations,
)
print('----save index----')
# save index
index.storage_context.persist(persist_dir="./save")
## load index
# storage_context = StorageContext.from_defaults(persist_dir="./save")
# index = load_index_from_storage(storage_context)
query_engine = index.as_query_engine()
while True:
    your_query = input()
    print(query_engine.query(your_query).response)
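To run the complete script, place your files under ./document and start it; type a question at the prompt and Qwen2.5 will answer based on the indexed content. The loop runs until you interrupt it (for example with Ctrl+C).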