A ranking example generated by Claude, illustrating several simple approaches.
Query: "人工智能在医疗领域的应用前景如何?" ("What are the prospects for AI applications in healthcare?")
Document corpus: 8 candidate documents covering AI applications in healthcare, finance, education, autonomous driving, and other domains
Chinese word segmentation with jieba
TF-IDF weighting and document-length normalization (the exact formula is given right after this list)
Chinese stop-word handling
Jaccard similarity + query-term coverage
Simplified TF weighting
Chinese pretrained model text2vec-base-chinese
Fallback multilingual model support
Weighted fusion of multiple rankers
Score normalization
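For reference, the SimpleBM25Ranker in the script below scores each query term t against a document d with the textbook BM25 weighting (defaults k1 = 1.2, b = 0.75):

score(q, d) = \sum_{t \in q} \mathrm{IDF}(t) \cdot \frac{tf(t,d)\,(k_1 + 1)}{tf(t,d) + k_1\,(1 - b + b \cdot |d| / \mathrm{avgdl})}, \qquad \mathrm{IDF}(t) = \ln \frac{N - df(t) + 0.5}{df(t) + 0.5}

where tf(t,d) is the frequency of t in d, |d| is the document length in tokens, avgdl is the average document length, N is the corpus size, and df(t) is the number of documents containing t. Note that this IDF goes negative for terms that appear in more than half of the corpus; the script keeps the textbook form.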
# 1. Install dependencies
pip install sentence-transformers jieba scikit-learn pandas torch
# 2. Run the example
python rag_ranking_example.py
# Complete Chinese RAG ranking example and usage walkthrough
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
import jieba
from collections import Counter
import math
import pandas as pd
# Sample data: the query and candidate documents
SAMPLE_QUERY = "人工智能在医疗领域的应用前景如何?"
SAMPLE_DOCUMENTS = [
"人工智能在医疗诊断中发挥着越来越重要的作用。通过深度学习算法,AI系统能够分析医学影像,如X光片、CT扫描和MRI图像,帮助医生更准确地识别疾病。例如,AI在肺癌早期筛查、眼底病变检测等方面已经展现出超越人类专家的准确率。",
"机器学习技术在金融风控领域应用广泛。银行和金融机构利用大数据分析和机器学习算法来评估信贷风险、检测欺诈交易、优化投资组合。这些技术能够处理海量的交易数据,识别异常模式,为金融决策提供科学依据。",
"医疗人工智能的发展正在革命性地改变传统医疗模式。智能诊断系统不仅能够提高诊断准确性,还能大幅缩短诊断时间。在药物研发方面,AI技术能够加速新药发现过程,预测药物分子的特性和副作用,降低研发成本和时间。",
"自动驾驶技术是人工智能应用的重要方向之一。通过计算机视觉、深度学习和传感器融合技术,自动驾驶汽车能够实时感知道路环境、识别交通标志、避障和路径规划。目前主要汽车厂商都在加大对自动驾驶技术的投入。",
"智慧医疗系统整合了物联网、大数据、云计算和人工智能等先进技术。患者可以通过智能穿戴设备实时监测健康状况,医生可以远程诊断和治疗,医院管理也更加智能化。这种新型医疗模式提高了医疗服务效率,降低了医疗成本。",
"教育技术的发展让个性化学习成为可能。AI驱动的在线教育平台能够根据学生的学习进度和能力水平,提供定制化的学习内容和路径。自适应学习系统通过分析学习数据,优化教学策略,提高学习效果。",
"临床决策支持系统利用人工智能技术辅助医生做出更好的诊疗决策。这些系统能够整合患者的病史、检查结果、药物信息等多源数据,提供循证医学建议。AI还能够预测疾病发展趋势,帮助制定个性化治疗方案。",
"区块链技术在供应链管理中展现出巨大潜力。通过分布式账本技术,企业可以实现供应链的透明化和可追溯性。这对于食品安全、药品监管等领域特别重要,消费者可以清楚了解产品的来源和流通过程。"
]
class ChineseTextProcessor:
    """Utility class for Chinese text processing"""
    def __init__(self):
        # Initialize the stop-word list
        self.stop_words = set([
            '的', '了', '在', '是', '我', '有', '和', '就', '不', '人', '都', '一', '一个',
            '上', '也', '很', '到', '说', '要', '去', '你', '会', '着', '没有', '看', '好',
            '自己', '这', '能', '那', '来', '用', '把', '让', '更', '为', '可以', '等',
            '中', '通过', '对', '及', '与', '以', '或', '但', '而', '由于', '因为'
        ])
    def tokenize(self, text):
        """Segment Chinese text, dropping stop words and single-character tokens"""
        words = jieba.lcut(text)
        return [word for word in words if word not in self.stop_words and len(word) > 1]
    def extract_keywords(self, text, top_k=10):
        """Extract the top_k most frequent keywords"""
        words = self.tokenize(text)
        word_freq = Counter(words)
        return dict(word_freq.most_common(top_k))
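# Quick, hypothetical usage check of the tokenizer (the exact segmentation
# depends on jieba's dictionary and version, so treat the output as illustrative):
#   processor = ChineseTextProcessor()
#   processor.tokenize(SAMPLE_QUERY)
#   # e.g. -> ['人工智能', '医疗', '领域', '应用', '前景', '如何']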
class SimpleBM25Ranker:
    """Simplified BM25 ranker (suitable for Chinese)"""
    def __init__(self, k1=1.2, b=0.75):
        self.k1 = k1
        self.b = b
        self.processor = ChineseTextProcessor()
        self.corpus_tokens = []
        self.doc_freqs = {}
        self.avg_doc_length = 0
    def fit(self, documents):
        """Fit the BM25 statistics on the corpus"""
        self.corpus_tokens = []
        word_doc_count = {}
        total_length = 0
        for doc in documents:
            tokens = self.processor.tokenize(doc)
            self.corpus_tokens.append(tokens)
            total_length += len(tokens)
            # Count the number of documents containing each word
            unique_words = set(tokens)
            for word in unique_words:
                word_doc_count[word] = word_doc_count.get(word, 0) + 1
        # Compute the average document length
        self.avg_doc_length = total_length / len(documents)
        # Compute IDF for every word
        corpus_size = len(documents)
        for word, doc_count in word_doc_count.items():
            self.doc_freqs[word] = math.log((corpus_size - doc_count + 0.5) / (doc_count + 0.5))
    def score(self, query, doc_tokens):
        """Compute the BM25 score of one document for the query"""
        query_tokens = self.processor.tokenize(query)
        doc_length = len(doc_tokens)
        score = 0
        word_count = Counter(doc_tokens)
        for word in query_tokens:
            if word in word_count:
                tf = word_count[word]
                idf = self.doc_freqs.get(word, 0)
                score += idf * (tf * (self.k1 + 1)) / (
                    tf + self.k1 * (1 - self.b + self.b * (doc_length / self.avg_doc_length))
                )
        return score
    def rank_documents(self, query, documents, top_k=10):
        """Rank documents by BM25 score"""
        if not self.corpus_tokens:
            self.fit(documents)
        scores = []
        for i, doc in enumerate(documents):
            doc_tokens = self.corpus_tokens[i] if i < len(self.corpus_tokens) else self.processor.tokenize(doc)
            score = self.score(query, doc_tokens)
            scores.append((doc, score, i))
        # Sort by score, highest first
        scores.sort(key=lambda x: x[1], reverse=True)
        return [
            {
                'document': doc,
                'bm25_score': score,
                'original_index': idx,
                'rank': i + 1
            }
            for i, (doc, score, idx) in enumerate(scores[:top_k])
        ]
class ChineseSentenceTransformerRanker:
    """Ranker based on a pretrained Chinese semantic model"""
    def __init__(self, model_name="shibing624/text2vec-base-chinese"):
        try:
            self.model = SentenceTransformer(model_name)
            print(f"Loaded model: {model_name}")
        except Exception as e:
            print(f"Failed to load the model, falling back: {e}")
            # Fallback multilingual model
            self.model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
    def rank_documents(self, query, documents, top_k=10):
        """Rank by semantic similarity"""
        print("Computing document embeddings...")
        # Encode the query and the documents
        query_embedding = self.model.encode([query])
        doc_embeddings = self.model.encode(documents, show_progress_bar=True)
        # Cosine similarity between the query and each document
        similarities = cosine_similarity(query_embedding, doc_embeddings)[0]
        # Sort by similarity, highest first
        scored_docs = list(zip(documents, similarities, range(len(documents))))
        scored_docs.sort(key=lambda x: x[1], reverse=True)
        return [
            {
                'document': doc,
                'semantic_score': float(score),
                'original_index': idx,
                'rank': i + 1
            }
            for i, (doc, score, idx) in enumerate(scored_docs[:top_k])
        ]
class ChineseKeywordRanker:
    """Ranker based on keyword matching"""
    def __init__(self):
        self.processor = ChineseTextProcessor()
    def calculate_keyword_score(self, query, document):
        """Compute the keyword-match score"""
        query_words = set(self.processor.tokenize(query))
        doc_tokens = self.processor.tokenize(document)
        doc_words = set(doc_tokens)
        if not query_words:
            return 0
        intersection = query_words.intersection(doc_words)
        # Jaccard similarity
        jaccard = len(intersection) / len(query_words.union(doc_words))
        # Query-term coverage
        coverage = len(intersection) / len(query_words)
        # Simplified TF weight (raw counts, no IDF), normalized by token count.
        # Note: splitting on whitespace does not work for Chinese text,
        # so we normalize by the tokenized length instead.
        tf_score = sum(doc_tokens.count(word) for word in intersection)
        tf_norm = tf_score / max(len(doc_tokens), 1)
        return 0.4 * jaccard + 0.4 * coverage + 0.2 * tf_norm
    def rank_documents(self, query, documents, top_k=10):
        """Rank documents by keyword matching"""
        scores = []
        for i, doc in enumerate(documents):
            score = self.calculate_keyword_score(query, doc)
            scores.append((doc, score, i))
        # Sort by score, highest first
        scores.sort(key=lambda x: x[1], reverse=True)
        return [
            {
                'document': doc,
                'keyword_score': score,
                'original_index': idx,
                'rank': i + 1
            }
            for i, (doc, score, idx) in enumerate(scores[:top_k])
        ]
class ChineseEnsembleRanker:
    """Ensemble ranker for Chinese documents"""
    def __init__(self):
        self.rankers = {}
        self.weights = {}
        self.processor = ChineseTextProcessor()
    def add_ranker(self, name, ranker, weight=1.0):
        """Register a ranker with a fusion weight"""
        self.rankers[name] = ranker
        self.weights[name] = weight
        print(f"Added ranker: {name}, weight: {weight}")
    def normalize_scores(self, scores):
        """Min-max normalize a list of scores to [0, 1]"""
        if not scores or all(s == 0 for s in scores):
            return scores
        min_score = min(scores)
        max_score = max(scores)
        if max_score == min_score:
            return [1.0] * len(scores)
        return [(s - min_score) / (max_score - min_score) for s in scores]
    def rank_documents(self, query, documents, top_k=10, verbose=True):
        """Ensemble ranking: weighted fusion of min-max-normalized ranker scores"""
        if verbose:
            print(f"\n{'='*50}")
            print(f"Query: {query}")
            print(f"Number of documents: {len(documents)}")
            print(f"{'='*50}")
        all_rankings = {}
        # Collect the results of every registered ranker
        for name, ranker in self.rankers.items():
            if verbose:
                print(f"\nRunning ranker: {name}")
            try:
                rankings = ranker.rank_documents(query, documents, top_k=len(documents))
                all_rankings[name] = rankings
                if verbose:
                    print(f"{name} finished, top 3 results:")
                    for i, result in enumerate(rankings[:3]):
                        score_key = [k for k in result.keys() if 'score' in k][0]
                        print(f"  {i+1}. score: {result[score_key]:.4f}")
                        print(f"     document: {result['document'][:100]}...")
            except Exception as e:
                print(f"Ranker {name} failed: {e}")
                continue
        if not all_rankings:
            print("All rankers failed!")
            return []
        # Rebuild each ranker's scores in original document order, then min-max
        # normalize them so differently scaled scores (e.g. BM25 vs. cosine
        # similarity) can be fused fairly with the configured weights
        normalized = {}
        for name, rankings in all_rankings.items():
            raw = [0.0] * len(documents)
            for rank_info in rankings:
                score_keys = [k for k in rank_info.keys() if 'score' in k]
                if score_keys:
                    raw[rank_info['original_index']] = rank_info[score_keys[0]]
            normalized[name] = self.normalize_scores(raw)
        # Weighted average of the normalized scores
        doc_scores = {}
        for i, doc in enumerate(documents):
            doc_scores[i] = {'document': doc, 'scores': {}, 'final_score': 0}
            total_weight = 0
            weighted_score = 0
            for name in all_rankings:
                weight = self.weights[name]
                doc_score = normalized[name][i]
                doc_scores[i]['scores'][name] = doc_score
                weighted_score += weight * doc_score
                total_weight += weight
            if total_weight > 0:
                doc_scores[i]['final_score'] = weighted_score / total_weight
        # Sort by fused score and return the top_k results
        sorted_docs = sorted(doc_scores.items(), key=lambda x: x[1]['final_score'], reverse=True)
        results = []
        for i, (doc_idx, info) in enumerate(sorted_docs[:top_k]):
            results.append({
                'document': info['document'],
                'final_score': info['final_score'],
                'rank': i + 1,
                'original_index': doc_idx,
                'individual_scores': info['scores']
            })
        return results
def analyze_query_and_documents():
    """Analyze keyword features of the query and documents"""
    processor = ChineseTextProcessor()
    print("Query analysis:")
    print(f"Query: {SAMPLE_QUERY}")
    query_keywords = processor.extract_keywords(SAMPLE_QUERY)
    print(f"Keywords: {query_keywords}")
    print("\nDocument analysis:")
    for i, doc in enumerate(SAMPLE_DOCUMENTS):
        print(f"\nDocument {i+1}: {doc[:50]}...")
        doc_keywords = processor.extract_keywords(doc, top_k=5)
        print(f"Keywords: {list(doc_keywords.keys())}")
def run_individual_rankers():
    """Run each ranker on its own"""
    print("\n" + "="*60)
    print("Running individual rankers")
    print("="*60)
    # 1. BM25 ranking
    print("\n1. BM25 results:")
    bm25_ranker = SimpleBM25Ranker()
    bm25_results = bm25_ranker.rank_documents(SAMPLE_QUERY, SAMPLE_DOCUMENTS, top_k=5)
    for result in bm25_results:
        print(f"Rank {result['rank']}: BM25 score {result['bm25_score']:.4f}")
        print(f"Document: {result['document'][:100]}...\n")
    # 2. Keyword-matching ranking
    print("\n2. Keyword-matching results:")
    keyword_ranker = ChineseKeywordRanker()
    keyword_results = keyword_ranker.rank_documents(SAMPLE_QUERY, SAMPLE_DOCUMENTS, top_k=5)
    for result in keyword_results:
        print(f"Rank {result['rank']}: keyword score {result['keyword_score']:.4f}")
        print(f"Document: {result['document'][:100]}...\n")
    # 3. Semantic-similarity ranking
    print("\n3. Semantic-similarity results:")
    try:
        semantic_ranker = ChineseSentenceTransformerRanker()
        semantic_results = semantic_ranker.rank_documents(SAMPLE_QUERY, SAMPLE_DOCUMENTS, top_k=5)
        for result in semantic_results:
            print(f"Rank {result['rank']}: semantic score {result['semantic_score']:.4f}")
            print(f"Document: {result['document'][:100]}...\n")
    except Exception as e:
        print(f"Semantic ranking failed: {e}")
def run_ensemble_ranking():
    """Run the ensemble ranking demo"""
    print("\n" + "="*60)
    print("Ensemble ranking demo")
    print("="*60)
    # Build the ensemble ranker
    ensemble = ChineseEnsembleRanker()
    # Register the individual rankers
    ensemble.add_ranker('bm25', SimpleBM25Ranker(), weight=0.3)
    ensemble.add_ranker('keyword', ChineseKeywordRanker(), weight=0.3)
    try:
        ensemble.add_ranker('semantic', ChineseSentenceTransformerRanker(), weight=0.4)
    except Exception as e:
        print(f"Could not add the semantic ranker, skipping it: {e}")
        # Redistribute the weights across the remaining rankers
        ensemble.weights['bm25'] = 0.5
        ensemble.weights['keyword'] = 0.5
    # Run the ensemble
    results = ensemble.rank_documents(SAMPLE_QUERY, SAMPLE_DOCUMENTS, top_k=5, verbose=True)
    print(f"\n{'='*50}")
    print("Final ensemble results:")
    print("="*50)
    for result in results:
        print(f"\nRank {result['rank']}: fused score {result['final_score']:.4f}")
        print(f"Per-ranker scores: {result['individual_scores']}")
        print(f"Document: {result['document'][:150]}...")
def compare_ranking_methods():
    """Compare the results of the different ranking methods"""
    print("\n" + "="*60)
    print("Ranking method comparison")
    print("="*60)
    # Run every available ranker
    rankers = {
        'BM25': SimpleBM25Ranker(),
        'Keyword': ChineseKeywordRanker(),
    }
    try:
        rankers['Semantic'] = ChineseSentenceTransformerRanker()
    except Exception:
        print("Semantic ranker unavailable")
    all_results = {}
    for name, ranker in rankers.items():
        print(f"\nRunning {name} ranker...")
        results = ranker.rank_documents(SAMPLE_QUERY, SAMPLE_DOCUMENTS, top_k=8)
        all_results[name] = results
    # Build the comparison table
    comparison_data = []
    for i in range(len(SAMPLE_DOCUMENTS)):
        row = {'Doc ID': i+1, 'Summary': SAMPLE_DOCUMENTS[i][:50] + '...'}
        for method_name, results in all_results.items():
            rank = None
            score = 0
            for result in results:
                if result['original_index'] == i:
                    rank = result['rank']
                    score_key = [k for k in result.keys() if 'score' in k][0]
                    score = result[score_key]
                    break
            row[f'{method_name}_rank'] = rank if rank else '-'
            row[f'{method_name}_score'] = f"{score:.3f}" if score else '0.000'
        comparison_data.append(row)
    # Print the comparison table
    df = pd.DataFrame(comparison_data)
    print("\nComparison table:")
    print(df.to_string(index=False))
    # Relevance analysis
    print(f"\nRelevance analysis for the query '{SAMPLE_QUERY}':")
    relevant_docs = [0, 2, 4, 6]  # manually labeled relevant documents
    print(f"Expected relevant document IDs: {[x+1 for x in relevant_docs]}")
    for method_name, results in all_results.items():
        top3_indices = [r['original_index'] for r in results[:3]]
        relevant_in_top3 = len(set(top3_indices).intersection(set(relevant_docs)))
        print(f"{method_name}: {relevant_in_top3}/3 relevant documents in the top 3")
def interactive_ranking_demo():
    """Interactive ranking demo"""
    print("\n" + "="*60)
    print("Interactive ranking demo")
    print("="*60)
    # Prepare the ensemble
    ensemble = ChineseEnsembleRanker()
    ensemble.add_ranker('bm25', SimpleBM25Ranker(), weight=0.4)
    ensemble.add_ranker('keyword', ChineseKeywordRanker(), weight=0.6)
    while True:
        print(f"\n{len(SAMPLE_DOCUMENTS)} documents available for retrieval")
        user_query = input("\nEnter your query (type 'quit' to exit): ").strip()
        if user_query.lower() == 'quit':
            break
        if not user_query:
            print("The query must not be empty, please try again")
            continue
        print(f"\nRanking documents for query '{user_query}'...")
        results = ensemble.rank_documents(user_query, SAMPLE_DOCUMENTS, top_k=3, verbose=False)
        print("\nTop 3 documents:")
        for result in results:
            print(f"\nRank {result['rank']}: fused score {result['final_score']:.4f}")
            print(f"Document: {result['document']}")
            print("-" * 80)
def main():
    """Main entry point: the full demo workflow"""
    print("RAG document ranking - Chinese example")
    print("="*60)
    # 1. Data analysis
    analyze_query_and_documents()
    # 2. Individual rankers
    run_individual_rankers()
    # 3. Ensemble ranking
    run_ensemble_ranking()
    # 4. Method comparison
    compare_ranking_methods()
    # 5. Interactive demo
    print("\nStart the interactive demo? (y/n): ", end="")
    if input().lower() == 'y':
        interactive_ranking_demo()
    print("\nDone!")

if __name__ == "__main__":
    # Remind the user of the required packages
    print("Make sure the following packages are installed:")
    print("pip install sentence-transformers jieba scikit-learn pandas torch")
    print("="*60)
    main()
Running the script walks through five stages:
Data analysis - examines the keyword features of the query and documents
Individual ranker tests - shows the results of each ranker on its own
Ensemble ranking - the final multi-method fusion results
Method comparison - builds a comparison table of how each method performs
Interactive demo - supports testing with custom queries
For the query "人工智能在医疗领域的应用前景如何?", the relevant documents should be:
Document 1: the role of AI in medical diagnosis
Document 3: how medical AI is transforming healthcare
Document 5: smart healthcare systems
Document 7: clinical decision support systems
compare_ranking_methods measures this directly by counting how many of these four IDs land in each method's top 3.
First run: the Chinese semantic model (about 200 MB) is downloaded automatically
Model selection: other Chinese pretrained models can be swapped in as needed
Weight tuning: the per-ranker weights can be adjusted based on observed quality
Extensibility: new ranking algorithms are easy to plug in (see the sketch below)
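As a minimal sketch of the last two points: any object exposing a rank_documents(query, documents, top_k) method that returns dicts with a field containing 'score' plus an 'original_index' can be registered with the ensemble. The DocLengthRanker below is a hypothetical toy ranker (it simply prefers longer documents), and the alternative model name in the comment is likewise only an illustration, not a recommendation from the original example:

class DocLengthRanker:
    """Hypothetical toy ranker: scores documents by character length."""
    def rank_documents(self, query, documents, top_k=10):
        # Sort by raw length, longest first
        scored = sorted(
            ((doc, float(len(doc)), i) for i, doc in enumerate(documents)),
            key=lambda x: x[1], reverse=True
        )
        return [
            {'document': doc, 'length_score': s, 'original_index': idx, 'rank': r + 1}
            for r, (doc, s, idx) in enumerate(scored[:top_k])
        ]

ensemble = ChineseEnsembleRanker()
ensemble.add_ranker('bm25', SimpleBM25Ranker(), weight=0.4)
ensemble.add_ranker('length', DocLengthRanker(), weight=0.1)
# Swapping the semantic model is a constructor argument away; the checkpoint
# name here is an assumption, any sentence-transformers-compatible model works:
# ensemble.add_ranker('semantic',
#     ChineseSentenceTransformerRanker(model_name="moka-ai/m3e-base"), weight=0.5)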