【RAG排序】rag排序代码示例-简单版

claude生成的一个排序的例子,有几种简单的方法。

示例数据

查询:“人工智能在医疗领域的应用前景如何?”
文档库:8个相关文档,涵盖AI在医疗、金融、教育、自动驾驶等领域的应用

实现的排序方法

SimpleBM25Ranker - 中文BM25排序器

使用jieba进行中文分词
计算TF-IDF和文档长度归一化
处理中文停用词

ChineseKeywordRanker - 关键词匹配排序器

Jaccard相似度 + 查询词覆盖率
简化的TF权重计算

ChineseSentenceTransformerRanker - 语义相似度排序器

使用中文预训练模型 text2vec-base-chinese
备用多语言模型支持

ChineseEnsembleRanker - 集成排序器

多排序器加权融合
分数归一化处理

使用方法

# 1. 安装依赖
pip install sentence-transformers jieba scikit-learn pandas torch

# 2. 运行示例
python rag_ranking_example.py

完整代码

# RAG排序完整中文示例与使用流程
import numpy as np
import torch
import torch.nn.functional as F
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
import jieba
from collections import Counter
import math
import pandas as pd

# Sample data: one query plus a pool of candidate documents.
SAMPLE_QUERY = "人工智能在医疗领域的应用前景如何?"

# Eight candidate documents. Indices 0, 2, 4, 6 are about AI in healthcare
# (the expected relevant set); the rest are distractors covering finance,
# autonomous driving, education and blockchain.
SAMPLE_DOCUMENTS = [
    "人工智能在医疗诊断中发挥着越来越重要的作用。通过深度学习算法,AI系统能够分析医学影像,如X光片、CT扫描和MRI图像,帮助医生更准确地识别疾病。例如,AI在肺癌早期筛查、眼底病变检测等方面已经展现出超越人类专家的准确率。",
    
    "机器学习技术在金融风控领域应用广泛。银行和金融机构利用大数据分析和机器学习算法来评估信贷风险、检测欺诈交易、优化投资组合。这些技术能够处理海量的交易数据,识别异常模式,为金融决策提供科学依据。",
    
    "医疗人工智能的发展正在革命性地改变传统医疗模式。智能诊断系统不仅能够提高诊断准确性,还能大幅缩短诊断时间。在药物研发方面,AI技术能够加速新药发现过程,预测药物分子的特性和副作用,降低研发成本和时间。",
    
    "自动驾驶技术是人工智能应用的重要方向之一。通过计算机视觉、深度学习和传感器融合技术,自动驾驶汽车能够实时感知道路环境、识别交通标志、避障和路径规划。目前主要汽车厂商都在加大对自动驾驶技术的投入。",
    
    "智慧医疗系统整合了物联网、大数据、云计算和人工智能等先进技术。患者可以通过智能穿戴设备实时监测健康状况,医生可以远程诊断和治疗,医院管理也更加智能化。这种新型医疗模式提高了医疗服务效率,降低了医疗成本。",
    
    "教育技术的发展让个性化学习成为可能。AI驱动的在线教育平台能够根据学生的学习进度和能力水平,提供定制化的学习内容和路径。自适应学习系统通过分析学习数据,优化教学策略,提高学习效果。",
    
    "临床决策支持系统利用人工智能技术辅助医生做出更好的诊疗决策。这些系统能够整合患者的病史、检查结果、药物信息等多源数据,提供循证医学建议。AI还能够预测疾病发展趋势,帮助制定个性化治疗方案。",
    
    "区块链技术在供应链管理中展现出巨大潜力。通过分布式账本技术,企业可以实现供应链的透明化和可追溯性。这对于食品安全、药品监管等领域特别重要,消费者可以清楚了解产品的来源和流通过程。"
]

class ChineseTextProcessor:
    """Utility for Chinese text: jieba segmentation and keyword extraction."""

    # Common Chinese function words removed during tokenization.
    _STOP_WORDS = frozenset([
        '的', '了', '在', '是', '我', '有', '和', '就', '不', '人', '都', '一', '一个',
        '上', '也', '很', '到', '说', '要', '去', '你', '会', '着', '没有', '看', '好',
        '自己', '这', '能', '那', '来', '用', '把', '让', '更', '为', '可以', '等',
        '中', '通过', '对', '及', '与', '以', '或', '但', '而', '由于', '因为'
    ])

    def __init__(self):
        # Kept as a mutable instance attribute (same public shape as before).
        self.stop_words = set(self._STOP_WORDS)

    def tokenize(self, text):
        """Segment *text* with jieba; drop stop words and single-char tokens."""
        return [w for w in jieba.lcut(text)
                if len(w) > 1 and w not in self.stop_words]

    def extract_keywords(self, text, top_k=10):
        """Return the *top_k* most frequent tokens of *text* as {word: count}."""
        return dict(Counter(self.tokenize(text)).most_common(top_k))

class SimpleBM25Ranker:
    """Simplified BM25 ranker for Chinese text (jieba-tokenized).

    Fixes over the original:
    - ``fit([])`` / ``rank_documents(query, [])`` raised ZeroDivisionError;
      an empty corpus now yields empty statistics and an empty ranking.
    - ``fit`` now resets ``doc_freqs``, so refitting on a new corpus no
      longer accumulates stale IDF entries.
    """

    def __init__(self, k1=1.2, b=0.75):
        # k1: term-frequency saturation; b: document-length normalization strength.
        self.k1 = k1
        self.b = b
        self.processor = ChineseTextProcessor()
        self.corpus_tokens = []   # tokenized corpus, parallel to the fitted documents
        self.doc_freqs = {}       # word -> Okapi IDF value
        self.avg_doc_length = 0

    def fit(self, documents):
        """Build token lists, average document length and the IDF table."""
        self.corpus_tokens = []
        self.doc_freqs = {}
        self.avg_doc_length = 0
        if not documents:
            # Empty corpus: leave statistics empty instead of dividing by zero.
            return

        word_doc_count = {}
        total_length = 0
        for doc in documents:
            tokens = self.processor.tokenize(doc)
            self.corpus_tokens.append(tokens)
            total_length += len(tokens)
            # Document frequency counts each word once per document.
            for word in set(tokens):
                word_doc_count[word] = word_doc_count.get(word, 0) + 1

        self.avg_doc_length = total_length / len(documents)

        corpus_size = len(documents)
        for word, doc_count in word_doc_count.items():
            # Okapi IDF; can go negative for words in more than half the corpus.
            self.doc_freqs[word] = math.log((corpus_size - doc_count + 0.5) / (doc_count + 0.5))

    def score(self, query, doc_tokens):
        """Return the BM25 score of *query* against one tokenized document."""
        query_tokens = self.processor.tokenize(query)
        doc_length = len(doc_tokens)
        # Guard: avg length is 0 only for degenerate corpora; never divide by 0.
        avg_len = self.avg_doc_length or 1
        word_count = Counter(doc_tokens)
        score = 0

        for word in query_tokens:
            if word in word_count:
                tf = word_count[word]
                idf = self.doc_freqs.get(word, 0)
                score += idf * (tf * (self.k1 + 1)) / (
                    tf + self.k1 * (1 - self.b + self.b * (doc_length / avg_len))
                )

        return score

    def rank_documents(self, query, documents, top_k=10):
        """Return up to *top_k* documents ranked by BM25 score (descending)."""
        if not self.corpus_tokens:
            self.fit(documents)   # lazy fit on first use

        scores = []
        for i, doc in enumerate(documents):
            # Reuse fitted tokens when indices line up; otherwise re-tokenize
            # (happens when ranking a corpus different from the fitted one).
            doc_tokens = self.corpus_tokens[i] if i < len(self.corpus_tokens) else self.processor.tokenize(doc)
            scores.append((doc, self.score(query, doc_tokens), i))

        scores.sort(key=lambda x: x[1], reverse=True)

        return [
            {
                'document': doc,
                'bm25_score': score,
                'original_index': idx,
                'rank': rank
            }
            for rank, (doc, score, idx) in enumerate(scores[:top_k], start=1)
        ]

class ChineseSentenceTransformerRanker:
    """Semantic ranker backed by a Chinese sentence-embedding model."""

    def __init__(self, model_name="shibing624/text2vec-base-chinese"):
        # Prefer the Chinese model; fall back to a multilingual one on failure.
        try:
            self.model = SentenceTransformer(model_name)
            print(f"成功加载模型: {model_name}")
        except Exception as e:
            print(f"模型加载失败,使用备用模型: {e}")
            self.model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")

    def rank_documents(self, query, documents, top_k=10):
        """Return up to *top_k* documents ordered by cosine similarity to *query*."""
        print("正在计算文档嵌入...")

        # Embed the query and every candidate document.
        query_vec = self.model.encode([query])
        doc_vecs = self.model.encode(documents, show_progress_bar=True)
        sims = cosine_similarity(query_vec, doc_vecs)[0]

        # Sort document indices by similarity, best first (stable on ties).
        order = sorted(range(len(documents)), key=lambda idx: sims[idx], reverse=True)

        return [
            {
                'document': documents[idx],
                'semantic_score': sims[idx],
                'original_index': idx,
                'rank': rank,
            }
            for rank, idx in enumerate(order[:top_k], start=1)
        ]

class ChineseKeywordRanker:
    """Keyword-overlap ranker: Jaccard + query coverage + length-normalized TF.

    Fix over the original: the TF component was divided by
    ``len(document.split())``, which is meaningless for Chinese (no spaces,
    so the divisor was almost always 1 and the raw TF sum dominated the
    score). It is now normalized by character length, and an empty document
    no longer raises ZeroDivisionError.
    """

    def __init__(self):
        self.processor = ChineseTextProcessor()

    def calculate_keyword_score(self, query, document):
        """Return the combined keyword-match score of *document* for *query*."""
        query_words = set(self.processor.tokenize(query))
        doc_words = set(self.processor.tokenize(document))

        if not query_words:
            return 0

        intersection = query_words & doc_words

        # Jaccard similarity between the two token sets.
        jaccard = len(intersection) / len(query_words | doc_words)

        # Fraction of query terms found in the document.
        coverage = len(intersection) / len(query_words)

        # Simplified TF: substring counts of matched terms, normalized by
        # character length (Chinese text has no whitespace-delimited words).
        tf_score = sum(document.count(word) for word in intersection)
        tf_norm = tf_score / max(len(document), 1)

        return 0.4 * jaccard + 0.4 * coverage + 0.2 * tf_norm

    def rank_documents(self, query, documents, top_k=10):
        """Return up to *top_k* documents ranked by keyword score (descending)."""
        scored = [
            (doc, self.calculate_keyword_score(query, doc), i)
            for i, doc in enumerate(documents)
        ]
        scored.sort(key=lambda x: x[1], reverse=True)

        return [
            {
                'document': doc,
                'keyword_score': score,
                'original_index': idx,
                'rank': rank
            }
            for rank, (doc, score, idx) in enumerate(scored[:top_k], start=1)
        ]

class ChineseEnsembleRanker:
    """Ensemble ranker fusing several base rankers via weighted averaging.

    Fixes over the original:
    - ``normalize_scores`` was defined but never called, so incomparable raw
      scales (e.g. BM25 vs. cosine similarity) were averaged directly; each
      ranker's scores are now min-max normalized before fusion, matching the
      documented design.
    - Scores are matched back to documents by ``original_index`` instead of
      O(n^2) full-text equality, which also breaks on duplicate documents.
    - Dropped an unused ``ChineseTextProcessor`` instance.
    """

    def __init__(self):
        self.rankers = {}   # name -> base ranker instance
        self.weights = {}   # name -> fusion weight

    def add_ranker(self, name, ranker, weight=1.0):
        """Register *ranker* under *name* with the given fusion *weight*."""
        self.rankers[name] = ranker
        self.weights[name] = weight
        print(f"添加排序器: {name}, 权重: {weight}")

    def normalize_scores(self, scores):
        """Min-max normalize *scores* into [0, 1].

        An empty or all-zero list is returned unchanged; a constant non-zero
        list maps to all 1.0 (every document equally relevant).
        """
        if not scores or all(s == 0 for s in scores):
            return scores

        min_score = min(scores)
        max_score = max(scores)

        if max_score == min_score:
            return [1.0] * len(scores)

        return [(s - min_score) / (max_score - min_score) for s in scores]

    def rank_documents(self, query, documents, top_k=10, verbose=True):
        """Rank *documents* for *query* by fusing all registered rankers.

        Each result dict carries the fused ``final_score`` plus the
        per-ranker normalized scores under ``individual_scores``.
        """
        if verbose:
            print(f"\n{'='*50}")
            print(f"查询: {query}")
            print(f"文档数量: {len(documents)}")
            print(f"{'='*50}")

        all_rankings = {}

        # Run every base ranker over the full document list.
        for name, ranker in self.rankers.items():
            if verbose:
                print(f"\n运行排序器: {name}")

            try:
                rankings = ranker.rank_documents(query, documents, top_k=len(documents))
                all_rankings[name] = rankings

                if verbose:
                    print(f"{name} 排序完成,Top 3 结果:")
                    for i, result in enumerate(rankings[:3]):
                        score_key = [k for k in result.keys() if 'score' in k][0]
                        print(f"  {i+1}. 分数: {result[score_key]:.4f}")
                        print(f"     文档: {result['document'][:100]}...")

            except Exception as e:
                # A failing ranker is skipped; the rest still contribute.
                print(f"排序器 {name} 运行失败: {e}")
                continue

        if not all_rankings:
            print("所有排序器都失败了!")
            return []

        # Collect each ranker's scores into a per-document array (indexed by
        # original_index) and normalize so the scales become comparable.
        normalized = {}
        for name, rankings in all_rankings.items():
            raw = [0.0] * len(documents)
            for info in rankings:
                score_keys = [k for k in info.keys() if 'score' in k]
                if score_keys:
                    raw[info['original_index']] = info[score_keys[0]]
            normalized[name] = self.normalize_scores(raw)

        # Only rankers that actually produced results contribute weight.
        total_weight = sum(self.weights[name] for name in all_rankings)

        fused = []
        for i, doc in enumerate(documents):
            per_ranker = {name: normalized[name][i] for name in all_rankings}
            weighted = sum(self.weights[name] * s for name, s in per_ranker.items())
            final = weighted / total_weight if total_weight > 0 else 0
            fused.append({
                'document': doc,
                'final_score': final,
                'original_index': i,
                'individual_scores': per_ranker,
            })

        fused.sort(key=lambda item: item['final_score'], reverse=True)

        results = []
        for rank, item in enumerate(fused[:top_k], start=1):
            item['rank'] = rank
            results.append(item)

        return results

def analyze_query_and_documents():
    """Print keyword statistics for the sample query and every sample document."""
    processor = ChineseTextProcessor()

    print("查询分析:")
    print(f"查询: {SAMPLE_QUERY}")
    print(f"关键词: {processor.extract_keywords(SAMPLE_QUERY)}")

    print("\n文档分析:")
    for doc_no, doc in enumerate(SAMPLE_DOCUMENTS, start=1):
        print(f"\n文档 {doc_no}: {doc[:50]}...")
        keywords = processor.extract_keywords(doc, top_k=5)
        print(f"关键词: {list(keywords.keys())}")

def run_individual_rankers():
    """Demo each individual ranker on the sample query and documents."""
    print("\n" + "=" * 60)
    print("运行各个排序器")
    print("=" * 60)

    def _show(results, label, score_key):
        # Shared result printer for all three rankers.
        for r in results:
            print(f"排名 {r['rank']}: {label}分数 {r[score_key]:.4f}")
            print(f"文档: {r['document'][:100]}...\n")

    # 1. BM25 ranking
    print("\n1. BM25排序结果:")
    _show(SimpleBM25Ranker().rank_documents(SAMPLE_QUERY, SAMPLE_DOCUMENTS, top_k=5),
          "BM25", 'bm25_score')

    # 2. keyword-overlap ranking
    print("\n2. 关键词匹配排序结果:")
    _show(ChineseKeywordRanker().rank_documents(SAMPLE_QUERY, SAMPLE_DOCUMENTS, top_k=5),
          "关键词", 'keyword_score')

    # 3. semantic similarity ranking (may fail if the model cannot load)
    print("\n3. 语义相似度排序结果:")
    try:
        _show(ChineseSentenceTransformerRanker().rank_documents(SAMPLE_QUERY, SAMPLE_DOCUMENTS, top_k=5),
              "语义", 'semantic_score')
    except Exception as e:
        print(f"语义排序失败: {e}")

def run_ensemble_ranking():
    """Demo the weighted ensemble ranker on the sample data."""
    print("\n" + "=" * 60)
    print("集成排序示例")
    print("=" * 60)

    # Assemble the ensemble from the available base rankers.
    ensemble = ChineseEnsembleRanker()
    ensemble.add_ranker('bm25', SimpleBM25Ranker(), weight=0.3)
    ensemble.add_ranker('keyword', ChineseKeywordRanker(), weight=0.3)

    try:
        ensemble.add_ranker('semantic', ChineseSentenceTransformerRanker(), weight=0.4)
    except Exception as e:
        print(f"语义排序器添加失败,跳过: {e}")
        # Semantic ranker unavailable: rebalance the remaining weights.
        ensemble.weights['bm25'] = 0.5
        ensemble.weights['keyword'] = 0.5

    ranked = ensemble.rank_documents(SAMPLE_QUERY, SAMPLE_DOCUMENTS, top_k=5, verbose=True)

    print(f"\n{'='*50}")
    print("最终集成排序结果:")
    print("=" * 50)

    for entry in ranked:
        print(f"\n排名 {entry['rank']}: 综合分数 {entry['final_score']:.4f}")
        print(f"各排序器分数: {entry['individual_scores']}")
        print(f"文档: {entry['document'][:150]}...")

def compare_ranking_methods():
    """Compare all available ranking methods on the sample query.

    Prints a per-document table (rank + score for each method) and a small
    sanity check against hand-labelled relevant documents.

    Fix over the original: the bare ``except:`` around the semantic ranker
    also swallowed SystemExit/KeyboardInterrupt; narrowed to ``Exception``.
    """
    print("\n" + "="*60)
    print("排序方法对比分析")
    print("="*60)

    # Always-available lexical rankers.
    rankers = {
        'BM25': SimpleBM25Ranker(),
        '关键词匹配': ChineseKeywordRanker(),
    }

    try:
        rankers['语义相似度'] = ChineseSentenceTransformerRanker()
    except Exception:
        print("语义相似度排序器不可用")

    all_results = {}

    for name, ranker in rankers.items():
        print(f"\n运行 {name} 排序器...")
        all_results[name] = ranker.rank_documents(SAMPLE_QUERY, SAMPLE_DOCUMENTS, top_k=8)

    # Build one comparison-table row per document.
    comparison_data = []

    for i in range(len(SAMPLE_DOCUMENTS)):
        row = {'文档ID': i+1, '文档摘要': SAMPLE_DOCUMENTS[i][:50] + '...'}

        for method_name, results in all_results.items():
            rank = None
            score = 0

            # Find this document's rank/score in the method's result list.
            for result in results:
                if result['original_index'] == i:
                    rank = result['rank']
                    score_key = [k for k in result.keys() if 'score' in k][0]
                    score = result[score_key]
                    break

            row[f'{method_name}_排名'] = rank if rank else '-'
            row[f'{method_name}_分数'] = f"{score:.3f}" if score else '0.000'

        comparison_data.append(row)

    df = pd.DataFrame(comparison_data)
    print("\n排序方法对比表:")
    print(df.to_string(index=False))

    # Precision-style sanity check against hand-labelled relevant documents.
    print(f"\n基于查询 '{SAMPLE_QUERY}' 的相关性分析:")
    relevant_docs = [0, 2, 4, 6]  # manually labelled relevant document indices
    print(f"预期相关文档ID: {[x+1 for x in relevant_docs]}")

    for method_name, results in all_results.items():
        top3_indices = [r['original_index'] for r in results[:3]]
        relevant_in_top3 = len(set(top3_indices).intersection(set(relevant_docs)))
        print(f"{method_name}: Top3中包含 {relevant_in_top3}/3 个相关文档")

def interactive_ranking_demo():
    """Interactive loop: rank the sample documents for user-typed queries."""
    print("\n" + "=" * 60)
    print("交互式排序演示")
    print("=" * 60)

    # Lexical-only ensemble (no model download needed for the demo).
    ensemble = ChineseEnsembleRanker()
    ensemble.add_ranker('bm25', SimpleBM25Ranker(), weight=0.4)
    ensemble.add_ranker('keyword', ChineseKeywordRanker(), weight=0.6)

    while True:
        print(f"\n当前有 {len(SAMPLE_DOCUMENTS)} 个文档可供检索")
        query = input("\n请输入您的查询(输入 'quit' 退出): ").strip()

        if query.lower() == 'quit':
            break
        if not query:
            print("查询不能为空,请重新输入")
            continue

        print(f"\n正在为查询 '{query}' 排序文档...")
        top_hits = ensemble.rank_documents(query, SAMPLE_DOCUMENTS, top_k=3, verbose=False)

        print("\nTop 3 相关文档:")
        for hit in top_hits:
            print(f"\n排名 {hit['rank']}: 综合分数 {hit['final_score']:.4f}")
            print(f"文档: {hit['document']}")
            print("-" * 80)

def main():
    """End-to-end demo: analysis, individual rankers, ensemble, comparison, REPL."""
    print("RAG文档排序系统 - 中文示例")
    print("=" * 60)

    analyze_query_and_documents()    # 1. inspect query/document keywords
    run_individual_rankers()         # 2. each ranker on its own
    run_ensemble_ranking()           # 3. weighted fusion of all rankers
    compare_ranking_methods()        # 4. side-by-side comparison table

    # 5. optional interactive session
    print("\n是否启动交互式演示?(y/n): ", end="")
    if input().lower() == 'y':
        interactive_ranking_demo()

    print("\n演示完成!")

if __name__ == "__main__":
    # Remind the user of the required third-party packages before running.
    print("请确保已安装以下包:")
    print("pip install sentence-transformers jieba scikit-learn pandas torch")
    print("="*60)
    
    main()

功能演示

数据分析 - 分析查询和文档的关键词特征
单独排序器测试 - 展示每个排序器的结果
集成排序 - 多方法融合的最终结果
方法对比 - 生成对比表格分析各方法效果
交互式演示 - 支持自定义查询测试

预期结果

对于查询"人工智能在医疗领域的应用前景如何?",相关文档应该包括:

文档1:AI在医疗诊断中的作用
文档3:医疗AI革命性改变
文档5:智慧医疗系统
文档7:临床决策支持系统

使用建议

首次运行:会自动下载中文语义模型(约200MB)
模型选择:可根据需要更换其他中文预训练模型
权重调整:可根据实际效果调整各排序器权重
扩展性:可轻松添加新的排序算法

你可能感兴趣的:(大模型,人工智能,算法,搜索引擎)