Milvus vs. ES: Comparing Insertion Approaches

First, load the embedding model in Python and print the dimensionality of a sample embedding:

python -c "from sentence_transformers import SentenceTransformer; model = SentenceTransformer('/root/.cache/modelscope/hub/models/Qwen/Qwen3-Embedding-0.6B'); example_embedding = model.encode('test sentence'); print(example_embedding.shape[0])"
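If the model loads correctly, this prints 1024, the output dimensionality of Qwen3-Embedding-0.6B; that value is used as VECTOR_DIM in both scripts below.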

Environment:

  • ubuntu_22.04_cu124_pytorch, version V1.0.2
  • GPU: RTX 3090 / 24 GB * 1
  • CPU: Intel Xeon 8360Y * 12
  • RAM: 32 GB
  • Disk: 160 GB system disk

Milvus: data insertion timing

from pymilvus import connections, FieldSchema, CollectionSchema, DataType, Collection, utility
import json
from sentence_transformers import SentenceTransformer
from tqdm import tqdm  # progress bars
import time  # timing

# Milvus connection parameters
MILVUS_HOST = "localhost"  # adjust for your Milvus instance
MILVUS_PORT = "19530"      # adjust for your Milvus instance

# Collection parameters
COLLECTION_NAME = "recipe_collection"
VECTOR_DIM = 1024  # actual output dimensionality of Qwen3-Embedding-0.6B

# Text embedding model
EMBEDDING_MODEL_NAME = "/root/.cache/modelscope/hub/models/Qwen/Qwen3-Embedding-0.6B"
model = None  # loaded lazily in insert_data

def connect_milvus():
    """Connect to Milvus."""
    connections.connect("default", host=MILVUS_HOST, port=MILVUS_PORT)
    print(f"Connected to Milvus: {MILVUS_HOST}:{MILVUS_PORT}")

def create_collection():
    """Create the Milvus collection and define its schema."""
    if utility.has_collection(COLLECTION_NAME):
        print(f"Collection '{COLLECTION_NAME}' already exists; dropping the old collection...")
        Collection(COLLECTION_NAME).drop()
        print(f"Old collection '{COLLECTION_NAME}' dropped.")

    fields = [
        FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=True),
        FieldSchema(name="title", dtype=DataType.VARCHAR, max_length=512),        # widened to fit longer titles
        FieldSchema(name="description", dtype=DataType.VARCHAR, max_length=8192), # widened again to fit longer descriptions
        FieldSchema(name="vector", dtype=DataType.FLOAT_VECTOR, dim=VECTOR_DIM)
    ]
    schema = CollectionSchema(fields, description="recipe dataset")
    collection = Collection(name=COLLECTION_NAME, schema=schema)
    print(f"Collection '{COLLECTION_NAME}' created.")
    # Create the vector index
    index_params = {
        "metric_type": "L2",
        "index_type": "IVF_FLAT",
        "params": {"nlist": 128}
    }
    collection.create_index(field_name="vector", index_params=index_params)
    print("Index created.")
    return collection

def insert_data(collection, data_path):
    """Bulk-insert data from a JSON Lines file into Milvus."""
    global model
    if model is None:
        print(f"Loading embedding model: {EMBEDDING_MODEL_NAME}...")
        model = SentenceTransformer(EMBEDDING_MODEL_NAME)
        print("Model loaded.")

    print("Reading data...")
    recipes = []
    with open(data_path, 'r', encoding='utf-8', errors='ignore') as f:
        for line_num, line in enumerate(f):
            try:
                recipes.append(json.loads(line))
            except json.JSONDecodeError as e:
                print(f"Warning: JSON parse error at line {line_num + 1} of {data_path}: {e}. Skipping line.")
    print(f"Finished reading data: {len(recipes)} valid records.")

    if not recipes:
        print("Error: no valid recipe data could be read. Check the JSON file's format and content.")
        return

    # Embed title + description together, in a single pass up front
    print("Generating vectors...")
    titles = [item.get("name", "") for item in recipes]  # 'name' serves as the title
    descriptions = [item.get("description", "") for item in recipes]
    texts_to_embed = [f"{t} {d}" for t, d in zip(titles, descriptions)]
    embeddings = model.encode(texts_to_embed, show_progress_bar=True).tolist()
    print("Vector generation complete.")

    batch_size = 1000
    total_inserted = 0

    start_total_time = time.time()

    for i in tqdm(range(0, len(recipes), batch_size), desc="Insert progress"):
        entities = [
            {"title": titles[j], "description": descriptions[j], "vector": embeddings[j]}
            for j in range(i, min(i + batch_size, len(recipes)))
        ]
        if entities:
            collection.insert(entities)
            total_inserted += len(entities)
            # flush() after every batch seals a segment and forces data to disk;
            # this is expensive, and flushing once after the loop would be far cheaper
            collection.flush()

    end_total_time = time.time()
    print(f"Inserted {total_inserted} records in total. Total time: {end_total_time - start_total_time:.2f} s.")

if __name__ == "__main__":
    connect_milvus()
    collection = create_collection()
    insert_data(collection, "/root/recipe_corpus_full.json")
(base) root@9gpu-com:~# python '/root/milvus_insert.py'
Connected to Milvus: localhost:19530
Collection 'recipe_collection' already exists; dropping the old collection...
Old collection 'recipe_collection' dropped.
Collection 'recipe_collection' created.
Index created.
Loading embedding model: /root/.cache/modelscope/hub/models/Qwen/Qwen3-Embedding-0.6B...
Model loaded.
Reading data...
Warning: JSON parse error at line 211588 of /root/recipe_corpus_full.json: Unterminated string starting at: line 1 column 179 (char 178). Skipping line.
Finished reading data: 211587 valid records.
Generating vectors...
Batches: 100%|████████████████████████████████████| 6613/6613 [12:35<00:00,  8.75it/s]
Vector generation complete.
Insert progress: 100%|█████████████████████████████████████| 212/212 [38:45<00:00, 10.97s/it]
Inserted 211587 records in total. Total time: 2325.87 s.
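To sanity-check what was inserted, here is a minimal search sketch (my own addition, not part of the timed run; the query text and nprobe value are illustrative):

from pymilvus import connections, Collection
from sentence_transformers import SentenceTransformer

connections.connect("default", host="localhost", port="19530")
collection = Collection("recipe_collection")
collection.load()  # the collection must be loaded into memory before searching

model = SentenceTransformer("/root/.cache/modelscope/hub/models/Qwen/Qwen3-Embedding-0.6B")
query_vector = model.encode("tomato and egg stir-fry").tolist()  # illustrative query

results = collection.search(
    data=[query_vector],
    anns_field="vector",
    param={"metric_type": "L2", "params": {"nprobe": 10}},  # metric must match the index
    limit=5,
    output_fields=["title"],
)
for hit in results[0]:
    print(hit.entity.get("title"), hit.distance)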

ES: data insertion timing

from elasticsearch import Elasticsearch, helpers
import json
from tqdm import tqdm
import time
import requests  # used for a raw HTTP connectivity check
from sentence_transformers import SentenceTransformer  # embedding model

# Elasticsearch connection parameters
ES_HOST = "127.0.0.1"  # use an explicit IP address
ES_PORT = 9200
ES_INDEX = "recipes_with_vectors"  # new index name, to distinguish it from the earlier text-only index

# Text embedding model
EMBEDDING_MODEL_NAME = "/root/.cache/modelscope/hub/models/Qwen/Qwen3-Embedding-0.6B"
VECTOR_DIM = 1024  # output dimensionality of Qwen3-Embedding-0.6B
model = None  # loaded lazily in insert_data_to_es

def test_es_connection_with_requests():
    """Test the Elasticsearch connection with the requests library."""
    url = f"http://{ES_HOST}:{ES_PORT}"
    print(f"Testing connection to Elasticsearch with requests: {url}...")
    try:
        response = requests.get(url, timeout=5)
        response.raise_for_status()  # raise HTTPError for non-2xx status codes
        print(f"requests connected to Elasticsearch: {response.json()}")
        return True
    except requests.exceptions.RequestException as e:
        print(f"requests failed to connect to Elasticsearch! Details: {e}")
        return False

def connect_elasticsearch():
    """Connect to Elasticsearch."""
    es = Elasticsearch(f"http://{ES_HOST}:{ES_PORT}")
    try:
        if not es.ping():
            raise ValueError("Elasticsearch ping failed! The service may not be running or reachable.")
    except Exception as e:
        raise ValueError(f"Failed to connect to Elasticsearch! Details: {e}")
    print(f"Connected to Elasticsearch: {ES_HOST}:{ES_PORT}")
    return es

def create_es_index(es_client, index_name):
    """Create the Elasticsearch index and define its mapping (including a dense_vector field)."""
    if es_client.indices.exists(index=index_name):
        print(f"Elasticsearch index '{index_name}' already exists; deleting the old index...")
        es_client.indices.delete(index=index_name)
        print(f"Old index '{index_name}' deleted.")

    mapping = {
        "properties": {
            "name": {"type": "text"},
            "dish": {"type": "keyword"},
            "description": {"type": "text"},
            "recipeIngredient": {"type": "text"},
            "recipeInstructions": {"type": "text"},
            "author": {"type": "keyword"},
            "keywords": {"type": "keyword"},
            "vector": {"type": "dense_vector", "dims": VECTOR_DIM}  # dense_vector field for embeddings
        }
    }
    print(f"Creating dense_vector field with VECTOR_DIM: {VECTOR_DIM}.")  # debug print
    es_client.indices.create(index=index_name, body={"mappings": mapping})  # still uses the deprecated body= parameter
    print(f"Elasticsearch index '{index_name}' created.")

def generate_actions(recipes_list, index_name, embeddings):
    """Yield bulk-insert action dicts, each carrying its embedding vector."""
    for i, doc in enumerate(recipes_list):
        yield {
            "_index": index_name,
            "_source": {
                "name": doc.get("name", ""),
                "dish": doc.get("dish", ""),
                "description": doc.get("description", ""),
                "recipeIngredient": doc.get("recipeIngredient", ""),
                "recipeInstructions": doc.get("recipeInstructions", ""),
                "author": doc.get("author", ""),
                "keywords": doc.get("keywords", ""),
                "vector": embeddings[i].tolist()  # attach the embedding
            }
        }

def insert_data_to_es(es_client, data_path, index_name):
    """Bulk-insert data (with vectors) from a JSON Lines file into Elasticsearch."""
    global model
    if model is None:
        print(f"Loading embedding model: {EMBEDDING_MODEL_NAME}...")
        model = SentenceTransformer(EMBEDDING_MODEL_NAME)
        print("Model loaded.")

    print("Reading data...")
    valid_recipes = []
    with open(data_path, 'r', encoding='utf-8', errors='ignore') as f:
        for line_num, line in enumerate(f):
            try:
                valid_recipes.append(json.loads(line))
            except json.JSONDecodeError as e:
                print(f"Warning: JSON parse error at line {line_num + 1} of {data_path}: {e}. Skipping line.")
    print(f"Finished reading data: {len(valid_recipes)} valid records.")

    if not valid_recipes:
        print("Error: no valid recipe data could be read. Check the JSON file's format and content.")
        return

    print("Generating vectors...")
    texts_to_embed = [f"{doc.get('name', '')} {doc.get('description', '')}" for doc in valid_recipes]
    embeddings = model.encode(texts_to_embed, show_progress_bar=True)  # show embedding progress
    print("Vector generation complete.")

    actions_generator = generate_actions(valid_recipes, index_name, embeddings)

    start_total_time = time.time()

    success_count = 0
    for ok, item in tqdm(helpers.streaming_bulk(es_client, actions_generator, chunk_size=1000, request_timeout=60),
                         total=len(valid_recipes), desc="Elasticsearch insert progress"):
        if not ok:
            print(f"Insert failed: {item}")
        else:
            success_count += 1

    end_total_time = time.time()
    print(f"\nInserted {success_count} records into Elasticsearch successfully. Total time: {end_total_time - start_total_time:.2f} s.")

if __name__ == "__main__":
    try:
        if not test_es_connection_with_requests():
            print("requests could not reach Elasticsearch; aborting.")
        else:
            es = connect_elasticsearch()
            create_es_index(es, ES_INDEX)
            insert_data_to_es(es, "/root/recipe_corpus_full.json", ES_INDEX)
    except Exception as e:
        print(f"An error occurred: {e}")
(base) root@9gpu-com:~# python es_insert.py
Testing connection to Elasticsearch with requests: http://127.0.0.1:9200...
requests connected to Elasticsearch: {'name': '9gpu-com', 'cluster_name': 'elasticsearch', 'cluster_uuid': 'g-C_9E91Qp6E9WhcJGovhg', 'version': {'number': '7.11.1', 'build_flavor': 'default', 'build_type': 'tar', 'build_hash': 'ff17057114c2199c9c1bbecc727003a907c0db7a', 'build_date': '2021-02-15T13:44:09.394032Z', 'build_snapshot': False, 'lucene_version': '8.7.0', 'minimum_wire_compatibility_version': '6.8.0', 'minimum_index_compatibility_version': '6.0.0-beta1'}, 'tagline': 'You Know, for Search'}
Connected to Elasticsearch: 127.0.0.1:9200
Creating dense_vector field with VECTOR_DIM: 1024.
/root/es_insert.py:62: DeprecationWarning: The 'body' parameter is deprecated for the 'create' API and will be removed in a future version. Instead use API parameters directly. See https://github.com/elastic/elasticsearch-py/issues/1698 for more information
  es_client.indices.create(index=index_name, body={"mappings": mapping})  # still uses the deprecated body= parameter
Elasticsearch index 'recipes_with_vectors' created.
Loading embedding model: /root/.cache/modelscope/hub/models/Qwen/Qwen3-Embedding-0.6B...
Model loaded.
Reading data...
Warning: JSON parse error at line 211588 of /root/recipe_corpus_full.json: Unterminated string starting at: line 1 column 179 (char 178). Skipping line.
Finished reading data: 211587 valid records.
Generating vectors...
Batches: 100%|████████████████████████████████████| 6613/6613 [12:34<00:00,  8.76it/s]
Vector generation complete.
Elasticsearch insert progress: 100%|████████████████| 211587/211587 [05:35<00:00, 630.11it/s]

Inserted 211587 records into Elasticsearch successfully. Total time: 335.79 s.
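And a matching search sketch on the ES side (again my own addition; ES 7.x does not build an ANN index on dense_vector fields, so the usual approach is a brute-force script_score query, and the query text is illustrative):

from elasticsearch import Elasticsearch
from sentence_transformers import SentenceTransformer

es = Elasticsearch("http://127.0.0.1:9200")
model = SentenceTransformer("/root/.cache/modelscope/hub/models/Qwen/Qwen3-Embedding-0.6B")
query_vector = model.encode("tomato and egg stir-fry").tolist()  # illustrative query

resp = es.search(
    index="recipes_with_vectors",
    body={
        "size": 5,
        "query": {
            "script_score": {
                "query": {"match_all": {}},
                "script": {
                    # convert L2 distance into a positive score (ES scores must be non-negative)
                    "source": "1 / (1 + l2norm(params.query_vector, 'vector'))",
                    "params": {"query_vector": query_vector}
                }
            }
        }
    }
)
for hit in resp["hits"]["hits"]:
    print(hit["_source"]["name"], hit["_score"])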

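Milvus vs. ES: timing summary

For the same 211,587 records, embedding generation took roughly 12.5 minutes in both runs. Insertion itself took 2325.87 s in Milvus (about 91 records/s, with a flush() after every 1000-record batch) versus 335.79 s with Elasticsearch's streaming_bulk (about 630 records/s). The comparison is not entirely apples-to-apples: calling flush() per batch is a known Milvus anti-pattern, and flushing once after the loop would likely close much of the gap.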