Entity Disambiguation (Linking to an Entity Knowledge Base)

disambiguation.py

#!/usr/bin/python3
import pymysql
import json
import requests
from similarity import similarity


def findCandidates(entity):
    # Open the database connection (host, username, password and dbname are
    # assumed to be defined elsewhere as module-level configuration)
    db = pymysql.connect(host=host, port=3306,
                         user=username, passwd=password, db=dbname)
    # Create a cursor object with the cursor() method
    cursor = db.cursor()
    # Parameterized query: fetch persons whose name ends with the entity mention
    sql = "SELECT * FROM t_wikidata_human_content WHERE `name` LIKE %s"
    cursor.execute(sql, ('%' + entity,))
    persons = cursor.fetchall()
    db.close()
    candidates = []
    for person in persons:
        candidates.append({'wiki_id': person[1], 'name': person[2], 'description': person[3]})
    return candidates


def getEntity(query):
    # Call an external NER service and keep only the PERSON mentions
    url = 'http://ip:8018/getNer'
    properties = {'text': query, 'lang': 'en'}
    resp = requests.get(url, params=properties).json()
    entities = []
    for entity in resp:
        if entity['ner'] == "PERSON":
            entities.append(entity)
    return entities


def match(query):
    entities = getEntity(query)
    points = []
    for entity in entities:     # disambiguate each recognized mention independently
        candidates = findCandidates(entity['word'])
        texts = [candidate['description'] for candidate in candidates]
        try:
            scores = similarity(texts, query)
            sort = sorted(enumerate(scores), key=lambda x: x[1])   # sort[-1][0] is the original index of the highest score
            max_index = sort[-1][0]
        except Exception:
            max_index = 0
        try:
            points.append({'wiki_id': candidates[max_index]['wiki_id'], 'name': candidates[max_index]['name'],
                           'begin': entity['begin'], 'end': entity['end']})
        except IndexError:      # no candidate found in the database
            points.append({'wiki_id': -1,
                           'begin': entity['begin'], 'end': entity['end']})
    points = json.dumps(points, indent=4)
    return points
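
A minimal usage sketch (the query string below is made up): match() only returns sensible results once the MySQL connection parameters (host, username, password, dbname) are filled in and the NER service at http://ip:8018/getNer is reachable.

if __name__ == '__main__':
    # Hypothetical example query; requires the NER service and the
    # t_wikidata_human_content table to be available.
    query = "Michael Jordan played basketball for the Chicago Bulls."
    print(match(query))     # JSON list of {wiki_id, name, begin, end} records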

similarity.py

# -*- coding:utf-8 -*-

import codecs
import re
from gensim import corpora, models, similarities
from nltk.tokenize import WordPunctTokenizer


def wordtokenizer(sentence):
    words = WordPunctTokenizer().tokenize(sentence)
    return words


def tokenization(text, stopwordpath):
    # Load the stopword list (one word per line)
    with codecs.open(stopwordpath, 'r', encoding='utf8') as f:
        stopwords = [w.strip() for w in f.readlines()]
    result = []
    # Strip punctuation and lowercase the text before tokenizing
    text = re.sub("[-',{:+}|.()/?!·;]", ' ', text).lower()
    words = wordtokenizer(text)
    for word in words:
        if word not in stopwords:
            result.append(word)
    return result


def similarity(texts, query, stopwordpath='stop.txt'):
    corpus = []
    for text in texts:
        corpus.append(tokenization(text, stopwordpath))
    dictionary = corpora.Dictionary(corpus)     # build the feature dictionary: each distinct word in the corpus gets a unique integer id
    doc_bow = [dictionary.doc2bow(text) for text in corpus]     # doc2bow() counts each distinct word, maps it to its id and returns a sparse bag-of-words vector

    tfidf = models.TfidfModel(doc_bow)      # estimate the IDF weight of every feature
    tfidf_bow = tfidf[doc_bow]      # convert the corpus to TF-IDF vectors
    query = tokenization(query, stopwordpath)
    query_bow = dictionary.doc2bow(query)
    index = similarities.MatrixSimilarity(tfidf_bow)    # cosine-similarity index over the TF-IDF corpus
    sims = index[query_bow]     # similarity of the query to every candidate description
    return sims
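
For a quick sanity check, similarity() can also be exercised on its own. The two candidate descriptions below are invented, and a stop.txt stopword file is assumed to be in the working directory:

if __name__ == '__main__':
    texts = [
        "American basketball player who won six NBA championships with the Chicago Bulls",
        "American researcher in machine learning and professor at UC Berkeley",
    ]
    query = "Michael Jordan led the Bulls to six NBA titles"
    sims = similarity(texts, query)
    print(list(sims))   # one cosine-similarity score per candidate description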
