Used for relation extraction in knowledge graphs. Relation extraction comes in two flavors: pipeline extraction and joint extraction.
Pipeline extraction first uses a sequence-labeling model to extract entities, then runs a text-classification task over entity pairs to extract the relation between them.
Joint extraction feeds the text into a single model that extracts entities and relations at the same time.
Triples are the standard representation in knowledge graphs: entity-relation-entity, entity-attribute-attribute value, entity-tag-tag value. R-BERT can be trained on entity-relation-entity triples.
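The loader below expects one JSON object per line, carrying a triple through the fields context / object / attribute / value. A hypothetical example line (the sentence and field values are illustrative, not taken from the actual dataset):

{"context": "兰博基尼的创始人是费鲁吉欧·兰博基尼。", "object": "兰博基尼", "attribute": "创始人", "value": "费鲁吉欧·兰博基尼"}

Here object is the head entity, value is the tail entity, and attribute is the relation name looked up in the schema file (relations missing from the schema fall back to UNRELATED).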
loader:
# -*- coding: utf-8 -*-
import json
import re
import os
import torch
import random
import jieba
import numpy as np
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer
"""
Data loading
"""
class DataGenerator:
def __init__(self, data_path, config):
self.config = config
self.path = data_path
self.tokenizer = BertTokenizer.from_pretrained(self.config["pretrain_model_path"])
self.sentences = []
self.attribute_schema = json.load(open(config["schema_path"], encoding="utf8"))
self.config["num_labels"] = len(self.attribute_schema)
self.max_length = config["max_length"]
self.load()
print("超出设定最大长度的样本数量:%d, 占比:%.3f"%(self.exceed_max_length, self.exceed_max_length/len(self.data)))
print("由于文本截断,导致实体缺失的样本数量:%d, 占比%.3f"%(self.entity_disapper, self.entity_disapper/len(self.data)))
def load(self):
self.text_data = []
self.data = []
self.exceed_max_length = 0
self.entity_disapper = 0
with open(self.path, encoding="utf8") as f:
for line in f:
sample = json.loads(line)
context = sample["context"]
object = sample["object"]
attribute = sample["attribute"]
value = sample["value"]
if object == "" or value == "":
continue
if attribute not in self.attribute_schema:
attribute = "UNRELATED"
try:
input_id, e1_mask, e2_mask, label = self.process_sentence(context, object, attribute, value)
except IndexError:
self.entity_disapper += 1
continue
self.data.append([torch.LongTensor(input_id),
torch.LongTensor(e1_mask),
torch.LongTensor(e2_mask),
torch.LongTensor([label])])
return
def process_sentence(self, context, object, attribute, value):
if len(context) > self.max_length:
self.exceed_max_length += 1
        object_start = context.index(object) + 1  # BERT prepends [CLS], so char positions shift right by one
        value_start = context.index(value) + 1    # same as above
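        # NOTE: this char-to-token alignment assumes one BERT token per character, which
        # holds for typical Chinese text (hypothetical example: context="宝马是豪车",
        # object="宝马" -> tokens [CLS] 宝 马 是 豪 车 [SEP], so the mask covers positions 1-2);
        # digits or Latin substrings can break the assumption and shift the masks off the entity.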
        input_id = self.tokenizer.encode(context, max_length=self.max_length, padding="max_length", truncation=True)  # padding="max_length" + truncation=True replaces the deprecated pad_to_max_length
        attribute_label = self.attribute_schema[attribute]  # relation label
        # mark the head entity
e1_mask = [0] * len(input_id)
for index in range(object_start, object_start + len(object)):
e1_mask[index] = 1
assert sum(e1_mask) >= 1, (object_start, object, e1_mask, list(range(object_start, object_start+len(object))), context)
        # mark the tail entity
e2_mask = [0] * len(input_id)
for index in range(value_start, value_start + len(value)):
e2_mask[index] = 1
return input_id, e1_mask, e2_mask, attribute_label
def __len__(self):
return len(self.data)
def __getitem__(self, index):
return self.data[index]
def load_schema(self, path):
with open(path, encoding="utf8") as f:
return json.load(f)
# wrap the data with torch's built-in DataLoader
def load_data(data_path, config, shuffle=True):
dg = DataGenerator(data_path, config)
dl = DataLoader(dg, batch_size=config["batch_size"], shuffle=shuffle)
return dl
if __name__ == "__main__":
from config import Config
dg = DataGenerator("../ner_data/train.txt", Config)
model:
# -*- coding: utf-8 -*-
import torch
import torch.nn as nn
from torch.optim import Adam, SGD
from transformers import BertModel
"""
Reproduce the R-BERT paper (relation classification with BERT)
Build the network model structure
"""
class TorchModel(nn.Module):
def __init__(self, config):
super(TorchModel, self).__init__()
self.config = config
self.bert = BertModel.from_pretrained(config["pretrain_model_path"])
self.hidden_size = self.bert.config.hidden_size
self.cls_fc_layer = nn.Linear(self.hidden_size, self.hidden_size)
self.entity_fc_layer = nn.Linear(self.hidden_size, self.hidden_size)
self.num_labels = self.config["num_labels"]
self.label_classifier = nn.Linear(self.hidden_size * 3, self.num_labels)
self.activation = torch.tanh
self.dropout = nn.Dropout(0.5)
    # entity mask looks like: [0,0,1,1,0,0,...]
def entity_average(self, hidden_output, e_mask):
e_mask_unsqueeze = e_mask.unsqueeze(1) # [batch_size, 1, sentence_length]
length_tensor = (e_mask != 0).sum(dim=1).unsqueeze(1) # [batch_size, 1]
# [batch_size, 1, sentence_length] * [b, sentence_length, hidden_size]
# = [batch_size, 1, hidden_size] -> [batch_size, hidden_size]
sum_vector = torch.bmm(e_mask_unsqueeze.float(), hidden_output).squeeze(1)
        avg_vector = sum_vector.float() / length_tensor.float()  # divide by entity length to get the average
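        # worked example (hypothetical): e_mask=[0,1,1,0] over hidden rows h0..h3
        # -> sum_vector = h1 + h2, length_tensor = 2, avg_vector = (h1 + h2) / 2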
return avg_vector
def forward(self, input_ids, e1_mask, e2_mask, labels=None):
outputs = self.bert(input_ids)
sequence_output = outputs[0] # batch, sen_len, hidden_size
pooled_output = outputs[1] # [CLS] batch, hidden_size
        # average the entity token vectors
e1_h = self.entity_average(sequence_output, e1_mask)
e2_h = self.entity_average(sequence_output, e2_mask)
# dropout
e1_h = self.dropout(e1_h)
e2_h = self.dropout(e2_h)
pooled_output = self.dropout(pooled_output)
        # pass through linear layers and activate
pooled_output = self.activation(self.cls_fc_layer(pooled_output))
e1_h = self.activation(self.entity_fc_layer(e1_h))
e2_h = self.activation(self.entity_fc_layer(e2_h))
        # concatenate the [CLS], e1 and e2 vectors
concat_h = torch.cat([pooled_output, e1_h, e2_h], dim=-1)
logits = self.label_classifier(concat_h)
if labels is not None:
loss_fct = nn.CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
return loss
else:
return logits
def choose_optimizer(config, model):
optimizer = config["optimizer"]
learning_rate = config["learning_rate"]
if optimizer == "adam":
return Adam(model.parameters(), lr=learning_rate)
elif optimizer == "sgd":
return SGD(model.parameters(), lr=learning_rate)
if __name__ == "__main__":
from config import Config
model = TorchModel(Config)
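A minimal shape-check sketch for the model above. It assumes Config["pretrain_model_path"] points at a valid local BERT checkpoint; the label count and tensor sizes are made up for illustration:

# -*- coding: utf-8 -*-
import torch
from config import Config
from model import TorchModel

Config["num_labels"] = 10                     # hypothetical; normally set by DataGenerator from the schema
model = TorchModel(Config)
input_ids = torch.randint(1, 1000, (2, 32))   # batch of 2 sequences of length 32
e1_mask = torch.zeros(2, 32, dtype=torch.long)
e1_mask[:, 3:6] = 1                           # pretend the head entity spans positions 3-5
e2_mask = torch.zeros(2, 32, dtype=torch.long)
e2_mask[:, 10:12] = 1                         # pretend the tail entity spans positions 10-11
logits = model(input_ids, e1_mask, e2_mask)   # no labels passed -> returns logits
print(logits.shape)                           # expected: torch.Size([2, 10])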
main:
# -*- coding: utf-8 -*-
import torch
import os
import random
import numpy as np
import logging
from config import Config
from model import TorchModel, choose_optimizer
from evaluate import Evaluator
from loader import load_data
logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
"""
Main training routine
"""
def main(config):
    # create the directory for saving models
if not os.path.isdir(config["model_path"]):
os.mkdir(config["model_path"])
    # load training data
train_data = load_data(config["train_data_path"], config)
    # load the model
model = TorchModel(config)
    # flag whether a GPU is available
cuda_flag = torch.cuda.is_available()
if cuda_flag:
logger.info("gpu可以使用,迁移模型至gpu")
model = model.cuda()
    # load the optimizer
optimizer = choose_optimizer(config, model)
    # load the evaluator
evaluator = Evaluator(config, model, logger)
    # training loop
for epoch in range(config["epoch"]):
epoch += 1
model.train()
logger.info("epoch %d begin" % epoch)
train_loss = []
for index, batch_data in enumerate(train_data):
optimizer.zero_grad()
if cuda_flag:
batch_data = [d.cuda() for d in batch_data]
loss = model(*batch_data)
loss.backward()
optimizer.step()
train_loss.append(loss.item())
            if index % max(1, len(train_data) // 2) == 0:  # guard against tiny datasets where len // 2 == 0
logger.info("batch index %d, loss %f" % (index, loss))
logger.info("epoch average loss: %f" % np.mean(train_loss))
evaluator.eval(epoch)
model_path = os.path.join(config["model_path"], "epoch_%d.pth" % epoch)
torch.save(model.state_dict(), model_path)
return model, train_data
if __name__ == "__main__":
model, train_data = main(Config)
evaluate:
# -*- coding: utf-8 -*-
import torch
import re
import numpy as np
from collections import defaultdict
from loader import load_data
from sklearn.metrics import classification_report
"""
Model evaluation
"""
class Evaluator:
def __init__(self, config, model, logger):
self.config = config
self.model = model
self.logger = logger
self.valid_data = load_data(config["valid_data_path"], config, shuffle=False)
self.attribute_schema = self.valid_data.dataset.attribute_schema
self.index_to_label = dict((y, x) for x, y in self.attribute_schema.items())
def eval(self, epoch):
self.logger.info("开始测试第%d轮模型效果:" % epoch)
self.stats_dict = {"object_acc":0, "attribute_acc": 0, "value_acc": 0, "full_match_acc":0}
self.model.eval()
gold = []
pred = []
for index, batch_data in enumerate(self.valid_data):
if torch.cuda.is_available():
batch_data = [d.cuda() for d in batch_data]
            input_id, e1_mask, e2_mask, labels = batch_data  # adjust here if the inputs/outputs change, e.g. multiple inputs or outputs
            gold += labels.detach().view(-1).tolist()  # flatten [batch, 1] labels to a plain list
with torch.no_grad():
                batch_pred = self.model(input_id, e1_mask, e2_mask)  # no labels passed: predict with the current parameters
batch_pred = torch.argmax(batch_pred, dim=-1)
pred += batch_pred.detach().tolist()
report = classification_report(np.array(gold), np.array(pred)).rstrip().split("\n")
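        # classification_report returns a plain-text table; log only the header row and
        # the trailing summary line to keep the output compact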
self.logger.info(report[0])
self.logger.info(report[-1])
predict:
# -*- coding: utf-8 -*-
import torch
import re
import json
import numpy as np
from collections import defaultdict
from config import Config
from model import TorchModel
from transformers import BertTokenizer
"""
Model prediction
"""
class SentenceLabel:
def __init__(self, config, model_path):
self.config = config
self.attribute_schema = json.load(open(config["schema_path"], encoding="utf8"))
self.index_to_label = dict((y, x) for x, y in self.attribute_schema.items())
config["num_labels"] = len(self.attribute_schema)
self.tokenizer = BertTokenizer.from_pretrained(config["pretrain_model_path"])
self.model = TorchModel(config)
self.model.load_state_dict(torch.load(model_path))
self.model.eval()
print("模型加载完毕!")
def process_sentence(self, context, enti1, enti2):
        enti1_start = context.index(enti1) + 1  # BERT prepends [CLS], so char positions shift right by one
        enti2_start = context.index(enti2) + 1  # same as above
input_id = self.tokenizer.encode(context)
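        # no padding/truncation at inference: a single sentence is processed at a time,
        # so the sequence can keep its natural length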
        # mark the head entity
e1_mask = [0] * len(input_id)
for index in range(enti1_start, enti1_start + len(enti1)):
e1_mask[index] = 1
        # mark the tail entity
e2_mask = [0] * len(input_id)
for index in range(enti2_start, enti2_start + len(enti2)):
e2_mask[index] = 1
return input_id, e1_mask, e2_mask
def predict(self, sentence, enti1, enti2):
input_id, e1_mask, e2_mask = self.process_sentence(sentence, enti1, enti2)
with torch.no_grad():
relation_pred = self.model(torch.LongTensor([input_id]),
torch.LongTensor([e1_mask]),
torch.LongTensor([e2_mask])
)
relation_pred = torch.argmax(relation_pred)
relation = self.index_to_label[int(relation_pred)]
return relation
if __name__ == "__main__":
sl = SentenceLabel(Config, "model_output/epoch_10.pth")
sentence = "可你知道吗,兰博基尼的命名取自创始人“费鲁吉欧·兰博基尼”的名字,而更让人意外的是,兰博基尼刚开始只是一个做拖拉机的!"
e1 = "兰博基尼"
e2 = "费鲁吉欧·兰博基尼"
res = sl.predict(sentence, e1, e2)
print("预测关系:", res)
sentence = "傻丫头郭芙蓉、大女人翠平、励志的杜拉拉,姚晨的角色跳跃很大,是一个颇能适应各种类型题材的职业演员。"
e1 = "姚晨"
e2 = "演员"
res = sl.predict(sentence, e1, e2)
print("预测关系:", res)
3. Collaborative Filtering
import openpyxl
import numpy as np
import time
from collections import defaultdict
'''
Movie rating dataset (ml-100k)
Implement collaborative filtering
'''
# for readability, convert the data into a user-item rating matrix
def build_u2i_matrix(user_item_score_data_path, item_name_data_path, write_file=False):
    # map item id to movie title
item_id_to_item_name = {}
with open(item_name_data_path, encoding="ISO-8859-1") as f:
for line in f:
item_id, item_name = line.split("|")[:2]
item_id = int(item_id)
item_id_to_item_name[item_id] = item_name
total_movie_count = len(item_id_to_item_name)
print("total movie:", total_movie_count)
    # read the ratings file
user_to_rating = {}
with open(user_item_score_data_path, encoding="ISO-8859-1") as f:
for line in f:
user_id, item_id, score, time_stamp = line.split("\t")
user_id, item_id, score = int(user_id), int(item_id), int(score)
if user_id not in user_to_rating:
user_to_rating[user_id] = [0] * total_movie_count
user_to_rating[user_id][item_id - 1] = score
print("total user:", len(user_to_rating))
if not write_file:
return user_to_rating, item_id_to_item_name
    # write to an Excel file for easy inspection
workbook = openpyxl.Workbook()
sheet = workbook.create_sheet(index=0)
    # first row: user_id, movie1, movie2...
header = ["user_id"] + [item_id_to_item_name[i + 1] for i in range(total_movie_count)]
sheet.append(header)
for i in range(len(user_to_rating)):
        # each row: user_id, rating1, rating2...
line = [i + 1] + user_to_rating[i + 1]
sheet.append(line)
workbook.save("user_movie_rating.xlsx")
return user_to_rating, item_id_to_item_name
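# Resulting structure (illustrative): user_to_rating[1] is a list of length total_movie_count,
# e.g. [5, 3, 4, ..., 0] where 0 means "not rated"; in ml-100k, item_id_to_item_name[1] == "Toy Story (1995)".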
# cosine similarity between two vectors (named "distance" here, but it returns similarity)
def cosine_distance(vector1, vector2):
ab = vector1.dot(vector2)
a_norm = np.sqrt(np.sum(np.square(vector1)))
b_norm = np.sqrt(np.sum(np.square(vector2)))
    return ab / (a_norm * b_norm + 1e-8)  # small epsilon guards against all-zero rating vectors
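# Quick sanity check (hypothetical ratings): cosine_distance(np.array([5, 0, 3]), np.array([4, 0, 0]))
# = 20 / (sqrt(34) * 4) ≈ 0.857; identical rating patterns give 1.0, disjoint ones give 0.0.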
# compute item-item similarity from user ratings
def find_similar_item(user_to_rating):
item_to_vector = {}
total_user = len(user_to_rating)
for user, user_rating in user_to_rating.items():
        for movie_id, score in enumerate(user_rating):
            movie_id += 1
            if movie_id not in item_to_vector:
                item_to_vector[movie_id] = [0] * (total_user + 1)
            item_to_vector[movie_id][user] = score
    # item_to_vector now records every user's score per item: same structure as user_to_rating,
    # so we can reuse the user-similarity routine below
    return find_similar_user(item_to_vector)
# judge user-user similarity from their ratings of items
def find_similar_user(user_to_rating):
user_to_similar_user = {}
score_buffer = {}
for user_a, ratings_a in user_to_rating.items():
similar_user = []
for user_b, ratings_b in user_to_rating.items():
            # computing every pair is slow, so skip part of the users
if user_b == user_a or user_b > 100 or user_a > 100:
continue
            # a-b and b-a share the same cosine, no need to recompute
if "%d_%d"%(user_b, user_a) in score_buffer:
similarity = score_buffer["%d_%d"%(user_b, user_a)]
            # similarity is measured with cosine
else:
similarity = cosine_distance(np.array(ratings_a), np.array(ratings_b))
score_buffer["%d_%d" % (user_a, user_b)] = similarity
similar_user.append([user_b, similarity])
similar_user = sorted(similar_user, reverse=True, key=lambda x:x[1])
user_to_similar_user[user_a] = similar_user
return user_to_similar_user
# user-based collaborative filtering
# input: user_id and item_id; output: a predicted rating
# once we can predict ratings, we can score every movie the user has not seen and rank them,
# so implementing the scoring function is enough
# topn = how many similar users to consider
# take the topn most similar users' ratings of this movie
def user_cf(user_id, item_id, user_to_similar_user, user_to_rating, topn=10):
pred_score = 0
count = 0
    for similar_user, similarity in user_to_similar_user[user_id][:topn]:
        # the similar user's rating of this movie
        rating_by_similar_user = user_to_rating[similar_user][item_id - 1]
        # rating * user similarity: a weighting so that more similar users' ratings count more
        pred_score += rating_by_similar_user * similarity
        # if the similar user has not seen the movie, do not count them in the total
        if rating_by_similar_user != 0:
            count += 1
    pred_score /= count + 1e-5
return pred_score
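# Worked example (hypothetical numbers): with topn=2, if the two most similar users rated this
# movie 4 and 5 with similarities 0.9 and 0.8, pred_score = (4*0.9 + 5*0.8) / 2 = 3.8.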
# item-based collaborative filtering, analogous to user_cf (an exercise stub in the
# original; this sketch mirrors user_cf, using the user's own ratings of similar items)
def item_cf(user_id, item_id, similar_items, user_to_rating, topn=10):
    pred_score, count = 0, 0
    for similar_item, similarity in similar_items[item_id][:topn]:
        rating = user_to_rating[user_id][similar_item - 1]  # the user's own rating of the similar item
        pred_score += rating * similarity
        if rating != 0:  # unrated similar items do not count toward the average
            count += 1
    return pred_score / (count + 1e-5)
# full item recall for a single user
def movie_recommand(user_id, similar_user, similar_items, user_to_rating, item_to_name, topn=10):
    # ids of all movies the current user has not seen yet
unseen_items = [item_id + 1 for item_id, rating in enumerate(user_to_rating[user_id]) if rating == 0]
res = []
for item_id in unseen_items:
        # score with user_cf
score = user_cf(user_id, item_id, similar_user, user_to_rating)
        # score with item_cf
# score = item_cf(user_id, item_id, similar_items, user_to_rating)
res.append([item_to_name[item_id], score])
    # sort and output
res = sorted(res, key=lambda x:x[1], reverse=True)
return res[:topn]
if __name__ == "__main__":
user_item_score_data_path = "ml-100k/u.data"
item_name_data_path = "ml-100k/u.item"
user_to_rating, item_to_name = build_u2i_matrix(user_item_score_data_path, item_name_data_path, False)
#user-cf
similar_user = find_similar_user(user_to_rating)
similar_items = find_similar_item(user_to_rating)
# print("相似用户计算完成,耗时:", time.time() - s)
# while True:
# user_id = int(input("输入用户id:"))
# item_id = int(input("输入电影id:"))
# res = user_cf(user_id, item_id, similar_user, user_to_rating)
# print(res)
#为用户推荐电影
while True:
user_id = int(input("输入用户id:"))
recommands = movie_recommand(user_id, similar_user, similar_items, user_to_rating, item_to_name)
for recommand, score in recommands:
print("%.4f\t%s"%(score, recommand))