# 比赛地址
# 这是用最基本的基于物品协同过滤算法实现的图书推荐。
import random
import numpy as np
import pandas as pd
import math
from operator import itemgetter
import logging
# Configure root logging so progress messages carry a timestamp and a level.
logging.basicConfig(level=logging.INFO, format='%(asctime)-15s %(levelname)s: %(message)s')

# Base directory of the competition files (machine-specific; adjust locally).
path = '/Users/Desktop/比赛/图书推荐系统'

# train: interaction log (the code below reads its user_id / item_id columns).
# test:  supplies the user_id values to produce recommendations for.
# sub:   submission template whose item_id column is overwritten at the end.
train = pd.read_csv(path + '/dataset/train_dataset.csv')
test = pd.read_csv(path + '/dataset/test_dataset.csv')
sub = pd.read_csv(path + '/dataset/submission.csv')
logging.info("打印完毕")

# Work on a copy so `train` stays untouched, and give every interaction the
# same rating of 1 (the similarity code below only needs presence/absence).
data = train.copy()
data['rating'] = 1

# NOTE: the notebook-style preview cells `data.head(5)` and
# `data.pivot(index='user_id', columns='item_id', values='rating')` were
# removed. Their results were discarded, the pivot materialises a dense and
# almost entirely empty user x item frame, and DataFrame.pivot raises
# ValueError when a (user_id, item_id) pair occurs more than once. That
# sparsity is the reason the data is kept in nested dicts from here on.
# Random hold-out split of the interaction rows.
# Both containers have the shape {user_id: {item_id: rating}}.
pivot = 0.75  # probability that a row is assigned to the training part
trainSet, testSet = {}, {}
trainSet_len = testSet_len = 0

for row in data.itertuples():
    user, item, rating = row.user_id, row.item_id, row.rating
    # Exactly one random draw per row decides which side receives it.
    if random.random() >= pivot:
        testSet.setdefault(user, {})[item] = rating
        testSet_len += 1
    else:
        trainSet.setdefault(user, {})[item] = rating
        trainSet_len += 1
# Popularity of each item = number of training users who interacted with it.
# trainSet maps user -> {item_id: rating}; only the item keys matter here.
item_popular = {}
for items in trainSet.values():
    for item in items:
        item_popular[item] = item_popular.get(item, 0) + 1

# Number of distinct items seen in the training part.
item_count = len(item_popular)
# Fixed: the original printed the undefined name `movie_count` (NameError).
print('Total movie number = %d' % item_count)
# Build the item-item similarity table: {item: {other_item: similarity}}.
print('Build user co-rated items matrix ...')
item_sim_matrix = {}

# Pass 1: co-occurrence counts. For every ordered pair of distinct items in
# one user's history, count how many users interacted with both. This count
# is the numerator of the cosine similarity.
for items in trainSet.values():
    for m1 in items:
        for m2 in items:
            if m1 != m2:
                co_counts = item_sim_matrix.setdefault(m1, {})
                co_counts[m2] = co_counts.get(m2, 0) + 1

# Pass 2: turn each count into a cosine similarity by dividing by the
# geometric mean of the two items' popularities. Only values of existing keys
# are overwritten, so mutating while iterating is safe.
for m1, related_items in item_sim_matrix.items():
    for m2, co_count in related_items.items():
        if item_popular[m1] != 0 and item_popular[m2] != 0:
            similarity = co_count / math.sqrt(item_popular[m1] * item_popular[m2])
        else:
            # Guard against a zero popularity so the division cannot fail.
            similarity = 0
        related_items[m2] = similarity
# Produce recommendations for every user listed in the test file.
user_lst = test['user_id'].tolist()

k = 208  # number of most-similar neighbours considered per history item
n = 10   # number of items recommended per user

# Flat list of (item_id, score) tuples, n consecutive entries per user, in
# the order of user_lst.
result = []

# Items ordered from most to least popular. Used only to top up users whose
# history yields fewer than n candidates (e.g. users absent from trainSet).
popular_items = [
    item for item, _ in sorted(item_popular.items(), key=itemgetter(1), reverse=True)
]

# The top-k neighbour list depends only on the item, so compute it once per
# item instead of re-sorting for every (user, item) pair.
top_neighbours = {}

for user in user_lst:
    rank = {}
    # Items the user interacted with in the training part. A user may be
    # missing from trainSet (random split / unseen user): treat as empty
    # rather than raising KeyError.
    watched_items = trainSet.get(user, {})
    # Fixed: the original iterated over the undefined name `watched_movies`.
    for item, rating in watched_items.items():
        neighbours = top_neighbours.get(item)
        if neighbours is None:
            # An item that never co-occurred with another has no entry in
            # item_sim_matrix; fall back to an empty neighbour list.
            neighbours = sorted(
                item_sim_matrix.get(item, {}).items(),
                key=itemgetter(1),
                reverse=True,
            )[:k]
            top_neighbours[item] = neighbours
        for related_item, w in neighbours:
            # Never recommend something the user already has.
            if related_item in watched_items:
                continue
            # Score = sum over history items of similarity * rating.
            rank[related_item] = rank.get(related_item, 0) + w * float(rating)

    # Best n candidates by accumulated score.
    rec_items = sorted(rank.items(), key=itemgetter(1), reverse=True)[:n]

    # Top up with popular unseen items so that every user contributes exactly
    # n entries whenever enough items exist. Without this, a user with fewer
    # candidates shifts every later row of the submission.
    # NOTE(review): assumes the submission expects n rows per test user -- confirm.
    if len(rec_items) < n:
        chosen = {item for item, _ in rec_items}
        for item in popular_items:
            if len(rec_items) >= n:
                break
            if item in watched_items or item in chosen:
                continue
            rec_items.append((item, 0.0))
            chosen.add(item)

    result.extend(rec_items)
# Write the submission file.
# `result` holds (item_id, score) tuples; only the ids are submitted.
r = [entry[0] for entry in result]

# Assigning a plain list to a column requires len(r) == len(sub).
sub['item_id'] = r

output_file = path + '/result/ItemCF.csv'
sub.to_csv(output_file)
# 线上得分:0.02109538784