Dataset
Prof. Xiangnan He's GitHub:
https://github.com/hexiangnan/neural_collaborative_filtering
Let's first look at how the dataset is put together.
Then, today's topic: what load_dataset actually does.
Code first:
Imports:
import os
import math
import heapq
from collections import defaultdict

import numpy as np
import pandas as pd
import scipy.sparse as sp

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data
import torch.backends.cudnn as cudnn
Loading the data:
def load_dataset(test_num=100):
    train_data = pd.read_csv("./ncf_data/ml-1m.train.rating",
                             sep='\t', header=None, names=['user', 'item'],
                             usecols=[0, 1], dtype={0: np.int32, 1: np.int32})
    user_num = train_data['user'].max() + 1
    item_num = train_data['item'].max() + 1
    train_data = train_data.values.tolist()

    # load ratings as a dok matrix
    train_mat = sp.dok_matrix((user_num, item_num), dtype=np.float32)
    for x in train_data:
        train_mat[x[0], x[1]] = 1.0

    test_data = []
    with open("./ncf_data/ml-1m.test.negative", 'r') as fd:
        line = fd.readline()
        while line is not None and line != '':
            arr = line.split('\t')
            u = eval(arr[0])[0]
            test_data.append([u, eval(arr[0])[1]])  # one positive item
            for i in arr[1:]:
                test_data.append([u, int(i)])  # 99 negative items
            line = fd.readline()
    return train_data, test_data, user_num, item_num, train_mat
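A minimal sanity check, assuming the two files sit under ./ncf_data/ as in the code above (the expected ml-1m counts come from the repo's README):

train_data, test_data, user_num, item_num, train_mat = load_dataset()
print(user_num, item_num)   # 6040 3706 for ml-1m
print(len(test_data))       # 6040 users x (1 positive + 99 negatives) = 604000
print(train_mat.nnz)        # one stored entry per observed (user, item) pair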
First, a word about the ml-1m.train.rating file.
Its columns are user, item, rating, and timestamp (I can't remember for sure whether that last one really is a timestamp).
# load ratings as a dok matrix
train_mat = sp.dok_matrix((user_num, item_num), dtype=np.float32)
for x in train_data:
    train_mat[x[0], x[1]] = 1.0
The snippet above turns every rated user-item pair into an entry of a sparse matrix; the underlying data structure looks like (user, item): 1.0.
One thing to add: this is implicit feedback, so even if the user's rating was only 1, the corresponding dictionary entry is still 1.0.
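A tiny stand-alone example of what a dok matrix holds (the interaction pairs are made up):

# A toy 3-user x 4-item matrix; every observed pair maps to 1.0.
mat = sp.dok_matrix((3, 4), dtype=np.float32)
for u, i in [(0, 1), (0, 3), (2, 2)]:
    mat[u, i] = 1.0
print(dict(mat.items()))  # {(0, 1): 1.0, (0, 3): 1.0, (2, 2): 1.0}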
Processing test_data
First, the data format: judging from the parsing code below, each line of ml-1m.test.negative starts with a tuple (user, item), followed by tab-separated item IDs.
A note here: the paper does not spell this out explicitly (or maybe I just missed it).
The tuple at the head of each line is an observed user-item interaction, and the items after it are ones the user never interacted with.
So the code takes the item inside the tuple as the positive example, and the 99 items that follow become negatives for the same user u.
test_data = []
with open("./ncf_data/ml-1m.test.negative", 'r') as fd:
    line = fd.readline()
    while line is not None and line != '':
        arr = line.split('\t')
        u = eval(arr[0])[0]
        test_data.append([u, eval(arr[0])[1]])  # one positive item
        for i in arr[1:]:
            test_data.append([u, int(i)])  # 99 negative items
        line = fd.readline()
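To make the parsing concrete, here is what a single line would yield (this line is hypothetical and shortened; real lines carry 99 negatives):

line = "(0,25)\t1064\t174\t2791\n"
arr = line.split('\t')
print(eval(arr[0]))               # (0, 25): user 0, positive item 25
print([int(i) for i in arr[1:]])  # [1064, 174, 2791]: negative items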
The GMF model
GMF is the Generalized Matrix Factorization model. Let's look at the general framework first.
In the experiments, users and items are one-hot encoded and mapped to latent vectors of dimension factor_num. "Generalized" means this model form has broader uses; it is not tied to one particular task. The code tells the full story:
class GMF(nn.Module):
    def __init__(self, user_num, item_num, factor_num):
        super(GMF, self).__init__()
        '''
        user_num: number of users
        item_num: number of items
        factor_num: embedding (latent vector) dimension
        '''
        self.embed_user_GMF = nn.Embedding(user_num, factor_num)
        self.embed_item_GMF = nn.Embedding(item_num, factor_num)
        self.predict_layer = nn.Linear(factor_num, 1)
        self._init_weight_()

    def _init_weight_(self):
        nn.init.normal_(self.embed_user_GMF.weight, std=0.01)
        nn.init.normal_(self.embed_item_GMF.weight, std=0.01)

    def forward(self, user, item):
        embed_user_GMF = self.embed_user_GMF(user)
        embed_item_GMF = self.embed_item_GMF(item)
        # the GMF part is the element-wise (Hadamard) product of the two
        # embeddings; the linear layer then maps it to a scalar, which
        # generalizes the plain inner product of matrix factorization
        output_GMF = embed_user_GMF * embed_item_GMF
        prediction = self.predict_layer(output_GMF)
        return prediction.view(-1)
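A minimal smoke test of the module, shapes only (the sizes here are arbitrary, not the ml-1m ones):

model = GMF(user_num=100, item_num=200, factor_num=8)
user = torch.LongTensor([0, 1, 2])      # a batch of 3 user indices
item = torch.LongTensor([10, 20, 30])   # the items to score for them
scores = model(user, item)
print(scores.shape)  # torch.Size([3]): one raw score per (user, item) pair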
The MLP model
Here, too, user and item are mapped to embeddings, but the MLP branch uses a different embedding dimension from the one above; after all, it is a different model.
The code:
class MLP(nn.Module):
    def __init__(self, user_num, item_num, factor_num, num_layers, dropout):
        super(MLP, self).__init__()
        self.embed_user_MLP = nn.Embedding(
            user_num, factor_num * (2 ** (num_layers - 1)))
        self.embed_item_MLP = nn.Embedding(
            item_num, factor_num * (2 ** (num_layers - 1)))

        MLP_modules = []
        for i in range(num_layers):
            input_size = factor_num * (2 ** (num_layers - i))
            MLP_modules.append(nn.Dropout(p=dropout))
            MLP_modules.append(nn.Linear(input_size, input_size // 2))
            MLP_modules.append(nn.ReLU())
        # Each layer's definition is collected in a list, and
        # nn.Sequential(*MLP_modules) chains them in list order;
        # every layer uses a ReLU activation.
        self.MLP_layers = nn.Sequential(*MLP_modules)

        self.predict_layer = nn.Linear(factor_num, 1)
        self._init_weight_()

    def _init_weight_(self):
        nn.init.normal_(self.embed_user_MLP.weight, std=0.01)
        nn.init.normal_(self.embed_item_MLP.weight, std=0.01)
        for m in self.MLP_layers:
            if isinstance(m, nn.Linear):
                nn.init.xavier_uniform_(m.weight)
        nn.init.kaiming_uniform_(self.predict_layer.weight,
                                 a=1, nonlinearity='sigmoid')

    def forward(self, user, item):
        embed_user_MLP = self.embed_user_MLP(user)
        embed_item_MLP = self.embed_item_MLP(item)
        interaction = torch.cat((embed_user_MLP, embed_item_MLP), -1)
        output_MLP = self.MLP_layers(interaction)
        prediction = self.predict_layer(output_MLP)
        return prediction.view(-1)
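To see how the tower narrows, take factor_num=8 and num_layers=3 (arbitrary values for illustration): each embedding is 8 * 2^(3-1) = 32 wide, the concatenation is 64, and the linear stack goes 64 -> 32 -> 16 -> 8, followed by the final 8 -> 1 prediction layer. A quick check:

model = MLP(user_num=100, item_num=200, factor_num=8,
            num_layers=3, dropout=0.0)
print(model.MLP_layers)   # Linear layers: 64 -> 32 -> 16 -> 8
scores = model(torch.LongTensor([0, 1]), torch.LongTensor([5, 7]))
print(scores.shape)       # torch.Size([2])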
The NCF model
NCF simply fuses the two models at the second-to-last layer (the layer right before the output). Pay attention here to how the two vectors are fused.
class NCF(nn.Module):
    def __init__(self, user_num, item_num, factor_num, num_layers,
                 dropout, model, GMF_model=None, MLP_model=None):
        super(NCF, self).__init__()
        """
        user_num: number of users;
        item_num: number of items;
        factor_num: number of predictive factors;
        num_layers: the number of layers in MLP model;
        dropout: dropout rate between fully connected layers;
        model: 'MLP', 'GMF', 'NeuMF-end', and 'NeuMF-pre';
        GMF_model: pre-trained GMF weights;
        MLP_model: pre-trained MLP weights.
        """
        self.dropout = dropout
        self.model = model
        self.GMF_model = GMF_model
        self.MLP_model = MLP_model

        self.embed_user_GMF = nn.Embedding(user_num, factor_num)
        self.embed_item_GMF = nn.Embedding(item_num, factor_num)
        self.embed_user_MLP = nn.Embedding(
            user_num, factor_num * (2 ** (num_layers - 1)))
        self.embed_item_MLP = nn.Embedding(
            item_num, factor_num * (2 ** (num_layers - 1)))

        MLP_modules = []
        for i in range(num_layers):
            input_size = factor_num * (2 ** (num_layers - i))
            MLP_modules.append(nn.Dropout(p=self.dropout))
            MLP_modules.append(nn.Linear(input_size, input_size // 2))
            MLP_modules.append(nn.ReLU())
        self.MLP_layers = nn.Sequential(*MLP_modules)

        if self.model in ['MLP', 'GMF']:
            predict_size = factor_num
        else:
            predict_size = factor_num * 2
        self.predict_layer = nn.Linear(predict_size, 1)

        self._init_weight_()

    def _init_weight_(self):
        """ We leave the weights initialization here. """
        if not self.model == 'NeuMF-pre':
            nn.init.normal_(self.embed_user_GMF.weight, std=0.01)
            nn.init.normal_(self.embed_user_MLP.weight, std=0.01)
            nn.init.normal_(self.embed_item_GMF.weight, std=0.01)
            nn.init.normal_(self.embed_item_MLP.weight, std=0.01)

            for m in self.MLP_layers:
                if isinstance(m, nn.Linear):
                    nn.init.xavier_uniform_(m.weight)
            nn.init.kaiming_uniform_(self.predict_layer.weight,
                                     a=1, nonlinearity='sigmoid')

            for m in self.modules():
                if isinstance(m, nn.Linear) and m.bias is not None:
                    m.bias.data.zero_()
        else:
            # embedding layers
            self.embed_user_GMF.weight.data.copy_(
                self.GMF_model.embed_user_GMF.weight)
            self.embed_item_GMF.weight.data.copy_(
                self.GMF_model.embed_item_GMF.weight)
            self.embed_user_MLP.weight.data.copy_(
                self.MLP_model.embed_user_MLP.weight)
            self.embed_item_MLP.weight.data.copy_(
                self.MLP_model.embed_item_MLP.weight)

            # mlp layers
            for (m1, m2) in zip(self.MLP_layers, self.MLP_model.MLP_layers):
                if isinstance(m1, nn.Linear) and isinstance(m2, nn.Linear):
                    m1.weight.data.copy_(m2.weight)
                    m1.bias.data.copy_(m2.bias)

            # predict layers
            predict_weight = torch.cat([
                self.GMF_model.predict_layer.weight,
                self.MLP_model.predict_layer.weight], dim=1)
            predict_bias = self.GMF_model.predict_layer.bias + \
                self.MLP_model.predict_layer.bias

            self.predict_layer.weight.data.copy_(0.5 * predict_weight)
            self.predict_layer.bias.data.copy_(0.5 * predict_bias)

    def forward(self, user, item):
        if not self.model == 'MLP':
            embed_user_GMF = self.embed_user_GMF(user)
            embed_item_GMF = self.embed_item_GMF(item)
            output_GMF = embed_user_GMF * embed_item_GMF
        if not self.model == 'GMF':
            embed_user_MLP = self.embed_user_MLP(user)
            embed_item_MLP = self.embed_item_MLP(item)
            interaction = torch.cat((embed_user_MLP, embed_item_MLP), -1)
            output_MLP = self.MLP_layers(interaction)

        if self.model == 'GMF':
            concat = output_GMF
        elif self.model == 'MLP':
            concat = output_MLP
        else:
            concat = torch.cat((output_GMF, output_MLP), -1)

        prediction = self.predict_layer(concat)
        return prediction.view(-1)
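Finally, a sketch of how the 'NeuMF-pre' path is meant to be used: pre-train GMF and MLP first (the training loops are omitted here, so these "pre-trained" models are just randomly initialized placeholders), then hand their weights to NCF, whose _init_weight_ copies the embeddings and averages the two predict layers.

gmf = GMF(user_num=100, item_num=200, factor_num=8)
mlp = MLP(user_num=100, item_num=200, factor_num=8,
          num_layers=3, dropout=0.0)

# With model='NeuMF-pre', _init_weight_ copies both sets of embeddings and
# concatenates the two 1x8 predict weights into the fused 1x16 layer.
neumf = NCF(user_num=100, item_num=200, factor_num=8, num_layers=3,
            dropout=0.0, model='NeuMF-pre', GMF_model=gmf, MLP_model=mlp)
scores = neumf(torch.LongTensor([0]), torch.LongTensor([42]))
print(scores)  # one fused raw score; train it with BCEWithLogitsLoss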



