Code: https://github.com/zhougr1993/DeepInterestNetwork
Data used in the paper:
http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Electronics_5.json.gz
http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/meta_Electronics.json.gz
Create a raw_data folder under the DeepInterestNetwork directory, unpack the downloaded files, and put them into raw_data.
First, run 1_convert_pd.py:
import pickle
import pandas as pd

# Convert the raw JSON into pandas DataFrames and save them as pickle files.
# Why pickle: it is a binary format, so reloading is much faster than re-parsing the JSON.
'''
(1) Convert reviews_Electronics_5.json to a DataFrame whose columns are reviewerID, asin, reviewerName, etc.
(2) Convert meta_Electronics.json to a DataFrame, keeping only the items that appear in the reviews file, deduplicated.
(3) Save the converted DataFrames in pkl format.
'''
def to_df(file_path):
    with open(file_path, 'r') as fin:
        df = {}
        i = 0
        for line in fin:
            df[i] = eval(line)
            i += 1
        df = pd.DataFrame.from_dict(df, orient='index')
        print(df)
        return df

reviews_df = to_df('../raw_data/reviews_Electronics_5.json')
with open('../raw_data/reviews.pkl', 'wb') as f:
    # pickle.dump(obj, file[, protocol]) saves obj to file; protocol is the serialization protocol version
    pickle.dump(reviews_df, f, pickle.HIGHEST_PROTOCOL)

meta_df = to_df('../raw_data/meta_Electronics.json')
meta_df = meta_df[meta_df['asin'].isin(reviews_df['asin'].unique())]
# reviews_df['asin'].unique() returns all unique values of the column as a numpy.ndarray
meta_df = meta_df.reset_index(drop=True)  # reset the index
# print(meta_df)
with open('../raw_data/meta.pkl', 'wb') as f:
    pickle.dump(meta_df, f, pickle.HIGHEST_PROTOCOL)
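A note on the eval() call above: the SNAP metadata file in particular is written as Python dict literals (single quotes), which strict json.loads rejects, hence eval(). If you would rather not eval raw text, ast.literal_eval parses the same literals without executing code. A minimal sketch (to_df_safe is a hypothetical helper, not part of the repo, assuming every line is a Python literal):

import ast
import pandas as pd

def to_df_safe(file_path):
    # Same output as to_df above, but literal_eval only accepts Python
    # literals, so a malicious line cannot execute arbitrary code.
    records = []
    with open(file_path, 'r') as fin:
        for line in fin:
            records.append(ast.literal_eval(line))
    return pd.DataFrame(records)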
Next, run 2_remap_id.py:
import random
import pickle
import numpy as np

# Remap the asin, categories, and reviewerID fields to positional indices; the remapping is done by build_map.
# What build_map does: sort the unique ids of a column, then replace each id with its position in the sorted order.
'''
(1) Keep only the reviewerID, asin, unixReviewTime columns of reviews_df;
(2) Keep only the asin, categories columns of meta_df, and keep only the final category level
    (at this point the data involves just 5 columns: (reviewerID, asin, unixReviewTime) and (asin, categories));
'''
random.seed(1234)

with open('../raw_data/reviews.pkl', 'rb') as f:
    reviews_df = pickle.load(f)
    reviews_df = reviews_df[['reviewerID', 'asin', 'unixReviewTime']]
with open('../raw_data/meta.pkl', 'rb') as f:
    meta_df = pickle.load(f)
    meta_df = meta_df[['asin', 'categories']]
# keep only the last level of the last category path
meta_df['categories'] = meta_df['categories'].map(lambda x: x[-1][-1])

def build_map(df, col_name):
    key = sorted(df[col_name].unique().tolist())
    # zip() pairs the elements of its iterables into tuples, e.g. (some_asin, 0)
    m = dict(zip(key, range(len(key))))
    df[col_name] = df[col_name].map(lambda x: m[x])
    return m, key

# (3) Build three maps (asin_map, cate_map, revi_map) from asin, categories, and reviewerID.
# The key is the original value; the value is its index after sorting (0-based).
# The corresponding columns of the original data are then replaced by these indices.
asin_map, asin_key = build_map(meta_df, 'asin')
cate_map, cate_key = build_map(meta_df, 'categories')
revi_map, revi_key = build_map(reviews_df, 'reviewerID')
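To make build_map concrete, here is a toy run (the asin values below are made up for illustration):

import pandas as pd
toy = pd.DataFrame({'asin': ['B003', 'B001', 'B003', 'B002']})
m, key = build_map(toy, 'asin')
print(m)                     # {'B001': 0, 'B002': 1, 'B003': 2}
print(key)                   # ['B001', 'B002', 'B003']
print(toy['asin'].tolist())  # [2, 0, 2, 1] -- the column is remapped in place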
# print("asin_map:","n",asin_map)
# print("cate_map:","n",cate_map)
# print("revi_map:","n",revi_map)
user_count, item_count, cate_count, example_count =
len(revi_map), len(asin_map), len(cate_map), reviews_df.shape[0]
print('user_count: %dtitem_count: %dtcate_count: %dtexample_count: %d' %
(user_count, item_count, cate_count, example_count))
# (4) Sort meta_df by the (now remapped) asin index
meta_df = meta_df.sort_values('asin')
meta_df = meta_df.reset_index(drop=True)
# print(meta_df)
# (5) Replace asin in reviews_df with its index from asin_map,
reviews_df['asin'] = reviews_df['asin'].map(lambda x: asin_map[x])
# then sort by reviewerID and timestamp.
reviews_df = reviews_df.sort_values(['reviewerID', 'unixReviewTime'])
reviews_df = reviews_df.reset_index(drop=True)
reviews_df = reviews_df[['reviewerID', 'asin', 'unixReviewTime']]
# print(reviews_df)
# (6) Build cate_list: the 'categories' column of meta_df, i.e. position i holds the category id of item i.
cate_list = [meta_df['categories'][i] for i in range(len(asin_map))]
cate_list = np.array(cate_list, dtype=np.int32)
# print(cate_list)

with open('../raw_data/remap.pkl', 'wb') as f:
    # pickle.dump saves each object to the file in turn
    pickle.dump(reviews_df, f, pickle.HIGHEST_PROTOCOL)  # uid, iid
    pickle.dump(cate_list, f, pickle.HIGHEST_PROTOCOL)  # cid of iid line
    pickle.dump((user_count, item_count, cate_count, example_count),
                f, pickle.HIGHEST_PROTOCOL)
    pickle.dump((asin_key, cate_key, revi_key), f, pickle.HIGHEST_PROTOCOL)
Third, run build_dataset.py:
import random
import pickle

random.seed(1234)

with open('../raw_data/remap.pkl', 'rb') as f:
    reviews_df = pickle.load(f)
    cate_list = pickle.load(f)
    user_count, item_count, cate_count, example_count = pickle.load(f)

# pos_list is the list of item ids each user clicked, e.g. [8], [9, 6, 4, 5], [3], [8]
train_set = []
test_set = []
# (1) Group reviews_df by reviewerID
for reviewerID, hist in reviews_df.groupby('reviewerID'):
    # print(reviewerID)
    # print(hist)
    # (2) The asin column of hist is the positive-sample list (pos_list) of this reviewerID (the user).
    # Note that asin here is no longer the raw item id but the index produced by asin_map.
    # The negative-sample list (neg_list) holds random ids in [0, item_count) that are not in pos_list.
    pos_list = hist['asin'].tolist()

    def gen_neg():
        # start from the first item the user clicked
        neg = pos_list[0]
        while neg in pos_list:
            # draw a random item id; item_count - 1 is the largest item index
            neg = random.randint(0, item_count - 1)
        return neg

    neg_list = [gen_neg() for i in range(len(pos_list))]
    # this loop only runs if the user clicked more than one item
    for i in range(1, len(pos_list)):
        hist = pos_list[:i]
        '''
        The if below controls the number and format of the samples. For example, if a
        user clicked the four items a, b, c, d, the generated samples are (X is a
        randomly drawn item id):
        (user_id, [a], b, 1), (user_id, [a], X, 0), (user_id, [a, b], c, 1),
        (user_id, [a, b], X, 0), plus the test sample below.
        '''
        if i != len(pos_list) - 1:
            train_set.append((reviewerID, hist, pos_list[i], 1))
            train_set.append((reviewerID, hist, neg_list[i], 0))
        # test-set format: (user_id, [a, b, c], (d, X))
        else:
            label = (pos_list[i], neg_list[i])
            test_set.append((reviewerID, hist, label))
# Users who clicked fewer than two items would generate no samples at all; the 5-core
# data guarantees at least 5 reviews per user, so none are dropped and the assert below holds.

random.shuffle(train_set)
random.shuffle(test_set)
assert len(test_set) == user_count
# assert(len(test_set) + len(train_set) // 2 == reviews_df.shape[0])

with open('dataset.pkl', 'wb') as f:
    pickle.dump(train_set, f, pickle.HIGHEST_PROTOCOL)
    pickle.dump(test_set, f, pickle.HIGHEST_PROTOCOL)
    pickle.dump(cate_list, f, pickle.HIGHEST_PROTOCOL)
    pickle.dump((user_count, item_count, cate_count), f, pickle.HIGHEST_PROTOCOL)
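To make the windowing above concrete, here is a toy trace of the inner loop for one user (hypothetical remapped ids; negatives fixed by hand instead of calling gen_neg(), so the output is deterministic):

pos_list = [10, 11, 12, 13]   # hypothetical items a, b, c, d
neg_list = [90, 91, 92, 93]   # pretend gen_neg() returned these
train, test = [], []
for i in range(1, len(pos_list)):
    hist = pos_list[:i]
    if i != len(pos_list) - 1:
        train.append(('uid', hist, pos_list[i], 1))
        train.append(('uid', hist, neg_list[i], 0))
    else:
        test.append(('uid', hist, (pos_list[i], neg_list[i])))
print(train)  # [('uid', [10], 11, 1), ('uid', [10], 91, 0), ('uid', [10, 11], 12, 1), ('uid', [10, 11], 92, 0)]
print(test)   # [('uid', [10, 11, 12], (13, 93))]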
Fourth, run train.py in the din folder:
import os
import time
import pickle
import random
import numpy as np
import tensorflow as tf
import sys
from din.input import DataInput, DataInputTest
from din.model import Model
os.environ['CUDA_VISIBLE_DEVICES'] = '1'
random.seed(1234)
np.random.seed(1234)
tf.set_random_seed(1234)
train_batch_size = 32
test_batch_size = 512
predict_batch_size = 32
predict_users_num = 1000
predict_ads_num = 100
with open('dataset.pkl', 'rb') as f:
    train_set = pickle.load(f)
    test_set = pickle.load(f)
    cate_list = pickle.load(f)
    user_count, item_count, cate_count = pickle.load(f)
best_auc = 0.0

def calc_auc(raw_arr):
    # sort by predicted value, ascending (sorted() defaults to ascending order)
    arr = sorted(raw_arr, key=lambda d: d[2])
    auc = 0.0
    fp1, tp1, fp2, tp2 = 0.0, 0.0, 0.0, 0.0
    for record in arr:
        fp2 += record[0]  # noclick
        tp2 += record[1]  # click
        auc += (fp2 - fp1) * (tp2 + tp1)
        fp1, tp1 = fp2, tp2
    # degenerate case: everything clicked or nothing clicked, discard
    threshold = len(arr) - 1e-3
    if tp2 > threshold or fp2 > threshold:
        return -0.5
    if tp2 * fp2 > 0.0:  # normal auc
        return (1.0 - auc / (2.0 * tp2 * fp2))
    else:
        return None
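As a sanity check on calc_auc, a perfectly ranked pair should give 1.0 and a reversed pair 0.0 (toy records, each in the [noclick, click, score] format that _auc_arr produces below):

perfect = [[1, 0, 0.1], [0, 1, 0.9]]    # the positive got the higher score
inverted = [[1, 0, 0.9], [0, 1, 0.1]]   # the negative got the higher score
print(calc_auc(perfect))   # 1.0
print(calc_auc(inverted))  # 0.0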
def _auc_arr(score):
    '''
    score_p_and_n looks like:
    [[0.6690283  0.66794145]
     [0.6697357  0.66776145]
     [0.6706023  0.6694659 ]
     ...
     [0.66957724 0.6703156 ]
     [0.67001593 0.6686039 ]
     [0.66936827 0.66843873]]
    '''
    score_p = score[:, 0]
    score_n = score[:, 1]
    score_arr = []
    for s in score_p.tolist():
        score_arr.append([0, 1, s])
    for s in score_n.tolist():
        score_arr.append([1, 0, s])
    return score_arr  # e.g. [[0, 1, 0.6690282821655273], [0, 1, 0.6697357296943665], ...]
def _eval(sess, model):
    auc_sum = 0.0
    score_arr = []
    for _, uij in DataInputTest(test_set, test_batch_size):
        # uij is (u, i, j, hist_i, sl): the user ids, positive candidate item ids,
        # negative candidate item ids, history matrix, and history lengths of this batch.
        # model.eval returns (u_auc, score_p_and_n), where
        # u_auc = tf.reduce_mean(tf.to_float(i_b - j_b + d_layer_3_i - d_layer_3_j > 0)) and
        # score_p_and_n = tf.concat([tf.reshape(tf.sigmoid(i_b + d_layer_3_i), [-1, 1]),
        #                            tf.reshape(tf.sigmoid(j_b + d_layer_3_j), [-1, 1])], axis=-1)
        auc_, score_ = model.eval(sess, uij)
        # e.g. u_auc 0.5546875
        score_arr += _auc_arr(score_)  # append to the running list
        auc_sum += auc_ * len(uij[0])  # weight each batch AUC by its user count
    test_gauc = auc_sum / len(test_set)
    Auc = calc_auc(score_arr)
    global best_auc
    if best_auc < test_gauc:
        best_auc = test_gauc
        model.save(sess, 'save_path/ckpt')
    return test_gauc, Auc
def _test(sess, model):
    auc_sum = 0.0
    score_arr = []
    predicted_users_num = 0
    print("test sub items")
    for _, uij in DataInputTest(test_set, predict_batch_size):
        if predicted_users_num >= predict_users_num:
            break
        score_ = model.test(sess, uij)
        score_arr.append(score_)
        predicted_users_num += predict_batch_size
    return score_[0]
# tf.GPUOptions(allow_growth=True) lets TensorFlow grow GPU memory on demand instead of grabbing it all upfront
gpu_options = tf.GPUOptions(allow_growth=True)
with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
    model = Model(user_count, item_count, cate_count, cate_list, predict_batch_size, predict_ads_num)
    # variables created by tf.Variable start uninitialized; they only hold values after the initializers run
    sess.run(tf.global_variables_initializer())
    sess.run(tf.local_variables_initializer())
    print('test_gauc: %.4f\t test_auc: %.4f' % _eval(sess, model))
    sys.stdout.flush()
    lr = 1.0
    start_time = time.time()
    for _ in range(50):
        random.shuffle(train_set)
        epoch_size = round(len(train_set) / train_batch_size)
        loss_sum = 0.0
        for _, uij in DataInput(train_set, train_batch_size):
            # uij is (u, i, y, hist_i, sl): user ids, candidate item ids, and labels
            # (1 positive, 0 negative), plus the history matrix and history lengths
            loss = model.train(sess, uij, lr)
            loss_sum += loss
            # tf.Tensor.eval is shorthand for tf.Session.run
            # evaluate every 1000 steps
            if model.global_step.eval() % 1000 == 0:
                test_gauc, Auc = _eval(sess, model)
                print('Epoch %d Global_step %d\tTrain_loss: %.4f\teval_GAUC: %.4f\teval_AUC: %.4f' %
                      (model.global_epoch_step.eval(), model.global_step.eval(),
                       loss_sum / 1000, test_gauc, Auc))
                sys.stdout.flush()
                loss_sum = 0.0
            # at step 336000 the learning rate drops from 1.0 to 0.1
            if model.global_step.eval() % 336000 == 0:
                lr = 0.1
        print('Epoch %d DONE\tCost time: %.2f' %
              (model.global_epoch_step.eval(), time.time() - start_time))
        sys.stdout.flush()
        model.global_epoch_step_op.eval()
    print('best test_gauc:', best_auc)
    sys.stdout.flush()
Finally, here is a walkthrough of the model code, model.py in the din folder:
import tensorflow as tf
# from Dice import dice

class Model(object):
    def __init__(self, user_count, item_count, cate_count, cate_list, predict_batch_size, predict_ads_num):
        self.u = tf.placeholder(tf.int32, [None, ])  # shape: [B], user id (B: batch size)
        self.i = tf.placeholder(tf.int32, [None, ])  # shape: [B], i: positive-sample item
        self.j = tf.placeholder(tf.int32, [None, ])  # shape: [B], j: negative-sample item
        self.y = tf.placeholder(tf.float32, [None, ])  # shape: [B], y: label
        self.hist_i = tf.placeholder(tf.int32, [None, None])
        # shape: [B, T], the item sequence of the User Behavior feature; T is the (padded) sequence
        # length; one row per user in the batch, holding the ids of the items clicked in the past
        self.sl = tf.placeholder(tf.int32, [None, ])
        # shape: [B]; sl: sequence length, the true length of each User Behavior row in hist_i
        self.lr = tf.placeholder(tf.float32, [])  # learning rate

        hidden_units = 128
        user_emb_w = tf.get_variable("user_emb_w", [user_count, hidden_units])
        # shape: [U, H], embedding weight for user_id; U is the user_id vocabulary size
        item_emb_w = tf.get_variable("item_emb_w", [item_count, hidden_units // 2])
        # shape: [I, H//2], embedding weight for item_id; I is the item_id vocabulary size
        item_b = tf.get_variable("item_b", [item_count],
                                 initializer=tf.constant_initializer(0.0))  # shape: [I], per-item bias
        cate_emb_w = tf.get_variable("cate_emb_w", [cate_count, hidden_units // 2])
        # shape: [C, H//2], embedding weight for cate_id
        cate_list = tf.convert_to_tensor(cate_list, dtype=tf.int64)  # shape: [I], category id of each item
        # ---- embedding layers: start ----
        # Embedding 1: the candidate ad
        ic = tf.gather(cate_list, self.i)  # category ids of the positive samples, [B]
        # embed the positive item id and its category id, then concatenate into i_emb
        i_emb = tf.concat(values=[
            tf.nn.embedding_lookup(item_emb_w, self.i),  # [B, hidden_units//2]
            tf.nn.embedding_lookup(cate_emb_w, ic),  # [B, hidden_units//2] = [B, H//2]
        ], axis=1)  # embedding of the positive sample (item + cate), [B, H]
        i_b = tf.gather(item_b, self.i)  # bias b

        jc = tf.gather(cate_list, self.j)  # category ids of the negative samples
        j_emb = tf.concat([
            tf.nn.embedding_lookup(item_emb_w, self.j),
            tf.nn.embedding_lookup(cate_emb_w, jc),
        ], axis=1)  # embedding of the negative sample (item + cate)
        j_b = tf.gather(item_b, self.j)

        # Embedding 2: User Behaviors
        hc = tf.gather(cate_list, self.hist_i)  # category sequence of the User Behavior sequence, [B, T]
        h_emb = tf.concat([  # concatenate along the last of the three dims [0, 1, 2]
            tf.nn.embedding_lookup(item_emb_w, self.hist_i),  # [B, T, hidden_units//2]
            tf.nn.embedding_lookup(cate_emb_w, hc),  # [B, T, hidden_units//2]
        ], axis=2)  # embedding of the User Behavior sequence (item + cate sequences), [B, T, H]
        # ---- embedding layers: end ----
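The gather-then-lookup-then-concat pattern is easy to check in miniature; a numpy sketch with toy sizes (I = 3 items, C = 2 categories, H = 4, all values made up):

import numpy as np
item_emb_w = np.arange(6).reshape(3, 2)   # [I, H//2]
cate_emb_w = np.arange(4).reshape(2, 2)   # [C, H//2]
cate_list = np.array([0, 1, 1])           # category id of items 0, 1, 2
i = np.array([2, 0])                      # a batch of item ids, B = 2
ic = cate_list[i]                          # tf.gather(cate_list, i) -> [1, 0]
i_emb = np.concatenate([item_emb_w[i], cate_emb_w[ic]], axis=1)
print(i_emb)  # [[4 5 2 3], [0 1 0 1]] -- item half + category half, shape [B, H]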
# ----------嵌入层end------------
# ----attention操作start---------
hist_i = attention(i_emb, h_emb, self.sl) # attention操作 [B,1,H]
hist_i = tf.layers.batch_normalization(inputs=hist_i)
hist_i = tf.reshape(hist_i, [-1, hidden_units], name='hist_bn')
# 添加一层全连接层,hist为输入,hidden_units为输出维数
hist_i = tf.layers.dense(hist_i, hidden_units, name='hist_fcn')
u_emb_i = hist_i#[None, 128]
hist_j = attention(j_emb, h_emb, self.sl)
# hist_j = tf.layers.batch_normalization(inputs = hist_j)
hist_j = tf.layers.batch_normalization(inputs=hist_j, reuse=True)
hist_j = tf.reshape(hist_j, [-1, hidden_units], name='hist_bn')
hist_j = tf.layers.dense(hist_j, hidden_units, name='hist_fcn', reuse=True)
u_emb_j = hist_j#[None, 128]
# print("u_emb_i.get_shape().as_list()",u_emb_i.get_shape().as_list())
# print("u_emb_j.get_shape().as_list()",u_emb_j.get_shape().as_list())
# print("i_emb.get_shape().as_list()",i_emb.get_shape().as_list())
# print("j_emb.get_shape().as_list()",j_emb.get_shape().as_list())
# -- attention end ---
        # The two fully connected stacks below compute the scores; i is the positive sample, j the negative.
        # -- fcn begin --
        din_i = tf.concat([u_emb_i, i_emb, u_emb_i * i_emb], axis=-1)
        din_i = tf.layers.batch_normalization(inputs=din_i, name='b1')
        d_layer_1_i = tf.layers.dense(din_i, 80, activation=tf.nn.sigmoid, name='f1')  # fully connected, [B, 80]
        # If you want to try Dice, change sigmoid to None and add a dice layer as in the two
        # commented lines below; see also model_dice.py in this folder.
        # d_layer_1_i = tf.layers.dense(din_i, 80, activation=None, name='f1')
        # d_layer_1_i = dice(d_layer_1_i, name='dice_1_i')
        d_layer_2_i = tf.layers.dense(d_layer_1_i, 40, activation=tf.nn.sigmoid, name='f2')
        # d_layer_2_i = tf.layers.dense(d_layer_1_i, 40, activation=None, name='f2')
        # d_layer_2_i = dice(d_layer_2_i, name='dice_2_i')
        d_layer_3_i = tf.layers.dense(d_layer_2_i, 1, activation=None, name='f3')

        din_j = tf.concat([u_emb_j, j_emb, u_emb_j * j_emb], axis=-1)
        din_j = tf.layers.batch_normalization(inputs=din_j, name='b1', reuse=True)
        d_layer_1_j = tf.layers.dense(din_j, 80, activation=tf.nn.sigmoid, name='f1', reuse=True)
        # d_layer_1_j = tf.layers.dense(din_j, 80, activation=None, name='f1', reuse=True)
        # d_layer_1_j = dice(d_layer_1_j, name='dice_1_j')
        d_layer_2_j = tf.layers.dense(d_layer_1_j, 40, activation=tf.nn.sigmoid, name='f2', reuse=True)
        # d_layer_2_j = tf.layers.dense(d_layer_1_j, 40, activation=None, name='f2', reuse=True)
        # d_layer_2_j = dice(d_layer_2_j, name='dice_2_j')
        d_layer_3_j = tf.layers.dense(d_layer_2_j, 1, activation=None, name='f3', reuse=True)
        d_layer_3_i = tf.reshape(d_layer_3_i, [-1])  # flatten to a 1-D vector
        d_layer_3_j = tf.reshape(d_layer_3_j, [-1])
        # ------------------------------------------------------------------
        # predicted (y_pos - y_neg)
        x = i_b - j_b + d_layer_3_i - d_layer_3_j  # [B]
        # predicted y_pos
        self.logits = i_b + d_layer_3_i
        # ------------------------------------------------------------------
        # prediction over a selected subset of items; the logits of the selected items:
        item_emb_all = tf.concat([
            item_emb_w,
            tf.nn.embedding_lookup(cate_emb_w, cate_list)
        ], axis=1)
        item_emb_sub = item_emb_all[:predict_ads_num, :]  # [N, H]
        item_emb_sub = tf.expand_dims(item_emb_sub, 0)  # [1, N, H]
        item_emb_sub = tf.tile(item_emb_sub, [predict_batch_size, 1, 1])  # tile to [B, N, H]
        hist_sub = attention_multi_items(item_emb_sub, h_emb, self.sl)  # [B, N, H]
        hist_sub = tf.layers.batch_normalization(inputs=hist_sub, name='hist_bn', reuse=tf.AUTO_REUSE)
        # print(hist_sub.get_shape().as_list())
        hist_sub = tf.reshape(hist_sub, [-1, hidden_units])
        hist_sub = tf.layers.dense(hist_sub, hidden_units, name='hist_fcn', reuse=tf.AUTO_REUSE)
        u_emb_sub = hist_sub
        item_emb_sub = tf.reshape(item_emb_sub, [-1, hidden_units])
        din_sub = tf.concat([u_emb_sub, item_emb_sub, u_emb_sub * item_emb_sub], axis=-1)
        din_sub = tf.layers.batch_normalization(inputs=din_sub, name='b1', reuse=True)
        d_layer_1_sub = tf.layers.dense(din_sub, 80, activation=tf.nn.sigmoid, name='f1', reuse=True)
        # d_layer_1_sub = dice(d_layer_1_sub, name='dice_1_sub')
        d_layer_2_sub = tf.layers.dense(d_layer_1_sub, 40, activation=tf.nn.sigmoid, name='f2', reuse=True)
        # d_layer_2_sub = dice(d_layer_2_sub, name='dice_2_sub')
        d_layer_3_sub = tf.layers.dense(d_layer_2_sub, 1, activation=None, name='f3', reuse=True)
        d_layer_3_sub = tf.reshape(d_layer_3_sub, [-1, predict_ads_num])
        self.logits_sub = tf.sigmoid(item_b[:predict_ads_num] + d_layer_3_sub)
        self.logits_sub = tf.reshape(self.logits_sub, [-1, predict_ads_num, 1])
        # -- fcn end --
        # tf.reduce_mean averages a tensor along the given axis (or over all elements), reducing dimensionality
        self.mf_auc = tf.reduce_mean(tf.to_float(x > 0))
        self.score_i = tf.sigmoid(i_b + d_layer_3_i)
        self.score_j = tf.sigmoid(j_b + d_layer_3_j)
        self.score_i = tf.reshape(self.score_i, [-1, 1])
        self.score_j = tf.reshape(self.score_j, [-1, 1])
        self.p_and_n = tf.concat([self.score_i, self.score_j], axis=-1)
        print("self.p_and_n.get_shape().as_list()", self.p_and_n.get_shape().as_list())

        # Step variables
        self.global_step = tf.Variable(0, trainable=False, name='global_step')
        self.global_epoch_step = \
            tf.Variable(0, trainable=False, name='global_epoch_step')
        self.global_epoch_step_op = \
            tf.assign(self.global_epoch_step, self.global_epoch_step + 1)  # add 1 to global_epoch_step and assign it back

        # loss and train op
        self.loss = tf.reduce_mean(
            tf.nn.sigmoid_cross_entropy_with_logits(
                logits=self.logits,
                labels=self.y)
        )
        trainable_params = tf.trainable_variables()
        self.opt = tf.train.GradientDescentOptimizer(learning_rate=self.lr)  # plain SGD; gradients are computed manually below
        gradients = tf.gradients(self.loss, trainable_params)  # gradients of the loss w.r.t. trainable_params
        clip_gradients, _ = tf.clip_by_global_norm(gradients, 5)  # keep updates in a reasonable range to prevent loss divergence
        self.train_op = self.opt.apply_gradients(  # apply the clipped gradients, incrementing global_step
            zip(clip_gradients, trainable_params), global_step=self.global_step)
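tf.clip_by_global_norm rescales all gradients jointly when their combined norm exceeds the threshold. A numpy sketch of the semantics (toy values, not the TF implementation):

import numpy as np
grads = [np.array([3.0, 4.0]), np.array([12.0])]      # global norm = sqrt(9 + 16 + 144) = 13
clip_norm = 5.0
global_norm = np.sqrt(sum((g ** 2).sum() for g in grads))
scale = clip_norm / max(global_norm, clip_norm)        # only ever shrinks, never grows
clipped = [g * scale for g in grads]
print(global_norm, [c.tolist() for c in clipped])
# 13.0 [[1.1538..., 1.5384...], [4.6153...]] -- direction preserved, norm capped at 5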
    def train(self, sess, uij, l):
        # uij is (u, i, y, hist_i, sl)
        loss, _ = sess.run([self.loss, self.train_op], feed_dict={
            self.u: uij[0],  # user ids of this batch
            self.i: uij[1],  # candidate item ids
            self.y: uij[2],  # candidate labels, 1 positive, 0 negative
            self.hist_i: uij[3],  # user-history matrix
            self.sl: uij[4],
            self.lr: l,
        })
        return loss

    def eval(self, sess, uij):
        # uij is (u, i, j, hist_i, sl)
        u_auc, score_p_and_n = sess.run([self.mf_auc, self.p_and_n], feed_dict={
            self.u: uij[0],
            self.i: uij[1],  # positive samples
            self.j: uij[2],  # negative samples
            self.hist_i: uij[3],  # one row per user in the batch: the ids of historically clicked items
            self.sl: uij[4],  # the true history length of each row in hist_i
        })
        return u_auc, score_p_and_n

    def test(self, sess, uij):
        return sess.run(self.logits_sub, feed_dict={
            self.u: uij[0],
            self.i: uij[1],
            self.j: uij[2],
            self.hist_i: uij[3],
            self.sl: uij[4],
        })

    def save(self, sess, path):
        saver = tf.train.Saver()
        saver.save(sess, save_path=path)

    def restore(self, sess, path):
        saver = tf.train.Saver()
        saver.restore(sess, save_path=path)
def extract_axis_1(data, ind):
    batch_range = tf.range(tf.shape(data)[0])
    indices = tf.stack([batch_range, ind], axis=1)
    res = tf.gather_nd(data, indices)
    return res
def attention(queries, keys, keys_length):
    '''
    queries: shape [B, H], i.e. i_emb
    keys: shape [B, T, H], i.e. h_emb
    keys_length: shape [B], i.e. self.sl
    B: batch size; T: length of the user sequence; H: embedding size
    '''
    # Tile queries from [B, H] to [B, T, H] so that queries and keys have the same shape,
    # then feed the candidate, the history items, and their element-wise differences and
    # products through fully connected layers to learn the weight of each history item
    # against the candidate, i.e. g(Vi, Va) in the paper.
    # tf.tile() copies a tensor's data along the given axes; the rank stays the same.
    queries_hidden_units = queries.get_shape().as_list()[-1]  # H
    queries = tf.tile(queries, [1, tf.shape(keys)[1]])  # [B, H] -> [B, T*H]; tf.shape(keys)[1] == T
    queries = tf.reshape(queries, [-1, tf.shape(keys)[1], queries_hidden_units])  # [B, T*H] -> [B, T, H]
    # concatenate
    din_all = tf.concat([queries, keys, queries - keys, queries * keys], axis=-1)  # attention input, [B, T, 4*H]
    # three fully connected layers (d_layer_3_all is the learned attention weight)
    d_layer_1_all = tf.layers.dense(din_all, 80, activation=tf.nn.sigmoid, name='f1_att',
                                    reuse=tf.AUTO_REUSE)  # [B, T, 80]
    d_layer_2_all = tf.layers.dense(d_layer_1_all, 40, activation=tf.nn.sigmoid, name='f2_att',
                                    reuse=tf.AUTO_REUSE)  # [B, T, 40]
    d_layer_3_all = tf.layers.dense(d_layer_2_all, 1, activation=None, name='f3_att', reuse=tf.AUTO_REUSE)  # [B, T, 1]
    d_layer_3_all = tf.reshape(d_layer_3_all, [-1, 1, tf.shape(keys)[1]])  # [B, 1, T]
    # keep outputs shape-aligned with keys for the matmul below
    outputs = d_layer_3_all  # attention logits, [B, 1, T]
    # Mask
    # tf.shape(keys)[1] is the padded (maximum) history length; keys_length holds the true lengths.
    # e.g. tf.sequence_mask(3, 5) gives [True, True, True, False, False]; the boolean mask marks
    # which positions of each padded sequence are real, so padded slots can be neutralized below.
    key_masks = tf.sequence_mask(keys_length, tf.shape(keys)[1])  # [B, T]
    key_masks = tf.expand_dims(key_masks, 1)  # add a second dim: [B, 1, T]
    # tf.ones_like builds a tensor with the same shape and dtype as outputs; the fill value is a
    # very large negative number (not 0), so masked slots come out of the softmax as ~0
    paddings = tf.ones_like(outputs) * (-2 ** 32 + 1)
    # tf.where(condition, x, y) picks elements from x where condition is True and from y where it
    # is False; condition, x, y, and the result all share the same shape.
    outputs = tf.where(key_masks, outputs, paddings)  # [B, 1, T]; padded positions become (-2**32 + 1)
    # Scale by sqrt(H), as in scaled dot-product attention
    outputs = outputs / (keys.get_shape().as_list()[-1] ** 0.5)  # keys.get_shape().as_list()[-1] == H
    # Activation
    outputs = tf.nn.softmax(outputs)  # [B, 1, T]; these are the attention weights, the w in Eq. (3) of the paper
    # Weighted sum: this computes sum over g(Vi, Va) * Vi, where keys holds the Vi
    outputs = tf.matmul(outputs, keys)  # [B, 1, H]; batched matmul over the last two dims: B * ((1 x T) @ (T x H))
    return outputs
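The mask-then-softmax trick is easier to see outside the graph; a minimal numpy sketch with toy numbers (T = 4, true length 2):

import numpy as np
scores = np.array([2.0, 1.0, 0.5, 0.0])          # [T] attention logits for one user
mask = np.arange(4) < 2                           # [True, True, False, False]
scores = np.where(mask, scores, -2 ** 32 + 1.0)   # padded slots get a huge negative
weights = np.exp(scores) / np.exp(scores).sum()   # softmax
print(weights)  # ~[0.731, 0.269, 0.0, 0.0] -- padding contributes nothing to the weighted sum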
def attention_multi_items(queries, keys, keys_length):
    '''
    queries: [B, N, H], N = number of ads
    keys: [B, T, H]
    keys_length: [B]
    '''
    queries_hidden_units = queries.get_shape().as_list()[-1]  # H
    queries_nums = queries.get_shape().as_list()[1]  # N
    queries = tf.tile(queries, [1, 1, tf.shape(keys)[1]])  # [B, N, T*H]
    queries = tf.reshape(queries, [-1, queries_nums, tf.shape(keys)[1], queries_hidden_units])  # [B, N, T, H]
    max_len = tf.shape(keys)[1]
    keys = tf.tile(keys, [1, queries_nums, 1])
    keys = tf.reshape(keys, [-1, queries_nums, max_len, queries_hidden_units])  # [B, N, T, H]
    din_all = tf.concat([queries, keys, queries - keys, queries * keys], axis=-1)
    d_layer_1_all = tf.layers.dense(din_all, 80, activation=tf.nn.sigmoid, name='f1_att', reuse=tf.AUTO_REUSE)
    d_layer_2_all = tf.layers.dense(d_layer_1_all, 40, activation=tf.nn.sigmoid, name='f2_att', reuse=tf.AUTO_REUSE)
    d_layer_3_all = tf.layers.dense(d_layer_2_all, 1, activation=None, name='f3_att', reuse=tf.AUTO_REUSE)
    d_layer_3_all = tf.reshape(d_layer_3_all, [-1, queries_nums, 1, max_len])
    outputs = d_layer_3_all
    # Mask
    key_masks = tf.sequence_mask(keys_length, max_len)  # [B, T]
    key_masks = tf.tile(key_masks, [1, queries_nums])
    key_masks = tf.reshape(key_masks, [-1, queries_nums, 1, max_len])  # [B, N, 1, T]
    paddings = tf.ones_like(outputs) * (-2 ** 32 + 1)
    outputs = tf.where(key_masks, outputs, paddings)  # [B, N, 1, T]
    # Scale
    outputs = outputs / (keys.get_shape().as_list()[-1] ** 0.5)
    # Activation
    outputs = tf.nn.softmax(outputs)  # [B, N, 1, T]
    outputs = tf.reshape(outputs, [-1, 1, max_len])
    keys = tf.reshape(keys, [-1, max_len, queries_hidden_units])
    # print(outputs.get_shape().as_list())
    # print(keys.get_shape().as_list())
    # Weighted sum
    outputs = tf.matmul(outputs, keys)
    outputs = tf.reshape(outputs, [-1, queries_nums, queries_hidden_units])  # [B, N, H]
    print("outputs.get_shape().as_list()", outputs.get_shape().as_list())  # [None, 100, 128]
    return outputs
Last, input.py in the din folder. When train.py runs, it first calls input.py to build the batched inputs, then calls model.py for training:
import numpy as np

class DataInput:
    def __init__(self, data, batch_size):
        # e.g. data = train_set, batch_size = train_batch_size = 32
        self.batch_size = batch_size
        self.data = data
        self.epoch_size = len(self.data) // self.batch_size
        if self.epoch_size * self.batch_size < len(self.data):
            self.epoch_size += 1
        self.i = 0

    def __iter__(self):
        return self

    def __next__(self):
        if self.i == self.epoch_size:
            raise StopIteration
        ts = self.data[self.i * self.batch_size: min((self.i + 1) * self.batch_size,
                                                     len(self.data))]
        self.i += 1
        # For a user who clicked the four items a, b, c, d, the generated samples are
        # (X is a randomly drawn item id): (user_id, [a], b, 1), (user_id, [a], X, 0),
        # (user_id, [a, b], c, 1), (user_id, [a, b], X, 0), (user_id, [a, b, c], d, 1),
        # (user_id, [a, b, c], X, 0)
        u, i, y, sl = [], [], [], []
        for t in ts:
            u.append(t[0])  # user id
            i.append(t[2])  # candidate item id (positive or negative)
            y.append(t[3])  # candidate label, 1 positive, 0 negative
            sl.append(len(t[1]))  # length of the user's history
        # copy the histories into a matrix, zero-padded to the longest history in the batch
        max_sl = max(sl)
        hist_i = np.zeros([len(ts), max_sl], np.int64)
        k = 0
        for t in ts:
            for l in range(len(t[1])):
                hist_i[k][l] = t[1][l]
            k += 1
        # u, i, y: the user ids, candidate item ids, and labels (1 positive, 0 negative) of this batch
        # sl: the history length of each user
        # hist_i: the history matrix, one row per user
        # self.i: the batch counter, starting at 1 and stopping once it equals self.epoch_size
        return self.i, (u, i, y, hist_i, sl)
class DataInputTest:
    def __init__(self, data, batch_size):
        self.batch_size = batch_size
        self.data = data
        self.epoch_size = len(self.data) // self.batch_size
        if self.epoch_size * self.batch_size < len(self.data):
            self.epoch_size += 1
        self.i = 0

    def __iter__(self):
        return self

    def __next__(self):
        if self.i == self.epoch_size:
            raise StopIteration
        ts = self.data[self.i * self.batch_size: min((self.i + 1) * self.batch_size,
                                                     len(self.data))]
        self.i += 1
        u, i, j, sl = [], [], [], []
        # test_set entries look like (reviewerID, (a, b, c), (d, X))
        for t in ts:
            u.append(t[0])  # user id
            i.append(t[2][0])  # positive candidate item id
            j.append(t[2][1])  # negative candidate item id
            sl.append(len(t[1]))  # length of the user's history, i.e. of the list of previously clicked item ids
        max_sl = max(sl)  # the longest history in this batch
        hist_i = np.zeros([len(ts), max_sl], np.int64)  # zero matrix: one row per sample, max_sl columns
        # copy the user histories into the zero matrix
        k = 0
        for t in ts:
            for l in range(len(t[1])):
                hist_i[k][l] = t[1][l]
            k += 1
        # u, i, j: the user ids, positive candidate item ids, and negative candidate item ids of this batch
        # sl: the history length of each user
        # hist_i: the history matrix, one row per user
        # self.i: the batch counter, starting at 1 and stopping once it equals self.epoch_size
        return self.i, (u, i, j, hist_i, sl)
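A toy run of DataInput on hand-written samples in the train_set format (hypothetical ids; note how the second batch pads nothing because it holds a single user):

toy_train = [
    (0, [10], 11, 1),      # (uid, hist, item, label)
    (0, [10], 91, 0),
    (1, [12, 13], 14, 1),
]
for step, (u, i, y, hist_i, sl) in DataInput(toy_train, batch_size=2):
    print(step, u, i, y, sl)
    print(hist_i)
# step 1: u=[0, 0], i=[11, 91], y=[1, 0], sl=[1, 1], hist_i=[[10], [10]]
# step 2: u=[1],    i=[14],     y=[1],    sl=[2],    hist_i=[[12, 13]]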



