from tqdm import tqdm
import numpy as np
import random
from tensorflow.python.keras.preprocessing.sequence import pad_sequences
def gen_data_set(data, negsample=0):
    """Build shuffled train/test sample lists from a user-item interaction log.

    Interactions are ordered by ``timestamp`` and grouped by ``user_id``.
    Every interaction of a user except the first becomes a positive sample
    whose history is the list of item ids the user interacted with *before*
    it, most recent first.  The user's last interaction goes to the test
    set; the earlier ones go to the training set, each followed by
    ``negsample`` randomly drawn negative samples.

    Args:
        data: pandas DataFrame with ``user_id``, ``item_id``, ``rating`` and
            ``timestamp`` columns.  NOTE: it is sorted by ``timestamp`` in
            place, so the caller's frame is reordered.
        negsample: number of negative samples generated per positive
            training sample; ``0`` disables negative sampling.

    Returns:
        ``(train_set, test_set)``, both shuffled lists of tuples.  Positive
        samples are ``(user_id, history, item_id, 1, len(history), rating)``.
        Negative samples (training set only) are
        ``(user_id, history, item_id, 0, len(history))`` and carry no rating.

    Raises:
        ValueError: if ``negsample > 0`` and a user with at least two
            interactions has already interacted with every item, so there
            is nothing to sample negatives from.
    """
    # Order interactions chronologically (mutates the caller's frame).
    data.sort_values("timestamp", inplace=True)
    # The pool of all distinct items is the same for every user: build once.
    all_items = set(data['item_id'].unique())

    train_set = []
    test_set = []
    for user_id, user_rows in tqdm(data.groupby('user_id')):
        # Items the user interacted with, in chronological order.
        pos_list = user_rows['item_id'].tolist()
        rating_list = user_rows['rating'].tolist()
        if len(pos_list) < 2:
            # A sample needs at least one earlier interaction as history.
            continue

        if negsample > 0:
            # Negatives are drawn only from items the user has not seen.
            candidate_set = list(all_items - set(pos_list))
            if not candidate_set:
                raise ValueError(
                    "cannot draw negative samples for user %r: "
                    "the user has interacted with every item" % (user_id,)
                )
            neg_list = np.random.choice(
                candidate_set, size=len(pos_list) * negsample, replace=True
            )

        last_index = len(pos_list) - 1
        for i in range(1, len(pos_list)):
            # History = items seen strictly before step i, most recent first.
            # (Previously the whole per-user DataFrame was stored here.)
            history = pos_list[:i][::-1]
            positive = (user_id, history, pos_list[i], 1, len(history), rating_list[i])
            if i != last_index:
                train_set.append(positive)
                for negi in range(negsample):
                    train_set.append(
                        (user_id, history, neg_list[i * negsample + negi], 0, len(history))
                    )
            else:
                # The user's most recent interaction is held out for testing.
                test_set.append(positive)

    # Shuffle so samples of the same user are not adjacent.
    random.shuffle(train_set)
    random.shuffle(test_set)
    return train_set, test_set
def gen_model_input(train_set, user_profile, seq_max_len):
    """Turn a list of sample tuples into a model feature dict and label array.

    Args:
        train_set: sequence of sample tuples; positions 0-4 are read as
            user id, history sequence, item id, label and history length.
        user_profile: table indexed with ``.loc`` by user id that provides
            the ``gender``, ``age`` and ``city`` columns (presumably a
            pandas DataFrame indexed by user id; confirm against caller).
        seq_max_len: length every history sequence is padded or truncated to.

    Returns:
        ``(model_input, labels)`` where ``model_input`` maps ``user_id``,
        ``item_id``, ``hist_item_id``, ``hist_len``, ``gender``, ``age`` and
        ``city`` to arrays, and ``labels`` is the array of sample labels.
    """
    def column(position):
        # Collect one tuple position across all samples.
        return [sample[position] for sample in train_set]

    user_ids = np.array(column(0))
    histories = column(1)
    item_ids = np.array(column(2))
    labels = np.array(column(3))
    history_lengths = np.array(column(4))

    # Pad with 0 / truncate at the end so every history has seq_max_len entries.
    padded_histories = pad_sequences(
        histories,
        maxlen=seq_max_len,
        padding='post',
        truncating='post',
        value=0,
    )

    model_input = {
        "user_id": user_ids,
        "item_id": item_ids,
        "hist_item_id": padded_histories,
        "hist_len": history_lengths,
    }
    # Attach the profile features of each sample's user.
    for key in {"gender", "age", "city"}:
        model_input[key] = user_profile.loc[model_input['user_id']][key].values

    return model_input, labels
代码解释:
**gen_data_set()** 的主要作用是接收数据集(data)和负采样倍数(negsample)参数,返回一个训练集列表(train_set)和一个测试集列表(test_set)。具体流程是:先按timestamp列对数据进行排序,并对item_id去重得到全部物品的集合;然后按user_id分组构造正负样本(正样本为购买过的,负样本为没有购买过的)。当negsample大于0时就要进行负采样,也就是从用户没有购买过的商品中随机选择一些作为负样本,然后将它们保存到训练集中;每个用户的最后一次交互作为测试样本,其余交互作为训练样本。最后,将正负样本数据以及其他信息(如历史交互序列、用户 ID 和历史交互序列的长度)保存到训练集列表和测试集列表中,并将两个列表打乱后返回。