特征交叉-CAN学习笔记代码解读

一核心模块coaction

对于每个特征对(feature_pairs)
weight, bias 来自于P_induction
P_fead是MLP的input

举个例子：如果是用户ID和产品ID的co-action，且产品ID是做induction，用户ID是做feed。

step1 用户ID/产品ID都先形成一个向量：对于产品ID，用parameter lookup获取一个可学习的P_induction（这个维度是(wi+bi) * L depth of mlp）; 用户ID则直接形成一个向量P_fead
step2 P_induction 这个向量逐层(MLP层)，reshape成MLP网络的weight 和bias；
step3 weight和bias作为MLP的参数，利用P_feed 作为input，进行MLP前向运算，得到特征交互结果

代码解读

#### CAN config #####
weight_emb_w = [[16, 8], [8,4]] # micro-mlp的参数dimension
weight_emb_b = [0, 0]           # bias参数
orders = 3  # 特征的阶数，文章提到了，要做高阶特征交叉，直接是P_feed^c, c就是阶数
order_indep = False # True
WEIGHT_EMB_DIM = (sum([w[0]*w[1] for w in weight_emb_w]) + sum(weight_emb_b)) # * orders 这个是供每一个micro-mlp拆解w&b需要的dimension总和
INDEP_NUM = 1
if order_indep:
    INDEP_NUM *= orders
###### 这一部分对应图中绿色和橙色部分，主要是把P_feed&P_induction的嵌入表示得到 ##########
if self.use_coaction:
   # batch_ph batch输入的数据；his_batch_ph历史批次数据; his_batch_embedded 历史嵌入表示
   ph_dict = {
       "item": [self.mid_batch_ph, self.mid_his_batch_ph, self.mid_his_batch_embedded],
       "cate": [self.cate_batch_ph, self.cate_his_batch_ph, self.cate_his_batch_embedded]
   }
   ### p_induction ####
   self.mlp_batch_embedded = [] # induction embedding
   with tf.device(device):
       # 定义可训练的嵌入矩阵，在这里n_mid是item id的数量
       self.item_mlp_embeddings_var = tf.get_variable("item_mlp_embedding_var", [n_mid, INDEP_NUM * WEIGHT_EMB_DIM], trainable=True)
       self.cate_mlp_embeddings_var = tf.get_variable("cate_mlp_embedding_var", [n_cate, INDEP_NUM * WEIGHT_EMB_DIM], trainable=True)
       # 通过embedding_lookup在上一步初始化好的矩阵中找到对应的embedding表示
       self.mlp_batch_embedded.append(tf.nn.embedding_lookup(self.item_mlp_embeddings_var, ph_dict['item'][0]))
       self.mlp_batch_embedded.append(tf.nn.embedding_lookup(self.cate_mlp_embeddings_var, ph_dict['cate'][0]))
       #########P_feed input ########
       self.input_batch_embedded = []
       self.item_input_embeddings_var = tf.get_variable("item_input_embedding_var", [n_mid, weight_emb_w[0][0] * INDEP_NUM], trainable=True)
       self.cate_input_embeddings_var = tf.get_variable("cate_input_embedding_var", [n_cate, weight_emb_w[0][0] * INDEP_NUM], trainable=True)  
         self.input_batch_embedded.append(tf.nn.embedding_lookup(self.item_input_embeddings_var, ph_dict['item'][1]))
       self.input_batch_embedded.append(tf.nn.embedding_lookup(self.cate_input_embeddings_var, ph_dict['cate'][1]))
################这一部分是P_induction&P_feed在MLP的使用#######################
if self.use_coaction:
    # p_feed/input
    input_batch = self.input_batch_embedded
    tmp_sum, tmp_seq = [], []
    if INDEP_NUM == 2:
        # 文章说明了是feature pairs，mlp_batch&input_batch都包含了两个部分，要分别组合
        for i, mlp_batch in enumerate(self.mlp_batch_embedded):
            for j, input_batch in enumerate(self.input_batch_embedded):
                coaction_sum, coaction_seq = gen_coaction(
                    mlp_batch[:, WEIGHT_EMB_DIM * j:  WEIGHT_EMB_DIM * (j+1)], 
                    input_batch[:, :, weight_emb_w[0][0] * i: weight_emb_w[0][0] * (i+1)],  
                    EMBEDDING_DIM, 
                    mode=CALC_MODE,
                    mask=self.mask) 
                
                tmp_sum.append(coaction_sum)
                tmp_seq.append(coaction_seq)
    else:
        for i, (mlp_batch, input_batch) in enumerate(zip(self.mlp_batch_embedded, self.input_batch_embedded)):
            coaction_sum, coaction_seq = gen_coaction(
                  mlp_batch[:, :INDEP_NUM * WEIGHT_EMB_DIM], 
                  input_batch[:, :, :weight_emb_w[0][0]],  
                  EMBEDDING_DIM, 
                  mode=CALC_MODE, 
                  mask=self.mask) 
            
            tmp_sum.append(coaction_sum)
            tmp_seq.append(coaction_seq)
            
    self.coaction_sum = tf.concat(tmp_sum, axis=1) # sum pooling
    self.cross.append(self.coaction_sum)   # concat              
###### core interaction 核心运算 #########
def gen_coaction(ad, his_items, dim, mode="can", mask=None):
    """
    ad: induct
    his_items 待交互seq
    """
    weight, bias = [], []
    idx = 0
    weight_orders = []
    bias_orders = []
    # 拆解得到weight&bias参数
    for i in range(orders):
        for w, b in zip(weight_emb_w, weight_emb_b):
            weight.append(tf.reshape(ad[:, idx:idx+w[0]*w[1]], [-1, w[0], w[1]]))
            idx += w[0] * w[1]
            if b == 0:
                bias.append(None)
            else:
                bias.append(tf.reshape(ad[:, idx:idx+b], [-1, 1, b]))
                idx += b
        weight_orders.append(weight)
        bias_orders.append(bias)
        if not order_indep:
            break
 
    if mode == "can":
        out_seq = []
        hh = []
        # 高阶特征处理，explicit deal with
        for i in range(orders):
            hh.append(his_items**(i+1))
        #hh = [sum(hh)]
        for i, h in enumerate(hh):
            if order_indep:
                weight, bias = weight_orders[i], bias_orders[i]
            else:
                weight, bias = weight_orders[0], bias_orders[0]
            # 模拟MLP forward calculation
            for j, (w, b) in enumerate(zip(weight, bias)):
                h  = tf.matmul(h, w)
                if b is not None:
                    h = h + b
                if j != len(weight)-1:
                    h = tf.nn.tanh(h)
                out_seq.append(h)
        out_seq = tf.concat(out_seq, 2)
        if mask is not None:
            mask = tf.expand_dims(mask, axis=-1) 
            out_seq = out_seq * mask
            
    # 序列交互结果做sum_pooling
    out = tf.reduce_sum(out_seq, 1)
    if keep_fake_carte_seq and mode=="emb":
        return out, out_seq
    return out, None

二文章中的应用
整体的模型结构两部分构成：

co-action作为核心形成的一部分，对于用户的序列特征，一一作用后做sum-pooling，对于非序列特征，作用后直接输出
DIEN作为核心形成的一部分

两部分concat以后加一个DNN常规操作，看起来就像是用co-action做显式的特征交叉，然后DIEN做之前的序列建模。
在这里插入图片描述
三一些其他细节补充

can 部分高阶特征处理: 直接把待交叉特征p_fead 做c阶运算后，再与p_induction进行作用
在文章场景，p_induction是target_item，也就是产品

四用tf2/torch重构

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

class CAN_Model(nn.Module):
    def __init__(self, n_uid, n_mid, n_cate, n_carte, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE, use_negsampling=False, use_softmax=True, use_coaction=False, use_cartes=False):
        super(CAN_Model, self).__init__()
        
        self.n_uid = n_uid
        self.n_mid = n_mid
        self.n_cate = n_cate
        self.n_carte = n_carte
        self.EMBEDDING_DIM = EMBEDDING_DIM
        self.HIDDEN_SIZE = HIDDEN_SIZE
        self.ATTENTION_SIZE = ATTENTION_SIZE
        self.use_negsampling = use_negsampling
        self.use_softmax = use_softmax
        self.use_coaction = use_coaction
        self.use_cartes = use_cartes

        self.uid_embeddings = nn.Embedding(n_uid, EMBEDDING_DIM)
        self.mid_embeddings = nn.Embedding(n_mid, EMBEDDING_DIM)
        self.cate_embeddings = nn.Embedding(n_cate, EMBEDDING_DIM)

        if use_cartes:
            self.carte_embeddings = nn.ModuleList([nn.Embedding(num, EMBEDDING_DIM) for num in n_carte])

        if self.use_coaction:
            self.item_mlp_embeddings = nn.Parameter(torch.randn(n_mid, INDEP_NUM * WEIGHT_EMB_DIM))
            self.cate_mlp_embeddings = nn.Parameter(torch.randn(n_cate, INDEP_NUM * WEIGHT_EMB_DIM))
            self.input_batch_embeddings = nn.ModuleList([nn.Embedding(n_mid, weight_emb_w[0][0] * INDEP_NUM), nn.Embedding(n_cate, weight_emb_w[0][0] * INDEP_NUM)])

        self.fc1 = nn.Linear(200, 80)
        self.fc2 = nn.Linear(80, 2 if use_softmax else 1)

    def forward(self, uid, mid, cate, mid_his, cate_his, mask, target, seq_len, lr, carte=None):
        # Embedding lookups
        uid_emb = self.uid_embeddings(uid)
        mid_emb = self.mid_embeddings(mid)
        cate_emb = self.cate_embeddings(cate)
        mid_his_emb = self.mid_embeddings(mid_his)
        cate_his_emb = self.cate_embeddings(cate_his)

        if self.use_cartes:
            carte_emb = [emb(carte[:, i, :]) for i, emb in enumerate(self.carte_embeddings)]

        # Co-action logic (if enabled)
        if self.use_coaction:
            # This is a simplified version of the co-action implementation from the original TensorFlow code
            mlp_embedded_item = self.item_mlp_embeddings[mid]
            mlp_embedded_cate = self.cate_mlp_embeddings[cate]
            input_embedded_item = self.input_batch_embeddings[0](mid_his)
            input_embedded_cate = self.input_batch_embeddings[1](cate_his)
            # Further coaction operations can be added based on your logic

        # Concatenate item and category embeddings
        item_eb = torch.cat([mid_emb, cate_emb], dim=1)
        item_his_eb = torch.cat([mid_his_emb, cate_his_emb], dim=2)
        item_his_eb_sum = item_his_eb.sum(dim=1)

        if self.use_negsampling:
            # Assuming the negative sampling implementation would need its own logic.
            pass

        # FC layers
        x = self.fc1(item_eb)
        x = F.relu(x)
        x = self.fc2(x)

        # Loss computation
        if self.use_softmax:
            y_hat = F.softmax(x, dim=-1)
            loss = F.cross_entropy(y_hat, target)
        else:
            y_hat = torch.sigmoid(x)
            loss = F.binary_cross_entropy_with_logits(x, target)

        return loss, y_hat

    def auxiliary_loss(self, h_states, click_seq, noclick_seq, mask):
        mask = mask.float()
        click_input = torch.cat([h_states, click_seq], dim=-1)
        noclick_input = torch.cat([h_states, noclick_seq], dim=-1)
        click_prop = self.auxiliary_net(click_input)[:, :, 0]
        noclick_prop = self.auxiliary_net(noclick_input)[:, :, 0]
        click_loss = -torch.log(click_prop) * mask
        noclick_loss = -torch.log(1.0 - noclick_prop) * mask
        loss = (click_loss + noclick_loss).mean()
        return loss

    def auxiliary_net(self, in_):
        x = F.relu(self.fc1(in_))
        x = F.relu(self.fc2(x))
        return x

    def train_step(self, data, optimizer):
        optimizer.zero_grad()
        loss, y_hat = self(data)
        loss.backward()
        optimizer.step()
        return loss.item()

    def evaluate(self, data):
        with torch.no_grad():
            loss, y_hat = self(data)
        return loss.item(), y_hat

# Example of using the model
n_uid = 1000
n_mid = 1000
n_cate = 500
n_carte = [10, 20]  # Example carte sizes
EMBEDDING_DIM = 128
HIDDEN_SIZE = 256
ATTENTION_SIZE = 128

model = CAN_Model(n_uid, n_mid, n_cate, n_carte, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE)

# Optimizer
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Example data
uid = torch.randint(0, n_uid, (32,))
mid = torch.randint(0, n_mid, (32,))
cate = torch.randint(0, n_cate, (32,))
mid_his = torch.randint(0, n_mid, (32, 5))
cate_his = torch.randint(0, n_cate, (32, 5))
mask = torch.ones(32, 5)
target = torch.randint(0, 2, (32,))
seq_len = torch.randint(1, 5, (32,))
lr = 0.001

# Training step
loss = model.train_step((uid, mid, cate, mid_his, cate_his, mask, target, seq_len, lr), optimizer)
print(f"Loss: {loss}")