李宏毅-hw5-translation-有关transformer、seq2seq的探索

一、ppt研读：

1.关于这个 input Embedding 的内容:

2.关于Positional Encoding：

二、慢慢积累，一点点阅读代码：

虽然这次的模块挺多的，但是，这样也就意味着，把这个内化为自己的，就可以获得更大的进步了

1.关于使用git命令获取到源代码：

2.Fix Random Seed部分都是相同的，就是为了结果可以复现：

# Seed every random number generator used here so that runs can be reproduced.
seed = 73

# CPU-side generators: Python's `random`, NumPy and PyTorch.
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

# CUDA generators are only seeded when a GPU is actually available.
if torch.cuda.is_available():
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

# cuDNN flags: ask for deterministic behaviour and switch benchmark mode off.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

3.定义一个字符类型转换函数：

def strQ2B(ustring):
    """Convert full-width characters in ``ustring`` to their half-width forms.

    The full-width space (code point 12288) becomes an ASCII space, and code
    points 65281..65374 are shifted down by 65248. Every other character is
    returned unchanged.

    Args:
        ustring: a string, or any iterable of strings; the converted pieces
            are concatenated into one string.

    Returns:
        The converted text as a single ``str``.
    """
    # reference:https://ithelp.ithome.com.tw/articles/10233122

    def _to_half(ch):
        # Map one character to its half-width counterpart (or itself).
        code = ord(ch)
        if code == 12288:  # full-width space
            return chr(32)
        if 65281 <= code <= 65374:  # full-width chars other than space
            return chr(code - 65248)
        return ch

    return ''.join(
        ''.join(_to_half(ch) for ch in piece)
        for piece in ustring
    )

4.字符处理函数：

def clean_s(s, lang):
    """Normalise one sentence of the parallel corpus.

    For ``lang == 'en'``: drop parenthesised text, delete hyphens and put
    spaces around the listed punctuation marks.
    For ``lang == 'zh'``: convert full-width characters with ``strQ2B``, drop
    parenthesised text, delete spaces, em dashes and underscores, turn curly
    double quotes into straight ones and put spaces around the listed
    punctuation marks.
    For any other ``lang`` only the whitespace normalisation is applied.

    Args:
        s: the raw sentence.
        lang: language tag, ``'en'`` or ``'zh'``.

    Returns:
        The cleaned sentence with runs of whitespace collapsed to one space
        and no leading or trailing whitespace.
    """
    parenthesised = r"\([^()]*\)"  # innermost "(...)" group

    if lang == 'en':
        s = re.sub(parenthesised, "", s)
        s = s.replace('-', '')
        s = re.sub('([.,;!?()\"])', r' \1 ', s)  # keep punctuation as separate tokens
    elif lang == 'zh':
        s = strQ2B(s)  # full width -> half width
        s = re.sub(parenthesised, "", s)
        # Applied strictly in this order, one replacement after another.
        for old, new in ((' ', ''), ('—', ''), ('“', '"'), ('”', '"'), ('_', '')):
            s = s.replace(old, new)
        s = re.sub('([。,;!?()\"~「」])', r' \1 ', s)  # keep punctuation as separate tokens

    # split() with no arguments already ignores leading/trailing whitespace.
    return ' '.join(s.split())

5.python中的string.split()分割函数:

6.定义整个文字串的处理函数：

```python
def clean_corpus(prefix, l1, l2, ratio=9, max_len=1000, min_len=1):
    """Clean a parallel corpus and drop sentence pairs with unusable lengths.

    Reads ``{prefix}.{l1}`` and ``{prefix}.{l2}`` line by line, cleans each
    sentence with ``clean_s`` and writes the surviving pairs to
    ``{prefix}.clean.{l1}`` and ``{prefix}.clean.{l2}``. Nothing is done when
    both cleaned files already exist.

    Args:
        prefix: path prefix shared by the input and output files.
        l1: language tag (file suffix) of the first side.
        l2: language tag (file suffix) of the second side.
        ratio: maximum allowed length ratio between the two sides, in either
            direction; values <= 0 disable the check.
        max_len: pairs with a side longer than this are dropped; values <= 0
            disable the check.
        min_len: pairs with a side shorter than this are dropped; values <= 0
            disable the check.

    Returns:
        None.
    """
    # Skip the whole step when the cleaned files are already on disk.
    if Path(f'{prefix}.clean.{l1}').exists() and Path(f'{prefix}.clean.{l2}').exists():
        print(f'{prefix}.clean.{l1} & {l2} exists. skipping clean.')
        return

    with open(f'{prefix}.{l1}', 'r') as l1_in_f, \
            open(f'{prefix}.{l2}', 'r') as l2_in_f, \
            open(f'{prefix}.clean.{l1}', 'w') as l1_out_f, \
            open(f'{prefix}.clean.{l2}', 'w') as l2_out_f:
        # The first file drives the loop; the second is read in lockstep.
        for s1 in l1_in_f:
            s1 = clean_s(s1.strip(), l1)
            s2 = clean_s(l2_in_f.readline().strip(), l2)
            s1_len = len_s(s1, l1)
            s2_len = len_s(s2, l2)

            # Too short on either side.
            if min_len > 0 and (s1_len < min_len or s2_len < min_len):
                continue

            # Too long on either side.
            if max_len > 0 and (s1_len > max_len or s2_len > max_len):
                continue

            # Length ratio out of range. Written as a multiplication so that
            # a zero length (possible when min_len <= 0) cannot raise
            # ZeroDivisionError: a pair with exactly one empty side is
            # dropped, a pair with two empty sides passes this check.
            if ratio > 0 and (s1_len > ratio * s2_len or s2_len > ratio * s1_len):
                continue

            print(s1, file=l1_out_f)
            print(s2, file=l2_out_f)
```

以上是一个用于清洗语料的函数，函数将两个语料文件 `prefix.l1` 和 `prefix.l2` 清洗后保存到 `prefix.clean.l1` 和 `prefix.clean.l2` 文件中。

注释已经逐行添加在代码中。

总的来说，这个函数会过滤掉太长、太短，或者两种语言长度比例相差过大的句对，并对标点等符号做统一处理，使最终保留下来的句子都比较合理。

7.转换为subword units进行处理:


import sentencepiece as spm
vocab_size = 8000

# Train a SentencePiece model on the cleaned train/valid text of both
# languages, unless a model file for this vocabulary size already exists.
if (prefix/f'spm{vocab_size}.model').exists():
    print(f'{prefix}/spm{vocab_size}.model exists. skipping spm_train.')
else:
    spm.SentencePieceTrainer.train(
        input=','.join([f'{prefix}/train.clean.{src_lang}',
                        f'{prefix}/valid.clean.{src_lang}',
                        f'{prefix}/train.clean.{tgt_lang}',
                        f'{prefix}/valid.clean.{tgt_lang}']),
        model_prefix=prefix/f'spm{vocab_size}',
        vocab_size=vocab_size,
        character_coverage=1,
        model_type='unigram', # 'bpe' works as well
        input_sentence_size=1e6,
        shuffle_input_sentence=True,
        normalization_rule_name='nmt_nfkc_cf',
    )

# Load the trained model; it is used below to split every line into subwords.
spm_model = spm.SentencePieceProcessor(model_file=str(prefix/f'spm{vocab_size}.model'))

# Input file stem for each split (the language suffix is appended below).
in_tag = {
    'train': 'train.clean',
    'valid': 'valid.clean',
    'test': 'test.raw.clean',
}

# Encode every split of both languages into space-separated subword tokens.
for split in ['train', 'valid', 'test']:
    for lang in [src_lang, tgt_lang]:
        out_path = prefix/f'{split}.{lang}'
        if out_path.exists():
            print(f"{out_path} exists. skipping spm_encode.")
        else:
            # Open the input before creating the output: if the input is
            # missing, no empty output file is left behind that a later run
            # would mistake for a finished encode and skip.
            with open(prefix/f'{in_tag[split]}.{lang}', 'r') as in_f:
                with open(out_path, 'w') as out_f:
                    for line in in_f:
                        line = line.strip()
                        tok = spm_model.encode(line, out_type=str)
                        print(' '.join(tok), file=out_f)

8.将上述的字符串利用fairseq处理：

# Run fairseq's preprocessing CLI on the train/valid/test files under `prefix`
# and write the result to `binpath`, unless that directory already exists.
# NOTE: the `!python ...` line is IPython/Jupyter shell syntax, so this cell
# only runs inside a notebook; the `{...}` parts are filled in from the Python
# variables of the same name.
# NOTE(review): `--joined-dictionary` presumably makes source and target share
# one vocabulary — confirm against the fairseq-preprocess documentation.
binpath = Path('./DATA/data-bin', dataset_name)
if binpath.exists():
    print(binpath, "exists, will not overwrite!")
else:
    !python -m fairseq_cli.preprocess \
        --source-lang {src_lang}\
        --target-lang {tgt_lang}\
        --trainpref {prefix/'train'}\
        --validpref {prefix/'valid'}\
        --testpref {prefix/'test'}\
        --destdir {binpath}\
        --joined-dictionary\
        --workers 2

9.原来真的需要写log实验日志:

# Send log records to stdout, formatted as "time | level | logger name | message".
logging.basicConfig(
    stream=sys.stdout,
    level="INFO", # "DEBUG" "WARNING" "ERROR"
    datefmt="%Y-%m-%d %H:%M:%S",
    format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
)

# Project name, shared by the logger and the optional wandb run.
proj = "hw5.seq2seq"
logger = logging.getLogger(proj)

# wandb is imported only when experiment tracking is switched on in the config;
# the run is named after the last component of the save directory.
if config.use_wandb:
    import wandb
    wandb.init(
        project=proj,
        name=Path(config.savedir).stem,
        config=config,
    )

10.关于fairseq（facebook AI Research Sequence to Sequence Toolkit）这个库的初探

（1）概念：它基于PyTorch开发，提供了多种自然语言处理任务的模型，包括神经机器翻译、语音识别、文本生成等

（2）功能分块

(3)使用方式：（我感觉就是一种别具一格形式的函数调用）

http://t.csdn.cn/d8dpn

也可以参考这一篇文章

11.torch.bmm(tensor1,tensor2)这种批量的矩阵乘法运算：

12.了解一下，什么叫做RNN(Recurrent Neural Network):

三、下面重点研读 encoder 、 attention 和 decoder部分的代码（目前来说其他部分有些吃力，等我够强大了，再回来报仇！！！虽迟未晚）

1.Encoder部分的代码：

class RNNEncoder(FairseqEncoder):
    """Bidirectional multi-layer GRU encoder built on ``FairseqEncoder``.

    Args:
        args: configuration object; ``encoder_embed_dim``,
            ``encoder_ffn_embed_dim``, ``encoder_layers`` and ``dropout`` are
            read from it.
        dictionary: vocabulary passed to the parent class; its ``pad()`` value
            is stored as the padding index.
        embed_tokens: module called on the token indices to produce embeddings.
    """

    def __init__(self, args, dictionary, embed_tokens):
        super().__init__(dictionary)
        self.embed_tokens = embed_tokens

        # Sizes taken from the configuration.
        self.embed_dim = args.encoder_embed_dim
        self.hidden_dim = args.encoder_ffn_embed_dim
        self.num_layers = args.encoder_layers

        # Dropout is applied to the embeddings, between GRU layers and to the
        # GRU outputs, always with the same probability.
        self.dropout_in_module = nn.Dropout(args.dropout)
        self.rnn = nn.GRU(
            input_size=self.embed_dim,
            hidden_size=self.hidden_dim,
            num_layers=self.num_layers,
            dropout=args.dropout,
            batch_first=False,
            bidirectional=True,
        )
        self.dropout_out_module = nn.Dropout(args.dropout)

        # Index of the padding symbol, used to build the padding mask.
        self.padding_idx = dictionary.pad()

    def combine_bidir(self, outs, bsz: int):
        """Concatenate the two directions of each layer's final hidden state.

        Args:
            outs: final hidden states, num_layers*2 x batch x hidden.
            bsz: batch size.

        Returns:
            Tensor of shape num_layers x batch x 2*hidden.
        """
        per_direction = outs.view(self.num_layers, 2, bsz, -1)
        per_sample = per_direction.transpose(1, 2).contiguous()
        return per_sample.view(self.num_layers, bsz, -1)

    def forward(self, src_tokens, **unused):
        """Encode a batch of source token indices.

        Args:
            src_tokens: 2-D tensor of token indices, batch x seq_len.

        Returns:
            A 3-tuple ``(outputs, final_hiddens, encoder_padding_mask)`` with
            shapes seq_len x batch x 2*hidden, num_layers x batch x 2*hidden
            and seq_len x batch.
        """
        bsz, _ = src_tokens.size()

        # Embed the tokens, then apply dropout.
        embedded = self.dropout_in_module(self.embed_tokens(src_tokens))

        # B x T x C -> T x B x C
        embedded = embedded.transpose(0, 1)

        # Run the bidirectional GRU from an all-zero initial state.
        h0 = embedded.new_zeros(2 * self.num_layers, bsz, self.hidden_dim)
        rnn_out, final_hiddens = self.rnn(embedded, h0)
        # rnn_out       = [sequence len, batch size, hid dim * directions]
        # final_hiddens = [num_layers * directions, batch size, hid dim]
        outputs = self.dropout_out_module(rnn_out)

        # The encoder is bidirectional, so the hidden states of the two
        # directions are concatenated:
        # final_hiddens = [num_layers, batch size, directions * hid dim]
        final_hiddens = self.combine_bidir(final_hiddens, bsz)

        # True at padding positions; transposed to seq_len x batch.
        encoder_padding_mask = src_tokens.eq(self.padding_idx).t()

        return (
            outputs,  # seq_len x batch x hidden
            final_hiddens,  # num_layers x batch x num_directions*hidden
            encoder_padding_mask,  # seq_len x batch
        )

    def reorder_encoder_out(self, encoder_out, new_order):
        """Reorder the batch dimension (dim 1) of every encoder output.

        This is used by fairseq's beam search.
        """
        return tuple(
            encoder_out[i].index_select(1, new_order) for i in range(3)
        )

2.attention函数的设计:

class AttentionLayer(nn.Module):
    """Dot-product attention of decoder inputs over the encoder outputs.

    The inputs are projected to the encoder dimension and scored against every
    source position. The softmax-weighted sum of encoder outputs is then
    concatenated with the original inputs and passed through a linear layer
    and ``tanh``.

    Args:
        input_embed_dim: feature size of ``inputs``.
        source_embed_dim: feature size of ``encoder_outputs``.
        output_embed_dim: feature size of the returned tensor.
        bias: whether the two linear layers use a bias term.
    """

    def __init__(self, input_embed_dim, source_embed_dim, output_embed_dim, bias=False):
        super().__init__()

        # Projects the inputs into the encoder-output space for scoring.
        self.input_proj = nn.Linear(input_embed_dim, source_embed_dim, bias=bias)
        # Mixes [context ; inputs] down to the output size.
        self.output_proj = nn.Linear(
            input_embed_dim + source_embed_dim, output_embed_dim, bias=bias
        )

    def forward(self, inputs, encoder_outputs, encoder_padding_mask):
        """Attend from ``inputs`` over ``encoder_outputs``.

        Args:
            inputs: T x B x input_embed_dim.
            encoder_outputs: S x B x source_embed_dim.
            encoder_padding_mask: boolean S x B tensor, True at padding
                positions, or ``None`` to attend over every position.

        Returns:
            ``(x, attn_scores)``: the attended features, T x B x
            output_embed_dim, and the attention weights, B x T x S.
        """
        # convert all to batch first
        inputs = inputs.transpose(1, 0)  # B, T, dim
        encoder_outputs = encoder_outputs.transpose(1, 0)  # B, S, dim

        # project to the dimensionality of encoder_outputs
        x = self.input_proj(inputs)

        # compute attention
        # (B, T, dim) x (B, dim, S) = (B, T, S)
        attn_scores = torch.bmm(x, encoder_outputs.transpose(1, 2))

        # cancel the attention at positions corresponding to padding
        if encoder_padding_mask is not None:
            # The mask is only touched inside this guard, so None is accepted.
            # S, B -> B, S -> (B, 1, S), broadcast over the T dimension
            encoder_padding_mask = encoder_padding_mask.transpose(1, 0).unsqueeze(1)
            attn_scores = (
                attn_scores.float()
                .masked_fill_(encoder_padding_mask, float("-inf"))
                .type_as(attn_scores)
            )  # FP16 support: cast to float and back

        # softmax on the dimension corresponding to source sequence
        attn_scores = F.softmax(attn_scores, dim=-1)

        # shape (B, T, S) x (B, S, dim) = (B, T, dim) weighted sum
        x = torch.bmm(attn_scores, encoder_outputs)

        # (B, T, dim)
        x = torch.cat((x, inputs), dim=-1)
        x = torch.tanh(self.output_proj(x))  # concat + linear + tanh

        # restore shape (B, T, dim) -> (T, B, dim)
        return x.transpose(1, 0), attn_scores

3.Decoder的结构设计:

class RNNDecoder(FairseqIncrementalDecoder):
    """Unidirectional multi-layer GRU decoder with attention over the encoder.

    Args:
        args: configuration object; ``decoder_embed_dim``,
            ``decoder_ffn_embed_dim``, ``decoder_layers``, ``encoder_layers``,
            ``encoder_ffn_embed_dim``, ``dropout`` and
            ``share_decoder_input_output_embed`` are read from it.
        dictionary: target vocabulary; its length is the output size when the
            output projection is not tied to the embeddings.
        embed_tokens: embedding module for the previous output tokens.

    Raises:
        AssertionError: if the encoder and decoder layer counts differ, or if
            the decoder hidden size is not twice the encoder hidden size.
    """

    def __init__(self, args, dictionary, embed_tokens):
        super().__init__(dictionary)
        self.embed_tokens = embed_tokens

        # The encoder's final hidden state initialises the decoder, so the
        # layer counts must match and the decoder hidden size must equal the
        # concatenation of the encoder's two directions.
        assert args.decoder_layers == args.encoder_layers, f"""seq2seq rnn requires that encoder 
        and decoder have same layers of rnn. got: {args.encoder_layers, args.decoder_layers}"""
        assert args.decoder_ffn_embed_dim == args.encoder_ffn_embed_dim*2, f"""seq2seq-rnn requires 
        that decoder hidden to be 2*encoder hidden dim. got: {args.decoder_ffn_embed_dim, args.encoder_ffn_embed_dim*2}"""

        self.embed_dim = args.decoder_embed_dim
        self.hidden_dim = args.decoder_ffn_embed_dim
        self.num_layers = args.decoder_layers
        # Feature size entering the vocabulary projection. forward() projects
        # the GRU output to embed_dim whenever hidden_dim differs from it, so
        # that size is always embed_dim. This attribute used to be read
        # without ever being assigned.
        self.output_embed_dim = self.embed_dim

        self.dropout_in_module = nn.Dropout(args.dropout)
        self.rnn = nn.GRU(
            self.embed_dim,
            self.hidden_dim,
            self.num_layers,
            dropout=args.dropout,
            batch_first=False,
            bidirectional=False
        )
        # Attention is applied to the embedded tokens before the GRU.
        self.attention = AttentionLayer(
            self.embed_dim, self.hidden_dim, self.embed_dim, bias=False
        )
        # self.attention = None
        self.dropout_out_module = nn.Dropout(args.dropout)

        # An extra linear layer is only needed when the GRU output size
        # differs from the embedding size.
        if self.hidden_dim != self.embed_dim:
            self.project_out_dim = nn.Linear(self.hidden_dim, self.embed_dim)
        else:
            self.project_out_dim = None

        if args.share_decoder_input_output_embed:
            # Tie the output projection to the embedding matrix.
            self.output_projection = nn.Linear(
                self.embed_tokens.weight.shape[1],
                self.embed_tokens.weight.shape[0],
                bias=False,
            )
            self.output_projection.weight = self.embed_tokens.weight
        else:
            self.output_projection = nn.Linear(
                self.output_embed_dim, len(dictionary), bias=False
            )
            nn.init.normal_(
                self.output_projection.weight, mean=0, std=self.output_embed_dim ** -0.5
            )

    def forward(self, prev_output_tokens, encoder_out, incremental_state=None, **unused):
        """Compute vocabulary scores for the next tokens.

        Args:
            prev_output_tokens: previous target tokens, batch x tgt_len.
            encoder_out: the ``(outputs, final_hiddens, padding_mask)`` tuple
                produced by the encoder.
            incremental_state: state carried between incremental decoding
                steps, or ``None``.

        Returns:
            ``(x, None)`` where ``x`` is batch x tgt_len x vocab.
        """
        # extract the outputs from encoder
        encoder_outputs, encoder_hiddens, encoder_padding_mask = encoder_out
        # outputs:          seq_len x batch x num_directions*hidden
        # encoder_hiddens:  num_layers x batch x num_directions*encoder_hidden
        # padding_mask:     seq_len x batch

        if incremental_state is not None and len(incremental_state) > 0:
            # if the information from last timestep is retained, we can continue from there instead of starting from bos
            prev_output_tokens = prev_output_tokens[:, -1:]
            cache_state = self.get_incremental_state(incremental_state, "cached_state")
            prev_hiddens = cache_state["prev_hiddens"]
        else:
            # incremental state does not exist, either this is training time, or the first timestep of test time
            # prepare for seq2seq: pass the encoder_hidden to the decoder hidden states
            prev_hiddens = encoder_hiddens

        # Unpacking also checks that the tokens are a 2-D (batch x time) tensor.
        bsz, seqlen = prev_output_tokens.size()

        # embed tokens
        x = self.embed_tokens(prev_output_tokens)
        x = self.dropout_in_module(x)

        # B x T x C -> T x B x C
        x = x.transpose(0, 1)

        # decoder-to-encoder attention
        if self.attention is not None:
            x, _ = self.attention(x, encoder_outputs, encoder_padding_mask)

        # pass thru unidirectional RNN
        x, final_hiddens = self.rnn(x, prev_hiddens)
        # outputs = [sequence len, batch size, hid dim]
        # hidden =  [num_layers * directions, batch size  , hid dim]
        x = self.dropout_out_module(x)

        # project to embedding size (if hidden differs from embed size, and share_embedding is True,
        # we need to do an extra projection)
        if self.project_out_dim is not None:
            x = self.project_out_dim(x)

        # project to vocab size
        x = self.output_projection(x)

        # T x B x C -> B x T x C
        x = x.transpose(1, 0)

        # if incremental, record the hidden states of current timestep, which will be restored in the next timestep
        cache_state = {
            "prev_hiddens": final_hiddens,
        }
        self.set_incremental_state(incremental_state, "cached_state", cache_state)

        return x, None

    def reorder_incremental_state(
        self,
        incremental_state,
        new_order,
    ):
        """Reorder the cached hidden states along the batch dimension.

        This is used by fairseq's beam search.
        """
        cache_state = self.get_incremental_state(incremental_state, "cached_state")
        # prev_hiddens is num_layers x batch x hidden, so selecting on dim 1
        # reorders every layer at once (same result as reordering each layer
        # on dim 0 and stacking them again).
        cache_state = {
            "prev_hiddens": cache_state["prev_hiddens"].index_select(1, new_order),
        }
        self.set_incremental_state(incremental_state, "cached_state", cache_state)
        return

目前实力还不足以让我完成hw5,君子报仇，十年不晚