当前位置：首页 > article >正文

【NLP6-使用seq2seq模型架构实现英译法任务】

article 2025/4/2 18:08:27

使用seq2seq模型架构实现英译法任务

目标

1、更深一步了解seq2seq模型架构和翻译数据集

2、掌握使用基于GRU的seq2seq模型实现翻译过程

3、掌握Attention机制在解码器端的实现过程

seq2seq模型架构

encoder编码器 Decoder解码器

数据预览

基于GRU的seq2seq模型架构实现翻译的过程

1、导入必备工具包

from io import open #从io工具包导入open方法
import unicodedata   #用于字符规范化
import re   #用于正则表达式
import random
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import optim   #torch中预定义的优化方法工具包
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

2、对持久化文件中数据进行处理，以满足模型训练要求

2.1、将指定语言中的词汇映射成数值

# 2、对持久化文件中数据进行处理，以满足模型训练要求
# Index reserved for the start-of-sentence marker
SOS_token =0
# Index reserved for the end-of-sentence marker
EOS_token =1

class Lang:
    """Vocabulary of one language: two-way lookup between words and indices."""

    def __init__(self, name):
        """Create an empty vocabulary.

        name: identifier of the language this vocabulary belongs to.
        """
        self.name = name
        # word -> index table, filled by addWord
        self.word2index = {}
        # index -> word table; 0 and 1 are pre-assigned to the SOS/EOS markers
        self.index2word = {0: "SOS", 1: "EOS"}
        # Next free index; starts at 2 because 0 and 1 are already taken
        self.n_words = 2

    def addSentence(self, sentence):
        """Register every token of ``sentence``, splitting on single spaces."""
        for token in sentence.split(' '):
            self.addWord(token)

    def addWord(self, word):
        """Assign the next free index to ``word`` unless it is already known."""
        if word in self.word2index:
            return
        index = self.n_words
        # Keep both lookup directions in sync, then advance the counter
        self.word2index[word] = index
        self.index2word[index] = word
        self.n_words = index + 1

# Demo: build a vocabulary from a single sentence
name = 'eng'
sentence ="hello I am Jay"
eng1 = Lang(name)
eng1.addSentence(sentence)

# Inspect both lookup tables and the resulting vocabulary size
print("word2index:",eng1.word2index)
print("index2word:",eng1.index2word)
print("n_words:",eng1.n_words)

word2index: {‘hello’: 2, ‘I’: 3, ‘am’: 4, ‘Jay’: 5}
index2word: {0: ‘SOS’, 1: ‘EOS’, 2: ‘hello’, 3: ‘I’, 4: ‘am’, 5: ‘Jay’}
n_words: 6

2.2、字符规范化

#字符规范化
#将unicode 转为Ascii，我们可以认为是去掉一些语言中的重音标记
def unicodeToAscii(s):
    """Return ``s`` with accents removed.

    The string is NFD-decomposed and every character whose Unicode category
    is 'Mn' (non-spacing mark) is dropped.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)

#定义字符串规范化函数
def normalizeString(s):
    """Normalise a raw sentence into lowercase, space-separated tokens.

    s: raw sentence.
    Returns the lowercased, accent-free string in which '.', '!' and '?' are
    preceded by a space and every run of other non-letter characters is
    collapsed into a single space.
    """
    # Lowercase, trim surrounding whitespace, then strip accents
    s = unicodeToAscii(s.lower().strip())
    # Put a space in front of sentence punctuation so it becomes its own token
    s = re.sub(r"([.!?])",r" \1",s)
    # Collapse every run of characters other than letters and .!? into one space
    s = re.sub(r"[^a-zA-Z.!?]+",r" ",s)
    # The substitution above can leave a space at either end (e.g. when the
    # sentence starts or ends with a digit, quote or bracket); callers split
    # on ' ', so such a space would turn into an empty-string token.
    return s.strip()

# Demo: normalise one English sentence and print the result
s="Are you kidding me?"

nsr = normalizeString(s)
print(nsr)

are you kidding me ?
input_lang: <main.Lang object at 0x000002437ECABDD8>

2.3、将持久化文件中的数据加载到内存，并实例化类Lang

#将持久化文件中的数据加载到内存，并实例化Lang
# Raw string: the Windows path contains backslashes that must not be read as
# escape sequences (the resulting value is unchanged).
data_path = r'D:\data\code_and_data\eng-fra.txt'
def readLangs(lang1,lang2):
    """Load the corpus at ``data_path`` and create empty vocabularies.

    lang1: name of the source language.
    lang2: name of the target language.
    Returns (input_lang, output_lang, pairs): two empty ``Lang`` objects and
    one list of normalised column strings per line of the file.
    """
    # Read the whole file inside a context manager so the handle is closed
    with open(data_path,encoding='utf-8') as corpus:
        lines = corpus.read().strip().split('\n')
    # Each line holds tab-separated columns; normalise every column
    pairs =[[normalizeString(s) for s in l.split('\t')] for l in lines]
    # The vocabularies are created empty here and filled by the caller
    input_lang = Lang(lang1)
    output_lang = Lang(lang2)
    return input_lang, output_lang, pairs

# Demo: load the English/French corpus and show the first few pairs
lang1 = 'eng'
lang2 ='fra'

input_lang, output_lang, pairs = readLangs(lang1, lang2)
print("input_lang:", input_lang)
print("output_lang:", output_lang)
print("pairs[:5]:",pairs[:5])

pairs[:5]: [[‘go .’, ‘va !’], [‘run !’, ‘cours !’], [‘run !’, ‘courez !’], [‘wow !’, ‘ca alors !’], [‘fire !’, ‘au feu !’]]

2.4、过滤出符合我们要求的语言对

# Upper bound on the number of tokens (words or punctuation) in a sentence
MAX_LENGTH =10

# Only English sentences starting with one of these prefixes are kept for
# training. The contracted forms end with a space so that "he s " matches
# "he s tall ." but not "he saved us all ."; the "we" forms are included
# because the evaluation example further down uses "we re both teachers .".
eng_prefixes =(
    "i am ","i m ",
    "he is","he s ",
    "she is ","she s ",
    "you are ","you re ",
    "we are ","we re ",
    "they are ","they re "
)

#过滤语言对的具体逻辑函数
def filterPair(pair):
    """Tell whether a sentence pair qualifies for training.

    A pair qualifies when both sentences have fewer than MAX_LENGTH
    space-separated tokens and the source sentence starts with one of
    ``eng_prefixes``.
    """
    source = pair[0]
    # Reject early; the target sentence is only looked at when the source passes
    if len(source.split(' ')) >= MAX_LENGTH:
        return False
    if not source.startswith(eng_prefixes):
        return False
    return len(pair[1].split(' ')) < MAX_LENGTH

#过滤语言对的函数
def filterPairs(pairs):
    """Return the pairs accepted by ``filterPair``, in their original order."""
    kept = []
    for candidate in pairs:
        if filterPair(candidate):
            kept.append(candidate)
    return kept

# Demo: filter the loaded pairs and show the first few survivors
fpairs = filterPairs(pairs)
print("fpairs[:5]:",fpairs[:5])

fpairs[:5]: [[‘i m .’, ‘j ai ans .’], [‘i m ok .’, ‘je vais bien .’], [‘i m ok .’, ‘ca va .’], [‘i m fat .’, ‘je suis gras .’], [‘i m fat .’, ‘je suis gros .’]]

2.5、对以上数据准备函数进行整合，并使用类Lang对语言对进行数值映射

#整合数据预处理的函数
def prepareData(lang1,lang2):
    """Load, filter and index the corpus in one call.

    lang1: name of the source language.
    lang2: name of the target language.
    Returns (input_lang, output_lang, pairs): both vocabularies populated
    from the filtered pairs, and the filtered pair list itself.
    """
    source_vocab, target_vocab, all_pairs = readLangs(lang1, lang2)
    kept_pairs = filterPairs(all_pairs)
    # Feed every surviving sentence into the vocabulary of its language
    for entry in kept_pairs:
        source_vocab.addSentence(entry[0])
        target_vocab.addSentence(entry[1])
    return source_vocab, target_vocab, kept_pairs

# Build the vocabularies and filtered pairs used by the rest of the script
input_lang,output_lang,pairs=prepareData('eng','fra')
print("input_n_words:",input_lang.n_words)
print("output_n_words:",output_lang.n_words)
# Show one randomly chosen training pair
print(random.choice(pairs))

input_n_words: 3082

output_n_words: 4552

[‘you re going to ruin your eyes .’, ‘vous allez vous bousiller les yeux .’]

2.6、将语言对转化为模型输入需要的张量

#将语言对转化为模型输入需要的张量
def tensorFromSentence(lang,sentence):
    """Encode ``sentence`` as a column tensor of word indices ending with EOS.

    lang: vocabulary object providing ``word2index``.
    sentence: space-separated tokens; a token missing from the vocabulary
        raises KeyError.
    Returns a long tensor on ``device`` reshaped to (n_tokens + 1, 1).
    """
    token_ids = []
    for token in sentence.split(' '):
        token_ids.append(lang.word2index[token])
    # Terminate the sequence with the end-of-sentence marker
    token_ids.append(EOS_token)
    flat = torch.tensor(token_ids, dtype=torch.long, device=device)
    return flat.view(-1, 1)


def tensorsFromPair(pair):
    """Encode a (source, target) sentence pair with the module-level vocabularies.

    Returns a tuple (source tensor, target tensor).
    """
    return (
        tensorFromSentence(input_lang, pair[0]),
        tensorFromSentence(output_lang, pair[1]),
    )

# Demo: encode the first pair; pair_tensor is reused by the model demos below
pair = pairs[0]
pair_tensor = tensorsFromPair(pairs[0])
print(pair_tensor)

(tensor([[2],
[3],
[4],
[1]]), tensor([[2],
[3],
[4],
[5],
[1]]))

3、构建基于GRU的编码器和解码器

# 3、构建基于GRU的编码器和解码器
#编码器
class EncoderRNN(nn.Module):
    """GRU encoder that consumes one source token per call."""

    def __init__(self, input_size, hidden_size):
        """Build the embedding table and the GRU.

        input_size: number of rows of the embedding table (source vocabulary size).
        hidden_size: GRU hidden size, also used as the embedding dimension.
        """
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        # Embedding width equals hidden_size so it can feed the GRU directly
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden):
        """Encode one token.

        input: tensor holding the index of the current source token.
        hidden: previous GRU hidden state.
        Returns (output, hidden) as produced by the GRU.
        """
        # The GRU wants a 3-D input, so the embedding is reshaped to (1, 1, -1)
        embedded = self.embedding(input).view(1, 1, -1)
        return self.gru(embedded, hidden)

    def initHidden(self):
        """Return an all-zero hidden state of shape (1, 1, hidden_size) on ``device``."""
        return torch.zeros(1, 1, self.hidden_size, device=device)

# Demo: run a single source token through a small, untrained encoder
hidden_size =25
input_size =20

input = pair_tensor[0][0]
# pair_tensor was created on `device`, so the hidden state and the model must
# live there too; otherwise this cell fails on a CUDA machine.
hidden = torch.zeros(1,1,hidden_size,device=device)

encoder = EncoderRNN(input_size,hidden_size).to(device)
encoder_output,hidden = encoder(input,hidden)
print(encoder_output)

tensor([[[-2.8290e-01, -9.6259e-02, 2.5239e-01, -5.4805e-01, 1.4647e-01,
-4.3992e-02, 1.9509e-01, 2.4348e-01, 3.6683e-01, -5.5452e-02,
4.6634e-02, 1.5199e-01, 2.5359e-02, 3.4754e-01, 7.4054e-02,
8.3258e-02, 1.8888e-01, 3.2992e-01, 1.1495e-01, -2.5253e-01,
-4.0492e-02, 8.2203e-02, 4.5562e-04, 8.7427e-02, 1.9541e-01]]],
grad_fn=)

#解码器
class DecoderRNN(nn.Module):
    """Plain GRU decoder (no attention) that emits one target token per call."""

    def __init__(self, hidden_size, output_size):
        """Build the embedding table, GRU, output projection and log-softmax.

        hidden_size: GRU hidden size, also used as the embedding dimension.
        output_size: number of output classes (target vocabulary size).
        """
        super().__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)
        # Projects the GRU output onto the target vocabulary
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        """Decode one step.

        input: tensor holding the index of the previous target token.
        hidden: previous GRU hidden state.
        Returns (log-probabilities over the vocabulary, new hidden state).
        """
        # Reshape the embedding to the 3-D layout the GRU expects, then apply ReLU
        activated = F.relu(self.embedding(input).view(1, 1, -1))
        gru_output, hidden = self.gru(activated, hidden)
        # gru_output[0] drops the leading dimension before the linear layer
        log_probs = self.softmax(self.out(gru_output[0]))
        return log_probs, hidden

    def initHidden(self):
        """Return an all-zero hidden state of shape (1, 1, hidden_size) on ``device``."""
        return torch.zeros(1, 1, self.hidden_size, device=device)

# Demo: run a single target token through a small, untrained decoder
hidden_size=25
output_size =10

input1 = pair_tensor[1][0]
# pair_tensor was created on `device`, so the hidden state and the model must
# live there too; otherwise this cell fails on a CUDA machine.
hidden=torch.zeros(1,1,hidden_size,device=device)

decoder = DecoderRNN(hidden_size,output_size).to(device)
output,hidden = decoder(input1,hidden)
print(output)
print(output.shape)

tensor([[-2.3544, -2.3464, -2.1223, -2.2558, -2.2278, -2.4300, -2.5363, -2.3480,
-2.3790, -2.1068]], grad_fn=)
torch.Size([1, 10])

构建基于GRU和Attention的解码器

#构建基于GRU和Attention的解码器

class AttnDecoderRNN(nn.Module):
    """GRU decoder that attends over the encoder outputs at every step."""

    def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=MAX_LENGTH):
        """Build all layers.

        hidden_size: GRU hidden size, also used as the embedding dimension.
        output_size: number of output classes (target vocabulary size).
        dropout_p: probability used by the dropout layer on the embedding.
        max_length: number of encoder positions the attention scores cover.
        """
        super().__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length

        self.embedding = nn.Embedding(output_size, hidden_size)
        # Scores one weight per encoder position from [embedding ; hidden]
        self.attn = nn.Linear(self.hidden_size * 2, self.max_length)
        # Mixes [embedding ; attention context] back down to hidden_size
        self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout_p)
        self.gru = nn.GRU(self.hidden_size, self.hidden_size)
        # Projects the GRU output onto the target vocabulary
        self.out = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, input1, hidden, encoder_output):
        """Decode one step with attention.

        input1: tensor holding the index of the previous decoder token.
        hidden: previous GRU hidden state.
        encoder_output: 2-D matrix of encoder outputs, one row per position.
        Returns (log-probabilities, new hidden state, attention weights).
        """
        # Embed the token, reshape to 3-D and apply dropout
        embedded = self.dropout(self.embedding(input1).view(1, 1, -1))

        # Attention weights come from the embedding concatenated with the hidden state
        query = torch.cat((embedded[0], hidden[0]), 1)
        attn_weights = F.softmax(self.attn(query), dim=1)

        # Weighted sum of the encoder outputs; bmm needs a leading batch dimension
        context = torch.bmm(attn_weights.unsqueeze(0), encoder_output.unsqueeze(0))

        # Merge embedding and context, project, and restore the GRU layout
        merged = torch.cat((embedded[0], context[0]), 1)
        gru_input = F.relu(self.attn_combine(merged).unsqueeze(0))
        gru_output, hidden = self.gru(gru_input, hidden)

        log_probs = F.log_softmax(self.out(gru_output[0]), dim=1)
        return log_probs, hidden, attn_weights

    def initHidden(self):
        """Return an all-zero hidden state of shape (1, 1, hidden_size) on ``device``."""
        return torch.zeros(1, 1, self.hidden_size, device=device)


# Demo: one decoding step through a small, untrained attention decoder
# Constructor arguments
hidden_size=25
output_size=10

# Inputs. pair_tensor was created on `device`, so every other tensor and the
# model must live there too; otherwise this cell fails on a CUDA machine.
input1= pair_tensor[1][0]
hidden = torch.zeros(1,1,hidden_size,device=device)
# Random stand-in for the encoder outputs: 10 positions x hidden_size
encoder_output = torch.randn(10,25,device=device)

# Run the step and inspect the results
decoder= AttnDecoderRNN(hidden_size,output_size).to(device)
output,hidden,attn_weights = decoder(input1,hidden,encoder_output)
print(output)
print(output.shape)
print(hidden.shape)
print(attn_weights)
print(attn_weights.shape)

tensor([[-2.3155, -2.0316, -2.4451, -2.2536, -2.5503, -2.3917, -2.0801, -2.4474,
-2.2222, -2.4182]], grad_fn=)
torch.Size([1, 10])
torch.Size([1, 1, 25])
tensor([[0.0789, 0.0532, 0.1400, 0.0623, 0.3510, 0.1217, 0.0299, 0.0812, 0.0376,
0.0441]], grad_fn=)
torch.Size([1, 10])

4、构建模型训练函数，并进行训练

什么是teacher_forcing：它是一种用于序列生成任务的训练技巧，在seq2seq架构中，根据循环神经网络，解码器每次应该使用上一步的结果作为输入的一部分，但是训练过程中，一旦上一步的结果是错误的，就会导致这种错误被累积，无法达到训练效果。因此，我们需要一种机制改变上一步出错的情况。因为训练时我们已知正确的输出应该是什么，因此可以强制将上一步结果设置成正确的输出，这种方式叫做teacher_forcing

teacher_forcing作用：能够在训练的时候矫正模型的预测，避免在序列生成的过程中误差进一步放大。teacher_forcing能够极大的加快模型的收敛速度，令训练过程更快更平稳

构建训练函数train

构建时间计算函数timesince

调用训练函数并打印日志和制图

损失曲线分析：一直下降的损失曲线，说明模型正在收敛，能够从数据中找到一些规律应用于数据

# 4、构建模型训练函数，并进行训练
# Probability with which a training step uses teacher forcing
teacher_forcing_ratio =0.5

def train(input_tensor,target_tensor,encoder,decoder,encoder_optimizer,
          decoder_optimizer,criterion,max_length=MAX_LENGTH):
    """Run one optimisation step on a single sentence pair.

    input_tensor: source token indices, one token per position along dim 0.
    target_tensor: target token indices, one token per position along dim 0.
    encoder / decoder: the two modules being trained.
    encoder_optimizer / decoder_optimizer: their optimizers.
    criterion: loss applied to each decoder output and its target token.
    max_length: number of rows of the encoder-output matrix.
    Returns the summed loss divided by the target length.
    """
    # Clear gradients left over from the previous step
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    target_length = target_tensor.size(0)

    # Encode the source one token at a time, storing each output row;
    # rows past the sentence length stay zero.
    encoder_hidden = encoder.initHidden()
    encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)
    for position in range(input_tensor.size(0)):
        step_output, encoder_hidden = encoder(input_tensor[position], encoder_hidden)
        encoder_outputs[position] = step_output[0, 0]

    # Decoding starts from the SOS token and the encoder's final hidden state
    decoder_input = torch.tensor([[SOS_token]], device=device)
    decoder_hidden = encoder_hidden

    # Decide once per call whether the ground truth is fed back to the decoder
    use_teacher_forcing = random.random() < teacher_forcing_ratio

    loss = 0
    for step in range(target_length):
        decoder_output, decoder_hidden, _ = decoder(
            decoder_input, decoder_hidden, encoder_outputs)
        loss += criterion(decoder_output, target_tensor[step])

        if use_teacher_forcing:
            # Teacher forcing: the next input is the correct token
            decoder_input = target_tensor[step]
            continue

        # Free running: the next input is the decoder's own best guess
        _, best = decoder_output.topk(1)
        if best.squeeze().item() == EOS_token:
            # The model predicted end-of-sentence, so decoding stops here
            break
        decoder_input = best.squeeze().detach()

    # Back-propagate and update both networks
    loss.backward()
    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_length

#构建时间函数
import time
import math

def timeSince(since):
    """Format the time elapsed since ``since`` as '<minutes>m <seconds>s'.

    since: a timestamp previously obtained from ``time.time()``.
    """
    elapsed = time.time() - since
    minutes = math.floor(elapsed / 60)
    seconds = elapsed - minutes * 60
    return '%dm %ds' % (minutes, seconds)

# Demo: pretend the start time was ten minutes ago
since = time.time()-10*60

period = timeSince(since)
print(period)


#调用训练函数并打印日志和制图
import matplotlib.pyplot as plt
def trainIters(encoder,decoder,n_iters,print_every=1000,
               plot_every=100,learning_rate =0.01):
    """Train on randomly drawn pairs, log progress and plot the loss curve.

    encoder / decoder: the modules to train.
    n_iters: total number of training steps.
    print_every: a progress line is printed every this many steps.
    plot_every: an averaged loss point is recorded every this many steps.
    learning_rate: learning rate of both SGD optimizers.
    """
    started_at = time.time()
    # Averaged losses that make up the final curve
    curve = []
    # Running sums for the current print window and plot window
    print_window_sum = 0
    plot_window_sum = 0

    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
    criterion = nn.NLLLoss()

    for step in range(1, n_iters + 1):
        # One randomly chosen sentence pair per step
        source_tensor, target_tensor = tensorsFromPair(random.choice(pairs))
        step_loss = train(source_tensor, target_tensor, encoder, decoder,
                          encoder_optimizer, decoder_optimizer, criterion)
        print_window_sum += step_loss
        plot_window_sum += step_loss

        if step % print_every == 0:
            # Report the window average, then start a new window
            window_avg = print_window_sum / print_every
            print_window_sum = 0
            print('%s (%d %d%%) %.4f' % (timeSince(started_at), step,
                                         step / n_iters * 100, window_avg))

        if step % plot_every == 0:
            # Record the window average, then start a new window
            curve.append(plot_window_sum / plot_every)
            plot_window_sum = 0

    # Draw the loss curve
    plt.figure()
    plt.plot(curve)
    #plt.savefig("D:\data\s2s.png")

# Model configuration: full-size encoder and attention decoder on `device`
hidden_size=256
encoder1 = EncoderRNN(input_lang.n_words,hidden_size).to(device)
attn_decoder1 = AttnDecoderRNN(hidden_size,output_lang.n_words,
                               dropout_p=0.1).to(device)
# Alternative relying on the default dropout_p:
#attn_decoder1 = AttnDecoderRNN(hidden_size,output_lang.n_words).to(device)

# Training schedule
n_iters = 5000
print_every = 500

# Training only runs when the file is executed as a script
if __name__ == '__main__':
    trainIters(encoder1,attn_decoder1,n_iters,print_every=print_every)
    # Alternative using the default print_every:
    #trainIters(encoder1,attn_decoder1,n_iters)

0m 35s (500 10%) 4.4319
1m 12s (1000 20%) 4.0590
1m 49s (1500 30%) 3.7745
2m 28s (2000 40%) 3.6969
3m 5s (2500 50%) 3.5634
3m 44s (3000 60%) 3.4102
4m 21s (3500 70%) 3.4046
5m 0s (4000 80%) 3.3699
5m 38s (4500 90%) 3.2801
6m 17s (5000 100%) 3.1754

5、构建模型评估函数，并进行测试以及Attention效果分析

5.1、构建模型评估函数evaluate

def evaluate(encoder,decoder,sentence,max_length=MAX_LENGTH):
    """Greedily translate one source sentence.

    encoder / decoder: the trained modules.
    sentence: normalised source sentence whose tokens are all in the source
        vocabulary.
    max_length: maximum number of decoding steps, and number of rows of the
        encoder-output matrix.
    Returns (decoded_words, attentions): the generated tokens ('<EOS>' is
    appended when the model emits it) and one attention row per step.
    """
    # Remember the current modes so a caller that keeps training is unaffected
    encoder_was_training = encoder.training
    decoder_was_training = decoder.training
    # Inference must run in eval mode: torch.no_grad() alone does not switch
    # off Dropout, which would otherwise randomly zero parts of the decoder
    # embedding and make translations noisy.
    encoder.eval()
    decoder.eval()
    try:
        with torch.no_grad():
            # Encode the sentence into a column of word indices
            input_tensor = tensorFromSentence(input_lang,sentence)
            input_length = input_tensor.size(0)
            encoder_hidden = encoder.initHidden()
            # One row per source position; unused rows stay zero
            encoder_outputs= torch.zeros(max_length,encoder.hidden_size,device=device)

            # Run the encoder token by token and store each output row
            for ei in range(input_length):
                encoder_output,encoder_hidden = encoder(input_tensor[ei],encoder_hidden)
                encoder_outputs[ei] = encoder_output[0,0]

            # Decoding starts from SOS and the encoder's final hidden state
            decoder_input = torch.tensor([[SOS_token]],device=device)
            decoder_hidden = encoder_hidden
            decoded_words =[]
            # Attention weights of every decoding step, one row per step
            decoder_attentions = torch.zeros(max_length,max_length)

            for di in range(max_length):
                decoder_output,decoder_hidden,decoder_attention = decoder(
                    decoder_input,decoder_hidden,encoder_outputs)
                decoder_attentions[di] = decoder_attention.data
                # Greedy choice: take the most probable token
                topv,topi = decoder_output.data.topk(1)
                if topi.item()== EOS_token:
                    # End of sentence: record the marker and stop decoding
                    decoded_words.append('<EOS>')
                    break
                else:
                    decoded_words.append(output_lang.index2word[topi.item()])
                # The predicted token becomes the next decoder input
                decoder_input = topi.squeeze().detach()
            # Only the rows that were actually filled are returned
            return decoded_words,decoder_attentions[:di + 1]
    finally:
        encoder.train(encoder_was_training)
        decoder.train(decoder_was_training)

5.2、随机选择指定数量的数据进行评估

def evaluateRandomly(encoder,decoder,n=6):
    """Translate ``n`` randomly chosen training pairs and print the results.

    For every sample the source ('>'), the reference ('=') and the model
    output ('<') are printed, followed by a line containing a single space.
    """
    for _ in range(n):
        source, reference = (lambda chosen: (chosen[0], chosen[1]))(random.choice(pairs))
        print('>', source)
        print('=', reference)
        predicted_words, _attentions = evaluate(encoder, decoder, source)
        print('<', ' '.join(predicted_words))
        print(' ')

# Random evaluation only runs when the file is executed as a script
if __name__ == '__main__':
    evaluateRandomly(encoder1,attn_decoder1)

5.3、进行了Attention可视化分析

# Translate one sentence and display its attention matrix as an image
sentence="we re both teachers ."
output_words,attention = evaluate(encoder1,attn_decoder1,sentence)
print(output_words)
plt.figure()
# One row per decoding step, as returned by evaluate
plt.matshow(attention.numpy())

he saved us all .
= il nous a toutes sauvees .
< il a a de . .

they are russian .
= ils sont russes .
< ils sont en .

they re babies .
= ce sont des bebes .
< ils sont sont .

i m very strict .
= je suis tres strict .
< je suis tres en . .

she s eating for two .
= elle mange comme deux .
< elle est en train .

i m concentrating .
= je suis en train de me concentrer .
< je suis en .

[‘tu’, ‘es’, ‘vraiment’, ‘de’, ‘.’, ‘’]

分析：

Attention图像的横坐标代表输入的源语言各个词汇对应的索引，0-5分别对应["we", "re", "both", "teachers", ".", "<EOS>"]，纵坐标代表生成的目标语言各个词汇对应的索引，0-6代表['nous', 'sommes', 'toutes', 'deux', 'enseignantes', '.', '<EOS>']，图中浅色小方块（颜色越浅说明影响越大）代表词汇之间的影响关系，比如源语言的第1个词汇对生成目标语言的第1个词汇影响最大，源语言的第4、5个词对生成目标语言的第5个词影响最大。通过这样的可视化图形，我们可以知道Attention的效果好坏，与我们人为去判定到底还有多大的差距，进而衡量我们训练模型的可用性。

seq2seq模型架构分析

从图中可知，seq2seq模型架构，包括两部分分别是encoder（编码器）和decoder（解码器）。编码器和解码器的内部实现都使用了GRU模型，这里它要完成的是一个中文到英文的翻译：欢迎来北京 —>welcome to beijing 。编码器首先处理中文输入"欢迎来北京"，通过GRU模型获得每个时间步的输出张量，最后将它们拼接成一个中间语义张量C，接着解码器将使用这个中间语义张量c以及每一个时间步的隐层张量，逐个生成对应的翻译语言。

查看全文

http://www.kler.cn/a/273566.html