Attention Mechanism (Transformer Introduction)
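The function below implements scaled dot-product attention from the Transformer paper: Attention(Q, K, V) = softmax(Q·Kᵀ / √d_k)·V, where d_k is the dimension of the key vectors. The imports are added here so the cell runs on its own.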
import math
import copy

import torch
import torch.nn as nn
import torch.nn.functional as F


def attention(query, key, value, mask=None, dropout=None):
    # Dimension of the word embeddings (d_k)
    d_k = query.size(-1)
    # Attention score tensor: scaled dot product of query and key
    scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(d_k)
    # Apply the mask, if one was given
    if mask is not None:
        # Wherever the mask is 0, replace the corresponding score with -inf (approximated by -1e9)
        scores = scores.masked_fill(mask == 0, -1e9)
    p_attn = F.softmax(scores, dim=-1)
    # Apply dropout, if given
    if dropout is not None:
        p_attn = dropout(p_attn)
    attn = torch.matmul(p_attn, value)
    return attn, p_attn
# Make some dummy data
q = k = v = out_pe            # output of the positional-encoding layer, shape (2, 4, 512)
mask = torch.zeros(2, 4, 4)   # must be broadcastable with the (2, 4, 4) score tensor
attn, p_attn = attention(q, k, v, mask=mask)
print(attn)
print(attn.shape)
print(p_attn)
print(p_attn.shape)
tensor([[[ -3.4330,  17.0039,   7.6916,  ...,  -1.1810,  13.4835,   6.5877],
         [ -3.4330,  17.0039,   7.6916,  ...,  -1.1810,  13.4835,   6.5877],
         [ -3.4330,  17.0039,   7.6916,  ...,  -1.1810,  13.4835,   6.5877],
         [ -3.4330,  17.0039,   7.6916,  ...,  -1.1810,  13.4835,   6.5877]],

        [[ -5.7291,   9.6484,  -3.8307,  ...,  -6.4001,   2.8157,  -2.3674],
         [ -5.7291,   9.6484,  -3.8307,  ...,  -6.4001,   2.8157,  -2.3674],
         [ -5.7291,   9.6484,  -3.8307,  ...,  -6.4001,   2.8157,  -2.3674],
         [ -5.7291,   9.6484,  -3.8307,  ...,  -6.4001,   2.8157,  -2.3674]]],
       grad_fn=
Note that all four rows within each batch are identical: the mask here is all zeros, so every score is replaced by -1e9, the softmax becomes uniform over the four positions, and each output row is simply the average of the value vectors.
Mask demonstration
There are two kinds of mask (see the Transformer introduction for details). The encoder and decoder use exactly the same attention code; only the mask matrix passed in differs: the encoder uses a padding mask, while the decoder additionally uses a look-ahead (subsequent) mask, sketched below.
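As a sketch of the decoder-side mask (the function name subsequent_mask is illustrative, not defined in the original text), a lower-triangular matrix allows position i to attend only to positions 0..i:

def subsequent_mask(size):
    # 1 where attention is allowed, 0 where it must be blocked
    attn_shape = (1, size, size)
    return torch.tril(torch.ones(attn_shape)).type(torch.uint8)

print(subsequent_mask(5))
# tensor([[[1, 0, 0, 0, 0],
#          [1, 1, 0, 0, 0],
#          [1, 1, 1, 0, 0],
#          [1, 1, 1, 1, 0],
#          [1, 1, 1, 1, 1]]], dtype=torch.uint8)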
input = torch.randn(2, 5, 5)
print(input)
tensor([[[ 1.6237, -0.4926,  0.2511, -0.0766,  1.2336],
         [ 0.6095,  0.3129, -1.3681,  1.4665,  2.7871],
         [ 0.5896, -0.3104, -1.5489,  0.7066, -0.5313],
         [-0.0101, -0.4480,  1.3695, -0.5241, -1.6751],
         [-1.0940, -1.4501,  0.1156,  0.7294, -0.7895]],

        [[-0.3445, -1.2161, -0.4054, -0.7804, -0.5310],
         [-1.7213, -0.9197, -1.7822, -0.0254,  1.1709],
         [-0.2137, -1.0617, -0.8737,  0.6546, -1.8320],
         [-1.9422,  0.4181, -0.5073,  0.2615,  0.0958],
         [ 0.9671, -0.9516, -0.0827, -0.1647, -0.7664]]])
mask = torch.tensor([[[1, 1, 1, 1, 1],
                      [1, 1, 1, 0, 1],
                      [1, 1, 0, 0, 1],
                      [1, 0, 0, 0, 0],
                      [0, 0, 0, 0, 0]]])
mask.shape  # torch.Size([1, 5, 5])
print(mask)
tensor([[[1, 1, 1, 1, 1],
         [1, 1, 1, 0, 1],
         [1, 1, 0, 0, 1],
         [1, 0, 0, 0, 0],
         [0, 0, 0, 0, 0]]])
input.masked_fill(mask == 0, -1e9)
tensor([[[ 1.6237e+00, -4.9256e-01,  2.5107e-01, -7.6616e-02,  1.2336e+00],
         [ 6.0951e-01,  3.1294e-01, -1.3681e+00, -1.0000e+09,  2.7871e+00],
         [ 5.8960e-01, -3.1039e-01, -1.0000e+09, -1.0000e+09, -5.3130e-01],
         [-1.0084e-02, -1.0000e+09, -1.0000e+09, -1.0000e+09, -1.0000e+09],
         [-1.0000e+09, -1.0000e+09, -1.0000e+09, -1.0000e+09, -1.0000e+09]],

        [[-3.4447e-01, -1.2161e+00, -4.0536e-01, -7.8037e-01, -5.3098e-01],
         [-1.7213e+00, -9.1969e-01, -1.7822e+00, -1.0000e+09,  1.1709e+00],
         [-2.1375e-01, -1.0617e+00, -1.0000e+09, -1.0000e+09, -1.8320e+00],
         [-1.9422e+00, -1.0000e+09, -1.0000e+09, -1.0000e+09, -1.0000e+09],
         [-1.0000e+09, -1.0000e+09, -1.0000e+09, -1.0000e+09, -1.0000e+09]]])
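A quick check (not from the original text, reusing input, mask, and the F import from the cells above) of why -1e9 behaves like minus infinity: after softmax, masked positions receive essentially zero probability, while a fully masked row degenerates into a uniform distribution.

masked = input.masked_fill(mask == 0, -1e9)
weights = F.softmax(masked, dim=-1)
print(weights)            # masked positions get (near) zero weight
print(weights.sum(-1))    # every row still sums to 1; the all-zero mask rows become uniform (0.2 each)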
Multi-head attention
# Clone function: produce N identical copies of a network layer, stored in an nn.ModuleList
def clones(module, N):
    return nn.ModuleList([copy.deepcopy(module) for _ in range(N)])
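A quick sanity check (not part of the original) showing that the clones are independent, deep-copied modules:

layers = clones(nn.Linear(512, 512), 4)
print(len(layers))               # 4
print(layers[0] is layers[1])    # False: deepcopy gives each layer its own parameters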
# Multi-head attention
class MultiHeadedAttention(nn.Module):
    def __init__(self, head, embedding_dim, dropout=0.1):
        super(MultiHeadedAttention, self).__init__()
        # The embedding dimension must be evenly divisible by the number of heads
        assert embedding_dim % head == 0
        # Number of heads
        self.head = head
        # Word-embedding dimension
        self.embedding_dim = embedding_dim
        # Per-head dimension after the split
        self.d_k = self.embedding_dim // self.head
        # Four embedding_dim x embedding_dim linear layers:
        # one each for Q, K, V, and a final layer applied after concatenation
        self.linears = clones(nn.Linear(self.embedding_dim, self.embedding_dim), 4)
        # Placeholder for the attention weights computed in forward()
        self.attn = None
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, query, key, value, mask=None):
        if mask is not None:
            # Add a leading dimension so the (head, seq, seq) mask broadcasts over the batch
            mask = mask.unsqueeze(0)
        batch_size = query.size(0)
        # Split Q, K, V into multiple heads, and swap the sequence-length dimension
        # with the head dimension so that seq_len sits next to the per-head feature dimension d_k
        query, key, value = [model(x).view(batch_size, -1, self.head, self.d_k).transpose(1, 2)
                             for model, x in zip(self.linears, (query, key, value))]
        x, self.attn = attention(query, key, value, mask=mask, dropout=self.dropout)
        # Reshape back to the input shape (batch, seq_len, embedding_dim)
        x = x.transpose(1, 2).contiguous().view(batch_size, -1, self.embedding_dim)
        # Process the output with the last linear layer in the list, as shown in the figure above
        return self.linears[-1](x)
head = 8
embedding_dim = 512
dropout = 0.2
q = k = v = out_pe
mask = torch.zeros(8, 4, 4)
mha = MultiHeadedAttention(head, embedding_dim, dropout)
out_mha = mha(q, k, v, mask)
print(out_mha)
print(out_mha.shape)
tensor([[[ -2.8880,  -8.5742,   1.3168,  ...,  -9.3297, -10.9577,  -8.8687],
         [  1.1822,  -7.6474,  -1.2463,  ...,  -8.7212,  -8.4979,  -4.2104],
         [ -0.4473,  -8.8571,   2.7700,  ...,  -5.2058, -11.9773,  -8.1035],
         [  0.1940,  -7.0324,  -0.5626,  ...,  -8.7001,  -5.0893,  -7.9655]],

        [[  6.0457,  -4.1362,  -8.6744,  ...,  -4.0828,   2.2592,   5.6931],
         [  3.2074,  -2.5000,  -6.2406,  ...,  -5.8451,  10.2600,   2.8066],
         [  4.5255,  -1.2257,  -7.2707,  ...,  -8.0517,   5.8174,  -2.4052],
         [  3.9835,  -0.6454,  -5.9249,  ...,  -7.9849,   5.4700,   2.4230]]],
       grad_fn=)
torch.Size([2, 4, 512])
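To make the head-splitting and merging step concrete, here is a minimal shape walk-through (not from the original; the numbers 2, 4, 8 and 64 come from the demo above: batch 2, sequence length 4, 8 heads, d_k = 512 / 8 = 64):

x = torch.randn(2, 4, 512)                 # (batch, seq_len, embedding_dim)
x = x.view(2, -1, 8, 64)                   # (batch, seq_len, head, d_k)
x = x.transpose(1, 2)                      # (batch, head, seq_len, d_k)
print(x.shape)                             # torch.Size([2, 8, 4, 64])
# Merging back: undo the transpose, make the tensor contiguous, then flatten heads into the embedding
x = x.transpose(1, 2).contiguous().view(2, -1, 512)
print(x.shape)                             # torch.Size([2, 4, 512])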
contiguous demonstration
After a transpose, the tensor is no longer contiguous in memory, so contiguous() must be called before view() can be used, as the sketch below shows.
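A minimal sketch of this behaviour (variable names are illustrative):

t = torch.randn(3, 4)
tt = t.transpose(0, 1)                          # shares storage with t, but is no longer contiguous
print(t.is_contiguous(), tt.is_contiguous())    # True False
try:
    tt.view(12)                                 # view() requires contiguous memory
except RuntimeError as e:
    print("view failed:", e)
print(tt.contiguous().view(12).shape)           # works after contiguous(): torch.Size([12])
print(tt.reshape(12).shape)                     # reshape() copies when needed and also works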