Preface
These are annotations I made on the code while learning from a blogger's posts and videos; they deepen my understanding of the structure and make it easier to revisit later.
For the record: the code lives on the server at ~/medical-image-segmentation/transformer/wsy_transformer_study/transformer_study_1.py
Colab: https://colab.research.google.com/drive/10x4Dj68g5v9yErYoGB_5EqPFrAee2_p8?usp=sharing
Reference: https://wmathor.com/index.php/archives/1438/
Code
import math
import torch
import numpy as np
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as Data
# S: Symbol that shows starting of decoding input
# E: Symbol that shows ending of decoding output
# P: Symbol that will fill in the blank sequence if the current batch data size is shorter than time steps
sentences = [
# enc_input dec_input dec_output
['ich mochte ein bier P', 'S i want a beer .', 'i want a beer . E'],
['ich mochte ein cola P', 'S i want a coke .', 'i want a coke . E']
]
# Padding Should be Zero
src_vocab = {'P': 0, 'ich': 1, 'mochte': 2, 'ein': 3, 'bier': 4, 'cola': 5}
src_vocab_size = len(src_vocab)
tgt_vocab = {
'P': 0,
'i': 1,
'want': 2,
'a': 3,
'beer': 4,
'coke': 5,
'S': 6,
'E': 7,
'.': 8
}
idx2word = {i: w for i, w in enumerate(tgt_vocab)}
tgt_vocab_size = len(tgt_vocab)
src_len = 5 # enc_input max sequence length
tgt_len = 6 # dec_input(=dec_output) max sequence length
d_model = 512  # embedding dimension; the word embedding and positional embedding must share this size so they can be added together
d_ff = 2048  # hidden dimension that the FeedForward network expands to when extracting features
d_k = d_v = 64  # per-head dimensions of Q, K and V; Q and K must have the same dimension, V's dimension has no such constraint
n_layers = 6  # number of stacked encoder / decoder layers
n_heads = 8  # number of heads in Multi-Head Attention
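# Note: n_heads * d_k = 8 * 64 = 512 = d_model, so concatenating the heads in MultiHeadAttention
# gives back a d_model-wide tensor and the residual connection still lines up.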
def make_data(sentences):
enc_inputs, dec_inputs, dec_outputs = [], [], []
for i in range(len(sentences)):
enc_input = [[src_vocab[n] for n in sentences[i][0].split()]
] # [[1, 2, 3, 4, 0], [1, 2, 3, 5, 0]]
dec_input = [[tgt_vocab[n] for n in sentences[i][1].split()]
] # [[6, 1, 2, 3, 4, 8], [6, 1, 2, 3, 5, 8]]
dec_output = [[tgt_vocab[n] for n in sentences[i][2].split()]
] # [[1, 2, 3, 4, 8, 7], [1, 2, 3, 5, 8, 7]]
enc_inputs.extend(enc_input)
dec_inputs.extend(dec_input)
dec_outputs.extend(dec_output)
return torch.LongTensor(enc_inputs), torch.LongTensor(
dec_inputs), torch.LongTensor(dec_outputs)
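# For the two sentences above, make_data returns tensors of shape
# enc_inputs: [2, 5], dec_inputs: [2, 6], dec_outputs: [2, 6].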
class MyDataSet(Data.Dataset):
def __init__(self, enc_inputs, dec_inputs, dec_outputs):
super(MyDataSet, self).__init__()
self.enc_inputs = enc_inputs
self.dec_inputs = dec_inputs
self.dec_outputs = dec_outputs
def __len__(self):
return self.enc_inputs.shape[0]
def __getitem__(self, idx):
return self.enc_inputs[idx], self.dec_inputs[idx], self.dec_outputs[
idx]
class PositionalEncoding(nn.Module):
def __init__(self, d_model, dropout=0.1, max_len=5000):
super(PositionalEncoding, self).__init__()
self.dropout = nn.Dropout(p=dropout)
pe = torch.zeros(max_len, d_model)
position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
div_term = torch.exp(
torch.arange(0, d_model, 2).float() *
(-math.log(10000.0) / d_model))
pe[:, 0::2] = torch.sin(position * div_term)
pe[:, 1::2] = torch.cos(position * div_term)
pe = pe.unsqueeze(0).transpose(0, 1)
self.register_buffer('pe', pe)
def forward(self, x):
'''
x: [seq_len, batch_size, d_model]
'''
x = x + self.pe[:x.size(0), :]
return self.dropout(x)
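# The buffer `pe` above implements the fixed sinusoidal encoding from the paper:
#   PE(pos, 2i)   = sin(pos / 10000^(2i / d_model))
#   PE(pos, 2i+1) = cos(pos / 10000^(2i / d_model))
# div_term equals exp(-2i * ln(10000) / d_model), i.e. 1 / 10000^(2i / d_model).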
def get_attn_pad_mask(seq_q, seq_k):
'''
seq_q: [batch_size, seq_len]
seq_k: [batch_size, seq_len]
seq_len could be src_len or it could be tgt_len
seq_len in seq_q and seq_len in seq_k maybe not equal
'''
batch_size, len_q = seq_q.size()
batch_size, len_k = seq_k.size()
    # first mark the padding positions with True/False, then unsqueeze to expand the mask from 2-D to 3-D
    pad_attn_mask = seq_k.data.eq(0).unsqueeze(
        1)  # [batch_size, 1, len_k], True is masked
return pad_attn_mask.expand(batch_size, len_q,
len_k) # [batch_size, len_q, len_k]
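# Worked example with the first encoder sentence [1, 2, 3, 4, 0] ('ich mochte ein bier P'):
#   seq_k.data.eq(0)              -> [False, False, False, False, True]
#   after unsqueeze(1) and expand -> a [5, 5] matrix whose last column is True,
# so no query position is allowed to attend to the padding token.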
def get_attn_subsequence_mask(seq):
'''
seq: [batch_size, tgt_len]
'''
attn_shape = [seq.size(0), seq.size(1), seq.size(1)]
subsequence_mask = np.triu(np.ones(attn_shape),
k=1) # Upper triangular matrix
subsequence_mask = torch.from_numpy(subsequence_mask).byte()
return subsequence_mask # [batch_size, tgt_len, tgt_len]
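# Worked example for a length-4 target: np.triu(np.ones((4, 4)), k=1) gives
#   [[0, 1, 1, 1],
#    [0, 0, 1, 1],
#    [0, 0, 0, 1],
#    [0, 0, 0, 0]]
# i.e. position i may only attend to positions <= i, hiding future words during training.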
class ScaledDotProductAttention(nn.Module):
def __init__(self):
super(ScaledDotProductAttention, self).__init__()
def forward(self, Q, K, V, attn_mask):
'''
Q: [batch_size, n_heads, len_q, d_k]
K: [batch_size, n_heads, len_k, d_k]
V: [batch_size, n_heads, len_v(=len_k), d_v]
attn_mask: [batch_size, n_heads, seq_len, seq_len]
'''
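        # scaled dot-product attention: Attention(Q, K, V) = softmax(Q·K^T / sqrt(d_k)) · V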
        # first combine Q and K by matrix-multiplying Q with K transposed
scores = torch.matmul(Q, K.transpose(-1, -2)) / np.sqrt(
d_k) # scores : [batch_size, n_heads, len_q, len_k]
        # because of the padding we cannot apply softmax directly; from the softmax formula exp(x_i) / Σ_j exp(x_j), making the output 0 at a pad position requires exp(x_i) = 0, i.e. x_i = -inf
scores.masked_fill_(
attn_mask, -1e9
)
        # now apply softmax; the positions that are True in the mask have already been filled with -inf (approximated by -1e9)
attn = nn.Softmax(dim=-1)(scores)
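        # e.g. softmax([2.1, 0.3, -1e9]) ≈ [0.86, 0.14, 0.00]: the masked position contributes
        # (almost) nothing to the weighted sum with V below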
        # attn holds the actual attention weights; multiply them with V (note to self: haven't worked through the details of this matmul yet, leaving a placeholder)
context = torch.matmul(attn, V) # [batch_size, n_heads, len_q, d_v]
return context, attn
class MultiHeadAttention(nn.Module):
def __init__(self):
super(MultiHeadAttention, self).__init__()
self.W_Q = nn.Linear(d_model, d_k * n_heads, bias=False)
self.W_K = nn.Linear(d_model, d_k * n_heads, bias=False)
self.W_V = nn.Linear(d_model, d_v * n_heads, bias=False)
self.fc = nn.Linear(n_heads * d_v, d_model, bias=False)
def forward(self, input_Q, input_K, input_V, attn_mask):
'''
input_Q: [batch_size, len_q, d_model]
input_K: [batch_size, len_k, d_model]
input_V: [batch_size, len_v(=len_k), d_model]
attn_mask: [batch_size, seq_len, seq_len]
'''
residual, batch_size = input_Q, input_Q.size(0)
        # the comment below shows the transformation: project to the new d_model, split it into heads (how exactly the split works I don't fully understand yet), then move the head dimension forward
# (B, S, D) -proj-> (B, S, D_new) -split-> (B, S, H, W) -trans-> (B, H, S, W)
Q = self.W_Q(input_Q).view(batch_size, -1, n_heads, d_k).transpose(
1, 2) # Q: [batch_size, n_heads, len_q, d_k]
K = self.W_K(input_K).view(batch_size, -1, n_heads, d_k).transpose(
1, 2) # K: [batch_size, n_heads, len_k, d_k]
V = self.W_V(input_V).view(batch_size, -1, n_heads, d_v).transpose(
1, 2) # V: [batch_size, n_heads, len_v(=len_k), d_v]
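        # concrete shapes for this toy data, e.g. on the encoder side (batch_size=2, src_len=5, d_model=512, n_heads=8, d_k=64):
        #   (2, 5, 512) -proj-> (2, 5, 512) -view-> (2, 5, 8, 64) -transpose-> (2, 8, 5, 64)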
        # attn_mask is 3-D while Q, K, V above are now 4-D, so first expand its dimensions to make the shapes compatible
attn_mask = attn_mask.unsqueeze(1).repeat(
1, n_heads, 1,
1) # attn_mask : [batch_size, n_heads, seq_len, seq_len]
        # next, enter the ScaledDotProductAttention class
# context: [batch_size, n_heads, len_q, d_v], attn: [batch_size, n_heads, len_q, len_k]
context, attn = ScaledDotProductAttention()(Q, K, V, attn_mask)
        # swap n_heads and len_q back, then reshape
context = context.transpose(1, 2).reshape(
batch_size, -1,
n_heads * d_v) # context: [batch_size, len_q, n_heads * d_v]
output = self.fc(context) # [batch_size, len_q, d_model]
        # residual connection plus LayerNorm (the Add & Norm step), then the output moves on to the FeedForward network (note: the LayerNorm here is created fresh on every forward call, so its affine parameters are never trained)
return nn.LayerNorm(d_model).cuda()(output + residual), attn
class PoswiseFeedForwardNet(nn.Module):
def __init__(self):
super(PoswiseFeedForwardNet, self).__init__()
self.fc = nn.Sequential(nn.Linear(d_model, d_ff,
bias=False), nn.ReLU(),
nn.Linear(d_ff, d_model, bias=False))
def forward(self, inputs):
'''
inputs: [batch_size, seq_len, d_model]
'''
residual = inputs
output = self.fc(inputs)
return nn.LayerNorm(d_model).cuda()(
output + residual) # [batch_size, seq_len, d_model]
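# The position-wise feed-forward network above computes FFN(x) = W2 · ReLU(W1 · x),
# mapping d_model -> d_ff -> d_model (512 -> 2048 -> 512) independently at every position.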
class EncoderLayer(nn.Module):
def __init__(self):
super(EncoderLayer, self).__init__()
self.enc_self_attn = MultiHeadAttention()
self.pos_ffn = PoswiseFeedForwardNet()
def forward(self, enc_inputs, enc_self_attn_mask):
'''
enc_inputs: [batch_size, src_len, d_model]
enc_self_attn_mask: [batch_size, src_len, src_len]
'''
# enc_outputs: [batch_size, src_len, d_model], attn: [batch_size, n_heads, src_len, src_len]
        # below, enc_inputs is fed in as Q, K and V for multi-head self-attention; jump to the MultiHeadAttention class
enc_outputs, attn = self.enc_self_attn(
enc_inputs, enc_inputs, enc_inputs,
enc_self_attn_mask) # enc_inputs to same Q,K,V
        # a simple feature-extraction step, i.e. the FeedForward pass; the residual connection is handled inside it
enc_outputs = self.pos_ffn(
enc_outputs) # enc_outputs: [batch_size, src_len, d_model]
return enc_outputs, attn
class DecoderLayer(nn.Module):
def __init__(self):
super(DecoderLayer, self).__init__()
self.dec_self_attn = MultiHeadAttention()
self.dec_enc_attn = MultiHeadAttention()
self.pos_ffn = PoswiseFeedForwardNet()
def forward(self, dec_inputs, enc_outputs, dec_self_attn_mask,
dec_enc_attn_mask):
'''
dec_inputs: [batch_size, tgt_len, d_model]
enc_outputs: [batch_size, src_len, d_model]
dec_self_attn_mask: [batch_size, tgt_len, tgt_len]
dec_enc_attn_mask: [batch_size, tgt_len, src_len]
'''
# dec_outputs: [batch_size, tgt_len, d_model], dec_self_attn: [batch_size, n_heads, tgt_len, tgt_len]
dec_outputs, dec_self_attn = self.dec_self_attn(
dec_inputs, dec_inputs, dec_inputs, dec_self_attn_mask)
        # dec_outputs: [batch_size, tgt_len, d_model], dec_enc_attn: [batch_size, n_heads, tgt_len, src_len]
dec_outputs, dec_enc_attn = self.dec_enc_attn(dec_outputs, enc_outputs,
enc_outputs,
dec_enc_attn_mask)
dec_outputs = self.pos_ffn(
dec_outputs) # [batch_size, tgt_len, d_model]
return dec_outputs, dec_self_attn, dec_enc_attn
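# Note on the two attention blocks above: dec_self_attn uses dec_inputs as Q, K and V with the
# combined padding + subsequence mask, while dec_enc_attn takes Q from the decoder output and
# K, V from enc_outputs, so every target position can look at the whole source sentence.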
class Encoder(nn.Module):
def __init__(self):
super(Encoder, self).__init__()
        # src_emb is the token embedding: each token is mapped to a d_model-dimensional vector
self.src_emb = nn.Embedding(src_vocab_size, d_model)
        # positional encoding; note: in the Transformer the positional encoding is fixed by a mathematical formula, not learned
self.pos_emb = PositionalEncoding(d_model)
self.layers = nn.ModuleList([EncoderLayer() for _ in range(n_layers)])
def forward(self, enc_inputs):
'''
enc_inputs: [batch_size, src_len]
'''
enc_outputs = self.src_emb(
enc_inputs) # [batch_size, src_len, d_model]
enc_outputs = self.pos_emb(enc_outputs.transpose(0, 1)).transpose(
0, 1) # [batch_size, src_len, d_model]
        # build the padding mask
enc_self_attn_mask = get_attn_pad_mask(
enc_inputs, enc_inputs) # [batch_size, src_len, src_len]
enc_self_attns = []
        # next comes multi-head attention; we iterate over the ModuleList, i.e. the n_layers defined at the top.
        # Note: the loop starts here because the Transformer's stacked layers do not restart from the raw input each time;
        # positional encoding is not applied again, and each layer begins directly at multi-head attention.
        # For the details of the block below, see the EncoderLayer class.
for layer in self.layers:
# enc_outputs: [batch_size, src_len, d_model], enc_self_attn: [batch_size, n_heads, src_len, src_len]
enc_outputs, enc_self_attn = layer(enc_outputs, enc_self_attn_mask)
enc_self_attns.append(enc_self_attn)
        # at this point one full encoder pass is done
return enc_outputs, enc_self_attns
class Decoder(nn.Module):
def __init__(self):
super(Decoder, self).__init__()
self.tgt_emb = nn.Embedding(tgt_vocab_size, d_model)
self.pos_emb = PositionalEncoding(d_model)
self.layers = nn.ModuleList([DecoderLayer() for _ in range(n_layers)])
def forward(self, dec_inputs, enc_inputs, enc_outputs):
'''
        dec_inputs: [batch_size, tgt_len]
        enc_inputs: [batch_size, src_len]
        enc_outputs: [batch_size, src_len, d_model]
'''
        dec_outputs = self.tgt_emb(
            dec_inputs)  # [batch_size, tgt_len, d_model]
dec_outputs = self.pos_emb(dec_outputs.transpose(0, 1)).transpose(
0, 1).cuda() # [batch_size, tgt_len, d_model]
dec_self_attn_pad_mask = get_attn_pad_mask(
dec_inputs, dec_inputs).cuda() # [batch_size, tgt_len, tgt_len]
dec_self_attn_subsequence_mask = get_attn_subsequence_mask(
dec_inputs).cuda() # [batch_size, tgt_len, tgt_len]
dec_self_attn_mask = torch.gt(
(dec_self_attn_pad_mask + dec_self_attn_subsequence_mask),
0).cuda() # [batch_size, tgt_len, tgt_len]
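        # the padding mask and the subsequence mask are added element-wise; torch.gt(..., 0) then
        # marks a position as masked (True) if it is a pad token, a future token, or both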
        dec_enc_attn_mask = get_attn_pad_mask(
            dec_inputs, enc_inputs)  # [batch_size, tgt_len, src_len]
dec_self_attns, dec_enc_attns = [], []
for layer in self.layers:
            # dec_outputs: [batch_size, tgt_len, d_model], dec_self_attn: [batch_size, n_heads, tgt_len, tgt_len], dec_enc_attn: [batch_size, n_heads, tgt_len, src_len]
dec_outputs, dec_self_attn, dec_enc_attn = layer(
dec_outputs, enc_outputs, dec_self_attn_mask,
dec_enc_attn_mask)
dec_self_attns.append(dec_self_attn)
dec_enc_attns.append(dec_enc_attn)
return dec_outputs, dec_self_attns, dec_enc_attns
class Transformer(nn.Module):
def __init__(self):
super(Transformer, self).__init__()
self.encoder = Encoder().cuda()
self.decoder = Decoder().cuda()
        # projection: maps the d_model dimension to the size of the prediction space (tgt_vocab_size); in image processing this would correspond to the number of classes in the ground truth
self.projection = nn.Linear(d_model, tgt_vocab_size, bias=False).cuda()
def forward(self, enc_inputs, dec_inputs):
'''
enc_inputs: [batch_size, src_len]
dec_inputs: [batch_size, tgt_len]
'''
# tensor to store decoder outputs
# outputs = torch.zeros(batch_size, tgt_len, tgt_vocab_size).to(self.device)
        # start the encoder pass; see the Encoder class below
# enc_outputs: [batch_size, src_len, d_model], enc_self_attns: [n_layers, batch_size, n_heads, src_len, src_len]
enc_outputs, enc_self_attns = self.encoder(enc_inputs)
        # the decoder reuses almost all of the encoder code, so it is not annotated again, but the code itself is very clear
        # dec_outputs: [batch_size, tgt_len, d_model], dec_self_attns: [n_layers, batch_size, n_heads, tgt_len, tgt_len], dec_enc_attns: [n_layers, batch_size, n_heads, tgt_len, src_len]
dec_outputs, dec_self_attns, dec_enc_attns = self.decoder(
dec_inputs, enc_inputs, enc_outputs)
dec_logits = self.projection(
dec_outputs) # dec_logits: [batch_size, tgt_len, tgt_vocab_size]
return dec_logits.view(
-1,
dec_logits.size(-1)), enc_self_attns, dec_self_attns, dec_enc_attns
if __name__ == '__main__':
enc_inputs, dec_inputs, dec_outputs = make_data(sentences)
loader = Data.DataLoader(MyDataSet(enc_inputs, dec_inputs, dec_outputs), 2,
True)
    # build the model; enter the Transformer class
model = Transformer().cuda()
criterion = nn.CrossEntropyLoss(ignore_index=0)
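    # CrossEntropyLoss expects logits of shape [N, tgt_vocab_size] and targets of shape [N], which is
    # why Transformer.forward returns flattened logits and dec_outputs is flattened with view(-1) below;
    # ignore_index=0 makes the padding ('P') positions contribute nothing to the loss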
optimizer = optim.SGD(model.parameters(), lr=1e-3, momentum=0.99)
for epoch in range(1000):
for enc_inputs, dec_inputs, dec_outputs in loader:
'''
enc_inputs: [batch_size, src_len]
dec_inputs: [batch_size, tgt_len]
dec_outputs: [batch_size, tgt_len]
'''
enc_inputs, dec_inputs, dec_outputs = enc_inputs.cuda(
), dec_inputs.cuda(), dec_outputs.cuda()
# outputs: [batch_size * tgt_len, tgt_vocab_size]
outputs, enc_self_attns, dec_self_attns, dec_enc_attns = model(
enc_inputs, dec_inputs)
loss = criterion(outputs, dec_outputs.view(-1))
print('Epoch:', '%04d' % (epoch + 1), 'loss =',
'{:.6f}'.format(loss))
optimizer.zero_grad()
loss.backward()
optimizer.step()
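To remind myself how to actually use the trained model, here is a minimal greedy-decoding sketch (my own addition, not part of the code above) that can be appended at the end of the `if __name__ == '__main__':` block. It only relies on names already defined there (model, enc_inputs, tgt_vocab, idx2word, tgt_len): starting from the 'S' symbol, it feeds the decoder its own argmax prediction one token at a time until '.' is produced.
    # Greedy decoding sketch: grow the decoder input one token at a time, always taking the argmax.
    model.eval()
    with torch.no_grad():
        greedy_enc_input = enc_inputs[0].unsqueeze(0)  # [1, src_len], first sentence of the last batch
        greedy_dec_input = torch.LongTensor([[tgt_vocab['S']]]).cuda()  # begin with the start symbol
        for _ in range(tgt_len):
            # outputs: [current_length, tgt_vocab_size] because the batch size here is 1
            outputs, _, _, _ = model(greedy_enc_input, greedy_dec_input)
            next_word = outputs[-1].argmax().item()  # most probable token for the last position
            greedy_dec_input = torch.cat(
                [greedy_dec_input,
                 torch.LongTensor([[next_word]]).cuda()], dim=-1)
            if idx2word[next_word] == '.':  # stop once the sentence-final '.' appears
                break
        print([idx2word[int(i)] for i in greedy_dec_input[0]][1:])  # drop the leading 'S'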