class TinyTransformer(nn.Module):
    """Single-block, character-level transformer with causal self-attention.

    Pipeline: token + position embeddings -> multi-head causal self-attention
    -> residual add + LayerNorm -> two-layer ReLU feed-forward -> residual add
    + LayerNorm -> linear head producing per-position logits.

    Every constructor argument defaults to the value that used to be
    hard-coded, so ``TinyTransformer()`` builds the same architecture (same
    parameter names, shapes and creation order) as before.

    Args:
        vocab_size: Number of distinct token ids; also the width of the logits.
        block_size: Longest sequence the model can process.
        n_embd: Width of the embeddings and of the residual stream.
        n_head: Number of attention heads; must divide ``n_embd``.
        ff_dim: Hidden width of the feed-forward sub-layer.

    Raises:
        ValueError: If ``n_embd`` is not divisible by ``n_head``.
    """

    def __init__(self, vocab_size=65, block_size=64, n_embd=64, n_head=2, ff_dim=128):
        super().__init__()
        if n_embd % n_head != 0:
            raise ValueError(
                f"n_embd ({n_embd}) must be divisible by n_head ({n_head})"
            )
        self.block_size = block_size
        self.n_head = n_head
        self.head_dim = n_embd // n_head

        # Token and learned absolute-position embeddings.
        self.char_embedding = nn.Embedding(vocab_size, n_embd)
        self.pos_embedding = nn.Embedding(block_size, n_embd)

        # Attention projections; heads are split out of the last axis later.
        self.query = nn.Linear(n_embd, n_embd)
        self.key = nn.Linear(n_embd, n_embd)
        self.value = nn.Linear(n_embd, n_embd)

        # Lower-triangular causal mask. Registered as a buffer so it follows
        # the module through .to()/.cuda(); non-persistent so the state_dict
        # keys stay exactly what they were when this was a plain attribute.
        self.register_buffer(
            "mask",
            torch.tril(torch.ones(block_size, block_size)),
            persistent=False,
        )

        # Feed-forward sub-layer: expand to ff_dim, then project back.
        self.ff1 = nn.Linear(n_embd, ff_dim)
        self.ff2 = nn.Linear(ff_dim, n_embd)

        self.output_head = nn.Linear(n_embd, vocab_size)
        self.norm1 = nn.LayerNorm(n_embd)
        self.norm2 = nn.LayerNorm(n_embd)
        self.out_proj = nn.Linear(n_embd, n_embd)

    def _split_heads(self, t):
        """Reshape (batch, seq, n_embd) into (batch, n_head, seq, head_dim)."""
        batch, seq_len, _ = t.shape
        return t.view(batch, seq_len, self.n_head, self.head_dim).transpose(1, 2)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Compute next-token logits for a batch of token-id sequences.

        Args:
            x: Integer tensor of token ids with shape ``(batch, seq_len)``,
                where ``seq_len <= block_size``.

        Returns:
            Float tensor of logits with shape ``(batch, seq_len, vocab_size)``.

        Raises:
            ValueError: If ``seq_len`` exceeds ``block_size``.
        """
        batch, seq_len = x.shape
        if seq_len > self.block_size:
            raise ValueError(
                f"sequence length {seq_len} exceeds block_size {self.block_size}"
            )

        # Build position ids on the input's device so GPU inputs work.
        positions = torch.arange(seq_len, device=x.device)
        h = self.char_embedding(x) + self.pos_embedding(positions)

        # ---- causal multi-head self-attention ----
        q = self._split_heads(self.query(h))
        k = self._split_heads(self.key(h))
        v = self._split_heads(self.value(h))

        # Scaled dot-product scores; future positions are set to -inf so the
        # softmax over the last (key) axis gives them zero weight.
        scores = (q @ k.transpose(-2, -1)) / self.head_dim ** 0.5
        causal = self.mask[:seq_len, :seq_len]
        scores = scores.masked_fill(causal == 0, float("-inf"))
        weights = scores.softmax(dim=-1)

        # Merge the heads back into a single n_embd-wide vector per position.
        attn = (weights @ v).transpose(1, 2).contiguous().view(batch, seq_len, -1)
        attn = self.out_proj(attn)

        # Residual connection + LayerNorm. Author's note from training runs:
        # adding this normalization lowered the loss from about 1.8 to ~1.5.
        h = self.norm1(h + attn)

        # ---- position-wise feed-forward ----
        ff = self.ff2(torch.relu(self.ff1(h)))
        h = self.norm2(h + ff)  # merge back into the main flow

        return self.output_head(h)
此段代码,于今AI域中,为训练Transformer之常式也,人皆习之。
吾独欲解其中一行小文,其源流故事,殊可玩味。
x = x + output
吾辈何故为此?X等于X加output乎?
神经网络之学,赖反向传播以成。反向传播者,变其滤波器以求近于正解之变化也。然有一弊:层转愈深,梯度愈微,计算之难与费亦随之增。此盖因偏导之链式法则也。然则此何以解之?
*此问题之历史*
览此篇于斯 —— 《Deep Residual Learning for Image Recognition》(He 等,微软研究院,2015)。
此篇论文出自微软研究团队之手,大抵述其如何解一难题——非愈多而愈善也。夫训练深度学习模型,当世先于此文者,模型愈深(即层数愈多),其误差亦愈增,此诚大患。众人莫知所措,盖一则有深层次之领悟,一则有愈多误差之困也。
解
或谓其解,但加初词向量于算得之文矩中,此思似当,然其理非君所料也。文中明言,非为此故。
“吾侪谓此优化之难,殆非由梯度消逝所致。”
何哉?释疑梯度消逝之故,盖有诸法以减之止之。其法借助批归一化(Batch Normalization)及ReLU之类。今示吾辈代码中行此之法。
x = self.norm1(x) # the batch normalization equivalent in transformers
output = self.ff1(x)
output = torch.relu(output) # another way to solve the vanishing gradient problem
output = self.ff2(output)
x = x + output # ← merge back into main flow
x = self.norm2(x)
x = self.output_head(x)
尔观此策,诚可解梯度消亡之患。然若去“x=x+output”之式,其效必逊。汝知乎?且试之。
此乃常法,未尝更易。今试改一事,去其“x=x+output”之句,但观此更于损耗函数之影响何如。
故仅此一行,损函数自一点七零跃至二点四七。虽似微末,然须知此惟单层之模,若层叠愈增,则谬误亦随之上徙也。为证吾言,当示梯度之变,于此间稍作微调可矣。
step 44500, loss: 2.5130
char_embedding.weight grad_norm: 0.006248
pos_embedding.weight grad_norm: 0.005838
ff1.weight grad_norm: 0.024721
ff2.weight grad_norm: 0.053932
output_head.weight grad_norm: 0.163109
norm1.weight grad_norm: 0.007271
norm2.weight grad_norm: 0.024594
step 45000, loss: 2.4751
char_embedding.weight grad_norm: 0.005574
pos_embedding.weight grad_norm: 0.005913
ff1.weight grad_norm: 0.023506
ff2.weight grad_norm: 0.056331
output_head.weight grad_norm: 0.161182
norm1.weight grad_norm: 0.007898
norm2.weight grad_norm: 0.020992
step 45500, loss: 2.4623
char_embedding.weight grad_norm: 0.006224
pos_embedding.weight grad_norm: 0.006075
ff1.weight grad_norm: 0.025461
ff2.weight grad_norm: 0.051210
output_head.weight grad_norm: 0.145062
norm1.weight grad_norm: 0.008452
norm2.weight grad_norm: 0.018521
step 46000, loss: 2.4764
char_embedding.weight grad_norm: 0.006709
pos_embedding.weight grad_norm: 0.006148
ff1.weight grad_norm: 0.026940
ff2.weight grad_norm: 0.057071
output_head.weight grad_norm: 0.163159
norm1.weight grad_norm: 0.008988
norm2.weight grad_norm: 0.025112
step 46500, loss: 2.4746
char_embedding.weight grad_norm: 0.006127
pos_embedding.weight grad_norm: 0.006181
ff1.weight grad_norm: 0.025931
ff2.weight grad_norm: 0.056799
output_head.weight grad_norm: 0.158272
norm1.weight grad_norm: 0.008369
norm2.weight grad_norm: 0.025981
吾所谓者,虽若解梯度渐衰之弊,实未之能也。
其所为之实,趣更甚焉。
每层与非线性皆施微变,其变相叠,迅疾若电。虽二十层犹可成事,然自二十至五十,层数愈增,复杂陡升。此等纤毫之变,纵其体量极微,所易之值却可翻天覆地,竟与原值判若云泥。譬如:
[5, 3, 8] ---> [5, 3, 8] ---> [0.3, 0.01, 0.2]
且观此非如梯度渐消之故,实因其间微调所致。若于此中复加原值,则当如何?
[5,3,8]加[0.3,0.01,0.2]得[5.3,3.01,8.2]
故所得者与元本极近,此即残差之要义。残差算法亦多矣,然为简便计,吾辈但守旧法之加法,诚然此道更佳。














