层叠愈深而效反劣，残差遂生。

class TinyTransformer(nn.Module):
    """Single-block, decoder-only character-level transformer.

    The model embeds token ids and positions, applies one causal multi-head
    self-attention sub-layer and one feed-forward sub-layer (each wrapped in
    a residual connection followed by LayerNorm, i.e. post-norm), and
    projects the result to per-position vocabulary logits.

    Args:
        vocab_size: Number of distinct token ids (size of the output logits).
        embed_dim: Width of the residual stream.
        block_size: Maximum sequence length the model can attend over.
        num_heads: Number of attention heads; must divide ``embed_dim``.
        ff_dim: Hidden width of the feed-forward sub-layer.

    Raises:
        ValueError: If ``embed_dim`` is not divisible by ``num_heads``.

    The defaults reproduce the original fixed configuration
    (65 tokens, 64-dim embeddings, 64-token context, 2 heads, 128-dim FFN).
    """

    def __init__(self, vocab_size=65, embed_dim=64, block_size=64,
                 num_heads=2, ff_dim=128):
        super().__init__()
        if embed_dim % num_heads != 0:
            raise ValueError(
                f"embed_dim ({embed_dim}) must be divisible by "
                f"num_heads ({num_heads})"
            )
        self.block_size = block_size
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads

        # NOTE: sub-modules are created in the same order (and under the same
        # attribute names) as before, so seeded initialisation and existing
        # checkpoints keep working.
        self.char_embedding = nn.Embedding(vocab_size, embed_dim)
        self.pos_embedding = nn.Embedding(block_size, embed_dim)
        self.query = nn.Linear(embed_dim, embed_dim)
        self.key = nn.Linear(embed_dim, embed_dim)
        self.value = nn.Linear(embed_dim, embed_dim)
        # Lower-triangular causal mask. Registered as a buffer so it follows
        # the module across devices (.to / .cuda); persistent=False keeps it
        # out of state_dict so the checkpoint key set is unchanged.
        self.register_buffer(
            "mask",
            torch.tril(torch.ones(block_size, block_size)),
            persistent=False,
        )
        # Feed-forward sub-layer: expand to ff_dim ...
        self.ff1 = nn.Linear(embed_dim, ff_dim)
        # ... then project back to the residual-stream width.
        self.ff2 = nn.Linear(ff_dim, embed_dim)
        self.output_head = nn.Linear(embed_dim, vocab_size)
        self.norm1 = nn.LayerNorm(embed_dim)
        self.norm2 = nn.LayerNorm(embed_dim)
        self.out_proj = nn.Linear(embed_dim, embed_dim)

    def forward(self, x):
        """Compute next-token logits for a batch of token-id sequences.

        Args:
            x: Integer tensor of token ids with shape ``(batch, seq_len)``,
                where ``seq_len <= block_size``.

        Returns:
            Float tensor of logits with shape ``(batch, seq_len, vocab_size)``.

        Raises:
            ValueError: If ``seq_len`` exceeds ``block_size``.
        """
        batch_size, seq_len = x.shape
        if seq_len > self.block_size:
            raise ValueError(
                f"sequence length {seq_len} exceeds block_size "
                f"{self.block_size}"
            )

        # Build positions on the input's device and for its actual length,
        # instead of a fixed CPU-side arange(64).
        positions = torch.arange(seq_len, device=x.device)
        x = self.char_embedding(x) + self.pos_embedding(positions)

        # ---- causal multi-head self-attention ----
        def split_heads(t):
            # (batch, seq, embed) -> (batch, heads, seq, head_dim)
            return t.view(
                batch_size, seq_len, self.num_heads, self.head_dim
            ).transpose(1, 2)

        Q = split_heads(self.query(x))
        K = split_heads(self.key(x))
        V = split_heads(self.value(x))

        # Scaled dot-product scores, scaled by sqrt(head_dim).
        A = (Q @ K.transpose(-2, -1)) / self.head_dim ** 0.5
        # Block attention to future positions; slice the mask to seq_len.
        A = A.masked_fill(self.mask[:seq_len, :seq_len] == 0, float("-inf"))
        # Normalise over the key axis (last dimension).
        At = A.softmax(dim=-1)

        output = At @ V
        # (batch, heads, seq, head_dim) -> (batch, seq, embed)
        output = output.transpose(1, 2).contiguous().view(
            batch_size, seq_len, self.num_heads * self.head_dim
        )
        output = self.out_proj(output)

        # Residual connection + LayerNorm around attention. Original author's
        # note: adding this step reportedly lowered the training loss from
        # roughly 1.8 to roughly 1.5.
        x = x + output
        x = self.norm1(x)

        # ---- position-wise feed-forward ----
        output = self.ff1(x)
        output = torch.relu(output)
        output = self.ff2(output)

        x = x + output  # residual: merge back into the main flow
        x = self.norm2(x)
        x = self.output_head(x)
        return x

此段代码，于今AI域中，为训练Transformer之常式也，人皆习之。

吾独欲解其中一行小文，其源流故事，殊可玩味。

 x = x + output

吾辈何故为此？X等于X加output乎？

神经网络之学，赖反向传播以成。反向传播者，变其滤波器以求近于正解之变化也。然有一弊：层转愈深，梯度愈微，计算之难与费亦随之增。此盖因偏导之链式法则也。然则此何以解之？

此问题之历史

览此篇于斯 —— He 等，《Deep Residual Learning for Image Recognition》（微软研究院，2015）。

此篇论文出自微软研究团队之手，大抵述其如何解一难题——非愈多而愈善也。夫训练深度学习模型，当世先于此文者，模型愈深（即层数愈多），其误差亦愈增，此诚大患。众人莫知所措，盖一则有深层次之领悟，一则有愈多误差之困也。

解

或谓其解，但加初词向量于算得之文矩中，此思似当，然其理非君所料也。文中明言，非为此故。

吾侪谓此优化之难，殆非由梯度消逝所致。

何哉？盖梯度消逝之患，已有诸法以减之止之，如批归一化（Batch Normalization）及 ReLU 之类。今示吾辈代码中行此之法。

    # Excerpt from TinyTransformer.forward: everything after the attention
    # residual, showing the two usual mitigations for vanishing gradients.
    x = self.norm1(x)  # LayerNorm: the transformer counterpart of batch normalization

    # Position-wise feed-forward sub-layer (expand with ff1, project back with ff2).
    output = self.ff1(x)
    output = torch.relu(output)  # ReLU activation: another common mitigation for vanishing gradients
    output = self.ff2(output)

    x = x + output  # ← residual connection: merge the sub-layer output back into the main flow
    x = self.norm2(x)
    x = self.output_head(x)  # project to per-position vocabulary logits

尔观此策，诚可解梯度消亡之患。然若去“x=x+output”之式，其效必逊。汝知乎？且试之。

此乃常法，未尝更易。今试改一事，去其“x=x+output”之句，但观此更于损失函数之影响何如。

故仅去此一行，损失函数之值自 1.70 升至 2.47。虽似微末，然须知此惟单层之模，若层叠愈增，则谬误亦随之上徙也。为证吾言，当示梯度之变，于此间稍作微调可矣。

step 44500, loss: 2.5130
  char_embedding.weight          grad_norm: 0.006248
  pos_embedding.weight           grad_norm: 0.005838
  ff1.weight                     grad_norm: 0.024721
  ff2.weight                     grad_norm: 0.053932
  output_head.weight             grad_norm: 0.163109
  norm1.weight                   grad_norm: 0.007271
  norm2.weight                   grad_norm: 0.024594
step 45000, loss: 2.4751
  char_embedding.weight          grad_norm: 0.005574
  pos_embedding.weight           grad_norm: 0.005913
  ff1.weight                     grad_norm: 0.023506
  ff2.weight                     grad_norm: 0.056331
  output_head.weight             grad_norm: 0.161182
  norm1.weight                   grad_norm: 0.007898
  norm2.weight                   grad_norm: 0.020992
step 45500, loss: 2.4623
  char_embedding.weight          grad_norm: 0.006224
  pos_embedding.weight           grad_norm: 0.006075
  ff1.weight                     grad_norm: 0.025461
  ff2.weight                     grad_norm: 0.051210
  output_head.weight             grad_norm: 0.145062
  norm1.weight                   grad_norm: 0.008452
  norm2.weight                   grad_norm: 0.018521
step 46000, loss: 2.4764
  char_embedding.weight          grad_norm: 0.006709
  pos_embedding.weight           grad_norm: 0.006148
  ff1.weight                     grad_norm: 0.026940
  ff2.weight                     grad_norm: 0.057071
  output_head.weight             grad_norm: 0.163159
  norm1.weight                   grad_norm: 0.008988
  norm2.weight                   grad_norm: 0.025112
step 46500, loss: 2.4746
  char_embedding.weight          grad_norm: 0.006127
  pos_embedding.weight           grad_norm: 0.006181
  ff1.weight                     grad_norm: 0.025931
  ff2.weight                     grad_norm: 0.056799
  output_head.weight             grad_norm: 0.158272
  norm1.weight                   grad_norm: 0.008369
  norm2.weight                   grad_norm: 0.025981

吾所谓者，虽若解梯度渐衰之弊，实未之能也。
其所为之实，趣更甚焉。
每层与非线性皆施微变，其变相叠，迅疾若电。虽二十层犹可成事，然自二十至五十，层数愈增，复杂陡升。此等纤毫之变，纵其体量极微，所易之值却可翻天覆地，竟与原值判若云泥。譬如：
[5, 3, 8] ---> [5, 3, 8] ---> [0.3, 0.01, 0.2]
且观此非梯度渐消之故，实因其间微变相叠所致。若于此中复加原值，则当如何？
[5,3,8]加[0.3,0.01,0.2]得[5.3,3.01,8.2]
故所得者与元本极近，此即残差之要义。残差算法亦多矣，然为简便计，吾辈但守旧法之加法，诚然此道更佳。

推薦訂閱源

DEV Community

此问题之历史

解

推薦訂閱源

DEV Community

*此问题之历史 *

解

此问题之历史