Fix moe_loss in gpt_j_residual path (#1180)
Fixes #1174

Co-authored-by: Yang Zhang <[email protected]>
yang and yang committed Mar 8, 2024
1 parent 1e7abe7 commit 82ddc66
Showing 1 changed file with 1 addition and 3 deletions.
4 changes: 1 addition & 3 deletions megatron/model/transformer.py
@@ -1046,6 +1046,7 @@ def _get_bias_dropout(self):
     def forward(self, x, attention_mask, layer_past=None):
         layer_past = layer_past if layer_past is not None else self.layer_past
         bias_dropout_fn = self._get_bias_dropout()
+        moe_loss = torch.tensor(0.0, device=x.device, dtype=x.dtype)
         # x: [b, s, h]
         if self.gpt_j_residual:
             # pseudocode:
@@ -1127,9 +1128,6 @@ def forward(self, x, attention_mask, layer_past=None):
 
             # output = x + mlp(ln2(x))
             layernorm_output = self.post_attention_layernorm(attention_output)
-            moe_loss = torch.tensor(
-                0.0, device=layernorm_output.device, dtype=layernorm_output.dtype
-            )
             mlp_bias = torch.tensor(
                 0.0, device=layernorm_output.device, dtype=layernorm_output.dtype
             )
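For context on why the one-line move suffices: after this change, moe_loss is assigned unconditionally at the top of forward, so the gpt_j_residual branch (which previously never set it) can safely use or return it, and the old branch-local zero initialization becomes redundant. Below is a minimal, hypothetical sketch of that pattern; forward_sketch and its placeholder computation are illustrative only and do not reproduce the real transformer.py layer.

import torch


def forward_sketch(x, gpt_j_residual: bool):
    # Hypothetical stand-in for ParallelTransformerLayer.forward.
    # Default auxiliary loss, defined before any branching so every
    # code path that later returns it sees a value (the pattern this
    # commit applies).
    moe_loss = torch.tensor(0.0, device=x.device, dtype=x.dtype)

    if gpt_j_residual:
        # gpt_j_residual path: before the fix, moe_loss was only assigned
        # in the other branch, so using it here could hit an undefined name.
        output = x  # placeholder for the attention/MLP computation
    else:
        output = x  # placeholder; an MoE MLP would also overwrite moe_loss

    return output, moe_loss


# Usage: both paths now yield a defined (zero) auxiliary loss.
x = torch.randn(2, 4, 8)
_, loss = forward_sketch(x, gpt_j_residual=True)
print(loss)  # tensor(0.)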
