Fix moe_loss in gpt_j_residual path (#1180)
Fixes #1174

Co-authored-by: Yang Zhang <[email protected]>
yang and yang committed Mar 8, 2024
1 parent 1e7abe7 commit 82ddc66
Showing 1 changed file with 1 addition and 3 deletions.
4 changes: 1 addition & 3 deletions megatron/model/transformer.py
@@ -1046,6 +1046,7 @@ def _get_bias_dropout(self):
     def forward(self, x, attention_mask, layer_past=None):
         layer_past = layer_past if layer_past is not None else self.layer_past
         bias_dropout_fn = self._get_bias_dropout()
+        moe_loss = torch.tensor(0.0, device=x.device, dtype=x.dtype)
         # x: [b, s, h]
         if self.gpt_j_residual:
             # pseudocode:
@@ -1127,9 +1128,6 @@ def forward(self, x, attention_mask, layer_past=None):
 
             # output = x + mlp(ln2(x))
             layernorm_output = self.post_attention_layernorm(attention_output)
-            moe_loss = torch.tensor(
-                0.0, device=layernorm_output.device, dtype=layernorm_output.dtype
-            )
             mlp_bias = torch.tensor(
                 0.0, device=layernorm_output.device, dtype=layernorm_output.dtype
             )
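For context on why the one-line move suffices: after this change, moe_loss is assigned unconditionally at the top of forward, so the gpt_j_residual branch (which previously never set it) can safely use or return it, and the old branch-local zero initialization becomes redundant. Below is a minimal, hypothetical sketch of that pattern; forward_sketch and its placeholder computation are illustrative only and do not reproduce the real transformer.py layer.

import torch


def forward_sketch(x, gpt_j_residual: bool):
    # Hypothetical stand-in for ParallelTransformerLayer.forward.
    # Default auxiliary loss, defined before any branching so every
    # code path that later returns it sees a value (the pattern this
    # commit applies).
    moe_loss = torch.tensor(0.0, device=x.device, dtype=x.dtype)

    if gpt_j_residual:
        # gpt_j_residual path: before the fix, moe_loss was only assigned
        # in the other branch, so using it here could hit an undefined name.
        output = x  # placeholder for the attention/MLP computation
    else:
        output = x  # placeholder; an MoE MLP would also overwrite moe_loss

    return output, moe_loss


# Usage: both paths now yield a defined (zero) auxiliary loss.
x = torch.randn(2, 4, 8)
_, loss = forward_sketch(x, gpt_j_residual=True)
print(loss)  # tensor(0.)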
