
Commit: Formatting update
Stella Biderman authored and committed Mar 22, 2022
1 parent 14a21a8 · commit 8392fdd
Showing 3 changed files with 9 additions and 7 deletions.
megatron/checkpointing.py (8 changes: 5 additions & 3 deletions)
@@ -57,13 +57,15 @@ def do_forward_pass(neox_args, model, inference=False):
     # get context tokens
     # always forward full batch size
     context_tokens_tensor = (
-        torch.arange(neox_args.seq_length+1).repeat((neox_args.train_micro_batch_size_per_gpu, 1)).cuda()
+        torch.arange(neox_args.seq_length + 1)
+        .repeat((neox_args.train_micro_batch_size_per_gpu, 1))
+        .cuda()
     )

     # forward
     if inference:
         tokens, attention_mask, position_ids = get_batch(
-            neox_args, context_tokens_tensor[:, :neox_args.seq_length]
+            neox_args, context_tokens_tensor[:, : neox_args.seq_length]
         )
         model_inputs = (
             tokens,
@@ -77,7 +79,7 @@ def do_forward_pass(neox_args, model, inference=False):
         _, logits = model.eval_batch(data_iter=data_iterator, return_logits=True)
     else:
         tokens, attention_mask, position_ids = get_batch(
-            neox_args, context_tokens_tensor[:, :neox_args.seq_length]
+            neox_args, context_tokens_tensor[:, : neox_args.seq_length]
         )
         logits = model((tokens, position_ids, attention_mask))
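For readers skimming the first hunk: the new multi-line chain builds the same tensor as the old one-liner, a (micro_batch, seq_length + 1) grid of ascending token ids that do_forward_pass then slices down to seq_length before handing it to get_batch. A minimal sketch of that shape arithmetic, with placeholder sizes standing in for the neox_args values and .cuda() left out so it runs without a GPU:

import torch

seq_length = 8    # stand-in for neox_args.seq_length
micro_batch = 2   # stand-in for neox_args.train_micro_batch_size_per_gpu

# Same construction as the reformatted chain, minus .cuda():
# one row of ids 0..seq_length, repeated once per sample in the micro batch.
context_tokens_tensor = (
    torch.arange(seq_length + 1)
    .repeat((micro_batch, 1))
)

print(context_tokens_tensor.shape)                   # torch.Size([2, 9])
print(context_tokens_tensor[:, :seq_length].shape)   # torch.Size([2, 8]), what get_batch receives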
megatron/fused_kernels/scaled_masked_softmax.h (4 changes: 2 additions & 2 deletions)
@@ -324,8 +324,8 @@ __global__ void scaled_masked_softmax_warp_backward(output_t* gradInput,
             output_t out[ELEMENTS_PER_LDG_STG];
 #pragma unroll
             for (int element = 0; element < ELEMENTS_PER_LDG_STG; ++element) {
-                out[element] = (output_t)(
-                    scale * (grad_reg[i][it + element] - output_reg[i][it + element] * sum[i]));
+                out[element] = (output_t)(scale * (grad_reg[i][it + element] -
+                                                   output_reg[i][it + element] * sum[i]));
             }
             copy_vector<output_t, ELEMENTS_PER_LDG_STG>(
                 gradInput + i * element_count + it * WARP_SIZE, out);
megatron/fused_kernels/scaled_upper_triang_masked_softmax.h (4 changes: 2 additions & 2 deletions)
@@ -350,8 +350,8 @@ __global__ void scaled_upper_triang_masked_softmax_warp_backward(output_t* gradInput,
             output_t out[ELEMENTS_PER_LDG_STG];
 #pragma unroll
             for (int element = 0; element < ELEMENTS_PER_LDG_STG; ++element) {
-                out[element] = (output_t)(
-                    scale * (grad_reg[i][it + element] - output_reg[i][it + element] * sum[i]));
+                out[element] = (output_t)(scale * (grad_reg[i][it + element] -
+                                                   output_reg[i][it + element] * sum[i]));
             }
             copy_vector<output_t, ELEMENTS_PER_LDG_STG>(
                 gradInput + i * element_count * stride + it * WARP_SIZE, out);
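Both kernel hunks rewrap the same arithmetic without changing it; the upper-triangular variant only adds a stride when indexing gradInput for the causal layout. Assuming, as in the upstream NVIDIA Apex kernels these headers derive from, that grad_reg already holds grad * output and sum[i] is its row sum at this point, out = scale * (grad_reg - output_reg * sum) is the backward pass of y = softmax(scale * x). A hedged PyTorch check of that identity against autograd, with made-up shapes and the mask omitted:

import torch

torch.manual_seed(0)
scale = 2.0
x = torch.randn(4, 16, requires_grad=True)
y = torch.softmax(scale * x, dim=-1)           # forward (mask omitted in this sketch)
grad = torch.randn_like(y)                     # incoming gradient dL/dy

grad_reg = grad * y                            # assumed contents of grad_reg in the kernel
row_sum = grad_reg.sum(dim=-1, keepdim=True)   # sum[i] in the kernel
out = scale * (grad_reg - y * row_sum)         # the expression both hunks rewrap

y.backward(grad)                               # reference dL/dx from autograd
print(torch.allclose(out, x.grad, atol=1e-5))  # True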
