Addressed Jared's and Deepak's comments
mohammad committed Jan 5, 2021
1 parent 512337f commit db88a27
Showing 3 changed files with 14 additions and 25 deletions.
3 changes: 3 additions & 0 deletions megatron/optimizer/clip_grads.py
@@ -83,6 +83,9 @@ def clip_grad_norm_fp32(parameters, max_norm, norm_type=2):
     else:
         if norm_type == 2.0:
             dummy_overflow_buf = torch.cuda.IntTensor([0])
+            # Use apex's multi-tensor applier for efficiency reasons.
+            # Multi-tensor applier takes a function and a list of lists
+            # and performs the operation on that list all in one kernel.
             grad_norm, _ = multi_tensor_applier(
                 amp_C.multi_tensor_l2norm,
                 dummy_overflow_buf,
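As background for the new comment, here is a minimal, hedged sketch of how apex's multi-tensor applier is typically invoked to compute a fused L2 norm. The tensor list below is illustrative and assumes apex (with its amp_C extension) and a CUDA device are available:

import torch
import amp_C
from apex.multi_tensor_apply import multi_tensor_applier

# Illustrative gradient tensors living on the GPU.
grads_for_norm = [torch.randn(1024, device='cuda') for _ in range(8)]

# Flag buffer the fused kernel can use to signal a problem; stays 0 here.
dummy_overflow_buf = torch.cuda.IntTensor([0])

# One kernel launch computes the combined L2 norm of every tensor in the
# list, rather than launching one reduction kernel per tensor.
grad_norm, _ = multi_tensor_applier(
    amp_C.multi_tensor_l2norm,
    dummy_overflow_buf,
    [grads_for_norm],
    False)  # False: skip the per-tensor norms, return only the total

print(grad_norm.item())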
22 changes: 6 additions & 16 deletions megatron/optimizer/optimizer.py
@@ -78,6 +78,7 @@ def zero_grad(self, set_to_none=True):

     @abstractmethod
     def get_loss_scale(self):
+        """The output should be a cuda tensor of size 1."""
         pass

     def scale_loss(self, loss):
@@ -90,6 +91,11 @@ def step(self):

     @abstractmethod
     def reload_model_params(self):
+        """Refreshes any internal state from the current model parameters.
+        Call whenever the parameters are changed outside of the optimizer.
+        For example, when we load a model from a checkpoint without loading
+        the optimizer, the model parameters are updated, but for an fp16
+        optimizer with main parameters, the main parameters must be updated too."""
         pass

     @abstractmethod
@@ -289,54 +295,38 @@ def step(self):

         timers = get_timers()

-        # ==================================================
         # Copy gradients from model params to main params.
-        # ==================================================
         timers('optimizer-copy-to-main-grad').start()
         self._copy_model_grads_to_main_grads()
         timers('optimizer-copy-to-main-grad').stop()

-        # ==============================
         # Unscale and check for inf/nan.
-        # ==============================
         timers('optimizer-unscale-and-check-inf').start()
         found_inf_flag = self._unscale_main_grads_and_check_for_nan()
         timers('optimizer-unscale-and-check-inf').stop()

-        # ==================================
         # We are done with scaling gradients
         # so we can update the loss scale.
-        # ==================================
         self.grad_scaler.update(found_inf_flag)

-        # =====================================
         # If we found inf/nan, skip the update.
-        # =====================================
         if found_inf_flag:
             return False

-        # ==========================
         # Clip the main gradients.
-        # ==========================
         timers('optimizer-clip-main-grad').start()
         self.clip_grad_norm(self.clip_grad)
         timers('optimizer-clip-main-grad').stop()

-        # ===================
         # Step the optimizer.
-        # ===================
         self.optimizer.step()

-        # =================================
         # Update params from main params.
-        # =================================
         timers('optimizer-copy-main-to-model-params').start()
         self._copy_main_params_to_model_params()
         timers('optimizer-copy-main-to-model-params').stop()

-        # ==================
         # Successful update.
-        # ==================
         return True

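To make the documented contract concrete, here is a hypothetical sketch of an fp16 optimizer honoring get_loss_scale (a size-1 cuda tensor) and reload_model_params. The class and attribute names are illustrative, not Megatron's actual implementation:

import torch

class ToyFloat16Optimizer:
    """Illustrative sketch only: keeps fp32 'main' copies of fp16 params."""

    def __init__(self, model_params, initial_scale=2.0 ** 16):
        self.model_params = list(model_params)           # fp16, on GPU
        self.main_params = [p.detach().clone().float()   # fp32 working copies
                            for p in self.model_params]
        # Per the abstract interface, the scale is a cuda tensor of size 1.
        self._scale = torch.cuda.FloatTensor([initial_scale])

    def get_loss_scale(self):
        return self._scale

    def reload_model_params(self):
        # The model params changed outside the optimizer (e.g. a checkpoint
        # load that skipped optimizer state), so refresh the fp32 main
        # copies from the current fp16 model params.
        for main, model in zip(self.main_params, self.model_params):
            main.data.copy_(model.data)

The step() method in the diff above follows the same interface style: it returns False when an inf/nan is found so the caller can count the iteration as skipped, and True after a successful update.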
14 changes: 5 additions & 9 deletions megatron/training.py
@@ -703,10 +703,9 @@ def add_to_logging(name):
             writer.add_scalar(key, loss_dict[key], iteration)
             writer.add_scalar(key + ' vs samples', loss_dict[key],
                               args.consumed_train_samples)
-        if args.fp16:
-            writer.add_scalar('loss-scale', loss_scale, iteration)
-            writer.add_scalar('loss-scale vs samples', loss_scale,
-                              args.consumed_train_samples)
+        writer.add_scalar('loss-scale', loss_scale, iteration)
+        writer.add_scalar('loss-scale vs samples', loss_scale,
+                          args.consumed_train_samples)
         timers.write(timers_to_log, writer, iteration,
                      normalizer=total_iterations)

@@ -732,8 +731,7 @@ def add_to_logging(name):
             if avg > 0.0:
                 log_string += ' {}: {:.6E} |'.format(key, avg)
                 total_loss_dict[key] = torch.cuda.FloatTensor([0.0])
-        if args.fp16:
-            log_string += ' loss scale: {:.1f} |'.format(loss_scale)
+        log_string += ' loss scale: {:.1f} |'.format(loss_scale)
         log_string += ' number of skipped iterations: {:3d} |'.format(
             total_loss_dict[skipped_iters_key])
         log_string += ' number of nan iterations: {:3d} |'.format(
@@ -797,9 +795,7 @@ def train(forward_step_func, model, optimizer, lr_scheduler,
                        get_num_microbatches()

         # Logging.
-        loss_scale = None
-        if args.fp16:
-            loss_scale = optimizer.get_loss_scale().item()
+        loss_scale = optimizer.get_loss_scale().item()
         report_memory_flag = training_log(loss_dict, total_loss_dict,
                                           optimizer.param_groups[0]['lr'],
                                           iteration, loss_scale,
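The training.py simplification assumes every optimizer, fp16 or not, can now report a loss scale; for fp32 this is presumably a constant 1.0. A hedged sketch of that assumption (ToyFP32Optimizer is illustrative, not the real class):

import torch

class ToyFP32Optimizer:
    """Illustrative fp32 counterpart: no scaling is applied, so the loss
    scale is a constant 1.0, still exposed as a size-1 cuda tensor."""

    def __init__(self):
        self._scale = torch.cuda.FloatTensor([1.0])

    def get_loss_scale(self):
        return self._scale

# With this contract the logging paths no longer need `if args.fp16:`;
# optimizer.get_loss_scale().item() is valid for every optimizer type.
loss_scale = ToyFP32Optimizer().get_loss_scale().item()
print(loss_scale)  # 1.0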
