Fix bf16 for zero > 0 and pipeline parallelism > 0 (#1032)
* Fix bugs so we can use bf16 with zero > 0

Signed-off-by: Dashiell Stander <[email protected]>

* Typo

Signed-off-by: Dashiell Stander <[email protected]>

* Typo

Signed-off-by: Dashiell Stander <[email protected]>

* With the DeepSpeed updates, there may be no need to do grad_accum in fp32

Signed-off-by: Dashiell Stander <[email protected]>

* Add warning about necessity of fp32 grad_accum with bf16, pp>0, and zero1

Signed-off-by: Dashiell Stander <[email protected]>

* Update NeoXArgs docs automatically

* Update NeoXArgs docs automatically

---------

Signed-off-by: Dashiell Stander <[email protected]>
Co-authored-by: github-actions <[email protected]>
dashstander and github-actions committed Sep 18, 2023
1 parent c883e8c commit d9166bf
Showing 3 changed files with 18 additions and 1 deletion.
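
For context (not part of this commit): a minimal config sketch, written as a Python dict, of the setup this fix targets. The key names — precision, zero_optimization, pipe_parallel_size, data_types — are assumed from the NeoX arguments and should be checked against configs/neox_arguments.md; keeping gradient accumulation in fp32 is what the new warning below recommends.

# Illustrative only: bf16 + ZeRO stage > 0 + pipeline parallelism,
# with gradient accumulation kept in fp32 until the DeepSpeed issue
# referenced in the diff below is resolved.
neox_config = {
    "precision": "bfloat16",                     # routes through the bf16 branch below
    "zero_optimization": {"stage": 1},           # zero > 0
    "pipe_parallel_size": 2,                     # pipeline parallelism (key name assumed)
    "data_types": {"grad_accum_dtype": "fp32"},  # avoids the new warning
}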
2 changes: 1 addition & 1 deletion configs/neox_arguments.md
@@ -111,7 +111,7 @@ Logging Arguments

- **git_hash**: str

- Default = 06a2534
+ Default = 2c15e12

current git hash of repository

13 changes: 13 additions & 0 deletions megatron/neox_arguments/arguments.py
@@ -930,12 +930,25 @@ def calculate_derived(self):
self.update_value("fp16", fp16_args)
elif self.precision == "bfloat16":
bf_config = {"bf16": {"enabled": True}}
# dt_config = {"grad_accum_dtype": "fp32"}
if self.deepspeed_extra_args is None:
self.update_value("deepspeed_extra_args", bf_config)
else:
extra_args = copy.deepcopy(self.deepspeed_extra_args)
extra_args.update(bf_config)
self.update_value("deepspeed_extra_args", extra_args)

zero_stage = self.zero_optimization["stage"]
if self.data_types is None:
fp32_grad_accum = False
else:
fp32_grad_accum = self.data_types.get("grad_accum_dtype") == "fp32"
if (zero_stage > 0) and (pp_size > 0) and not fp32_grad_accum:
# Remove this code when this issue is resolved
# https://github.com/microsoft/DeepSpeed/issues/1835
logging.warn(
"Outstanding DeepSpeed issue means that pp>0, zero1, and bf16 will break without fp32 grads"
)
else:
self.update_value("precision", "fp32")

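A standalone sketch of the merge performed above (illustrative, not repository code; user_extra_args stands in for a pre-existing deepspeed_extra_args value): the bf16 block is layered onto the user's dict with deepcopy + update, so user keys survive and the bf16 section is added or overwritten.

import copy

bf_config = {"bf16": {"enabled": True}}
user_extra_args = {"comms_logger": {"enabled": True}}  # hypothetical user-supplied deepspeed_extra_args

extra_args = copy.deepcopy(user_extra_args)  # leave the user's dict untouched
extra_args.update(bf_config)                 # add/overwrite the bf16 section
assert extra_args == {
    "comms_logger": {"enabled": True},
    "bf16": {"enabled": True},
}
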
4 changes: 4 additions & 0 deletions megatron/utils.py
@@ -33,6 +33,7 @@
import torch

from deepspeed.launcher.runner import fetch_hostfile, parse_inclusion_exclusion
from deepspeed.runtime.bf16_optimizer import BF16_Optimizer

from megatron import print_rank_0
from megatron import mpu
@@ -349,8 +350,11 @@ def __init__(self, optimizer, n=50):
    self.optimizer = optimizer
    self.n = n
    self.history = deque(maxlen=n)
    self.bf16 = isinstance(optimizer, BF16_Optimizer)

def check(self, skipped):
    if self.bf16:
        return
    self.history.append(skipped)
    if (
        self.optimizer.overflow
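Why the early return above: with a loss-scaling fp16 optimizer the monitor tracks skipped steps and the optimizer's overflow flag, but DeepSpeed's BF16_Optimizer performs no loss scaling and exposes no such flag, so there is nothing to track. A self-contained sketch of that logic (the stand-in classes are hypothetical, not repository code):

from collections import deque

class FakeFP16Optimizer:
    # stand-in exposing the attribute the monitor reads on the fp16 path
    overflow = True

class FakeBF16Optimizer:
    # stand-in for deepspeed.runtime.bf16_optimizer.BF16_Optimizer:
    # no .overflow attribute, because bf16 training does no loss scaling
    pass

def check(optimizer, history, skipped, n=50):
    if "BF16" in type(optimizer).__name__:  # the real patch uses isinstance()
        return  # nothing to monitor for bf16
    history.append(skipped)
    if optimizer.overflow and len(history) == n and all(history):
        raise Exception(f"Skipped {n} iterations in a row due to overflow")

history = deque(maxlen=50)
check(FakeBF16Optimizer(), history, skipped=True)  # silently returns
check(FakeFP16Optimizer(), history, skipped=True)  # records the skipped step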
