Add DeepSpeed bf16 configuration (EleutherAI#787)
* add bf16 configuration

Signed-off-by: Dashiell Stander <[email protected]>

* Update NeoXArgs docs automatically

* pre commit

Signed-off-by: Dashiell Stander <[email protected]>

* Update NeoXArgs docs automatically

* Rework deriving precision

Signed-off-by: Dashiell Stander <[email protected]>

* Update NeoXArgs docs automatically

* Belt and suspenders

Signed-off-by: Dashiell Stander <[email protected]>

* Update NeoXArgs docs automatically

* Make the default setup (of only using fp16 dict) work

Signed-off-by: Dashiell Stander <[email protected]>

* Update NeoXArgs docs automatically

* Got rid of bf16 argument

Signed-off-by: Dashiell Stander <[email protected]>

* Update NeoXArgs docs automatically

* Update NeoXArgs docs automatically

* Update NeoXArgs docs automatically

* Re-add detailed bf16 message

* Update NeoXArgs docs automatically

* Remove unused import

* Update NeoXArgs docs automatically

* remove useless newline

* Update NeoXArgs docs automatically

* re-add detailed bf16 message to deepspeed_args

* Update NeoXArgs docs automatically

---------

Signed-off-by: Dashiell Stander <[email protected]>
Co-authored-by: github-actions <[email protected]>
Co-authored-by: Quentin Anthony <[email protected]>
3 people authored May 16, 2023
1 parent 9a18727 commit 056e9ca
Showing 3 changed files with 36 additions and 24 deletions.
20 changes: 9 additions & 11 deletions configs/neox_arguments.md
@@ -111,7 +111,7 @@ Logging Arguments

- **git_hash**: str

-    Default = b192e18
+    Default = 6e60b1e

current git hash of repository

@@ -1680,6 +1680,14 @@ Args for deepspeed config



+- **bf16**: dict
+
+    Default = None
+
+    Configuration for using bfloat16 floating-point format as an alternative to FP16. BFLOAT16 requires hardware support (e.g., NVIDIA A100). Dictionary options as described in Deepspeed documentation: https://www.deepspeed.ai/docs/config-json/#bfloat16-training-options
+
+
+
- **amp**: dict

Default = None
@@ -1766,16 +1774,6 @@ Args for deepspeed config



-- **bf16**: dict
-
-    Default = None
-
-    Configuration for using bfloat16 floating-point format as an alternative to FP16. BFLOAT16 requires hardware support (e.g., NVIDIA A100).
-
-    Dictionary options as described in Deepspeed documentation: https://www.deepspeed.ai/docs/config-json/#bfloat16-training-options
-
-
-
- **autotuning**: dict

Default = None
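For context, a minimal sketch of what the new `bf16` dictionary documented above carries through to the generated DeepSpeed config. Only the `enabled` key is assumed here, following the DeepSpeed bfloat16 options page linked in the entry; any other keys supported there would pass through the same way.

```python
# Sketch only: the DeepSpeed config section that bfloat16 training ultimately needs,
# per https://www.deepspeed.ai/docs/config-json/#bfloat16-training-options.
# Only the "enabled" flag is assumed; other options from that page pass through unchanged.
deepspeed_bf16_section = {
    "bf16": {
        "enabled": True,  # turn on bfloat16 mixed precision (requires hardware support, e.g. NVIDIA A100)
    }
}
```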
26 changes: 22 additions & 4 deletions megatron/neox_arguments/arguments.py
@@ -914,10 +914,28 @@ def calculate_derived(self):
)

# derive precision
-        if (self.fp16 or {}).get("type", self.precision) == "bfloat16":
-            self.update_value("precision", "bfloat16")
-        elif (self.fp16 or {}).get("enabled", False):
-            self.update_value("precision", "fp16")
+        fp16_conflict = "DeepSpeed fp16 field was set but precision conflicts"
+        if self.fp16 and self.fp16.get("enabled", False):
+            if self.precision is None:
+                self.update_value("precision", "fp16")
+            else:
+                assert self.precision == "fp16", fp16_conflict
+
+        if self.precision == "fp16":
+            if isinstance(self.fp16, dict) and len(self.fp16) > 0:
+                fp16_args = copy.deepcopy(self.fp16)
+                fp16_args["enabled"] = True
+            else:
+                fp16_args = {"type": "fp16", "enabled": True}
+            self.update_value("fp16", fp16_args)
+        elif self.precision == "bfloat16":
+            bf_config = {"bf16": {"enabled": True}}
+            if self.deepspeed_extra_args is None:
+                self.update_value("deepspeed_extra_args", bf_config)
+            else:
+                extra_args = copy.deepcopy(self.deepspeed_extra_args)
+                extra_args.update(bf_config)
+                self.update_value("deepspeed_extra_args", extra_args)
else:
self.update_value("precision", "fp32")

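To make the reworked control flow above easier to follow, here is a simplified, standalone paraphrase of the new precision-derivation branch. The function name `derive_precision` and its tuple return value are illustrative only, not the NeoXArgs API; the real code mutates the args object via `update_value`.

```python
import copy


def derive_precision(precision, fp16, deepspeed_extra_args):
    """Illustrative paraphrase of the reworked logic above (not the NeoX API)."""
    # An enabled fp16 dict implies fp16 precision and must not contradict an
    # explicitly requested precision.
    if fp16 and fp16.get("enabled", False):
        if precision is None:
            precision = "fp16"
        else:
            assert precision == "fp16", "DeepSpeed fp16 field was set but precision conflicts"

    if precision == "fp16":
        # Reuse a user-supplied fp16 dict when present, otherwise fall back to defaults.
        if isinstance(fp16, dict) and len(fp16) > 0:
            fp16 = {**copy.deepcopy(fp16), "enabled": True}
        else:
            fp16 = {"type": "fp16", "enabled": True}
    elif precision == "bfloat16":
        # bf16 is handed straight to DeepSpeed via the extra-args dict rather
        # than through a dedicated NeoX argument.
        deepspeed_extra_args = {**(deepspeed_extra_args or {}), "bf16": {"enabled": True}}
    else:
        precision = "fp32"
    return precision, fp16, deepspeed_extra_args


# Requesting bfloat16 injects a bf16 section into deepspeed_extra_args:
print(derive_precision("bfloat16", None, None))
# -> ('bfloat16', None, {'bf16': {'enabled': True}})
```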
14 changes: 5 additions & 9 deletions megatron/neox_arguments/deepspeed_args.py
@@ -98,6 +98,11 @@ class NeoXArgsDeepspeedConfig(NeoXArgsTemplate):
Dictionary options as described in Deepspeed documentation: https://www.deepspeed.ai/docs/config-json/#fp16-training-options
"""

+    bf16: dict = None
+    """
+    Configuration for using bfloat16 floating-point format as an alternative to FP16. BFLOAT16 requires hardware support (e.g., NVIDIA A100). Dictionary options as described in Deepspeed documentation: https://www.deepspeed.ai/docs/config-json/#bfloat16-training-options
+    """
+
# ---Automatic Mixed Precision (AMP) Training Options---

amp: dict = None
@@ -162,15 +167,6 @@ class NeoXArgsDeepspeedConfig(NeoXArgsTemplate):
During gradient averaging, perform communication with selected data type. By default it will be determined by selected regime
"""

-    # ---BFLOAT16 Training Options---
-
-    bf16: dict = None
-    """
-    Configuration for using bfloat16 floating-point format as an alternative to FP16. BFLOAT16 requires hardware support (e.g., NVIDIA A100).
-    Dictionary options as described in Deepspeed documentation: https://www.deepspeed.ai/docs/config-json/#bfloat16-training-options
-    """
-
# ---Autotuning Options---
autotuning: dict = None
"""
