Fix bf16 for zero > 0 and pipeline parallelism > 0 (#1032)
* Fix bugs so we can use bf16 with zero > 0

Signed-off-by: Dashiell Stander <[email protected]>

* Typo

Signed-off-by: Dashiell Stander <[email protected]>

* Typo

Signed-off-by: Dashiell Stander <[email protected]>

* With the DeepSpeed updates, there may be no need to do grad_accum in fp32

Signed-off-by: Dashiell Stander <[email protected]>

* Add warning about necessity of fp32 grad_accum with bf16, pp>0, and zero1

Signed-off-by: Dashiell Stander <[email protected]>

* Update NeoXArgs docs automatically

* Update NeoXArgs docs automatically

---------

Signed-off-by: Dashiell Stander <[email protected]>
Co-authored-by: github-actions <[email protected]>
dashstander and github-actions committed Sep 18, 2023
1 parent c883e8c commit d9166bf
Showing 3 changed files with 18 additions and 1 deletion.
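
For context (not part of this commit): a minimal config sketch, written as a Python dict, of the setup this fix targets. The key names — precision, zero_optimization, pipe_parallel_size, data_types — are assumed from the NeoX arguments and should be checked against configs/neox_arguments.md; keeping gradient accumulation in fp32 is what the new warning below recommends.

# Illustrative only: bf16 + ZeRO stage > 0 + pipeline parallelism,
# with gradient accumulation kept in fp32 until the DeepSpeed issue
# referenced in the diff below is resolved.
neox_config = {
    "precision": "bfloat16",                     # routes through the bf16 branch below
    "zero_optimization": {"stage": 1},           # zero > 0
    "pipe_parallel_size": 2,                     # pipeline parallelism (key name assumed)
    "data_types": {"grad_accum_dtype": "fp32"},  # avoids the new warning
}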
2 changes: 1 addition & 1 deletion configs/neox_arguments.md
@@ -111,7 +111,7 @@ Logging Arguments

- **git_hash**: str

- Default = 06a2534
+ Default = 2c15e12

current git hash of repository

13 changes: 13 additions & 0 deletions megatron/neox_arguments/arguments.py
@@ -930,12 +930,25 @@ def calculate_derived(self):
self.update_value("fp16", fp16_args)
elif self.precision == "bfloat16":
bf_config = {"bf16": {"enabled": True}}
# dt_config = {"grad_accum_dtype": "fp32"}
if self.deepspeed_extra_args is None:
self.update_value("deepspeed_extra_args", bf_config)
else:
extra_args = copy.deepcopy(self.deepspeed_extra_args)
extra_args.update(bf_config)
self.update_value("deepspeed_extra_args", extra_args)

zero_stage = self.zero_optimization["stage"]
if self.data_types is None:
fp32_grad_accum = False
else:
fp32_grad_accum = self.data_types.get("grad_accum_dtype") == "fp32"
if (zero_stage > 0) and (pp_size > 0) and not fp32_grad_accum:
# Remove this code when this issue is resolved
# https://github.com/microsoft/DeepSpeed/issues/1835
logging.warn(
"Outstanding DeepSpeed issue means that pp>0, zero1, and bf16 will break without fp32 grads"
)
else:
self.update_value("precision", "fp32")

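A standalone sketch of the merge performed above (illustrative, not repository code; user_extra_args stands in for a pre-existing deepspeed_extra_args value): the bf16 block is layered onto the user's dict with deepcopy + update, so user keys survive and the bf16 section is added or overwritten.

import copy

bf_config = {"bf16": {"enabled": True}}
user_extra_args = {"comms_logger": {"enabled": True}}  # hypothetical user-supplied deepspeed_extra_args

extra_args = copy.deepcopy(user_extra_args)  # leave the user's dict untouched
extra_args.update(bf_config)                 # add/overwrite the bf16 section
assert extra_args == {
    "comms_logger": {"enabled": True},
    "bf16": {"enabled": True},
}
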
4 changes: 4 additions & 0 deletions megatron/utils.py
@@ -33,6 +33,7 @@
import torch

from deepspeed.launcher.runner import fetch_hostfile, parse_inclusion_exclusion
from deepspeed.runtime.bf16_optimizer import BF16_Optimizer

from megatron import print_rank_0
from megatron import mpu
@@ -349,8 +350,11 @@ def __init__(self, optimizer, n=50):
    self.optimizer = optimizer
    self.n = n
    self.history = deque(maxlen=n)
    self.bf16 = isinstance(optimizer, BF16_Optimizer)

def check(self, skipped):
    if self.bf16:
        return
    self.history.append(skipped)
    if (
        self.optimizer.overflow
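Why the early return above: with a loss-scaling fp16 optimizer the monitor tracks skipped steps and the optimizer's overflow flag, but DeepSpeed's BF16_Optimizer performs no loss scaling and exposes no such flag, so there is nothing to track. A self-contained sketch of that logic (the stand-in classes are hypothetical, not repository code):

from collections import deque

class FakeFP16Optimizer:
    # stand-in exposing the attribute the monitor reads on the fp16 path
    overflow = True

class FakeBF16Optimizer:
    # stand-in for deepspeed.runtime.bf16_optimizer.BF16_Optimizer:
    # no .overflow attribute, because bf16 training does no loss scaling
    pass

def check(optimizer, history, skipped, n=50):
    if "BF16" in type(optimizer).__name__:  # the real patch uses isinstance()
        return  # nothing to monitor for bf16
    history.append(skipped)
    if optimizer.overflow and len(history) == n and all(history):
        raise Exception(f"Skipped {n} iterations in a row due to overflow")

history = deque(maxlen=50)
check(FakeBF16Optimizer(), history, skipped=True)  # silently returns
check(FakeFP16Optimizer(), history, skipped=True)  # records the skipped step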
