Updates bf16 demo config and mixed precision documentation. #941

Merged

merged 5 commits on May 18, 2023
7 changes: 2 additions & 5 deletions configs/README.md
@@ -259,7 +259,7 @@ N.B - `OneBitAdam` requires you to use deepspeed's internal lr scheduler because
Checkpointing works by trading compute for memory. Rather than storing all intermediate activations of the entire computation graph for computing backward, the checkpointed part does not save intermediate activations, and instead recomputes them in backward pass.

### Mixed Precision Training Settings:
- gpt-neox's mixed precision training is configured identically to DeepSpeed's, please see [their documentation](https://www.deepspeed.ai/docs/config-json/#fp16-training-options) for more information.
+ gpt-neox's fp16 training is configured identically to DeepSpeed's, please see [their documentation](https://www.deepspeed.ai/docs/config-json/#fp16-training-options) for more information.
An example config for fp16 training:

```yaml
@@ -272,7 +272,7 @@ An example config for fp16 training:
},
```
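The body of that fp16 example is collapsed between the two hunks above. A representative sketch of the dict, reusing the field values from the fp16 block removed from `configs/bf16_125M.yml` further down in this PR (the README's actual example may differ), might look like:

```yaml
  "fp16": {
    "enabled": true,            # turn on fp16 mixed precision
    "loss_scale": 0,            # 0 selects dynamic loss scaling
    "loss_scale_window": 1000,
    "hysteresis": 2,
    "min_loss_scale": 1
  },
```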

- To train in fp32, simply set `fp16["enabled"]` to `false`.
+ Alternatively you can use the `precision` config which can be set to `fp16`, `bfloat16`, or `fp32`. If you set `"precision": "fp16"` without adding a `"fp16": {...}` dict, then it will simply use DeepSpeed's defaults for fp16 training.
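As a minimal sketch of the alternative described in the added line above, a config could set only the `precision` key and let DeepSpeed supply its fp16 defaults:

```yaml
  # no explicit "fp16": {...} dict; DeepSpeed's fp16 defaults are used
  "precision": "fp16",
```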


### SLURM Settings
@@ -312,6 +312,3 @@ To make this JSON just remove the comment and use all lowercase for the boolean:
"comm_backend_name": "nccl"
}
```


- ** TODO: bf16 docs **
10 changes: 1 addition & 9 deletions configs/bf16_125M.yml
@@ -57,15 +57,7 @@
"hidden_dropout": 0.0,
"attention_dropout": 0.0,

# precision settings
"fp16": {
"enabled": true,
"type": "bfloat16", # set bf16 as precision
"loss_scale": 0,
"loss_scale_window": 1000,
"hysteresis": 2,
"min_loss_scale": 1
},
"precision": "bfloat16",

"fp32_allreduce": True, # without a patch to torch, bf16 models have to do the allreduce in fp32
# misc. training settings
2 changes: 1 addition & 1 deletion configs/neox_arguments.md
@@ -111,7 +111,7 @@ Logging Arguments

- **git_hash**: str

- Default = b130d58
+ Default = 83e820c

current git hash of repository

6 changes: 3 additions & 3 deletions megatron/training.py
@@ -625,7 +625,7 @@ def setup_model_and_optimizer(neox_args, use_cache=False, iteration=None):
dist_init_required=False,
model_parameters=_model_params,
# Need to remove the below so that it doesn't conflict with --deepspeed_config required by autotuning
- #config_params=neox_args.deepspeed_config,
+ # config_params=neox_args.deepspeed_config,
mpu=mpu if not neox_args.is_pipe_parallel else None,
)
model.total_params = get_total_params(model.module)
@@ -792,8 +792,8 @@
)
iteration += 1
neox_args.iteration = iteration

- overflow_monitor.check(skipped_iter)  # check for repeated overflow
+ if neox_args.precision == "fp16":
+     overflow_monitor.check(skipped_iter)  # check for repeated overflow
if neox_args.log_gradient_noise_scale: # log noise scale if applicable
noise_scale_logger.update()
