Deprecate cpu_offload zero_optimization Option (#720)
* Deprecate cpu_offload zero_optimization option

* Update NeoXArgs docs automatically

Co-authored-by: github-actions <[email protected]>
Quentin-Anthony and github-actions committed Nov 25, 2022
1 parent 46b7d82 commit ed09eba
Showing 19 changed files with 27 additions and 21 deletions.
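
Background for this diff: the boolean `"cpu_offload"` key inside `zero_optimization` is deprecated upstream in DeepSpeed, which superseded it with the structured `offload_optimizer` sub-config. None of the configs below ever enabled it (every removed line set it to `False`), so the commit drops the stale key and adds a pointer to the upstream option list instead. For anyone who does want CPU offload after this change, a minimal sketch in the style of these configs (the keys are DeepSpeed's newer offload options, not part of this commit; assumes a recent DeepSpeed and is typically used with ZeRO stage 2 or higher):

```yaml
# Sketch only, not part of this diff: optimizer CPU offload via the newer
# DeepSpeed "offload_optimizer" sub-config, which supersedes the removed
# boolean "cpu_offload" flag.
"zero_optimization": {
  "stage": 2,
  "offload_optimizer": {
    "device": "cpu",    # keep optimizer states in host memory
    "pin_memory": true  # assumption: pinned buffers for faster host<->device copies
  }
},
```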
configs/1-3B.yml (2 additions, 2 deletions)

@@ -35,7 +35,8 @@
 }
 },
 "min_lr": 0.00002,
-
+
+# for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training
 "zero_optimization": {
 "stage": 1,
 "allgather_partitions": True,
@@ -44,7 +45,6 @@
 "reduce_scatter": True,
 "reduce_bucket_size": 500000000,
 "contiguous_gradients": True,
-"cpu_offload": False
 },
 
 # batch / data settings
configs/125M.yml (1 addition, 1 deletion)

@@ -37,6 +37,7 @@
 },
 "min_lr": 0.00006,
 
+# for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training
 "zero_optimization": {
 "stage": 1,
 "allgather_partitions": True,
@@ -45,7 +46,6 @@
 "reduce_scatter": True,
 "reduce_bucket_size": 500000000,
 "contiguous_gradients": True,
-"cpu_offload": False
 },
 
 # batch / data settings
configs/13B.yml (2 additions, 1 deletion)

@@ -35,6 +35,8 @@
 "eps": 1.0e-8,
 }
 },
+
+# for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training
 "zero_optimization": {
 "stage": 1,
 "allgather_partitions": True,
@@ -43,7 +45,6 @@
 "reduce_scatter": True,
 "reduce_bucket_size": 500000000,
 "contiguous_gradients": True,
-"cpu_offload": False
 },
 "min_lr": 0.00001,
 
configs/175B.yml (1 addition, 1 deletion)

@@ -35,6 +35,7 @@
 }
 },
 "min_lr": 0.000006,
+# for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training
 "zero_optimization": {
 "stage": 1,
 "allgather_partitions": True,
@@ -43,7 +44,6 @@
 "reduce_scatter": True,
 "reduce_bucket_size": 500000000,
 "contiguous_gradients": True,
-"cpu_offload": False
 },
 
 # batch / data settings
configs/19M.yml (1 addition, 1 deletion)

@@ -30,6 +30,7 @@
 },
 "min_lr": 0.0001,
 
+# for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training
 "zero_optimization": {
 "stage": 1,
 "allgather_partitions": True,
@@ -38,7 +39,6 @@
 "reduce_scatter": True,
 "reduce_bucket_size": 500000000,
 "contiguous_gradients": True,
-"cpu_offload": False
 },
 
 "train_micro_batch_size_per_gpu": 4, #32,
configs/2-7B.yml (2 additions, 1 deletion)

@@ -35,6 +35,8 @@
 }
 },
 "min_lr": 0.000016,
+
+# for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training
 "zero_optimization": {
 "stage": 1,
 "allgather_partitions": True,
@@ -43,7 +45,6 @@
 "reduce_scatter": True,
 "reduce_bucket_size": 500000000,
 "contiguous_gradients": True,
-"cpu_offload": False
 },
 
 # batch / data settings
configs/20B.yml (2 additions, 1 deletion)

@@ -46,6 +46,8 @@
 },
 
 "min_lr": 0.97e-5,
+
+# for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training
 "zero_optimization": {
 "stage": 1,
 "allgather_partitions": True,
@@ -54,7 +56,6 @@
 "reduce_scatter": True,
 "reduce_bucket_size": 1260000000,
 "contiguous_gradients": True,
-"cpu_offload": False
 },
 
 # batch / data settings (assuming 96 GPUs)
configs/350M.yml (2 additions, 1 deletion)

@@ -35,6 +35,8 @@
 }
 },
 "min_lr": 0.00003,
+
+# for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training
 "zero_optimization": {
 "stage": 1,
 "allgather_partitions": True,
@@ -43,7 +45,6 @@
 "reduce_scatter": True,
 "reduce_bucket_size": 500000000,
 "contiguous_gradients": True,
-"cpu_offload": False
 },
 # batch / data settings
 "train_micro_batch_size_per_gpu": 4,
configs/49M.yml (1 addition, 1 deletion)

@@ -34,6 +34,7 @@
 },
 "min_lr": 0.00008,
 
+# for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training
 "zero_optimization": {
 "stage": 1,
 "allgather_partitions": True,
@@ -42,7 +43,6 @@
 "reduce_scatter": True,
 "reduce_bucket_size": 500000000,
 "contiguous_gradients": True,
-"cpu_offload": False
 },
 
 # batch / data settings
configs/6-7B.yml (2 additions, 1 deletion)

@@ -34,6 +34,8 @@
 "eps": 1.0e-8,
 }
 },
+
+# for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training
 "zero_optimization": {
 "stage": 1,
 "allgather_partitions": True,
@@ -42,7 +44,6 @@
 "reduce_scatter": True,
 "reduce_bucket_size": 500000000,
 "contiguous_gradients": True,
-"cpu_offload": False
 },
 "min_lr": 0.000012,
 
configs/760M.yml (2 additions, 1 deletion)

@@ -35,6 +35,8 @@
 }
 },
 "min_lr": 0.000025,
+
+# for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training
 "zero_optimization": {
 "stage": 1,
 "allgather_partitions": True,
@@ -43,7 +45,6 @@
 "reduce_scatter": True,
 "reduce_bucket_size": 500000000,
 "contiguous_gradients": True,
-"cpu_offload": False
 },
 
 # batch / data settings
configs/800M.yml (1 addition, 1 deletion)

@@ -30,6 +30,7 @@
 },
 "min_lr": 0.000025,
 
+# for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training
 "zero_optimization": {
 "stage": 1,
 "allgather_partitions": True,
@@ -38,7 +39,6 @@
 "reduce_scatter": True,
 "reduce_bucket_size": 500000000,
 "contiguous_gradients": True,
-"cpu_offload": False
 },
 
 "train_micro_batch_size_per_gpu": 16,
configs/README.md (2 additions, 2 deletions)

@@ -38,6 +38,7 @@ For a detailed list of all the arguments available for neox, see [neox_arguments
 "betas": [0.9, 0.95]
 }
 },
+# for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training
 "zero_optimization": {
 "stage": 0,
 "allgather_partitions": True,
@@ -46,7 +47,6 @@ For a detailed list of all the arguments available for neox, see [neox_arguments
 "reduce_scatter": True,
 "reduce_bucket_size": 500000000,
 "contiguous_gradients": True,
-"cpu_offload": False
 },
 
 # batch / data settings
@@ -165,6 +165,7 @@ Available optimizer types are:
 ### ZeRO Optimization:
 
 ```yaml
+# for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training
 "zero_optimization": {
 "stage": 0,
 "allgather_partitions": True,
@@ -173,7 +174,6 @@ Available optimizer types are:
 "reduce_scatter": True,
 "reduce_bucket_size": 500000000,
 "contiguous_gradients": True,
-"cpu_offload": False
 },
 "zero_allow_untested_optimizer": false,
 
configs/small_bf16.yml → configs/bf16_125M.yml (renamed; 1 addition, 1 deletion)

@@ -29,6 +29,7 @@
 "eps": 1.0e-8,
 }
 },
+# for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training
 "zero_optimization": {
 "stage": 0,
 "allgather_partitions": True,
@@ -37,7 +38,6 @@
 "reduce_scatter": True,
 "reduce_bucket_size": 500000000,
 "contiguous_gradients": True,
-"cpu_offload": False
 },
 
 # batch / data settings
configs/bnb_small.yml → configs/bnb_125M.yml (renamed; 1 addition, 1 deletion)

@@ -30,6 +30,7 @@
 "eps": 1.0e-8,
 }
 },
+# for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training
 "zero_optimization": {
 "stage": 0,
 "allgather_partitions": True,
@@ -38,7 +39,6 @@
 "reduce_scatter": True,
 "reduce_bucket_size": 500000000,
 "contiguous_gradients": True,
-"cpu_offload": False
 },
 
 # batch / data settings
configs/neox_arguments.md (1 addition, 1 deletion)

@@ -111,7 +111,7 @@ Logging Arguments
 
 - **git_hash**: str
 
-    Default = 32deba2
+    Default = 62c2de8
 
     current git hash of repository
 
configs/slurm_small.yml → configs/slurm_125M.yml (renamed; 1 addition, 1 deletion)

@@ -19,6 +19,7 @@
 "eps": 1.0e-8
 }
 },
+# for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training
 "zero_optimization": {
 "stage": 0,
 "allgather_partitions": true,
@@ -27,7 +28,6 @@
 "reduce_scatter": true,
 "reduce_bucket_size": 500000000,
 "contiguous_gradients": true,
-"cpu_offload": false
 },
 "train_micro_batch_size_per_gpu": 4,
 "data-impl": "mmap",
megatron/neox_arguments/arguments.py (1 addition, 1 deletion)

@@ -36,6 +36,7 @@
 
 # ZERO defaults by deespeed
 # These values should not be changed unless defaults in deepspeed are changed
+# for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training
 ZERO_DEFAULTS = {
 "stage": 0,
 "allgather_partitions": True,
@@ -45,7 +46,6 @@
 "reduce_scatter": True,
 "reduce_bucket_size": int(5e8),
 "contiguous_gradients": False,
-"cpu_offload": False,
 }
 
 # NeoX optimizer defaults
tests/test_configs/test_train_base.yml (1 addition, 1 deletion)

@@ -29,6 +29,7 @@
 }
 },
 
+# for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training
 "zero_optimization": {
 "stage": 0,
 "allgather_partitions": True,
@@ -37,7 +38,6 @@
 "reduce_scatter": True,
 "reduce_bucket_size": 500000000,
 "contiguous_gradients": True,
-"cpu_offload": False
 },
 
 # batch / data settings
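
Taken together, the migration for any local config that still carries the old flag is a one-line deletion; the surrounding block is otherwise untouched. As a sketch showing only the keys visible in this diff (each hunk elides a few lines, which may hold further settings), a stage-1 block now reads:

```yaml
# Post-commit shape of the block: the comment line is new, the
# "cpu_offload" key is gone, and the remaining visible settings are unchanged.
# for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training
"zero_optimization": {
  "stage": 1,
  "allgather_partitions": True,
  "reduce_scatter": True,
  "reduce_bucket_size": 500000000,
  "contiguous_gradients": True,
},
```

If a stale `"cpu_offload"` entry is left in place, behavior depends on the argument validation in `megatron/neox_arguments/arguments.py`; since the key is no longer in `ZERO_DEFAULTS`, it is safest to assume unrecognized keys will be rejected and simply delete the line.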
