[ZeRO-3] Partitioned init with deepspeed.zero.Init() (#1190)
* added ds zero.Init() to get_model

* Clean up conditional with block

* pre-commit

---------

Co-authored-by: Quentin Anthony <[email protected]>
R0n12 and Quentin-Anthony committed Mar 19, 2024
1 parent 277141e commit 7267a74
Showing 1 changed file with 10 additions and 7 deletions.
17 changes: 10 additions & 7 deletions megatron/training.py
@@ -24,6 +24,7 @@
 
 import math
 import sys
+from contextlib import nullcontext
 
 import torch
 import deepspeed
@@ -426,13 +427,15 @@ def get_model(neox_args, use_cache=False):
     # If mup isn't being used anyways, this has no effect.
     old_use_mup = neox_args.use_mup
     neox_args.use_mup = False
-    model = GPT2ModelPipe(
-        neox_args=neox_args,
-        num_tokentypes=0,
-        parallel_output=True,
-        topology=mpu.get_topology(),
-        use_cache=use_cache,
-    )
+
+    with deepspeed.zero.Init() if neox_args.zero_stage == 3 else nullcontext() as gs:
+        model = GPT2ModelPipe(
+            neox_args=neox_args,
+            num_tokentypes=0,
+            parallel_output=True,
+            topology=mpu.get_topology(),
+            use_cache=use_cache,
+        )
 
     ### soft prompt tuning stuff ###
     if neox_args.soft_prompt_tuning is not None and neox_args.soft_prompt_tuning.get(

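For readers unfamiliar with the pattern, the following stand-alone sketch shows the general deepspeed.zero.Init() usage that this commit adopts for GPT2ModelPipe. It is illustrative only: the toy nn.Sequential model and the hard-coded zero_stage value are assumptions, not gpt-neox code, and running it presumes a DeepSpeed/torch.distributed launch.

# Minimal sketch (not part of the commit): partitioned model construction
# under ZeRO-3 using the same conditional context-manager pattern as above.
from contextlib import nullcontext

import deepspeed
import torch.nn as nn

zero_stage = 3  # illustrative; in gpt-neox this comes from neox_args.zero_stage

# Under ZeRO-3, building the model inside deepspeed.zero.Init() lets DeepSpeed
# allocate each parameter in partitioned form across data-parallel ranks as it
# is created, so the full model never has to be materialized on a single device.
# For other ZeRO stages, nullcontext() keeps the original eager construction.
with deepspeed.zero.Init() if zero_stage == 3 else nullcontext():
    model = nn.Sequential(  # placeholder module standing in for GPT2ModelPipe
        nn.Linear(4096, 16384),
        nn.GELU(),
        nn.Linear(16384, 4096),
    )

Note that the name gs bound in the commit's with statement is not used in the visible hunk; the context manager's role here is to intercept parameter allocation while the model is constructed.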