diff --git a/megatron/training.py b/megatron/training.py
index abea4b726..aa7025c25 100644
--- a/megatron/training.py
+++ b/megatron/training.py
@@ -538,9 +538,18 @@ def get_optimizer(model, neox_args):
             **neox_args.optimizer["params"],
         )
     elif neox_args.optimizer_type.lower() == "lion":
-        from .optimizers import Lion
+        # Use DeepSpeed's FusedLion when ZeRO is enabled; Megatron's Lion raises a DeepSpeed error under ZeRO
+        if neox_args.zero_optimization["stage"] != 0:
+            from deepspeed.ops.lion import FusedLion
 
-        optimizer = Lion(
+            lion_optimizer = FusedLion
+        # ZeRO disabled: fall back to Megatron's Lion implementation
+        else:
+            from .optimizers import Lion
+
+            lion_optimizer = Lion
+
+        optimizer = lion_optimizer(
             param_groups,
             weight_decay=neox_args.weight_decay,
             **neox_args.optimizer["params"],
diff --git a/requirements/requirements.txt b/requirements/requirements.txt
index 6247ece3d..538cc21f9 100644
--- a/requirements/requirements.txt
+++ b/requirements/requirements.txt
@@ -1,5 +1,5 @@
 best_download
-git+https://github.com/EleutherAI/DeeperSpeed.git@b9260436e7da3e297fc6bedfd27d9e69fbba6f5c#egg=deepspeed
+git+https://github.com/EleutherAI/DeeperSpeed.git@02e2ebf7dee6aaab3d89094ed470a4609763c742#egg=deepspeed
 ftfy>=6.0.1
 git+https://github.com/EleutherAI/lm_dataformat.git@4eec05349977071bf67fc072290b95e31c8dd836
 huggingface_hub>=0.11.0
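
For context, the sketch below shows the kind of config that would exercise the new branch: selecting the Lion optimizer while any non-zero ZeRO stage is set routes `get_optimizer()` to `deepspeed.ops.lion.FusedLion`, while stage 0 keeps Megatron's `.optimizers.Lion`. The key names are taken from the `neox_args` accesses in the diff; the `lr`/`betas` values are illustrative only and not part of the patch.

```python
# Hypothetical config fragment, written as the equivalent Python dict.
# Only "optimizer.type" and "zero_optimization.stage" matter for the new
# dispatch; the param values here are placeholders, not recommendations.
neox_config = {
    "optimizer": {
        "type": "lion",  # routes get_optimizer() into the new Lion branch
        "params": {"lr": 1e-4, "betas": [0.9, 0.95]},
    },
    # Any non-zero ZeRO stage selects deepspeed.ops.lion.FusedLion;
    # stage 0 falls back to Megatron's .optimizers.Lion.
    "zero_optimization": {"stage": 1},
}
```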