Skip to content

Commit

Permalink
Add jsrun launcher based on neox 2.0
Browse files Browse the repository at this point in the history
  • Loading branch information
Quentin-Anthony committed Mar 18, 2023
1 parent e2d8a24 commit 3782c7a
Show file tree
Hide file tree
Showing 3 changed files with 7 additions and 2 deletions.
2 changes: 1 addition & 1 deletion configs/neox_arguments.md
Original file line number Diff line number Diff line change
Expand Up @@ -1809,7 +1809,7 @@ Args for deepspeed runner (deepspeed.launcher.runner).



- **launcher**: typing.Literal['pdsh', 'openmpi', 'mvapich', 'slurm']
- **launcher**: typing.Literal['pdsh', 'openmpi', 'mvapich', 'slurm', 'jsrun']

Default = pdsh

Expand Down
2 changes: 1 addition & 1 deletion megatron/neox_arguments/deepspeed_args.py
Original file line number Diff line number Diff line change
Expand Up @@ -192,7 +192,7 @@ class NeoXArgsDeepspeedRunner(NeoXArgsTemplate):
IP address of node 0, will be inferred via 'hostname -I' if not specified.
"""

launcher: Literal["pdsh", "openmpi", "mvapich", "slurm"] = "pdsh"
launcher: Literal["pdsh", "openmpi", "mvapich", "slurm", "jsrun"] = "pdsh"
"""
Launcher backend for multi-node training. Options currently include PDSH, OpenMPI, MVAPICH, SLURM, and JSRUN.
"""
Expand Down
5 changes: 5 additions & 0 deletions megatron/neox_arguments/neox_args.py
Original file line number Diff line number Diff line change
Expand Up @@ -612,6 +612,11 @@ class NeoXArgsOther(NeoXArgsTemplate):
Run via SLURM. This will attempt to discover the necessary variables to initialize torch distributed from the SLURM environment.
"""

deepspeed_jsrun: bool = False
"""
Run via JSRUN. This will attempt to discover the necessary variables to initialize torch distributed from the IBM LSF environment.
"""

user_script: str = None
"""
user script to be run
Expand Down

0 comments on commit 3782c7a

Please sign in to comment.