Slurm Fix and formatting (#729)
* Changes `slurm_comment` to `comment`. A `comment` command-line argument has been added to DeeperSpeed; this is necessary for the Stability cluster

Signed-off-by: Dashiell Stander <[email protected]>

* Pull in master_addr argument from the environment

Signed-off-by: Dashiell Stander <[email protected]>

* Add test script and remove comment from config

Signed-off-by: Dashiell Stander <[email protected]>

* Fix configs

Signed-off-by: Dashiell Stander <[email protected]>

* Pre-commit changes

* Small typos

Signed-off-by: Dashiell Stander <[email protected]>

* Update NeoXArgs docs automatically

* Remove test script

Signed-off-by: Dashiell Stander <[email protected]>

* Update NeoXArgs docs automatically

* Update NeoXArgs docs automatically

Signed-off-by: Dashiell Stander <[email protected]>
Co-authored-by: github-actions <[email protected]>
Co-authored-by: Quentin Anthony <[email protected]>
3 people committed Dec 6, 2022
1 parent 0accac6 · commit ebd47f6
Showing 26 changed files with 179 additions and 139 deletions.
4 changes: 2 additions & 2 deletions README.md
@@ -66,7 +66,7 @@ The Pythia Scaling Suite is a suite of models ranging from 19M parameters to 13B

## Polyglot

-The Polyglot Project is an effort to train powerful non-English pretrained language models to promote the accessibility of this technology to researchers outside the dominant powerhouses of machine learning. EleutherAI has trained and released 1.3B, 3.8B, and 5.8B parameter Korean language models, the largest of which outpreforms all other publicly avaliable language models on Korean language tasks. Further details about the project and links to the models can be found [here](https://github.com/EleutherAI/polyglot).
+The Polyglot Project is an effort to train powerful non-English pretrained language models to promote the accessibility of this technology to researchers outside the dominant powerhouses of machine learning. EleutherAI has trained and released 1.3B, 3.8B, and 5.8B parameter Korean language models, the largest of which outpreforms all other publicly available language models on Korean language tasks. Further details about the project and links to the models can be found [here](https://github.com/EleutherAI/polyglot).

## Fill-in-the-Middle

@@ -303,7 +303,7 @@ where `--eval_tasks` is a list of evaluation tasks followed by spaces, e.g `--ev

# Exporting to HuggingFace

-GPT-NeoX is optimized heavily for training only, and GPT-NeoX model checkpoints are not compatible out of the box with other deep learning libraries. To make models easily loadable and sharable with end users, and for further exporting to various other frameworks, GPT-NeoX supports checkpoint conversion to the [HuggingFace Transformers](https://arxiv.org/abs/1910.03771) GPTNeoXModel format.
+GPT-NeoX is optimized heavily for training only, and GPT-NeoX model checkpoints are not compatible out of the box with other deep learning libraries. To make models easily loadable and shareable with end users, and for further exporting to various other frameworks, GPT-NeoX supports checkpoint conversion to the [HuggingFace Transformers](https://arxiv.org/abs/1910.03771) GPTNeoXModel format.

To convert a NeoX checkpoint to Huggingface-loadable format, run:
```bash
# (conversion command not shown in this excerpt)
```
4 changes: 2 additions & 2 deletions configs/1-3B.yml
@@ -16,11 +16,11 @@
"no-weight-tying": true,
"gpt_j_residual": false,
"output_layer_parallelism": "column",

# these should provide some speedup but takes a while to build, set to true if desired
"scaled-upper-triang-masked-softmax-fusion": false,
"bias-gelu-fusion": false,

# init methods
"init_method": "small_init",
"output_layer_init_method": "wang_init",
6 changes: 3 additions & 3 deletions configs/125M.yml
@@ -16,11 +16,11 @@
"no-weight-tying": true,
"gpt_j_residual": false,
"output_layer_parallelism": "column",

# these should provide some speedup but takes a while to build, set to true if desired
"scaled-upper-triang-masked-softmax-fusion": false,
"bias-gelu-fusion": false,

# init methods
"init_method": "small_init",
"output_layer_init_method": "wang_init",
@@ -36,7 +36,7 @@
}
},
"min_lr": 0.00006,

# for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training
"zero_optimization": {
"stage": 1,
6 changes: 3 additions & 3 deletions configs/13B.yml
@@ -16,11 +16,11 @@
"no-weight-tying": true,
"gpt_j_residual": false,
"output_layer_parallelism": "column",

# these should provide some speedup but takes a while to build, set to true if desired
"scaled-upper-triang-masked-softmax-fusion": false,
"bias-gelu-fusion": false,

# init methods
"init_method": "small_init",
"output_layer_init_method": "wang_init",
@@ -47,7 +47,7 @@
"contiguous_gradients": True,
},
"min_lr": 0.00001,

# batch / data settings
"train_micro_batch_size_per_gpu": 4,
"data-impl": "mmap",
4 changes: 2 additions & 2 deletions configs/175B.yml
@@ -16,11 +16,11 @@
"no-weight-tying": true,
"gpt_j_residual": false,
"output_layer_parallelism": "column",

# these should provide some speedup but takes a while to build, set to true if desired
"scaled-upper-triang-masked-softmax-fusion": false,
"bias-gelu-fusion": false,

# init methods
"init_method": "small_init",
"output_layer_init_method": "wang_init",
6 changes: 3 additions & 3 deletions configs/2-7B.yml
@@ -16,15 +16,15 @@
"no-weight-tying": true,
"gpt_j_residual": false,
"output_layer_parallelism": "column",

# these should provide some speedup but takes a while to build, set to true if desired
"scaled-upper-triang-masked-softmax-fusion": false,
"bias-gelu-fusion": false,

# init methods
"init_method": "small_init",
"output_layer_init_method": "wang_init",

# optimizer settings
"optimizer": {
"type": "Adam",
4 changes: 2 additions & 2 deletions configs/350M.yml
@@ -16,11 +16,11 @@
"no-weight-tying": true,
"gpt_j_residual": false,
"output_layer_parallelism": "column",

# these should provide some speedup but takes a while to build, set to true if desired
"scaled-upper-triang-masked-softmax-fusion": false,
"bias-gelu-fusion": false,

# init methods
"init_method": "small_init",
"output_layer_init_method": "wang_init",
6 changes: 3 additions & 3 deletions configs/6-7B.yml
@@ -16,11 +16,11 @@
"no-weight-tying": true,
"gpt_j_residual": false,
"output_layer_parallelism": "column",

# these should provide some speedup but takes a while to build, set to true if desired
"scaled-upper-triang-masked-softmax-fusion": false,
"bias-gelu-fusion": false,

# init methods
"init_method": "small_init",
"output_layer_init_method": "wang_init",
@@ -46,7 +46,7 @@
"contiguous_gradients": True,
},
"min_lr": 0.000012,

# batch / data settings
"train_micro_batch_size_per_gpu": 4,
"data-impl": "mmap",
4 changes: 2 additions & 2 deletions configs/760M.yml
@@ -16,11 +16,11 @@
"no-weight-tying": true,
"gpt_j_residual": false,
"output_layer_parallelism": "column",

# these should provide some speedup but takes a while to build, set to true if desired
"scaled-upper-triang-masked-softmax-fusion": false,
"bias-gelu-fusion": false,

# init methods
"init_method": "small_init",
"output_layer_init_method": "wang_init",
4 changes: 2 additions & 2 deletions configs/cpu_mock_config.yml
@@ -1,5 +1,5 @@
-# CPU unit tests should be independent of the presence of GPUs on the test server
+# CPU unit tests should be independent of the presence of GPUs on the test server
# host. This configuration mocks these GPU resources and other dependencies.
{
"global_num_gpus": 1
-}
+}
6 changes: 3 additions & 3 deletions configs/eleutherai_cluster.yml
@@ -4,16 +4,16 @@
"train-data-paths": ["/mnt/ssd-1/data/enron/enron_train_text_document"],
"valid-data-paths": ["/mnt/ssd-1/data/enron/enron_val_text_document"],
"test-data-paths": ["/mnt/ssd-1/data/enron/enron_test_text_document"],

# if using multiple datasets, provide weights for them to be sampled with
# "train-data-weights": [1., 2.],
# "test-data-weights": [2., 1.],
# "valid-data-weights": [0.5, 0.4],


# If you would like the code to create val and test datasets from your training set use the following instead
# "split" determines the relative size of train, val, and test

# "split" 995,4,1
# "data_path": "/mnt/ssd-1/data/enron/enron_train_text_document",

6 changes: 3 additions & 3 deletions configs/neox_arguments.md
@@ -111,7 +111,7 @@ Logging Arguments

- **git_hash**: str

-Default = bcb277e
+Default = e617cdf

current git hash of repository

@@ -1667,9 +1667,9 @@ Args for deepspeed runner (deepspeed.launcher.runner).



-- **slurm_comment**: str
+- **comment**: str

Default = None

-If using SLURM launcher adds a `--comment` to the srun command that launches the job. Sometimes necessary for cluster rules, or so I've heard.
+Adds a `--comment` to the DeepSpeed launch command. In DeeperSpeed this is passed on to the SlurmLauncher as well. Sometime necessary for cluster rules, or so I've heard.
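For readers wiring this up, here is a minimal, self-contained sketch of how a config value such as `comment` could become a launcher flag. The real logic lives in `NeoXArgs.convert_key_value_to_command_line_arg` (see `megatron/neox_arguments/arguments.py` below); the flag-emission rules shown here are an assumption, not a copy of that method.

```python
# Hypothetical sketch: turn a NeoX config key/value into launcher CLI args.
# The real implementation is NeoXArgs.convert_key_value_to_command_line_arg;
# the exact emission rules below are assumptions for illustration.
def to_cli_args(key: str, value) -> list:
    if value is None:
        return []  # unset options emit nothing
    if isinstance(value, bool):
        return [f"--{key}"] if value else []  # booleans become bare flags
    return [f"--{key}", str(value)]

# With "comment": "neox" in the YAML config:
print(to_cli_args("comment", "neox"))  # ['--comment', 'neox']
```

With `"launcher": "slurm"` set, DeeperSpeed would then carry `--comment neox` through to the `srun` command, per the description above.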

5 changes: 2 additions & 3 deletions configs/slurm_125M.yml
@@ -19,15 +19,14 @@
"eps": 1.0e-8
}
},
-# for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training
"zero_optimization": {
"stage": 0,
"allgather_partitions": true,
"allgather_bucket_size": 500000000,
"overlap_comm": true,
"reduce_scatter": true,
"reduce_bucket_size": 500000000,
"contiguous_gradients": true,
"contiguous_gradients": true
},
"train_micro_batch_size_per_gpu": 4,
"data-impl": "mmap",
@@ -61,5 +60,5 @@
"wall_clock_breakdown": true,
"launcher": "slurm",
"deepspeed_slurm": true,
"slurm_comment": "neox"
"comment": "neox"
}
1 change: 0 additions & 1 deletion configs/slurm_local.yml
@@ -3,7 +3,6 @@
"vocab-file": "data/gpt2-vocab.json",
"merge-file": "data/gpt2-merges.txt",
"save": "checkpoints",
"load": "checkpoints",
"checkpoint_validation_with_forward_pass": false,
"tensorboard-dir": "tensorboard",
"log-dir": "logs",
10 changes: 8 additions & 2 deletions eval_tasks/eval_adapter.py
@@ -390,12 +390,14 @@ def run_eval(
# Returns a list containing all values of the task registry that
# match at least one of the patterns
import fnmatch

def pattern_match(patterns, source_list):
    task_names = set()
    for pattern in patterns:
        for matching in fnmatch.filter(source_list, pattern):
            task_names.add(matching)
    return list(task_names)

eval_tasks = pattern_match(eval_tasks, tasks.ALL_TASKS)
print(f"Found tasks: {eval_tasks}")

@@ -453,7 +455,11 @@ def run_eval_harness(
bootstrap_iters=2,
):
print_rank_0("Running evaluation harness...")
-adapter = EvalHarnessAdapter(model, forward_step_fn, neox_args, batch_size=batch_size)
+adapter = EvalHarnessAdapter(
+    model, forward_step_fn, neox_args, batch_size=batch_size
+)
return adapter.run_eval(
-eval_tasks=eval_tasks, num_fewshot=num_fewshot, bootstrap_iters=bootstrap_iters,
+eval_tasks=eval_tasks,
+num_fewshot=num_fewshot,
+bootstrap_iters=bootstrap_iters,
)
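The forwarding pattern in this hunk can be exercised standalone. The class below is a stand-in for the real `EvalHarnessAdapter`, kept only so the sketch runs; its constructor mirrors the call shown above:

```python
# Runnable sketch of the pattern above: run_eval_harness builds an adapter
# and passes the evaluation options straight through to its run_eval method.
# EvalHarnessAdapter here is a stand-in, not the real class.
class EvalHarnessAdapter:
    def __init__(self, model, forward_step_fn, neox_args, batch_size=None):
        self.batch_size = batch_size

    def run_eval(self, eval_tasks=None, num_fewshot=0, bootstrap_iters=2):
        return {"tasks": eval_tasks, "num_fewshot": num_fewshot,
                "bootstrap_iters": bootstrap_iters}

def run_eval_harness(model, forward_step_fn, neox_args, batch_size=None,
                     eval_tasks=None, num_fewshot=0, bootstrap_iters=2):
    adapter = EvalHarnessAdapter(
        model, forward_step_fn, neox_args, batch_size=batch_size
    )
    return adapter.run_eval(
        eval_tasks=eval_tasks,
        num_fewshot=num_fewshot,
        bootstrap_iters=bootstrap_iters,
    )

print(run_eval_harness(None, None, None, eval_tasks=["lambada"]))
# -> {'tasks': ['lambada'], 'num_fewshot': 0, 'bootstrap_iters': 2}
```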
1 change: 0 additions & 1 deletion evaluate.py
@@ -60,7 +60,6 @@ def main():
neox_args.iteration,
use_wandb=neox_args.use_wandb,
)


pprint(results)
results_path = (
1 change: 1 addition & 0 deletions megatron/gradient_noise_scale/gradient_noise_scale.py
@@ -14,6 +14,7 @@

import torch


def ema(avg, beta, yi, i):
    """Exponential moving average"""
    if avg is None:
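The `ema` helper is truncated by this view; for reference, here is a self-contained sketch of a debiased exponential moving average in the standard formulation. The actual body of `ema` is not shown above, so treat this as an assumption rather than the repository's implementation:

```python
def ema_sketch(avg, beta, yi, i):
    """Debiased exponential moving average (standard formulation).
    The real ema() body is elided in the hunk above; this is illustrative."""
    if avg is None:
        avg = 0.0
    avg = beta * avg + (1.0 - beta) * yi      # update the running average
    debiased = avg / (1.0 - beta ** (i + 1))  # correct the startup bias
    return avg, debiased

# Usage: feed in one observation per step, keeping the raw average around.
avg = None
for i, y in enumerate([1.0, 2.0, 3.0]):
    avg, estimate = ema_sketch(avg, 0.9, y, i)
```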
8 changes: 8 additions & 0 deletions megatron/neox_arguments/arguments.py
@@ -405,13 +405,21 @@ def get_deepspeed_main_args(self):
args_list.extend(
    self.convert_key_value_to_command_line_arg(key, configured_value)
)

if "DLTS_HOSTFILE" in os.environ:
args_list.extend(
self.convert_key_value_to_command_line_arg(
"hostfile", os.environ["DLTS_HOSTFILE"]
)
)

if "MASTER_ADDR" in os.environ:
args_list.extend(
self.convert_key_value_to_command_line_arg(
"master_addr", os.environ["MASTER_ADDR"]
)
)

if (
    "--include" in args_list or "--exclude" in args_list
) and "--num_gpus" in args_list:
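The new `MASTER_ADDR` hunk mirrors the existing `DLTS_HOSTFILE` handling: selected environment variables (typically exported by the job scheduler) are forwarded to the DeepSpeed launcher as CLI flags. A minimal standalone sketch of that pattern, with the same assumed flag formatting as earlier:

```python
import os

def env_passthrough_args() -> list:
    """Forward selected environment variables to the launcher as CLI flags.
    Sketch of the pattern in get_deepspeed_main_args; flag formatting is assumed."""
    args = []
    pairs = [("DLTS_HOSTFILE", "hostfile"), ("MASTER_ADDR", "master_addr")]
    for env_var, flag in pairs:
        if env_var in os.environ:
            args.extend([f"--{flag}", os.environ[env_var]])
    return args

# e.g. with MASTER_ADDR=10.0.0.1 exported by the scheduler:
# env_passthrough_args() -> ['--master_addr', '10.0.0.1']
```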
4 changes: 2 additions & 2 deletions megatron/neox_arguments/deepspeed_args.py
@@ -186,7 +186,7 @@ class NeoXArgsDeepspeedRunner(NeoXArgsTemplate):
If true, autodetects nvlink pairs and remaps cuda visible devices to place them next to each other. This is an Eleuther addition to deepspeed, and should speed up model parallel training on setups with nvlink pairs when mp=2.
"""

-slurm_comment: str = None
+comment: str = None
"""
-If using SLURM launcher adds a `--comment` to the srun command that launches the job. Sometimes necessary for cluster rules, or so I've heard.
+Adds a `--comment` to the DeepSpeed launch command. In DeeperSpeed this is passed on to the SlurmLauncher as well. Sometime necessary for cluster rules, or so I've heard.
"""
2 changes: 1 addition & 1 deletion megatron/utils.py
@@ -414,7 +414,7 @@ def setup_for_inference_or_eval(
from megatron.neox_arguments import NeoXArgs
from megatron.initialize import initialize_megatron
from megatron.training import setup_model_and_optimizer

_overwrite_values = {
    "checkpoint_activations": False,
    "partition_activations": False,
2 changes: 1 addition & 1 deletion requirements/requirements.txt
@@ -2,6 +2,7 @@ git+https://github.com/EleutherAI/DeeperSpeed.git@eb7f5cff36678625d23db8a8fe78b4
einops==0.3.0
ftfy==6.0.1
git+https://github.com/EleutherAI/lm_dataformat.git@4eec05349977071bf67fc072290b95e31c8dd836
+huggingface_hub==0.11.0
lm_eval==0.2.0
mpi4py==3.0.3
numpy==1.22.0
@@ -11,5 +12,4 @@ sentencepiece
six
tokenizers==0.12.1
transformers~=4.24.0
-huggingface_hub==0.11.0
wandb==0.10.28
4 changes: 3 additions & 1 deletion tests/neox_args/test_neoxargs_commandline.py
@@ -149,7 +149,9 @@ def test_neoxargs_consume_neox_args():
from megatron.neox_arguments import NeoXArgs

# intitially load config from files as would be the case in deepy.py
-yaml_list = get_configs_with_path(["small.yml", "local_setup.yml", "cpu_mock_config.yml"])
+yaml_list = get_configs_with_path(
+    ["small.yml", "local_setup.yml", "cpu_mock_config.yml"]
+)
args_baseline = NeoXArgs.from_ymls(yaml_list)
args_baseline.update_value("user_script", str(get_root_directory() / "train.py"))
deepspeed_main_args = args_baseline.get_deepspeed_main_args()
4 changes: 3 additions & 1 deletion tests/neox_args/test_neoxargs_load.py
@@ -75,7 +75,9 @@ def test_neoxargs_load_arguments_small_local_setup_text_generation():
"""
verify small.yml can be loaded together with text generation without raising validation errors
"""
-run_neox_args_load_test(["small.yml", "local_setup.yml", "text_generation.yml", "cpu_mock_config.yml"])
+run_neox_args_load_test(
+    ["small.yml", "local_setup.yml", "text_generation.yml", "cpu_mock_config.yml"]
+)


@pytest.mark.cpu