parameterize more test cases

EleutherAI · May 13, 2021 · d581e57 · d581e57
1 parent 7a4d1b3
commit d581e57
Show file tree

Hide file tree

Showing 5 changed files with 74 additions and 150 deletions.
diff --git a/tests/common.py b/tests/common.py
@@ -38,7 +38,7 @@ def get_configs_with_path(configs):
  return [str(get_config_directory() / cfg) for cfg in configs]
 
 
-def get_test_configs_with_path(configs):
+def get_test_configs_with_path(configs: list):
  test_config_dir = Path(__file__).parent / "test_configs"
  return [str((test_config_dir / cfg).absolute()) for cfg in configs]
 
@@ -227,19 +227,25 @@ def parametrize(params_to_test: dict, max_tests: int = 50, seed: int = None):
  keys, values = zip(*params_to_test.items())
  for p in bounded_product(values, n=max_tests, seed=seed):
  experiment = dict(zip(keys, p))
+ to_pop = []
+ to_add = {}
  for k, v in experiment.items():
  if "," in k:
  keys_split = [i.strip() for i in k.split(',')]
- values_separated = experiment.pop(k)
+ values_separated = experiment[k]
+ to_pop.append(k)
  assert len(values_separated) == len(keys_split)
  new_dict = dict(zip(keys_split, values_separated))
- experiment.update(new_dict)
+ to_add.update(new_dict)
+ experiment.update(to_add)
+ for k in to_pop:
+ experiment.pop(k)
  base = deepcopy(BASE_CONFIG)
  base.update(experiment)
  yield base
 
 
 binary = [True, False]
 
-with open(get_test_configs_with_path("test_train_base.yml")[0], 'r') as f:
+with open(get_test_configs_with_path(["test_train_base.yml"])[0], 'r') as f:
  BASE_CONFIG = load(f, Loader=Loader)
diff --git a/tests/model/test_model_checkpoint.py b/tests/model/test_model_checkpoint.py
@@ -4,125 +4,40 @@
 This tests contain a relatively large number of functions. They are not split into separate tests because a lot of boilerplate (e.g. instantiate model) needs
 to run in order to perform follow up tests. Joining in one test reduces runtime at the expense of decreased transparency of test results in case of failures.
 """
-
-import os
-from pathlib import Path
-
-from ..common import TEST_CHECKPOINT_DIR, TEST_LOG_DIR, TEST_TENSORBOARD_DIR
-from ..common import distributed_test, get_root_directory, get_test_configs_with_path, clear_test_dirs
-
+import pytest
+from torch._C import Value
+from ..common import distributed_test, clear_test_dirs, model_setup, binary, parametrize
 import torch
 
-@distributed_test(world_size=1)
-def test_model_checkpoint_small_0():
- yaml_list = get_test_configs_with_path(["test_local_setup.yml", "test_small_0.yml"])
- run_checkpoint_test(yaml_list=yaml_list, do_forward_pass=False, cpu=True)
+PARAMS_TO_TEST = {
+ "pipe_parallel_size,model_parallel_size": [[0, 1]],
+}
 
-@distributed_test(world_size=1)
-def test_model_checkpoint_small_1():
- yaml_list = get_test_configs_with_path(["test_local_setup.yml", "test_small_1.yml"])
- run_checkpoint_test(yaml_list=yaml_list, do_forward_pass=False, cpu=True)
 
-@distributed_test(world_size=2)
-def test_model_checkpoint_small_2():
- yaml_list = get_test_configs_with_path(["test_local_setup.yml", "test_small_2.yml"])
- run_checkpoint_test(yaml_list=yaml_list, do_forward_pass=False, cpu=False)
-
-@distributed_test(world_size=1)
-def test_model_checkpoint_small_3():
- yaml_list = get_test_configs_with_path(["test_local_setup.yml", "test_small_3.yml"])
- run_checkpoint_test(yaml_list=yaml_list, do_forward_pass=False, cpu=False)
-
-@distributed_test(world_size=2)
-def test_model_checkpoint_small_4():
- yaml_list = get_test_configs_with_path(["test_local_setup.yml", "test_small_4.yml"])
- run_checkpoint_test(yaml_list=yaml_list, do_forward_pass=False, cpu=False)
+@pytest.mark.parametrize("param_dict", list(parametrize(PARAMS_TO_TEST, max_tests=50, seed=None)))
+def test_train(param_dict):
+ @distributed_test(world_size=2)
+ def wrapper():
+ run_checkpoint_test(param_dict=param_dict)
+ wrapper()
 
 def run_checkpoint_test(yaml_list=None, param_dict=None, do_forward_pass=False, cpu=False):
- from megatron.neox_arguments import NeoXArgs
- from megatron import initialize_megatron
- from megatron.text_generation_utils import get_batch, forward_model
- from megatron.training import setup_model_and_optimizer
- from megatron.mpu import destroy_model_parallel
-
+
  from megatron.checkpointing import load_checkpoint
  from megatron.checkpointing import save_checkpoint
 
- destroy_model_parallel() # mpu model parallel contains remaining global vars
-
- if torch.distributed.get_world_size() == 1 or torch.distributed.get_rank() == 0:
- clear_test_dirs()
-
-
- overwrite_values = {
- "user_script": str(get_root_directory() / "pretrain_gpt2.py"),
- "save": TEST_CHECKPOINT_DIR,
- "load": TEST_CHECKPOINT_DIR,
- "log_dir": TEST_LOG_DIR,
- "tensorboard_dir": TEST_TENSORBOARD_DIR,
- }
-
- # should not both be none
- assert yaml_list is not None or param_dict is not None
-
- # intitially load config from files as would be the case in deepy.py
- if yaml_list is not None:
- args_loaded = NeoXArgs.from_ymls(yaml_list, overwrite_values=overwrite_values)
- else:
- p_dict = param_dict.copy()
- p_dict.update(overwrite_values)
- args_loaded = NeoXArgs.from_dict(p_dict)
-
- args_loaded.build_tokenizer()
-
- initialize_megatron(neox_args=args_loaded)
-
- model, optimizer, lr_scheduler = setup_model_and_optimizer(neox_args=args_loaded, inference=True, get_key_value=True)
- model.eval()
-
+ model, optimizer, lr_scheduler, args_loaded = model_setup(yaml_list, param_dict)
+ model.iteration += 1
  # save model checkpoint
  save_checkpoint(neox_args=args_loaded, iteration=42, model=model, optimizer=optimizer, lr_scheduler=lr_scheduler)
 
-
- # forward
- if do_forward_pass:
- context_tokens_tensor = torch.cuda.LongTensor([[1,2,3,4,5],[1,2,3,4,5],[6,7,8,9,10],[1,2,3,4,100]])
- tokens, attention_mask, position_ids = get_batch(args_loaded, context_tokens_tensor)
- logits, layer_past = forward_model(args_loaded, model, (tokens, position_ids, attention_mask, torch.Tensor()))
-
-
- # assert logits are the right shape
- assert torch.is_tensor(logits), "run_checkpoint_test() forward output is tensor"
- assert logits.size(0) == context_tokens_tensor.size(0), "run_checkpoint_test() batch size correct"
- assert logits.size(1) == context_tokens_tensor.size(1), "run_checkpoint_test() context size correct"
-
- # assert correct behaviour
- assert torch.isclose(logits[0], logits[1]).all().item(), "run_checkpoint_test() forward independent of batch index"
- assert not torch.isclose(logits[1], logits[2]).all().item(), "run_checkpoint_test() forward produced different outputs for different inputs"
- assert torch.isclose(logits[1, 3], logits[3, 3]).all().item(), "run_checkpoint_test() forward masks right side tokens"
-
  # reload model from checkpoint
- if yaml_list is not None:
- args_reloaded = NeoXArgs.from_ymls(yaml_list, overwrite_values=overwrite_values)
- else:
- p_dict = param_dict.copy()
- p_dict.update(overwrite_values)
- args_reloaded = NeoXArgs.from_dict(p_dict)
-
- args_reloaded.build_tokenizer()
-
- reloaded_model, optimizer, lr_scheduler = setup_model_and_optimizer(neox_args=args_reloaded, inference=True, get_key_value=True)
- iteration = load_checkpoint(neox_args=args_reloaded, model=reloaded_model, optimizer=optimizer, lr_scheduler=lr_scheduler)
- reloaded_model.eval()
+ reloaded_model, reloaded_optimizer, reloaded_lr_scheduler, args_reloaded = model_setup(yaml_list, param_dict)
+ iteration = load_checkpoint(neox_args=args_reloaded, model=reloaded_model, optimizer=reloaded_optimizer, lr_scheduler=reloaded_lr_scheduler)
 
  #ensure same checkpoint is loaded
  assert iteration == 42, "run_checkpoint_test() iteration loaded from checkpoint correct"
 
- if do_forward_pass:
- #check re-loaded model returns the same results
- logits_reloaded, layer_past = forward_model(args_reloaded, model, (tokens, position_ids, attention_mask))
- assert torch.isclose(logits, logits_reloaded).all().item(), "run_checkpoint_test() forward output after reloading checkpoint unchanged"
-
  #check all weight groups are the same
  for idx, ((n1, p1), (n2, p2)) in enumerate(zip(list(model.module.named_parameters()), list(reloaded_model.module.named_parameters()))):
  assert n1 == n2

diff --git a/tests/model/test_model_instantiation.py b/tests/model/test_model_instantiation.py
@@ -9,19 +9,20 @@
 from ..common import distributed_test, model_setup, clear_test_dirs, parametrize, binary
 
 PARAMS_TO_TEST = {
- "pipe_parallel_size,model_parallel_size": [[0, 1], [1, 1], [2, 2], [0, 2]],
+ "pipe_parallel_size,model_parallel_size": [[0, 1], [1, 2], [0, 2]],
  "no_weight_tying": binary,
  "attention_config": [[[["global"], "all"]], [[["local"], "all"]], [[["sparse_variable"], "all"]],
  [[["sparse_fixed"], "all"]]],
- "scaled_upper_triang_masked_softmax_fusion": [binary, binary],
- "bias_gelu_fusion": binary,
+ "scaled_upper_triang_masked_softmax_fusion,bias_gelu_fusion": [[True, False], [False, True]],
 }
 
 
 @pytest.mark.parametrize("param_dict", list(parametrize(PARAMS_TO_TEST, max_tests=50, seed=None)))
-@distributed_test(world_size=2)
-def test_model_instantiation(param_dict):
- run_test_model_instantiation(param_dict=param_dict)
+def test_train(param_dict):
+ @distributed_test(world_size=2)
+ def wrapper():
+ run_test_model_instantiation(param_dict=param_dict)
+ wrapper()
 
 
 def run_test_model_instantiation(yaml_list=None, param_dict=None):

diff --git a/tests/model/test_model_train.py b/tests/model/test_model_train.py
@@ -14,7 +14,7 @@
  "norm,pos_emb,activation": [["layernorm", "learned", "gelu"], ["rmsnorm", "rotary", "gelu"],
  ["scalenorm", "sinusoidal", "geglu"], ["layernorm", "rpe", "geglu"],
  ["rmsnorm", "none", "geglu"]],
- "pipe_parallel_size,model_parallel_size": [[0, 1], [1, 1], [2, 2], [0, 2]],
+ "pipe_parallel_size,model_parallel_size": [[0, 1], [1, 2], [0, 2]],
  "no_weight_tying": binary,
  "attention_config": [[[["global"], "all"]], [[["local"], "all"]], [[["sparse_variable"], "all"]],
  [[["sparse_fixed"], "all"]]],
@@ -24,9 +24,11 @@
 
 
 @pytest.mark.parametrize("param_dict", list(parametrize(PARAMS_TO_TEST, max_tests=50, seed=None)))
-@distributed_test(world_size=2)
 def test_train(param_dict):
- run_train_test(param_dict=param_dict)
+ @distributed_test(world_size=2)
+ def wrapper():
+ run_train_test(param_dict=param_dict)
+ wrapper()
 
 
 def run_train_test(yaml_list=None, param_dict=None):

diff --git a/tests/test_configs/test_train_base.yml b/tests/test_configs/test_train_base.yml
@@ -1,23 +1,23 @@
-# GPT-2 pretraining setup
+# GPT_2 pretraining setup
 {
  # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages
  # across the node boundaries )
- "pipe-parallel-size": 0,
- "model-parallel-size": 1,
+ "pipe_parallel_size": 0,
+ "model_parallel_size": 1,
 
  # model settings
- "num-layers": 3,
- "hidden-size": 192,
- "num-attention-heads": 12,
- "seq-length": 1024,
- "max-position-embeddings": 1024,
+ "num_layers": 3,
+ "hidden_size": 192,
+ "num_attention_heads": 12,
+ "seq_length": 1024,
+ "max_position_embeddings": 1024,
  "norm": "layernorm",
- "pos-emb": "rotary",
- "no-weight-tying": true,
+ "pos_emb": "rotary",
+ "no_weight_tying": true,
 
  # these should provide some speedup but takes a while to build, set to true if desired
- "scaled-upper-triang-masked-softmax-fusion": false,
- "bias-gelu-fusion": false,
+ "scaled_upper_triang_masked_softmax_fusion": false,
+ "bias_gelu_fusion": false,
 
  # optimizer settings
  "optimizer": {
@@ -42,20 +42,20 @@
 
  # batch / data settings
  "train_micro_batch_size_per_gpu": 4,
- "data-impl": "mmap",
+ "data_impl": "mmap",
  "split": "949,50,1",
 
  # activation checkpointing
- "checkpoint-activations": true,
- "checkpoint-num-layers": 1,
- "partition-activations": true,
- "synchronize-each-layer": true,
+ "checkpoint_activations": true,
+ "checkpoint_num_layers": 1,
+ "partition_activations": true,
+ "synchronize_each_layer": true,
 
  # regularization
  "gradient_clipping": 1.0,
- "weight-decay": 0.0,
- "hidden-dropout": 0.0,
- "attention-dropout": 0.0,
+ "weight_decay": 0.0,
+ "hidden_dropout": 0.0,
+ "attention_dropout": 0.0,
 
  # precision settings
  "fp16": { 
@@ -67,23 +67,23 @@
  },
 
  # misc. training settings
- "train-iters": 320000,
- "lr-decay-iters": 320000,
- "distributed-backend": "nccl",
- "lr-decay-style": "cosine",
+ "train_iters": 320000,
+ "lr_decay_iters": 320000,
+ "distributed_backend": "nccl",
+ "lr_decay_style": "cosine",
  "warmup": 0.01,
- "save-interval": 10000,
- "eval-interval": 1000,
- "eval-iters": 10,
+ "save_interval": 10000,
+ "eval_interval": 1000,
+ "eval_iters": 10,
 
  # logging
- "log-interval": 100,
+ "log_interval": 100,
  "steps_per_print": 10,
- "keep-last-n-checkpoints": 4,
+ "keep_last_n_checkpoints": 4,
  "wall_clock_breakdown": true,
 
- # Suggested data paths when using GPT-NeoX locally
- "data-path": "data/enron/enron_text_document",
+ # Suggested data paths when using GPT_NeoX locally
+ "data_path": "data/enron/enron_text_document",
 
  # or for weighted datasets:
  # "train-data-paths": ["data/enron/enron_text_document", "data/enron/enron_text_document"],
@@ -93,11 +93,11 @@
  # "test-data-weights": [2., 1.],
  # "valid-data-weights": [0.5, 0.4],
 
- "vocab-file": "data/gpt2-vocab.json",
- "merge-file": "data/gpt2-merges.txt",
+ "vocab_file": "data/gpt2-vocab.json",
+ "merge_file": "data/gpt2-merges.txt",
  "save": "test_checkpoint",
  "load": "test_checkpoint",
- "tensorboard-dir": "test_tensorboard",
- "log-dir": "test_logs",
+ "tensorboard_dir": "test_tensorboard",
+ "log_dir": "test_logs",
 
 }