Increase test coverage (#289)
* requirements for test coverage

* cleanup tensorboard dir when testing

* simplify using subtests

* fix clear test dirs in subtests

* test update to try and run tests with a world size > 1

* fix test model instantiation for world size > 1

* neox args test with import in function

* test readme update

* test model checkpoint with forward option

* test model checkpoint in inference mode

* todo for config data_impl

* update test configs

* add docstrings to testcases

* test models with overwrite in neox_args

* update tests readme

* test config include sm3 optimizer

* test config adjustments

* add cpu and gpu testing in checkpoint test

* add test for train / backwards step

* test model train with right vocab size

* modified test configs

* test train with nan handling of losses

* test model train comment out config 2 (no error, no termination)

* text generation utils - create dir fix

* test model generation init

* changed model tests to allow for init from dict

* fix use recompute kwarg in generation instead of neox_args.recompute

* adjust tests for generation to new main branch

* test text generation with multiple configs

* test model generation with input file

* add config comparer and figure out what's causing the test error

* updated config comparer and config to meet new format

* fix / make loss dict naming consistent

* disable fp32 in testing

* fix error message for unknown activation

* add train_batch_size to known parameters in neox_args used testcase

* fix comment with new variable name

* add train_batch_size to known properties in neox_args usage testcase

* updated config comparer

* compare arg value in neox args load test

* mark testcases for cpu

* readme for tests on cpu

Co-authored-by: Samuel Weinbach <[email protected]>
Co-authored-by: kip <[email protected]>
3 people committed May 12, 2021
1 parent 079a307 commit bb8222f
Showing 32 changed files with 1,343 additions and 476 deletions.
2 changes: 1 addition & 1 deletion megatron/model/activations.py
@@ -44,7 +44,7 @@ def get_activation(neox_args):
    elif neox_args.activation == "mish":
        activation_func = mish
    else:
-        raise ValueError(f"Activation function {neox_args.activation_func} not recognized")
+        raise ValueError(f"Activation function {neox_args.activation} not recognized")
    return activation_func

2 changes: 1 addition & 1 deletion megatron/text_generation_utils.py
@@ -231,7 +231,7 @@ def stream_tokens(neox_args, model, context_tokens: List[List[int]], eos_token_i
    token_generation_end_index = torch.ones([batch_size]).long().cuda() * (-1)

    while token_index_to_generate <= last_token_index_to_generate:
-        if neox_args.recompute:
+        if recompute:
            # recompute is needed for sparse attention at the moment
            # because we can only forward multiples of the block size
            # TODO The full padded context_tokens would not need to be forwarded, adjust to multiples of block size

4 changes: 2 additions & 2 deletions megatron/training.py
@@ -416,7 +416,7 @@ def train_step_pipe(neox_args, timers, model, data_iterator):

    assert neox_args.deepspeed
    loss = model.train_batch(data_iter=data_iterator)
-    loss_dict = {'lm loss': loss}
+    loss_dict = {'lm_loss': loss}
    # Don't break Megatron's timers because we changed code paths.
    for t in ['forward', 'backward', 'allreduce', 'optimizer', 'batch generator', 'data loader']:
        timers(t).reset()
@@ -439,7 +439,7 @@ def train(neox_args, timers, model, optimizer, lr_scheduler,
    timers('interval time').start()
    report_memory_flag = True

-    # get noise scale logger (if args.log_noise_scale is True)
+    # get noise scale logger (if neox_args.log_gradient_noise_scale is True)
    noise_scale_logger = get_noise_scale_logger(neox_args)

    # to monitor if we've skipped many iterations in a row and trigger an early exit

2 changes: 2 additions & 0 deletions requirements/requirements-dev.txt
@@ -1,2 +1,4 @@
pytest==6.2.3
+pytest-cov==2.11.1
+pytest-forked==1.3.0
autopep8==1.5.6
22 changes: 0 additions & 22 deletions run_tests.py

This file was deleted.

38 changes: 38 additions & 0 deletions tests/Readme.md
@@ -0,0 +1,38 @@
# Dependencies

Tests use pytest with the pytest-cov and pytest-forked plugins. Install them with:

```bash
pip install -r requirements/requirements-dev.txt
```

# Run

Tests can be run using pytest.

* The argument --forked needs to be provided
* A coverage report can be created using the optional arguments --cov-report and --cov (see pytest documentation)
* A subset of tests can be selected by pointing to the module within tests

```bash
# run all tests, output coverage report of megatron module in terminal
pytest --forked --cov-report term --cov=megatron tests

# run tests in tests/model, output coverage report of megatron module as html
pytest --forked --cov-report html --cov=megatron tests/model

# run tests in tests/model/test_model_generation.py, don't output coverage report
pytest --forked tests/model/test_model_generation.py
```

Some tests can run on CPU only. These are marked with the decorator @pytest.mark.cpu.
The CPU test cases can be run with:
```bash
pytest tests -m cpu
```
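
For orientation only, a hypothetical CPU-marked test might look like the sketch below; the test name and assertion are illustrative assumptions, not code from this commit — only the @pytest.mark.cpu marker and the get_test_configs_with_path helper come from the test suite itself.

```python
import pytest

from tests.common import get_test_configs_with_path


@pytest.mark.cpu
def test_resolve_test_config_paths():
    # Pure path handling, so no GPU or torch.distributed setup is needed.
    paths = get_test_configs_with_path(["test_local_setup.yml", "test_small_0.yml"])
    assert len(paths) == 2
```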
If an HTML coverage report has been created, a simple HTTP server can be run to serve the static files.
```bash
python -m http.server --directory htmlcov 8000
```
6 changes: 0 additions & 6 deletions tests/__init__.py
@@ -1,6 +0,0 @@
"""
Testcases for GPT NeoX
"""

from .model import *
from .neox_args import *
119 changes: 113 additions & 6 deletions tests/common.py
@@ -1,13 +1,24 @@
"""
collection of reusable functions in the context of testing
"""

import os
import time
import shutil
import itertools
from pathlib import Path

import pytest

import torch
import torch.distributed as dist
from torch.multiprocessing import Process

import deepspeed

TEST_CHECKPOINT_DIR = "test_checkpoint"
TEST_LOG_DIR = "test_logs"
TEST_TENSORBOARD_DIR = "test_tensorboard"

# Worker timeout *after* the first worker has completed.
DEEPSPEED_UNIT_WORKER_TIMEOUT = 120


def get_root_directory():
    return Path(__file__).parents[1]
@@ -19,7 +30,7 @@ def get_configs_with_path(configs):
    return [str(get_config_directory() / cfg) for cfg in configs]

def get_test_configs_with_path(configs):
-    test_config_dir = Path(__file__).parent / "model" / "test_configs"
+    test_config_dir = Path(__file__).parent / "test_configs"
    return [str((test_config_dir / cfg).absolute()) for cfg in configs]

def clear_test_dirs():
@@ -29,4 +40,100 @@ def clear_test_dirs():

    checkpoint_dir = os.path.join(get_root_directory(), TEST_CHECKPOINT_DIR)
    if os.path.isdir(checkpoint_dir):
-        shutil.rmtree(checkpoint_dir)
+        shutil.rmtree(checkpoint_dir)

    tensorboard_dir = os.path.join(get_root_directory(), TEST_TENSORBOARD_DIR)
    if os.path.isdir(tensorboard_dir):
        shutil.rmtree(tensorboard_dir)

def distributed_test(world_size=2, backend='nccl'):
    """A decorator for executing a function (e.g., a unit test) in a distributed manner.
    This decorator manages the spawning and joining of processes, initialization of
    torch.distributed, and catching of errors.
    This function is copied from: https://github.com/EleutherAI/DeeperSpeed/blob/24026e5bb37c528a222b8635c46256b1e1825d2e/tests/unit/common.py#L16
    Usage example:
        @distributed_test(world_size=[2,3])
        def my_test():
            rank = dist.get_rank()
            world_size = dist.get_world_size()
            assert(rank < world_size)
    Arguments:
        world_size (int or list): number of ranks to spawn. Can be a list to spawn
        multiple tests.
    """
    def dist_wrap(run_func):
        """Second-level decorator for dist_test. This actually wraps the function. """
        def dist_init(local_rank, num_procs, *func_args, **func_kwargs):
            """Initialize torch.distributed and execute the user function. """
            os.environ['MASTER_ADDR'] = '127.0.0.1'
            os.environ['MASTER_PORT'] = '29503'
            os.environ['LOCAL_RANK'] = str(local_rank)
            # NOTE: unit tests don't support multi-node so local_rank == global rank
            os.environ['RANK'] = str(local_rank)
            os.environ['WORLD_SIZE'] = str(num_procs)

            deepspeed.init_distributed(dist_backend=backend)

            if torch.cuda.is_available():
                torch.cuda.set_device(local_rank)

            run_func(*func_args, **func_kwargs)

        def dist_launcher(num_procs, *func_args, **func_kwargs):
            """Launch processes and gracefully handle failures. """

            # Spawn all workers on subprocesses.
            processes = []
            for local_rank in range(num_procs):
                p = Process(target=dist_init,
                            args=(local_rank,
                                  num_procs,
                                  *func_args),
                            kwargs=func_kwargs)
                p.start()
                processes.append(p)

            # Now loop and wait for a test to complete. The spin-wait here isn't a big
            # deal because the number of processes will be O(#GPUs) << O(#CPUs).
            any_done = False
            while not any_done:
                for p in processes:
                    if not p.is_alive():
                        any_done = True
                        break

            # Wait for all other processes to complete
            for p in processes:
                p.join(DEEPSPEED_UNIT_WORKER_TIMEOUT)

            failed = [(rank, p) for rank, p in enumerate(processes) if p.exitcode != 0]
            for rank, p in failed:
                # If it still hasn't terminated, kill it because it hung.
                if p.exitcode is None:
                    p.terminate()
                    pytest.fail(f'Worker {rank} hung.', pytrace=False)
                if p.exitcode < 0:
                    pytest.fail(f'Worker {rank} killed by signal {-p.exitcode}',
                                pytrace=False)
                if p.exitcode > 0:
                    pytest.fail(f'Worker {rank} exited with code {p.exitcode}',
                                pytrace=False)

        def run_func_decorator(*func_args, **func_kwargs):
            """Entry point for @distributed_test(). """

            if isinstance(world_size, int):
                dist_launcher(world_size, *func_args, **func_kwargs)
            elif isinstance(world_size, list):
                for procs in world_size:
                    dist_launcher(procs, *func_args, **func_kwargs)
                    time.sleep(0.5)
            else:
                raise TypeError(f'world_size must be an integer or a list of integers.')

        return run_func_decorator

    return dist_wrap
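
As a usage note, the sketch below mirrors the docstring example above and shows how a test might apply this decorator; the test name and assertions are hypothetical, and a machine where the default nccl backend can initialize (i.e., with GPUs) is assumed.

```python
import torch.distributed as dist

from tests.common import distributed_test


@distributed_test(world_size=2)
def test_example_distributed():
    # Runs in each of the two spawned worker processes after
    # deepspeed.init_distributed() has set up torch.distributed.
    assert dist.get_world_size() == 2
    assert 0 <= dist.get_rank() < dist.get_world_size()
```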
72 changes: 72 additions & 0 deletions tests/config_comparison.py
@@ -0,0 +1,72 @@
# Should just be called using: "python tests/config_comparison.py"
# Testing code to see which changes among configs break tests

# Hack: add the package root directory to sys.path so this script can be run directly
import sys
from pathlib import Path
file = Path(__file__).resolve()
package_root_directory = file.parents[1]
sys.path.append(str(package_root_directory))

from itertools import combinations
from tests.model import run_test_model_instantiation, run_train_test, run_checkpoint_test
from tests.common import TEST_CHECKPOINT_DIR, TEST_LOG_DIR, TEST_TENSORBOARD_DIR
from tests.common import distributed_test, get_test_configs_with_path, get_root_directory, clear_test_dirs

# World size might need to be adjusted depending on test
@distributed_test(world_size=1)
def main(subsequence_length: int = 2):
    """Allows you to easily compare sets of combinations to find which changes are causing issues
    Args:
        subsequence_length (int, optional): the length of subsequences of elements from the input iterable. Defaults to 2.
    """

    # choose default params and updated ones
    base_yaml_list = get_test_configs_with_path(["test_local_setup.yml", "test_small_0.yml"])
    new_yaml_list = get_test_configs_with_path(["test_local_setup.yml", "test_small_3.yml"])

    # Need to import here as distributed
    from megatron.neox_arguments import NeoXArgs

    overwrite_values = {
        "user_script": str(get_root_directory() / "pretrain_gpt2.py"),
        "save": TEST_CHECKPOINT_DIR,
        "load": TEST_CHECKPOINT_DIR,
        "log_dir": TEST_LOG_DIR,
        "tensorboard_dir": TEST_TENSORBOARD_DIR,
    }
    base_args_loaded = NeoXArgs.from_ymls(base_yaml_list, overwrite_values=overwrite_values)
    new_args_loaded = NeoXArgs.from_ymls(new_yaml_list, overwrite_values=overwrite_values)

    # Find difference between configs
    diff = {}
    for key, value in base_args_loaded.all_config.items():
        if new_args_loaded.all_config[key] != value:
            diff[key] = new_args_loaded.all_config[key]
            print(f'key: {key} original: {value}, updated: {new_args_loaded.all_config[key]}')

    perms = list(combinations(diff.items(), subsequence_length))

    # Iterate over combinations and run the test function
    # and print information so you can debug from console as program is distributed
    for items in perms:
        param_dict = base_args_loaded.all_config
        print('running setup with:')
        for item in items:
            param_dict[item[0]] = item[1]
            print(f'key: {item[0]} original: {base_args_loaded.all_config[item[0]]}, updated: {item[1]}')

        # These are interchangeable
        run_train_test(param_dict=param_dict)
        # run_test_model_instantiation(param_dict=param_dict)
        # run_checkpoint_test(param_dict=param_dict)

        print('finished running setup with:')
        for item in items:
            print(f'key: {item[0]} original: {base_args_loaded.all_config[item[0]]}, updated: {item[1]}')

if __name__ == '__main__':
    main()
9 changes: 3 additions & 6 deletions tests/model/__init__.py
@@ -1,6 +1,3 @@
"""
Tests concerning the GPT2Model class
"""

from .test_model_checkpoint import TestModelCheckpoint
from .test_model_instantiation import TestModelInstantiation
from .test_model_instantiation import run_test_model_instantiation
from .test_model_train import run_train_test
from .test_model_checkpoint import run_checkpoint_test
15 changes: 0 additions & 15 deletions tests/model/test_configs/test_sparse.yml

This file was deleted.

(Diffs for the remaining changed files are not shown here.)
