Increase test coverage (#289)
* requirements for test coverage

* cleanup tensorboard dir when testing

* simplify using subtests

* fix clear test dirs in subtests

* test update to try and run tests with a world size > 1

* fix test model instantiation for world size > 1

* neox args test with import in function

* test readme update

* test model checkpoint with forward option

* test model checkpoint in inference mode

* todo for config data_impl

* update test configs

* add docstrings to testcases

* test models with overwrite in neox_args

* update tests readme

* test config include sm3 optimizer

* test config adjustments

* add cpu and gpu testing in checkpoint test

* add test for train / backwards step

* test model train with right vocab size

* modified test configs

* test train with nan handling of losses

* test model train comment out config 2 (no error, no termination)

* text generation utils - create dir fix

* test model generation init

* changed model tests to allow for init from dict

* fix use recompute kwarg in generation instead of neox_args.recompute

* adjust tests for generation to new main branch

* test text generation with multiple configs

* test model generation with input file

* add config comparer and figure out what's causing the test error

* updated config comparer and config to meet new format

* fix / make loss dict naming consistent

* disable fp32 in testing

* fix error message for unknown activation

* add train_batch_size to known parameters in neox_args used testcase

* fix comment with new variable name

* add train_batch_size to known properties in neox_args usage testcase

* updated config comparer

* compare arg value in neox args load test

* mark testcases for cpu

* readme for tests on cpu

Co-authored-by: Samuel Weinbach <[email protected]>
Co-authored-by: kip <[email protected]>
3 people committed May 12, 2021
1 parent 079a307 commit bb8222f
Showing 32 changed files with 1,343 additions and 476 deletions.
2 changes: 1 addition & 1 deletion megatron/model/activations.py
@@ -44,7 +44,7 @@ def get_activation(neox_args):
    elif neox_args.activation == "mish":
        activation_func = mish
    else:
-        raise ValueError(f"Activation function {neox_args.activation_func} not recognized")
+        raise ValueError(f"Activation function {neox_args.activation} not recognized")
    return activation_func

2 changes: 1 addition & 1 deletion megatron/text_generation_utils.py
@@ -231,7 +231,7 @@ def stream_tokens(neox_args, model, context_tokens: List[List[int]], eos_token_i
    token_generation_end_index = torch.ones([batch_size]).long().cuda() * (-1)

    while token_index_to_generate <= last_token_index_to_generate:
-        if neox_args.recompute:
+        if recompute:
            # recompute is needed for sparse attention at the moment
            # because we can only forward multiples of the block size
            # TODO The full padded context_tokens would not need to be forwarded, adjust to multiples of block size

4 changes: 2 additions & 2 deletions megatron/training.py
@@ -416,7 +416,7 @@ def train_step_pipe(neox_args, timers, model, data_iterator):

    assert neox_args.deepspeed
    loss = model.train_batch(data_iter=data_iterator)
-    loss_dict = {'lm loss': loss}
+    loss_dict = {'lm_loss': loss}
    # Don't break Megatron's timers because we changed code paths.
    for t in ['forward', 'backward', 'allreduce', 'optimizer', 'batch generator', 'data loader']:
        timers(t).reset()
@@ -439,7 +439,7 @@ def train(neox_args, timers, model, optimizer, lr_scheduler,
    timers('interval time').start()
    report_memory_flag = True

-    # get noise scale logger (if args.log_noise_scale is True)
+    # get noise scale logger (if neox_args.log_gradient_noise_scale is True)
    noise_scale_logger = get_noise_scale_logger(neox_args)

    # to monitor if we've skipped many iterations in a row and trigger an early exit

2 changes: 2 additions & 0 deletions requirements/requirements-dev.txt
@@ -1,2 +1,4 @@
pytest==6.2.3
+pytest-cov==2.11.1
+pytest-forked==1.3.0
autopep8==1.5.6
22 changes: 0 additions & 22 deletions run_tests.py

This file was deleted.

38 changes: 38 additions & 0 deletions tests/Readme.md
@@ -0,0 +1,38 @@
# Dependencies

Tests use pytest with the pytest-cov and pytest-forked plugins. Install them with:

```bash
pip install -r requirements/requirements-dev.txt
```

# Run

Tests can be run using pytest.

* The argument --forked needs to be provided
* A coverage report can be created using the optional arguments --cov-report and --cov (see pytest documentation)
* A subset of tests can be selected by pointing to the module within tests

```bash
# run all tests, output coverage report of megatron module in terminal
pytest --forked --cov-report term --cov=megatron tests

# run tests in tests/model, output coverage report of megatron module as html
pytest --forked --cov-report html --cov=megatron tests/model

# run tests in tests/model/test_model_generation.py, don't output coverage report
pytest --forked tests/model/test_model_generation.py
```

Some tests can run on CPU only. These are marked with the decorator @pytest.mark.cpu.
The CPU test cases can be run with:
```bash
pytest tests -m cpu
```
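
For orientation only, a hypothetical CPU-marked test might look like the sketch below; the test name and assertion are illustrative assumptions, not code from this commit — only the @pytest.mark.cpu marker and the get_test_configs_with_path helper come from the test suite itself.

```python
import pytest

from tests.common import get_test_configs_with_path


@pytest.mark.cpu
def test_resolve_test_config_paths():
    # Pure path handling, so no GPU or torch.distributed setup is needed.
    paths = get_test_configs_with_path(["test_local_setup.yml", "test_small_0.yml"])
    assert len(paths) == 2
```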
If an HTML coverage report has been created, a simple HTTP server can be run to serve the static files.
```bash
python -m http.server --directory htmlcov 8000
```
6 changes: 0 additions & 6 deletions tests/__init__.py
@@ -1,6 +0,0 @@
"""
Testcases for GPT NeoX
"""

from .model import *
from .neox_args import *
119 changes: 113 additions & 6 deletions tests/common.py
@@ -1,13 +1,24 @@
"""
collection of reusable functions in the context of testing
"""

import os
import time
import shutil
import itertools
from pathlib import Path

import pytest

import torch
import torch.distributed as dist
from torch.multiprocessing import Process

import deepspeed

TEST_CHECKPOINT_DIR = "test_checkpoint"
TEST_LOG_DIR = "test_logs"
TEST_TENSORBOARD_DIR = "test_tensorboard"

# Worker timeout *after* the first worker has completed.
DEEPSPEED_UNIT_WORKER_TIMEOUT = 120


def get_root_directory():
    return Path(__file__).parents[1]
@@ -19,7 +30,7 @@ def get_configs_with_path(configs):
    return [str(get_config_directory() / cfg) for cfg in configs]

def get_test_configs_with_path(configs):
-    test_config_dir = Path(__file__).parent / "model" / "test_configs"
+    test_config_dir = Path(__file__).parent / "test_configs"
    return [str((test_config_dir / cfg).absolute()) for cfg in configs]

def clear_test_dirs():
@@ -29,4 +40,100 @@ def clear_test_dirs():

    checkpoint_dir = os.path.join(get_root_directory(), TEST_CHECKPOINT_DIR)
    if os.path.isdir(checkpoint_dir):
-        shutil.rmtree(checkpoint_dir)
+        shutil.rmtree(checkpoint_dir)

    tensorboard_dir = os.path.join(get_root_directory(), TEST_TENSORBOARD_DIR)
    if os.path.isdir(tensorboard_dir):
        shutil.rmtree(tensorboard_dir)

def distributed_test(world_size=2, backend='nccl'):
    """A decorator for executing a function (e.g., a unit test) in a distributed manner.
    This decorator manages the spawning and joining of processes, initialization of
    torch.distributed, and catching of errors.
    This function is copied from: https://github.com/EleutherAI/DeeperSpeed/blob/24026e5bb37c528a222b8635c46256b1e1825d2e/tests/unit/common.py#L16
    Usage example:
        @distributed_test(world_size=[2,3])
        def my_test():
            rank = dist.get_rank()
            world_size = dist.get_world_size()
            assert(rank < world_size)
    Arguments:
        world_size (int or list): number of ranks to spawn. Can be a list to spawn
        multiple tests.
    """
    def dist_wrap(run_func):
        """Second-level decorator for dist_test. This actually wraps the function. """
        def dist_init(local_rank, num_procs, *func_args, **func_kwargs):
            """Initialize torch.distributed and execute the user function. """
            os.environ['MASTER_ADDR'] = '127.0.0.1'
            os.environ['MASTER_PORT'] = '29503'
            os.environ['LOCAL_RANK'] = str(local_rank)
            # NOTE: unit tests don't support multi-node so local_rank == global rank
            os.environ['RANK'] = str(local_rank)
            os.environ['WORLD_SIZE'] = str(num_procs)

            deepspeed.init_distributed(dist_backend=backend)

            if torch.cuda.is_available():
                torch.cuda.set_device(local_rank)

            run_func(*func_args, **func_kwargs)

        def dist_launcher(num_procs, *func_args, **func_kwargs):
            """Launch processes and gracefully handle failures. """

            # Spawn all workers on subprocesses.
            processes = []
            for local_rank in range(num_procs):
                p = Process(target=dist_init,
                            args=(local_rank,
                                  num_procs,
                                  *func_args),
                            kwargs=func_kwargs)
                p.start()
                processes.append(p)

            # Now loop and wait for a test to complete. The spin-wait here isn't a big
            # deal because the number of processes will be O(#GPUs) << O(#CPUs).
            any_done = False
            while not any_done:
                for p in processes:
                    if not p.is_alive():
                        any_done = True
                        break

            # Wait for all other processes to complete
            for p in processes:
                p.join(DEEPSPEED_UNIT_WORKER_TIMEOUT)

            failed = [(rank, p) for rank, p in enumerate(processes) if p.exitcode != 0]
            for rank, p in failed:
                # If it still hasn't terminated, kill it because it hung.
                if p.exitcode is None:
                    p.terminate()
                    pytest.fail(f'Worker {rank} hung.', pytrace=False)
                if p.exitcode < 0:
                    pytest.fail(f'Worker {rank} killed by signal {-p.exitcode}',
                                pytrace=False)
                if p.exitcode > 0:
                    pytest.fail(f'Worker {rank} exited with code {p.exitcode}',
                                pytrace=False)

        def run_func_decorator(*func_args, **func_kwargs):
            """Entry point for @distributed_test(). """

            if isinstance(world_size, int):
                dist_launcher(world_size, *func_args, **func_kwargs)
            elif isinstance(world_size, list):
                for procs in world_size:
                    dist_launcher(procs, *func_args, **func_kwargs)
                    time.sleep(0.5)
            else:
                raise TypeError(f'world_size must be an integer or a list of integers.')

        return run_func_decorator

    return dist_wrap
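
As a usage note, the sketch below mirrors the docstring example above and shows how a test might apply this decorator; the test name and assertions are hypothetical, and a machine where the default nccl backend can initialize (i.e., with GPUs) is assumed.

```python
import torch.distributed as dist

from tests.common import distributed_test


@distributed_test(world_size=2)
def test_example_distributed():
    # Runs in each of the two spawned worker processes after
    # deepspeed.init_distributed() has set up torch.distributed.
    assert dist.get_world_size() == 2
    assert 0 <= dist.get_rank() < dist.get_world_size()
```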
72 changes: 72 additions & 0 deletions tests/config_comparison.py
@@ -0,0 +1,72 @@
# Should just be called using: "python tests/config_comparison.py"
# Testing code to see which changes among configs break tests

# Hack: add the package root directory to sys.path so this script can be run directly
import sys
from pathlib import Path
file = Path(__file__).resolve()
package_root_directory = file.parents[1]
sys.path.append(str(package_root_directory))

from itertools import combinations
from tests.model import run_test_model_instantiation, run_train_test, run_checkpoint_test
from tests.common import TEST_CHECKPOINT_DIR, TEST_LOG_DIR, TEST_TENSORBOARD_DIR
from tests.common import distributed_test, get_test_configs_with_path, get_root_directory, clear_test_dirs

# World size might need to be adjusted depending on test
@distributed_test(world_size=1)
def main(subsequence_length: int = 2):
    """Allows you to easily compare sets of combinations to find which changes are causing issues
    Args:
        subsequence_length (int, optional): the length of subsequences of elements from the input iterable. Defaults to 2.
    """

    # choose default params and updated ones
    base_yaml_list = get_test_configs_with_path(["test_local_setup.yml", "test_small_0.yml"])
    new_yaml_list = get_test_configs_with_path(["test_local_setup.yml", "test_small_3.yml"])

    # Need to import here as distributed
    from megatron.neox_arguments import NeoXArgs

    overwrite_values = {
        "user_script": str(get_root_directory() / "pretrain_gpt2.py"),
        "save": TEST_CHECKPOINT_DIR,
        "load": TEST_CHECKPOINT_DIR,
        "log_dir": TEST_LOG_DIR,
        "tensorboard_dir": TEST_TENSORBOARD_DIR,
    }
    base_args_loaded = NeoXArgs.from_ymls(base_yaml_list, overwrite_values=overwrite_values)
    new_args_loaded = NeoXArgs.from_ymls(new_yaml_list, overwrite_values=overwrite_values)

    # Find difference between configs
    diff = {}
    for key, value in base_args_loaded.all_config.items():
        if new_args_loaded.all_config[key] != value:
            diff[key] = new_args_loaded.all_config[key]
            print(f'key: {key} original: {value}, updated: {new_args_loaded.all_config[key]}')

    perms = list(combinations(diff.items(), subsequence_length))

    # Iterate over combinations and run the test function
    # and print information so you can debug from console as program is distributed
    for items in perms:
        param_dict = base_args_loaded.all_config
        print('running setup with:')
        for item in items:
            param_dict[item[0]] = item[1]
            print(f'key: {item[0]} original: {base_args_loaded.all_config[item[0]]}, updated: {item[1]}')

        # These are interchangeable
        run_train_test(param_dict=param_dict)
        # run_test_model_instantiation(param_dict=param_dict)
        # run_checkpoint_test(param_dict=param_dict)

        print('finished running setup with:')
        for item in items:
            print(f'key: {item[0]} original: {base_args_loaded.all_config[item[0]]}, updated: {item[1]}')

if __name__ == '__main__':
    main()
9 changes: 3 additions & 6 deletions tests/model/__init__.py
@@ -1,6 +1,3 @@
"""
Tests concerning the GPT2Model class
"""

from .test_model_checkpoint import TestModelCheckpoint
from .test_model_instantiation import TestModelInstantiation
from .test_model_instantiation import run_test_model_instantiation
from .test_model_train import run_train_test
from .test_model_checkpoint import run_checkpoint_test
15 changes: 0 additions & 15 deletions tests/model/test_configs/test_sparse.yml

This file was deleted.

(Diffs for the remaining changed files are not shown here.)
