Skip to content

Commit

Permalink
parameterize more testcases
Browse files Browse the repository at this point in the history
  • Loading branch information
Samuel Weinbach committed May 13, 2021
1 parent 7a4d1b3 commit d581e57
Show file tree
Hide file tree
Showing 5 changed files with 74 additions and 150 deletions.
14 changes: 10 additions & 4 deletions tests/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ def get_configs_with_path(configs):
return [str(get_config_directory() / cfg) for cfg in configs]


def get_test_configs_with_path(configs):
def get_test_configs_with_path(configs: list):
test_config_dir = Path(__file__).parent / "test_configs"
return [str((test_config_dir / cfg).absolute()) for cfg in configs]

Expand Down Expand Up @@ -227,19 +227,25 @@ def parametrize(params_to_test: dict, max_tests: int = 50, seed: int = None):
keys, values = zip(*params_to_test.items())
for p in bounded_product(values, n=max_tests, seed=seed):
experiment = dict(zip(keys, p))
to_pop = []
to_add = {}
for k, v in experiment.items():
if "," in k:
keys_split = [i.strip() for i in k.split(',')]
values_separated = experiment.pop(k)
values_separated = experiment[k]
to_pop.append(k)
assert len(values_separated) == len(keys_split)
new_dict = dict(zip(keys_split, values_separated))
experiment.update(new_dict)
to_add.update(new_dict)
experiment.update(to_add)
for k in to_pop:
experiment.pop(k)
base = deepcopy(BASE_CONFIG)
base.update(experiment)
yield base


binary = [True, False]

with open(get_test_configs_with_path("test_train_base.yml")[0], 'r') as f:
with open(get_test_configs_with_path(["test_train_base.yml"])[0], 'r') as f:
BASE_CONFIG = load(f, Loader=Loader)
119 changes: 17 additions & 102 deletions tests/model/test_model_checkpoint.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,125 +4,40 @@
These tests contain a relatively large number of functions. They are not split into separate tests because a lot of boilerplate (e.g. instantiating the model) needs
to run in order to perform follow-up tests. Joining them in one test reduces runtime at the expense of decreased transparency of test results in case of failures.
"""

import os
from pathlib import Path

from ..common import TEST_CHECKPOINT_DIR, TEST_LOG_DIR, TEST_TENSORBOARD_DIR
from ..common import distributed_test, get_root_directory, get_test_configs_with_path, clear_test_dirs

import pytest
from torch._C import Value
from ..common import distributed_test, clear_test_dirs, model_setup, binary, parametrize
import torch

@distributed_test(world_size=1)
def test_model_checkpoint_small_0():
yaml_list = get_test_configs_with_path(["test_local_setup.yml", "test_small_0.yml"])
run_checkpoint_test(yaml_list=yaml_list, do_forward_pass=False, cpu=True)
PARAMS_TO_TEST = {
"pipe_parallel_size,model_parallel_size": [[0, 1]],
}

@distributed_test(world_size=1)
def test_model_checkpoint_small_1():
yaml_list = get_test_configs_with_path(["test_local_setup.yml", "test_small_1.yml"])
run_checkpoint_test(yaml_list=yaml_list, do_forward_pass=False, cpu=True)

@distributed_test(world_size=2)
def test_model_checkpoint_small_2():
yaml_list = get_test_configs_with_path(["test_local_setup.yml", "test_small_2.yml"])
run_checkpoint_test(yaml_list=yaml_list, do_forward_pass=False, cpu=False)

@distributed_test(world_size=1)
def test_model_checkpoint_small_3():
yaml_list = get_test_configs_with_path(["test_local_setup.yml", "test_small_3.yml"])
run_checkpoint_test(yaml_list=yaml_list, do_forward_pass=False, cpu=False)

@distributed_test(world_size=2)
def test_model_checkpoint_small_4():
yaml_list = get_test_configs_with_path(["test_local_setup.yml", "test_small_4.yml"])
run_checkpoint_test(yaml_list=yaml_list, do_forward_pass=False, cpu=False)
@pytest.mark.parametrize("param_dict", list(parametrize(PARAMS_TO_TEST, max_tests=50, seed=None)))
def test_train(param_dict):
@distributed_test(world_size=2)
def wrapper():
run_checkpoint_test(param_dict=param_dict)
wrapper()

def run_checkpoint_test(yaml_list=None, param_dict=None, do_forward_pass=False, cpu=False):
from megatron.neox_arguments import NeoXArgs
from megatron import initialize_megatron
from megatron.text_generation_utils import get_batch, forward_model
from megatron.training import setup_model_and_optimizer
from megatron.mpu import destroy_model_parallel


from megatron.checkpointing import load_checkpoint
from megatron.checkpointing import save_checkpoint

destroy_model_parallel() # mpu model parallel contains remaining global vars

if torch.distributed.get_world_size() == 1 or torch.distributed.get_rank() == 0:
clear_test_dirs()


overwrite_values = {
"user_script": str(get_root_directory() / "pretrain_gpt2.py"),
"save": TEST_CHECKPOINT_DIR,
"load": TEST_CHECKPOINT_DIR,
"log_dir": TEST_LOG_DIR,
"tensorboard_dir": TEST_TENSORBOARD_DIR,
}

# should not both be none
assert yaml_list is not None or param_dict is not None

# initially load config from files, as would be the case in deepy.py
if yaml_list is not None:
args_loaded = NeoXArgs.from_ymls(yaml_list, overwrite_values=overwrite_values)
else:
p_dict = param_dict.copy()
p_dict.update(overwrite_values)
args_loaded = NeoXArgs.from_dict(p_dict)

args_loaded.build_tokenizer()

initialize_megatron(neox_args=args_loaded)

model, optimizer, lr_scheduler = setup_model_and_optimizer(neox_args=args_loaded, inference=True, get_key_value=True)
model.eval()

model, optimizer, lr_scheduler, args_loaded = model_setup(yaml_list, param_dict)
model.iteration += 1
# save model checkpoint
save_checkpoint(neox_args=args_loaded, iteration=42, model=model, optimizer=optimizer, lr_scheduler=lr_scheduler)


# forward
if do_forward_pass:
context_tokens_tensor = torch.cuda.LongTensor([[1,2,3,4,5],[1,2,3,4,5],[6,7,8,9,10],[1,2,3,4,100]])
tokens, attention_mask, position_ids = get_batch(args_loaded, context_tokens_tensor)
logits, layer_past = forward_model(args_loaded, model, (tokens, position_ids, attention_mask, torch.Tensor()))


# assert logits are the right shape
assert torch.is_tensor(logits), "run_checkpoint_test() forward output is tensor"
assert logits.size(0) == context_tokens_tensor.size(0), "run_checkpoint_test() batch size correct"
assert logits.size(1) == context_tokens_tensor.size(1), "run_checkpoint_test() context size correct"

# assert correct behaviour
assert torch.isclose(logits[0], logits[1]).all().item(), "run_checkpoint_test() forward independent of batch index"
assert not torch.isclose(logits[1], logits[2]).all().item(), "run_checkpoint_test() forward produced different outputs for different inputs"
assert torch.isclose(logits[1, 3], logits[3, 3]).all().item(), "run_checkpoint_test() forward masks right side tokens"

# reload model from checkpoint
if yaml_list is not None:
args_reloaded = NeoXArgs.from_ymls(yaml_list, overwrite_values=overwrite_values)
else:
p_dict = param_dict.copy()
p_dict.update(overwrite_values)
args_reloaded = NeoXArgs.from_dict(p_dict)

args_reloaded.build_tokenizer()

reloaded_model, optimizer, lr_scheduler = setup_model_and_optimizer(neox_args=args_reloaded, inference=True, get_key_value=True)
iteration = load_checkpoint(neox_args=args_reloaded, model=reloaded_model, optimizer=optimizer, lr_scheduler=lr_scheduler)
reloaded_model.eval()
reloaded_model, reloaded_optimizer, reloaded_lr_scheduler, args_reloaded = model_setup(yaml_list, param_dict)
iteration = load_checkpoint(neox_args=args_reloaded, model=reloaded_model, optimizer=reloaded_optimizer, lr_scheduler=reloaded_lr_scheduler)

#ensure same checkpoint is loaded
assert iteration == 42, "run_checkpoint_test() iteration loaded from checkpoint correct"

if do_forward_pass:
#check re-loaded model returns the same results
logits_reloaded, layer_past = forward_model(args_reloaded, model, (tokens, position_ids, attention_mask))
assert torch.isclose(logits, logits_reloaded).all().item(), "run_checkpoint_test() forward output after reloading checkpoint unchanged"

#check all weight groups are the same
for idx, ((n1, p1), (n2, p2)) in enumerate(zip(list(model.module.named_parameters()), list(reloaded_model.module.named_parameters()))):
assert n1 == n2
Expand Down
13 changes: 7 additions & 6 deletions tests/model/test_model_instantiation.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,19 +9,20 @@
from ..common import distributed_test, model_setup, clear_test_dirs, parametrize, binary

PARAMS_TO_TEST = {
"pipe_parallel_size,model_parallel_size": [[0, 1], [1, 1], [2, 2], [0, 2]],
"pipe_parallel_size,model_parallel_size": [[0, 1], [1, 2], [0, 2]],
"no_weight_tying": binary,
"attention_config": [[[["global"], "all"]], [[["local"], "all"]], [[["sparse_variable"], "all"]],
[[["sparse_fixed"], "all"]]],
"scaled_upper_triang_masked_softmax_fusion": [binary, binary],
"bias_gelu_fusion": binary,
"scaled_upper_triang_masked_softmax_fusion,bias_gelu_fusion": [[True, False], [False, True]],
}


@pytest.mark.parametrize("param_dict", list(parametrize(PARAMS_TO_TEST, max_tests=50, seed=None)))
@distributed_test(world_size=2)
def test_model_instantiation(param_dict):
run_test_model_instantiation(param_dict=param_dict)
def test_train(param_dict):
@distributed_test(world_size=2)
def wrapper():
run_test_model_instantiation(param_dict=param_dict)
wrapper()


def run_test_model_instantiation(yaml_list=None, param_dict=None):
Expand Down
8 changes: 5 additions & 3 deletions tests/model/test_model_train.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
"norm,pos_emb,activation": [["layernorm", "learned", "gelu"], ["rmsnorm", "rotary", "gelu"],
["scalenorm", "sinusoidal", "geglu"], ["layernorm", "rpe", "geglu"],
["rmsnorm", "none", "geglu"]],
"pipe_parallel_size,model_parallel_size": [[0, 1], [1, 1], [2, 2], [0, 2]],
"pipe_parallel_size,model_parallel_size": [[0, 1], [1, 2], [0, 2]],
"no_weight_tying": binary,
"attention_config": [[[["global"], "all"]], [[["local"], "all"]], [[["sparse_variable"], "all"]],
[[["sparse_fixed"], "all"]]],
Expand All @@ -24,9 +24,11 @@


@pytest.mark.parametrize("param_dict", list(parametrize(PARAMS_TO_TEST, max_tests=50, seed=None)))
@distributed_test(world_size=2)
def test_train(param_dict):
run_train_test(param_dict=param_dict)
@distributed_test(world_size=2)
def wrapper():
run_train_test(param_dict=param_dict)
wrapper()


def run_train_test(yaml_list=None, param_dict=None):
Expand Down
70 changes: 35 additions & 35 deletions tests/test_configs/test_train_base.yml
Original file line number Diff line number Diff line change
@@ -1,23 +1,23 @@
# GPT-2 pretraining setup
# GPT_2 pretraining setup
{
# parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages
# across the node boundaries )
"pipe-parallel-size": 0,
"model-parallel-size": 1,
"pipe_parallel_size": 0,
"model_parallel_size": 1,

# model settings
"num-layers": 3,
"hidden-size": 192,
"num-attention-heads": 12,
"seq-length": 1024,
"max-position-embeddings": 1024,
"num_layers": 3,
"hidden_size": 192,
"num_attention_heads": 12,
"seq_length": 1024,
"max_position_embeddings": 1024,
"norm": "layernorm",
"pos-emb": "rotary",
"no-weight-tying": true,
"pos_emb": "rotary",
"no_weight_tying": true,

# these should provide some speedup but take a while to build; set to true if desired
"scaled-upper-triang-masked-softmax-fusion": false,
"bias-gelu-fusion": false,
"scaled_upper_triang_masked_softmax_fusion": false,
"bias_gelu_fusion": false,

# optimizer settings
"optimizer": {
Expand All @@ -42,20 +42,20 @@

# batch / data settings
"train_micro_batch_size_per_gpu": 4,
"data-impl": "mmap",
"data_impl": "mmap",
"split": "949,50,1",

# activation checkpointing
"checkpoint-activations": true,
"checkpoint-num-layers": 1,
"partition-activations": true,
"synchronize-each-layer": true,
"checkpoint_activations": true,
"checkpoint_num_layers": 1,
"partition_activations": true,
"synchronize_each_layer": true,

# regularization
"gradient_clipping": 1.0,
"weight-decay": 0.0,
"hidden-dropout": 0.0,
"attention-dropout": 0.0,
"weight_decay": 0.0,
"hidden_dropout": 0.0,
"attention_dropout": 0.0,

# precision settings
"fp16": {
Expand All @@ -67,23 +67,23 @@
},

# misc. training settings
"train-iters": 320000,
"lr-decay-iters": 320000,
"distributed-backend": "nccl",
"lr-decay-style": "cosine",
"train_iters": 320000,
"lr_decay_iters": 320000,
"distributed_backend": "nccl",
"lr_decay_style": "cosine",
"warmup": 0.01,
"save-interval": 10000,
"eval-interval": 1000,
"eval-iters": 10,
"save_interval": 10000,
"eval_interval": 1000,
"eval_iters": 10,

# logging
"log-interval": 100,
"log_interval": 100,
"steps_per_print": 10,
"keep-last-n-checkpoints": 4,
"keep_last_n_checkpoints": 4,
"wall_clock_breakdown": true,

# Suggested data paths when using GPT-NeoX locally
"data-path": "data/enron/enron_text_document",
# Suggested data paths when using GPT_NeoX locally
"data_path": "data/enron/enron_text_document",

# or for weighted datasets:
# "train-data-paths": ["data/enron/enron_text_document", "data/enron/enron_text_document"],
Expand All @@ -93,11 +93,11 @@
# "test-data-weights": [2., 1.],
# "valid-data-weights": [0.5, 0.4],

"vocab-file": "data/gpt2-vocab.json",
"merge-file": "data/gpt2-merges.txt",
"vocab_file": "data/gpt2-vocab.json",
"merge_file": "data/gpt2-merges.txt",
"save": "test_checkpoint",
"load": "test_checkpoint",
"tensorboard-dir": "test_tensorboard",
"log-dir": "test_logs",
"tensorboard_dir": "test_tensorboard",
"log_dir": "test_logs",

}

0 comments on commit d581e57

Please sign in to comment.