Commit

tests update and cleanup

Samuel Weinbach committed Apr 30, 2021
1 parent 9fdca6f commit cfcfd1c
Showing 9 changed files with 112 additions and 70 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -140,5 +140,6 @@ checkpoints/

#test logs
test_checkpoint/
test_logs/
logs/
tensorboard/
18 changes: 18 additions & 0 deletions tests/common.py
@@ -2,8 +2,13 @@
collection of reusable functions in the context of testing
"""

import os
import shutil
from pathlib import Path

TEST_CHECKPOINT_DIR = "test_checkpoint"
TEST_LOG_DIR = "test_logs"

def get_root_directory():
    return Path(__file__).parents[1]

@@ -12,3 +17,16 @@ def get_config_directory():

def get_configs_with_path(configs):
    return [str(get_config_directory() / cfg) for cfg in configs]

def get_test_configs_with_path(configs):
    test_config_dir = Path(__file__).parent / "model" / "test_configs"
    return [str((test_config_dir / cfg).absolute()) for cfg in configs]

def clear_test_dirs():
    log_dir = os.path.join(get_root_directory(), TEST_LOG_DIR)
    if os.path.isdir(log_dir):
        shutil.rmtree(log_dir)

    checkpoint_dir = os.path.join(get_root_directory(), TEST_CHECKPOINT_DIR)
    if os.path.isdir(checkpoint_dir):
        shutil.rmtree(checkpoint_dir)
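
The new helpers are meant to be used together from the model tests; a minimal usage sketch (illustrative only, not part of this commit, and assuming the test config files exist under tests/model/test_configs/):

from tests.common import clear_test_dirs, get_test_configs_with_path
from megatron.neox_arguments import NeoXArgs

clear_test_dirs()  # removes test_checkpoint/ and test_logs/ under the repository root, if present
yaml_list = get_test_configs_with_path(["test_local_setup.yml", "test_small.yml"])  # absolute config paths
args = NeoXArgs.from_ymls(yaml_list)  # load the configs as deepy.py would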
2 changes: 1 addition & 1 deletion tests/model/__init__.py
@@ -3,4 +3,4 @@
"""

from .test_model_checkpoint import TestModelCheckpoint
#from .test_model_initialization_pipeline import TestModelInitializationPipeline
from .test_model_instantiation import TestModelInstantiation
@@ -3,9 +3,7 @@
"data-path": "data/enron/enron_text_document",
"vocab-file": "data/gpt2-vocab.json",
"merge-file": "data/gpt2-merges.txt",
"save": "test_checkpoint",
"load": "test_checkpoint",
"tensorboard-dir": null,
"log-dir": "logs",
"log-dir": "test_logs",
"use_wandb": false
}
@@ -2,7 +2,7 @@
{
# parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages
# across the node boundaries )
"pipe-parallel-size": 1,
"pipe-parallel-size": 0,
"model-parallel-size": 1,

# model settings
@@ -2,7 +2,7 @@
{
# parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages
# across the node boundaries )
"pipe-parallel-size": 1,
"pipe-parallel-size": 0,
"model-parallel-size": 1,

# model settings
40 changes: 22 additions & 18 deletions tests/model/test_model_checkpoint.py
@@ -3,7 +3,6 @@
import shutil
import unittest
import logging
from unittest.mock import patch
from pathlib import Path

if __name__ == "__main__":
@@ -17,21 +16,22 @@

from megatron.checkpointing import load_checkpoint
from megatron.checkpointing import save_checkpoint
from deepspeed import PipelineEngine

from tests.common import get_root_directory, get_configs_with_path
import torch
from tests.common import get_root_directory, get_configs_with_path, get_test_configs_with_path, clear_test_dirs, TEST_CHECKPOINT_DIR, TEST_LOG_DIR

TEST_CHECKPOINT_DIR = "test_checkpoint"
import torch

class TestModelCheckpoint(unittest.TestCase):

    def setUp(self):
        clear_test_dirs()

    def tearDown(self):
        clear_test_dirs()

    def run_checkpoint_test(self, yaml_list):
        destroy_model_parallel()  # mpu model parallel contains remaining global vars

        tests_directory = Path(__file__).parent / "test_configs"
        yaml_list = [str((tests_directory / cfg).absolute()) for cfg in yaml_list]

        # initially load config from files as would be the case in deepy.py

        logging.info(self.__class__.__name__ + ".run_checkpoint_test() " + f"Running on: {yaml_list}")
@@ -40,10 +40,9 @@ def run_checkpoint_test(self, yaml_list):
        args_loaded.build_tokenizer()
        args_loaded.update_value("user_script", str(get_root_directory() / "pretrain_gpt2.py"))
        args_loaded.update_value("use_cpu_initialization", True)

        # remove any existing checkpoints if they exist
        checkpoint_dir = os.path.join(get_root_directory(), args_loaded.load)
        shutil.rmtree(checkpoint_dir)
        args_loaded.update_value("save", TEST_CHECKPOINT_DIR)
        args_loaded.update_value("load", TEST_CHECKPOINT_DIR)
        args_loaded.update_value("log_dir", TEST_LOG_DIR)

        logging.debug(self.__class__.__name__ + ".run_checkpoint_test() initializing megatron")
        initialize_megatron(neox_args=args_loaded)
@@ -77,6 +76,9 @@ def run_checkpoint_test(self, yaml_list):
        args_reloaded.build_tokenizer()
        args_reloaded.update_value("user_script", str(get_root_directory() / "pretrain_gpt2.py"))
        args_reloaded.update_value("use_cpu_initialization", True)
        args_reloaded.update_value("save", TEST_CHECKPOINT_DIR)
        args_reloaded.update_value("load", TEST_CHECKPOINT_DIR)
        args_reloaded.update_value("log_dir", TEST_LOG_DIR)

        reloaded_model, optimizer, lr_scheduler = setup_model_and_optimizer(neox_args=args_reloaded, inference=False, get_key_value=True)
        iteration = load_checkpoint(neox_args=args_reloaded, model=reloaded_model, optimizer=optimizer, lr_scheduler=lr_scheduler)
@@ -98,21 +100,23 @@ def run_checkpoint_test(self, yaml_list):
            if not params_equal:
                logging.error(self.__class__.__name__ + ".run_checkpoint_test() " + f"layer {idx} {n1} differs after loading of checkpoint")

        # clear up checkpoint folder
        logging.debug(self.__class__.__name__ + ".run_checkpoint_test() " + "cleaning checkpoint")
        shutil.rmtree(checkpoint_dir)
    def test_model_small(self):
        self.run_checkpoint_test(get_configs_with_path(["local_setup.yml", "small.yml"]))

    def test_model_medium(self):
        self.run_checkpoint_test(get_configs_with_path(["local_setup.yml", "medium.yml"]))

    def test_model_small(self):
        self.run_checkpoint_test(["local_setup.yml", "small.yml"])
        self.run_checkpoint_test(get_test_configs_with_path(["test_local_setup.yml", "test_small.yml"]))

    def test_model_medium(self):
        self.run_checkpoint_test(["local_setup.yml", "medium.yml"])
        self.run_checkpoint_test(get_test_configs_with_path(["test_local_setup.yml", "test_medium.yml"]))

if __name__ == "__main__":
    suite = unittest.TestSuite()

    # Run all required tests
    suite.addTest(TestModelCheckpoint("test_model_small"))
    suite.addTest(TestModelCheckpoint("test_model_medium"))
    #suite.addTest(TestModelCheckpoint("test_model_medium"))

    unittest.TextTestRunner(failfast=False).run(suite)
46 changes: 0 additions & 46 deletions tests/model/test_model_initialization.py

This file was deleted.

67 changes: 67 additions & 0 deletions tests/model/test_model_instantiation.py
@@ -0,0 +1,67 @@
import os
import sys
import unittest
import shutil
import logging

if __name__ == "__main__":
    sys.path.append(os.path.abspath(''))

from deepspeed.runtime.pipe.engine import PipelineEngine, DeepSpeedEngine

from megatron.neox_arguments import NeoXArgs
from megatron.model import GPT2ModelPipe
from megatron import initialize_megatron
from megatron.training import setup_model_and_optimizer
from megatron.mpu import destroy_model_parallel

from tests.common import get_root_directory, get_configs_with_path, get_test_configs_with_path, clear_test_dirs, TEST_CHECKPOINT_DIR, TEST_LOG_DIR

class TestModelInstantiation(unittest.TestCase):

    def setUp(self):
        clear_test_dirs()

    def tearDown(self):
        clear_test_dirs()

    def run_instantiation_test(self, yaml_list, model_class_expected):
        destroy_model_parallel()  # mpu model parallel contains remaining global vars

        # initially load config from files as would be the case in deepy.py

        logging.info(self.__class__.__name__ + ".run_instantiation_test() " + f"Running on: {yaml_list}")

        args_loaded = NeoXArgs.from_ymls(yaml_list)
        args_loaded.build_tokenizer()
        args_loaded.update_value("user_script", str(get_root_directory() / "pretrain_gpt2.py"))
        args_loaded.update_value("use_cpu_initialization", True)
        args_loaded.update_value("save", TEST_CHECKPOINT_DIR)
        args_loaded.update_value("load", TEST_CHECKPOINT_DIR)
        args_loaded.update_value("log_dir", TEST_LOG_DIR)

        logging.debug(self.__class__.__name__ + ".run_instantiation_test() initializing megatron")
        initialize_megatron(neox_args=args_loaded)

        logging.debug(self.__class__.__name__ + ".run_instantiation_test() initializing model")
        model, optimizer, lr_scheduler = setup_model_and_optimizer(neox_args=args_loaded, inference=False, get_key_value=True)

        self.assertTrue(isinstance(model, model_class_expected))

    def test_model_instantiation_small(self):
        self.run_instantiation_test(get_configs_with_path(["local_setup.yml", "small.yml"]), PipelineEngine)

    def test_model_instantiation_medium(self):
        self.run_instantiation_test(get_configs_with_path(["local_setup.yml", "medium.yml"]), PipelineEngine)

    def test_model_instantiation_small_test(self):
        self.run_instantiation_test(get_test_configs_with_path(["test_local_setup.yml", "test_small.yml"]), DeepSpeedEngine)

    def test_model_instantiation_medium_test(self):
        self.run_instantiation_test(get_test_configs_with_path(["test_local_setup.yml", "test_medium.yml"]), DeepSpeedEngine)

if __name__ == "__main__":
    suite = unittest.TestSuite()
    suite.addTest(TestModelInstantiation("test_model_instantiation_small_test"))
    suite.addTest(TestModelInstantiation("test_model_instantiation_medium_test"))
    unittest.TextTestRunner(failfast=True).run(suite)
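
The split between PipelineEngine and DeepSpeedEngine expectations follows from the "pipe-parallel-size": 0 setting in the test configs above; a hedged sketch of that assumption (not part of this commit):

from deepspeed.runtime.pipe.engine import PipelineEngine, DeepSpeedEngine

def expected_engine_class(pipe_parallel_size):
    # With pipeline parallelism enabled, the model is wrapped in a PipelineEngine;
    # the test configs set "pipe-parallel-size": 0, so a plain DeepSpeedEngine is expected.
    return PipelineEngine if pipe_parallel_size >= 1 else DeepSpeedEngine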
