Commit

tests update and cleanup

Samuel Weinbach committed Apr 30, 2021
1 parent 9fdca6f commit cfcfd1c
Showing 9 changed files with 112 additions and 70 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -140,5 +140,6 @@ checkpoints/

#test logs
test_checkpoint/
test_logs/
logs/
tensorboard/
18 changes: 18 additions & 0 deletions tests/common.py
@@ -2,8 +2,13 @@
collection of reusable functions in the context of testing
"""

import os
import shutil
from pathlib import Path

TEST_CHECKPOINT_DIR = "test_checkpoint"
TEST_LOG_DIR = "test_logs"

def get_root_directory():
    return Path(__file__).parents[1]

@@ -12,3 +17,16 @@ def get_config_directory():

def get_configs_with_path(configs):
    return [str(get_config_directory() / cfg) for cfg in configs]

def get_test_configs_with_path(configs):
    test_config_dir = Path(__file__).parent / "model" / "test_configs"
    return [str((test_config_dir / cfg).absolute()) for cfg in configs]

def clear_test_dirs():
    log_dir = os.path.join(get_root_directory(), TEST_LOG_DIR)
    if os.path.isdir(log_dir):
        shutil.rmtree(log_dir)

    checkpoint_dir = os.path.join(get_root_directory(), TEST_CHECKPOINT_DIR)
    if os.path.isdir(checkpoint_dir):
        shutil.rmtree(checkpoint_dir)
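
The new helpers are meant to be used together from the model tests; a minimal usage sketch (illustrative only, not part of this commit, and assuming the test config files exist under tests/model/test_configs/):

from tests.common import clear_test_dirs, get_test_configs_with_path
from megatron.neox_arguments import NeoXArgs

clear_test_dirs()  # removes test_checkpoint/ and test_logs/ under the repository root, if present
yaml_list = get_test_configs_with_path(["test_local_setup.yml", "test_small.yml"])  # absolute config paths
args = NeoXArgs.from_ymls(yaml_list)  # load the configs as deepy.py would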
2 changes: 1 addition & 1 deletion tests/model/__init__.py
@@ -3,4 +3,4 @@
"""

from .test_model_checkpoint import TestModelCheckpoint
#from .test_model_initialization_pipeline import TestModelInitializationPipeline
from .test_model_instantiation import TestModelInstantiation
@@ -3,9 +3,7 @@
"data-path": "data/enron/enron_text_document",
"vocab-file": "data/gpt2-vocab.json",
"merge-file": "data/gpt2-merges.txt",
"save": "test_checkpoint",
"load": "test_checkpoint",
"tensorboard-dir": null,
"log-dir": "logs",
"log-dir": "test_logs",
"use_wandb": false
}
@@ -2,7 +2,7 @@
{
# parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages
# across the node boundaries )
"pipe-parallel-size": 1,
"pipe-parallel-size": 0,
"model-parallel-size": 1,

# model settings
@@ -2,7 +2,7 @@
{
# parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages
# across the node boundaries )
"pipe-parallel-size": 1,
"pipe-parallel-size": 0,
"model-parallel-size": 1,

# model settings
40 changes: 22 additions & 18 deletions tests/model/test_model_checkpoint.py
@@ -3,7 +3,6 @@
import shutil
import unittest
import logging
from unittest.mock import patch
from pathlib import Path

if __name__ == "__main__":
@@ -17,21 +16,22 @@

from megatron.checkpointing import load_checkpoint
from megatron.checkpointing import save_checkpoint
from deepspeed import PipelineEngine

from tests.common import get_root_directory, get_configs_with_path
import torch
from tests.common import get_root_directory, get_configs_with_path, get_test_configs_with_path, clear_test_dirs, TEST_CHECKPOINT_DIR, TEST_LOG_DIR

TEST_CHECKPOINT_DIR = "test_checkpoint"
import torch

class TestModelCheckpoint(unittest.TestCase):

    def setUp(self):
        clear_test_dirs()

    def tearDown(self):
        clear_test_dirs()

    def run_checkpoint_test(self, yaml_list):
        destroy_model_parallel()  # mpu model parallel contains remaining global vars

        tests_directory = Path(__file__).parent / "test_configs"
        yaml_list = [str((tests_directory / cfg).absolute()) for cfg in yaml_list]

        # initially load config from files as would be the case in deepy.py

        logging.info(self.__class__.__name__ + ".run_checkpoint_test() " + f"Running on: {yaml_list}")
@@ -40,10 +40,9 @@ def run_checkpoint_test(self, yaml_list):
        args_loaded.build_tokenizer()
        args_loaded.update_value("user_script", str(get_root_directory() / "pretrain_gpt2.py"))
        args_loaded.update_value("use_cpu_initialization", True)

        # remove any existing checkpoints if they exist
        checkpoint_dir = os.path.join(get_root_directory(), args_loaded.load)
        shutil.rmtree(checkpoint_dir)
        args_loaded.update_value("save", TEST_CHECKPOINT_DIR)
        args_loaded.update_value("load", TEST_CHECKPOINT_DIR)
        args_loaded.update_value("log_dir", TEST_LOG_DIR)

        logging.debug(self.__class__.__name__ + ".run_checkpoint_test() initializing megatron")
        initialize_megatron(neox_args=args_loaded)
@@ -77,6 +76,9 @@ def run_checkpoint_test(self, yaml_list):
        args_reloaded.build_tokenizer()
        args_reloaded.update_value("user_script", str(get_root_directory() / "pretrain_gpt2.py"))
        args_reloaded.update_value("use_cpu_initialization", True)
        args_reloaded.update_value("save", TEST_CHECKPOINT_DIR)
        args_reloaded.update_value("load", TEST_CHECKPOINT_DIR)
        args_reloaded.update_value("log_dir", TEST_LOG_DIR)

        reloaded_model, optimizer, lr_scheduler = setup_model_and_optimizer(neox_args=args_reloaded, inference=False, get_key_value=True)
        iteration = load_checkpoint(neox_args=args_reloaded, model=reloaded_model, optimizer=optimizer, lr_scheduler=lr_scheduler)
@@ -98,21 +100,23 @@ def run_checkpoint_test(self, yaml_list):
            if not params_equal:
                logging.error(self.__class__.__name__ + ".run_checkpoint_test() " + f"layer {idx} {n1} differs after loading of checkpoint")

        # clear up checkpoint folder
        logging.debug(self.__class__.__name__ + ".run_checkpoint_test() " + "cleaning checkpoint")
        shutil.rmtree(checkpoint_dir)
    def test_model_small(self):
        self.run_checkpoint_test(get_configs_with_path(["local_setup.yml", "small.yml"]))

    def test_model_medium(self):
        self.run_checkpoint_test(get_configs_with_path(["local_setup.yml", "medium.yml"]))

    def test_model_small(self):
        self.run_checkpoint_test(["local_setup.yml", "small.yml"])
        self.run_checkpoint_test(get_test_configs_with_path(["test_local_setup.yml", "test_small.yml"]))

    def test_model_medium(self):
        self.run_checkpoint_test(["local_setup.yml", "medium.yml"])
        self.run_checkpoint_test(get_test_configs_with_path(["test_local_setup.yml", "test_medium.yml"]))

if __name__ == "__main__":
    suite = unittest.TestSuite()

    # Run all required tests
    suite.addTest(TestModelCheckpoint("test_model_small"))
    suite.addTest(TestModelCheckpoint("test_model_medium"))
    #suite.addTest(TestModelCheckpoint("test_model_medium"))

    unittest.TextTestRunner(failfast=False).run(suite)
46 changes: 0 additions & 46 deletions tests/model/test_model_initialization.py

This file was deleted.

67 changes: 67 additions & 0 deletions tests/model/test_model_instantiation.py
@@ -0,0 +1,67 @@
import os
import sys
import unittest
import shutil
import logging

if __name__ == "__main__":
    sys.path.append(os.path.abspath(''))

from deepspeed.runtime.pipe.engine import PipelineEngine, DeepSpeedEngine

from megatron.neox_arguments import NeoXArgs
from megatron.model import GPT2ModelPipe
from megatron import initialize_megatron
from megatron.training import setup_model_and_optimizer
from megatron.mpu import destroy_model_parallel

from tests.common import get_root_directory, get_configs_with_path, get_test_configs_with_path, clear_test_dirs, TEST_CHECKPOINT_DIR, TEST_LOG_DIR

class TestModelInstantiation(unittest.TestCase):

    def setUp(self):
        clear_test_dirs()

    def tearDown(self):
        clear_test_dirs()

    def run_instantiation_test(self, yaml_list, model_class_expected):
        destroy_model_parallel()  # mpu model parallel contains remaining global vars

        # initially load config from files as would be the case in deepy.py

        logging.info(self.__class__.__name__ + ".run_instantiation_test() " + f"Running on: {yaml_list}")

        args_loaded = NeoXArgs.from_ymls(yaml_list)
        args_loaded.build_tokenizer()
        args_loaded.update_value("user_script", str(get_root_directory() / "pretrain_gpt2.py"))
        args_loaded.update_value("use_cpu_initialization", True)
        args_loaded.update_value("save", TEST_CHECKPOINT_DIR)
        args_loaded.update_value("load", TEST_CHECKPOINT_DIR)
        args_loaded.update_value("log_dir", TEST_LOG_DIR)

        logging.debug(self.__class__.__name__ + ".run_instantiation_test() initializing megatron")
        initialize_megatron(neox_args=args_loaded)

        logging.debug(self.__class__.__name__ + ".run_instantiation_test() initializing model")
        model, optimizer, lr_scheduler = setup_model_and_optimizer(neox_args=args_loaded, inference=False, get_key_value=True)

        self.assertTrue(isinstance(model, model_class_expected))

    def test_model_instantiation_small(self):
        self.run_instantiation_test(get_configs_with_path(["local_setup.yml", "small.yml"]), PipelineEngine)

    def test_model_instantiation_medium(self):
        self.run_instantiation_test(get_configs_with_path(["local_setup.yml", "medium.yml"]), PipelineEngine)

    def test_model_instantiation_small_test(self):
        self.run_instantiation_test(get_test_configs_with_path(["test_local_setup.yml", "test_small.yml"]), DeepSpeedEngine)

    def test_model_instantiation_medium_test(self):
        self.run_instantiation_test(get_test_configs_with_path(["test_local_setup.yml", "test_medium.yml"]), DeepSpeedEngine)

if __name__ == "__main__":
    suite = unittest.TestSuite()
    suite.addTest(TestModelInstantiation("test_model_instantiation_small_test"))
    suite.addTest(TestModelInstantiation("test_model_instantiation_medium_test"))
    unittest.TextTestRunner(failfast=True).run(suite)
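
The split between PipelineEngine and DeepSpeedEngine expectations follows from the "pipe-parallel-size": 0 setting in the test configs above; a hedged sketch of that assumption (not part of this commit):

from deepspeed.runtime.pipe.engine import PipelineEngine, DeepSpeedEngine

def expected_engine_class(pipe_parallel_size):
    # With pipeline parallelism enabled, the model is wrapped in a PipelineEngine;
    # the test configs set "pipe-parallel-size": 0, so a plain DeepSpeedEngine is expected.
    return PipelineEngine if pipe_parallel_size >= 1 else DeepSpeedEngine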
