Skip to content
This repository has been archived by the owner on Mar 21, 2024. It is now read-only.

Commit

Permalink
Enable Bring-your-own-Lightning-model (#417)
Browse files Browse the repository at this point in the history
- Enable bringing arbitrary PyTorch-Lightning models to the InnerEye toolbox
- Upgrade mypy and simplify the way we invoke it
  • Loading branch information
ant0nsc committed Apr 19, 2021
1 parent 780e420 commit 0d479ba
Show file tree
Hide file tree
Showing 72 changed files with 3,178 additions and 1,303 deletions.
1 change: 1 addition & 0 deletions .flake8
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,4 @@
ignore = E226,E302,E41,W391, E701, W291, E722, W503, E128, E126, E127, E731, E401
max-line-length = 160
max-complexity = 25
exclude = fastMRI/
7 changes: 7 additions & 0 deletions .github/workflows/linting_and_hello_world.yml
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,13 @@ jobs:
PYTHONPATH: ${{ github.workspace }}
if: always()

- name: Run HelloContainer model
run: |
$CONDA/envs/InnerEye/bin/python ./InnerEye/ML/runner.py --model=HelloContainer
env:
PYTHONPATH: ${{ github.workspace }}
if: always()

windows:
runs-on: windows-latest
steps:
Expand Down
3 changes: 3 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
[submodule "fastMRI"]
path = fastMRI
url = https://github.com/facebookresearch/fastMRI
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,9 @@ created.

### Added

- ([#417](https://github.com/microsoft/InnerEye-DeepLearning/pull/417)) Added a generic way of adding PyTorch Lightning
models to the toolbox. It is now possible to train almost any Lightning model with the InnerEye toolbox in AzureML,
with only minimum code changes required. See [the MD documentation](docs/bring_your_own_model.md) for details.
- ([#430](https://github.com/microsoft/InnerEye-DeepLearning/pull/430)) Update conversion to 1.0.1 InnerEye-DICOM-RT to
add: manufacturer, SoftwareVersions, Interpreter and ROIInterpretedTypes.
- ([#385](https://github.com/microsoft/InnerEye-DeepLearning/pull/385)) Add the ability to train a model on multiple
Expand Down Expand Up @@ -70,6 +73,7 @@ created.
- ([#437](https://github.com/microsoft/InnerEye-DeepLearning/pull/437)) Fixed multi-node DDP bug in PL v1.2.8. Re-add
end-to-end test for multi-node.
### Removed
- ([#417](https://github.com/microsoft/InnerEye-DeepLearning/pull/417)) Removed an output file that only contains metadata for a legacy consumer

### Deprecated

Expand Down
13 changes: 0 additions & 13 deletions InnerEye/Azure/azure_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -275,19 +275,6 @@ def set_script_params_except_submit_flag(self) -> None:
self.script_params = retained_args


@dataclass
class ExperimentResultLocation:
    """
    Information that is needed to recover where the results of an experiment reside.
    """
    # Name of the Azure blob storage container that holds the experiment results.
    results_container_name: Optional[str] = None
    # URI at which the experiment results can be accessed.
    results_uri: Optional[str] = None
    # Folder that contains the dataset that the experiment consumed.
    dataset_folder: Optional[str] = None
    # URI at which that dataset can be accessed.
    dataset_uri: Optional[str] = None
    # Name of the AzureML job that produced the results.
    azure_job_name: Optional[str] = None
    # Commandline overrides that were applied in the run (diagnostic information only).
    commandline_overrides: Optional[str] = None


@dataclass
class ParserResult:
"""
Expand Down
59 changes: 32 additions & 27 deletions InnerEye/Azure/azure_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,17 +19,18 @@
from azureml.core.runconfig import MpiConfiguration, RunConfiguration
from azureml.core.workspace import WORKSPACE_DEFAULT_BLOB_STORE_NAME
from azureml.data import FileDataset
from azureml.data.dataset_consumption_config import DatasetConsumptionConfig

from InnerEye.Azure import azure_util
from InnerEye.Azure.azure_config import AzureConfig, ParserResult, SourceConfig
from InnerEye.Azure.azure_util import CROSS_VALIDATION_SPLIT_INDEX_TAG_KEY, RUN_RECOVERY_FROM_ID_KEY_NAME, \
RUN_RECOVERY_ID_KEY_NAME, \
merge_conda_dependencies
is_offline_run_context, merge_conda_dependencies
from InnerEye.Azure.secrets_handling import read_all_settings
from InnerEye.Azure.tensorboard_monitor import AMLTensorBoardMonitorConfig, monitor
from InnerEye.Common.generic_parsing import GenericConfig
from InnerEye.ML.common import ModelExecutionMode
from InnerEye.ML.utils.config_util import ModelConfigLoader
from InnerEye.ML.utils.config_loader import ModelConfigLoader

SLEEP_TIME_SECONDS = 30
INPUT_DATA_KEY = "input_data"
Expand All @@ -42,15 +43,12 @@

def submit_to_azureml(azure_config: AzureConfig,
source_config: SourceConfig,
model_config_overrides: str,
azure_dataset_id: str) -> Run:
"""
The main entry point. It creates an AzureML workspace if needed, submits an experiment using the code
as specified in source_config, and waits for completion if needed.
:param azure_config: azure related configurations to setup valid workspace
:param source_config: The information about which code should be submitted, and which arguments should be used.
:param model_config_overrides: A string that describes which model parameters were overwritten by commandline
arguments in the present run. This is only used for diagnostic purposes (it is set as a Tag on the run).
:param azure_dataset_id: The name of the dataset on blob storage to be used for this run.
"""
azure_run: Optional[Run] = None
Expand All @@ -68,8 +66,7 @@ def interrupt_handler(signal: int, _: Any) -> None:
for s in [signal.SIGINT, signal.SIGTERM]:
signal.signal(s, interrupt_handler)
# create train/test experiment
azure_run = create_and_submit_experiment(azure_config, source_config, model_config_overrides,
azure_dataset_id)
azure_run = create_and_submit_experiment(azure_config, source_config, azure_dataset_id)

if azure_config.wait_for_completion:
# We want the job output to be visible on the console, but the program should not exit if the
Expand All @@ -79,13 +76,12 @@ def interrupt_handler(signal: int, _: Any) -> None:
return azure_run


def set_run_tags(run: Run, azure_config: AzureConfig, model_config_overrides: str) -> None:
def set_run_tags(run: Run, azure_config: AzureConfig, commandline_args: str) -> None:
"""
Set metadata for the run
:param run: Run to set metadata for.
:param azure_config: The configurations for the present AzureML job
:param model_config_overrides: A string that describes which model parameters were overwritten by commandline
arguments in the present run.
:param commandline_args: A string that holds all commandline arguments that were used for the present run.
"""
git_information = azure_config.get_git_information()
run.set_tags({
Expand All @@ -103,7 +99,7 @@ def set_run_tags(run: Run, azure_config: AzureConfig, model_config_overrides: st
"source_message": git_information.commit_message,
"source_author": git_information.commit_author,
"source_dirty": str(git_information.is_dirty),
"overrides": model_config_overrides,
"commandline_args": commandline_args,
CROSS_VALIDATION_SPLIT_INDEX_TAG_KEY: -1,
})

Expand All @@ -125,14 +121,11 @@ def create_experiment_name(azure_config: AzureConfig) -> str:
def create_and_submit_experiment(
azure_config: AzureConfig,
source_config: SourceConfig,
model_config_overrides: str,
azure_dataset_id: str) -> Run:
"""
Creates an AzureML experiment in the workspace and submits it for execution.
:param azure_config: azure related configurations to setup valid workspace
:param source_config: The information about which code should be submitted, and which arguments should be used.
:param model_config_overrides: A string that describes which model parameters were overwritten by commandline
arguments in the present run. This is only used for diagnostic purposes (it is set as a Tag on the run).
:param azure_dataset_id: The name of the dataset in blob storage to be used for this run.
:returns: Run object for the submitted AzureML run
"""
Expand All @@ -144,8 +137,12 @@ def create_and_submit_experiment(
# submit a training/testing run associated with the experiment
run: Run = exp.submit(script_run_config)

# set metadata for the run
set_run_tags(run, azure_config, model_config_overrides)
if is_offline_run_context(run):
# This codepath will only be executed in unit tests, when exp.submit is mocked.
return run

# Set metadata for the run.
set_run_tags(run, azure_config, commandline_args=(" ".join(source_config.script_params)))

print("\n==============================================================================")
print(f"Successfully queued new run {run.id} in experiment: {exp.name}")
Expand Down Expand Up @@ -276,6 +273,21 @@ def get_or_create_python_environment(azure_config: AzureConfig,
return env


def get_dataset_consumption(azure_config: AzureConfig, azure_dataset_id: str) -> DatasetConsumptionConfig:
    """
    Builds the configuration object via which an AzureML dataset is consumed inside an AzureML run.
    The dataset with the given name is exposed to the run as a named input, keyed by INPUT_DATA_KEY.
    :param azure_config: azure related configurations to use for model scale-out behaviour
    :param azure_dataset_id: The name of the dataset in blob storage to be used for this run. This can be an empty
    string to not use any datasets.
    :return: A consumption config that either mounts or downloads the dataset, depending on
        azure_config.use_dataset_mount.
    :raises ValueError: If the dataset could neither be found nor created.
    """
    dataset = get_or_create_dataset(azure_config, azure_dataset_id=azure_dataset_id)
    if not dataset:
        raise ValueError(f"AzureML dataset {azure_dataset_id} could not be found or created.")
    named_dataset_input = dataset.as_named_input(INPUT_DATA_KEY)
    if azure_config.use_dataset_mount:
        return named_dataset_input.as_mount()
    return named_dataset_input.as_download()


def create_run_config(azure_config: AzureConfig,
source_config: SourceConfig,
azure_dataset_id: str = "",
Expand All @@ -292,11 +304,7 @@ def create_run_config(azure_config: AzureConfig,
:return: The configured script run.
"""
if azure_dataset_id:
azureml_dataset = get_or_create_dataset(azure_config, azure_dataset_id=azure_dataset_id)
if not azureml_dataset:
raise ValueError(f"AzureML dataset {azure_dataset_id} could not be found or created.")
named_input = azureml_dataset.as_named_input(INPUT_DATA_KEY)
dataset_consumption = named_input.as_mount() if azure_config.use_dataset_mount else named_input.as_download()
dataset_consumption = get_dataset_consumption(azure_config, azure_dataset_id)
else:
dataset_consumption = None
# AzureML seems to sometimes expect the entry script path in Linux format, hence convert to posix path
Expand Down Expand Up @@ -354,8 +362,7 @@ def create_runner_parser(model_config_class: type = None) -> argparse.ArgumentPa
def parse_args_and_add_yaml_variables(parser: ArgumentParser,
yaml_config_file: Optional[Path] = None,
project_root: Optional[Path] = None,
fail_on_unknown_args: bool = False,
args: List[str] = None) -> ParserResult:
fail_on_unknown_args: bool = False) -> ParserResult:
"""
Reads arguments from sys.argv, modifies them with secrets from local YAML files,
and parses them using the given argument parser.
Expand All @@ -364,14 +371,12 @@ def parse_args_and_add_yaml_variables(parser: ArgumentParser,
:param yaml_config_file: The path to the YAML file that contains values to supply into sys.argv.
:param fail_on_unknown_args: If True, raise an exception if the parser encounters an argument that it does not
recognize. If False, unrecognized arguments will be ignored, and added to the "unknown" field of the parser result.
:param args: arguments to parse
:return: The parsed arguments, and overrides
"""
settings_from_yaml = read_all_settings(yaml_config_file, project_root=project_root)
return parse_arguments(parser,
settings_from_yaml=settings_from_yaml,
fail_on_unknown_args=fail_on_unknown_args,
args=args)
fail_on_unknown_args=fail_on_unknown_args)


def _create_default_namespace(parser: ArgumentParser) -> Namespace:
Expand Down Expand Up @@ -471,7 +476,7 @@ def run_duration_string_to_seconds(s: str) -> Optional[int]:
elif suffix == "d":
multiplier = 24 * 60 * 60
else:
raise ArgumentError("s", f"Invalid suffix: Must be one of 's', 'm', 'h', 'd', but got: {s}")
raise ArgumentError("s", f"Invalid suffix: Must be one of 's', 'm', 'h', 'd', but got: {s}") # type: ignore
return int(float(s[:-1]) * multiplier)


Expand Down
24 changes: 15 additions & 9 deletions InnerEye/Azure/azure_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,15 +45,6 @@
INNEREYE_SDK_VERSION = "1.0"


def get_results_blob_path(run_id: str) -> str:
    """
    Builds the path of the top-level folder that holds the results for a given AzureML run.
    :param run_id: The AzureML run ID for which the folder path should be built.
    :return: A full Azure blob storage path, starting with the container name.
    """
    return f"{AZUREML_RUN_FOLDER}{run_id}"


def create_run_recovery_id(run: Run) -> str:
"""
Creates an recovery id for a run so it's checkpoints could be recovered for training/testing
Expand Down Expand Up @@ -293,6 +284,21 @@ def merge_conda_files(files: List[Path], result_file: Path) -> None:
ruamel.yaml.dump(unified_definition, f, indent=2, default_flow_style=False)


def get_all_environment_files(project_root: Path) -> List[Path]:
    """
    Collects the Conda environment files that should be used for a run: always the InnerEye
    environment file, plus a project-level environment.yml if that resolves to a different file.
    :param project_root: The root folder of the code that starts the present training run.
    :return: A list with 1 or 2 entries that are conda environment files.
    """
    base_yaml = fixed_paths.get_environment_yaml_file()
    project_level_yaml = project_root / fixed_paths.ENVIRONMENT_YAML_FILE_NAME
    if base_yaml == project_level_yaml:
        return [base_yaml]
    return [base_yaml, project_level_yaml]


def merge_conda_dependencies(files: List[Path]) -> Tuple[CondaDependencies, str]:
"""
Creates a CondaDependencies object from the Conda environments specified in one or more files.
Expand Down
52 changes: 0 additions & 52 deletions InnerEye/Common/build_config.py

This file was deleted.

26 changes: 26 additions & 0 deletions InnerEye/Common/common_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -389,3 +389,29 @@ def remove_file_or_directory(pth: Path) -> None:
pth.rmdir()
elif pth.exists():
pth.unlink()


def add_folder_to_sys_path_if_needed(folder_under_repo_root: str) -> None:
    """
    Ensures that the given folder, taken relative to the repository root, is present in sys.path.
    If an equivalent path is already there, nothing happens; otherwise the folder is appended.
    :param folder_under_repo_root: A folder name, relative to the repository root.
    """
    target = repository_root_directory() / folder_under_repo_root
    # Compare as Path objects so that textual variations of the same path are recognized.
    if any(Path(entry) == target for entry in sys.path):
        return
    print(f"Adding {target} to sys.path")
    sys.path.append(str(target))


@contextmanager
def change_working_directory(path_or_str: PathOrString) -> Generator:
    """
    Context manager for changing the current working directory to the given folder.
    The original working directory is restored when the context exits, even if the
    body of the `with` statement raises an exception.
    :param path_or_str: The folder to change into. A leading "~" is expanded to the user's home directory.
    """
    new_path = Path(path_or_str).expanduser()
    old_path = Path.cwd()
    os.chdir(new_path)
    try:
        yield
    finally:
        # Without the finally, an exception in the with-body would leave the process
        # permanently in the new folder, affecting all subsequent relative-path operations.
        os.chdir(old_path)
2 changes: 2 additions & 0 deletions InnerEye/Common/fixed_paths.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@ def repository_root_directory(path: Optional[PathOrString] = None) -> Path:
DEFAULT_AML_LOGS_DIR = "azureml-logs"

DEFAULT_LOGS_DIR_NAME = "logs"
LOG_FILE_NAME = "stdout.txt"

DEFAULT_MODEL_SUMMARIES_DIR_PATH = Path(DEFAULT_LOGS_DIR_NAME) / "model_summaries"
# The folder at the project root directory that holds datasets for local execution.
DATASETS_DIR_NAME = "datasets"
Expand Down
11 changes: 0 additions & 11 deletions InnerEye/Common/fixed_paths_for_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,17 +32,6 @@ def full_ml_test_data_path(path: str = "") -> Path:
return _full_test_data_path("ML", path)


def full_azure_test_data_path(path: str = "") -> Path:
    """
    Resolves a path that is given relative to the Tests/Azure/test_data folder into its
    full absolute path.
    :param path: A path relative to the Tests/Azure/test_data folder.
    :return: The full absolute path of the argument.
    """
    return _full_test_data_path(prefix="Azure", suffix=path)


def _full_test_data_path(prefix: str, suffix: str) -> Path:
    """
    Builds the absolute path <tests_root>/<prefix>/test_data/<suffix>.
    :param prefix: The folder underneath the tests root (for example "ML").
    :param suffix: The path relative to that folder's test_data directory.
    :return: The full absolute path.
    """
    return tests_root_directory() / prefix / "test_data" / suffix
Expand Down
Loading

0 comments on commit 0d479ba

Please sign in to comment.