This repository has been archived by the owner on Mar 21, 2024. It is now read-only.

Enable building an ensemble model from the cross validation checkpoints of a BYO Lightning model #529

Closed
wants to merge 101 commits

Commits (changes shown are from 33 of the 101 commits)
1250de0
Use registered model for inference
Shruthi42 Jun 23, 2021
efc34fb
Merge branch 'main' into shbannur/load_registered_models
Shruthi42 Jun 23, 2021
f94b2e9
Bug fix
Shruthi42 Jun 23, 2021
7430785
Fix tests
Shruthi42 Jun 23, 2021
229b7ea
Fix tests
Shruthi42 Jun 23, 2021
16a09c3
mypy
Shruthi42 Jun 23, 2021
ee957d8
Fix tests
Shruthi42 Jun 23, 2021
596efee
Add tests
Shruthi42 Jun 24, 2021
2c2160f
Fix tests
Shruthi42 Jun 24, 2021
fe6fa93
Fix tests
Shruthi42 Jun 24, 2021
45895e7
Add test
Shruthi42 Jun 24, 2021
41f1b48
Fix tests
Shruthi42 Jun 24, 2021
f6bb7a2
Fix test
Shruthi42 Jul 5, 2021
3e3b069
Remove unnecessary function
Shruthi42 Jul 5, 2021
c0de1e6
Update tests
Shruthi42 Jul 5, 2021
0889a88
Flake8
Shruthi42 Jul 5, 2021
f98b22e
Fix tests
Shruthi42 Jul 5, 2021
d73f769
mypy
Shruthi42 Jul 5, 2021
f4dfbe7
Merge branch 'main' into shbannur/load_registered_models
Shruthi42 Jul 5, 2021
2767e18
Loosening multiple checkpoint check in run_inference_for_lightning_mo…
Jul 6, 2021
8a63e0a
WiP very scrappy!
Jul 7, 2021
116e566
WiP more mess
Jul 8, 2021
7107e27
WiP: bones of test class
Jul 8, 2021
d0c7724
Refactoring run_inference_for_lightning_models
Jul 9, 2021
6735d2c
mypy fixes
Jul 9, 2021
585511b
Merge branch 'main' into timregan/527-ensembles-for-BYOL-xval
Jul 9, 2021
4df8c09
WiP annotations and test
Jul 10, 2021
05a67dc
WiP: saving mid task for lunch
dumbledad Jul 10, 2021
8d93907
Correcting GPU -> CPU typo in comment
dumbledad Jul 11, 2021
e7ba7f1
Method can be static
dumbledad Jul 11, 2021
e4ebfcb
WiP fiddling
dumbledad Jul 11, 2021
7625a48
Example ensemble from InnerEyeInference
dumbledad Jul 11, 2021
f5b288c
flake8 and mypy fixes
dumbledad Jul 11, 2021
a79673f
mypy fixes
dumbledad Jul 11, 2021
3e9ec7e
tidying unused parameters
Jul 11, 2021
559a717
WiP simple temp test for train/test ensemble
Jul 11, 2021
f7be221
Renaming InnerEyeInference methods
Jul 12, 2021
a3179a8
tidy up
Jul 12, 2021
f7bdc7f
Matching new naming
Jul 12, 2021
939ec5a
renaming params
Jul 12, 2021
f206057
naming
Jul 12, 2021
d71f498
Unit test WiP
Jul 12, 2021
6987f99
don't be strict with state_dict
dumbledad Jul 12, 2021
f6e025f
first unit test takes shape
dumbledad Jul 12, 2021
500bf11
Unit test works, but doesn't check much
dumbledad Jul 12, 2021
0d97fe9
renaming unit test
Jul 13, 2021
b204f08
Merge branch 'main' into shbannur/load_registered_models
Shruthi42 Jul 13, 2021
dd17d78
Change docstring
Shruthi42 Jul 13, 2021
388e0a8
Update CHANGELOG.md
Shruthi42 Jul 13, 2021
d727fe1
Rename
Shruthi42 Jul 13, 2021
fa7a6e4
Fix test
Shruthi42 Jul 13, 2021
581c6a9
WiP ensemble unit test with value check
Jul 13, 2021
1134c0f
Merge branch 'main' into timregan/527-ensembles-for-BYOL-xval
Jul 14, 2021
e35db5b
Address PR comments
Shruthi42 Jul 14, 2021
9093b7e
Use list of pytest markers
Shruthi42 Jul 14, 2021
bf072c0
Move model_id to WorkflowParams
Shruthi42 Jul 14, 2021
5a76cc1
missed some name changes
Jul 14, 2021
2d75d24
WiP swapping back to checkpoints not accruing child runs
Jul 14, 2021
e064483
Refactor extra_downloaded_run_id
Shruthi42 Jul 14, 2021
168eb29
unit test working
Jul 14, 2021
838bb48
Update documentation and argparser
Shruthi42 Jul 14, 2021
0f3690c
flake & mypy
Jul 14, 2021
9c7f6b4
Revert changes to generic_parsing
Shruthi42 Jul 14, 2021
d612c6e
Update documentation
Shruthi42 Jul 14, 2021
6861178
Flake8 and mypy
Shruthi42 Jul 14, 2021
98c7683
Shruthi's changes to run_ml
Jul 14, 2021
54c845e
Merge branch 'shbannur/load_registered_models' into timregan/527-ense…
Jul 14, 2021
48aca37
WiP abstracting ensemble inference
Jul 14, 2021
d3d8477
WiP
dumbledad Jul 15, 2021
a4ea25a
Ensemble inference base
dumbledad Jul 15, 2021
5e7f0b1
Merge branch 'main' into timregan/527-ensembles-for-BYOL-xval
dumbledad Jul 15, 2021
a71fa90
Ended up with changes to 2 files I did not touch!
dumbledad Jul 15, 2021
0874a74
Restoring (and fixing) run_ml changes
dumbledad Jul 16, 2021
88bb96d
mypy
dumbledad Jul 16, 2021
424b2e7
WiP
dumbledad Jul 16, 2021
4903b96
run_ml unit test v1
Jul 16, 2021
547ed7d
WiP pre pruning ensemble stuff
Jul 16, 2021
538a7a4
refactored to avoid recursion blow-up
Jul 16, 2021
002886c
Merge branch 'main' into timregan/527-ensembles-for-BYOL-xval
dumbledad Jul 17, 2021
c6ca034
additional comments and remove inheritance
dumbledad Jul 17, 2021
484fe4a
removing duplicated unit test
dumbledad Jul 17, 2021
b771137
more comments
dumbledad Jul 17, 2021
6d57d08
test tidy
dumbledad Jul 17, 2021
6a9fd75
flake fixes
dumbledad Jul 17, 2021
cea51e4
WiP
dumbledad Jul 17, 2021
9fc696a
on_ensemble_inference_start needn't call down
dumbledad Jul 17, 2021
b947452
Old WiP changes
dumbledad Jul 17, 2021
f7d22f4
Adding HelloEnsembleInference
dumbledad Jul 17, 2021
af079ae
run_ml changes with parameter
dumbledad Jul 17, 2021
38b12b7
WiP on mypy and tidy pre unit test fix
dumbledad Jul 18, 2021
c255774
mypy fixes
dumbledad Jul 18, 2021
2db1857
import fix so test discovery works
dumbledad Jul 18, 2021
21044e6
Fixing test discovery
dumbledad Jul 18, 2021
4f10169
WiP fixing unit test
dumbledad Jul 18, 2021
e0e31e6
Unit test works
dumbledad Jul 18, 2021
1d23735
file system test
dumbledad Jul 18, 2021
1486bff
Adding register and actually building ensemble
Jul 19, 2021
ff8fef1
Removed call to innereye_config
Jul 19, 2021
ddb3c9f
unit test fix
dumbledad Jul 22, 2021
e71d49b
fix for the reference error on AzureML
dumbledad Jul 23, 2021
db18fdd
Merge branch 'main' into timregan/527-ensembles-for-BYOL-xval
dumbledad Jul 23, 2021
5 changes: 4 additions & 1 deletion CHANGELOG.md
@@ -18,7 +18,8 @@ module on test data with partial ground truth files. (Also [522](https://github.
 - ([#502](https://github.com/microsoft/InnerEye-DeepLearning/pull/502)) More flags for fine control of when to run inference.
 - ([#492](https://github.com/microsoft/InnerEye-DeepLearning/pull/492)) Adding capability for regression tests for test
   jobs that run in AzureML.
-
+- ([#509](https://github.com/microsoft/InnerEye-DeepLearning/pull/509)) Run inference on registered models (single and
+  ensemble) using the parameter `model_id`.
 ### Changed
 - ([#531](https://github.com/microsoft/InnerEye-DeepLearning/pull/531)) Updated PL to 1.3.8, torchmetrics and pl-bolts and changed relevant metrics and SSL code API.
 - ([#533](https://github.com/microsoft/InnerEye-DeepLearning/pull/533)) Better defaults for inference on ensemble children.
@@ -42,6 +43,8 @@ multiple large checkpoints can time out.
 ### Removed
 
 - ([#520](https://github.com/microsoft/InnerEye-DeepLearning/pull/520)) Disable glaucoma job from Azure pipeline.
+- ([#509](https://github.com/microsoft/InnerEye-DeepLearning/pull/509)) Parameters `local_weights_path` and
+  `weights_url` can no longer be used to initialize a training run, only inference runs.
 
 ### Deprecated
 
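A note on the new `model_id` parameter referenced above: it must name a registered AzureML model in the form 'model name:version'. A minimal sketch of that format check, mirroring the validation this PR adds to `WorkflowParams.validate()` (the helper `parse_model_id` is hypothetical, not part of the PR):

```python
from typing import Tuple


def parse_model_id(model_id: str) -> Tuple[str, str]:
    # Hypothetical helper: same "model name:version" check as WorkflowParams.validate().
    parts = model_id.split(":")
    if len(parts) != 2:
        raise ValueError(f"model_id should be in the form 'model_name:version', got {model_id}")
    name, version = parts
    return name, version


print(parse_model_id("MyEnsembleModel:2"))  # -> ('MyEnsembleModel', '2')
```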
11 changes: 3 additions & 8 deletions InnerEye/Azure/azure_config.py
@@ -77,14 +77,9 @@ class AzureConfig(GenericConfig):
     pytest_mark: str = param.String(doc="If provided, run pytest instead of model training. pytest will only "
                                         "run the tests that have the mark given in this argument "
                                         "('--pytest_mark gpu' will run all tests marked with 'pytest.mark.gpu')")
-    run_recovery_id: str = param.String(doc="A run recovery id string in the form 'experiment name:run id'"
-                                            " to use for inference or recovering a model training run.")
-    pretraining_run_recovery_id: str = param.String(default=None,
-                                                    allow_None=True,
-                                                    doc="Extra run recovery id to download checkpoints from,"
-                                                        "for custom modules (e.g. for loading pretrained weights)."
-                                                        "Warning: this argument will be ignored for InnerEyeContainer"
-                                                        "models.")
+    run_recovery_id: str = param.String(doc="A run recovery id string in the form 'experiment name:run id' "
+                                            "to use for inference, recovering a model training run or to register "
+                                            "a model.")
     experiment_name: str = param.String(doc="If provided, use this string as the name of the AzureML experiment. "
                                             "If not provided, create the experiment off the git branch name.")
     build_number: int = param.Integer(0, doc="The numeric ID of the Azure pipeline that triggered this training run.")
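For context, `run_recovery_id` stays a plain string; a sketch with hypothetical values showing the expected 'experiment name:run id' shape:

```python
# Hypothetical values; only the "experiment name:run id" shape matters.
run_recovery_id = "ensemble_experiment:HD_a1b2c3d4_0"
experiment_name, run_id = run_recovery_id.split(":")
assert experiment_name == "ensemble_experiment" and run_id == "HD_a1b2c3d4_0"
```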
3 changes: 0 additions & 3 deletions InnerEye/Common/fixed_paths.py
@@ -40,9 +40,6 @@ def repository_root_directory(path: Optional[PathOrString] = None) -> Path:
 # The folder at the project root directory that holds datasets for local execution.
 DATASETS_DIR_NAME = "datasets"
 
-# Points to a folder at the project root directory that holds model weights downloaded from URLs.
-MODEL_WEIGHTS_DIR_NAME = "modelweights"
-
 ML_RELATIVE_SOURCE_PATH = os.path.join("ML")
 ML_RELATIVE_RUNNER_PATH = os.path.join(ML_RELATIVE_SOURCE_PATH, "runner.py")
 ML_FULL_SOURCE_FOLDER_PATH = str(repository_root_directory() / ML_RELATIVE_SOURCE_PATH)
12 changes: 6 additions & 6 deletions InnerEye/ML/configs/classification/CovidHierarchicalModel.py
@@ -170,21 +170,21 @@ def create_model(self) -> LightningModule:

     def _get_ssl_checkpoint_path(self) -> Path:
         # Get the SSL weights from the AML run provided via "pretraining_run_recovery_id" command line argument.
-        # Accessible via extra_downloaded_run_id field of the config.
-        assert self.extra_downloaded_run_id is not None
-        assert isinstance(self.extra_downloaded_run_id, RunRecovery)
+        # Accessible via pretraining_run_checkpoints field of the config.
+        assert self.pretraining_run_checkpoints is not None
+        assert isinstance(self.pretraining_run_checkpoints, RunRecovery)
         ssl_path = self.checkpoint_folder / "ssl_checkpoint.ckpt"
 
         if not ssl_path.exists():  # for test (when it is already present) we don't need to redo this.
             if self.name_of_checkpoint is not None:
                 logging.info(f"Using checkpoint: {self.name_of_checkpoint} as starting point.")
-                path_to_checkpoint = self.extra_downloaded_run_id.checkpoints_roots[0] / self.name_of_checkpoint
+                path_to_checkpoint = self.pretraining_run_checkpoints.checkpoints_roots[0] / self.name_of_checkpoint
             else:
-                path_to_checkpoint = self.extra_downloaded_run_id.get_best_checkpoint_paths()[0]
+                path_to_checkpoint = self.pretraining_run_checkpoints.get_best_checkpoint_paths()[0]
                 if not path_to_checkpoint.exists():
                     logging.info("No best checkpoint found for this model. Getting the latest recovery "
                                  "checkpoint instead.")
-                    path_to_checkpoint = self.extra_downloaded_run_id.get_recovery_checkpoint_paths()[0]
+                    path_to_checkpoint = self.pretraining_run_checkpoints.get_recovery_checkpoint_paths()[0]
             assert path_to_checkpoint.exists()
             path_to_checkpoint.rename(ssl_path)
         return ssl_path
2 changes: 1 addition & 1 deletion InnerEye/ML/configs/other/HelloContainer.py
@@ -230,7 +230,7 @@ def on_test_epoch_end(self) -> None:
"""
average_mse = torch.mean(torch.stack(self.test_mse))
Path("test_mse.txt").write_text(str(average_mse.item()))
Path("test_mae.txt").write_text(str(self.test_mae.compute()))
Path("test_mae.txt").write_text(str(self.test_mae.compute().item()))


class HelloContainer(LightningContainer):
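The `.item()` fix above matters because `str()` of a zero-dimensional tensor yields the tensor repr, not the bare number. A standalone illustration (not part of the PR):

```python
import torch

mae = torch.tensor(0.1234)
print(str(mae))         # 'tensor(0.1234)' - tensor repr, awkward in a plain metrics file
print(str(mae.item()))  # '0.1234' - plain Python float, which the fixed line now writes
```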
35 changes: 24 additions & 11 deletions InnerEye/ML/deep_learning_config.py
@@ -33,7 +33,6 @@
EXTRA_RUN_SUBFOLDER = "extra_run_id"

ARGS_TXT = "args.txt"
WEIGHTS_FILE = "weights.pth"


@unique
@@ -216,16 +215,25 @@ class WorkflowParams(param.Parameterized):
     ensemble_inference_on_test_set: Optional[bool] = \
         param.Boolean(None,
                       doc="If set, enable/disable full image inference on test set after ensemble training.")
-    weights_url: str = param.String(doc="If provided, a url from which weights will be downloaded and used for model "
-                                        "initialization.")
-    local_weights_path: Optional[Path] = param.ClassSelector(class_=Path,
-                                                             default=None,
-                                                             allow_None=True,
-                                                             doc="The path to the weights to use for model "
-                                                                 "initialization, when training outside AzureML.")
+    weights_url: List[str] = param.List(default=[], class_=str,
+                                        doc="If provided, a set of urls from which checkpoints will be downloaded"
+                                            "and used for training/inference.")
+    local_weights_path: List[Path] = param.List(default=[], class_=Path,
+                                                doc="A list of checkpoints paths to use for training/inference, "
+                                                    "when training is running outside Azure.")
+    model_id: str = param.String(default="",
+                                 doc="A model id string in the form 'model name:version' "
+                                     "to use a registered model for inference.")
     generate_report: bool = param.Boolean(default=True,
                                           doc="If True (default), write a modelling report in HTML format. If False,"
                                               "do not write that report.")
+    pretraining_run_recovery_id: str = param.String(default=None,
+                                                    allow_None=True,
+                                                    doc="Extra run recovery id to download checkpoints from,"
+                                                        "for custom modules (e.g. for loading pretrained weights)."
+                                                        "The downloaded RunRecovery object will be available in"
+                                                        "pretraining_run_checkpoints.")
 
     # The default multiprocessing start_method in both PyTorch and the Python standard library is "fork" for Linux and
     # "spawn" (the only available method) for Windows. There is some evidence that using "forkserver" on Linux
     # can reduce the chance of stuck jobs.
@@ -248,8 +256,13 @@
"be relative to the repository root directory.")

def validate(self) -> None:
if self.weights_url and self.local_weights_path:
raise ValueError("Cannot specify both local_weights_path and weights_url.")
if sum([bool(param) for param in [self.weights_url, self.local_weights_path, self.model_id]]) > 1:
raise ValueError("Cannot specify more than one of local_weights_path, weights_url or model_id.")

if self.model_id:
if len(self.model_id.split(":")) != 2:
raise ValueError(
f"model_id should be in the form 'model_name:version', got {self.model_id}")

if self.number_of_cross_validation_splits == 1:
raise ValueError("At least two splits required to perform cross validation, but got "
@@ -713,7 +726,7 @@ def __init__(self, **params: Any) -> None:
         self.create_filesystem(fixed_paths.repository_root_directory())
         # Disable the PL progress bar because all InnerEye models have their own console output
         self.pl_progress_bar_refresh_rate = 0
-        self.extra_downloaded_run_id: Optional[Any] = None
+        self.pretraining_run_checkpoints: Optional[Any] = None
 
     def validate(self) -> None:
         """
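With `weights_url` and `local_weights_path` now list-valued, several checkpoints (for example, one per cross-validation fold) can seed a single container, and the three initialization routes are mutually exclusive. A hedged sketch of the new validation behaviour, assuming a bare `WorkflowParams` instance whose other fields keep defaults that pass validation (the URLs are illustrative):

```python
params = WorkflowParams()
params.weights_url = ["https://example.com/fold0.ckpt",
                      "https://example.com/fold1.ckpt"]
params.validate()  # passes: only one of the three options is set

params.model_id = "MyEnsembleModel:1"
params.validate()  # raises ValueError: weights_url and model_id are both set
```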
45 changes: 12 additions & 33 deletions InnerEye/ML/lightning_container.py
@@ -3,8 +3,8 @@
 # Licensed under the MIT License (MIT). See LICENSE in the repo root for license information.
 # ------------------------------------------------------------------------------------------
 import abc
-from pathlib import Path
 from typing import Any, Dict, Iterator, List, Optional, Tuple
+from pathlib import Path
 
 import param
 import torch
@@ -19,7 +19,7 @@
 from InnerEye.Common.metrics_constants import TrackedMetrics
 from InnerEye.ML.common import ModelExecutionMode
 from InnerEye.ML.deep_learning_config import DatasetParams, OptimizerParams, OutputParams, TrainerParams, \
-    WorkflowParams, load_checkpoint
+    WorkflowParams
 from InnerEye.ML.utils import model_util
 from InnerEye.ML.utils.lr_scheduler import SchedulerWithWarmUp
 from InnerEye.ML.utils.run_recovery import RunRecovery
@@ -151,7 +151,7 @@ def __init__(self, **kwargs: Any) -> None:
         super().__init__(**kwargs)
         self._model: Optional[LightningModule] = None
         self._model_name = type(self).__name__
-        self.extra_downloaded_run_id: Optional[RunRecovery] = None
+        self.pretraining_run_checkpoints: Optional[RunRecovery] = None
         self.num_nodes = 1
 
     def validate(self) -> None:
@@ -250,36 +250,6 @@ def before_training_on_all_ranks(self) -> None:
"""
pass

def load_checkpoint_and_modify(self, path_to_checkpoint: Path) -> Dict[str, Any]:
"""
This method is called when a file with weights for network initialization is supplied at container level,
in the self.weights_url or self.local_weights_path fields. It can load that file as a Torch checkpoint,
and rename parameters.

By default, uses torch.load to read and return the state dict from the checkpoint file, and does no modification
of the checkpoint file.

Overloading this function:
When weights_url or local_weights_path is set, the file downloaded may not be in the exact
format expected by the model's load_state_dict() - for example, pretrained Imagenet weights for networks
may have mismatched layer names in different implementations.
In such cases, you can overload this function to extract the state dict from the checkpoint.

NOTE: The model checkpoint will be loaded using the torch function load_state_dict() with argument strict=False,
so extra care needs to be taken to check that the state dict is valid.
Check the logs for warnings related to missing and unexpected keys.
See https://pytorch.org/tutorials/beginner/saving_loading_models.html#warmstarting-model-using-parameters
-from-a-different-model
for an explanation on why strict=False is useful when loading parameters from other models.
:param path_to_checkpoint: Path to the checkpoint file.
:return: Dictionary with model and optimizer state dicts. The dict should have at least the following keys:
1. Key ModelAndInfo.MODEL_STATE_DICT_KEY and value set to the model state dict.
2. Key ModelAndInfo.EPOCH_KEY and value set to the checkpoint epoch.
Other (optional) entries corresponding to keys ModelAndInfo.OPTIMIZER_STATE_DICT_KEY and
ModelAndInfo.MEAN_TEACHER_STATE_DICT_KEY are also supported.
"""
return load_checkpoint(path_to_checkpoint=path_to_checkpoint, use_gpu=self.use_gpu)

# The code from here on does not need to be modified.

@property
@@ -334,6 +304,15 @@ def get_hyperdrive_config(self, run_config: ScriptRunConfig) -> HyperDriveConfig:
         else:
             return self.get_parameter_search_hyperdrive_config(run_config)
 
+    def load_model_checkpoint(self, checkpoint_path: Path) -> None:
+        """
+        Load a checkpoint from the given path. We need to define a separate method since pytorch lightning cannot
+        access the _model attribute to modify it.
+        """
+        if self._model is None:
+            raise ValueError("No Lightning module has been set yet.")
+        self._model = type(self._model).load_from_checkpoint(checkpoint_path=str(checkpoint_path))
+
     def __str__(self) -> str:
         """Returns a string describing the present object, as a list of key: value strings."""
         arguments_str = "\nContainer:\n"
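The new `load_model_checkpoint` hook is what lets the ensemble workflow swap each cross-validation checkpoint into a container in turn. A minimal usage sketch, assuming `container` is a `LightningContainer` whose `create_model()` has already run (the checkpoint paths are hypothetical):

```python
from pathlib import Path

# Hypothetical per-fold checkpoints from a cross-validation run.
checkpoints = [Path("outputs/fold_0/best.ckpt"), Path("outputs/fold_1/best.ckpt")]

restored = []
for ckpt in checkpoints:
    container.load_model_checkpoint(ckpt)  # replaces the container's module in place
    restored.append(container.model)       # load_from_checkpoint returns a fresh instance each time
# `restored` now holds one Lightning module per fold, ready for ensemble inference.
```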