Use registered model for inference

microsoft · dumbledad · Jun 23, 2021 · Jun 23, 2021 · Jun 23, 2021 · Jun 23, 2021
commit 1250de038492201b0fd4ad97ad3841059701d3cb
diff --git a/InnerEye/Azure/azure_config.py b/InnerEye/Azure/azure_config.py
@@ -77,8 +77,8 @@ class AzureConfig(GenericConfig):
     pytest_mark: str = param.String(doc="If provided, run pytest instead of model training. pytest will only "
                                         "run the tests that have the mark given in this argument "
                                         "('--pytest_mark gpu' will run all tests marked with 'pytest.mark.gpu')")
-    run_recovery_id: str = param.String(doc="A run recovery id string in the form 'experiment name:run id'"
-                                            " to use for inference or recovering a model training run.")
+    run_recovery_id: str = param.String(doc="A run recovery id string in the form 'experiment name:run id' "
+                                            "to use for recovering a model training run or to register a model.")
     pretraining_run_recovery_id: str = param.String(default=None,
                                                     allow_None=True,
                                                     doc="Extra run recovery id to download checkpoints from,"
@@ -122,6 +122,8 @@ class AzureConfig(GenericConfig):
     _workspace: Workspace = param.ClassSelector(class_=Workspace,
                                                 doc="The cached workspace object that has been created in the first"
                                                     "call to get_workspace")
+    model_id: str = param.String(doc="A model id string in the form 'model name:version' "
+                                     "to use a registered model for inference.")
 
     def __init__(self, **params: Any) -> None:
         super().__init__(**params)

diff --git a/InnerEye/Common/fixed_paths.py b/InnerEye/Common/fixed_paths.py
@@ -40,9 +40,6 @@ def repository_root_directory(path: Optional[PathOrString] = None) -> Path:
 # The folder at the project root directory that holds datasets for local execution.
 DATASETS_DIR_NAME = "datasets"
 
-# Points to a folder at the project root directory that holds model weights downloaded from URLs.
-MODEL_WEIGHTS_DIR_NAME = "modelweights"
-
 ML_RELATIVE_SOURCE_PATH = os.path.join("ML")
 ML_RELATIVE_RUNNER_PATH = os.path.join(ML_RELATIVE_SOURCE_PATH, "runner.py")
 ML_FULL_SOURCE_FOLDER_PATH = str(repository_root_directory() / ML_RELATIVE_SOURCE_PATH)

diff --git a/InnerEye/ML/deep_learning_config.py b/InnerEye/ML/deep_learning_config.py
@@ -34,7 +34,6 @@
 EXTRA_RUN_SUBFOLDER = "extra_run_id"
 
 ARGS_TXT = "args.txt"
-WEIGHTS_FILE = "weights.pth"
 
 
 @unique
@@ -207,13 +206,12 @@ class WorkflowParams(param.Parameterized):
     perform_validation_and_test_set_inference: bool = \
         param.Boolean(True,
                       doc="If True (default), run full image inference on validation and test set after training.")
-    weights_url: str = param.String(doc="If provided, a url from which weights will be downloaded and used for model "
-                                        "initialization.")
-    local_weights_path: Optional[Path] = param.ClassSelector(class_=Path,
-                                                             default=None,
-                                                             allow_None=True,
-                                                             doc="The path to the weights to use for model "
-                                                                 "initialization, when training outside AzureML.")
+    checkpoint_urls: List[str] = param.List(default=[],
+                                            doc="If provided, a set of urls from which checkpoints will be downloaded "
+                                                "and used for training/inference.")
+    local_checkpoint_paths: List[Path] = param.List(default=[], class_=Path,
+                                                    doc="A list of checkpoint paths to use for training/inference, "
+                                                        "when training is running outside Azure.")
     generate_report: bool = param.Boolean(default=True,
                                           doc="If True (default), write a modelling report in HTML format. If False,"
                                               "do not write that report.")
@@ -239,7 +237,7 @@ class WorkflowParams(param.Parameterized):
                                 "be relative to the repository root directory.")
 
     def validate(self) -> None:
-        if self.weights_url and self.local_weights_path:
+        if self.checkpoint_urls and self.local_checkpoint_paths:
             raise ValueError("Cannot specify both local_weights_path and weights_url.")
 
         if self.number_of_cross_validation_splits == 1:

diff --git a/InnerEye/ML/model_testing.py b/InnerEye/ML/model_testing.py
@@ -35,7 +35,6 @@
 from InnerEye.ML.scalar_config import ScalarModelBase
 from InnerEye.ML.sequence_config import SequenceModelBase
 from InnerEye.ML.utils import io_util, ml_util
-from InnerEye.ML.utils.checkpoint_handling import CheckpointHandler
 from InnerEye.ML.utils.image_util import binaries_from_multi_label_array
 from InnerEye.ML.utils.io_util import ImageHeader, MedicalImageFileType, load_nifti_image, save_lines_to_file
 from InnerEye.ML.utils.metrics_util import MetricsPerPatientWriter
@@ -47,15 +46,15 @@
 
 def model_test(config: ModelConfigBase,
                data_split: ModelExecutionMode,
-               checkpoint_handler: CheckpointHandler,
+               checkpoint_paths: List[Path],
                model_proc: ModelProcessing = ModelProcessing.DEFAULT) -> Optional[InferenceMetrics]:
     """
     Runs model inference on segmentation or classification models, using a given dataset (that could be training,
     test or validation set). The inference results and metrics will be stored and logged in a way that may
     differ for model categories (classification, segmentation).
     :param config: The configuration of the model
     :param data_split: Indicates which of the 3 sets (training, test, or validation) is being processed.
-    :param checkpoint_handler: Checkpoint handler object to find checkpoint paths for model initialization
+    :param checkpoint_paths: Checkpoint paths used to initialize the model.
     :param model_proc: whether we are testing an ensemble or single model; this affects where results are written.
     :return: The metrics that the model achieved on the given data set, or None if the data set is empty.
     """
@@ -67,17 +66,19 @@ def model_test(config: ModelConfigBase,
                         "and additional data loaders are likely to block.")
         return None
     with logging_section(f"Running {model_proc.value} model on {data_split.name.lower()} set"):
+        if not checkpoint_paths:
+            raise ValueError("There were no checkpoints available for model testing.")
         if isinstance(config, SegmentationModelBase):
-            return segmentation_model_test(config, data_split, checkpoint_handler, model_proc)
+            return segmentation_model_test(config, data_split, checkpoint_paths, model_proc)
         if isinstance(config, ScalarModelBase):
-            return classification_model_test(config, data_split, checkpoint_handler, model_proc,
+            return classification_model_test(config, data_split, checkpoint_paths, model_proc,
                                              config.cross_validation_split_index)
     raise ValueError(f"There is no testing code for models of type {type(config)}")
 
 
 def segmentation_model_test(config: SegmentationModelBase,
                             data_split: ModelExecutionMode,
-                            checkpoint_handler: CheckpointHandler,
+                            checkpoint_paths: List[Path],
                             model_proc: ModelProcessing = ModelProcessing.DEFAULT) -> InferenceMetricsForSegmentation:
     """
     The main testing loop for segmentation models.
@@ -88,18 +89,13 @@ def segmentation_model_test(config: SegmentationModelBase,
     :param model_proc: whether we are testing an ensemble or single model
     :return: InferenceMetric object that contains metrics related for all of the checkpoint epochs.
     """
-    checkpoints_to_test = checkpoint_handler.get_checkpoints_to_test()
-
-    if not checkpoints_to_test:
-        raise ValueError("There were no checkpoints available for model testing.")
-
     epoch_results_folder = config.outputs_folder / get_best_epoch_results_path(data_split, model_proc)
     # save the datasets.csv used
     config.write_dataset_files(root=epoch_results_folder)
     epoch_and_split = f"{data_split.value} set"
     epoch_dice_per_image = segmentation_model_test_epoch(config=copy.deepcopy(config),
                                                          data_split=data_split,
-                                                         checkpoint_paths=checkpoints_to_test,
+                                                         checkpoint_paths=checkpoint_paths,
                                                          results_folder=epoch_results_folder,
                                                          epoch_and_split=epoch_and_split)
     if epoch_dice_per_image is None:
@@ -395,7 +391,7 @@ def create_metrics_dict_for_scalar_models(config: ScalarModelBase) -> \
 
 def classification_model_test(config: ScalarModelBase,
                               data_split: ModelExecutionMode,
-                              checkpoint_handler: CheckpointHandler,
+                              checkpoint_paths: List[Path],
                               model_proc: ModelProcessing,
                               cross_val_split_index: int) -> InferenceMetricsForClassification:
     """
@@ -404,16 +400,12 @@ def classification_model_test(config: ScalarModelBase,
     :param config: The model configuration.
     :param data_split: The name of the folder to store the results inside each epoch folder in the outputs_dir,
                        used mainly in model evaluation using different dataset splits.
-    :param checkpoint_handler: Checkpoint handler object to find checkpoint paths for model initialization
+    :param checkpoint_paths: Checkpoint paths to initialize model
     :param model_proc: whether we are testing an ensemble or single model
     :return: InferenceMetricsForClassification object that contains metrics related for all of the checkpoint epochs.
     """
     posthoc_label_transform = config.get_posthoc_label_transform()
 
-    checkpoint_paths = checkpoint_handler.get_checkpoints_to_test()
-    if not checkpoint_paths:
-        raise ValueError("There were no checkpoints available for model testing.")
-
     pipeline = create_inference_pipeline(config=config,
                                          checkpoint_paths=checkpoint_paths)
     if pipeline is None:

diff --git a/InnerEye/ML/model_training.py b/InnerEye/ML/model_training.py
@@ -28,7 +28,6 @@
 from InnerEye.ML.lightning_loggers import AzureMLLogger, StoringLogger
 from InnerEye.ML.lightning_models import SUBJECT_OUTPUT_PER_RANK_PREFIX, ScalarLightning, \
     get_subject_output_file_per_rank
-from InnerEye.ML.utils.checkpoint_handling import CheckpointHandler
 
 TEMP_PREFIX = "temp/"
 
@@ -215,23 +214,21 @@ def start_resource_monitor(config: LightningContainer) -> ResourceMonitor:
     return resource_monitor
 
 
-def model_train(checkpoint_handler: CheckpointHandler,
+def model_train(checkpoint_path: Path,
                 container: LightningContainer,
                 num_nodes: int = 1) -> Tuple[Trainer, Optional[StoringLogger]]:
     """
     The main training loop. It creates the Pytorch model based on the configuration options passed in,
     creates a Pytorch Lightning trainer, and trains the model.
     If a checkpoint was specified, then it loads the checkpoint before resuming training.
-    :param checkpoint_handler: Checkpoint handler object to find checkpoint paths for model initialization
+    :param checkpoint_path: Checkpoint path for model initialization
     :param num_nodes: The number of nodes to use in distributed training.
     :param container: A container object that holds the training data in PyTorch Lightning format
     and the model to train.
     :return: A tuple of [Trainer, StoringLogger]. Trainer is the Lightning Trainer object that was used for fitting
     the model. The StoringLogger object is returned when training an InnerEye built-in model, this is None when
     fitting other models.
     """
-    # Get the path to the checkpoint to recover from
-    checkpoint_path = checkpoint_handler.get_recovery_path_train()
     lightning_model = container.model
 
     resource_monitor: Optional[ResourceMonitor] = None