Upgrade to Pytorch Lightning 1.5.5 (#591)
ant0nsc committed Dec 15, 2021
1 parent 4aa84b9 commit e477c9d
Showing 35 changed files with 323 additions and 201 deletions.
2 changes: 1 addition & 1 deletion .idea/runConfigurations/Template__Run_ML_on_AzureML.xml

Some generated files are not rendered by default.

1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -63,6 +63,7 @@ gets uploaded to AzureML, by skipping all test folders.
- ([#584](https://github.com/microsoft/InnerEye-DeepLearning/pull/584)) SSL models write the optimizer state for the linear head to the checkpoint now.
- ([#594](https://github.com/microsoft/InnerEye-DeepLearning/pull/594)) Pytorch is now non-deterministic by default. Upgrade to AzureML-SDK 1.36
- ([#566](https://github.com/microsoft/InnerEye-DeepLearning/pull/566)) Update `hi-ml` dependency to `hi-ml-azure`.
+- ([#591](https://github.com/microsoft/InnerEye-DeepLearning/pull/591)) Upgrade Pytorch Lightning to 1.5.0
- ([#572](https://github.com/microsoft/InnerEye-DeepLearning/pull/572)) Updated to new version of hi-ml package
- ([#617](https://github.com/microsoft/InnerEye-DeepLearning/pull/617)) Provide an easier way for LightningContainers to add callbacks.
- ([#596](https://github.com/microsoft/InnerEye-DeepLearning/pull/596)) Add `cudatoolkit=11.1` specification to environment.yml.
19 changes: 6 additions & 13 deletions InnerEye/ML/SSL/lightning_modules/ssl_classifier_module.py
@@ -5,10 +5,9 @@
from typing import Any, List, Optional

import torch
-from torchmetrics import Metric
-from pl_bolts.models.self_supervised import SSLEvaluator
from health_ml.utils import log_on_epoch
-from torch.nn import functional as F
+from pl_bolts.models.self_supervised import SSLEvaluator
+from torch.nn import ModuleList, functional as F

from InnerEye.ML.SSL.encoders import get_encoder_output_dim
from InnerEye.ML.dataset.scalar_sample import ScalarItem
@@ -38,18 +37,12 @@ def __init__(self,
n_classes=num_classes,
p=0.20)
if self.num_classes == 2:
-            self.train_metrics: List[Metric] = \
-                [AreaUnderRocCurve(), AreaUnderPrecisionRecallCurve(), Accuracy05()]
-            self.val_metrics: List[Metric] = \
-                [AreaUnderRocCurve(), AreaUnderPrecisionRecallCurve(), Accuracy05()]
+            self.train_metrics = ModuleList([AreaUnderRocCurve(), AreaUnderPrecisionRecallCurve(), Accuracy05()])
+            self.val_metrics = ModuleList([AreaUnderRocCurve(), AreaUnderPrecisionRecallCurve(), Accuracy05()])
        else:
            # Note that for multi-class, Accuracy05 is the standard multi-class accuracy.
-            self.train_metrics = [Accuracy05()]
-            self.val_metrics = [Accuracy05()]
-
-    def on_train_start(self) -> None:
-        for metric in [*self.train_metrics, *self.val_metrics]:
-            metric.to(device=self.device)  # type: ignore
+            self.train_metrics = ModuleList([Accuracy05()])
+            self.val_metrics = ModuleList([Accuracy05()])

def train(self, mode: bool = True) -> Any:
self.classifier_head.train(mode)
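The switch from plain Python lists to torch.nn.ModuleList is what lets this commit delete the on_train_start hook: modules held in a ModuleList are registered as submodules, so Lightning finds them and moves them to the model's device automatically. A minimal sketch in plain PyTorch (no InnerEye imports) of the difference:

import torch
from torch import nn

class WithPlainList(nn.Module):
    def __init__(self) -> None:
        super().__init__()
        self.metrics = [nn.Linear(2, 2)]  # plain list: invisible to .to() and .parameters()

class WithModuleList(nn.Module):
    def __init__(self) -> None:
        super().__init__()
        self.metrics = nn.ModuleList([nn.Linear(2, 2)])  # registered as a submodule

print(len(list(WithPlainList().parameters())))   # 0: the list's contents are not found
print(len(list(WithModuleList().parameters())))  # 2: weight and bias are found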
20 changes: 13 additions & 7 deletions InnerEye/ML/SSL/lightning_modules/ssl_online_evaluator.py
@@ -3,8 +3,6 @@
# Licensed under the MIT License (MIT). See LICENSE in the repo root for license information.
# ------------------------------------------------------------------------------------------

-from typing import Any, Dict, List, Optional, Set, Tuple, Union
-
import pytorch_lightning as pl
import torch
from pl_bolts.callbacks.ssl_online import SSLOnlineEvaluator
@@ -14,8 +12,9 @@
from torch.nn import SyncBatchNorm, functional as F
from torch.nn.parallel import DistributedDataParallel
from torchmetrics import Metric
+from typing import Any, Dict, List, Optional, Set, Tuple, Union

-from InnerEye.ML.SSL.utils import SSLDataModuleType
+from InnerEye.ML.SSL.utils import SSLDataModuleType, add_submodules_to_same_device
from InnerEye.ML.lightning_metrics import Accuracy05, AreaUnderPrecisionRecallCurve, AreaUnderRocCurve
from InnerEye.ML.utils.layer_util import set_model_to_eval_mode
from health_ml.utils import log_on_epoch
@@ -81,10 +80,17 @@ def on_pretrain_routine_start(self, trainer: pl.Trainer, pl_module: pl.Lightning
If training happens via DDP, SyncBatchNorm is enabled for the online evaluator, and it is converted to
a DDP module.
"""
-        for metric in [*self.train_metrics, *self.val_metrics]:
-            metric.to(device=pl_module.device)  # type: ignore
+        for prefix, metrics in [("train", self.train_metrics), ("val", self.val_metrics)]:
+            add_submodules_to_same_device(pl_module, metrics, prefix=prefix)
        self.evaluator.to(pl_module.device)
-        accelerator = trainer.accelerator_connector
+        if hasattr(trainer, "accelerator_connector"):
+            # This works with Lightning 1.3.8
+            accelerator = trainer.accelerator_connector
+        elif hasattr(trainer, "_accelerator_connector"):
+            # This works with Lightning 1.5.5
+            accelerator = trainer._accelerator_connector
+        else:
+            raise ValueError("Unable to retrieve the accelerator information")
if accelerator.is_distributed:
if accelerator.use_ddp:
self.evaluator = SyncBatchNorm.convert_sync_batchnorm(self.evaluator)
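Lightning 1.5 made the trainer's accelerator_connector attribute private (renamed to _accelerator_connector), so the callback probes both names to keep working under 1.3.8 and 1.5.5 alike. An equivalent, more compact lookup, sketched under the assumption that one of the two attributes exists on the trainer:

accelerator = getattr(trainer, "accelerator_connector", None) or \
              getattr(trainer, "_accelerator_connector", None)
if accelerator is None:
    raise ValueError("Unable to retrieve the accelerator information")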
@@ -152,7 +158,7 @@ def on_validation_batch_end(self, trainer: pl.Trainer,
for metric in self.val_metrics:
log_on_epoch(pl_module, f"ssl_online_evaluator/val/{metric.name}", metric)

-    def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx, dataloader_idx) -> None:  # type: ignore
+    def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx) -> None:  # type: ignore
"""
Get and log training metrics, perform network update.
"""
22 changes: 21 additions & 1 deletion InnerEye/ML/SSL/utils.py
@@ -6,7 +6,7 @@
import logging
from enum import Enum
from pathlib import Path
-from typing import Any, Optional
+from typing import Any, Iterable, Optional

import torch
from yacs.config import CfgNode
@@ -119,3 +119,23 @@ def __init__(self, **kwargs: Any) -> None:
n_hidden=None)

return _wrap


+def add_submodules_to_same_device(module: torch.nn.Module,
+                                  submodules: Iterable[torch.nn.Module],
+                                  prefix: str = "") -> None:
+    """
+    Adds each of the given submodules to the "main" module, and moves them to the same device as the "main"
+    module. The submodules get a name derived from their class name, with the given prefix.
+    :param module: The module to which submodules should be added.
+    :param submodules: The submodules to add.
+    :param prefix: A string prefix that will be used to create the name of the submodule.
+    """
+
+    def _class_name(o: Any) -> str:
+        return type(o).__name__
+
+    for m in submodules:
+        m.to(device=module.device)  # type: ignore
+        module.add_module(f"{prefix}{_class_name(m)}", m)
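Note that a plain torch.nn.Module has no device attribute (hence the type: ignore); the helper assumes the "main" module exposes one, as pl.LightningModule does. A hypothetical usage sketch with stand-in modules in place of the InnerEye metrics:

import pytorch_lightning as pl
import torch

class TinyModule(pl.LightningModule):
    def __init__(self) -> None:
        super().__init__()
        self.layer = torch.nn.Linear(4, 2)

module = TinyModule()
add_submodules_to_same_device(module, [torch.nn.ReLU(), torch.nn.Tanh()], prefix="val_")
# The submodules are now registered as module.val_ReLU and module.val_Tanh,
# and live on the same device as the rest of the module.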
50 changes: 41 additions & 9 deletions InnerEye/ML/baselines_util.py
@@ -32,8 +32,10 @@
REGRESSION_TEST_AZUREML_FOLDER = "AZUREML_OUTPUT"
REGRESSION_TEST_AZUREML_PARENT_FOLDER = "AZUREML_PARENT_OUTPUT"
CONTENTS_MISMATCH = "Contents mismatch"
+FILE_FORMAT_ERROR = "File format error"
MISSING_FILE = "Missing"
-TEXT_FILE_SUFFIXES = [".txt", ".csv", ".json", ".html", ".md"]
+CSV_SUFFIX = ".csv"
+TEXT_FILE_SUFFIXES = [".txt", ".json", ".html", ".md"]

INFERENCE_DISABLED_WARNING = "Not performing comparison of model against baseline(s), because inference is currently " \
"disabled. If comparison is required, use either the inference_on_test_set or " \
@@ -185,7 +187,7 @@ def get_comparison_baselines(outputs_folder: Path, azure_config: AzureConfig,
return comparison_baselines


-def compare_files(expected: Path, actual: Path) -> str:
+def compare_files(expected: Path, actual: Path, csv_relative_tolerance: float = 0.0) -> str:
"""
Compares two individual files for regression testing. It returns an empty string if the two files appear identical.
    If the files are not identical, an error message with details is returned. This handles known text file formats,
@@ -195,16 +197,35 @@ def compare_files(expected: Path, actual: Path, csv_relative_tolerance: float = 0.0) -> str:
:param expected: A file that contains the expected contents. The type of comparison (text or binary) is chosen
based on the extension of this file.
:param actual: A file that contains the actual contents.
+    :param csv_relative_tolerance: When comparing CSV files, use this as the maximum allowed relative discrepancy.
+    If 0.0, do not allow any discrepancy.
:return: An empty string if the files appear identical, or otherwise an error message with details.
"""

def print_lines(prefix: str, lines: List[str]) -> None:
num_lines = len(lines)
count = min(5, num_lines)
logging.debug(f"{prefix} {num_lines} lines, first {count} of those:")
logging.debug(os.linesep.join(lines[:count]))
logging.info(f"{prefix} {num_lines} lines, first {count} of those:")
logging.info(os.linesep.join(lines[:count]))

-    if expected.suffix in TEXT_FILE_SUFFIXES:
+    def try_read_csv(prefix: str, file: Path) -> Optional[pd.DataFrame]:
+        try:
+            return pd.read_csv(file)
+        except Exception as ex:
+            logging.info(f"{prefix} file can't be read as CSV: {str(ex)}")
+            return None
+
+    if expected.suffix == CSV_SUFFIX:
+        expected_df = try_read_csv("Expected", expected)
+        actual_df = try_read_csv("Actual", actual)
+        if expected_df is None or actual_df is None:
+            return FILE_FORMAT_ERROR
+        try:
+            pd.testing.assert_frame_equal(actual_df, expected_df, rtol=csv_relative_tolerance)
+        except Exception as ex:
+            logging.info(str(ex))
+            return CONTENTS_MISMATCH
+    elif expected.suffix in TEXT_FILE_SUFFIXES:
# Compare line-by-line to avoid issues with line separators
expected_lines = expected.read_text().splitlines()
actual_lines = actual.read_text().splitlines()
@@ -216,12 +237,13 @@ def print_lines(prefix: str, lines: List[str]) -> None:
expected_binary = expected.read_bytes()
actual_binary = actual.read_bytes()
if expected_binary != actual_binary:
logging.debug(f"Expected {len(expected_binary)} bytes, actual {len(actual_binary)} bytes")
logging.info(f"Expected {len(expected_binary)} bytes, actual {len(actual_binary)} bytes")
return CONTENTS_MISMATCH
return ""


def compare_folder_contents(expected_folder: Path,
+                            csv_relative_tolerance: float,
actual_folder: Optional[Path] = None,
run: Optional[Run] = None) -> List[str]:
"""
@@ -230,9 +252,12 @@
(or the AzureML run), with exactly the same contents, in the same folder structure.
For example, if there is a file "<expected>/foo/bar/contents.txt", then there must also be a file
"<actual>/foo/bar/contents.txt"
:param expected_folder: A folder with files that are expected to be present.
:param actual_folder: The output folder with the actually produced files.
:param run: An AzureML run
+    :param csv_relative_tolerance: When comparing CSV files, use this as the maximum allowed relative discrepancy.
+    If 0.0, do not allow any discrepancy.
    :return: A list of human-readable error messages, with message and file path. If no errors are found, the list is
empty.
"""
@@ -256,7 +281,8 @@
run.download_file(name=str(file_relative), output_file_path=str(actual_file))
else:
raise ValueError("One of the two arguments run, actual_folder must be provided.")
-        message = compare_files(expected=file, actual=actual_file) if actual_file.exists() else MISSING_FILE
+        message = compare_files(expected=file, actual=actual_file,
+                                csv_relative_tolerance=csv_relative_tolerance) if actual_file.exists() else MISSING_FILE
if message:
messages.append(f"{message}: {file_relative}")
logging.info(f"File {file_relative}: {message or 'OK'}")
@@ -265,15 +291,18 @@
return messages


-def compare_folders_and_run_outputs(expected: Path, actual: Path) -> None:
+def compare_folders_and_run_outputs(expected: Path, actual: Path, csv_relative_tolerance: float) -> None:
"""
Compares the actual set of run outputs in the `actual` folder against an expected set of files in the `expected`
folder. The `expected` folder can have two special subfolders AZUREML_OUTPUT and AZUREML_PARENT_OUTPUT, that
contain files that are expected to be present in the AzureML run context of the present run (AZUREML_OUTPUT)
or the run context of the parent run (AZUREML_PARENT_OUTPUT).
If a file is missing, or does not have the expected contents, an exception is raised.
:param expected: A folder with files that are expected to be present.
:param actual: The output folder with the actually produced files.
+    :param csv_relative_tolerance: When comparing CSV files, use this as the maximum allowed relative discrepancy.
+    If 0.0, do not allow any discrepancy.
"""
if not expected.is_dir():
raise ValueError(f"Folder with expected files does not exist: {expected}")
@@ -289,7 +318,10 @@ if actual_folder is None and run_to_compare is None:
if actual_folder is None and run_to_compare is None:
raise ValueError(f"The set of expected test results in {expected} contains a folder "
f"{subfolder}, but there is no (parent) run to compare against.")
-        new_messages = compare_folder_contents(folder, actual_folder=actual_folder, run=run_to_compare)
+        new_messages = compare_folder_contents(folder,
+                                               actual_folder=actual_folder,
+                                               run=run_to_compare,
+                                               csv_relative_tolerance=csv_relative_tolerance)
if new_messages:
messages.append(f"Issues in {message_prefix}:")
messages.extend(new_messages)
2 changes: 1 addition & 1 deletion InnerEye/ML/configs/other/HelloContainer.py
@@ -8,7 +8,7 @@
import numpy as np
import torch
from pytorch_lightning import LightningDataModule, LightningModule
-from pytorch_lightning.metrics import MeanAbsoluteError
+from torchmetrics.regression import MeanAbsoluteError
from torch.optim import Adam, Optimizer
from torch.optim.lr_scheduler import StepLR, _LRScheduler
from torch.utils.data import DataLoader, Dataset
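This one-line change is needed because the pytorch_lightning.metrics namespace, deprecated when the metrics were split out into the standalone torchmetrics package, no longer exists in Lightning 1.5; the metric itself behaves the same. For example:

import torch
from torchmetrics.regression import MeanAbsoluteError

mae = MeanAbsoluteError()
mae.update(torch.tensor([2.0, 3.0]), torch.tensor([1.0, 5.0]))
print(mae.compute())  # tensor(1.5000)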
8 changes: 6 additions & 2 deletions InnerEye/ML/deep_learning_config.py
@@ -243,6 +243,10 @@ class WorkflowParams(param.Parameterized):
"folder, and their contents must match exactly. When running in AzureML, you need to "
"ensure that this folder is part of the snapshot that gets uploaded. The path should "
"be relative to the repository root directory.")
+    regression_test_csv_tolerance: float = \
+        param.Number(default=0.0, allow_None=False,
+                     doc="When comparing CSV files during regression tests, use this value as the maximum allowed "
+                         "relative difference of actual and expected results. Default: 0.0 (must match exactly)")

def validate(self) -> None:
if sum([bool(param) for param in [self.weights_url, self.local_weights_path, self.model_id]]) > 1:
@@ -583,7 +587,7 @@ class TrainerParams(param.Parameterized):
param.Boolean(default=False,
doc="Controls the PyTorch Lightning trainer flags 'deterministic' and 'benchmark'. If "
"'pl_deterministic' is True, results are perfectly reproducible. If False, they are not, but "
"you may see training speed increases.")
"you may see significant training speed increases.")
pl_find_unused_parameters: bool = \
param.Boolean(default=False,
doc="Controls the PyTorch Lightning flag 'find_unused_parameters' for the DDP plugin. "
@@ -606,7 +610,7 @@
monitor_gpu: bool = param.Boolean(default=False,
doc="If True, add the GPUStatsMonitor callback to the Lightning trainer object. "
"This will write GPU utilization metrics every 50 batches by default.")
-    monitor_loading: bool = param.Boolean(default=True,
+    monitor_loading: bool = param.Boolean(default=False,
doc="If True, add the BatchTimeCallback callback to the Lightning trainer "
"object. This will monitor how long individual batches take to load.")

