Upgrade to Pytorch Lightning 1.5.5 (#591)
ant0nsc committed Dec 15, 2021
1 parent 4aa84b9 commit e477c9d
Showing 35 changed files with 323 additions and 201 deletions.
2 changes: 1 addition & 1 deletion .idea/runConfigurations/Template__Run_ML_on_AzureML.xml

Some generated files are not rendered by default.

1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -63,6 +63,7 @@ gets uploaded to AzureML, by skipping all test folders.
- ([#584](https://github.com/microsoft/InnerEye-DeepLearning/pull/584)) SSL models write the optimizer state for the linear head to the checkpoint now.
- ([#594](https://github.com/microsoft/InnerEye-DeepLearning/pull/594)) Pytorch is now non-deterministic by default. Upgrade to AzureML-SDK 1.36
- ([#566](https://github.com/microsoft/InnerEye-DeepLearning/pull/566)) Update `hi-ml` dependency to `hi-ml-azure`.
+- ([#591](https://github.com/microsoft/InnerEye-DeepLearning/pull/591)) Upgrade Pytorch Lightning to 1.5.0
- ([#572](https://github.com/microsoft/InnerEye-DeepLearning/pull/572)) Updated to new version of hi-ml package
- ([#617](https://github.com/microsoft/InnerEye-DeepLearning/pull/617)) Provide an easier way for LightningContainers to add callbacks.
- ([#596](https://github.com/microsoft/InnerEye-DeepLearning/pull/596)) Add `cudatoolkit=11.1` specification to environment.yml.
19 changes: 6 additions & 13 deletions InnerEye/ML/SSL/lightning_modules/ssl_classifier_module.py
@@ -5,10 +5,9 @@
from typing import Any, List, Optional

import torch
-from torchmetrics import Metric
-from pl_bolts.models.self_supervised import SSLEvaluator
from health_ml.utils import log_on_epoch
-from torch.nn import functional as F
+from pl_bolts.models.self_supervised import SSLEvaluator
+from torch.nn import ModuleList, functional as F

from InnerEye.ML.SSL.encoders import get_encoder_output_dim
from InnerEye.ML.dataset.scalar_sample import ScalarItem
@@ -38,18 +37,12 @@ def __init__(self,
n_classes=num_classes,
p=0.20)
if self.num_classes == 2:
-            self.train_metrics: List[Metric] = \
-                [AreaUnderRocCurve(), AreaUnderPrecisionRecallCurve(), Accuracy05()]
-            self.val_metrics: List[Metric] = \
-                [AreaUnderRocCurve(), AreaUnderPrecisionRecallCurve(), Accuracy05()]
+            self.train_metrics = ModuleList([AreaUnderRocCurve(), AreaUnderPrecisionRecallCurve(), Accuracy05()])
+            self.val_metrics = ModuleList([AreaUnderRocCurve(), AreaUnderPrecisionRecallCurve(), Accuracy05()])
        else:
            # Note that for multi-class, Accuracy05 is the standard multi-class accuracy.
-            self.train_metrics = [Accuracy05()]
-            self.val_metrics = [Accuracy05()]
-
-    def on_train_start(self) -> None:
-        for metric in [*self.train_metrics, *self.val_metrics]:
-            metric.to(device=self.device)  # type: ignore
+            self.train_metrics = ModuleList([Accuracy05()])
+            self.val_metrics = ModuleList([Accuracy05()])

def train(self, mode: bool = True) -> Any:
self.classifier_head.train(mode)
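The switch from plain Python lists to torch.nn.ModuleList is what lets this commit delete the on_train_start hook: modules held in a ModuleList are registered as submodules, so Lightning finds them and moves them to the model's device automatically. A minimal sketch in plain PyTorch (no InnerEye imports) of the difference:

import torch
from torch import nn

class WithPlainList(nn.Module):
    def __init__(self) -> None:
        super().__init__()
        self.metrics = [nn.Linear(2, 2)]  # plain list: invisible to .to() and .parameters()

class WithModuleList(nn.Module):
    def __init__(self) -> None:
        super().__init__()
        self.metrics = nn.ModuleList([nn.Linear(2, 2)])  # registered as a submodule

print(len(list(WithPlainList().parameters())))   # 0: the list's contents are not found
print(len(list(WithModuleList().parameters())))  # 2: weight and bias are found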
20 changes: 13 additions & 7 deletions InnerEye/ML/SSL/lightning_modules/ssl_online_evaluator.py
@@ -3,8 +3,6 @@
# Licensed under the MIT License (MIT). See LICENSE in the repo root for license information.
# ------------------------------------------------------------------------------------------

-from typing import Any, Dict, List, Optional, Set, Tuple, Union
-
import pytorch_lightning as pl
import torch
from pl_bolts.callbacks.ssl_online import SSLOnlineEvaluator
@@ -14,8 +12,9 @@
from torch.nn import SyncBatchNorm, functional as F
from torch.nn.parallel import DistributedDataParallel
from torchmetrics import Metric
+from typing import Any, Dict, List, Optional, Set, Tuple, Union

-from InnerEye.ML.SSL.utils import SSLDataModuleType
+from InnerEye.ML.SSL.utils import SSLDataModuleType, add_submodules_to_same_device
from InnerEye.ML.lightning_metrics import Accuracy05, AreaUnderPrecisionRecallCurve, AreaUnderRocCurve
from InnerEye.ML.utils.layer_util import set_model_to_eval_mode
from health_ml.utils import log_on_epoch
@@ -81,10 +80,17 @@ def on_pretrain_routine_start(self, trainer: pl.Trainer, pl_module: pl.Lightning
If training happens via DDP, SyncBatchNorm is enabled for the online evaluator, and it is converted to
a DDP module.
"""
-        for metric in [*self.train_metrics, *self.val_metrics]:
-            metric.to(device=pl_module.device)  # type: ignore
+        for prefix, metrics in [("train", self.train_metrics), ("val", self.val_metrics)]:
+            add_submodules_to_same_device(pl_module, metrics, prefix=prefix)
        self.evaluator.to(pl_module.device)
-        accelerator = trainer.accelerator_connector
+        if hasattr(trainer, "accelerator_connector"):
+            # This works with Lightning 1.3.8
+            accelerator = trainer.accelerator_connector
+        elif hasattr(trainer, "_accelerator_connector"):
+            # This works with Lightning 1.5.5
+            accelerator = trainer._accelerator_connector
+        else:
+            raise ValueError("Unable to retrieve the accelerator information")
if accelerator.is_distributed:
if accelerator.use_ddp:
self.evaluator = SyncBatchNorm.convert_sync_batchnorm(self.evaluator)
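Lightning 1.5 made the trainer's accelerator_connector attribute private (renamed to _accelerator_connector), so the callback probes both names to keep working under 1.3.8 and 1.5.5 alike. An equivalent, more compact lookup, sketched under the assumption that one of the two attributes exists on the trainer:

accelerator = getattr(trainer, "accelerator_connector", None) or \
              getattr(trainer, "_accelerator_connector", None)
if accelerator is None:
    raise ValueError("Unable to retrieve the accelerator information")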
@@ -152,7 +158,7 @@ def on_validation_batch_end(self, trainer: pl.Trainer,
for metric in self.val_metrics:
log_on_epoch(pl_module, f"ssl_online_evaluator/val/{metric.name}", metric)

-    def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx, dataloader_idx) -> None:  # type: ignore
+    def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx) -> None:  # type: ignore
"""
Get and log training metrics, perform network update.
"""
22 changes: 21 additions & 1 deletion InnerEye/ML/SSL/utils.py
@@ -6,7 +6,7 @@
import logging
from enum import Enum
from pathlib import Path
-from typing import Any, Optional
+from typing import Any, Iterable, Optional

import torch
from yacs.config import CfgNode
@@ -119,3 +119,23 @@ def __init__(self, **kwargs: Any) -> None:
n_hidden=None)

return _wrap


+def add_submodules_to_same_device(module: torch.nn.Module,
+                                  submodules: Iterable[torch.nn.Module],
+                                  prefix: str = "") -> None:
+    """
+    Adds each of the given submodules to the "main" module, and moves them to the same device as the "main"
+    module. The submodules get a name derived from their class name, with the given prefix.
+    :param module: The module to which submodules should be added.
+    :param submodules: The submodules to add.
+    :param prefix: A string prefix that will be used to create the name of the submodule.
+    """
+
+    def _class_name(o: Any) -> str:
+        return type(o).__name__
+
+    for m in submodules:
+        m.to(device=module.device)  # type: ignore
+        module.add_module(f"{prefix}{_class_name(m)}", m)
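Note that a plain torch.nn.Module has no device attribute (hence the type: ignore); the helper assumes the "main" module exposes one, as pl.LightningModule does. A hypothetical usage sketch with stand-in modules in place of the InnerEye metrics:

import pytorch_lightning as pl
import torch

class TinyModule(pl.LightningModule):
    def __init__(self) -> None:
        super().__init__()
        self.layer = torch.nn.Linear(4, 2)

module = TinyModule()
add_submodules_to_same_device(module, [torch.nn.ReLU(), torch.nn.Tanh()], prefix="val_")
# The submodules are now registered as module.val_ReLU and module.val_Tanh,
# and live on the same device as the rest of the module.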
50 changes: 41 additions & 9 deletions InnerEye/ML/baselines_util.py
@@ -32,8 +32,10 @@
REGRESSION_TEST_AZUREML_FOLDER = "AZUREML_OUTPUT"
REGRESSION_TEST_AZUREML_PARENT_FOLDER = "AZUREML_PARENT_OUTPUT"
CONTENTS_MISMATCH = "Contents mismatch"
+FILE_FORMAT_ERROR = "File format error"
MISSING_FILE = "Missing"
-TEXT_FILE_SUFFIXES = [".txt", ".csv", ".json", ".html", ".md"]
+CSV_SUFFIX = ".csv"
+TEXT_FILE_SUFFIXES = [".txt", ".json", ".html", ".md"]

INFERENCE_DISABLED_WARNING = "Not performing comparison of model against baseline(s), because inference is currently " \
"disabled. If comparison is required, use either the inference_on_test_set or " \
@@ -185,7 +187,7 @@ def get_comparison_baselines(outputs_folder: Path, azure_config: AzureConfig,
return comparison_baselines


-def compare_files(expected: Path, actual: Path) -> str:
+def compare_files(expected: Path, actual: Path, csv_relative_tolerance: float = 0.0) -> str:
"""
Compares two individual files for regression testing. It returns an empty string if the two files appear identical.
    If the files are not identical, an error message with details is returned. This handles known text file formats,
@@ -195,16 +197,35 @@ def compare_files(expected: Path, actual: Path, csv_relative_tolerance: float = 0.0) -> str:
:param expected: A file that contains the expected contents. The type of comparison (text or binary) is chosen
based on the extension of this file.
:param actual: A file that contains the actual contents.
+    :param csv_relative_tolerance: When comparing CSV files, use this as the maximum allowed relative discrepancy.
+    If 0.0, do not allow any discrepancy.
:return: An empty string if the files appear identical, or otherwise an error message with details.
"""

def print_lines(prefix: str, lines: List[str]) -> None:
num_lines = len(lines)
count = min(5, num_lines)
logging.debug(f"{prefix} {num_lines} lines, first {count} of those:")
logging.debug(os.linesep.join(lines[:count]))
logging.info(f"{prefix} {num_lines} lines, first {count} of those:")
logging.info(os.linesep.join(lines[:count]))

-    if expected.suffix in TEXT_FILE_SUFFIXES:
+    def try_read_csv(prefix: str, file: Path) -> Optional[pd.DataFrame]:
+        try:
+            return pd.read_csv(file)
+        except Exception as ex:
+            logging.info(f"{prefix} file can't be read as CSV: {str(ex)}")
+            return None
+
+    if expected.suffix == CSV_SUFFIX:
+        expected_df = try_read_csv("Expected", expected)
+        actual_df = try_read_csv("Actual", actual)
+        if expected_df is None or actual_df is None:
+            return FILE_FORMAT_ERROR
+        try:
+            pd.testing.assert_frame_equal(actual_df, expected_df, rtol=csv_relative_tolerance)
+        except Exception as ex:
+            logging.info(str(ex))
+            return CONTENTS_MISMATCH
+    elif expected.suffix in TEXT_FILE_SUFFIXES:
# Compare line-by-line to avoid issues with line separators
expected_lines = expected.read_text().splitlines()
actual_lines = actual.read_text().splitlines()
@@ -216,12 +237,13 @@ def print_lines(prefix: str, lines: List[str]) -> None:
expected_binary = expected.read_bytes()
actual_binary = actual.read_bytes()
if expected_binary != actual_binary:
logging.debug(f"Expected {len(expected_binary)} bytes, actual {len(actual_binary)} bytes")
logging.info(f"Expected {len(expected_binary)} bytes, actual {len(actual_binary)} bytes")
return CONTENTS_MISMATCH
return ""


def compare_folder_contents(expected_folder: Path,
+                            csv_relative_tolerance: float,
actual_folder: Optional[Path] = None,
run: Optional[Run] = None) -> List[str]:
"""
@@ -230,9 +252,12 @@
(or the AzureML run), with exactly the same contents, in the same folder structure.
For example, if there is a file "<expected>/foo/bar/contents.txt", then there must also be a file
"<actual>/foo/bar/contents.txt"
:param expected_folder: A folder with files that are expected to be present.
:param actual_folder: The output folder with the actually produced files.
:param run: An AzureML run
+    :param csv_relative_tolerance: When comparing CSV files, use this as the maximum allowed relative discrepancy.
+    If 0.0, do not allow any discrepancy.
    :return: A list of human-readable error messages, with message and file path. If no errors are found, the list is
empty.
"""
@@ -256,7 +281,8 @@
run.download_file(name=str(file_relative), output_file_path=str(actual_file))
else:
raise ValueError("One of the two arguments run, actual_folder must be provided.")
-        message = compare_files(expected=file, actual=actual_file) if actual_file.exists() else MISSING_FILE
+        message = compare_files(expected=file, actual=actual_file,
+                                csv_relative_tolerance=csv_relative_tolerance) if actual_file.exists() else MISSING_FILE
if message:
messages.append(f"{message}: {file_relative}")
logging.info(f"File {file_relative}: {message or 'OK'}")
@@ -265,15 +291,18 @@
return messages


-def compare_folders_and_run_outputs(expected: Path, actual: Path) -> None:
+def compare_folders_and_run_outputs(expected: Path, actual: Path, csv_relative_tolerance: float) -> None:
"""
Compares the actual set of run outputs in the `actual` folder against an expected set of files in the `expected`
folder. The `expected` folder can have two special subfolders AZUREML_OUTPUT and AZUREML_PARENT_OUTPUT, that
contain files that are expected to be present in the AzureML run context of the present run (AZUREML_OUTPUT)
or the run context of the parent run (AZUREML_PARENT_OUTPUT).
If a file is missing, or does not have the expected contents, an exception is raised.
:param expected: A folder with files that are expected to be present.
:param actual: The output folder with the actually produced files.
+    :param csv_relative_tolerance: When comparing CSV files, use this as the maximum allowed relative discrepancy.
+    If 0.0, do not allow any discrepancy.
"""
if not expected.is_dir():
raise ValueError(f"Folder with expected files does not exist: {expected}")
@@ -289,7 +318,10 @@ if actual_folder is None and run_to_compare is None:
if actual_folder is None and run_to_compare is None:
raise ValueError(f"The set of expected test results in {expected} contains a folder "
f"{subfolder}, but there is no (parent) run to compare against.")
-        new_messages = compare_folder_contents(folder, actual_folder=actual_folder, run=run_to_compare)
+        new_messages = compare_folder_contents(folder,
+                                               actual_folder=actual_folder,
+                                               run=run_to_compare,
+                                               csv_relative_tolerance=csv_relative_tolerance)
if new_messages:
messages.append(f"Issues in {message_prefix}:")
messages.extend(new_messages)
2 changes: 1 addition & 1 deletion InnerEye/ML/configs/other/HelloContainer.py
@@ -8,7 +8,7 @@
import numpy as np
import torch
from pytorch_lightning import LightningDataModule, LightningModule
-from pytorch_lightning.metrics import MeanAbsoluteError
+from torchmetrics.regression import MeanAbsoluteError
from torch.optim import Adam, Optimizer
from torch.optim.lr_scheduler import StepLR, _LRScheduler
from torch.utils.data import DataLoader, Dataset
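This one-line change is needed because the pytorch_lightning.metrics namespace, deprecated when the metrics were split out into the standalone torchmetrics package, no longer exists in Lightning 1.5; the metric itself behaves the same. For example:

import torch
from torchmetrics.regression import MeanAbsoluteError

mae = MeanAbsoluteError()
mae.update(torch.tensor([2.0, 3.0]), torch.tensor([1.0, 5.0]))
print(mae.compute())  # tensor(1.5000)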
8 changes: 6 additions & 2 deletions InnerEye/ML/deep_learning_config.py
@@ -243,6 +243,10 @@ class WorkflowParams(param.Parameterized):
"folder, and their contents must match exactly. When running in AzureML, you need to "
"ensure that this folder is part of the snapshot that gets uploaded. The path should "
"be relative to the repository root directory.")
+    regression_test_csv_tolerance: float = \
+        param.Number(default=0.0, allow_None=False,
+                     doc="When comparing CSV files during regression tests, use this value as the maximum allowed "
+                         "relative difference of actual and expected results. Default: 0.0 (must match exactly)")

def validate(self) -> None:
if sum([bool(param) for param in [self.weights_url, self.local_weights_path, self.model_id]]) > 1:
@@ -583,7 +587,7 @@ class TrainerParams(param.Parameterized):
param.Boolean(default=False,
doc="Controls the PyTorch Lightning trainer flags 'deterministic' and 'benchmark'. If "
"'pl_deterministic' is True, results are perfectly reproducible. If False, they are not, but "
"you may see training speed increases.")
"you may see significant training speed increases.")
pl_find_unused_parameters: bool = \
param.Boolean(default=False,
doc="Controls the PyTorch Lightning flag 'find_unused_parameters' for the DDP plugin. "
@@ -606,7 +610,7 @@
monitor_gpu: bool = param.Boolean(default=False,
doc="If True, add the GPUStatsMonitor callback to the Lightning trainer object. "
"This will write GPU utilization metrics every 50 batches by default.")
-    monitor_loading: bool = param.Boolean(default=True,
+    monitor_loading: bool = param.Boolean(default=False,
doc="If True, add the BatchTimeCallback callback to the Lightning trainer "
"object. This will monitor how long individual batches take to load.")

