Upgrade to PyTorch 1.10 (#585)
ant0nsc committed Jan 10, 2022
1 parent (ac6a312) · commit 1523882
Showing 14 changed files with 73 additions and 60 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
```diff
@@ -42,6 +42,7 @@ jobs that run in AzureML.

 ### Changed
 - ([#588](https://github.com/microsoft/InnerEye-DeepLearning/pull/588)) Replace SciPy with PIL.PngImagePlugin.PngImageFile to load png files.
+- ([#585](https://github.com/microsoft/InnerEye-DeepLearning/pull/585)) Switching to PyTorch 1.10.0 and torchvision 0.11.1
 - ([#576](https://github.com/microsoft/InnerEye-DeepLearning/pull/576)) The console output is no longer written to stdout.txt because AzureML handles that better now
 - ([#531](https://github.com/microsoft/InnerEye-DeepLearning/pull/531)) Updated PL to 1.3.8, torchmetrics and pl-bolts and changed relevant metrics and SSL code API.
 - ([#555](https://github.com/microsoft/InnerEye-DeepLearning/pull/555)) Make the SSLContainer compatible with new datasets
```
1 change: 1 addition & 0 deletions InnerEye/ML/configs/classification/DummyClassification.py
```diff
@@ -34,6 +34,7 @@ def __init__(self) -> None:
         self.expected_image_size_zyx = (4, 5, 7)
         # Trying to run DDP from the test suite hangs, hence restrict to single GPU.
         self.max_num_gpus = 1
+        self.pl_deterministic = True

     def get_model_train_test_dataset_splits(self, dataset_df: pd.DataFrame) -> DatasetSplits:
         return DatasetSplits.from_proportions(
```
```diff
@@ -34,6 +34,7 @@ def __init__(self) -> None:
         self.expected_image_size_zyx = (4, 5, 7)
         # Trying to run DDP from the test suite hangs, hence restrict to single GPU.
         self.max_num_gpus = 1
+        self.pl_deterministic = True

     def get_model_train_test_dataset_splits(self, dataset_df: pd.DataFrame) -> DatasetSplits:
         return DatasetSplits.from_proportions(
```
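The `pl_deterministic = True` line added in the two config hunks above (and in several test configs further down) is what keeps the hard-coded regression values in the updated tests comparable across runs. Assuming the flag is forwarded to the PyTorch Lightning `Trainer`, which is the usual wiring for such config options, a minimal sketch of what it turns on:

```python
# Minimal sketch, assuming pl_deterministic maps to Trainer(deterministic=True).
# Depending on the Lightning version, this is backed by cuDNN's deterministic
# mode or torch.use_deterministic_algorithms, so nondeterministic kernels are
# avoided (or raise) instead of silently diverging between runs.
from pytorch_lightning import Trainer, seed_everything

seed_everything(42, workers=True)      # fix RNG state for python, numpy and torch
trainer = Trainer(deterministic=True)  # request deterministic kernels
```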
2 changes: 0 additions & 2 deletions InnerEye/ML/models/architectures/sequential/gru.py
```diff
@@ -37,10 +37,8 @@ def __init__(self, input_size: int, hidden_size: int, use_layer_norm: bool = Fal
         self.ln_n = nn.LayerNorm(self.hidden_size) if use_layer_norm else Identity()

     def forward(self, input: torch.Tensor, hx: Optional[torch.Tensor] = None) -> torch.Tensor:  # type: ignore
-        self.check_forward_input(input)
         if hx is None:
             hx = input.new_zeros(size=(input.size(0), self.hidden_size), requires_grad=False)
-        self.check_forward_hidden(input, hx)

         ih = input.mm(self.weight_ih.t())
         hh = hx.mm(self.weight_hh.t())
```
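Context for the two deleted calls: `check_forward_input` and `check_forward_hidden` were private helpers on `nn.RNNCellBase` that PyTorch 1.10 no longer provides, so the custom GRU cell has to drop them. If equivalent shape validation were still wanted, it could be inlined along these lines (a hypothetical helper, not part of this commit):

```python
import torch

def check_gru_shapes(input: torch.Tensor, hx: torch.Tensor,
                     input_size: int, hidden_size: int) -> None:
    # Replicates the checks the removed nn.RNNCellBase helpers performed.
    if input.size(1) != input_size:
        raise RuntimeError(f"input has inconsistent size(1): got {input.size(1)}, expected {input_size}")
    if input.size(0) != hx.size(0):
        raise RuntimeError(f"batch size mismatch: input {input.size(0)} vs hidden {hx.size(0)}")
    if hx.size(1) != hidden_size:
        raise RuntimeError(f"hidden has inconsistent size(1): got {hx.size(1)}, expected {hidden_size}")
```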
9 changes: 6 additions & 3 deletions InnerEye/ML/visualizers/model_summary.py
```diff
@@ -11,7 +11,7 @@
 import numpy as np
 import torch
 from torch.utils.hooks import RemovableHandle
-from torchprof.profile import Profile
+import torch.profiler as profiler

 from InnerEye.Common.common_util import logging_only_to_file
 from InnerEye.Common.fixed_paths import DEFAULT_MODEL_SUMMARIES_DIR_PATH
@@ -189,12 +189,15 @@ def print_summary() -> None:

         # Register the forward-pass hooks, profile the model, and restore its state
         self.model.apply(self._register_hook)
-        with Profile(self.model, use_cuda=self.use_gpu) as prof:
+        activities = [profiler.ProfilerActivity.CPU]
+        if self.use_gpu:
+            activities.append(profiler.ProfilerActivity.CUDA)
+        with profiler.profile(activities=activities, record_shapes=True) as prof:
             forward_preserve_state(self.model, input_tensors)  # type: ignore

         # Log the model summary: tensor shapes, num of parameters, memory requirement, and forward pass time
         logging.info(self.model)
-        logging.info('\n' + prof.display(show_events=False))
+        logging.info('\n' + prof.key_averages().table())
         print_summary()

         # Remove the hooks via handles
```
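This swaps the third-party torchprof package for the profiler that ships with PyTorch (`torch.profiler`, available since 1.8.1), which is why the dependency can be dropped from environment.yml further down. A self-contained sketch of the new pattern (the model and input here are placeholders):

```python
import torch
import torch.profiler as profiler

model = torch.nn.Linear(8, 4)
inputs = torch.randn(2, 8)

# Profile CPU always, and CUDA only when a GPU is actually in use.
activities = [profiler.ProfilerActivity.CPU]
if torch.cuda.is_available():
    activities.append(profiler.ProfilerActivity.CUDA)

with profiler.profile(activities=activities, record_shapes=True) as prof:
    model(inputs)

# key_averages() aggregates profiler events per operator; table() formats them as text.
print(prof.key_averages().table())
```

Unlike torchprof's `Profile(model, ...)`, `torch.profiler.profile` does not wrap a module; it records whatever runs inside the context, which is why the forward pass is invoked inside the `with` block.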
1 change: 1 addition & 0 deletions Tests/ML/configs/ClassificationModelForTesting.py
```diff
@@ -35,6 +35,7 @@ def __init__(self, conv_in_3d: bool = True, mean_teacher_model: bool = False) ->
         self.conv_in_3d = conv_in_3d
         # Trying to run DDP from the test suite hangs, hence restrict to single GPU.
         self.max_num_gpus = 1
+        self.pl_deterministic = True

     def get_model_train_test_dataset_splits(self, dataset_df: pd.DataFrame) -> DatasetSplits:
         return DatasetSplits.from_proportions(
```
1 change: 1 addition & 0 deletions Tests/ML/configs/ClassificationModelForTesting2D.py
```diff
@@ -33,6 +33,7 @@ def __init__(self, conv_in_3d: bool = True, mean_teacher_model: bool = False) ->
         )
         self.expected_image_size_zyx = (5, 7)
         self.conv_in_3d = conv_in_3d
+        self.pl_deterministic = True

     def get_model_train_test_dataset_splits(self, dataset_df: pd.DataFrame) -> DatasetSplits:
         return DatasetSplits.from_proportions(
```
1 change: 1 addition & 0 deletions Tests/ML/configs/DummyModel.py
```diff
@@ -67,6 +67,7 @@ def __init__(self, **kwargs: Any) -> None:
         self.add_and_validate(kwargs)
         # Trying to run DDP from the test suite hangs, hence restrict to single GPU.
         self.max_num_gpus = 1
+        self.pl_deterministic = True

     def get_model_train_test_dataset_splits(self, dataset_df: pd.DataFrame) -> DatasetSplits:
         return DatasetSplits(train=dataset_df[dataset_df.subject.isin(self.train_subject_ids)],
```
4 changes: 2 additions & 2 deletions Tests/ML/models/test_scalar_model.py
```diff
@@ -126,10 +126,10 @@ def test_train_classification_model(class_name: str, test_output_dirs: OutputFol
         f"""epoch,subject,prediction_target,model_output,label,data_split,cross_validation_split_index
 0,S2,{class_name},0.529514,1,Train,-1
 0,S4,{class_name},0.521659,0,Train,-1
-1,S4,{class_name},0.521482,0,Train,-1
 1,S2,{class_name},0.529475,1,Train,-1
-2,S4,{class_name},0.521305,0,Train,-1
+1,S4,{class_name},0.521482,0,Train,-1
 2,S2,{class_name},0.529437,1,Train,-1
+2,S4,{class_name},0.521305,0,Train,-1
 3,S2,{class_name},0.529399,1,Train,-1
 3,S4,{class_name},0.521128,0,Train,-1
 """
```
2 changes: 1 addition & 1 deletion Tests/ML/runners/test_runner.py
```diff
@@ -371,7 +371,7 @@ def run_model_inference_train_and_test(test_output_dirs: OutputFolderForTests,
 def test_logging_to_file(test_output_dirs: OutputFolderForTests) -> None:
     # Log file should go to a new, non-existent folder, 2 levels deep
     file_path = test_output_dirs.root_dir / "subdir1" / "subdir2" / "logfile.txt"
-    assert common_util.logging_to_file_handler is None
+    common_util.logging_to_file_handler = None
     common_util.logging_to_file(file_path)
     assert common_util.logging_to_file_handler is not None
     log_line = "foo bar"
```
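The assert-to-assignment change makes the test independent of leaked global state: instead of requiring that no earlier test installed a file handler, it now resets the module-level handler itself before starting. A hedged sketch of the pattern under test (names mirror `common_util`, but this implementation is illustrative, not the project's code):

```python
import logging
from pathlib import Path
from typing import Optional

# Module-level handle so that callers (and tests) can inspect or reset it.
logging_to_file_handler: Optional[logging.FileHandler] = None

def logging_to_file(file_path: Path) -> None:
    global logging_to_file_handler
    file_path.parent.mkdir(parents=True, exist_ok=True)  # folder may be 2 levels deep and not exist yet
    logging_to_file_handler = logging.FileHandler(str(file_path))
    logging.getLogger().addHandler(logging_to_file_handler)
```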
14 changes: 7 additions & 7 deletions Tests/ML/test_model_training.py
```diff
@@ -104,11 +104,11 @@ def _mean_list(lists: List[List[float]]) -> List[float]:
     train_config.check_exclusive = False

     if machine_has_gpu:
-        expected_train_losses = [0.4552919, 0.4548529]
-        expected_val_losses = [0.455389, 0.455306]
+        expected_train_losses = [0.4554231, 0.4550124]
+        expected_val_losses = [0.4553894, 0.4553061]
     else:
-        expected_train_losses = [0.4552919, 0.4548538]
-        expected_val_losses = [0.4553891, 0.4553060]
+        expected_train_losses = [0.4554231, 0.4550112]
+        expected_val_losses = [0.4553893, 0.4553061]
     loss_absolute_tolerance = 1e-6
     expected_learning_rates = [train_config.l_rate, 5.3589e-4]

@@ -154,7 +154,7 @@ def assert_all_close(metric: str, expected: List[float], **kwargs: Any) -> None:
     # and be the same across 'region' and 'region_1' because they derive from the same Nifti files.
     # The following values are read off directly from the results of compute_dice_across_patches in the training loop
     # This checks that averages are computed correctly, and that metric computers are reset after each epoch.
-    train_voxels = [[82860.0, 83212.0, 83087.0], [82831.0, 82900.0, 83212.0]]
+    train_voxels = [[82765.0, 83212.0, 82740.0], [82831.0, 82647.0, 83255.0]]
     val_voxels = [[82765.0, 83212.0], [82765.0, 83212.0]]
     _check_voxel_count(model_training_result.train_results_per_epoch(), _mean_list(train_voxels), "Train")
     _check_voxel_count(model_training_result.val_results_per_epoch(), _mean_list(val_voxels), "Val")
@@ -170,8 +170,8 @@ def assert_all_close(metric: str, expected: List[float], **kwargs: Any) -> None:
     # The following values are read off directly from the results of compute_dice_across_patches in the
     # training loop. Results are slightly different for GPU, hence use a larger tolerance there.
     dice_tolerance = 1e-3 if machine_has_gpu else 4.5e-4
-    train_dice_region = [[0.0, 0.0, 4.0282e-04], [0.0372, 0.0388, 0.1091]]
-    train_dice_region1 = [[0.4785, 0.4807, 0.4834], [0.4832, 0.4800, 0.4628]]
+    train_dice_region = [[0.0, 0.0, 0.0], [0.0376, 0.0343, 0.1017]]
+    train_dice_region1 = [[0.4845, 0.4814, 0.4829], [0.4822, 0.4747, 0.4426]]
     # There appears to be some amount of non-determinism here: When using a tolerance of 1e-4, we get occasional
     # test failures on Linux in the cloud (not on Windows, not on AzureML) Unclear where it comes from. Even when
     # failing here, the losses match up to the expected tolerance.
```
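The hard-coded losses, voxel counts, and Dice values above are regression baselines that shifted with the PyTorch 1.10 upgrade, and every comparison runs through an absolute tolerance. A minimal sketch of that style of check (the expected values are the test's; the actuals here are hypothetical):

```python
import numpy as np

expected_train_losses = [0.4554231, 0.4550124]
actual_train_losses = [0.4554233, 0.4550121]  # hypothetical values from one run

# loss_absolute_tolerance = 1e-6 in the test above
assert np.allclose(actual_train_losses, expected_train_losses, atol=1e-6), \
    "losses drifted beyond the regression tolerance"
```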
26 changes: 13 additions & 13 deletions Tests/ML/visualizers/test_model_summary.py
```diff
@@ -3,13 +3,13 @@
 # Licensed under the MIT License (MIT). See LICENSE in the repo root for license information.
 # ------------------------------------------------------------------------------------------
 import logging
+import torch
 from abc import ABC
 from typing import List, Tuple

-import torch
-
-from InnerEye.Common.common_util import logging_to_stdout
+from InnerEye.Common.common_util import logging_to_stdout, change_working_directory
 from InnerEye.Common.fixed_paths import DEFAULT_MODEL_SUMMARIES_DIR_PATH
 from InnerEye.Common.output_directories import OutputFolderForTests
 from InnerEye.ML.configs.classification.GlaucomaPublic import GlaucomaPublic
 from InnerEye.ML.models.architectures.base_model import BaseSegmentationModel, CropSizeConstraints
 from InnerEye.ML.models.architectures.classification.image_encoder_with_mlp import ImageEncoderWithMlp, \
@@ -79,19 +79,19 @@ def test_model_summary_on_classification2() -> None:
     assert summarizer.n_trainable_params != 0


-def test_log_model_summary_to_file() -> None:
+def test_log_model_summary_to_file(test_output_dirs: OutputFolderForTests) -> None:
     model = MyFavModel()
     input_size = (16, 16, 32)
-    expected_log_file = DEFAULT_MODEL_SUMMARIES_DIR_PATH / "model_log001.txt"
-    if expected_log_file.exists():
-        expected_log_file.unlink()
-    model.generate_model_summary(input_size, log_summaries_to_files=True)
-    assert expected_log_file.exists()
-    with expected_log_file.open() as inpt:
-        assert len(inpt.readlines()) >= 3
-    expected_log_file.unlink()
-    model.generate_model_summary(input_size, log_summaries_to_files=False)
-    assert not expected_log_file.exists()
+    with change_working_directory(test_output_dirs.root_dir):
+        expected_log_file = DEFAULT_MODEL_SUMMARIES_DIR_PATH / "model_log001.txt"
+        if expected_log_file.exists():
+            expected_log_file.unlink()
+        model.generate_model_summary(input_size, log_summaries_to_files=True)
+        assert expected_log_file.exists()
+        assert len(expected_log_file.read_text().splitlines()) >= 3
+        expected_log_file.unlink()
+        model.generate_model_summary(input_size, log_summaries_to_files=False)
+        assert not expected_log_file.exists()


 class MyFavModel(BaseSegmentationModel, ABC):
```
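Wrapping the test body in `change_working_directory` makes the relative `DEFAULT_MODEL_SUMMARIES_DIR_PATH` resolve inside the per-test output folder instead of polluting the repo checkout. A hedged sketch of what such a helper typically looks like (the real implementation lives in `InnerEye.Common.common_util`; this is an illustration):

```python
import os
from contextlib import contextmanager
from pathlib import Path
from typing import Generator

@contextmanager
def change_working_directory(path: Path) -> Generator[None, None, None]:
    old_cwd = os.getcwd()
    os.chdir(str(path))  # relative paths now resolve under `path`
    try:
        yield
    finally:
        os.chdir(old_cwd)  # always restore, even if the test body raises
```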
63 changes: 35 additions & 28 deletions Tests/SSL/test_ssl_containers.py
```diff
@@ -64,9 +64,14 @@ def default_runner() -> Runner:
                   yaml_config_file=fixed_paths.SETTINGS_YAML_FILE)


-common_test_args = ["", "--is_debug_model=True", "--num_epochs=1", "--ssl_training_batch_size=10",
+common_test_args = ["",
+                    "--is_debug_model=True",
+                    "--num_epochs=1",
+                    "--ssl_training_batch_size=10",
                     "--linear_head_batch_size=5",
-                    "--num_workers=0"]
+                    "--num_workers=0",
+                    "--pl_deterministic"
+                    ""]


 def _compare_stored_metrics(runner: Runner, expected_metrics: Dict[str, float], abs: float = 1e-5) -> None:
@@ -118,16 +123,17 @@ def test_innereye_ssl_container_cifar10_resnet_simclr() -> None:
     assert isinstance(loaded_config.model.encoder.cnn_model, ResNet)

     # Check the metrics that were recorded during training
-    expected_metrics = {
-        'simclr/train/loss': 3.423144578933716,
-        'simclr/learning_rate': 0.0,
-        'ssl_online_evaluator/train/loss': 2.6143882274627686,
-        'ssl_online_evaluator/train/online_AccuracyAtThreshold05': 0.0,
-        'epoch_started': 0.0,
-        'simclr/val/loss': 2.886892795562744,
-        'ssl_online_evaluator/val/loss': 2.2472469806671143,
-        'ssl_online_evaluator/val/AccuracyAtThreshold05': 0.20000000298023224
-    }
+    # Note: It is possible that after the PyTorch 1.10 upgrade, we can't get parity between local runs and runs on
+    # the hosted build agents. If that suspicion is confirmed, we need to add branching for local and cloud results.
+    expected_metrics = {'simclr/val/loss': 2.8736939430236816,
+                        'ssl_online_evaluator/val/loss': 2.268489360809326,
+                        'ssl_online_evaluator/val/AccuracyAtThreshold05': 0.20000000298023224,
+                        'simclr/train/loss': 3.6261844635009766,
+                        'simclr/learning_rate': 0.0,
+                        'ssl_online_evaluator/train/loss': 3.1140503883361816,
+                        'ssl_online_evaluator/train/online_AccuracyAtThreshold05': 0.0,
+                        'epoch_started': 0.0}

     _compare_stored_metrics(runner, expected_metrics, abs=5e-5)

     # Check that the checkpoint contains both the optimizer for the embedding and for the linear head
@@ -205,22 +211,23 @@ def test_innereye_ssl_container_rsna() -> None:
     assert loaded_config.datamodule_args[SSLDataModuleType.ENCODER].augmentation_params.augmentation.use_random_crop
     assert loaded_config.datamodule_args[SSLDataModuleType.ENCODER].augmentation_params.augmentation.use_random_affine

-    expected_metrics = {
-        'byol/train/loss': 0.00401744619011879,
-        'byol/tau': 0.9899999499320984,
-        'byol/learning_rate/0/0': 0.0,
-        'byol/learning_rate/0/1': 0.0,
-        'ssl_online_evaluator/train/loss': 0.685592532157898,
-        'ssl_online_evaluator/train/online_AreaUnderRocCurve': 0.5,
-        'ssl_online_evaluator/train/online_AreaUnderPRCurve': 0.699999988079071,
-        'ssl_online_evaluator/train/online_AccuracyAtThreshold05': 0.4000000059604645,
-        'epoch_started': 0.0,
-        'byol/val/loss': -0.07644838094711304,
-        'ssl_online_evaluator/val/loss': 0.6965796947479248,
-        'ssl_online_evaluator/val/AreaUnderRocCurve': math.nan,
-        'ssl_online_evaluator/val/AreaUnderPRCurve': math.nan,
-        'ssl_online_evaluator/val/AccuracyAtThreshold05': 0.0
-    }
+    # Note: It is possible that after the PyTorch 1.10 upgrade, we can't get parity between local runs and runs on
+    # the hosted build agents. If that suspicion is confirmed, we need to add branching for local and cloud results.
+    expected_metrics = {'byol/val/loss': -0.07644861936569214,
+                        'ssl_online_evaluator/val/loss': 0.6963790059089661,
+                        'ssl_online_evaluator/val/AreaUnderRocCurve': math.nan,
+                        'ssl_online_evaluator/val/AreaUnderPRCurve': math.nan,
+                        'ssl_online_evaluator/val/AccuracyAtThreshold05': 0.0,
+                        'byol/train/loss': 0.004017443861812353,
+                        'byol/tau': 0.9899999499320984,
+                        'byol/learning_rate/0/0': 0.0,
+                        'byol/learning_rate/0/1': 0.0,
+                        'ssl_online_evaluator/train/loss': 0.6938587427139282,
+                        'ssl_online_evaluator/train/online_AreaUnderRocCurve': 0.5,
+                        'ssl_online_evaluator/train/online_AreaUnderPRCurve': 0.6000000238418579,
+                        'ssl_online_evaluator/train/online_AccuracyAtThreshold05': 0.20000000298023224,
+                        'epoch_started': 0.0}

     _compare_stored_metrics(runner, expected_metrics)

     # Check that we are able to load the checkpoint and create classifier model
```
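One subtlety in the RSNA expectations above: two values are `math.nan`, and NaN never compares equal to itself under IEEE 754, so a plain tolerance check cannot be applied uniformly. A hedged sketch of a NaN-aware comparison in the spirit of `_compare_stored_metrics` (the tolerance mirrors the test's default; this implementation is illustrative):

```python
import math

def metric_matches(actual: float, expected: float, abs_tol: float = 1e-5) -> bool:
    # NaN == NaN is False, so expected NaNs need a dedicated branch.
    if math.isnan(expected):
        return math.isnan(actual)
    return math.isclose(actual, expected, abs_tol=abs_tol)

assert metric_matches(float("nan"), math.nan)   # a NaN expectation is satisfied only by NaN
assert metric_matches(0.6963791, 0.6963790059)  # within the 1e-5 tolerance
```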
7 changes: 3 additions & 4 deletions environment.yml
```diff
@@ -7,9 +7,9 @@ dependencies:
   - cudatoolkit=11.1
   - pip=20.1.1
   - python=3.7.3
-  - pytorch=1.8.0
+  - pytorch=1.10.0
   - python-blosc=1.7.0
-  - torchvision=0.9.0
+  - torchvision=0.11.1
   - pip:
     - git+https://github.com/analysiscenter/radio.git@6d53e25#egg=radio
     - azure-mgmt-resource==12.1.0
@@ -31,7 +31,7 @@ dependencies:
     - joblib==0.16.0
     - jupyter==1.0.0
     - jupyter-client==6.1.5
-    - lightning-bolts==0.3.4
+    - lightning-bolts==0.4.0
     - matplotlib==3.3.0
     - mlflow==1.17.0
     - monai==0.6.0
@@ -68,7 +68,6 @@ dependencies:
    - tabulate==0.8.7
    - tensorboard==2.3.0
    - tensorboardX==2.1
-    - torchprof==1.3.3
    - torchmetrics==0.6.0
    - umap-learn==0.5.2
    - yacs==0.1.8
```
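An existing conda environment will not pick up the new pins automatically; it needs to be rebuilt from the updated file (for example with `conda env create -f environment.yml` after removing the old environment). Once rebuilt, the upgrade can be sanity-checked from Python (an illustrative check, not part of the commit):

```python
import torch
import torchvision

# Expected after this commit: 1.10.0 and 0.11.1 (cu111 builds when CUDA 11.1 is used).
print(torch.__version__, torchvision.__version__)
assert torch.__version__.startswith("1.10"), "conda env was not rebuilt from the new environment.yml"
```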
