Upgrade to PyTorch 1.10 (#585)
ant0nsc committed Jan 10, 2022
1 parent (ac6a312) · commit 1523882
Showing 14 changed files with 73 additions and 60 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
```diff
@@ -42,6 +42,7 @@ jobs that run in AzureML.

 ### Changed
 - ([#588](https://github.com/microsoft/InnerEye-DeepLearning/pull/588)) Replace SciPy with PIL.PngImagePlugin.PngImageFile to load png files.
+- ([#585](https://github.com/microsoft/InnerEye-DeepLearning/pull/585)) Switching to PyTorch 1.10.0 and torchvision 0.11.1
 - ([#576](https://github.com/microsoft/InnerEye-DeepLearning/pull/576)) The console output is no longer written to stdout.txt because AzureML handles that better now
 - ([#531](https://github.com/microsoft/InnerEye-DeepLearning/pull/531)) Updated PL to 1.3.8, torchmetrics and pl-bolts and changed relevant metrics and SSL code API.
 - ([#555](https://github.com/microsoft/InnerEye-DeepLearning/pull/555)) Make the SSLContainer compatible with new datasets
```
1 change: 1 addition & 0 deletions InnerEye/ML/configs/classification/DummyClassification.py
```diff
@@ -34,6 +34,7 @@ def __init__(self) -> None:
         self.expected_image_size_zyx = (4, 5, 7)
         # Trying to run DDP from the test suite hangs, hence restrict to single GPU.
         self.max_num_gpus = 1
+        self.pl_deterministic = True

     def get_model_train_test_dataset_splits(self, dataset_df: pd.DataFrame) -> DatasetSplits:
         return DatasetSplits.from_proportions(
```
```diff
@@ -34,6 +34,7 @@ def __init__(self) -> None:
         self.expected_image_size_zyx = (4, 5, 7)
         # Trying to run DDP from the test suite hangs, hence restrict to single GPU.
         self.max_num_gpus = 1
+        self.pl_deterministic = True

     def get_model_train_test_dataset_splits(self, dataset_df: pd.DataFrame) -> DatasetSplits:
         return DatasetSplits.from_proportions(
```
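The `pl_deterministic = True` line added in the two config hunks above (and in several test configs further down) is what keeps the hard-coded regression values in the updated tests comparable across runs. Assuming the flag is forwarded to the PyTorch Lightning `Trainer`, which is the usual wiring for such config options, a minimal sketch of what it turns on:

```python
# Minimal sketch, assuming pl_deterministic maps to Trainer(deterministic=True).
# Depending on the Lightning version, this is backed by cuDNN's deterministic
# mode or torch.use_deterministic_algorithms, so nondeterministic kernels are
# avoided (or raise) instead of silently diverging between runs.
from pytorch_lightning import Trainer, seed_everything

seed_everything(42, workers=True)      # fix RNG state for python, numpy and torch
trainer = Trainer(deterministic=True)  # request deterministic kernels
```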
2 changes: 0 additions & 2 deletions InnerEye/ML/models/architectures/sequential/gru.py
```diff
@@ -37,10 +37,8 @@ def __init__(self, input_size: int, hidden_size: int, use_layer_norm: bool = Fal
         self.ln_n = nn.LayerNorm(self.hidden_size) if use_layer_norm else Identity()

     def forward(self, input: torch.Tensor, hx: Optional[torch.Tensor] = None) -> torch.Tensor:  # type: ignore
-        self.check_forward_input(input)
         if hx is None:
             hx = input.new_zeros(size=(input.size(0), self.hidden_size), requires_grad=False)
-        self.check_forward_hidden(input, hx)

         ih = input.mm(self.weight_ih.t())
         hh = hx.mm(self.weight_hh.t())
```
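Context for the two deleted calls: `check_forward_input` and `check_forward_hidden` were private helpers on `nn.RNNCellBase` that PyTorch 1.10 no longer provides, so the custom GRU cell has to drop them. If equivalent shape validation were still wanted, it could be inlined along these lines (a hypothetical helper, not part of this commit):

```python
import torch

def check_gru_shapes(input: torch.Tensor, hx: torch.Tensor,
                     input_size: int, hidden_size: int) -> None:
    # Replicates the checks the removed nn.RNNCellBase helpers performed.
    if input.size(1) != input_size:
        raise RuntimeError(f"input has inconsistent size(1): got {input.size(1)}, expected {input_size}")
    if input.size(0) != hx.size(0):
        raise RuntimeError(f"batch size mismatch: input {input.size(0)} vs hidden {hx.size(0)}")
    if hx.size(1) != hidden_size:
        raise RuntimeError(f"hidden has inconsistent size(1): got {hx.size(1)}, expected {hidden_size}")
```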
9 changes: 6 additions & 3 deletions InnerEye/ML/visualizers/model_summary.py
```diff
@@ -11,7 +11,7 @@
 import numpy as np
 import torch
 from torch.utils.hooks import RemovableHandle
-from torchprof.profile import Profile
+import torch.profiler as profiler

 from InnerEye.Common.common_util import logging_only_to_file
 from InnerEye.Common.fixed_paths import DEFAULT_MODEL_SUMMARIES_DIR_PATH
@@ -189,12 +189,15 @@ def print_summary() -> None:

         # Register the forward-pass hooks, profile the model, and restore its state
         self.model.apply(self._register_hook)
-        with Profile(self.model, use_cuda=self.use_gpu) as prof:
+        activities = [profiler.ProfilerActivity.CPU]
+        if self.use_gpu:
+            activities.append(profiler.ProfilerActivity.CUDA)
+        with profiler.profile(activities=activities, record_shapes=True) as prof:
             forward_preserve_state(self.model, input_tensors)  # type: ignore

         # Log the model summary: tensor shapes, num of parameters, memory requirement, and forward pass time
         logging.info(self.model)
-        logging.info('\n' + prof.display(show_events=False))
+        logging.info('\n' + prof.key_averages().table())
         print_summary()

         # Remove the hooks via handles
```
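This swaps the third-party torchprof package for the profiler that ships with PyTorch (`torch.profiler`, available since 1.8.1), which is why the dependency can be dropped from environment.yml further down. A self-contained sketch of the new pattern (the model and input here are placeholders):

```python
import torch
import torch.profiler as profiler

model = torch.nn.Linear(8, 4)
inputs = torch.randn(2, 8)

# Profile CPU always, and CUDA only when a GPU is actually in use.
activities = [profiler.ProfilerActivity.CPU]
if torch.cuda.is_available():
    activities.append(profiler.ProfilerActivity.CUDA)

with profiler.profile(activities=activities, record_shapes=True) as prof:
    model(inputs)

# key_averages() aggregates profiler events per operator; table() formats them as text.
print(prof.key_averages().table())
```

Unlike torchprof's `Profile(model, ...)`, `torch.profiler.profile` does not wrap a module; it records whatever runs inside the context, which is why the forward pass is invoked inside the `with` block.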
1 change: 1 addition & 0 deletions Tests/ML/configs/ClassificationModelForTesting.py
```diff
@@ -35,6 +35,7 @@ def __init__(self, conv_in_3d: bool = True, mean_teacher_model: bool = False) ->
         self.conv_in_3d = conv_in_3d
         # Trying to run DDP from the test suite hangs, hence restrict to single GPU.
         self.max_num_gpus = 1
+        self.pl_deterministic = True

     def get_model_train_test_dataset_splits(self, dataset_df: pd.DataFrame) -> DatasetSplits:
         return DatasetSplits.from_proportions(
```
1 change: 1 addition & 0 deletions Tests/ML/configs/ClassificationModelForTesting2D.py
```diff
@@ -33,6 +33,7 @@ def __init__(self, conv_in_3d: bool = True, mean_teacher_model: bool = False) ->
         )
         self.expected_image_size_zyx = (5, 7)
         self.conv_in_3d = conv_in_3d
+        self.pl_deterministic = True

     def get_model_train_test_dataset_splits(self, dataset_df: pd.DataFrame) -> DatasetSplits:
         return DatasetSplits.from_proportions(
```
1 change: 1 addition & 0 deletions Tests/ML/configs/DummyModel.py
```diff
@@ -67,6 +67,7 @@ def __init__(self, **kwargs: Any) -> None:
         self.add_and_validate(kwargs)
         # Trying to run DDP from the test suite hangs, hence restrict to single GPU.
         self.max_num_gpus = 1
+        self.pl_deterministic = True

     def get_model_train_test_dataset_splits(self, dataset_df: pd.DataFrame) -> DatasetSplits:
         return DatasetSplits(train=dataset_df[dataset_df.subject.isin(self.train_subject_ids)],
```
4 changes: 2 additions & 2 deletions Tests/ML/models/test_scalar_model.py
```diff
@@ -126,10 +126,10 @@ def test_train_classification_model(class_name: str, test_output_dirs: OutputFol
         f"""epoch,subject,prediction_target,model_output,label,data_split,cross_validation_split_index
 0,S2,{class_name},0.529514,1,Train,-1
 0,S4,{class_name},0.521659,0,Train,-1
-1,S4,{class_name},0.521482,0,Train,-1
 1,S2,{class_name},0.529475,1,Train,-1
-2,S4,{class_name},0.521305,0,Train,-1
+1,S4,{class_name},0.521482,0,Train,-1
 2,S2,{class_name},0.529437,1,Train,-1
+2,S4,{class_name},0.521305,0,Train,-1
 3,S2,{class_name},0.529399,1,Train,-1
 3,S4,{class_name},0.521128,0,Train,-1
 """
```
2 changes: 1 addition & 1 deletion Tests/ML/runners/test_runner.py
```diff
@@ -371,7 +371,7 @@ def run_model_inference_train_and_test(test_output_dirs: OutputFolderForTests,
 def test_logging_to_file(test_output_dirs: OutputFolderForTests) -> None:
     # Log file should go to a new, non-existent folder, 2 levels deep
     file_path = test_output_dirs.root_dir / "subdir1" / "subdir2" / "logfile.txt"
-    assert common_util.logging_to_file_handler is None
+    common_util.logging_to_file_handler = None
     common_util.logging_to_file(file_path)
     assert common_util.logging_to_file_handler is not None
     log_line = "foo bar"
```
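The assert-to-assignment change makes the test independent of leaked global state: instead of requiring that no earlier test installed a file handler, it now resets the module-level handler itself before starting. A hedged sketch of the pattern under test (names mirror `common_util`, but this implementation is illustrative, not the project's code):

```python
import logging
from pathlib import Path
from typing import Optional

# Module-level handle so that callers (and tests) can inspect or reset it.
logging_to_file_handler: Optional[logging.FileHandler] = None

def logging_to_file(file_path: Path) -> None:
    global logging_to_file_handler
    file_path.parent.mkdir(parents=True, exist_ok=True)  # folder may be 2 levels deep and not exist yet
    logging_to_file_handler = logging.FileHandler(str(file_path))
    logging.getLogger().addHandler(logging_to_file_handler)
```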
14 changes: 7 additions & 7 deletions Tests/ML/test_model_training.py
```diff
@@ -104,11 +104,11 @@ def _mean_list(lists: List[List[float]]) -> List[float]:
     train_config.check_exclusive = False

     if machine_has_gpu:
-        expected_train_losses = [0.4552919, 0.4548529]
-        expected_val_losses = [0.455389, 0.455306]
+        expected_train_losses = [0.4554231, 0.4550124]
+        expected_val_losses = [0.4553894, 0.4553061]
     else:
-        expected_train_losses = [0.4552919, 0.4548538]
-        expected_val_losses = [0.4553891, 0.4553060]
+        expected_train_losses = [0.4554231, 0.4550112]
+        expected_val_losses = [0.4553893, 0.4553061]
     loss_absolute_tolerance = 1e-6
     expected_learning_rates = [train_config.l_rate, 5.3589e-4]

@@ -154,7 +154,7 @@ def assert_all_close(metric: str, expected: List[float], **kwargs: Any) -> None:
     # and be the same across 'region' and 'region_1' because they derive from the same Nifti files.
     # The following values are read off directly from the results of compute_dice_across_patches in the training loop
     # This checks that averages are computed correctly, and that metric computers are reset after each epoch.
-    train_voxels = [[82860.0, 83212.0, 83087.0], [82831.0, 82900.0, 83212.0]]
+    train_voxels = [[82765.0, 83212.0, 82740.0], [82831.0, 82647.0, 83255.0]]
     val_voxels = [[82765.0, 83212.0], [82765.0, 83212.0]]
     _check_voxel_count(model_training_result.train_results_per_epoch(), _mean_list(train_voxels), "Train")
     _check_voxel_count(model_training_result.val_results_per_epoch(), _mean_list(val_voxels), "Val")
@@ -170,8 +170,8 @@ def assert_all_close(metric: str, expected: List[float], **kwargs: Any) -> None:
     # The following values are read off directly from the results of compute_dice_across_patches in the
     # training loop. Results are slightly different for GPU, hence use a larger tolerance there.
     dice_tolerance = 1e-3 if machine_has_gpu else 4.5e-4
-    train_dice_region = [[0.0, 0.0, 4.0282e-04], [0.0372, 0.0388, 0.1091]]
-    train_dice_region1 = [[0.4785, 0.4807, 0.4834], [0.4832, 0.4800, 0.4628]]
+    train_dice_region = [[0.0, 0.0, 0.0], [0.0376, 0.0343, 0.1017]]
+    train_dice_region1 = [[0.4845, 0.4814, 0.4829], [0.4822, 0.4747, 0.4426]]
     # There appears to be some amount of non-determinism here: When using a tolerance of 1e-4, we get occasional
     # test failures on Linux in the cloud (not on Windows, not on AzureML) Unclear where it comes from. Even when
     # failing here, the losses match up to the expected tolerance.
```
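The hard-coded losses, voxel counts, and Dice values above are regression baselines that shifted with the PyTorch 1.10 upgrade, and every comparison runs through an absolute tolerance. A minimal sketch of that style of check (the expected values are the test's; the actuals here are hypothetical):

```python
import numpy as np

expected_train_losses = [0.4554231, 0.4550124]
actual_train_losses = [0.4554233, 0.4550121]  # hypothetical values from one run

# loss_absolute_tolerance = 1e-6 in the test above
assert np.allclose(actual_train_losses, expected_train_losses, atol=1e-6), \
    "losses drifted beyond the regression tolerance"
```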
26 changes: 13 additions & 13 deletions Tests/ML/visualizers/test_model_summary.py
```diff
@@ -3,13 +3,13 @@
 # Licensed under the MIT License (MIT). See LICENSE in the repo root for license information.
 # ------------------------------------------------------------------------------------------
 import logging
+import torch
 from abc import ABC
 from typing import List, Tuple

-import torch
-
-from InnerEye.Common.common_util import logging_to_stdout
+from InnerEye.Common.common_util import logging_to_stdout, change_working_directory
 from InnerEye.Common.fixed_paths import DEFAULT_MODEL_SUMMARIES_DIR_PATH
 from InnerEye.Common.output_directories import OutputFolderForTests
 from InnerEye.ML.configs.classification.GlaucomaPublic import GlaucomaPublic
 from InnerEye.ML.models.architectures.base_model import BaseSegmentationModel, CropSizeConstraints
 from InnerEye.ML.models.architectures.classification.image_encoder_with_mlp import ImageEncoderWithMlp, \
@@ -79,19 +79,19 @@ def test_model_summary_on_classification2() -> None:
     assert summarizer.n_trainable_params != 0


-def test_log_model_summary_to_file() -> None:
+def test_log_model_summary_to_file(test_output_dirs: OutputFolderForTests) -> None:
     model = MyFavModel()
     input_size = (16, 16, 32)
-    expected_log_file = DEFAULT_MODEL_SUMMARIES_DIR_PATH / "model_log001.txt"
-    if expected_log_file.exists():
-        expected_log_file.unlink()
-    model.generate_model_summary(input_size, log_summaries_to_files=True)
-    assert expected_log_file.exists()
-    with expected_log_file.open() as inpt:
-        assert len(inpt.readlines()) >= 3
-    expected_log_file.unlink()
-    model.generate_model_summary(input_size, log_summaries_to_files=False)
-    assert not expected_log_file.exists()
+    with change_working_directory(test_output_dirs.root_dir):
+        expected_log_file = DEFAULT_MODEL_SUMMARIES_DIR_PATH / "model_log001.txt"
+        if expected_log_file.exists():
+            expected_log_file.unlink()
+        model.generate_model_summary(input_size, log_summaries_to_files=True)
+        assert expected_log_file.exists()
+        assert len(expected_log_file.read_text().splitlines()) >= 3
+        expected_log_file.unlink()
+        model.generate_model_summary(input_size, log_summaries_to_files=False)
+        assert not expected_log_file.exists()


 class MyFavModel(BaseSegmentationModel, ABC):
```
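Wrapping the test body in `change_working_directory` makes the relative `DEFAULT_MODEL_SUMMARIES_DIR_PATH` resolve inside the per-test output folder instead of polluting the repo checkout. A hedged sketch of what such a helper typically looks like (the real implementation lives in `InnerEye.Common.common_util`; this is an illustration):

```python
import os
from contextlib import contextmanager
from pathlib import Path
from typing import Generator

@contextmanager
def change_working_directory(path: Path) -> Generator[None, None, None]:
    old_cwd = os.getcwd()
    os.chdir(str(path))  # relative paths now resolve under `path`
    try:
        yield
    finally:
        os.chdir(old_cwd)  # always restore, even if the test body raises
```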
63 changes: 35 additions & 28 deletions Tests/SSL/test_ssl_containers.py
```diff
@@ -64,9 +64,14 @@ def default_runner() -> Runner:
                   yaml_config_file=fixed_paths.SETTINGS_YAML_FILE)


-common_test_args = ["", "--is_debug_model=True", "--num_epochs=1", "--ssl_training_batch_size=10",
+common_test_args = ["",
+                    "--is_debug_model=True",
+                    "--num_epochs=1",
+                    "--ssl_training_batch_size=10",
                     "--linear_head_batch_size=5",
-                    "--num_workers=0"]
+                    "--num_workers=0",
+                    "--pl_deterministic"
+                    ""]


 def _compare_stored_metrics(runner: Runner, expected_metrics: Dict[str, float], abs: float = 1e-5) -> None:
@@ -118,16 +123,17 @@ def test_innereye_ssl_container_cifar10_resnet_simclr() -> None:
     assert isinstance(loaded_config.model.encoder.cnn_model, ResNet)

     # Check the metrics that were recorded during training
-    expected_metrics = {
-        'simclr/train/loss': 3.423144578933716,
-        'simclr/learning_rate': 0.0,
-        'ssl_online_evaluator/train/loss': 2.6143882274627686,
-        'ssl_online_evaluator/train/online_AccuracyAtThreshold05': 0.0,
-        'epoch_started': 0.0,
-        'simclr/val/loss': 2.886892795562744,
-        'ssl_online_evaluator/val/loss': 2.2472469806671143,
-        'ssl_online_evaluator/val/AccuracyAtThreshold05': 0.20000000298023224
-    }
+    # Note: It is possible that after the PyTorch 1.10 upgrade, we can't get parity between local runs and runs on
+    # the hosted build agents. If that suspicion is confirmed, we need to add branching for local and cloud results.
+    expected_metrics = {'simclr/val/loss': 2.8736939430236816,
+                        'ssl_online_evaluator/val/loss': 2.268489360809326,
+                        'ssl_online_evaluator/val/AccuracyAtThreshold05': 0.20000000298023224,
+                        'simclr/train/loss': 3.6261844635009766,
+                        'simclr/learning_rate': 0.0,
+                        'ssl_online_evaluator/train/loss': 3.1140503883361816,
+                        'ssl_online_evaluator/train/online_AccuracyAtThreshold05': 0.0,
+                        'epoch_started': 0.0}

     _compare_stored_metrics(runner, expected_metrics, abs=5e-5)

     # Check that the checkpoint contains both the optimizer for the embedding and for the linear head
@@ -205,22 +211,23 @@ def test_innereye_ssl_container_rsna() -> None:
     assert loaded_config.datamodule_args[SSLDataModuleType.ENCODER].augmentation_params.augmentation.use_random_crop
     assert loaded_config.datamodule_args[SSLDataModuleType.ENCODER].augmentation_params.augmentation.use_random_affine

-    expected_metrics = {
-        'byol/train/loss': 0.00401744619011879,
-        'byol/tau': 0.9899999499320984,
-        'byol/learning_rate/0/0': 0.0,
-        'byol/learning_rate/0/1': 0.0,
-        'ssl_online_evaluator/train/loss': 0.685592532157898,
-        'ssl_online_evaluator/train/online_AreaUnderRocCurve': 0.5,
-        'ssl_online_evaluator/train/online_AreaUnderPRCurve': 0.699999988079071,
-        'ssl_online_evaluator/train/online_AccuracyAtThreshold05': 0.4000000059604645,
-        'epoch_started': 0.0,
-        'byol/val/loss': -0.07644838094711304,
-        'ssl_online_evaluator/val/loss': 0.6965796947479248,
-        'ssl_online_evaluator/val/AreaUnderRocCurve': math.nan,
-        'ssl_online_evaluator/val/AreaUnderPRCurve': math.nan,
-        'ssl_online_evaluator/val/AccuracyAtThreshold05': 0.0
-    }
+    # Note: It is possible that after the PyTorch 1.10 upgrade, we can't get parity between local runs and runs on
+    # the hosted build agents. If that suspicion is confirmed, we need to add branching for local and cloud results.
+    expected_metrics = {'byol/val/loss': -0.07644861936569214,
+                        'ssl_online_evaluator/val/loss': 0.6963790059089661,
+                        'ssl_online_evaluator/val/AreaUnderRocCurve': math.nan,
+                        'ssl_online_evaluator/val/AreaUnderPRCurve': math.nan,
+                        'ssl_online_evaluator/val/AccuracyAtThreshold05': 0.0,
+                        'byol/train/loss': 0.004017443861812353,
+                        'byol/tau': 0.9899999499320984,
+                        'byol/learning_rate/0/0': 0.0,
+                        'byol/learning_rate/0/1': 0.0,
+                        'ssl_online_evaluator/train/loss': 0.6938587427139282,
+                        'ssl_online_evaluator/train/online_AreaUnderRocCurve': 0.5,
+                        'ssl_online_evaluator/train/online_AreaUnderPRCurve': 0.6000000238418579,
+                        'ssl_online_evaluator/train/online_AccuracyAtThreshold05': 0.20000000298023224,
+                        'epoch_started': 0.0}

     _compare_stored_metrics(runner, expected_metrics)

     # Check that we are able to load the checkpoint and create classifier model
```
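One subtlety in the RSNA expectations above: two values are `math.nan`, and NaN never compares equal to itself under IEEE 754, so a plain tolerance check cannot be applied uniformly. A hedged sketch of a NaN-aware comparison in the spirit of `_compare_stored_metrics` (the tolerance mirrors the test's default; this implementation is illustrative):

```python
import math

def metric_matches(actual: float, expected: float, abs_tol: float = 1e-5) -> bool:
    # NaN == NaN is False, so expected NaNs need a dedicated branch.
    if math.isnan(expected):
        return math.isnan(actual)
    return math.isclose(actual, expected, abs_tol=abs_tol)

assert metric_matches(float("nan"), math.nan)   # a NaN expectation is satisfied only by NaN
assert metric_matches(0.6963791, 0.6963790059)  # within the 1e-5 tolerance
```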
7 changes: 3 additions & 4 deletions environment.yml
```diff
@@ -7,9 +7,9 @@ dependencies:
   - cudatoolkit=11.1
   - pip=20.1.1
   - python=3.7.3
-  - pytorch=1.8.0
+  - pytorch=1.10.0
   - python-blosc=1.7.0
-  - torchvision=0.9.0
+  - torchvision=0.11.1
   - pip:
     - git+https://github.com/analysiscenter/radio.git@6d53e25#egg=radio
     - azure-mgmt-resource==12.1.0
@@ -31,7 +31,7 @@ dependencies:
     - joblib==0.16.0
     - jupyter==1.0.0
     - jupyter-client==6.1.5
-    - lightning-bolts==0.3.4
+    - lightning-bolts==0.4.0
     - matplotlib==3.3.0
     - mlflow==1.17.0
     - monai==0.6.0
@@ -68,7 +68,6 @@ dependencies:
    - tabulate==0.8.7
    - tensorboard==2.3.0
    - tensorboardX==2.1
-    - torchprof==1.3.3
    - torchmetrics==0.6.0
    - umap-learn==0.5.2
    - yacs==0.1.8
```
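An existing conda environment will not pick up the new pins automatically; it needs to be rebuilt from the updated file (for example with `conda env create -f environment.yml` after removing the old environment). Once rebuilt, the upgrade can be sanity-checked from Python (an illustrative check, not part of the commit):

```python
import torch
import torchvision

# Expected after this commit: 1.10.0 and 0.11.1 (cu111 builds when CUDA 11.1 is used).
print(torch.__version__, torchvision.__version__)
assert torch.__version__.startswith("1.10"), "conda env was not rebuilt from the new environment.yml"
```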
