Make pytorch run non-deterministically by default, upgrade to AML SDK…

… 1.36 (#594)
microsoft · Nov 22, 2021 · 8712267 · 8712267
1 parent b96afc3
commit 8712267
Show file tree

Hide file tree

Showing 7 changed files with 30 additions and 18 deletions.
diff --git a/.idea/InnerEye-DeepLearning.iml b/.idea/InnerEye-DeepLearning.iml
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -13,6 +13,7 @@ created.
 ## Upcoming
 
 ### Added
+- ([#594](https://github.com/microsoft/InnerEye-DeepLearning/pull/594)) When supplying a "--tag" argument, the AzureML jobs use that value as the display name, to more easily distinguish runs.
 - ([#577](https://github.com/microsoft/InnerEye-DeepLearning/pull/577)) Commandline switch `monitor_gpu` to monitor 
  GPU utilization via Lightning's `GpuStatsMonitor`, switch `monitor_loading` to check batch loading times via
  `BatchTimeCallback`, and `pl_profiler` to turn on the Lightning profiler (`simple`, `advanced`, or `pytorch`)
@@ -53,6 +54,7 @@ gets uploaded to AzureML, by skipping all test folders.
 - ([#554](https://github.com/microsoft/InnerEye-DeepLearning/pull/554)) Updated report in CovidModel. Set parameters
  in the config to run inference on both the validation and test sets by default.
 - ([#584](https://github.com/microsoft/InnerEye-DeepLearning/pull/584)) SSL models write the optimizer state for the linear head to the checkpoint now.
+- ([#594](https://github.com/microsoft/InnerEye-DeepLearning/pull/594)) PyTorch is now non-deterministic by default. Upgraded to AzureML-SDK 1.36.
 - ([#566](https://github.com/microsoft/InnerEye-DeepLearning/pull/566)) Update `hi-ml` dependency to `hi-ml-azure`.
 - ([#572](https://github.com/microsoft/InnerEye-DeepLearning/pull/572)) Updated to new version of hi-ml package
 

diff --git a/InnerEye/ML/dataset/full_image_dataset.py b/InnerEye/ML/dataset/full_image_dataset.py
@@ -7,11 +7,10 @@
 from abc import ABC
 from collections import Counter
 from pathlib import Path
-from typing import Any, Callable, Dict, Generic, List, Optional, TypeVar, Tuple
+from typing import Any, Callable, Dict, Generic, List, Mapping, Optional, Tuple, TypeVar
 
 import pandas as pd
 import torch.utils.data
-from torch._six import container_abcs
 from torch.utils.data import BatchSampler, DataLoader, Dataset, RandomSampler, Sampler, SequentialSampler
 from torch.utils.data.dataloader import default_collate # type: ignore
 
@@ -36,7 +35,7 @@ def collate_with_metadata(batch: List[Dict[str, Any]]) -> Dict[str, Any]:
  :return: collated result
  """
  elem = batch[0]
- if isinstance(elem, container_abcs.Mapping):
+ if isinstance(elem, Mapping):
  result = dict()
  for key in elem:
  # Special handling for all fields that store metadata, and for fields that are list.

diff --git a/InnerEye/ML/deep_learning_config.py b/InnerEye/ML/deep_learning_config.py
@@ -582,7 +582,7 @@ class TrainerParams(param.Parameterized):
  doc="PyTorch Lightning trainer flag 'num_sanity_val_steps': Number of validation "
  "steps to run before training, to identify possible problems")
  pl_deterministic: bool = \
- param.Integer(default=True,
+ param.Boolean(default=False,
  doc="Controls the PyTorch Lightning trainer flags 'deterministic' and 'benchmark'. If "
  "'pl_deterministic' is True, results are perfectly reproducible. If False, they are not, but "
  "you may see training speed increases.")

diff --git a/InnerEye/ML/lightning_base.py b/InnerEye/ML/lightning_base.py
@@ -243,9 +243,12 @@ def set_optimizer_and_scheduler(self, config: DeepLearningConfig) -> None:
  def configure_optimizers(self) -> Tuple[List[Optimizer], List[_LRScheduler]]:
  return [self.optimizer], [self.l_rate_scheduler] # type: ignore
 
+ @rank_zero_only
  def on_fit_end(self) -> None:
  """
- Flushes all logger objects that the present object holds.
+ Flushes all logger objects that the present object holds. This should only be run on rank zero, because
+ otherwise ranks != 0 will create empty log files that can clash with the non-empty log files written on
+ rank 0.
  """
  self.train_epoch_metrics_logger.flush()
  self.val_epoch_metrics_logger.flush()

diff --git a/InnerEye/ML/runner.py b/InnerEye/ML/runner.py
@@ -12,6 +12,7 @@
 # Suppress all errors here because the imports after code cause loads of warnings. We can't specifically suppress
 # individual warnings only.
 # flake8: noqa
+
 # Workaround for an issue with how AzureML and Pytorch Lightning interact: When spawning additional processes for DDP,
 # the working directory is not correctly picked up in sys.path
 print(f"Starting InnerEye runner at {sys.argv[0]}")
@@ -26,6 +27,7 @@
 fixed_paths.add_submodules_to_path()
 
 from azureml._base_sdk_common import user_agent
+from azureml._restclient.constants import RunStatus
 from azureml.core import Run, ScriptRunConfig
 from health_azure import AzureRunInfo, submit_to_azure_if_needed
 from health_azure.utils import create_run_recovery_id, is_global_rank_zero, is_local_rank_zero, merge_conda_files, \
@@ -271,15 +273,18 @@ def after_submission_hook(azure_run: Run) -> None:
  f"InnerEye/Azure/tensorboard_monitor.py --run_ids={azure_run.id}")
 
  if self.azure_config.wait_for_completion:
- # We want the job output to be visible on the console, but the program should not exit if the
- # job fails because we need to download the pytest result file.
+ # We want the job output to be visible on the console. Do not exit yet if the job fails, because we
+ # may need to download the pytest result file.
  azure_run.wait_for_completion(show_output=True, raise_on_error=False)
- if self.azure_config.pytest_mark and self.azure_config.wait_for_completion:
- # The AzureML job can optionally run pytest. Attempt to download it to the current directory.
- # A build step will pick up that file and publish it to Azure DevOps.
- # If pytest_mark is set, this file must exist.
- logging.info("Downloading pytest result file.")
- download_pytest_result(azure_run)
+ if self.azure_config.pytest_mark:
+ # The AzureML job can optionally run pytest. Attempt to download it to the current directory.
+ # A build step will pick up that file and publish it to Azure DevOps.
+ # If pytest_mark is set, this file must exist.
+ logging.info("Downloading pytest result file.")
+ download_pytest_result(azure_run)
+ if azure_run.status == RunStatus.FAILED:
+ raise ValueError(f"The AzureML run failed. Please check this URL for details: "
+ f"{azure_run.get_portal_url()}")
 
  hyperdrive_config = None
  if self.azure_config.hyperdrive:
@@ -326,12 +331,14 @@ def after_submission_hook(azure_run: Run) -> None:
  commandline_args=" ".join(source_config.script_params)),
  after_submission=after_submission_hook,
  hyperdrive_config=hyperdrive_config)
+ # Set the default display name to what was provided as the "tag"
+ if self.azure_config.tag:
+ azure_run_info.run.display_name = self.azure_config.tag
  else:
  # compute_cluster_name is a required parameter in early versions of the HI-ML package
  azure_run_info = submit_to_azure_if_needed(
  input_datasets=input_datasets,
- submit_to_azureml=False,
- compute_cluster_name="")
+ submit_to_azureml=False)
  finally:
  if temp_conda:
  temp_conda.unlink()

diff --git a/environment.yml b/environment.yml
@@ -12,9 +12,9 @@ dependencies:
  - git+https://github.com/analysiscenter/radio.git@6d53e25#egg=radio
  - azure-mgmt-resource==12.1.0
  - azure-mgmt-datafactory==1.1.0
- - azureml-mlflow==1.32.0
- - azureml-sdk==1.32.0
- - azureml-tensorboard==1.32.0
+ - azureml-mlflow==1.36.0
+ - azureml-sdk==1.36.0
+ - azureml-tensorboard==1.36.0
  - conda-merge==0.1.5
  - cryptography==3.3.2
  - dataclasses-json==0.5.2