diff --git a/.idea/InnerEye-DeepLearning.iml b/.idea/InnerEye-DeepLearning.iml
index aeaeafd64..76e3fef5c 100644
--- a/.idea/InnerEye-DeepLearning.iml
+++ b/.idea/InnerEye-DeepLearning.iml
@@ -6,6 +6,7 @@
+
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 6b8a423a1..94ef443b2 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -13,6 +13,7 @@ created.
## Upcoming
### Added
+- ([#594](https://github.com/microsoft/InnerEye-DeepLearning/pull/594)) When supplying a "--tag" argument, the AzureML jobs use that value as the display name, to more easily distinguish runs.
- ([#577](https://github.com/microsoft/InnerEye-DeepLearning/pull/577)) Commandline switch `monitor_gpu` to monitor
GPU utilization via Lightning's `GpuStatsMonitor`, switch `monitor_loading` to check batch loading times via
`BatchTimeCallback`, and `pl_profiler` to turn on the Lightning profiler (`simple`, `advanced`, or `pytorch`)
@@ -53,6 +54,7 @@ gets uploaded to AzureML, by skipping all test folders.
- ([#554](https://github.com/microsoft/InnerEye-DeepLearning/pull/554)) Updated report in CovidModel. Set parameters
in the config to run inference on both the validation and test sets by default.
- ([#584](https://github.com/microsoft/InnerEye-DeepLearning/pull/584)) SSL models write the optimizer state for the linear head to the checkpoint now.
+- ([#594](https://github.com/microsoft/InnerEye-DeepLearning/pull/594)) PyTorch is now non-deterministic by default. Upgraded to AzureML SDK 1.36.
- ([#566](https://github.com/microsoft/InnerEye-DeepLearning/pull/566)) Update `hi-ml` dependency to `hi-ml-azure`.
- ([#572](https://github.com/microsoft/InnerEye-DeepLearning/pull/572)) Updated to new version of hi-ml package
diff --git a/InnerEye/ML/dataset/full_image_dataset.py b/InnerEye/ML/dataset/full_image_dataset.py
index 63883c006..f2105b0be 100644
--- a/InnerEye/ML/dataset/full_image_dataset.py
+++ b/InnerEye/ML/dataset/full_image_dataset.py
@@ -7,11 +7,10 @@
from abc import ABC
from collections import Counter
from pathlib import Path
-from typing import Any, Callable, Dict, Generic, List, Optional, TypeVar, Tuple
+from typing import Any, Callable, Dict, Generic, List, Mapping, Optional, Tuple, TypeVar
import pandas as pd
import torch.utils.data
-from torch._six import container_abcs
from torch.utils.data import BatchSampler, DataLoader, Dataset, RandomSampler, Sampler, SequentialSampler
from torch.utils.data.dataloader import default_collate # type: ignore
@@ -36,7 +35,7 @@ def collate_with_metadata(batch: List[Dict[str, Any]]) -> Dict[str, Any]:
:return: collated result
"""
elem = batch[0]
- if isinstance(elem, container_abcs.Mapping):
+ if isinstance(elem, Mapping):
result = dict()
for key in elem:
# Special handling for all fields that store metadata, and for fields that are list.
diff --git a/InnerEye/ML/deep_learning_config.py b/InnerEye/ML/deep_learning_config.py
index a48279a37..6537539bf 100644
--- a/InnerEye/ML/deep_learning_config.py
+++ b/InnerEye/ML/deep_learning_config.py
@@ -582,7 +582,7 @@ class TrainerParams(param.Parameterized):
doc="PyTorch Lightning trainer flag 'num_sanity_val_steps': Number of validation "
"steps to run before training, to identify possible problems")
pl_deterministic: bool = \
- param.Integer(default=True,
+ param.Boolean(default=False,
doc="Controls the PyTorch Lightning trainer flags 'deterministic' and 'benchmark'. If "
"'pl_deterministic' is True, results are perfectly reproducible. If False, they are not, but "
"you may see training speed increases.")
diff --git a/InnerEye/ML/lightning_base.py b/InnerEye/ML/lightning_base.py
index b67810b58..99afcb72c 100644
--- a/InnerEye/ML/lightning_base.py
+++ b/InnerEye/ML/lightning_base.py
@@ -243,9 +243,12 @@ def set_optimizer_and_scheduler(self, config: DeepLearningConfig) -> None:
def configure_optimizers(self) -> Tuple[List[Optimizer], List[_LRScheduler]]:
return [self.optimizer], [self.l_rate_scheduler] # type: ignore
+ @rank_zero_only
def on_fit_end(self) -> None:
"""
- Flushes all logger objects that the present object holds.
+ Flushes all logger objects that the present object holds. This should only be run on rank zero, because
+ otherwise ranks != 0 will create empty log files that can clash with the non-empty log files written on
+ rank 0.
"""
self.train_epoch_metrics_logger.flush()
self.val_epoch_metrics_logger.flush()
diff --git a/InnerEye/ML/runner.py b/InnerEye/ML/runner.py
index 565b1c5e1..fdd88e0cb 100755
--- a/InnerEye/ML/runner.py
+++ b/InnerEye/ML/runner.py
@@ -12,6 +12,7 @@
# Suppress all errors here because the imports after code cause loads of warnings. We can't specifically suppress
# individual warnings only.
# flake8: noqa
+
# Workaround for an issue with how AzureML and Pytorch Lightning interact: When spawning additional processes for DDP,
# the working directory is not correctly picked up in sys.path
print(f"Starting InnerEye runner at {sys.argv[0]}")
@@ -26,6 +27,7 @@
fixed_paths.add_submodules_to_path()
from azureml._base_sdk_common import user_agent
+from azureml._restclient.constants import RunStatus
from azureml.core import Run, ScriptRunConfig
from health_azure import AzureRunInfo, submit_to_azure_if_needed
from health_azure.utils import create_run_recovery_id, is_global_rank_zero, is_local_rank_zero, merge_conda_files, \
@@ -271,15 +273,18 @@ def after_submission_hook(azure_run: Run) -> None:
f"InnerEye/Azure/tensorboard_monitor.py --run_ids={azure_run.id}")
if self.azure_config.wait_for_completion:
- # We want the job output to be visible on the console, but the program should not exit if the
- # job fails because we need to download the pytest result file.
+ # We want the job output to be visible on the console. Do not exit yet if the job fails, because we
+ # may need to download the pytest result file.
azure_run.wait_for_completion(show_output=True, raise_on_error=False)
- if self.azure_config.pytest_mark and self.azure_config.wait_for_completion:
- # The AzureML job can optionally run pytest. Attempt to download it to the current directory.
- # A build step will pick up that file and publish it to Azure DevOps.
- # If pytest_mark is set, this file must exist.
- logging.info("Downloading pytest result file.")
- download_pytest_result(azure_run)
+ if self.azure_config.pytest_mark:
+ # The AzureML job can optionally run pytest. Attempt to download it to the current directory.
+ # A build step will pick up that file and publish it to Azure DevOps.
+ # If pytest_mark is set, this file must exist.
+ logging.info("Downloading pytest result file.")
+ download_pytest_result(azure_run)
+ if azure_run.status == RunStatus.FAILED:
+ raise ValueError(f"The AzureML run failed. Please check this URL for details: "
+ f"{azure_run.get_portal_url()}")
hyperdrive_config = None
if self.azure_config.hyperdrive:
@@ -326,12 +331,14 @@ def after_submission_hook(azure_run: Run) -> None:
commandline_args=" ".join(source_config.script_params)),
after_submission=after_submission_hook,
hyperdrive_config=hyperdrive_config)
+ # Set the default display name to what was provided as the "tag"
+ if self.azure_config.tag:
+ azure_run_info.run.display_name = self.azure_config.tag
else:
# compute_cluster_name is a required parameter in early versions of the HI-ML package
azure_run_info = submit_to_azure_if_needed(
input_datasets=input_datasets,
- submit_to_azureml=False,
- compute_cluster_name="")
+ submit_to_azureml=False)
finally:
if temp_conda:
temp_conda.unlink()
diff --git a/environment.yml b/environment.yml
index 108c736f7..b63c5d1f0 100644
--- a/environment.yml
+++ b/environment.yml
@@ -12,9 +12,9 @@ dependencies:
- git+https://github.com/analysiscenter/radio.git@6d53e25#egg=radio
- azure-mgmt-resource==12.1.0
- azure-mgmt-datafactory==1.1.0
- - azureml-mlflow==1.32.0
- - azureml-sdk==1.32.0
- - azureml-tensorboard==1.32.0
+ - azureml-mlflow==1.36.0
+ - azureml-sdk==1.36.0
+ - azureml-tensorboard==1.36.0
- conda-merge==0.1.5
- cryptography==3.3.2
- dataclasses-json==0.5.2