diff --git a/.idea/InnerEye-DeepLearning.iml b/.idea/InnerEye-DeepLearning.iml index aeaeafd64..76e3fef5c 100644 --- a/.idea/InnerEye-DeepLearning.iml +++ b/.idea/InnerEye-DeepLearning.iml @@ -6,6 +6,7 @@ + diff --git a/CHANGELOG.md b/CHANGELOG.md index 6b8a423a1..94ef443b2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,6 +13,7 @@ created. ## Upcoming ### Added +- ([#594](https://github.com/microsoft/InnerEye-DeepLearning/pull/594)) When supplying a "--tag" argument, the AzureML jobs use that value as the display name, to more easily distinguish runs. - ([#577](https://github.com/microsoft/InnerEye-DeepLearning/pull/577)) Commandline switch `monitor_gpu` to monitor GPU utilization via Lightning's `GpuStatsMonitor`, switch `monitor_loading` to check batch loading times via `BatchTimeCallback`, and `pl_profiler` to turn on the Lightning profiler (`simple`, `advanced`, or `pytorch`) @@ -53,6 +54,7 @@ gets uploaded to AzureML, by skipping all test folders. - ([#554](https://github.com/microsoft/InnerEye-DeepLearning/pull/554)) Updated report in CovidModel. Set parameters in the config to run inference on both the validation and test sets by default. - ([#584](https://github.com/microsoft/InnerEye-DeepLearning/pull/584)) SSL models write the optimizer state for the linear head to the checkpoint now. +- ([#594](https://github.com/microsoft/InnerEye-DeepLearning/pull/594)) Pytorch is now non-deterministic by default. Upgrade to AzureML-SDK 1.36 - ([#566](https://github.com/microsoft/InnerEye-DeepLearning/pull/566)) Update `hi-ml` dependency to `hi-ml-azure`. 
- ([#572](https://github.com/microsoft/InnerEye-DeepLearning/pull/572)) Updated to new version of hi-ml package diff --git a/InnerEye/ML/dataset/full_image_dataset.py b/InnerEye/ML/dataset/full_image_dataset.py index 63883c006..f2105b0be 100644 --- a/InnerEye/ML/dataset/full_image_dataset.py +++ b/InnerEye/ML/dataset/full_image_dataset.py @@ -7,11 +7,10 @@ from abc import ABC from collections import Counter from pathlib import Path -from typing import Any, Callable, Dict, Generic, List, Optional, TypeVar, Tuple +from typing import Any, Callable, Dict, Generic, List, Mapping, Optional, Tuple, TypeVar import pandas as pd import torch.utils.data -from torch._six import container_abcs from torch.utils.data import BatchSampler, DataLoader, Dataset, RandomSampler, Sampler, SequentialSampler from torch.utils.data.dataloader import default_collate # type: ignore @@ -36,7 +35,7 @@ def collate_with_metadata(batch: List[Dict[str, Any]]) -> Dict[str, Any]: :return: collated result """ elem = batch[0] - if isinstance(elem, container_abcs.Mapping): + if isinstance(elem, Mapping): result = dict() for key in elem: # Special handling for all fields that store metadata, and for fields that are list. diff --git a/InnerEye/ML/deep_learning_config.py b/InnerEye/ML/deep_learning_config.py index a48279a37..6537539bf 100644 --- a/InnerEye/ML/deep_learning_config.py +++ b/InnerEye/ML/deep_learning_config.py @@ -582,7 +582,7 @@ class TrainerParams(param.Parameterized): doc="PyTorch Lightning trainer flag 'num_sanity_val_steps': Number of validation " "steps to run before training, to identify possible problems") pl_deterministic: bool = \ - param.Integer(default=True, + param.Boolean(default=False, doc="Controls the PyTorch Lightning trainer flags 'deterministic' and 'benchmark'. If " "'pl_deterministic' is True, results are perfectly reproducible. 
If False, they are not, but " "you may see training speed increases.") diff --git a/InnerEye/ML/lightning_base.py b/InnerEye/ML/lightning_base.py index b67810b58..99afcb72c 100644 --- a/InnerEye/ML/lightning_base.py +++ b/InnerEye/ML/lightning_base.py @@ -243,9 +243,12 @@ def set_optimizer_and_scheduler(self, config: DeepLearningConfig) -> None: def configure_optimizers(self) -> Tuple[List[Optimizer], List[_LRScheduler]]: return [self.optimizer], [self.l_rate_scheduler] # type: ignore + @rank_zero_only def on_fit_end(self) -> None: """ - Flushes all logger objects that the present object holds. + Flushes all logger objects that the present object holds. This should only be run on rank zero, because + otherwise ranks != 0 will create empty log files that can clash with the non-empty log files written on + rank 0. """ self.train_epoch_metrics_logger.flush() self.val_epoch_metrics_logger.flush() diff --git a/InnerEye/ML/runner.py b/InnerEye/ML/runner.py index 565b1c5e1..fdd88e0cb 100755 --- a/InnerEye/ML/runner.py +++ b/InnerEye/ML/runner.py @@ -12,6 +12,7 @@ # Suppress all errors here because the imports after code cause loads of warnings. We can't specifically suppress # individual warnings only. 
# flake8: noqa + # Workaround for an issue with how AzureML and Pytorch Lightning interact: When spawning additional processes for DDP, # the working directory is not correctly picked up in sys.path print(f"Starting InnerEye runner at {sys.argv[0]}") @@ -26,6 +27,7 @@ fixed_paths.add_submodules_to_path() from azureml._base_sdk_common import user_agent +from azureml._restclient.constants import RunStatus from azureml.core import Run, ScriptRunConfig from health_azure import AzureRunInfo, submit_to_azure_if_needed from health_azure.utils import create_run_recovery_id, is_global_rank_zero, is_local_rank_zero, merge_conda_files, \ @@ -271,15 +273,18 @@ def after_submission_hook(azure_run: Run) -> None: f"InnerEye/Azure/tensorboard_monitor.py --run_ids={azure_run.id}") if self.azure_config.wait_for_completion: - # We want the job output to be visible on the console, but the program should not exit if the - # job fails because we need to download the pytest result file. + # We want the job output to be visible on the console. Do not exit yet if the job fails, because we + # may need to download the pytest result file. azure_run.wait_for_completion(show_output=True, raise_on_error=False) - if self.azure_config.pytest_mark and self.azure_config.wait_for_completion: - # The AzureML job can optionally run pytest. Attempt to download it to the current directory. - # A build step will pick up that file and publish it to Azure DevOps. - # If pytest_mark is set, this file must exist. - logging.info("Downloading pytest result file.") - download_pytest_result(azure_run) + if self.azure_config.pytest_mark: + # The AzureML job can optionally run pytest. Attempt to download it to the current directory. + # A build step will pick up that file and publish it to Azure DevOps. + # If pytest_mark is set, this file must exist. 
+ logging.info("Downloading pytest result file.") + download_pytest_result(azure_run) + if azure_run.status == RunStatus.FAILED: + raise ValueError(f"The AzureML run failed. Please check this URL for details: " + f"{azure_run.get_portal_url()}") hyperdrive_config = None if self.azure_config.hyperdrive: @@ -326,12 +331,14 @@ def after_submission_hook(azure_run: Run) -> None: commandline_args=" ".join(source_config.script_params)), after_submission=after_submission_hook, hyperdrive_config=hyperdrive_config) + # Set the default display name to what was provided as the "tag" + if self.azure_config.tag: + azure_run_info.run.display_name = self.azure_config.tag else: # compute_cluster_name is a required parameter in early versions of the HI-ML package azure_run_info = submit_to_azure_if_needed( input_datasets=input_datasets, - submit_to_azureml=False, - compute_cluster_name="") + submit_to_azureml=False) finally: if temp_conda: temp_conda.unlink() diff --git a/environment.yml b/environment.yml index 108c736f7..b63c5d1f0 100644 --- a/environment.yml +++ b/environment.yml @@ -12,9 +12,9 @@ dependencies: - git+https://github.com/analysiscenter/radio.git@6d53e25#egg=radio - azure-mgmt-resource==12.1.0 - azure-mgmt-datafactory==1.1.0 - - azureml-mlflow==1.32.0 - - azureml-sdk==1.32.0 - - azureml-tensorboard==1.32.0 + - azureml-mlflow==1.36.0 + - azureml-sdk==1.36.0 + - azureml-tensorboard==1.36.0 - conda-merge==0.1.5 - cryptography==3.3.2 - dataclasses-json==0.5.2