Skip to content
This repository has been archived by the owner on Mar 21, 2024. It is now read-only.

Commit

Permalink
Make pytorch run non-deterministically by default, upgrade to AML SDK…
Browse files Browse the repository at this point in the history
… 1.36 (#594)
  • Loading branch information
ant0nsc committed Nov 22, 2021
1 parent b96afc3 commit 8712267
Show file tree
Hide file tree
Showing 7 changed files with 30 additions and 18 deletions.
1 change: 1 addition & 0 deletions .idea/InnerEye-DeepLearning.iml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ created.
## Upcoming

### Added
- ([#594](https://github.com/microsoft/InnerEye-DeepLearning/pull/594)) When supplying a "--tag" argument, the AzureML jobs use that value as the display name, to more easily distinguish runs.
- ([#577](https://github.com/microsoft/InnerEye-DeepLearning/pull/577)) Commandline switch `monitor_gpu` to monitor
GPU utilization via Lightning's `GpuStatsMonitor`, switch `monitor_loading` to check batch loading times via
`BatchTimeCallback`, and `pl_profiler` to turn on the Lightning profiler (`simple`, `advanced`, or `pytorch`)
Expand Down Expand Up @@ -53,6 +54,7 @@ gets uploaded to AzureML, by skipping all test folders.
- ([#554](https://github.com/microsoft/InnerEye-DeepLearning/pull/554)) Updated report in CovidModel. Set parameters
in the config to run inference on both the validation and test sets by default.
- ([#584](https://github.com/microsoft/InnerEye-DeepLearning/pull/584)) SSL models write the optimizer state for the linear head to the checkpoint now.
- ([#594](https://github.com/microsoft/InnerEye-DeepLearning/pull/594)) PyTorch is now non-deterministic by default. Upgrade to AzureML-SDK 1.36.
- ([#566](https://github.com/microsoft/InnerEye-DeepLearning/pull/566)) Update `hi-ml` dependency to `hi-ml-azure`.
- ([#572](https://github.com/microsoft/InnerEye-DeepLearning/pull/572)) Updated to new version of hi-ml package

Expand Down
5 changes: 2 additions & 3 deletions InnerEye/ML/dataset/full_image_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,10 @@
from abc import ABC
from collections import Counter
from pathlib import Path
from typing import Any, Callable, Dict, Generic, List, Optional, TypeVar, Tuple
from typing import Any, Callable, Dict, Generic, List, Mapping, Optional, Tuple, TypeVar

import pandas as pd
import torch.utils.data
from torch._six import container_abcs
from torch.utils.data import BatchSampler, DataLoader, Dataset, RandomSampler, Sampler, SequentialSampler
from torch.utils.data.dataloader import default_collate # type: ignore

Expand All @@ -36,7 +35,7 @@ def collate_with_metadata(batch: List[Dict[str, Any]]) -> Dict[str, Any]:
:return: collated result
"""
elem = batch[0]
if isinstance(elem, container_abcs.Mapping):
if isinstance(elem, Mapping):
result = dict()
for key in elem:
# Special handling for all fields that store metadata, and for fields that are lists.
Expand Down
2 changes: 1 addition & 1 deletion InnerEye/ML/deep_learning_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -582,7 +582,7 @@ class TrainerParams(param.Parameterized):
doc="PyTorch Lightning trainer flag 'num_sanity_val_steps': Number of validation "
"steps to run before training, to identify possible problems")
pl_deterministic: bool = \
param.Integer(default=True,
param.Boolean(default=False,
doc="Controls the PyTorch Lightning trainer flags 'deterministic' and 'benchmark'. If "
"'pl_deterministic' is True, results are perfectly reproducible. If False, they are not, but "
"you may see training speed increases.")
Expand Down
5 changes: 4 additions & 1 deletion InnerEye/ML/lightning_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -243,9 +243,12 @@ def set_optimizer_and_scheduler(self, config: DeepLearningConfig) -> None:
def configure_optimizers(self) -> Tuple[List[Optimizer], List[_LRScheduler]]:
return [self.optimizer], [self.l_rate_scheduler] # type: ignore

@rank_zero_only
def on_fit_end(self) -> None:
"""
Flushes all logger objects that the present object holds.
Flushes all logger objects that the present object holds. This should only be run on rank zero, because
otherwise ranks != 0 will create empty log files that can clash with the non-empty log files written on
rank 0.
"""
self.train_epoch_metrics_logger.flush()
self.val_epoch_metrics_logger.flush()
Expand Down
27 changes: 17 additions & 10 deletions InnerEye/ML/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
# Suppress all errors here because the imports after code cause loads of warnings. We can't specifically suppress
# individual warnings only.
# flake8: noqa

# Workaround for an issue with how AzureML and Pytorch Lightning interact: When spawning additional processes for DDP,
# the working directory is not correctly picked up in sys.path
print(f"Starting InnerEye runner at {sys.argv[0]}")
Expand All @@ -26,6 +27,7 @@
fixed_paths.add_submodules_to_path()

from azureml._base_sdk_common import user_agent
from azureml._restclient.constants import RunStatus
from azureml.core import Run, ScriptRunConfig
from health_azure import AzureRunInfo, submit_to_azure_if_needed
from health_azure.utils import create_run_recovery_id, is_global_rank_zero, is_local_rank_zero, merge_conda_files, \
Expand Down Expand Up @@ -271,15 +273,18 @@ def after_submission_hook(azure_run: Run) -> None:
f"InnerEye/Azure/tensorboard_monitor.py --run_ids={azure_run.id}")

if self.azure_config.wait_for_completion:
# We want the job output to be visible on the console, but the program should not exit if the
# job fails because we need to download the pytest result file.
# We want the job output to be visible on the console. Do not exit yet if the job fails, because we
# may need to download the pytest result file.
azure_run.wait_for_completion(show_output=True, raise_on_error=False)
if self.azure_config.pytest_mark and self.azure_config.wait_for_completion:
# The AzureML job can optionally run pytest. Attempt to download it to the current directory.
# A build step will pick up that file and publish it to Azure DevOps.
# If pytest_mark is set, this file must exist.
logging.info("Downloading pytest result file.")
download_pytest_result(azure_run)
if self.azure_config.pytest_mark:
# The AzureML job can optionally run pytest. Attempt to download it to the current directory.
# A build step will pick up that file and publish it to Azure DevOps.
# If pytest_mark is set, this file must exist.
logging.info("Downloading pytest result file.")
download_pytest_result(azure_run)
if azure_run.status == RunStatus.FAILED:
raise ValueError(f"The AzureML run failed. Please check this URL for details: "
f"{azure_run.get_portal_url()}")

hyperdrive_config = None
if self.azure_config.hyperdrive:
Expand Down Expand Up @@ -326,12 +331,14 @@ def after_submission_hook(azure_run: Run) -> None:
commandline_args=" ".join(source_config.script_params)),
after_submission=after_submission_hook,
hyperdrive_config=hyperdrive_config)
# Set the default display name to what was provided as the "tag"
if self.azure_config.tag:
azure_run_info.run.display_name = self.azure_config.tag
else:
# compute_cluster_name is a required parameter in early versions of the HI-ML package
azure_run_info = submit_to_azure_if_needed(
input_datasets=input_datasets,
submit_to_azureml=False,
compute_cluster_name="")
submit_to_azureml=False)
finally:
if temp_conda:
temp_conda.unlink()
Expand Down
6 changes: 3 additions & 3 deletions environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,9 @@ dependencies:
- git+https://github.com/analysiscenter/radio.git@6d53e25#egg=radio
- azure-mgmt-resource==12.1.0
- azure-mgmt-datafactory==1.1.0
- azureml-mlflow==1.32.0
- azureml-sdk==1.32.0
- azureml-tensorboard==1.32.0
- azureml-mlflow==1.36.0
- azureml-sdk==1.36.0
- azureml-tensorboard==1.36.0
- conda-merge==0.1.5
- cryptography==3.3.2
- dataclasses-json==0.5.2
Expand Down

0 comments on commit 8712267

Please sign in to comment.