Skip to content
This repository has been archived by the owner on Mar 21, 2024. It is now read-only.

Commit

Permalink
Switching batch loading time diagnostics to hi-ml (#577)
Browse files Browse the repository at this point in the history
- Using the BatchTimeCallback from hi-ml
- Adds switches to trigger PL profiling
  • Loading branch information
ant0nsc committed Nov 3, 2021
1 parent 8495a2e commit bf4cb62
Show file tree
Hide file tree
Showing 21 changed files with 204 additions and 356 deletions.
2 changes: 1 addition & 1 deletion .flake8
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,4 @@
ignore = E226,E302,E41,W391, E701, W291, E722, W503, E128, E126, E127, E731, E401
max-line-length = 160
max-complexity = 25
exclude = fastMRI/ test_outputs/
exclude = fastMRI/ test_outputs/ hi-ml/
4 changes: 3 additions & 1 deletion .idea/InnerEye-DeepLearning.iml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,9 @@ created.
## Upcoming

### Added
- ([#577](https://github.com/microsoft/InnerEye-DeepLearning/pull/577)) Command-line switch `monitor_gpu` to monitor
GPU utilization via Lightning's `GpuStatsMonitor`, switch `monitor_loading` to check batch loading times via
`BatchTimeCallback`, and `pl_profiler` to turn on the Lightning profiler (`simple`, `advanced`, or `pytorch`).
- ([#544](https://github.com/microsoft/InnerEye-DeepLearning/pull/544)) Add documentation for segmentation model evaluation.
- ([#465](https://github.com/microsoft/InnerEye-DeepLearning/pull/465/)) Adding ability to run segmentation inference
module on test data with partial ground truth files. (Also [522](https://github.com/microsoft/InnerEye-DeepLearning/pull/522).)
Expand Down Expand Up @@ -77,6 +80,8 @@ in inference-only runs when using lightning containers.

### Removed

- ([#577](https://github.com/microsoft/InnerEye-DeepLearning/pull/577)) Removed the monitoring of batch loading time;
use the `BatchTimeCallback` from `hi-ml` instead.
- ([#542](https://github.com/microsoft/InnerEye-DeepLearning/pull/542)) Removed Windows test leg from build pipeline.
- ([#509](https://github.com/microsoft/InnerEye-DeepLearning/pull/509)) Parameters `local_weights_path` and
`weights_url` can no longer be used to initialize a training run, only inference runs.
Expand Down
3 changes: 2 additions & 1 deletion InnerEye/Common/fixed_paths.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,8 @@ def add_submodules_to_path() -> None:
innereye_root = repository_root_directory()
folders_to_add = [(innereye_root, "InnerEye"),
(innereye_root / "fastMRI", "fastmri"),
(innereye_root / "hi-ml" / "src", "health")]
(innereye_root / "hi-ml" / "hi-ml-azure" / "src", "health_azure"),
(innereye_root / "hi-ml" / "hi-ml" / "src", "health_ml")]
for (folder, subfolder_that_must_exist) in folders_to_add:
if (folder / subfolder_that_must_exist).is_dir():
folder_str = str(folder)
Expand Down
7 changes: 0 additions & 7 deletions InnerEye/Common/metrics_constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
# Licensed under the MIT License (MIT). See LICENSE in the repo root for license information.
# ------------------------------------------------------------------------------------------
from enum import Enum, unique

# String prefixes when writing training or validation set metrics to a logger
from typing import Union

Expand Down Expand Up @@ -45,8 +44,6 @@ class LoggingColumns(Enum):
AccuracyAtThreshold05 = "accuracy_at_threshold_05"
Loss = "loss"
CrossEntropy = "cross_entropy"
SecondsPerEpoch = "seconds_per_epoch"
SecondsPerBatch = "seconds_per_batch"
AreaUnderRocCurve = "area_under_roc_curve"
AreaUnderPRCurve = "area_under_pr_curve"
CrossValidationSplitIndex = "cross_validation_split_index"
Expand Down Expand Up @@ -100,8 +97,6 @@ class MetricType(Enum):
EXPLAINED_VAR = "ExplainedVariance"

# Common metrics
SECONDS_PER_BATCH = "SecondsPerBatch"
SECONDS_PER_EPOCH = "SecondsPerEpoch"
SUBJECT_COUNT = "SubjectCount"
LEARNING_RATE = "LearningRate"

Expand All @@ -114,8 +109,6 @@ class MetricType(Enum):
MetricType.LOSS.value: LoggingColumns.Loss,
MetricType.ACCURACY_AT_THRESHOLD_05.value: LoggingColumns.AccuracyAtThreshold05,
MetricType.CROSS_ENTROPY.value: LoggingColumns.CrossEntropy,
MetricType.SECONDS_PER_BATCH.value: LoggingColumns.SecondsPerBatch,
MetricType.SECONDS_PER_EPOCH.value: LoggingColumns.SecondsPerEpoch,
MetricType.AREA_UNDER_ROC_CURVE.value: LoggingColumns.AreaUnderRocCurve,
MetricType.AREA_UNDER_PR_CURVE.value: LoggingColumns.AreaUnderPRCurve,
MetricType.SUBJECT_COUNT.value: LoggingColumns.SubjectCount,
Expand Down
3 changes: 2 additions & 1 deletion InnerEye/Common/type_annotations.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
# Licensed under the MIT License (MIT). See LICENSE in the repo root for license information.
# ------------------------------------------------------------------------------------------
from pathlib import Path
from typing import Dict, Iterable, Optional, Tuple, TypeVar, Union
from typing import Dict, Iterable, List, Optional, Tuple, TypeVar, Union

T = TypeVar('T')
PathOrString = Union[Path, str]
Expand All @@ -15,3 +15,4 @@
TupleFloat9 = Tuple[float, float, float, float, float, float, float, float, float]
IntOrTuple3 = Union[int, TupleInt3, Iterable]
DictStrFloat = Dict[str, float]
DictStrFloatOrFloatList = Dict[str, Union[float, List[float]]]
2 changes: 1 addition & 1 deletion InnerEye/ML/SSL/lightning_containers/ssl_container.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@ class SSLContainer(LightningContainer):

def setup(self) -> None:
from InnerEye.ML.SSL.lightning_containers.ssl_image_classifier import SSLClassifierContainer
self.total_num_gpus = self.num_gpus_per_node * self.num_nodes
self.total_num_gpus = self.num_gpus_per_node() * self.num_nodes
self._load_config()
# If you're using the same data for training and linear head, allow the user to specify the dataset only
# once. Or if you are doing just finetuning of linear head, the user should be able to specify dataset via
Expand Down
13 changes: 11 additions & 2 deletions InnerEye/ML/deep_learning_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -218,7 +218,7 @@ class WorkflowParams(param.Parameterized):
doc="If set, enable/disable full image inference on test set after ensemble training.")
weights_url: List[str] = param.List(default=[], class_=str,
doc="If provided, a set of urls from which checkpoints will be downloaded"
"and used for inference.")
"and used for inference.")
local_weights_path: List[Path] = param.List(default=[], class_=Path,
doc="A list of checkpoints paths to use for inference, "
"when the job is running outside Azure.")
Expand Down Expand Up @@ -590,6 +590,16 @@ class TrainerParams(param.Parameterized):
param.Boolean(default=False,
doc="Controls the PyTorch Lightning flag 'find_unused_parameters' for the DDP plugin. "
"Setting it to True comes with a performance hit.")
monitor_gpu: bool = param.Boolean(default=False,
doc="If True, add the GPUStatsMonitor callback to the Lightning trainer object. "
"This will write GPU utilization metrics every 50 batches by default.")
monitor_loading: bool = param.Boolean(default=True,
doc="If True, add the BatchTimeCallback callback to the Lightning trainer "
"object. This will monitor how long individual batches take to load.")
pl_profiler: Optional[str] = \
param.String(default=None,
doc="The value to use for the 'profiler' argument for the Lightning trainer. "
"Set to either 'simple', 'advanced', or 'pytorch'")

@property
def use_gpu(self) -> bool:
Expand All @@ -602,7 +612,6 @@ def use_gpu(self) -> bool:
from InnerEye.ML.utils.ml_util import is_gpu_available
return is_gpu_available()

@property
def num_gpus_per_node(self) -> int:
"""
Computes the number of gpus to use for each node: either the number of gpus available on the device
Expand Down
Loading

0 comments on commit bf4cb62

Please sign in to comment.