Skip to content
This repository has been archived by the owner on Mar 21, 2024. It is now read-only.

Commit

Permalink
Switching batch loading time diagnostics to hi-ml (#577)
Browse files Browse the repository at this point in the history
- Using the BatchTimeCallback from hi-ml
- Adds switches to trigger PL profiling
  • Loading branch information
ant0nsc committed Nov 3, 2021
1 parent 8495a2e commit bf4cb62
Show file tree
Hide file tree
Showing 21 changed files with 204 additions and 356 deletions.
2 changes: 1 addition & 1 deletion .flake8
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,4 @@
ignore = E226,E302,E41,W391, E701, W291, E722, W503, E128, E126, E127, E731, E401
max-line-length = 160
max-complexity = 25
exclude = fastMRI/ test_outputs/
exclude = fastMRI/ test_outputs/ hi-ml/
4 changes: 3 additions & 1 deletion .idea/InnerEye-DeepLearning.iml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,9 @@ created.
## Upcoming

### Added
- ([#577](https://github.com/microsoft/InnerEye-DeepLearning/pull/577)) Command-line switch `monitor_gpu` to monitor
GPU utilization via Lightning's `GpuStatsMonitor`, switch `monitor_loading` to check batch loading times via
`BatchTimeCallback`, and `pl_profiler` to turn on the Lightning profiler (`simple`, `advanced`, or `pytorch`).
- ([#544](https://github.com/microsoft/InnerEye-DeepLearning/pull/544)) Add documentation for segmentation model evaluation.
- ([#465](https://github.com/microsoft/InnerEye-DeepLearning/pull/465/)) Adding ability to run segmentation inference
module on test data with partial ground truth files. (Also [522](https://github.com/microsoft/InnerEye-DeepLearning/pull/522).)
Expand Down Expand Up @@ -77,6 +80,8 @@ in inference-only runs when using lightning containers.

### Removed

- ([#577](https://github.com/microsoft/InnerEye-DeepLearning/pull/577)) Removed the monitoring of batch loading time;
use the `BatchTimeCallback` from `hi-ml` instead.
- ([#542](https://github.com/microsoft/InnerEye-DeepLearning/pull/542)) Removed Windows test leg from build pipeline.
- ([#509](https://github.com/microsoft/InnerEye-DeepLearning/pull/509)) Parameters `local_weights_path` and
`weights_url` can no longer be used to initialize a training run, only inference runs.
Expand Down
3 changes: 2 additions & 1 deletion InnerEye/Common/fixed_paths.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,8 @@ def add_submodules_to_path() -> None:
innereye_root = repository_root_directory()
folders_to_add = [(innereye_root, "InnerEye"),
(innereye_root / "fastMRI", "fastmri"),
(innereye_root / "hi-ml" / "src", "health")]
(innereye_root / "hi-ml" / "hi-ml-azure" / "src", "health_azure"),
(innereye_root / "hi-ml" / "hi-ml" / "src", "health_ml")]
for (folder, subfolder_that_must_exist) in folders_to_add:
if (folder / subfolder_that_must_exist).is_dir():
folder_str = str(folder)
Expand Down
7 changes: 0 additions & 7 deletions InnerEye/Common/metrics_constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
# Licensed under the MIT License (MIT). See LICENSE in the repo root for license information.
# ------------------------------------------------------------------------------------------
from enum import Enum, unique

# String prefixes when writing training or validation set metrics to a logger
from typing import Union

Expand Down Expand Up @@ -45,8 +44,6 @@ class LoggingColumns(Enum):
AccuracyAtThreshold05 = "accuracy_at_threshold_05"
Loss = "loss"
CrossEntropy = "cross_entropy"
SecondsPerEpoch = "seconds_per_epoch"
SecondsPerBatch = "seconds_per_batch"
AreaUnderRocCurve = "area_under_roc_curve"
AreaUnderPRCurve = "area_under_pr_curve"
CrossValidationSplitIndex = "cross_validation_split_index"
Expand Down Expand Up @@ -100,8 +97,6 @@ class MetricType(Enum):
EXPLAINED_VAR = "ExplainedVariance"

# Common metrics
SECONDS_PER_BATCH = "SecondsPerBatch"
SECONDS_PER_EPOCH = "SecondsPerEpoch"
SUBJECT_COUNT = "SubjectCount"
LEARNING_RATE = "LearningRate"

Expand All @@ -114,8 +109,6 @@ class MetricType(Enum):
MetricType.LOSS.value: LoggingColumns.Loss,
MetricType.ACCURACY_AT_THRESHOLD_05.value: LoggingColumns.AccuracyAtThreshold05,
MetricType.CROSS_ENTROPY.value: LoggingColumns.CrossEntropy,
MetricType.SECONDS_PER_BATCH.value: LoggingColumns.SecondsPerBatch,
MetricType.SECONDS_PER_EPOCH.value: LoggingColumns.SecondsPerEpoch,
MetricType.AREA_UNDER_ROC_CURVE.value: LoggingColumns.AreaUnderRocCurve,
MetricType.AREA_UNDER_PR_CURVE.value: LoggingColumns.AreaUnderPRCurve,
MetricType.SUBJECT_COUNT.value: LoggingColumns.SubjectCount,
Expand Down
3 changes: 2 additions & 1 deletion InnerEye/Common/type_annotations.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
# Licensed under the MIT License (MIT). See LICENSE in the repo root for license information.
# ------------------------------------------------------------------------------------------
from pathlib import Path
from typing import Dict, Iterable, Optional, Tuple, TypeVar, Union
from typing import Dict, Iterable, List, Optional, Tuple, TypeVar, Union

T = TypeVar('T')
PathOrString = Union[Path, str]
Expand All @@ -15,3 +15,4 @@
TupleFloat9 = Tuple[float, float, float, float, float, float, float, float, float]
IntOrTuple3 = Union[int, TupleInt3, Iterable]
DictStrFloat = Dict[str, float]
DictStrFloatOrFloatList = Dict[str, Union[float, List[float]]]
2 changes: 1 addition & 1 deletion InnerEye/ML/SSL/lightning_containers/ssl_container.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@ class SSLContainer(LightningContainer):

def setup(self) -> None:
from InnerEye.ML.SSL.lightning_containers.ssl_image_classifier import SSLClassifierContainer
self.total_num_gpus = self.num_gpus_per_node * self.num_nodes
self.total_num_gpus = self.num_gpus_per_node() * self.num_nodes
self._load_config()
# If you're using the same data for training and linear head, allow the user to specify the dataset only
# once. Or if you are doing just finetuning of linear head, the user should be able to specify dataset via
Expand Down
13 changes: 11 additions & 2 deletions InnerEye/ML/deep_learning_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -218,7 +218,7 @@ class WorkflowParams(param.Parameterized):
doc="If set, enable/disable full image inference on test set after ensemble training.")
weights_url: List[str] = param.List(default=[], class_=str,
doc="If provided, a set of urls from which checkpoints will be downloaded"
"and used for inference.")
"and used for inference.")
local_weights_path: List[Path] = param.List(default=[], class_=Path,
doc="A list of checkpoints paths to use for inference, "
"when the job is running outside Azure.")
Expand Down Expand Up @@ -590,6 +590,16 @@ class TrainerParams(param.Parameterized):
param.Boolean(default=False,
doc="Controls the PyTorch Lightning flag 'find_unused_parameters' for the DDP plugin. "
"Setting it to True comes with a performance hit.")
monitor_gpu: bool = param.Boolean(default=False,
doc="If True, add the GPUStatsMonitor callback to the Lightning trainer object. "
"This will write GPU utilization metrics every 50 batches by default.")
monitor_loading: bool = param.Boolean(default=True,
doc="If True, add the BatchTimeCallback callback to the Lightning trainer "
"object. This will monitor how long individual batches take to load.")
pl_profiler: Optional[str] = \
param.String(default=None,
doc="The value to use for the 'profiler' argument for the Lightning trainer. "
"Set to either 'simple', 'advanced', or 'pytorch'")

@property
def use_gpu(self) -> bool:
Expand All @@ -602,7 +612,6 @@ def use_gpu(self) -> bool:
from InnerEye.ML.utils.ml_util import is_gpu_available
return is_gpu_available()

@property
def num_gpus_per_node(self) -> int:
"""
Computes the number of gpus to use for each node: either the number of gpus available on the device
Expand Down
Loading

0 comments on commit bf4cb62

Please sign in to comment.