ENH: Upgrading package versions for security patches (#757)
* 🚧 💥 Update to secure versions of packages

* ⬆️ Upgrade hi-ml version

* 🏷️ Fix mypy and update windows env

* ✅ Update current epoch value in lightning tests

* 🐛 Fix windows line endings

* ✅ Remove checkpoint load epoch check

* 📌 Lock env anew

* 🎨 🐛 Add merge_conda_files() to common_util

* ✅ Add logging to tests

* 🚧 Update VarNetWithImageLogging logger syntax

* ✅ Fix Train2Nodes tests

* ✅ Remove cwd change, update CIFAR SSL metrics

* ⚰️ Remove unnecessary PL backwards compatibility

* 📌 Lock env

* 📌 Upgrade to hi-ml v0.2.5

* 📌 Testing lightning 1.6.5

* ♻️ Resolve PR comments
peterhessey committed Sep 14, 2022
1 parent 7894498 commit 5b21840
Showing 15 changed files with 210 additions and 142 deletions.
81 changes: 80 additions & 1 deletion InnerEye/Common/common_util.py
@@ -12,7 +12,14 @@
 from enum import Enum
 from functools import wraps
 from pathlib import Path
-from typing import Any, Callable, Generator, Iterable, List, Optional, Union
+from typing import Any, Callable, Dict, Generator, Iterable, List, Optional, Union
 
+import conda_merge
+import ruamel.yaml
+from health_azure.utils import (
+    CONDA_CHANNELS, CONDA_DEPENDENCIES, CONDA_NAME, CONDA_PIP, CondaDependencies, PinnedOperator,
+    _log_conda_dependencies_stats, _retrieve_unique_deps, is_conda_file_with_pip_include, is_pip_include_dependency
+)
+
 from InnerEye.Common.fixed_paths import repository_root_directory
 from InnerEye.Common.type_annotations import PathOrString
@@ -427,3 +434,75 @@ def change_working_directory(path_or_str: PathOrString) -> Generator:
     os.chdir(new_path)
     yield
     os.chdir(old_path)
+
+
+def merge_conda_files(
+    conda_files: List[Path],
+    result_file: Path,
+    pip_files: Optional[List[Path]] = None,
+) -> None:
+    """
+    Merges the given Conda environment files using the conda_merge package, optionally adds any
+    dependencies from pip requirements files, and writes the merged file to disk.
+    :param conda_files: The Conda environment files to read.
+    :param result_file: The location where the merge results should be written.
+    :param pip_files: An optional list of one or more pip requirements files including extra dependencies.
+    """
+    env_definitions: List[Any] = []
+    for file in conda_files:
+        _, pip_without_include = is_conda_file_with_pip_include(file)
+        env_definitions.append(pip_without_include)
+    unified_definition = {}
+
+    extra_pip_deps = []
+    for pip_file in pip_files or []:
+        additional_pip_deps = [d for d in pip_file.read_text().split("\n") if d and not is_pip_include_dependency(d)]
+        extra_pip_deps.extend(additional_pip_deps)
+
+    name = conda_merge.merge_names(env.get(CONDA_NAME) for env in env_definitions)
+    if name:
+        unified_definition[CONDA_NAME] = name
+
+    try:
+        channels = conda_merge.merge_channels(env.get(CONDA_CHANNELS) for env in env_definitions)
+    except conda_merge.MergeError:
+        logging.error("Failed to merge channel priorities.")
+        raise
+    if channels:
+        unified_definition[CONDA_CHANNELS] = channels
+
+    try:
+        deps_to_merge = [env.get(CONDA_DEPENDENCIES) for env in env_definitions]
+        if len(extra_pip_deps) > 0:
+            deps_to_merge.append([{CONDA_PIP: extra_pip_deps}])
+        deps = conda_merge.merge_dependencies(deps_to_merge)
+
+        # Get conda dependencies and pip dependencies from specification
+        pip_deps_entries = [d for d in deps if isinstance(d, dict) and CONDA_PIP in d]  # type: ignore
+        if len(pip_deps_entries) == 0:
+            raise ValueError("Didn't find a dictionary with the key 'pip' in the list of dependencies")
+        pip_deps_entry: Dict[str, List[str]] = pip_deps_entries[0]
+        pip_deps = pip_deps_entry[CONDA_PIP]
+        # temporarily remove pip dependencies from deps, to be added back after deduplication
+        deps.remove(pip_deps_entry)
+
+        # remove all non-pip duplicates from the list of dependencies
+        unique_deps = _retrieve_unique_deps(deps, PinnedOperator.CONDA)
+
+        unique_pip_deps = sorted(_retrieve_unique_deps(pip_deps, PinnedOperator.PIP))
+
+        # finally add back the deduplicated list of dependencies
+        unique_deps.append({CONDA_PIP: unique_pip_deps})  # type: ignore
+
+    except conda_merge.MergeError:
+        logging.error("Failed to merge dependencies.")
+        raise
+    if unique_deps:
+        unified_definition[CONDA_DEPENDENCIES] = unique_deps
+    else:
+        raise ValueError("No dependencies found in any of the conda files.")
+
+    with result_file.open("w") as f:
+        ruamel.yaml.dump(unified_definition, f, indent=2, default_flow_style=False)
+    _log_conda_dependencies_stats(CondaDependencies(result_file), "Merged Conda environment")
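A minimal usage sketch of the new helper; the file names below are hypothetical, but the signature is the one added above:

```python
from pathlib import Path

from InnerEye.Common.common_util import merge_conda_files

# Hypothetical inputs: two Conda environment files plus an extra pip requirements file.
conda_files = [Path("environment.yml"), Path("environment-extra.yml")]
pip_files = [Path("requirements-extra.txt")]

# Merges the environments, de-duplicates conda and pip dependencies, and writes
# the result to merged_environment.yml.
merge_conda_files(conda_files, result_file=Path("merged_environment.yml"), pip_files=pip_files)
```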
12 changes: 3 additions & 9 deletions InnerEye/ML/SSL/lightning_modules/ssl_online_evaluator.py
@@ -97,16 +97,10 @@ def on_pretrain_routine_start(self, trainer: pl.Trainer, pl_module: pl.Lightning
                                       p=self.drop_p,
                                       n_hidden=self.hidden_dim)
         self.evaluator.to(pl_module.device)
-        if hasattr(trainer, "accelerator_connector"):
-            # This works with Lightning 1.3.8
-            accelerator = trainer.accelerator_connector
-        elif hasattr(trainer, "_accelerator_connector"):
-            # This works with Lightning 1.5.5
-            accelerator = trainer._accelerator_connector
-        else:
-            raise ValueError("Unable to retrieve the accelerator information")
+        accelerator = trainer._accelerator_connector
+
         if accelerator.is_distributed:
-            if accelerator.use_ddp:
+            if accelerator.strategy.strategy_name == "ddp":
                 self.evaluator = SyncBatchNorm.convert_sync_batchnorm(self.evaluator)
                 self.evaluator = DistributedDataParallel(self.evaluator, device_ids=[pl_module.device])  # type: ignore
         else:
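With the Lightning 1.3/1.5 fallbacks removed, the evaluator setup reduces to the 1.6-style connector and strategy check. A condensed sketch, assuming a trainer built with PyTorch Lightning 1.6.5 (the helper name is illustrative):

```python
import pytorch_lightning as pl
import torch
from torch.nn import SyncBatchNorm
from torch.nn.parallel import DistributedDataParallel

def wrap_evaluator(trainer: pl.Trainer, evaluator: torch.nn.Module, device: torch.device) -> torch.nn.Module:
    # PL 1.6 exposes the connector only under its private name.
    accelerator = trainer._accelerator_connector
    if accelerator.is_distributed:
        # strategy_name replaces the use_ddp flag that older PL versions offered.
        if accelerator.strategy.strategy_name == "ddp":
            evaluator = SyncBatchNorm.convert_sync_batchnorm(evaluator)
            evaluator = DistributedDataParallel(evaluator, device_ids=[device])
    return evaluator
```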
4 changes: 2 additions & 2 deletions InnerEye/ML/configs/other/fastmri_varnet.py
@@ -32,8 +32,8 @@ class VarNetWithImageLogging(VarNetModule):
"""

def log_image(self, name: str, image: torch.Tensor) -> None:
experiments = self.logger.experiment if isinstance(self.logger.experiment, list) \
else [self.logger.experiment]
experiments = self.loggers[0].experiment if isinstance(self.loggers[0].experiment, list) \
else [self.loggers[0].experiment]
for experiment in experiments:
if isinstance(experiment, SummaryWriter):
experiment.add_image(name, image, global_step=self.global_step)
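This change follows the PyTorch Lightning 1.6 logger API, where `LightningModule.loggers` is the list-valued accessor. A self-contained restatement of the pattern, assuming a TensorBoard `SummaryWriter` experiment (the class name is hypothetical):

```python
import torch
from pytorch_lightning import LightningModule
from torch.utils.tensorboard import SummaryWriter

class ImageLoggingModule(LightningModule):
    """Hypothetical module showing the PL 1.6 multi-logger access pattern."""

    def log_image(self, name: str, image: torch.Tensor) -> None:
        # self.loggers replaces self.logger for multi-logger setups in PL 1.6;
        # a logger's `experiment` may itself be a list, so normalise it first.
        experiment = self.loggers[0].experiment
        experiments = experiment if isinstance(experiment, list) else [experiment]
        for experiment in experiments:
            if isinstance(experiment, SummaryWriter):
                experiment.add_image(name, image, global_step=self.global_step)
```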
2 changes: 1 addition & 1 deletion InnerEye/ML/lightning_base.py
@@ -289,7 +289,7 @@ def on_train_end(self) -> None:
         This hook is called at the very end of training. Use that to write the very last set of training and
         validation metrics from the StoringLogger to disk.
         """
-        self.read_epoch_results_from_logger_and_store(epoch=self.current_epoch)
+        self.read_epoch_results_from_logger_and_store(epoch=self.current_epoch-1)
 
     @rank_zero_only
     def read_epoch_results_from_logger_and_store(self, epoch: int) -> None:
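The `- 1` compensates for a Lightning counter change: by the time `on_train_end` fires in PL 1.6, `current_epoch` has already advanced past the last trained epoch. An illustrative sketch (the callback is hypothetical):

```python
import pytorch_lightning as pl

class LastEpochReporter(pl.Callback):
    def on_train_end(self, trainer: pl.Trainer, pl_module: pl.LightningModule) -> None:
        # In PL 1.6, after fit() completes max_epochs=N, current_epoch reads N,
        # while the last epoch that actually ran has index N - 1.
        print(f"Last trained epoch index: {trainer.current_epoch - 1}")
```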
11 changes: 5 additions & 6 deletions InnerEye/ML/model_training.py
@@ -120,7 +120,7 @@ def create_lightning_trainer(container: LightningContainer,
                                                save_top_k=0)
     recovery_checkpoint_callback = ModelCheckpoint(dirpath=str(container.checkpoint_folder),
                                                    filename=AUTOSAVE_CHECKPOINT_FILE_NAME,
-                                                   every_n_val_epochs=container.autosave_every_n_val_epochs,
+                                                   every_n_epochs=container.autosave_every_n_val_epochs,
                                                    save_last=False)
     callbacks: List[Callback] = [
         last_checkpoint_callback,
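The only change in this hunk is the argument rename: PyTorch Lightning replaced `every_n_val_epochs` with `every_n_epochs`, and the old name is gone by the 1.6 releases. A sketch with illustrative paths and values:

```python
from pytorch_lightning.callbacks import ModelCheckpoint

recovery_checkpoint = ModelCheckpoint(
    dirpath="outputs/checkpoints",  # illustrative, not the repo's real folder
    filename="autosave",
    every_n_epochs=1,               # formerly every_n_val_epochs
    save_last=False,
)
```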
@@ -264,11 +264,10 @@ def model_train(checkpoint_path: Optional[Path],
     lightning_model.storing_logger = storing_logger
 
     logging.info("Starting training")
-    # When training models that are not built-in InnerEye models, we have no guarantee that they write
-    # files to the right folder. Best guess is to change the current working directory to where files should go.
-    with change_working_directory(container.outputs_folder):
-        trainer.fit(lightning_model, datamodule=data_module)
-        trainer.logger.close()  # type: ignore
+
+    trainer.fit(lightning_model, datamodule=data_module)
+    trainer.logger.close()  # type: ignore
+
     world_size = getattr(trainer, "world_size", 0)
     is_azureml_run = not is_offline_run_context(RUN_CONTEXT)
     # Per-subject model outputs for regression models are written per rank, and need to be aggregated here.
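For reference, the dropped wrapper is the `change_working_directory` context manager whose tail is visible in the common_util.py diff above: it switches into the given directory and restores the old one on exit. A sketch of the usage that this commit removes (the path is hypothetical):

```python
from InnerEye.Common.common_util import change_working_directory

# Before this commit, training ran inside this context manager so that files
# written relative to the cwd landed in the outputs folder:
with change_working_directory("outputs"):
    ...  # trainer.fit(...) used to run here
```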
52 changes: 29 additions & 23 deletions InnerEye/ML/run_ml.py
@@ -15,33 +15,37 @@
 import torch.multiprocessing
 from azureml._restclient.constants import RunStatus
 from azureml.core import Model, Run, model
+from health_azure import AzureRunInfo
+from health_azure.utils import ENVIRONMENT_VERSION, create_run_recovery_id, is_global_rank_zero
 from pytorch_lightning import LightningModule, seed_everything
 from pytorch_lightning.core.datamodule import LightningDataModule
 from torch.utils.data import DataLoader
 
 from InnerEye.Azure import azure_util
 from InnerEye.Azure.azure_config import AzureConfig
 from InnerEye.Azure.azure_runner import ENV_OMPI_COMM_WORLD_RANK, get_git_tags
-from InnerEye.Azure.azure_util import CROSS_VALIDATION_SPLIT_INDEX_TAG_KEY, DEFAULT_CROSS_VALIDATION_SPLIT_INDEX, \
-    EFFECTIVE_RANDOM_SEED_KEY_NAME, IS_ENSEMBLE_KEY_NAME, MODEL_ID_KEY_NAME, PARENT_RUN_CONTEXT, \
-    PARENT_RUN_ID_KEY_NAME, RUN_CONTEXT, RUN_RECOVERY_FROM_ID_KEY_NAME, RUN_RECOVERY_ID_KEY_NAME, \
-    get_all_environment_files, is_offline_run_context
+from InnerEye.Azure.azure_util import (
+    CROSS_VALIDATION_SPLIT_INDEX_TAG_KEY, DEFAULT_CROSS_VALIDATION_SPLIT_INDEX, EFFECTIVE_RANDOM_SEED_KEY_NAME,
+    IS_ENSEMBLE_KEY_NAME, MODEL_ID_KEY_NAME, PARENT_RUN_CONTEXT, PARENT_RUN_ID_KEY_NAME, RUN_CONTEXT,
+    RUN_RECOVERY_FROM_ID_KEY_NAME, RUN_RECOVERY_ID_KEY_NAME, get_all_environment_files, is_offline_run_context
+)
 from InnerEye.Common import fixed_paths
-from InnerEye.Common.common_util import (BASELINE_COMPARISONS_FOLDER, BASELINE_WILCOXON_RESULTS_FILE,
-                                         CROSSVAL_RESULTS_FOLDER, ENSEMBLE_SPLIT_NAME, FULL_METRICS_DATAFRAME_FILE,
-                                         METRICS_AGGREGATES_FILE, ModelProcessing,
-                                         OTHER_RUNS_SUBDIR_NAME, SCATTERPLOTS_SUBDIR_NAME, SUBJECT_METRICS_FILE_NAME,
-                                         change_working_directory, get_best_epoch_results_path, is_windows,
-                                         logging_section, print_exception, remove_file_or_directory)
+from InnerEye.Common.common_util import (
+    BASELINE_COMPARISONS_FOLDER, BASELINE_WILCOXON_RESULTS_FILE, CROSSVAL_RESULTS_FOLDER, ENSEMBLE_SPLIT_NAME,
+    FULL_METRICS_DATAFRAME_FILE, METRICS_AGGREGATES_FILE, OTHER_RUNS_SUBDIR_NAME, SCATTERPLOTS_SUBDIR_NAME,
+    SUBJECT_METRICS_FILE_NAME, ModelProcessing, change_working_directory, get_best_epoch_results_path,
+    is_windows, logging_section, merge_conda_files, print_exception, remove_file_or_directory
+)
 from InnerEye.Common.fixed_paths import INNEREYE_PACKAGE_NAME, PYTHON_ENVIRONMENT_NAME
 from InnerEye.Common.type_annotations import PathOrString
 from InnerEye.ML.baselines_util import compare_folders_and_run_outputs
-from InnerEye.ML.common import CHECKPOINT_FOLDER, EXTRA_RUN_SUBFOLDER, FINAL_ENSEMBLE_MODEL_FOLDER, \
-    FINAL_MODEL_FOLDER, \
-    ModelExecutionMode
+from InnerEye.ML.common import (
+    CHECKPOINT_FOLDER, EXTRA_RUN_SUBFOLDER, FINAL_ENSEMBLE_MODEL_FOLDER, FINAL_MODEL_FOLDER, ModelExecutionMode
+)
 from InnerEye.ML.config import SegmentationModelBase
-from InnerEye.ML.deep_learning_config import DeepLearningConfig, ModelCategory, MultiprocessingStartMethod, \
-    load_checkpoint
+from InnerEye.ML.deep_learning_config import (
+    DeepLearningConfig, ModelCategory, MultiprocessingStartMethod, load_checkpoint
+)
 from InnerEye.ML.lightning_base import InnerEyeContainer
 from InnerEye.ML.lightning_container import InnerEyeInference, LightningContainer
 from InnerEye.ML.lightning_loggers import StoringLogger
@@ -50,16 +54,16 @@
 from InnerEye.ML.model_inference_config import ModelInferenceConfig
 from InnerEye.ML.model_testing import model_test
 from InnerEye.ML.model_training import create_lightning_trainer, model_train
-from InnerEye.ML.reports.notebook_report import generate_classification_crossval_notebook, \
-    generate_classification_multilabel_notebook, generate_classification_notebook, generate_segmentation_notebook, \
-    get_ipynb_report_name, reports_folder
+from InnerEye.ML.reports.notebook_report import (
+    generate_classification_crossval_notebook, generate_classification_multilabel_notebook,
+    generate_classification_notebook, generate_segmentation_notebook, get_ipynb_report_name, reports_folder
+)
 from InnerEye.ML.scalar_config import ScalarModelBase
 from InnerEye.ML.utils.checkpoint_handling import CheckpointHandler, download_all_checkpoints_from_run
 from InnerEye.ML.visualizers import activation_maps
-from InnerEye.ML.visualizers.plot_cross_validation import \
+from InnerEye.ML.visualizers.plot_cross_validation import (
     get_config_and_results_for_offline_runs, plot_cross_validation_from_files
-from health_azure import AzureRunInfo
-from health_azure.utils import ENVIRONMENT_VERSION, create_run_recovery_id, is_global_rank_zero, merge_conda_files
+)
 
 ModelDeploymentHookSignature = Callable[[LightningContainer, AzureConfig, Model, ModelProcessing], Any]
 PostCrossValidationHookSignature = Callable[[ModelConfigBase, Path], None]
Expand Down Expand Up @@ -797,8 +801,10 @@ def create_ensemble_model_and_run_inference(self) -> None:
             remove_file_or_directory(other_runs_dir)
 
     def plot_cross_validation_and_upload_results(self) -> Path:
-        from InnerEye.ML.visualizers.plot_cross_validation import crossval_config_from_model_config, \
-            plot_cross_validation, unroll_aggregate_metrics
+        from InnerEye.ML.visualizers.plot_cross_validation import (
+            crossval_config_from_model_config, plot_cross_validation, unroll_aggregate_metrics
+        )
+
         # perform aggregation as cross val splits are now ready
         plot_crossval_config = crossval_config_from_model_config(self.innereye_config)
         plot_crossval_config.run_recovery_id = PARENT_RUN_CONTEXT.tags[RUN_RECOVERY_ID_KEY_NAME]
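The substantive effect of this import reshuffle: `merge_conda_files` is now consumed from the copy added to `common_util.py` above, rather than from hi-ml's `health_azure.utils`:

```python
# Before this commit:
# from health_azure.utils import merge_conda_files
# After:
from InnerEye.Common.common_util import merge_conda_files
```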
30 changes: 15 additions & 15 deletions InnerEye/ML/runner.py
@@ -24,36 +24,36 @@
 # in a submodule
 fixed_paths.add_submodules_to_path()
 
+import matplotlib
 from azureml._base_sdk_common import user_agent
 from azureml._restclient.constants import RunStatus
 from azureml.core import Run, ScriptRunConfig
 from health_azure import AzureRunInfo, submit_to_azure_if_needed
-from health_azure.utils import create_run_recovery_id, is_global_rank_zero, is_local_rank_zero, merge_conda_files, \
-    to_azure_friendly_string
-import matplotlib
+from health_azure.utils import create_run_recovery_id, is_global_rank_zero, is_local_rank_zero, to_azure_friendly_string
 
-from InnerEye.Azure.tensorboard_monitor import AMLTensorBoardMonitorConfig, monitor
 from InnerEye.Azure import azure_util
 from InnerEye.Azure.azure_config import AzureConfig, ParserResult, SourceConfig
-from InnerEye.Azure.azure_runner import (DEFAULT_DOCKER_BASE_IMAGE, create_dataset_configs, create_experiment_name,
-                                         create_runner_parser,
-                                         get_git_tags,
-                                         parse_args_and_add_yaml_variables,
-                                         parse_arguments, additional_run_tags,
-                                         set_environment_variables_for_multi_node)
-from InnerEye.Azure.azure_util import (RUN_CONTEXT, RUN_RECOVERY_ID_KEY_NAME, get_all_environment_files,
-                                       is_offline_run_context)
+from InnerEye.Azure.azure_runner import (
+    DEFAULT_DOCKER_BASE_IMAGE, additional_run_tags, create_dataset_configs,
+    create_experiment_name, create_runner_parser, get_git_tags,
+    parse_args_and_add_yaml_variables, parse_arguments, set_environment_variables_for_multi_node
+)
+from InnerEye.Azure.azure_util import (
+    RUN_CONTEXT, RUN_RECOVERY_ID_KEY_NAME, get_all_environment_files, is_offline_run_context
+)
 from InnerEye.Azure.run_pytest import download_pytest_result, run_pytest
-from InnerEye.Common.common_util import (FULL_METRICS_DATAFRAME_FILE, METRICS_AGGREGATES_FILE,
-                                         is_linux, logging_to_stdout)
+from InnerEye.Azure.tensorboard_monitor import AMLTensorBoardMonitorConfig, monitor
+from InnerEye.Common.common_util import (
+    FULL_METRICS_DATAFRAME_FILE, METRICS_AGGREGATES_FILE, is_linux, logging_to_stdout, merge_conda_files
+)
 from InnerEye.Common.generic_parsing import GenericConfig
 from InnerEye.ML.common import DATASET_CSV_FILE_NAME
 from InnerEye.ML.deep_learning_config import DeepLearningConfig
 from InnerEye.ML.lightning_base import InnerEyeContainer
+from InnerEye.ML.lightning_container import LightningContainer
 from InnerEye.ML.model_config_base import ModelConfigBase
 from InnerEye.ML.run_ml import MLRunner, ModelDeploymentHookSignature, PostCrossValidationHookSignature
 from InnerEye.ML.utils.config_loader import ModelConfigLoader
-from InnerEye.ML.lightning_container import LightningContainer
 
 # We change the current working directory before starting the actual training. However, this throws off starting
 # the child training threads because sys.argv[0] is a relative path when running in AzureML. Turn that into an absolute