Skip to content
This repository has been archived by the owner on Mar 21, 2024. It is now read-only.

Commit

Permalink
FastMRI dataset onboarding script and detailed examples (#444)
Browse files Browse the repository at this point in the history
Add necessary tooling and examples for running fastMRI reconstruction models.
- Script to create and run an Azure Data Factory to download the raw data, and place them into a storage account
- Detailed examples to run the VarNet model from the fastMRI github repo
- Ability to work with fixed mounting points for datasets
  • Loading branch information
ant0nsc committed May 19, 2021
1 parent 77e87ac commit 8bae42e
Show file tree
Hide file tree
Showing 29 changed files with 1,170 additions and 264 deletions.
1 change: 1 addition & 0 deletions .idea/InnerEye-DeepLearning.iml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 2 additions & 2 deletions .idea/runConfigurations/Template__Run_ML_on_AzureML.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

7 changes: 7 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,10 @@ with only minimum code changes required. See [the MD documentation](docs/bring_y
(`ScalarLoss.CustomClassification` and `CustomRegression`), prediction targets (`ScalarModelBase.target_names`),
and reporting (`ModelConfigBase.generate_custom_report()`) in scalar configs, providing more flexibility for defining
model configs with custom behaviour while leveraging the existing InnerEye workflows.
- ([#444](https://github.com/microsoft/InnerEye-DeepLearning/pull/444)) Added setup scripts and documentation to work
with the FastMRI challenge datasets.
- ([#444](https://github.com/microsoft/InnerEye-DeepLearning/pull/444)) Git-related information is now printed to the
console for easier diagnostics.
- ([#445](https://github.com/microsoft/InnerEye-DeepLearning/pull/445)) Adding test coverage for the `HelloContainer`
model with multiple GPUs.
- ([#450](https://github.com/microsoft/InnerEye-DeepLearning/pull/450)) Adds the metric "Accuracy at threshold 0.5" to the classification report (`classification_crossval_report.ipynb`).
Expand Down Expand Up @@ -98,6 +102,9 @@ with only minimum code changes required. See [the MD documentation](docs/bring_y
named `recovery_epoch=x.ckpt` instead of `recovery.ckpt` or `recovery-v0.ckpt`.
- ([#451](https://github.com/microsoft/InnerEye-DeepLearning/pull/451)) Change the signature for function `generate_custom_report`
in `ModelConfigBase` to take only the path to the reports folder and a `ModelProcessing` object.
- ([#444](https://github.com/microsoft/InnerEye-DeepLearning/pull/444)) The method `before_training_on_rank_zero` of
the `LightningContainer` class has been renamed to `before_training_on_global_rank_zero`. The order in which the
hooks are called has been changed.

### Fixed

Expand Down
66 changes: 64 additions & 2 deletions InnerEye/Azure/azure_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,10 @@
from typing import Any, Callable, Dict, List, Optional, Union

import param
from azureml.core import Run, ScriptRunConfig, Workspace
from azureml.core import Dataset, Datastore, Run, ScriptRunConfig, Workspace
from azureml.core.authentication import InteractiveLoginAuthentication, ServicePrincipalAuthentication
from azureml.data import FileDataset
from azureml.data.dataset_consumption_config import DatasetConsumptionConfig
from azureml.train.hyperdrive import HyperDriveConfig
from git import Repo

Expand All @@ -25,6 +27,8 @@
# The name of the "azureml" property of AzureConfig
AZURECONFIG_SUBMIT_TO_AZUREML = "azureml"

INPUT_DATA_KEY = "input_data"


@dataclass(frozen=True)
class GitInformation:
Expand Down Expand Up @@ -159,7 +163,7 @@ def get_git_information(self) -> GitInformation:
# Is_dirty in the present settings ignores untracked files.
is_dirty = git_repo.is_dirty()
except:
logging.info("This folder does not seem to be a git repository.")
logging.debug("This folder does not seem to be a git repository.")
return GitInformation(
repository=repository,
branch=branch,
Expand Down Expand Up @@ -242,6 +246,64 @@ def fetch_run(self, run_recovery_id: str) -> Run:
"""
return fetch_run(workspace=self.get_workspace(), run_recovery_id=run_recovery_id)

def get_or_create_dataset(self, azure_dataset_id: str) -> FileDataset:
    """
    Looks in the AzureML datastore for a dataset of the given name. If there is no such dataset, a dataset is
    created and registered, assuming that the files are in a folder that has the same name as the dataset.
    For example, if azure_dataset_id is 'foo', then the 'foo' dataset should be pointing to the folder
    <container_root>/datasets/foo/

    :param azure_dataset_id: The name of the dataset to retrieve, and of the datastore folder to create it
        from if it does not yet exist.
    :return: The existing or newly registered AzureML FileDataset.
    :raises ValueError: If 'azureml_datastore' is not set on this config, or azure_dataset_id is empty.
    """
    if not self.azureml_datastore:
        raise ValueError("No value set for 'azureml_datastore' (name of the datastore in the AzureML workspace)")
    if not azure_dataset_id:
        raise ValueError("No dataset ID provided.")
    logging.info(f"Retrieving datastore '{self.azureml_datastore}' from AzureML workspace")
    workspace = self.get_workspace()
    datastore = Datastore.get(workspace, self.azureml_datastore)
    try:
        logging.info(f"Trying to retrieve AzureML Dataset '{azure_dataset_id}'")
        azureml_dataset = Dataset.get_by_name(workspace, name=azure_dataset_id)
        logging.info("Dataset found.")
    # Catch Exception rather than a bare 'except:', which would also swallow
    # KeyboardInterrupt/SystemExit. Any failure to retrieve is treated as "dataset missing".
    except Exception:
        logging.info(f"Dataset does not yet exist, creating a new one from data in folder '{azure_dataset_id}'")
        # Ensure that there is a / at the end of the file path, otherwise folders that share a prefix could create
        # trouble (for example, folders foo and foo_bar exist, and I'm trying to create a dataset from "foo")
        azureml_dataset = Dataset.File.from_files(path=(datastore, azure_dataset_id + "/"))
        logging.info("Registering the dataset for future use.")
        azureml_dataset.register(workspace, name=azure_dataset_id)
    return azureml_dataset

def get_dataset_consumption(self,
                            azure_dataset_id: str,
                            dataset_index: int,
                            mountpoint: str) -> DatasetConsumptionConfig:
    """
    Creates a configuration for using an AzureML dataset inside of an AzureML run. This will make the AzureML
    dataset with given name available as a named input, using INPUT_DATA_KEY as the key.

    :param azure_dataset_id: The name of the dataset in blob storage to be used for this run. This can be an empty
        string to not use any datasets.
    :param dataset_index: suffix for the dataset name, dataset name will be set to INPUT_DATA_KEY_idx
    :param mountpoint: The path at which the dataset should be made available.
    """
    dataset = self.get_or_create_dataset(azure_dataset_id=azure_dataset_id)
    if not dataset:
        raise ValueError(f"AzureML dataset {azure_dataset_id} could not be found or created.")
    named_input = dataset.as_named_input(f"{INPUT_DATA_KEY}_{dataset_index}")
    # An empty mountpoint means "let AzureML pick the location on the compute target".
    target_path = mountpoint or None
    if self.use_dataset_mount:
        access_mode = "mounted at "
        consumption = named_input.as_mount(target_path)
    else:
        access_mode = "downloaded to "
        consumption = named_input.as_download(target_path)
    location = f"{target_path}." if target_path else "a randomly chosen folder."
    logging.info(f"Dataset {azure_dataset_id} (index {dataset_index}) will be " + access_mode + location)
    return consumption


@dataclass
class SourceConfig:
Expand Down
Loading

0 comments on commit 8bae42e

Please sign in to comment.