Skip to content
This repository has been archived by the owner on Mar 21, 2024. It is now read-only.

Commit

Permalink
FastMRI dataset onboarding script and detailed examples (#444)
Browse files Browse the repository at this point in the history
Add necessary tooling and examples for running fastMRI reconstruction models.
- Script to create and run an Azure Data Factory to download the raw data, and place them into a storage account
- Detailed examples to run the VarNet model from the fastMRI github repo
- Ability to work with fixed mounting points for datasets
  • Loading branch information
ant0nsc committed May 19, 2021
1 parent 77e87ac commit 8bae42e
Show file tree
Hide file tree
Showing 29 changed files with 1,170 additions and 264 deletions.
1 change: 1 addition & 0 deletions .idea/InnerEye-DeepLearning.iml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 2 additions & 2 deletions .idea/runConfigurations/Template__Run_ML_on_AzureML.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

7 changes: 7 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,10 @@ with only minimum code changes required. See [the MD documentation](docs/bring_y
(`ScalarLoss.CustomClassification` and `CustomRegression`), prediction targets (`ScalarModelBase.target_names`),
and reporting (`ModelConfigBase.generate_custom_report()`) in scalar configs, providing more flexibility for defining
model configs with custom behaviour while leveraging the existing InnerEye workflows.
- ([#444](https://github.com/microsoft/InnerEye-DeepLearning/pull/444)) Added setup scripts and documentation to work
with the FastMRI challenge datasets.
- ([#444](https://github.com/microsoft/InnerEye-DeepLearning/pull/444)) Git-related information is now printed to the
console for easier diagnostics.
- ([#445](https://github.com/microsoft/InnerEye-DeepLearning/pull/445)) Adding test coverage for the `HelloContainer`
model with multiple GPUs.
- ([#450](https://github.com/microsoft/InnerEye-DeepLearning/pull/450)) Adds the metric "Accuracy at threshold 0.5" to the classification report (`classification_crossval_report.ipynb`).
Expand Down Expand Up @@ -98,6 +102,9 @@ with only minimum code changes required. See [the MD documentation](docs/bring_y
named `recovery_epoch=x.ckpt` instead of `recovery.ckpt` or `recovery-v0.ckpt`.
- ([#451](https://github.com/microsoft/InnerEye-DeepLearning/pull/451)) Change the signature for function `generate_custom_report`
in `ModelConfigBase` to take only the path to the reports folder and a `ModelProcessing` object.
- ([#444](https://github.com/microsoft/InnerEye-DeepLearning/pull/444)) The method `before_training_on_rank_zero` of
the `LightningContainer` class has been renamed to `before_training_on_global_rank_zero`. The order in which the
hooks are called has been changed.

### Fixed

Expand Down
66 changes: 64 additions & 2 deletions InnerEye/Azure/azure_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,10 @@
from typing import Any, Callable, Dict, List, Optional, Union

import param
from azureml.core import Run, ScriptRunConfig, Workspace
from azureml.core import Dataset, Datastore, Run, ScriptRunConfig, Workspace
from azureml.core.authentication import InteractiveLoginAuthentication, ServicePrincipalAuthentication
from azureml.data import FileDataset
from azureml.data.dataset_consumption_config import DatasetConsumptionConfig
from azureml.train.hyperdrive import HyperDriveConfig
from git import Repo

Expand All @@ -25,6 +27,8 @@
# The name of the "azureml" property of AzureConfig
AZURECONFIG_SUBMIT_TO_AZUREML = "azureml"

INPUT_DATA_KEY = "input_data"


@dataclass(frozen=True)
class GitInformation:
Expand Down Expand Up @@ -159,7 +163,7 @@ def get_git_information(self) -> GitInformation:
# Is_dirty in the present settings ignores untracked files.
is_dirty = git_repo.is_dirty()
except:
logging.info("This folder does not seem to be a git repository.")
logging.debug("This folder does not seem to be a git repository.")
return GitInformation(
repository=repository,
branch=branch,
Expand Down Expand Up @@ -242,6 +246,64 @@ def fetch_run(self, run_recovery_id: str) -> Run:
"""
return fetch_run(workspace=self.get_workspace(), run_recovery_id=run_recovery_id)

def get_or_create_dataset(self, azure_dataset_id: str) -> FileDataset:
    """
    Looks in the AzureML datastore for a dataset of the given name. If there is no such dataset, a dataset is
    created and registered, assuming that the files are in a folder that has the same name as the dataset.
    For example, if azure_dataset_id is 'foo', then the 'foo' dataset should be pointing to the folder
    <container_root>/datasets/foo/

    :param azure_dataset_id: The name of the dataset to retrieve, and of the datastore folder to create it
        from if it does not yet exist.
    :return: The existing or newly registered AzureML FileDataset.
    :raises ValueError: If 'azureml_datastore' is not set on this config, or azure_dataset_id is empty.
    """
    if not self.azureml_datastore:
        raise ValueError("No value set for 'azureml_datastore' (name of the datastore in the AzureML workspace)")
    if not azure_dataset_id:
        raise ValueError("No dataset ID provided.")
    logging.info(f"Retrieving datastore '{self.azureml_datastore}' from AzureML workspace")
    workspace = self.get_workspace()
    datastore = Datastore.get(workspace, self.azureml_datastore)
    try:
        logging.info(f"Trying to retrieve AzureML Dataset '{azure_dataset_id}'")
        azureml_dataset = Dataset.get_by_name(workspace, name=azure_dataset_id)
        logging.info("Dataset found.")
    # Catch Exception rather than a bare 'except:', which would also swallow
    # KeyboardInterrupt/SystemExit. Any failure to retrieve is treated as "dataset missing".
    except Exception:
        logging.info(f"Dataset does not yet exist, creating a new one from data in folder '{azure_dataset_id}'")
        # Ensure that there is a / at the end of the file path, otherwise folders that share a prefix could create
        # trouble (for example, folders foo and foo_bar exist, and I'm trying to create a dataset from "foo")
        azureml_dataset = Dataset.File.from_files(path=(datastore, azure_dataset_id + "/"))
        logging.info("Registering the dataset for future use.")
        azureml_dataset.register(workspace, name=azure_dataset_id)
    return azureml_dataset

def get_dataset_consumption(self,
                            azure_dataset_id: str,
                            dataset_index: int,
                            mountpoint: str) -> DatasetConsumptionConfig:
    """
    Creates a configuration for using an AzureML dataset inside of an AzureML run. This will make the AzureML
    dataset with given name available as a named input, using INPUT_DATA_KEY as the key.

    :param azure_dataset_id: The name of the dataset in blob storage to be used for this run. This can be an empty
        string to not use any datasets.
    :param dataset_index: suffix for the dataset name, dataset name will be set to INPUT_DATA_KEY_idx
    :param mountpoint: The path at which the dataset should be made available.
    """
    dataset = self.get_or_create_dataset(azure_dataset_id=azure_dataset_id)
    if not dataset:
        raise ValueError(f"AzureML dataset {azure_dataset_id} could not be found or created.")
    named_input = dataset.as_named_input(f"{INPUT_DATA_KEY}_{dataset_index}")
    # An empty mountpoint means "let AzureML pick the location on the compute target".
    target_path = mountpoint or None
    if self.use_dataset_mount:
        access_mode = "mounted at "
        consumption = named_input.as_mount(target_path)
    else:
        access_mode = "downloaded to "
        consumption = named_input.as_download(target_path)
    location = f"{target_path}." if target_path else "a randomly chosen folder."
    logging.info(f"Dataset {azure_dataset_id} (index {dataset_index}) will be " + access_mode + location)
    return consumption


@dataclass
class SourceConfig:
Expand Down
Loading

0 comments on commit 8bae42e

Please sign in to comment.