This repository has been archived by the owner on Mar 21, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 142
/
test_download_upload.py
263 lines (230 loc) · 13.4 KB
/
test_download_upload.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
# ------------------------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License (MIT). See LICENSE in the repo root for license information.
# ------------------------------------------------------------------------------------------
import logging
import shutil
from pathlib import Path
from typing import Any, List, Optional
from unittest import mock
import pytest
from InnerEye.Azure.azure_config import AzureConfig
from InnerEye.Common import fixed_paths
from InnerEye.Common.common_util import OTHER_RUNS_SUBDIR_NAME, logging_section, logging_to_stdout
from InnerEye.Common.fixed_paths_for_tests import full_ml_test_data_path
from InnerEye.Common.output_directories import OutputFolderForTests
from InnerEye.ML.common import DATASET_CSV_FILE_NAME
from InnerEye.ML.deep_learning_config import DeepLearningConfig
from InnerEye.ML.lightning_container import LightningContainer
from InnerEye.ML.model_config_base import ModelConfigBase
from InnerEye.ML.run_ml import MLRunner
from InnerEye.ML.utils.run_recovery import RunRecovery
from Tests.AfterTraining.test_after_training import FALLBACK_ENSEMBLE_RUN, FALLBACK_SINGLE_RUN, get_most_recent_run
from Tests.ML.configs.DummyModel import DummyModel
from Tests.ML.configs.lightning_test_containers import DummyContainerWithDatasets
from Tests.ML.util import get_default_azure_config
from health_azure import AzureRunInfo
logging_to_stdout(logging.DEBUG)
@pytest.fixture
def runner_config() -> AzureConfig:
    """
    Provides an Azure config with the model name cleared and training switched off,
    so that tests cannot accidentally trigger a training run or overwrite datasets.
    """
    azure_config = get_default_azure_config()
    azure_config.model = ""
    azure_config.train = False
    return azure_config
def check_single_checkpoint(downloaded_checkpoints: List[Path]) -> None:
    """
    Asserts that exactly one checkpoint was downloaded and that it exists on disk.

    :param downloaded_checkpoints: The list of checkpoint file paths to validate.
    """
    # Include the offending values in the assertion messages so that a failure in CI
    # is diagnosable without re-running the download.
    assert len(downloaded_checkpoints) == 1, \
        f"Expected exactly 1 checkpoint, got {len(downloaded_checkpoints)}: {downloaded_checkpoints}"
    assert downloaded_checkpoints[0].is_file(), \
        f"Checkpoint is not a file on disk: {downloaded_checkpoints[0]}"
@pytest.mark.after_training_single_run
def test_download_recovery_single_run(test_output_dirs: OutputFolderForTests,
                                      runner_config: AzureConfig) -> None:
    """
    Downloads all checkpoints of the most recent single (non-ensemble) run, and checks
    that exactly one recovery checkpoint and one best checkpoint exist on disk.
    """
    model_config = ModelConfigBase(should_validate=False)
    model_config.set_output_to(test_output_dirs.root_dir)
    most_recent_run = get_most_recent_run(fallback_run_id_for_local_execution=FALLBACK_SINGLE_RUN)
    recovery = RunRecovery.download_all_checkpoints_from_run(model_config, most_recent_run)
    # This call fails if there is no recovery checkpoint at all.
    check_single_checkpoint(recovery.get_recovery_checkpoint_paths())
    check_single_checkpoint(recovery.get_best_checkpoint_paths())
@pytest.mark.after_training_ensemble_run
def test_download_best_checkpoints_ensemble_run(test_output_dirs: OutputFolderForTests,
                                                runner_config: AzureConfig) -> None:
    """
    Downloads the best checkpoints from the children of the most recent ensemble run, and
    checks that a per-child folder structure was created and every checkpoint file exists.
    """
    model_config = ModelConfigBase(should_validate=False)
    model_config.set_output_to(test_output_dirs.root_dir)
    ensemble_run = get_most_recent_run(fallback_run_id_for_local_execution=FALLBACK_ENSEMBLE_RUN)
    recovery = RunRecovery.download_best_checkpoints_from_child_runs(model_config, ensemble_run)
    other_runs_folder = model_config.checkpoint_folder / OTHER_RUNS_SUBDIR_NAME
    assert other_runs_folder.is_dir()
    for child_index in ["0", "1"]:
        assert (other_runs_folder / child_index).is_dir(), "Child run folder does not exist"
    for checkpoint in recovery.get_best_checkpoint_paths():
        assert checkpoint.is_file(), f"File {checkpoint} does not exist"
def test_download_azureml_dataset(test_output_dirs: OutputFolderForTests) -> None:
    """
    Tests the MLRunner dataset handling: setup must fail when neither a local nor an Azure
    dataset is given, fail for a non-existing local folder, pass an existing local folder
    through unchanged, and download an AzureML dataset when only azure_dataset_id is set.

    :param test_output_dirs: Test fixture providing a fresh output folder.
    """
    dataset_name = "test-dataset"
    config = DummyModel()
    config.local_dataset = None
    config.azure_dataset_id = ""
    azure_config = get_default_azure_config()
    runner = MLRunner(config, azure_config=azure_config)
    # If the model has neither local_dataset nor azure_dataset_id, mount_or_download_dataset should fail.
    # This mounting call must happen before any other operations on the container, because already the model
    # creation may need access to the dataset.
    with pytest.raises(ValueError) as ex:
        runner.setup()
    assert ex.value.args[0] == "Expecting that a dataset is available here."
    runner.project_root = test_output_dirs.root_dir
    # Pointing the model to a dataset folder that does not exist should raise an Exception
    fake_folder = runner.project_root / "foo"
    runner.container.local_dataset = fake_folder
    with pytest.raises(FileNotFoundError):
        runner.download_or_use_existing_dataset(runner.container.azure_dataset_id, runner.container.local_dataset)
    # If the local dataset folder exists, mount_or_download_dataset should not do anything.
    fake_folder.mkdir()
    local_dataset = runner.download_or_use_existing_dataset(runner.container.azure_dataset_id,
                                                            runner.container.local_dataset)
    assert local_dataset == fake_folder
    # Pointing the model to a dataset in Azure should trigger a download
    runner.container.local_dataset = None
    runner.container.azure_dataset_id = dataset_name
    with logging_section("Starting download"):
        result_path = runner.download_or_use_existing_dataset(runner.container.azure_dataset_id,
                                                              runner.container.local_dataset)
    # Download goes into <project_root> / "datasets" / <dataset_name>
    expected_path = runner.project_root / fixed_paths.DATASETS_DIR_NAME / dataset_name
    assert result_path == expected_path
    assert result_path.is_dir()
    dataset_csv = Path(result_path) / DATASET_CSV_FILE_NAME
    assert dataset_csv.is_file()
    # Check that each individual file in the dataset is present
    for folder in [1, 10]:
        sub_folder = result_path / str(folder)
        # Bug fix: the original called is_dir() without asserting on the result, so a
        # missing subject folder would have gone unnoticed.
        assert sub_folder.is_dir()
        for file in ["esophagus", "heart", "lung_l", "lung_r", "spinalcord"]:
            f = (sub_folder / file).with_suffix(".nii.gz")
            assert f.is_file()
def _test_mount_for_lightning_container(test_output_dirs: OutputFolderForTests,
                                        is_offline_run: bool,
                                        local_dataset: Optional[Path],
                                        azure_dataset: str,
                                        is_lightning_model: bool) -> LightningContainer:
    """
    Creates either a Lightning container or a legacy InnerEye model config with the given
    dataset settings, runs MLRunner.setup with the dataset download mocked out, and returns
    the runner's container so that callers can inspect which local dataset path was chosen.

    :param test_output_dirs: Test fixture providing a fresh output folder.
    :param is_offline_run: The value that MLRunner.is_offline_run is mocked to return
        (True simulates running outside AzureML).
    :param local_dataset: The local dataset path to set on the model/container, or None.
    :param azure_dataset: The azure_dataset_id to set on the model/container (may be empty).
    :param is_lightning_model: If True, use DummyContainerWithDatasets (Lightning container);
        otherwise use the legacy DummyModel config.
    :return: The container of the MLRunner after setup has completed.
    """
    config: Optional[DeepLearningConfig] = None
    container: Optional[LightningContainer] = None
    if is_lightning_model:
        container = DummyContainerWithDatasets()
        container.azure_dataset_id = azure_dataset
        container.local_dataset = local_dataset
    else:
        config = DummyModel()
        config.azure_dataset_id = azure_dataset
        config.local_dataset = local_dataset
    # The legacy InnerEye models require an existing dataset_csv file present in the dataset folder. Create that.
    download_path = test_output_dirs.root_dir / "downloaded"
    mount_path = test_output_dirs.root_dir / "mounted"
    if not is_lightning_model:
        train_and_test_data = "train_and_test_data"
        for path in [download_path, mount_path, test_output_dirs.root_dir]:
            # If destination folder exists, delete content to ensure consistency and avoid 'FileExistsError'
            if (path / train_and_test_data).is_dir():
                shutil.rmtree(path / train_and_test_data)
            # Creates directory structure and copy data
            shutil.copytree(full_ml_test_data_path(train_and_test_data), path / train_and_test_data)
            # Copy 'data.csv' file
            shutil.copy(full_ml_test_data_path(DATASET_CSV_FILE_NAME), path / DATASET_CSV_FILE_NAME)
    # Mock out the Azure interaction: is_offline_run selects the code path taken inside setup,
    # and download_dataset is replaced so that "downloading" returns the folder populated above.
    with mock.patch("InnerEye.ML.run_ml.MLRunner.is_offline_run", is_offline_run):
        with mock.patch("InnerEye.ML.run_ml.download_dataset", return_value=download_path):
            runner = MLRunner(config, container=container,
                              azure_config=None, project_root=test_output_dirs.root_dir)
            # When simulating a run inside AzureML, the mounted dataset path is handed in
            # via AzureRunInfo.input_datasets; offline runs get None there.
            path_from_aml: List[Optional[Path]] = [None] if is_offline_run else [mount_path]
            runner.setup(azure_run_info=AzureRunInfo(input_datasets=path_from_aml,
                                                     output_datasets=[],
                                                     run=None,
                                                     is_running_in_azure_ml=False,
                                                     output_folder=Path(),
                                                     logs_folder=Path(),
                                                     mount_contexts=[]
                                                     ))
    return runner.container
@pytest.mark.parametrize(("is_lightning_model", "expected_error"),
                         [
                             # A built-in InnerEye model must have either local dataset or azure dataset provided.
                             (False, "Expecting that a dataset is available here."),
                             # ... but this is OK for Lightning container models. A Lightning container could simply
                             # download its data from the web before training.
                             (True, "")
                         ])
def test_mount_failing_offline_runs(test_output_dirs: OutputFolderForTests,
                                    is_lightning_model: bool,
                                    expected_error: str) -> None:
    """
    Test cases when MLRunner.mount_or_download_dataset raises an exception, when running outside AzureML.
    """

    def run() -> Any:
        # Run setup with neither a local nor an Azure dataset configured.
        return _test_mount_for_lightning_container(test_output_dirs=test_output_dirs,
                                                   is_offline_run=True,
                                                   local_dataset=None,
                                                   azure_dataset="",
                                                   is_lightning_model=is_lightning_model)

    if expected_error:
        with pytest.raises(ValueError) as ex:
            run()
        # Fix: assert against str(ex.value), the exception message itself. The original used
        # str(ex), which on pytest's ExceptionInfo also includes file/line information; pytest
        # documentation recommends ex.value for message assertions.
        assert expected_error in str(ex.value)
    else:
        assert run().local_dataset is None
def test_mount_in_azureml1(test_output_dirs: OutputFolderForTests) -> None:
    """
    When running inside AzureML with an Azure dataset and a legacy InnerEye model, the
    mounted dataset path should become the container's local dataset.
    """
    result_container = _test_mount_for_lightning_container(test_output_dirs=test_output_dirs,
                                                           is_offline_run=False,
                                                           local_dataset=None,
                                                           azure_dataset="foo",
                                                           is_lightning_model=False)
    assert "mounted" in str(result_container.local_dataset)
def test_mount_in_azureml2(test_output_dirs: OutputFolderForTests) -> None:
    """
    When running inside AzureML with a Lightning container that has no dataset configured
    at all, setup should succeed and leave the local dataset unset.
    """
    result_container = _test_mount_for_lightning_container(test_output_dirs=test_output_dirs,
                                                           is_offline_run=False,
                                                           local_dataset=None,
                                                           azure_dataset="",
                                                           is_lightning_model=True)
    assert result_container.local_dataset is None
def test_mount_or_download(test_output_dirs: OutputFolderForTests) -> None:
    """
    Tests the different combinations of local and Azure datasets, with Innereye built-in and container models.
    """
    root = test_output_dirs.root_dir
    for is_lightning_model in [True, False]:

        def assert_legacy_config_in_sync(container: LightningContainer) -> None:
            # For all InnerEye built-in models, the paths from container level need to be
            # copied down to the legacy config level.
            if not is_lightning_model:
                assert container.config.local_dataset == container.local_dataset

        # With runs outside of AzureML, an AML dataset should get downloaded.
        container = _test_mount_for_lightning_container(test_output_dirs=test_output_dirs,
                                                        is_offline_run=True,
                                                        local_dataset=None,
                                                        azure_dataset="foo",
                                                        is_lightning_model=is_lightning_model)
        assert "downloaded" in str(container.local_dataset)
        assert_legacy_config_in_sync(container)
        # With runs in AzureML, an AML dataset should get mounted.
        container = _test_mount_for_lightning_container(test_output_dirs=test_output_dirs,
                                                        is_offline_run=False,
                                                        local_dataset=None,
                                                        azure_dataset="foo",
                                                        is_lightning_model=is_lightning_model)
        assert "mounted" in str(container.local_dataset)
        assert_legacy_config_in_sync(container)
        # An explicitly provided local dataset should be used as-is, no download or mount.
        container = _test_mount_for_lightning_container(test_output_dirs=test_output_dirs,
                                                        is_offline_run=True,
                                                        local_dataset=root,
                                                        azure_dataset="",
                                                        is_lightning_model=is_lightning_model)
        assert container.local_dataset == root
        assert_legacy_config_in_sync(container)