Skip to content
This repository has been archived by the owner on Mar 21, 2024. It is now read-only.

Upgrade to Pytorch 1.8 #411

Merged
merged 27 commits into from
Mar 15, 2021
Merged
Changes from 1 commit
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
e7d098d
package upgrade
ant0nsc Mar 6, 2021
470551e
upgrade torchprof
ant0nsc Mar 7, 2021
83311fd
removing a test that now fails because AML is behind
ant0nsc Mar 7, 2021
52a3882
forking torchprof for now
ant0nsc Mar 8, 2021
eb8cf34
adding bias term
ant0nsc Mar 8, 2021
6a03041
updated pytorch lightning
ant0nsc Mar 8, 2021
676032d
updated metrics location
ant0nsc Mar 8, 2021
3c7abbd
avoiding the legacy use_ddp flag
ant0nsc Mar 8, 2021
22c6deb
trying to fix
ant0nsc Mar 8, 2021
8749465
update pillow to avoid component governance
ant0nsc Mar 9, 2021
14c4b9d
fix metrics problems
ant0nsc Mar 10, 2021
6b5584a
switch to new torchprof
ant0nsc Mar 10, 2021
a69dfd6
mypy
ant0nsc Mar 10, 2021
9e7b58a
exclude time from metrics for scalar models
ant0nsc Mar 10, 2021
e95a1ac
fix tolerance issues
ant0nsc Mar 10, 2021
63e200b
project file
ant0nsc Mar 10, 2021
30cba02
cleanup
ant0nsc Mar 10, 2021
54e5055
test fixes
ant0nsc Mar 10, 2021
68f76d4
CHANGELOG.md
ant0nsc Mar 10, 2021
34ee256
test fixes
ant0nsc Mar 10, 2021
45be974
test fixes
ant0nsc Mar 10, 2021
c0c0020
Merge branch 'main' into antonsc/pytorch18
ant0nsc Mar 11, 2021
32c144f
Merge remote-tracking branch 'origin/main' into antonsc/pytorch18
ant0nsc Mar 12, 2021
97756d2
downgrade PL to 1.1.8
ant0nsc Mar 12, 2021
f74c3c4
Merge branch 'antonsc/pytorch18' of https://github.com/microsoft/Inne…
ant0nsc Mar 12, 2021
bee0272
Merge remote-tracking branch 'origin/main' into antonsc/pytorch18
ant0nsc Mar 12, 2021
78d59fe
PR comments
ant0nsc Mar 12, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
fix tolerance issues
  • Loading branch information
ant0nsc committed Mar 10, 2021
commit e95a1ac0a2348311f6611e2279e2b7b669e8d4c0
21 changes: 14 additions & 7 deletions Tests/ML/test_model_training.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@
from InnerEye.ML.utils.training_util import ModelTrainingResults
from InnerEye.ML.visualizers.patch_sampling import PATCH_SAMPLING_FOLDER
from Tests.ML.configs.DummyModel import DummyModel
from Tests.ML.util import get_default_checkpoint_handler
from Tests.ML.util import get_default_checkpoint_handler, machine_has_gpu

config_path = full_ml_test_data_path()
base_path = full_ml_test_data_path()
Expand Down Expand Up @@ -101,8 +101,12 @@ def _mean_list(lists: List[List[float]]) -> List[float]:
train_config.store_dataset_sample = True
train_config.recovery_checkpoint_save_interval = 1

expected_train_losses = [0.4553468, 0.454904]
expected_val_losses = [0.4553881, 0.4553041]
if machine_has_gpu:
expected_train_losses = [0.4553468, 0.454904]
expected_val_losses = [0.4553881, 0.4553041]
else:
expected_train_losses = [0.4553469, 0.4548947]
expected_val_losses = [0.4553880, 0.4553041]
loss_absolute_tolerance = 1e-6
expected_learning_rates = [train_config.l_rate, 5.3589e-4]

Expand Down Expand Up @@ -147,17 +151,20 @@ def assert_all_close(metric: str, expected: List[float], **kwargs: Any) -> None:
tracked_metric = TrackedMetrics.Val_Loss.value[len(VALIDATION_PREFIX):]
for val_result in model_training_result.val_results_per_epoch:
assert tracked_metric in val_result
# The following values are read off directly from the results of compute_dice_across_patches in the training loop

# The following values are read off directly from the results of compute_dice_across_patches in the
# training loop. Results are slightly different for CPU, hence use a larger tolerance there.
dice_tolerance = 1e-4 if machine_has_gpu else 3e-4
train_dice_region = [[0.0, 0.0, 4.0282e-04], [0.0309, 0.0334, 0.0961]]
train_dice_region1 = [[0.4806, 0.4800, 0.4832], [0.4812, 0.4842, 0.4663]]
# There appears to be some amount of non-determinism here: when using a tolerance of 1e-4, we get occasional
# test failures on Linux in the cloud (not on Windows, not on AzureML). It is unclear where this comes from. Even when
# failing here, the losses match up to the expected tolerance.
assert_all_close("Dice/region", _mean_list(train_dice_region), atol=1e-4)
assert_all_close("Dice/region_1", _mean_list(train_dice_region1), atol=1e-4)
assert_all_close("Dice/region", _mean_list(train_dice_region), atol=dice_tolerance)
assert_all_close("Dice/region_1", _mean_list(train_dice_region1), atol=dice_tolerance)
expected_average_dice = [_mean(train_dice_region[i] + train_dice_region1[i]) # type: ignore
for i in range(len(train_dice_region))]
assert_all_close("Dice/AverageAcrossStructures", expected_average_dice, atol=1e-4)
assert_all_close("Dice/AverageAcrossStructures", expected_average_dice, atol=dice_tolerance)

# check output files/directories
assert train_config.outputs_folder.is_dir()
Expand Down