From 3a678ec84ca6717b666c7418eec9db692be4b984 Mon Sep 17 00:00:00 2001 From: woshiyyya Date: Thu, 6 Apr 2023 16:36:28 -0700 Subject: [PATCH 01/20] init Signed-off-by: woshiyyya --- .buildkite/pipeline.build.yml | 12 ++++++++++++ python/ray/train/BUILD | 8 ++++---- .../requirements/compat/requirements_py38_compat.txt | 10 ++++++++++ 3 files changed, 26 insertions(+), 4 deletions(-) create mode 100644 python/requirements/compat/requirements_py38_compat.txt diff --git a/.buildkite/pipeline.build.yml b/.buildkite/pipeline.build.yml index 387df5754ec5e..05391b23118ff 100644 --- a/.buildkite/pipeline.build.yml +++ b/.buildkite/pipeline.build.yml @@ -625,6 +625,18 @@ - python ./ci/env/check_minimal_install.py - bazel test --config=ci $(./ci/run/bazel_export_options) --build_tests_only --test_tag_filters=minimal python/ray/train/... +- label: ":cold_face: :python: Ray Python 3.8 ML compatibility tests" + conditions: + ["ALWAYS", "RAY_CI_TUNE_AFFECTED", "RAY_CI_TRAIN_AFFECTED", "RAY_CI_ML_AFFECTED", ] + instance_size: large + commands: + - cleanup() { if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then ./ci/build/upload_build_info.sh; fi }; trap cleanup EXIT + - ./ci/env/install-minimal.sh 3.8 + - pip install -r python/requirements/compat/requirements_py38_compat.txt + - pip install -U typing-extensions + - ./ci/env/env_info.sh + - bazel test --config=ci $(./scripts/bazel_export_options) --test_tag_filters=compat_py38 + python/ray/train/... - label: ":cold_face: :python: Ray Python 3.6 ML compatibility tests" conditions: diff --git a/python/ray/train/BUILD b/python/ray/train/BUILD index e2a244455ace3..a52ab574c57de 100644 --- a/python/ray/train/BUILD +++ b/python/ray/train/BUILD @@ -412,7 +412,7 @@ py_test( name = "test_lightning_checkpoint", size = "medium", srcs = ["tests/test_lightning_checkpoint.py"], - tags = ["team:ml", "exclusive", "ray_air", "gpu"], + tags = ["team:ml", "exclusive", "ray_air", "gpu", "compat_py38"], deps = [":train_lib"] ) @@ -420,7 +420,7 @@ py_test( name = "test_lightning_trainer_restore", size = "medium", srcs = ["tests/test_lightning_trainer_restore.py"], - tags = ["team:ml", "exclusive", "ray_air", "gpu"], + tags = ["team:ml", "exclusive", "ray_air", "gpu", "compat_py38"], deps = [":train_lib"] ) @@ -428,7 +428,7 @@ py_test( name = "test_lightning_trainer", size = "large", srcs = ["tests/test_lightning_trainer.py"], - tags = ["team:ml", "exclusive", "ray_air", "gpu"], + tags = ["team:ml", "exclusive", "ray_air", "gpu", "compat_py38"], deps = [":train_lib"] ) @@ -436,7 +436,7 @@ py_test( name = "test_lightning_predictor", size = "medium", srcs = ["tests/test_lightning_predictor.py"], - tags = ["team:ml", "exclusive", "ray_air", "gpu"], + tags = ["team:ml", "exclusive", "ray_air", "gpu", "compat_py38"], deps = [":train_lib"] ) diff --git a/python/requirements/compat/requirements_py38_compat.txt b/python/requirements/compat/requirements_py38_compat.txt new file mode 100644 index 0000000000000..3da6084aef145 --- /dev/null +++ b/python/requirements/compat/requirements_py38_compat.txt @@ -0,0 +1,10 @@ +# These are compatibility requirements to make sure certain workflows continue to work +# with these dependency versions and on Python 3.8. +# Concretely, we set up a fresh Python 3.8 environment and +# run the pipeline job in `Ray Python 3.8 ML compatibility tests` with these dependencies installed. + +# ML libraries +lightning==2.0.0 +pytorch-lightning==2.0.0 + +ray[tune,data] From bab988749ca24036e6e4d0520f64043e0e6c88d7 Mon Sep 17 00:00:00 2001 From: woshiyyya Date: Thu, 6 Apr 2023 18:27:02 -0700 Subject: [PATCH 02/20] fix buildkite Signed-off-by: woshiyyya --- .buildkite/pipeline.build.yml | 13 ------------- .buildkite/pipeline.gpu_large.yml | 13 +++++++++++++ .../compat/requirements_py38_compat.txt | 10 ---------- 3 files changed, 13 insertions(+), 23 deletions(-) delete mode 100644 python/requirements/compat/requirements_py38_compat.txt diff --git a/.buildkite/pipeline.build.yml b/.buildkite/pipeline.build.yml index 05391b23118ff..1d8969a95ee7d 100644 --- a/.buildkite/pipeline.build.yml +++ b/.buildkite/pipeline.build.yml @@ -625,19 +625,6 @@ - python ./ci/env/check_minimal_install.py - bazel test --config=ci $(./ci/run/bazel_export_options) --build_tests_only --test_tag_filters=minimal python/ray/train/... -- label: ":cold_face: :python: Ray Python 3.8 ML compatibility tests" - conditions: - ["ALWAYS", "RAY_CI_TUNE_AFFECTED", "RAY_CI_TRAIN_AFFECTED", "RAY_CI_ML_AFFECTED", ] - instance_size: large - commands: - - cleanup() { if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then ./ci/build/upload_build_info.sh; fi }; trap cleanup EXIT - - ./ci/env/install-minimal.sh 3.8 - - pip install -r python/requirements/compat/requirements_py38_compat.txt - - pip install -U typing-extensions - - ./ci/env/env_info.sh - - bazel test --config=ci $(./scripts/bazel_export_options) --test_tag_filters=compat_py38 - python/ray/train/... - - label: ":cold_face: :python: Ray Python 3.6 ML compatibility tests" conditions: ["ALWAYS", "RAY_CI_TUNE_AFFECTED", "RAY_CI_TRAIN_AFFECTED", "RAY_CI_ML_AFFECTED", ] diff --git a/.buildkite/pipeline.gpu_large.yml b/.buildkite/pipeline.gpu_large.yml index 2f993cd965463..434cd4e8996ed 100644 --- a/.buildkite/pipeline.gpu_large.yml +++ b/.buildkite/pipeline.gpu_large.yml @@ -49,3 +49,16 @@ - pip install -Ur ./python/requirements/ml/requirements_ml_docker.txt - ./ci/env/env_info.sh - bazel test --config=ci $(./ci/run/bazel_export_options) --build_tests_only --test_tag_filters=gpu,-timeseries_libs,-py37,-post_wheel_build doc/... + +- label: ":tc: :python: Python 3.8 Train GPU tests" + conditions: + ["ALWAYS", "RAY_CI_TUNE_AFFECTED", "RAY_CI_TRAIN_AFFECTED", "RAY_CI_ML_AFFECTED", ] + instance_size: large + commands: + - cleanup() { if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then ./ci/build/upload_build_info.sh; fi }; trap cleanup EXIT + - DATA_PROCESSING_TESTING=1 TRAIN_TESTING=1 TUNE_TESTING=1 ./ci/env/install-dependencies.sh + - pip uninstall pytorch-lightning + - pip install lightning==2.0.0 + - ./ci/env/env_info.sh + - bazel test --config=ci $(./scripts/bazel_export_options) --test_tag_filters=lightning_v2 + python/ray/train/... \ No newline at end of file diff --git a/python/requirements/compat/requirements_py38_compat.txt b/python/requirements/compat/requirements_py38_compat.txt deleted file mode 100644 index 3da6084aef145..0000000000000 --- a/python/requirements/compat/requirements_py38_compat.txt +++ /dev/null @@ -1,10 +0,0 @@ -# These are compatibility requirements to make sure certain workflows continue to work -# with these dependency versions and on Python 3.8. -# Concretely, we set up a fresh Python 3.8 environment and -# run the pipeline job in `Ray Python 3.8 ML compatibility tests` with these dependencies installed. - -# ML libraries -lightning==2.0.0 -pytorch-lightning==2.0.0 - -ray[tune,data] From 7a5910bef7198bdf9470a9254993b75777985fe8 Mon Sep 17 00:00:00 2001 From: woshiyyya Date: Fri, 7 Apr 2023 12:01:17 -0700 Subject: [PATCH 03/20] fix pipeline tag Signed-off-by: woshiyyya --- .buildkite/pipeline.gpu_large.yml | 6 +++--- python/ray/train/BUILD | 8 ++++---- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/.buildkite/pipeline.gpu_large.yml b/.buildkite/pipeline.gpu_large.yml index 434cd4e8996ed..a6f1d7d9d0cdc 100644 --- a/.buildkite/pipeline.gpu_large.yml +++ b/.buildkite/pipeline.gpu_large.yml @@ -50,15 +50,15 @@ - ./ci/env/env_info.sh - bazel test --config=ci $(./ci/run/bazel_export_options) --build_tests_only --test_tag_filters=gpu,-timeseries_libs,-py37,-post_wheel_build doc/... -- label: ":tc: :python: Python 3.8 Train GPU tests" +- label: ":tc: :python: Lightning 2.0 Train GPU tests" conditions: ["ALWAYS", "RAY_CI_TUNE_AFFECTED", "RAY_CI_TRAIN_AFFECTED", "RAY_CI_ML_AFFECTED", ] instance_size: large commands: - cleanup() { if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then ./ci/build/upload_build_info.sh; fi }; trap cleanup EXIT - DATA_PROCESSING_TESTING=1 TRAIN_TESTING=1 TUNE_TESTING=1 ./ci/env/install-dependencies.sh - - pip uninstall pytorch-lightning + - pip uninstall -y pytorch-lightning - pip install lightning==2.0.0 - ./ci/env/env_info.sh - - bazel test --config=ci $(./scripts/bazel_export_options) --test_tag_filters=lightning_v2 + - bazel test --config=ci $(./scripts/bazel_export_options) --test_tag_filters=ptl_v2 python/ray/train/... \ No newline at end of file diff --git a/python/ray/train/BUILD b/python/ray/train/BUILD index a52ab574c57de..1d932ca912a8d 100644 --- a/python/ray/train/BUILD +++ b/python/ray/train/BUILD @@ -412,7 +412,7 @@ py_test( name = "test_lightning_checkpoint", size = "medium", srcs = ["tests/test_lightning_checkpoint.py"], - tags = ["team:ml", "exclusive", "ray_air", "gpu", "compat_py38"], + tags = ["team:ml", "exclusive", "ray_air", "gpu", "ptl_v2"], deps = [":train_lib"] ) @@ -420,7 +420,7 @@ py_test( name = "test_lightning_trainer_restore", size = "medium", srcs = ["tests/test_lightning_trainer_restore.py"], - tags = ["team:ml", "exclusive", "ray_air", "gpu", "compat_py38"], + tags = ["team:ml", "exclusive", "ray_air", "gpu", "ptl_v2"], deps = [":train_lib"] ) @@ -428,7 +428,7 @@ py_test( name = "test_lightning_trainer", size = "large", srcs = ["tests/test_lightning_trainer.py"], - tags = ["team:ml", "exclusive", "ray_air", "gpu", "compat_py38"], + tags = ["team:ml", "exclusive", "ray_air", "gpu", "ptl_v2"], deps = [":train_lib"] ) @@ -436,7 +436,7 @@ py_test( name = "test_lightning_predictor", size = "medium", srcs = ["tests/test_lightning_predictor.py"], - tags = ["team:ml", "exclusive", "ray_air", "gpu", "compat_py38"], + tags = ["team:ml", "exclusive", "ray_air", "gpu", "ptl_v2"], deps = [":train_lib"] ) From 6d0668dff7ff13dcb671918e2e22eb1cd24e59a1 Mon Sep 17 00:00:00 2001 From: woshiyyya Date: Fri, 7 Apr 2023 12:04:40 -0700 Subject: [PATCH 04/20] change test model definition for ptl20 Signed-off-by: woshiyyya --- .../ray/train/tests/lightning_test_utils.py | 26 ++++++++++++++----- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/python/ray/train/tests/lightning_test_utils.py b/python/ray/train/tests/lightning_test_utils.py index fcf37af1becc9..c3b91538b7bab 100644 --- a/python/ray/train/tests/lightning_test_utils.py +++ b/python/ray/train/tests/lightning_test_utils.py @@ -10,6 +10,7 @@ class LinearModule(pl.LightningModule): def __init__(self, input_dim, output_dim) -> None: super().__init__() self.linear = nn.Linear(input_dim, output_dim) + self.loss = [] def forward(self, input): return self.linear(input) @@ -22,11 +23,13 @@ def training_step(self, batch): def validation_step(self, val_batch, batch_idx): loss = self.forward(val_batch) + self.loss.append(loss) return {"val_loss": loss} - def validation_epoch_end(self, outputs) -> None: - avg_loss = torch.stack([x["val_loss"] for x in outputs]).mean() + def on_validation_epoch_end(self) -> None: + avg_loss = torch.stack(self.loss).mean() self.log("val_loss", avg_loss) + self.loss.clear() def predict_step(self, batch, batch_idx): return self.forward(batch) @@ -40,6 +43,7 @@ def __init__(self, input_dim_1, input_dim_2, output_dim) -> None: super().__init__() self.linear_1 = nn.Linear(input_dim_1, output_dim) self.linear_2 = nn.Linear(input_dim_2, output_dim) + self.loss = [] def forward(self, batch): input_1 = batch["input_1"] @@ -54,12 +58,14 @@ def training_step(self, batch): def validation_step(self, val_batch, batch_idx): loss = self.forward(val_batch) + self.loss.append(loss) return {"val_loss": loss} - def validation_epoch_end(self, outputs) -> None: + def on_validation_epoch_end(self) -> None: print("Validation Epoch:", self.current_epoch) - avg_loss = torch.stack([x["val_loss"] for x in outputs]).mean() + avg_loss = torch.stack(self.loss).mean() self.log("val_loss", avg_loss) + self.loss.clear() def predict_step(self, batch, batch_idx): return self.forward(batch) @@ -92,6 +98,8 @@ def __init__(self, lr: float, layer_1: int, layer_2: int): self.layer_2 = torch.nn.Linear(layer_1, layer_2) self.layer_3 = torch.nn.Linear(layer_2, 10) self.accuracy = Accuracy() + self.val_acc_list = [] + self.val_loss_list = [] def forward(self, x): batch_size, channels, width, height = x.size() @@ -121,13 +129,17 @@ def validation_step(self, val_batch, batch_idx): logits = self.forward(x) loss = F.nll_loss(logits, y) acc = self.accuracy(logits, y) + self.val_acc_list.append(acc) + self.val_loss_list.append(loss) return {"val_loss": loss, "val_accuracy": acc} - def validation_epoch_end(self, outputs): - avg_loss = torch.stack([x["val_loss"] for x in outputs]).mean() - avg_acc = torch.stack([x["val_accuracy"] for x in outputs]).mean() + def on_validation_epoch_end(self): + avg_loss = torch.stack(self.val_loss_list).mean() + avg_acc = torch.stack(self.val_acc_list).mean() self.log("ptl/val_loss", avg_loss) self.log("ptl/val_accuracy", avg_acc) + self.val_acc_list.clear() + self.val_loss_list.clear() def predict_step(self, batch, batch_idx, dataloader_idx=None): x = batch From 723f934ff7940e42c11b2a4de36385e08d2154c3 Mon Sep 17 00:00:00 2001 From: woshiyyya Date: Fri, 7 Apr 2023 13:39:18 -0700 Subject: [PATCH 05/20] specify py38 Signed-off-by: woshiyyya --- .buildkite/pipeline.gpu_large.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.buildkite/pipeline.gpu_large.yml b/.buildkite/pipeline.gpu_large.yml index a6f1d7d9d0cdc..1be686fcc9acb 100644 --- a/.buildkite/pipeline.gpu_large.yml +++ b/.buildkite/pipeline.gpu_large.yml @@ -56,6 +56,7 @@ instance_size: large commands: - cleanup() { if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then ./ci/build/upload_build_info.sh; fi }; trap cleanup EXIT + - ./ci/env/install-minimal.sh 3.8 - DATA_PROCESSING_TESTING=1 TRAIN_TESTING=1 TUNE_TESTING=1 ./ci/env/install-dependencies.sh - pip uninstall -y pytorch-lightning - pip install lightning==2.0.0 From 976a9a1eb5b4361acc0f14404f5ab82f0591e7f8 Mon Sep 17 00:00:00 2001 From: woshiyyya Date: Wed, 12 Apr 2023 11:14:35 -0700 Subject: [PATCH 06/20] fixing Signed-off-by: woshiyyya --- .buildkite/pipeline.gpu_large.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.buildkite/pipeline.gpu_large.yml b/.buildkite/pipeline.gpu_large.yml index 1be686fcc9acb..8b43f09cd8cbe 100644 --- a/.buildkite/pipeline.gpu_large.yml +++ b/.buildkite/pipeline.gpu_large.yml @@ -57,7 +57,7 @@ commands: - cleanup() { if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then ./ci/build/upload_build_info.sh; fi }; trap cleanup EXIT - ./ci/env/install-minimal.sh 3.8 - - DATA_PROCESSING_TESTING=1 TRAIN_TESTING=1 TUNE_TESTING=1 ./ci/env/install-dependencies.sh + - PYTHON=3.8 DATA_PROCESSING_TESTING=1 TRAIN_TESTING=1 TUNE_TESTING=1 ./ci/env/install-dependencies.sh - pip uninstall -y pytorch-lightning - pip install lightning==2.0.0 - ./ci/env/env_info.sh From f36ec3dc2f95b3634931fe7de0fdc0bb99aebae7 Mon Sep 17 00:00:00 2001 From: woshiyyya Date: Wed, 12 Apr 2023 12:14:53 -0700 Subject: [PATCH 07/20] fixing Signed-off-by: woshiyyya --- .buildkite/pipeline.gpu_large.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.buildkite/pipeline.gpu_large.yml b/.buildkite/pipeline.gpu_large.yml index 8b43f09cd8cbe..3589b71d4484c 100644 --- a/.buildkite/pipeline.gpu_large.yml +++ b/.buildkite/pipeline.gpu_large.yml @@ -56,8 +56,8 @@ instance_size: large commands: - cleanup() { if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then ./ci/build/upload_build_info.sh; fi }; trap cleanup EXIT - - ./ci/env/install-minimal.sh 3.8 - - PYTHON=3.8 DATA_PROCESSING_TESTING=1 TRAIN_TESTING=1 TUNE_TESTING=1 ./ci/env/install-dependencies.sh + - PYTHON=3.8 DATA_PROCESSING_TESTING=1 DOC_TESTING=1 TRAIN_TESTING=1 TUNE_TESTING=1 ./ci/env/install-dependencies.sh + - pip install -Ur ./python/requirements/ml/requirements_ml_docker.txt - pip uninstall -y pytorch-lightning - pip install lightning==2.0.0 - ./ci/env/env_info.sh From d326403ff12296ec18d0aecb386cd101aced529b Mon Sep 17 00:00:00 2001 From: woshiyyya Date: Wed, 12 Apr 2023 14:04:55 -0700 Subject: [PATCH 08/20] fix Signed-off-by: woshiyyya --- .buildkite/pipeline.gpu_large.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.buildkite/pipeline.gpu_large.yml b/.buildkite/pipeline.gpu_large.yml index 3589b71d4484c..82b58ad5ad52c 100644 --- a/.buildkite/pipeline.gpu_large.yml +++ b/.buildkite/pipeline.gpu_large.yml @@ -61,5 +61,5 @@ - pip uninstall -y pytorch-lightning - pip install lightning==2.0.0 - ./ci/env/env_info.sh - - bazel test --config=ci $(./scripts/bazel_export_options) --test_tag_filters=ptl_v2 + - bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only --test_tag_filters=ptl_v2,gpu python/ray/train/... \ No newline at end of file From aeac5317c2f80a2ea89323f4ca96f70c8a0285ba Mon Sep 17 00:00:00 2001 From: woshiyyya Date: Wed, 12 Apr 2023 15:41:41 -0700 Subject: [PATCH 09/20] still trying to fix ray.train.lightning not found error Signed-off-by: woshiyyya --- .buildkite/pipeline.gpu_large.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.buildkite/pipeline.gpu_large.yml b/.buildkite/pipeline.gpu_large.yml index 82b58ad5ad52c..7c2d6a06bdc6f 100644 --- a/.buildkite/pipeline.gpu_large.yml +++ b/.buildkite/pipeline.gpu_large.yml @@ -50,9 +50,9 @@ - ./ci/env/env_info.sh - bazel test --config=ci $(./ci/run/bazel_export_options) --build_tests_only --test_tag_filters=gpu,-timeseries_libs,-py37,-post_wheel_build doc/... -- label: ":tc: :python: Lightning 2.0 Train GPU tests" +- label: ":tv: :python: Lightning 2.0 Train GPU tests" conditions: - ["ALWAYS", "RAY_CI_TUNE_AFFECTED", "RAY_CI_TRAIN_AFFECTED", "RAY_CI_ML_AFFECTED", ] + ["NO_WHEELS_REQUIRED", "RAY_CI_TUNE_AFFECTED", "RAY_CI_TRAIN_AFFECTED", "RAY_CI_ML_AFFECTED", ] instance_size: large commands: - cleanup() { if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then ./ci/build/upload_build_info.sh; fi }; trap cleanup EXIT @@ -61,5 +61,5 @@ - pip uninstall -y pytorch-lightning - pip install lightning==2.0.0 - ./ci/env/env_info.sh - - bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only --test_tag_filters=ptl_v2,gpu + - bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only --test_tag_filters=ptl_v2 python/ray/train/... \ No newline at end of file From 9d24bcee96175cdf3bac85fb0ec4e566c75c508f Mon Sep 17 00:00:00 2001 From: woshiyyya Date: Wed, 12 Apr 2023 16:08:07 -0700 Subject: [PATCH 10/20] still no train.lightning module Signed-off-by: woshiyyya --- .buildkite/pipeline.gpu_large.yml | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/.buildkite/pipeline.gpu_large.yml b/.buildkite/pipeline.gpu_large.yml index 7c2d6a06bdc6f..111560d13660d 100644 --- a/.buildkite/pipeline.gpu_large.yml +++ b/.buildkite/pipeline.gpu_large.yml @@ -52,14 +52,12 @@ - label: ":tv: :python: Lightning 2.0 Train GPU tests" conditions: - ["NO_WHEELS_REQUIRED", "RAY_CI_TUNE_AFFECTED", "RAY_CI_TRAIN_AFFECTED", "RAY_CI_ML_AFFECTED", ] - instance_size: large + ["NO_WHEELS_REQUIRED", "RAY_CI_TRAIN_AFFECTED"] commands: - cleanup() { if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then ./ci/build/upload_build_info.sh; fi }; trap cleanup EXIT - - PYTHON=3.8 DATA_PROCESSING_TESTING=1 DOC_TESTING=1 TRAIN_TESTING=1 TUNE_TESTING=1 ./ci/env/install-dependencies.sh + - PYTHON=3.8 DOC_TESTING=1 TRAIN_TESTING=1 TUNE_TESTING=1 ./ci/env/install-dependencies.sh - pip install -Ur ./python/requirements/ml/requirements_ml_docker.txt - pip uninstall -y pytorch-lightning - pip install lightning==2.0.0 - ./ci/env/env_info.sh - - bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only --test_tag_filters=ptl_v2 - python/ray/train/... \ No newline at end of file + - bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only --test_tag_filters=ptl_v2 python/ray/train/... \ No newline at end of file From 0c6dc85e640cc63a406df0a7aa3f7f1437d5c3a5 Mon Sep 17 00:00:00 2001 From: woshiyyya Date: Wed, 12 Apr 2023 17:09:20 -0700 Subject: [PATCH 11/20] rm build_tests_only Signed-off-by: woshiyyya --- .buildkite/pipeline.gpu_large.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.buildkite/pipeline.gpu_large.yml b/.buildkite/pipeline.gpu_large.yml index 111560d13660d..dea45f44a1563 100644 --- a/.buildkite/pipeline.gpu_large.yml +++ b/.buildkite/pipeline.gpu_large.yml @@ -60,4 +60,4 @@ - pip uninstall -y pytorch-lightning - pip install lightning==2.0.0 - ./ci/env/env_info.sh - - bazel test --config=ci $(./scripts/bazel_export_options) --build_tests_only --test_tag_filters=ptl_v2 python/ray/train/... \ No newline at end of file + - bazel test --config=ci $(./scripts/bazel_export_options) --test_tag_filters=ptl_v2 python/ray/train/... \ No newline at end of file From 27e62034fca4058dadaa2143a62191899e0fc2ea Mon Sep 17 00:00:00 2001 From: woshiyyya Date: Wed, 12 Apr 2023 17:51:05 -0700 Subject: [PATCH 12/20] try to remove NO_WHEELS_REQUIRED Signed-off-by: woshiyyya --- .buildkite/pipeline.build.yml | 1 + .buildkite/pipeline.gpu_large.yml | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/.buildkite/pipeline.build.yml b/.buildkite/pipeline.build.yml index 1d8969a95ee7d..387df5754ec5e 100644 --- a/.buildkite/pipeline.build.yml +++ b/.buildkite/pipeline.build.yml @@ -625,6 +625,7 @@ - python ./ci/env/check_minimal_install.py - bazel test --config=ci $(./ci/run/bazel_export_options) --build_tests_only --test_tag_filters=minimal python/ray/train/... + - label: ":cold_face: :python: Ray Python 3.6 ML compatibility tests" conditions: ["ALWAYS", "RAY_CI_TUNE_AFFECTED", "RAY_CI_TRAIN_AFFECTED", "RAY_CI_ML_AFFECTED", ] diff --git a/.buildkite/pipeline.gpu_large.yml b/.buildkite/pipeline.gpu_large.yml index dea45f44a1563..cb13ad2d15fe0 100644 --- a/.buildkite/pipeline.gpu_large.yml +++ b/.buildkite/pipeline.gpu_large.yml @@ -52,7 +52,7 @@ - label: ":tv: :python: Lightning 2.0 Train GPU tests" conditions: - ["NO_WHEELS_REQUIRED", "RAY_CI_TRAIN_AFFECTED"] + ["RAY_CI_TRAIN_AFFECTED"] commands: - cleanup() { if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then ./ci/build/upload_build_info.sh; fi }; trap cleanup EXIT - PYTHON=3.8 DOC_TESTING=1 TRAIN_TESTING=1 TUNE_TESTING=1 ./ci/env/install-dependencies.sh From eee5365965ddaef5f5905201b69e45444c519b52 Mon Sep 17 00:00:00 2001 From: woshiyyya Date: Fri, 14 Apr 2023 11:35:02 -0700 Subject: [PATCH 13/20] still fixing Signed-off-by: woshiyyya --- .buildkite/pipeline.gpu_large.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.buildkite/pipeline.gpu_large.yml b/.buildkite/pipeline.gpu_large.yml index cb13ad2d15fe0..fe34ae18f89fa 100644 --- a/.buildkite/pipeline.gpu_large.yml +++ b/.buildkite/pipeline.gpu_large.yml @@ -55,6 +55,7 @@ ["RAY_CI_TRAIN_AFFECTED"] commands: - cleanup() { if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then ./ci/build/upload_build_info.sh; fi }; trap cleanup EXIT + - ./ci/env/install-minimal.sh 3.8 - PYTHON=3.8 DOC_TESTING=1 TRAIN_TESTING=1 TUNE_TESTING=1 ./ci/env/install-dependencies.sh - pip install -Ur ./python/requirements/ml/requirements_ml_docker.txt - pip uninstall -y pytorch-lightning From 17b5268e40d49b19b5d89559d3cac6dfbb910d25 Mon Sep 17 00:00:00 2001 From: woshiyyya Date: Fri, 14 Apr 2023 11:43:34 -0700 Subject: [PATCH 14/20] still fixing Signed-off-by: woshiyyya --- .buildkite/pipeline.gpu_large.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.buildkite/pipeline.gpu_large.yml b/.buildkite/pipeline.gpu_large.yml index fe34ae18f89fa..3058eb139e630 100644 --- a/.buildkite/pipeline.gpu_large.yml +++ b/.buildkite/pipeline.gpu_large.yml @@ -52,7 +52,7 @@ - label: ":tv: :python: Lightning 2.0 Train GPU tests" conditions: - ["RAY_CI_TRAIN_AFFECTED"] + ["NO_WHEELS_REQUIRED", "RAY_CI_TRAIN_AFFECTED"] commands: - cleanup() { if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then ./ci/build/upload_build_info.sh; fi }; trap cleanup EXIT - ./ci/env/install-minimal.sh 3.8 From b7296f1e432e34ef42e40335818148fbd9347525 Mon Sep 17 00:00:00 2001 From: Kai Fricke Date: Tue, 18 Apr 2023 12:01:12 +0100 Subject: [PATCH 15/20] [no_early_kickoff] No dashboard Signed-off-by: Kai Fricke --- .buildkite/pipeline.gpu_large.yml | 2 +- ci/ci.sh | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/.buildkite/pipeline.gpu_large.yml b/.buildkite/pipeline.gpu_large.yml index 3058eb139e630..f582c122f87c7 100644 --- a/.buildkite/pipeline.gpu_large.yml +++ b/.buildkite/pipeline.gpu_large.yml @@ -55,7 +55,7 @@ ["NO_WHEELS_REQUIRED", "RAY_CI_TRAIN_AFFECTED"] commands: - cleanup() { if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then ./ci/build/upload_build_info.sh; fi }; trap cleanup EXIT - - ./ci/env/install-minimal.sh 3.8 + - NO_DASHBOARD=1 ./ci/env/install-minimal.sh 3.8 - PYTHON=3.8 DOC_TESTING=1 TRAIN_TESTING=1 TUNE_TESTING=1 ./ci/env/install-dependencies.sh - pip install -Ur ./python/requirements/ml/requirements_ml_docker.txt - pip uninstall -y pytorch-lightning diff --git a/ci/ci.sh b/ci/ci.sh index ab97adf249505..57ab0ca0f50ba 100755 --- a/ci/ci.sh +++ b/ci/ci.sh @@ -284,6 +284,8 @@ install_npm_project() { build_dashboard_front_end() { if [ "${OSTYPE}" = msys ]; then { echo "WARNING: Skipping dashboard due to NPM incompatibilities with Windows"; } 2> /dev/null + elif [ "${NO_DASHBOARD}" = "1" ]; then + echo "Skipping dashboard build" else ( cd ray/dashboard/client From c0cea0ac4c31ac83d307591e89bc0cf8a5d1351b Mon Sep 17 00:00:00 2001 From: Kai Fricke Date: Tue, 18 Apr 2023 12:03:30 +0100 Subject: [PATCH 16/20] set -xe Signed-off-by: Kai Fricke --- ci/env/install-minimal.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ci/env/install-minimal.sh b/ci/env/install-minimal.sh index e99e453ea11e3..9da00d7517c3f 100755 --- a/ci/env/install-minimal.sh +++ b/ci/env/install-minimal.sh @@ -1,5 +1,7 @@ #!/usr/bin/env bash +set -xe + # Python version can be specified as 3.7, 3.8, 3.9, etc.. if [ -z "$1" ]; then PYTHON_VERSION=${PYTHON-3.7} From 447cdadf909b8181cb339c7149cbb5bce5672084 Mon Sep 17 00:00:00 2001 From: Kai Fricke Date: Tue, 18 Apr 2023 12:08:41 +0100 Subject: [PATCH 17/20] unbound Signed-off-by: Kai Fricke --- ci/ci.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/ci.sh b/ci/ci.sh index 57ab0ca0f50ba..9eba9c0b6ba3b 100755 --- a/ci/ci.sh +++ b/ci/ci.sh @@ -284,7 +284,7 @@ install_npm_project() { build_dashboard_front_end() { if [ "${OSTYPE}" = msys ]; then { echo "WARNING: Skipping dashboard due to NPM incompatibilities with Windows"; } 2> /dev/null - elif [ "${NO_DASHBOARD}" = "1" ]; then + elif [ "${NO_DASHBOARD-}" = "1" ]; then echo "Skipping dashboard build" else ( From f039751dbc337c4f2a5d7ac9b1fd34c0b54e97db Mon Sep 17 00:00:00 2001 From: woshiyyya Date: Wed, 19 Apr 2023 22:58:24 -0700 Subject: [PATCH 18/20] fix ci tests Signed-off-by: woshiyyya --- python/ray/train/tests/lightning_test_utils.py | 11 ++++++++--- python/ray/train/tests/test_lightning_checkpoint.py | 10 ++++++++-- python/ray/train/tests/test_lightning_predictor.py | 5 ++++- 3 files changed, 20 insertions(+), 6 deletions(-) diff --git a/python/ray/train/tests/lightning_test_utils.py b/python/ray/train/tests/lightning_test_utils.py index c3b91538b7bab..c58ae623336b3 100644 --- a/python/ray/train/tests/lightning_test_utils.py +++ b/python/ray/train/tests/lightning_test_utils.py @@ -7,10 +7,11 @@ class LinearModule(pl.LightningModule): - def __init__(self, input_dim, output_dim) -> None: + def __init__(self, input_dim, output_dim, strategy="ddp") -> None: super().__init__() self.linear = nn.Linear(input_dim, output_dim) self.loss = [] + self.strategy = strategy def forward(self, input): return self.linear(input) @@ -35,7 +36,11 @@ def predict_step(self, batch, batch_idx): return self.forward(batch) def configure_optimizers(self): - return torch.optim.SGD(self.parameters(), lr=0.1) + if self.strategy == "fsdp": + # Feed FSDP wrapped model parameters to optimizer + return torch.optim.SGD(self.trainer.model.parameters(), lr=0.1) + else: + return torch.optim.SGD(self.parameters(), lr=0.1) class DoubleLinearModule(pl.LightningModule): @@ -97,7 +102,7 @@ def __init__(self, lr: float, layer_1: int, layer_2: int): self.layer_1 = torch.nn.Linear(28 * 28, layer_1) self.layer_2 = torch.nn.Linear(layer_1, layer_2) self.layer_3 = torch.nn.Linear(layer_2, 10) - self.accuracy = Accuracy() + self.accuracy = Accuracy(task="multiclass", num_classes=10) self.val_acc_list = [] self.val_loss_list = [] diff --git a/python/ray/train/tests/test_lightning_checkpoint.py b/python/ray/train/tests/test_lightning_checkpoint.py index e253bb2a8b855..64bcd40b32bec 100644 --- a/python/ray/train/tests/test_lightning_checkpoint.py +++ b/python/ray/train/tests/test_lightning_checkpoint.py @@ -38,7 +38,10 @@ def test_load_from_path(): # Train one epoch and save a checkpoint trainer = pl.Trainer( - max_epochs=1, enable_progress_bar=False, enable_checkpointing=False + max_epochs=1, + accelerator="cpu", + enable_progress_bar=False, + enable_checkpointing=False, ) trainer.fit(model=model, train_dataloaders=dataloader) ckpt_path = f"{tmpdir}/random_checkpoint_name.ckpt" @@ -75,7 +78,10 @@ def test_from_directory(): # Train one epoch and save a checkpoint trainer = pl.Trainer( - max_epochs=1, enable_progress_bar=False, enable_checkpointing=False + max_epochs=1, + accelerator="cpu", + enable_progress_bar=False, + enable_checkpointing=False, ) trainer.fit(model=model, train_dataloaders=dataloader) trainer.save_checkpoint(f"{tmpdir}/{MODEL_KEY}") diff --git a/python/ray/train/tests/test_lightning_predictor.py b/python/ray/train/tests/test_lightning_predictor.py index 49ee42073b163..1fdfb21a16b29 100644 --- a/python/ray/train/tests/test_lightning_predictor.py +++ b/python/ray/train/tests/test_lightning_predictor.py @@ -28,7 +28,10 @@ def test_repr(): def save_checkpoint(model: pl.LightningModule, ckpt_path: str): - trainer = pl.Trainer(max_epochs=0) + trainer = pl.Trainer( + max_epochs=0, + accelerator="cpu", + ) trainer.fit(model, train_dataloaders=DataLoader(torch.randn(1))) trainer.save_checkpoint(ckpt_path) From 95203bb74deebc9b7f6acd344f7814427bc3a810 Mon Sep 17 00:00:00 2001 From: woshiyyya Date: Wed, 19 Apr 2023 23:00:57 -0700 Subject: [PATCH 19/20] WIP Signed-off-by: woshiyyya --- python/ray/train/tests/test_lightning_predictor.py | 5 +---- python/ray/train/tests/test_lightning_trainer.py | 4 ++-- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/python/ray/train/tests/test_lightning_predictor.py b/python/ray/train/tests/test_lightning_predictor.py index 1fdfb21a16b29..2c34b5dcc9845 100644 --- a/python/ray/train/tests/test_lightning_predictor.py +++ b/python/ray/train/tests/test_lightning_predictor.py @@ -28,10 +28,7 @@ def test_repr(): def save_checkpoint(model: pl.LightningModule, ckpt_path: str): - trainer = pl.Trainer( - max_epochs=0, - accelerator="cpu", - ) + trainer = pl.Trainer(max_epochs=0, accelerator="cpu") trainer.fit(model, train_dataloaders=DataLoader(torch.randn(1))) trainer.save_checkpoint(ckpt_path) diff --git a/python/ray/train/tests/test_lightning_trainer.py b/python/ray/train/tests/test_lightning_trainer.py index a35f37ac54e9b..aab21fb4a6d1e 100644 --- a/python/ray/train/tests/test_lightning_trainer.py +++ b/python/ray/train/tests/test_lightning_trainer.py @@ -74,7 +74,7 @@ def test_trainer_with_native_dataloader( config_builder = ( LightningConfigBuilder() - .module(LinearModule, input_dim=32, output_dim=4) + .module(LinearModule, input_dim=32, output_dim=4, strategy=strategy) .trainer(max_epochs=num_epochs, accelerator=accelerator) .strategy(strategy) ) @@ -124,7 +124,7 @@ def test_trainer_with_ray_data(ray_start_6_cpus_2_gpus, strategy, accelerator): lightning_config = ( LightningConfigBuilder() - .module(cls=LinearModule, input_dim=32, output_dim=4) + .module(cls=LinearModule, input_dim=32, output_dim=4, strategy=strategy) .trainer(max_epochs=num_epochs, accelerator=accelerator) .strategy(strategy) .build() From 29551e9d019a1761b5f2d978eb9a6c9df92dc314 Mon Sep 17 00:00:00 2001 From: woshiyyya Date: Wed, 26 Apr 2023 10:36:46 -0700 Subject: [PATCH 20/20] change emoji Signed-off-by: woshiyyya --- .buildkite/pipeline.gpu_large.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.buildkite/pipeline.gpu_large.yml b/.buildkite/pipeline.gpu_large.yml index f582c122f87c7..e15ee57050eac 100644 --- a/.buildkite/pipeline.gpu_large.yml +++ b/.buildkite/pipeline.gpu_large.yml @@ -50,7 +50,7 @@ - ./ci/env/env_info.sh - bazel test --config=ci $(./ci/run/bazel_export_options) --build_tests_only --test_tag_filters=gpu,-timeseries_libs,-py37,-post_wheel_build doc/... -- label: ":tv: :python: Lightning 2.0 Train GPU tests" +- label: ":zap: :python: Lightning 2.0 Train GPU tests" conditions: ["NO_WHEELS_REQUIRED", "RAY_CI_TRAIN_AFFECTED"] commands: