diff --git a/.buildkite/pipeline.gpu_large.yml b/.buildkite/pipeline.gpu_large.yml
index 2f993cd965463..e15ee57050eac 100644
--- a/.buildkite/pipeline.gpu_large.yml
+++ b/.buildkite/pipeline.gpu_large.yml
@@ -49,3 +49,16 @@
     - pip install -Ur ./python/requirements/ml/requirements_ml_docker.txt
     - ./ci/env/env_info.sh
     - bazel test --config=ci $(./ci/run/bazel_export_options) --build_tests_only --test_tag_filters=gpu,-timeseries_libs,-py37,-post_wheel_build doc/...
+
+- label: ":zap: :python: Lightning 2.0 Train GPU tests"
+  conditions:
+    ["NO_WHEELS_REQUIRED", "RAY_CI_TRAIN_AFFECTED"]
+  commands:
+    - cleanup() { if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then ./ci/build/upload_build_info.sh; fi }; trap cleanup EXIT
+    - NO_DASHBOARD=1 ./ci/env/install-minimal.sh 3.8
+    - PYTHON=3.8 DOC_TESTING=1 TRAIN_TESTING=1 TUNE_TESTING=1 ./ci/env/install-dependencies.sh
+    - pip install -Ur ./python/requirements/ml/requirements_ml_docker.txt
+    - pip uninstall -y pytorch-lightning
+    - pip install lightning==2.0.0
+    - ./ci/env/env_info.sh
+    - bazel test --config=ci $(./ci/run/bazel_export_options) --test_tag_filters=ptl_v2 python/ray/train/...
\ No newline at end of file
diff --git a/ci/ci.sh b/ci/ci.sh
index 98071a8bfeaa8..516fbb3f43e3d 100755
--- a/ci/ci.sh
+++ b/ci/ci.sh
@@ -284,6 +284,8 @@ install_npm_project() {
 build_dashboard_front_end() {
   if [ "${OSTYPE}" = msys ]; then
     { echo "WARNING: Skipping dashboard due to NPM incompatibilities with Windows"; } 2> /dev/null
+  elif [ "${NO_DASHBOARD-}" = "1" ]; then
+    echo "Skipping dashboard build"
   else
     (
       cd ray/dashboard/client
diff --git a/ci/env/install-minimal.sh b/ci/env/install-minimal.sh
index e99e453ea11e3..9da00d7517c3f 100755
--- a/ci/env/install-minimal.sh
+++ b/ci/env/install-minimal.sh
@@ -1,5 +1,7 @@
 #!/usr/bin/env bash
 
+set -xe
+
 # Python version can be specified as 3.7, 3.8, 3.9, etc..
 if [ -z "$1" ]; then
   PYTHON_VERSION=${PYTHON-3.7}
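
Reviewer note (not part of the patch): the new Buildkite job swaps `pytorch-lightning` 1.x for the unified `lightning==2.0.0` distribution, then runs only the Bazel targets tagged `ptl_v2`. If a test ever needed to branch on the installed version at runtime instead, a minimal sketch (the constant name is hypothetical, not from this PR):

```python
# Hypothetical runtime version gate. `lightning==2.0.0` still ships the
# `pytorch_lightning` package, so this import resolves under both CI jobs.
import pytorch_lightning as pl
from packaging.version import Version

IS_PTL_V2 = Version(pl.__version__) >= Version("2.0")
```
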
if [ -z "$1" ]; then PYTHON_VERSION=${PYTHON-3.7} diff --git a/python/ray/train/BUILD b/python/ray/train/BUILD index e2a244455ace3..1d932ca912a8d 100644 --- a/python/ray/train/BUILD +++ b/python/ray/train/BUILD @@ -412,7 +412,7 @@ py_test( name = "test_lightning_checkpoint", size = "medium", srcs = ["tests/test_lightning_checkpoint.py"], - tags = ["team:ml", "exclusive", "ray_air", "gpu"], + tags = ["team:ml", "exclusive", "ray_air", "gpu", "ptl_v2"], deps = [":train_lib"] ) @@ -420,7 +420,7 @@ py_test( name = "test_lightning_trainer_restore", size = "medium", srcs = ["tests/test_lightning_trainer_restore.py"], - tags = ["team:ml", "exclusive", "ray_air", "gpu"], + tags = ["team:ml", "exclusive", "ray_air", "gpu", "ptl_v2"], deps = [":train_lib"] ) @@ -428,7 +428,7 @@ py_test( name = "test_lightning_trainer", size = "large", srcs = ["tests/test_lightning_trainer.py"], - tags = ["team:ml", "exclusive", "ray_air", "gpu"], + tags = ["team:ml", "exclusive", "ray_air", "gpu", "ptl_v2"], deps = [":train_lib"] ) @@ -436,7 +436,7 @@ py_test( name = "test_lightning_predictor", size = "medium", srcs = ["tests/test_lightning_predictor.py"], - tags = ["team:ml", "exclusive", "ray_air", "gpu"], + tags = ["team:ml", "exclusive", "ray_air", "gpu", "ptl_v2"], deps = [":train_lib"] ) diff --git a/python/ray/train/tests/lightning_test_utils.py b/python/ray/train/tests/lightning_test_utils.py index fcf37af1becc9..c58ae623336b3 100644 --- a/python/ray/train/tests/lightning_test_utils.py +++ b/python/ray/train/tests/lightning_test_utils.py @@ -7,9 +7,11 @@ class LinearModule(pl.LightningModule): - def __init__(self, input_dim, output_dim) -> None: + def __init__(self, input_dim, output_dim, strategy="ddp") -> None: super().__init__() self.linear = nn.Linear(input_dim, output_dim) + self.loss = [] + self.strategy = strategy def forward(self, input): return self.linear(input) @@ -22,17 +24,23 @@ def training_step(self, batch): def validation_step(self, val_batch, batch_idx): loss = self.forward(val_batch) + self.loss.append(loss) return {"val_loss": loss} - def validation_epoch_end(self, outputs) -> None: - avg_loss = torch.stack([x["val_loss"] for x in outputs]).mean() + def on_validation_epoch_end(self) -> None: + avg_loss = torch.stack(self.loss).mean() self.log("val_loss", avg_loss) + self.loss.clear() def predict_step(self, batch, batch_idx): return self.forward(batch) def configure_optimizers(self): - return torch.optim.SGD(self.parameters(), lr=0.1) + if self.strategy == "fsdp": + # Feed FSDP wrapped model parameters to optimizer + return torch.optim.SGD(self.trainer.model.parameters(), lr=0.1) + else: + return torch.optim.SGD(self.parameters(), lr=0.1) class DoubleLinearModule(pl.LightningModule): @@ -40,6 +48,7 @@ def __init__(self, input_dim_1, input_dim_2, output_dim) -> None: super().__init__() self.linear_1 = nn.Linear(input_dim_1, output_dim) self.linear_2 = nn.Linear(input_dim_2, output_dim) + self.loss = [] def forward(self, batch): input_1 = batch["input_1"] @@ -54,12 +63,14 @@ def training_step(self, batch): def validation_step(self, val_batch, batch_idx): loss = self.forward(val_batch) + self.loss.append(loss) return {"val_loss": loss} - def validation_epoch_end(self, outputs) -> None: + def on_validation_epoch_end(self) -> None: print("Validation Epoch:", self.current_epoch) - avg_loss = torch.stack([x["val_loss"] for x in outputs]).mean() + avg_loss = torch.stack(self.loss).mean() self.log("val_loss", avg_loss) + self.loss.clear() def predict_step(self, batch, batch_idx): return 
diff --git a/python/ray/train/tests/test_lightning_checkpoint.py b/python/ray/train/tests/test_lightning_checkpoint.py
index e253bb2a8b855..64bcd40b32bec 100644
--- a/python/ray/train/tests/test_lightning_checkpoint.py
+++ b/python/ray/train/tests/test_lightning_checkpoint.py
@@ -38,7 +38,10 @@ def test_load_from_path():
 
     # Train one epoch and save a checkpoint
     trainer = pl.Trainer(
-        max_epochs=1, enable_progress_bar=False, enable_checkpointing=False
+        max_epochs=1,
+        accelerator="cpu",
+        enable_progress_bar=False,
+        enable_checkpointing=False,
     )
     trainer.fit(model=model, train_dataloaders=dataloader)
     ckpt_path = f"{tmpdir}/random_checkpoint_name.ckpt"
@@ -75,7 +78,10 @@ def test_from_directory():
 
     # Train one epoch and save a checkpoint
     trainer = pl.Trainer(
-        max_epochs=1, enable_progress_bar=False, enable_checkpointing=False
+        max_epochs=1,
+        accelerator="cpu",
+        enable_progress_bar=False,
+        enable_checkpointing=False,
     )
     trainer.fit(model=model, train_dataloaders=dataloader)
     trainer.save_checkpoint(f"{tmpdir}/{MODEL_KEY}")
diff --git a/python/ray/train/tests/test_lightning_predictor.py b/python/ray/train/tests/test_lightning_predictor.py
index 49ee42073b163..2c34b5dcc9845 100644
--- a/python/ray/train/tests/test_lightning_predictor.py
+++ b/python/ray/train/tests/test_lightning_predictor.py
@@ -28,7 +28,7 @@ def test_repr():
 
 
 def save_checkpoint(model: pl.LightningModule, ckpt_path: str):
-    trainer = pl.Trainer(max_epochs=0)
+    trainer = pl.Trainer(max_epochs=0, accelerator="cpu")
     trainer.fit(model, train_dataloaders=DataLoader(torch.randn(1)))
     trainer.save_checkpoint(ckpt_path)
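
Reviewer note (not part of the patch): the `accelerator="cpu"` additions in the two test files above are presumably needed because Lightning 2.0 changed the `Trainer` default to `accelerator="auto"`, which claims any visible GPU; pinning CPU keeps these lightweight checkpoint/predictor fixtures off the GPUs reserved for the distributed tests. The construction in isolation:

```python
import pytorch_lightning as pl

# Under Lightning 2.0, a bare Trainer() resolves accelerator="auto" and may
# grab a GPU if one is visible; pin to CPU for small test fixtures.
trainer = pl.Trainer(
    max_epochs=1,
    accelerator="cpu",
    enable_progress_bar=False,
    enable_checkpointing=False,
)
```
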
diff --git a/python/ray/train/tests/test_lightning_trainer.py b/python/ray/train/tests/test_lightning_trainer.py
index a35f37ac54e9b..aab21fb4a6d1e 100644
--- a/python/ray/train/tests/test_lightning_trainer.py
+++ b/python/ray/train/tests/test_lightning_trainer.py
@@ -74,7 +74,7 @@ def test_trainer_with_native_dataloader(
 
     config_builder = (
         LightningConfigBuilder()
-        .module(LinearModule, input_dim=32, output_dim=4)
+        .module(LinearModule, input_dim=32, output_dim=4, strategy=strategy)
         .trainer(max_epochs=num_epochs, accelerator=accelerator)
         .strategy(strategy)
     )
@@ -124,7 +124,7 @@ def test_trainer_with_ray_data(ray_start_6_cpus_2_gpus, strategy, accelerator):
 
     lightning_config = (
         LightningConfigBuilder()
-        .module(cls=LinearModule, input_dim=32, output_dim=4)
+        .module(cls=LinearModule, input_dim=32, output_dim=4, strategy=strategy)
         .trainer(max_epochs=num_epochs, accelerator=accelerator)
         .strategy(strategy)
         .build()
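
Reviewer note (not part of the patch): the trainer tests forward `strategy` into `.module(...)` because `LightningConfigBuilder.module()` passes its keyword arguments through to the module constructor; that is how `LinearModule` learns whether to take the FSDP branch in its `configure_optimizers`. A hedged usage sketch, assuming the test-utils import path:

```python
from ray.train.lightning import LightningConfigBuilder
from ray.train.tests.lightning_test_utils import LinearModule

# .module() keyword arguments are forwarded to LinearModule.__init__, so
# strategy="fsdp" reaches the module and selects the FSDP-aware optimizer.
lightning_config = (
    LightningConfigBuilder()
    .module(cls=LinearModule, input_dim=32, output_dim=4, strategy="fsdp")
    .trainer(max_epochs=2, accelerator="gpu")
    .strategy("fsdp")
    .build()
)
```
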