[CI] Add Lightning 2.0 compatibility test pipeline #34147

Merged (24 commits) on Apr 27, 2023
13 changes: 13 additions & 0 deletions .buildkite/pipeline.gpu_large.yml
@@ -49,3 +49,16 @@
     - pip install -Ur ./python/requirements/ml/requirements_ml_docker.txt
     - ./ci/env/env_info.sh
     - bazel test --config=ci $(./ci/run/bazel_export_options) --build_tests_only --test_tag_filters=gpu,-timeseries_libs,-py37,-post_wheel_build doc/...
+
+- label: ":zap: :python: Lightning 2.0 Train GPU tests"
+  conditions:
+    ["NO_WHEELS_REQUIRED", "RAY_CI_TRAIN_AFFECTED"]
+  commands:
+    - cleanup() { if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then ./ci/build/upload_build_info.sh; fi }; trap cleanup EXIT
+    - NO_DASHBOARD=1 ./ci/env/install-minimal.sh 3.8
+    - PYTHON=3.8 DOC_TESTING=1 TRAIN_TESTING=1 TUNE_TESTING=1 ./ci/env/install-dependencies.sh
+    - pip install -Ur ./python/requirements/ml/requirements_ml_docker.txt
+    - pip uninstall -y pytorch-lightning
+    - pip install lightning==2.0.0
+    - ./ci/env/env_info.sh
+    - bazel test --config=ci $(./ci/run/bazel_export_options) --test_tag_filters=ptl_v2 python/ray/train/...
2 changes: 2 additions & 0 deletions ci/ci.sh
@@ -284,6 +284,8 @@ install_npm_project() {
 build_dashboard_front_end() {
   if [ "${OSTYPE}" = msys ]; then
     { echo "WARNING: Skipping dashboard due to NPM incompatibilities with Windows"; } 2> /dev/null
+  elif [ "${NO_DASHBOARD-}" = "1" ]; then
+    echo "Skipping dashboard build"
   else
     (
       cd ray/dashboard/client
2 changes: 2 additions & 0 deletions ci/env/install-minimal.sh
@@ -1,5 +1,7 @@
 #!/usr/bin/env bash
 
+set -xe
+
 # Python version can be specified as 3.7, 3.8, 3.9, etc..
 if [ -z "$1" ]; then
   PYTHON_VERSION=${PYTHON-3.7}
8 changes: 4 additions & 4 deletions python/ray/train/BUILD
@@ -412,31 +412,31 @@ py_test(
     name = "test_lightning_checkpoint",
     size = "medium",
     srcs = ["tests/test_lightning_checkpoint.py"],
-    tags = ["team:ml", "exclusive", "ray_air", "gpu"],
+    tags = ["team:ml", "exclusive", "ray_air", "gpu", "ptl_v2"],
     deps = [":train_lib"]
 )
 
 py_test(
     name = "test_lightning_trainer_restore",
     size = "medium",
     srcs = ["tests/test_lightning_trainer_restore.py"],
-    tags = ["team:ml", "exclusive", "ray_air", "gpu"],
+    tags = ["team:ml", "exclusive", "ray_air", "gpu", "ptl_v2"],
     deps = [":train_lib"]
 )
 
 py_test(
     name = "test_lightning_trainer",
     size = "large",
     srcs = ["tests/test_lightning_trainer.py"],
-    tags = ["team:ml", "exclusive", "ray_air", "gpu"],
+    tags = ["team:ml", "exclusive", "ray_air", "gpu", "ptl_v2"],
     deps = [":train_lib"]
 )
 
 py_test(
     name = "test_lightning_predictor",
     size = "medium",
     srcs = ["tests/test_lightning_predictor.py"],
-    tags = ["team:ml", "exclusive", "ray_air", "gpu"],
+    tags = ["team:ml", "exclusive", "ray_air", "gpu", "ptl_v2"],
     deps = [":train_lib"]
 )
 
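The new `ptl_v2` tag is what wires these targets into the Buildkite step above: `--test_tag_filters=ptl_v2` selects exactly the targets that carry it. As an illustration (the target name below is hypothetical, not part of this PR), a future Lightning test would opt into the pipeline the same way:

py_test(
    name = "test_lightning_new_feature",  # hypothetical target
    size = "medium",
    srcs = ["tests/test_lightning_new_feature.py"],
    # "ptl_v2" opts the target into the Lightning 2.0 test run;
    # the other tags mirror the existing Lightning targets above.
    tags = ["team:ml", "exclusive", "ray_air", "gpu", "ptl_v2"],
    deps = [":train_lib"]
)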
37 changes: 27 additions & 10 deletions python/ray/train/tests/lightning_test_utils.py
@@ -7,9 +7,11 @@
 
 
 class LinearModule(pl.LightningModule):
-    def __init__(self, input_dim, output_dim) -> None:
+    def __init__(self, input_dim, output_dim, strategy="ddp") -> None:
         super().__init__()
         self.linear = nn.Linear(input_dim, output_dim)
+        self.loss = []
+        self.strategy = strategy
 
     def forward(self, input):
         return self.linear(input)
@@ -22,24 +24,31 @@ def training_step(self, batch):
 
     def validation_step(self, val_batch, batch_idx):
         loss = self.forward(val_batch)
+        self.loss.append(loss)
         return {"val_loss": loss}
 
-    def validation_epoch_end(self, outputs) -> None:
-        avg_loss = torch.stack([x["val_loss"] for x in outputs]).mean()
+    def on_validation_epoch_end(self) -> None:
+        avg_loss = torch.stack(self.loss).mean()
         self.log("val_loss", avg_loss)
+        self.loss.clear()
 
     def predict_step(self, batch, batch_idx):
         return self.forward(batch)
 
     def configure_optimizers(self):
-        return torch.optim.SGD(self.parameters(), lr=0.1)
+        if self.strategy == "fsdp":
+            # Feed FSDP wrapped model parameters to optimizer
+            return torch.optim.SGD(self.trainer.model.parameters(), lr=0.1)
+        else:
+            return torch.optim.SGD(self.parameters(), lr=0.1)
 
 
 class DoubleLinearModule(pl.LightningModule):
     def __init__(self, input_dim_1, input_dim_2, output_dim) -> None:
         super().__init__()
         self.linear_1 = nn.Linear(input_dim_1, output_dim)
         self.linear_2 = nn.Linear(input_dim_2, output_dim)
+        self.loss = []
 
     def forward(self, batch):
         input_1 = batch["input_1"]
@@ -54,12 +63,14 @@ def training_step(self, batch):
 
     def validation_step(self, val_batch, batch_idx):
         loss = self.forward(val_batch)
+        self.loss.append(loss)
         return {"val_loss": loss}
 
-    def validation_epoch_end(self, outputs) -> None:
+    def on_validation_epoch_end(self) -> None:
         print("Validation Epoch:", self.current_epoch)
-        avg_loss = torch.stack([x["val_loss"] for x in outputs]).mean()
+        avg_loss = torch.stack(self.loss).mean()
         self.log("val_loss", avg_loss)
+        self.loss.clear()
 
     def predict_step(self, batch, batch_idx):
         return self.forward(batch)
@@ -91,7 +102,9 @@ def __init__(self, lr: float, layer_1: int, layer_2: int):
         self.layer_1 = torch.nn.Linear(28 * 28, layer_1)
         self.layer_2 = torch.nn.Linear(layer_1, layer_2)
         self.layer_3 = torch.nn.Linear(layer_2, 10)
-        self.accuracy = Accuracy()
+        self.accuracy = Accuracy(task="multiclass", num_classes=10)
+        self.val_acc_list = []
+        self.val_loss_list = []
 
     def forward(self, x):
         batch_size, channels, width, height = x.size()
@@ -121,13 +134,17 @@ def validation_step(self, val_batch, batch_idx):
         logits = self.forward(x)
         loss = F.nll_loss(logits, y)
         acc = self.accuracy(logits, y)
+        self.val_acc_list.append(acc)
+        self.val_loss_list.append(loss)
         return {"val_loss": loss, "val_accuracy": acc}
 
-    def validation_epoch_end(self, outputs):
-        avg_loss = torch.stack([x["val_loss"] for x in outputs]).mean()
-        avg_acc = torch.stack([x["val_accuracy"] for x in outputs]).mean()
+    def on_validation_epoch_end(self):
+        avg_loss = torch.stack(self.val_loss_list).mean()
+        avg_acc = torch.stack(self.val_acc_list).mean()
         self.log("ptl/val_loss", avg_loss)
         self.log("ptl/val_accuracy", avg_acc)
+        self.val_acc_list.clear()
+        self.val_loss_list.clear()
 
     def predict_step(self, batch, batch_idx, dataloader_idx=None):
         x = batch
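The changes above follow the standard PyTorch Lightning 1.x → 2.0 migration: `validation_epoch_end(outputs)` was removed in 2.0, so each module now accumulates its own step outputs in a list and aggregates them in `on_validation_epoch_end`, clearing the list afterwards. The `configure_optimizers` branch exists because FSDP wraps the module, so the optimizer must be built from the wrapped `self.trainer.model.parameters()`. A minimal self-contained sketch of the validation idiom (a hypothetical module, not code from this PR):

import pytorch_lightning as pl
import torch
import torch.nn as nn

class TinyModule(pl.LightningModule):
    """Hypothetical module showing the Lightning 2.0 validation idiom."""

    def __init__(self):
        super().__init__()
        self.layer = nn.Linear(8, 1)
        self.val_losses = []  # manual accumulation replaces the removed `outputs` argument

    def training_step(self, batch, batch_idx):
        return self.layer(batch).mean()

    def validation_step(self, batch, batch_idx):
        loss = self.layer(batch).mean()
        self.val_losses.append(loss)
        return loss

    def on_validation_epoch_end(self):
        # Aggregate what validation_step collected, then reset for the next epoch.
        self.log("val_loss", torch.stack(self.val_losses).mean())
        self.val_losses.clear()

    def configure_optimizers(self):
        return torch.optim.SGD(self.parameters(), lr=0.1)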
10 changes: 8 additions & 2 deletions python/ray/train/tests/test_lightning_checkpoint.py
@@ -38,7 +38,10 @@ def test_load_from_path():
 
     # Train one epoch and save a checkpoint
     trainer = pl.Trainer(
-        max_epochs=1, enable_progress_bar=False, enable_checkpointing=False
+        max_epochs=1,
+        accelerator="cpu",
+        enable_progress_bar=False,
+        enable_checkpointing=False,
     )
     trainer.fit(model=model, train_dataloaders=dataloader)
     ckpt_path = f"{tmpdir}/random_checkpoint_name.ckpt"
@@ -75,7 +78,10 @@ def test_from_directory():
 
     # Train one epoch and save a checkpoint
     trainer = pl.Trainer(
-        max_epochs=1, enable_progress_bar=False, enable_checkpointing=False
+        max_epochs=1,
+        accelerator="cpu",
+        enable_progress_bar=False,
+        enable_checkpointing=False,
     )
     trainer.fit(model=model, train_dataloaders=dataloader)
     trainer.save_checkpoint(f"{tmpdir}/{MODEL_KEY}")
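Pinning `accelerator="cpu"` matters under Lightning 2.0 because the Trainer now defaults to `accelerator="auto"`, which would silently pick up a GPU on these CI machines. A sketch of the round-trip these tests perform, reusing the hypothetical `TinyModule` from above (the checkpoint path is illustrative):

from torch.utils.data import DataLoader

model = TinyModule()
trainer = pl.Trainer(
    max_epochs=1,
    accelerator="cpu",  # explicit, since Lightning 2.0 defaults to "auto"
    enable_progress_bar=False,
    enable_checkpointing=False,  # disables the callback; manual saves still work
)
trainer.fit(model, train_dataloaders=DataLoader(torch.randn(16, 8)))
trainer.save_checkpoint("/tmp/tiny.ckpt")
restored = TinyModule.load_from_checkpoint("/tmp/tiny.ckpt")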
2 changes: 1 addition & 1 deletion python/ray/train/tests/test_lightning_predictor.py
@@ -28,7 +28,7 @@ def test_repr():
 
 
 def save_checkpoint(model: pl.LightningModule, ckpt_path: str):
-    trainer = pl.Trainer(max_epochs=0)
+    trainer = pl.Trainer(max_epochs=0, accelerator="cpu")
     trainer.fit(model, train_dataloaders=DataLoader(torch.randn(1)))
     trainer.save_checkpoint(ckpt_path)
 
4 changes: 2 additions & 2 deletions python/ray/train/tests/test_lightning_trainer.py
@@ -74,7 +74,7 @@ def test_trainer_with_native_dataloader(
 
     config_builder = (
         LightningConfigBuilder()
-        .module(LinearModule, input_dim=32, output_dim=4)
+        .module(LinearModule, input_dim=32, output_dim=4, strategy=strategy)
         .trainer(max_epochs=num_epochs, accelerator=accelerator)
         .strategy(strategy)
     )
@@ -124,7 +124,7 @@ def test_trainer_with_ray_data(ray_start_6_cpus_2_gpus, strategy, accelerator):
 
     lightning_config = (
         LightningConfigBuilder()
-        .module(cls=LinearModule, input_dim=32, output_dim=4)
+        .module(cls=LinearModule, input_dim=32, output_dim=4, strategy=strategy)
         .trainer(max_epochs=num_epochs, accelerator=accelerator)
         .strategy(strategy)
         .build()
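Threading `strategy` through `.module(...)` is what lets `configure_optimizers` in `lightning_test_utils.py` switch to the FSDP-wrapped parameters. A sketch of the resulting builder usage, assuming the `ray.train.lightning` import path of this Ray release; the epoch count and strategy value are illustrative, not from the tests:

from ray.train.lightning import LightningConfigBuilder

lightning_config = (
    LightningConfigBuilder()
    # `strategy` is forwarded to LinearModule.__init__, so
    # configure_optimizers() can return trainer.model.parameters() under FSDP.
    .module(cls=LinearModule, input_dim=32, output_dim=4, strategy="fsdp")
    .trainer(max_epochs=4, accelerator="gpu")
    .strategy("fsdp")
    .build()
)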