[CI] Add Lightning 2.0 compatibility test pipeline #34147

Merged (24 commits) on Apr 27, 2023
13 changes: 13 additions & 0 deletions .buildkite/pipeline.gpu_large.yml
@@ -49,3 +49,16 @@
     - pip install -Ur ./python/requirements/ml/requirements_ml_docker.txt
     - ./ci/env/env_info.sh
     - bazel test --config=ci $(./ci/run/bazel_export_options) --build_tests_only --test_tag_filters=gpu,-timeseries_libs,-py37,-post_wheel_build doc/...
+
+- label: ":zap: :python: Lightning 2.0 Train GPU tests"
+  conditions:
+    ["NO_WHEELS_REQUIRED", "RAY_CI_TRAIN_AFFECTED"]
+  commands:
+    - cleanup() { if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then ./ci/build/upload_build_info.sh; fi }; trap cleanup EXIT
+    - NO_DASHBOARD=1 ./ci/env/install-minimal.sh 3.8
+    - PYTHON=3.8 DOC_TESTING=1 TRAIN_TESTING=1 TUNE_TESTING=1 ./ci/env/install-dependencies.sh
+    - pip install -Ur ./python/requirements/ml/requirements_ml_docker.txt
+    - pip uninstall -y pytorch-lightning
+    - pip install lightning==2.0.0
+    - ./ci/env/env_info.sh
+    - bazel test --config=ci $(./ci/run/bazel_export_options) --test_tag_filters=ptl_v2 python/ray/train/...
2 changes: 2 additions & 0 deletions ci/ci.sh
@@ -284,6 +284,8 @@ install_npm_project() {
 build_dashboard_front_end() {
   if [ "${OSTYPE}" = msys ]; then
     { echo "WARNING: Skipping dashboard due to NPM incompatibilities with Windows"; } 2> /dev/null
+  elif [ "${NO_DASHBOARD-}" = "1" ]; then
+    echo "Skipping dashboard build"
   else
     (
       cd ray/dashboard/client
2 changes: 2 additions & 0 deletions ci/env/install-minimal.sh
@@ -1,5 +1,7 @@
 #!/usr/bin/env bash
 
+set -xe
+
 # Python version can be specified as 3.7, 3.8, 3.9, etc..
 if [ -z "$1" ]; then
   PYTHON_VERSION=${PYTHON-3.7}
8 changes: 4 additions & 4 deletions python/ray/train/BUILD
@@ -412,31 +412,31 @@ py_test(
     name = "test_lightning_checkpoint",
     size = "medium",
     srcs = ["tests/test_lightning_checkpoint.py"],
-    tags = ["team:ml", "exclusive", "ray_air", "gpu"],
+    tags = ["team:ml", "exclusive", "ray_air", "gpu", "ptl_v2"],
     deps = [":train_lib"]
 )
 
 py_test(
     name = "test_lightning_trainer_restore",
     size = "medium",
     srcs = ["tests/test_lightning_trainer_restore.py"],
-    tags = ["team:ml", "exclusive", "ray_air", "gpu"],
+    tags = ["team:ml", "exclusive", "ray_air", "gpu", "ptl_v2"],
     deps = [":train_lib"]
 )
 
 py_test(
     name = "test_lightning_trainer",
     size = "large",
     srcs = ["tests/test_lightning_trainer.py"],
-    tags = ["team:ml", "exclusive", "ray_air", "gpu"],
+    tags = ["team:ml", "exclusive", "ray_air", "gpu", "ptl_v2"],
     deps = [":train_lib"]
 )
 
 py_test(
     name = "test_lightning_predictor",
     size = "medium",
     srcs = ["tests/test_lightning_predictor.py"],
-    tags = ["team:ml", "exclusive", "ray_air", "gpu"],
+    tags = ["team:ml", "exclusive", "ray_air", "gpu", "ptl_v2"],
     deps = [":train_lib"]
 )
 
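The new `ptl_v2` tag is what wires these targets into the Buildkite step above: `--test_tag_filters=ptl_v2` selects exactly the targets that carry it. As an illustration (the target name below is hypothetical, not part of this PR), a future Lightning test would opt into the pipeline the same way:

py_test(
    name = "test_lightning_new_feature",  # hypothetical target
    size = "medium",
    srcs = ["tests/test_lightning_new_feature.py"],
    # "ptl_v2" opts the target into the Lightning 2.0 test run;
    # the other tags mirror the existing Lightning targets above.
    tags = ["team:ml", "exclusive", "ray_air", "gpu", "ptl_v2"],
    deps = [":train_lib"]
)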
37 changes: 27 additions & 10 deletions python/ray/train/tests/lightning_test_utils.py
@@ -7,9 +7,11 @@
 
 
 class LinearModule(pl.LightningModule):
-    def __init__(self, input_dim, output_dim) -> None:
+    def __init__(self, input_dim, output_dim, strategy="ddp") -> None:
         super().__init__()
         self.linear = nn.Linear(input_dim, output_dim)
+        self.loss = []
+        self.strategy = strategy
 
     def forward(self, input):
         return self.linear(input)
@@ -22,24 +24,31 @@ def training_step(self, batch):
 
     def validation_step(self, val_batch, batch_idx):
         loss = self.forward(val_batch)
+        self.loss.append(loss)
         return {"val_loss": loss}
 
-    def validation_epoch_end(self, outputs) -> None:
-        avg_loss = torch.stack([x["val_loss"] for x in outputs]).mean()
+    def on_validation_epoch_end(self) -> None:
+        avg_loss = torch.stack(self.loss).mean()
         self.log("val_loss", avg_loss)
+        self.loss.clear()
 
     def predict_step(self, batch, batch_idx):
         return self.forward(batch)
 
     def configure_optimizers(self):
-        return torch.optim.SGD(self.parameters(), lr=0.1)
+        if self.strategy == "fsdp":
+            # Feed FSDP wrapped model parameters to optimizer
+            return torch.optim.SGD(self.trainer.model.parameters(), lr=0.1)
+        else:
+            return torch.optim.SGD(self.parameters(), lr=0.1)
 
 
 class DoubleLinearModule(pl.LightningModule):
     def __init__(self, input_dim_1, input_dim_2, output_dim) -> None:
         super().__init__()
         self.linear_1 = nn.Linear(input_dim_1, output_dim)
         self.linear_2 = nn.Linear(input_dim_2, output_dim)
+        self.loss = []
 
     def forward(self, batch):
         input_1 = batch["input_1"]
@@ -54,12 +63,14 @@ def training_step(self, batch):
 
     def validation_step(self, val_batch, batch_idx):
         loss = self.forward(val_batch)
+        self.loss.append(loss)
         return {"val_loss": loss}
 
-    def validation_epoch_end(self, outputs) -> None:
+    def on_validation_epoch_end(self) -> None:
         print("Validation Epoch:", self.current_epoch)
-        avg_loss = torch.stack([x["val_loss"] for x in outputs]).mean()
+        avg_loss = torch.stack(self.loss).mean()
         self.log("val_loss", avg_loss)
+        self.loss.clear()
 
     def predict_step(self, batch, batch_idx):
         return self.forward(batch)
@@ -91,7 +102,9 @@ def __init__(self, lr: float, layer_1: int, layer_2: int):
         self.layer_1 = torch.nn.Linear(28 * 28, layer_1)
         self.layer_2 = torch.nn.Linear(layer_1, layer_2)
         self.layer_3 = torch.nn.Linear(layer_2, 10)
-        self.accuracy = Accuracy()
+        self.accuracy = Accuracy(task="multiclass", num_classes=10)
+        self.val_acc_list = []
+        self.val_loss_list = []
 
     def forward(self, x):
         batch_size, channels, width, height = x.size()
@@ -121,13 +134,17 @@ def validation_step(self, val_batch, batch_idx):
         logits = self.forward(x)
         loss = F.nll_loss(logits, y)
         acc = self.accuracy(logits, y)
+        self.val_acc_list.append(acc)
+        self.val_loss_list.append(loss)
         return {"val_loss": loss, "val_accuracy": acc}
 
-    def validation_epoch_end(self, outputs):
-        avg_loss = torch.stack([x["val_loss"] for x in outputs]).mean()
-        avg_acc = torch.stack([x["val_accuracy"] for x in outputs]).mean()
+    def on_validation_epoch_end(self):
+        avg_loss = torch.stack(self.val_loss_list).mean()
+        avg_acc = torch.stack(self.val_acc_list).mean()
         self.log("ptl/val_loss", avg_loss)
         self.log("ptl/val_accuracy", avg_acc)
+        self.val_acc_list.clear()
+        self.val_loss_list.clear()
 
     def predict_step(self, batch, batch_idx, dataloader_idx=None):
         x = batch
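The changes above follow the standard PyTorch Lightning 1.x → 2.0 migration: `validation_epoch_end(outputs)` was removed in 2.0, so each module now accumulates its own step outputs in a list and aggregates them in `on_validation_epoch_end`, clearing the list afterwards. The `configure_optimizers` branch exists because FSDP wraps the module, so the optimizer must be built from the wrapped `self.trainer.model.parameters()`. A minimal self-contained sketch of the validation idiom (a hypothetical module, not code from this PR):

import pytorch_lightning as pl
import torch
import torch.nn as nn

class TinyModule(pl.LightningModule):
    """Hypothetical module showing the Lightning 2.0 validation idiom."""

    def __init__(self):
        super().__init__()
        self.layer = nn.Linear(8, 1)
        self.val_losses = []  # manual accumulation replaces the removed `outputs` argument

    def training_step(self, batch, batch_idx):
        return self.layer(batch).mean()

    def validation_step(self, batch, batch_idx):
        loss = self.layer(batch).mean()
        self.val_losses.append(loss)
        return loss

    def on_validation_epoch_end(self):
        # Aggregate what validation_step collected, then reset for the next epoch.
        self.log("val_loss", torch.stack(self.val_losses).mean())
        self.val_losses.clear()

    def configure_optimizers(self):
        return torch.optim.SGD(self.parameters(), lr=0.1)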
10 changes: 8 additions & 2 deletions python/ray/train/tests/test_lightning_checkpoint.py
@@ -38,7 +38,10 @@ def test_load_from_path():
 
     # Train one epoch and save a checkpoint
     trainer = pl.Trainer(
-        max_epochs=1, enable_progress_bar=False, enable_checkpointing=False
+        max_epochs=1,
+        accelerator="cpu",
+        enable_progress_bar=False,
+        enable_checkpointing=False,
     )
     trainer.fit(model=model, train_dataloaders=dataloader)
     ckpt_path = f"{tmpdir}/random_checkpoint_name.ckpt"
@@ -75,7 +78,10 @@ def test_from_directory():
 
     # Train one epoch and save a checkpoint
     trainer = pl.Trainer(
-        max_epochs=1, enable_progress_bar=False, enable_checkpointing=False
+        max_epochs=1,
+        accelerator="cpu",
+        enable_progress_bar=False,
+        enable_checkpointing=False,
     )
     trainer.fit(model=model, train_dataloaders=dataloader)
     trainer.save_checkpoint(f"{tmpdir}/{MODEL_KEY}")
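Pinning `accelerator="cpu"` matters under Lightning 2.0 because the Trainer now defaults to `accelerator="auto"`, which would silently pick up a GPU on these CI machines. A sketch of the round-trip these tests perform, reusing the hypothetical `TinyModule` from above (the checkpoint path is illustrative):

from torch.utils.data import DataLoader

model = TinyModule()
trainer = pl.Trainer(
    max_epochs=1,
    accelerator="cpu",  # explicit, since Lightning 2.0 defaults to "auto"
    enable_progress_bar=False,
    enable_checkpointing=False,  # disables the callback; manual saves still work
)
trainer.fit(model, train_dataloaders=DataLoader(torch.randn(16, 8)))
trainer.save_checkpoint("/tmp/tiny.ckpt")
restored = TinyModule.load_from_checkpoint("/tmp/tiny.ckpt")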
2 changes: 1 addition & 1 deletion python/ray/train/tests/test_lightning_predictor.py
@@ -28,7 +28,7 @@ def test_repr():
 
 
 def save_checkpoint(model: pl.LightningModule, ckpt_path: str):
-    trainer = pl.Trainer(max_epochs=0)
+    trainer = pl.Trainer(max_epochs=0, accelerator="cpu")
     trainer.fit(model, train_dataloaders=DataLoader(torch.randn(1)))
     trainer.save_checkpoint(ckpt_path)
 
4 changes: 2 additions & 2 deletions python/ray/train/tests/test_lightning_trainer.py
@@ -74,7 +74,7 @@ def test_trainer_with_native_dataloader(
 
     config_builder = (
         LightningConfigBuilder()
-        .module(LinearModule, input_dim=32, output_dim=4)
+        .module(LinearModule, input_dim=32, output_dim=4, strategy=strategy)
         .trainer(max_epochs=num_epochs, accelerator=accelerator)
         .strategy(strategy)
     )
@@ -124,7 +124,7 @@ def test_trainer_with_ray_data(ray_start_6_cpus_2_gpus, strategy, accelerator):
 
     lightning_config = (
         LightningConfigBuilder()
-        .module(cls=LinearModule, input_dim=32, output_dim=4)
+        .module(cls=LinearModule, input_dim=32, output_dim=4, strategy=strategy)
         .trainer(max_epochs=num_epochs, accelerator=accelerator)
         .strategy(strategy)
         .build()
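Threading `strategy` through `.module(...)` is what lets `configure_optimizers` in `lightning_test_utils.py` switch to the FSDP-wrapped parameters. A sketch of the resulting builder usage, assuming the `ray.train.lightning` import path of this Ray release; the epoch count and strategy value are illustrative, not from the tests:

from ray.train.lightning import LightningConfigBuilder

lightning_config = (
    LightningConfigBuilder()
    # `strategy` is forwarded to LinearModule.__init__, so
    # configure_optimizers() can return trainer.model.parameters() under FSDP.
    .module(cls=LinearModule, input_dim=32, output_dim=4, strategy="fsdp")
    .trainer(max_epochs=4, accelerator="gpu")
    .strategy("fsdp")
    .build()
)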