[2/n] Lightweight Ray AIR API refactor (ray-project#37123)
This PR migrates all of the Train and Tune examples and docstrings to the new API convention; see https://github.com/ray-project/enhancements/

Continuation of ray-project#36706 and ray-project#37906

Co-authored-by: matthewdeng <[email protected]>
2 people authored and harborn committed Aug 17, 2023
1 parent c49cff1 commit 4015cf0
Showing 284 changed files with 1,445 additions and 1,474 deletions.
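
In short, the hunks below replace the `ray.air.session` free functions and the `ray.air` config/checkpoint classes with their `ray.train` counterparts. A rough mapping, inferred from the diffs in this commit (not an official table):

# Old (Ray AIR) convention                  # New convention
from ray.air import session                 # from ray import train
from ray.air.checkpoint import Checkpoint   # from ray.train import Checkpoint
from ray.air.config import ScalingConfig    # from ray.train import ScalingConfig

# Inside a training or tuning function:
# session.report(metrics, checkpoint=ckpt)  ->  train.report(metrics, checkpoint=ckpt)
# session.get_checkpoint()                  ->  train.get_checkpoint()
# session.get_dataset_shard("train")        ->  train.get_dataset_shard("train")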
12 changes: 6 additions & 6 deletions doc/external/pytorch_tutorials_hyperparameter_tuning_tutorial.py
@@ -49,8 +49,8 @@
from torch.utils.data import random_split
import torchvision
import torchvision.transforms as transforms
-from ray import tune
-from ray.air import Checkpoint, session
+from ray import train, tune
+from ray.train import Checkpoint
from ray.tune.schedulers import ASHAScheduler

######################################################################
@@ -125,7 +125,7 @@ def forward(self, x):
#
# net = Net(config["l1"], config["l2"])
#
-# checkpoint = session.get_checkpoint()
+# checkpoint = ray.train.get_checkpoint()
#
# if checkpoint:
# checkpoint_state = checkpoint.to_dict()
@@ -189,7 +189,7 @@ def forward(self, x):
# }
# checkpoint = Checkpoint.from_dict(checkpoint_data)
#
-# session.report(
+# ray.train.report(
# {"loss": val_loss / val_steps, "accuracy": correct / total},
# checkpoint=checkpoint,
# )
@@ -226,7 +226,7 @@ def train_cifar(config, data_dir=None):
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=config["lr"], momentum=0.9)

-checkpoint = session.get_checkpoint()
+checkpoint = train.get_checkpoint()

if checkpoint:
checkpoint_state = checkpoint.to_dict()
@@ -303,7 +303,7 @@ def train_cifar(config, data_dir=None):
}
checkpoint = Checkpoint.from_dict(checkpoint_data)

-session.report(
+train.report(
{"loss": val_loss / val_steps, "accuracy": correct / total},
checkpoint=checkpoint,
)
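Taken together, the migrated trainable in this tutorial follows this shape (a condensed sketch; the real model, data loading, and training loop come from the tutorial itself, and the metric values below are placeholders):

from ray import train
from ray.train import Checkpoint

def train_cifar(config, data_dir=None):
    # Resume from a previous checkpoint if Tune restarts the trial.
    start_epoch = 0
    checkpoint = train.get_checkpoint()
    if checkpoint:
        checkpoint_state = checkpoint.to_dict()
        start_epoch = checkpoint_state["epoch"] + 1

    for epoch in range(start_epoch, 10):
        val_loss, val_steps, correct, total = 0.8, 10, 90, 100  # placeholders
        train.report(
            {"loss": val_loss / val_steps, "accuracy": correct / total},
            checkpoint=Checkpoint.from_dict({"epoch": epoch}),
        )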
2 changes: 1 addition & 1 deletion doc/source/data/doc_code/preprocessors.py
@@ -50,7 +50,7 @@

from ray.data.preprocessors import MinMaxScaler
from ray.train.xgboost import XGBoostTrainer
-from ray.air.config import ScalingConfig
+from ray.train import ScalingConfig

train_dataset = ray.data.from_items([{"x": x, "y": 2 * x} for x in range(0, 32, 3)])
valid_dataset = ray.data.from_items([{"x": x, "y": 2 * x} for x in range(1, 32, 3)])
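`ScalingConfig` itself is unchanged; only its import path moves. For context, a minimal sketch of how the surrounding example wires it into the trainer (assuming the `train_dataset`/`valid_dataset` defined above; the params are illustrative):

from ray.train import ScalingConfig
from ray.train.xgboost import XGBoostTrainer

trainer = XGBoostTrainer(
    label_column="y",
    params={"objective": "reg:squarederror"},
    scaling_config=ScalingConfig(num_workers=2),
    datasets={"train": train_dataset, "valid": valid_dataset},
)
result = trainer.fit()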
5 changes: 3 additions & 2 deletions doc/source/data/working-with-pytorch.rst
@@ -47,7 +47,8 @@ Ray Data integrates with :ref:`Ray Train <train-docs>` for easy data ingest for
import torch
from torch import nn
import ray
-from ray.air import session, ScalingConfig
+from ray import train
+from ray.train import ScalingConfig
from ray.train.torch import TorchTrainer

def train_func(config):
@@ -56,7 +57,7 @@ Ray Data integrates with :ref:`Ray Train <train-docs>` for easy data ingest for
optimizer = torch.optim.SGD(model.parameters(), lr=0.001)

# Datasets can be accessed in your train_func via ``get_dataset_shard``.
-train_data_shard = session.get_dataset_shard("train")
+train_data_shard = train.get_dataset_shard("train")

for epoch_idx in range(2):
for batch in train_data_shard.iter_torch_batches(batch_size=128, dtypes=torch.float32):
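The shard name passed to `train.get_dataset_shard("train")` is bound where the trainer is constructed. A sketch of that wiring (the toy dataset is illustrative):

import ray
from ray.train import ScalingConfig
from ray.train.torch import TorchTrainer

train_ds = ray.data.from_items([{"x": float(i), "y": 2 * float(i)} for i in range(200)])
trainer = TorchTrainer(
    train_func,
    datasets={"train": train_ds},  # key matches train.get_dataset_shard("train")
    scaling_config=ScalingConfig(num_workers=2),
)
result = trainer.fit()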
4 changes: 2 additions & 2 deletions doc/source/index.md
@@ -132,7 +132,7 @@ predictions.show(limit=1)
</div>
<div class="tab-pane fade" id="v-pills-training" role="tabpanel" aria-labelledby="v-pills-training-tab" style="user-select:none;">
<pre style="margin:0;"><code class="language-python not-selectable">
-from ray.air.config import ScalingConfig
+from ray.train import ScalingConfig
from ray.train.torch import TorchTrainer
# Step 1: setup PyTorch model training as you normally would
@@ -166,7 +166,7 @@ result = trainer.fit()
<div class="tab-pane fade" id="v-pills-tuning" role="tabpanel" aria-labelledby="v-pills-tuning-tab" style="user-select:none;" style="user-select:none;">
<pre style="margin:0;"><code class="language-python not-selectable">
from ray import tune
-from ray.air.config import ScalingConfig
+from ray.train import ScalingConfig
from ray.train.lightgbm import LightGBMTrainer
train_dataset, eval_dataset = ...
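The tuning tab follows the same convention; for reference, a sketch of how such a trainer typically plugs into `tune.Tuner` (the label column, params, and search space are made up for illustration):

from ray import tune
from ray.train import ScalingConfig
from ray.train.lightgbm import LightGBMTrainer

trainer = LightGBMTrainer(
    label_column="target",  # hypothetical label column
    params={"objective": "regression"},
    scaling_config=ScalingConfig(num_workers=2),
    datasets={"train": train_dataset, "valid": eval_dataset},
)
tuner = tune.Tuner(
    trainer,
    param_space={"params": {"num_leaves": tune.randint(16, 64)}},
)
results = tuner.fit()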
6 changes: 3 additions & 3 deletions doc/source/ray-air/computer-vision.rst
@@ -207,13 +207,13 @@ Training vision models
Creating checkpoints
--------------------

-:class:`Checkpoints <ray.air.checkpoint.Checkpoint>` are required for batch inference and model
+:class:`Checkpoints <ray.train.Checkpoint>` are required for batch inference and model
serving. They contain model state and optionally a preprocessor.

If you're going from training to prediction, don't create a new checkpoint.
:meth:`Trainer.fit() <ray.train.trainer.BaseTrainer.fit>` returns a
-:class:`~ray.air.result.Result` object. Use
-:attr:`Result.checkpoint <ray.air.result.Result.checkpoint>` instead.
+:class:`~ray.train.Result` object. Use
+:attr:`Result.checkpoint <ray.train.Result.checkpoint>` instead.

.. tab-set::

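In code, the recommended flow described in this hunk reads (a sketch assuming a configured trainer):

result = trainer.fit()          # returns a ray.train.Result
checkpoint = result.checkpoint  # reuse for batch inference or serving; don't rebuild it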
13 changes: 6 additions & 7 deletions doc/source/ray-air/doc_code/computer_vision.py
@@ -186,12 +186,11 @@ def train_torch_model(dataset, preprocessor, per_epoch_preprocessor):
from torchvision import models

from ray import train
-from ray.air import session
-from ray.air.config import ScalingConfig
+from ray.train import ScalingConfig
from ray.train.torch import TorchCheckpoint, TorchTrainer

def train_one_epoch(model, *, criterion, optimizer, batch_size, epoch):
-dataset_shard = session.get_dataset_shard("train")
+dataset_shard = train.get_dataset_shard("train")

running_loss = 0
for i, batch in enumerate(
@@ -210,7 +209,7 @@ def train_one_epoch(model, *, criterion, optimizer, batch_size, epoch):

running_loss += loss.item()
if i % 2000 == 1999:
-session.report(
+train.report(
metrics={
"epoch": epoch,
"batch": i,
@@ -254,13 +253,13 @@ def train_tensorflow_model(dataset, preprocessor, per_epoch_preprocessor):
# __tensorflow_training_loop_start__
import tensorflow as tf

-from ray.air import session
+from ray import train
from ray.air.integrations.keras import ReportCheckpointCallback

def train_loop_per_worker(config):
strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy()

-train_shard = session.get_dataset_shard("train")
+train_shard = train.get_dataset_shard("train")
train_dataset = train_shard.to_tf(
"image",
"label",
Expand All @@ -286,7 +285,7 @@ def train_loop_per_worker(config):
# __tensorflow_training_loop_stop__

# __tensorflow_trainer_start__
-from ray.air import ScalingConfig
+from ray.train import ScalingConfig
from ray.train.tensorflow import TensorflowTrainer

# The following transform operation is lazy.
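Note that `ReportCheckpointCallback` stays under `ray.air.integrations.keras` in this commit; only the `session` calls move. A usage sketch inside the Keras training loop above (the epoch count is illustrative):

model.fit(
    train_dataset,
    epochs=2,
    callbacks=[ReportCheckpointCallback()],  # reports metrics and checkpoints to Ray Train
)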
13 changes: 6 additions & 7 deletions doc/source/ray-air/doc_code/hvd_trainer.py
@@ -1,12 +1,11 @@
+import horovod.torch as hvd
import ray
-import ray.train as train
+from ray import train
+from ray.train import Checkpoint, ScalingConfig
import ray.train.torch  # Need this to use `train.torch.get_device()`
-import horovod.torch as hvd
+from ray.train.horovod import HorovodTrainer
import torch
import torch.nn as nn
-from ray.air import session, Checkpoint
-from ray.train.horovod import HorovodTrainer
-from ray.air.config import ScalingConfig

# If using GPUs, set this to True.
use_gpu = False
@@ -31,7 +30,7 @@ def forward(self, input):

def train_loop_per_worker():
hvd.init()
-dataset_shard = session.get_dataset_shard("train")
+dataset_shard = train.get_dataset_shard("train")
model = NeuralNetwork()
device = train.torch.get_device()
model.to(device)
@@ -56,7 +55,7 @@ def train_loop_per_worker():
loss.backward()
optimizer.step()
print(f"epoch: {epoch}, loss: {loss.item()}")
-session.report(
+train.report(
{},
checkpoint=Checkpoint.from_dict(dict(model=model.state_dict())),
)
@@ -4,9 +4,8 @@
# __air_session_start__

import tensorflow as tf
-from ray.air import session
-from ray.air.checkpoint import Checkpoint
-from ray.air.config import ScalingConfig
+from ray import train
+from ray.train import Checkpoint, ScalingConfig
from ray.train.tensorflow import TensorflowTrainer


@@ -22,7 +21,7 @@ def build_model() -> tf.keras.Model:


def train_func():
-ckpt = session.get_checkpoint()
+ckpt = train.get_checkpoint()
if ckpt:
with ckpt.as_directory() as loaded_checkpoint_dir:
import tensorflow as tf
@@ -32,9 +31,7 @@ def train_func():
model = build_model()

model.save("my_model", overwrite=True)
-session.report(
-metrics={"iter": 1}, checkpoint=Checkpoint.from_directory("my_model")
-)
+train.report(metrics={"iter": 1}, checkpoint=Checkpoint.from_directory("my_model"))


scaling_config = ScalingConfig(num_workers=2)
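The hunk above shows the save side; the matching restore side, truncated in this view, follows the same directory-based pattern. A sketch, assuming the checkpoint directory holds a Keras SavedModel:

import tensorflow as tf
from ray import train

ckpt = train.get_checkpoint()
if ckpt:
    with ckpt.as_directory() as loaded_checkpoint_dir:
        model = tf.keras.models.load_model(loaded_checkpoint_dir)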
6 changes: 3 additions & 3 deletions doc/source/ray-air/doc_code/tf_starter.py
@@ -5,10 +5,10 @@
import ray
import tensorflow as tf

-from ray.air import session
+from ray import train
+from ray.train import ScalingConfig
from ray.air.integrations.keras import ReportCheckpointCallback
from ray.train.tensorflow import TensorflowTrainer
-from ray.air.config import ScalingConfig


# If using GPUs, set this to True.
@@ -46,7 +46,7 @@ def train_func(config: dict):
metrics=[tf.keras.metrics.mean_squared_error],
)

-dataset = session.get_dataset_shard("train")
+dataset = train.get_dataset_shard("train")

results = []
for _ in range(epochs):
7 changes: 3 additions & 4 deletions doc/source/ray-air/doc_code/torch_trainer.py
@@ -3,9 +3,8 @@

import ray
from ray import train
-from ray.air import session, Checkpoint
+from ray.train import Checkpoint, ScalingConfig
from ray.train.torch import TorchTrainer
-from ray.air.config import ScalingConfig


# If using GPUs, set this to True.
@@ -30,7 +29,7 @@ def forward(self, input):


def train_loop_per_worker():
-dataset_shard = session.get_dataset_shard("train")
+dataset_shard = train.get_dataset_shard("train")
model = NeuralNetwork()
loss_fn = nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
@@ -49,7 +48,7 @@ def train_loop_per_worker():
optimizer.step()
print(f"epoch: {epoch}, loss: {loss.item()}")

-session.report(
+train.report(
{},
checkpoint=Checkpoint.from_dict(
dict(epoch=epoch, model=model.state_dict())
