[AIR/train] Use new Train API (ray-project#25735)

Uses the new AIR Train API for examples and tests. The `Result` object gets a new attribute, `log_dir`, pointing to the Trial's `logdir`, which allows users to access TensorBoard logs and the artifacts of other loggers. This PR only deals with the "low-hanging fruit": tests that need substantial rewriting and the Train user guide are not touched; those will be updated in follow-up PRs. Tests and examples that concern deprecated features, or that are duplicated in AIR, have been removed or disabled. Requires ray-project#25943 to be merged first.
krfricke · Jul 7, 2022 · b9a4f64 · b9a4f64
1 parent 40f9561
commit b9a4f64
Show file tree

Hide file tree

Showing 38 changed files with 666 additions and 622 deletions.
diff --git a/doc/source/train/examples.rst b/doc/source/train/examples.rst
@@ -15,10 +15,10 @@ General Examples
 PyTorch
 ~~~~~~~
 
-* :doc:`/train/examples/train_linear_example`:
+* :doc:`/train/examples/torch_linear_example`:
  Simple example for PyTorch.
 
-* :doc:`/train/examples/train_fashion_mnist_example`:
+* :doc:`/train/examples/torch_fashion_mnist_example`:
  End-to-end example for PyTorch.
 
 * :doc:`/train/examples/transformers/transformers_example`:
@@ -59,10 +59,10 @@ Ray Datasets Integration Examples
 * :doc:`/train/examples/tensorflow_linear_dataset_example`:
  Simple example for training a linear TensorFlow model.
 
-* :doc:`/train/examples/train_linear_dataset_example`:
+* :doc:`/train/examples/torch_linear_dataset_example`:
  Simple example for training a linear PyTorch model.
 
-* :doc:`/train/examples/tune_linear_dataset_example`:
+* :doc:`/train/examples/tune_torch_linear_dataset_example`:
  Simple example for tuning a linear PyTorch model.
 
 
@@ -75,7 +75,7 @@ Ray Tune Integration Examples
 * :doc:`/train/examples/tune_tensorflow_mnist_example`:
  End-to-end example for tuning a TensorFlow model.
 
-* :doc:`/train/examples/tune_cifar_pytorch_pbt_example`:
+* :doc:`/train/examples/tune_cifar_torch_pbt_example`:
  End-to-end example for tuning a PyTorch model with PBT.
 
 ..

diff --git a/doc/source/train/examples/torch_fashion_mnist_example.rst b/doc/source/train/examples/torch_fashion_mnist_example.rst
@@ -0,0 +1,6 @@
+:orphan:
+
+torch_fashion_mnist_example
+===========================
+
+.. literalinclude:: /../../python/ray/train/examples/torch_fashion_mnist_example.py
diff --git a/doc/source/train/examples/torch_linear_dataset_example.rst b/doc/source/train/examples/torch_linear_dataset_example.rst
@@ -0,0 +1,6 @@
+:orphan:
+
+torch_linear_dataset_example
+============================
+
+.. literalinclude:: /../../python/ray/train/examples/torch_linear_dataset_example.py
diff --git a/doc/source/train/examples/torch_linear_example.rst b/doc/source/train/examples/torch_linear_example.rst
@@ -0,0 +1,6 @@
+:orphan:
+
+torch_linear_example
+====================
+
+.. literalinclude:: /../../python/ray/train/examples/torch_linear_example.py
diff --git a/doc/source/train/examples/train_fashion_mnist_example.rst b/doc/source/train/examples/train_fashion_mnist_example.rst
diff --git a/doc/source/train/examples/train_linear_dataset_example.rst b/doc/source/train/examples/train_linear_dataset_example.rst
diff --git a/doc/source/train/examples/train_linear_example.rst b/doc/source/train/examples/train_linear_example.rst
diff --git a/doc/source/train/examples/tune_cifar_pytorch_pbt_example.rst b/doc/source/train/examples/tune_cifar_pytorch_pbt_example.rst
diff --git a/doc/source/train/examples/tune_cifar_torch_pbt_example.rst b/doc/source/train/examples/tune_cifar_torch_pbt_example.rst
@@ -0,0 +1,6 @@
+:orphan:
+
+tune_cifar_torch_pbt_example
+============================
+
+.. literalinclude:: /../../python/ray/train/examples/tune_cifar_torch_pbt_example.py
diff --git a/doc/source/train/examples/tune_linear_dataset_example.rst b/doc/source/train/examples/tune_linear_dataset_example.rst
diff --git a/doc/source/train/examples/tune_torch_linear_dataset_example.rst b/doc/source/train/examples/tune_torch_linear_dataset_example.rst
@@ -0,0 +1,6 @@
+:orphan:
+
+tune_torch_linear_dataset_example
+=================================
+
+.. literalinclude:: /../../python/ray/air/examples/pytorch/tune_torch_linear_dataset_example.py
diff --git a/python/ray/air/result.py b/python/ray/air/result.py
@@ -1,5 +1,6 @@
-from typing import Any, Dict, List, Optional, Tuple
 from dataclasses import dataclass
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Tuple
 
 from ray.air.checkpoint import Checkpoint
 from ray.util.annotations import PublicAPI
@@ -15,7 +16,7 @@ class Result:
  This is the class produced by Trainer.fit().
  It contains a checkpoint, which can be used for resuming training and for
  creating a Predictor object. It also contains a metrics object describing
- training metrics. `error` is included so that non successful runs
+ training metrics. ``error`` is included so that non successful runs
  and trials can be represented as well.
 
  The constructor is a private API.
@@ -24,6 +25,7 @@ class Result:
  metrics: The final metrics as reported by an Trainable.
  checkpoint: The final checkpoint of the Trainable.
  error: The execution error of the Trainable run, if the trial finishes in error.
+ log_dir: Directory where the trial logs are saved.
  metrics_dataframe: The full result dataframe of the Trainable.
  The dataframe is indexed by iterations and contains reported
  metrics.
@@ -37,6 +39,7 @@ class Result:
  metrics: Optional[Dict[str, Any]]
  checkpoint: Optional[Checkpoint]
  error: Optional[Exception]
+ log_dir: Optional[Path]
  metrics_dataframe: Optional[pd.DataFrame]
  best_checkpoints: Optional[List[Tuple[Checkpoint, Dict[str, Any]]]]
 

diff --git a/python/ray/train/BUILD b/python/ray/train/BUILD
@@ -39,15 +39,6 @@ py_test(
  deps = [":train_lib"]
 )
 
-py_test(
- name = "torch_tensorboard_profiler_example",
- size = "small",
- main = "examples/torch_tensorboard_profiler_example.py",
- srcs = ["examples/torch_tensorboard_profiler_example.py"],
- tags = ["team:ml", "exclusive"],
- deps = [":train_lib"]
-)
-
 py_test(
  name = "transformers_example_gpu",
  size = "large",
@@ -73,25 +64,15 @@ py_test(
 )
 
 py_test(
- name = "tune_cifar_pytorch_pbt_example",
+ name = "tune_cifar_torch_pbt_example",
  size = "medium",
- main = "examples/tune_cifar_pytorch_pbt_example.py",
- srcs = ["examples/tune_cifar_pytorch_pbt_example.py"],
+ main = "examples/tune_cifar_torch_pbt_example.py",
+ srcs = ["examples/tune_cifar_torch_pbt_example.py"],
  tags = ["team:ml", "exclusive", "pytorch", "tune"],
  deps = [":train_lib"],
  args = ["--smoke-test"]
 )
 
-py_test(
- name = "tune_linear_dataset_example",
- size = "medium",
- main = "examples/tune_linear_dataset_example.py",
- srcs = ["examples/tune_linear_dataset_example.py"],
- tags = ["team:ml", "exclusive", "gpu_only", "tune"],
- deps = [":train_lib"],
- args = ["--smoke-test", "--use-gpu"]
-)
-
 py_test(
  name = "tune_linear_example",
  size = "medium",

diff --git a/python/ray/train/examples/horovod/horovod_example.py b/python/ray/train/examples/horovod/horovod_example.py
@@ -2,15 +2,17 @@
 import os
 
 import horovod.torch as hvd
-import ray
 import torch.nn as nn
 import torch.nn.functional as F
 import torch.optim as optim
 import torch.utils.data.distributed
 from filelock import FileLock
-from ray.train import Trainer
 from torchvision import datasets, transforms
 
+import ray
+from ray import train
+from ray.train.horovod import HorovodTrainer
+
 
 def metric_average(val, name):
  tensor = torch.tensor(val)
@@ -142,21 +144,21 @@ def train_func(config):
 
  model, optimizer, train_loader, train_sampler = setup(config)
 
- results = []
  for epoch in range(num_epochs):
  loss = train_epoch(
  model, optimizer, train_sampler, train_loader, epoch, log_interval, use_cuda
  )
- results.append(loss)
- return results
+ train.report(loss=loss)
 
 
 def main(num_workers, use_gpu, kwargs):
- trainer = Trainer("horovod", use_gpu=use_gpu, num_workers=num_workers)
- trainer.start()
- loss_per_epoch = trainer.run(train_func, config=kwargs)
- trainer.shutdown()
- print(loss_per_epoch)
+ trainer = HorovodTrainer(
+ train_func,
+ train_loop_config=kwargs,
+ scaling_config={"use_gpu": use_gpu, "num_workers": num_workers},
+ )
+ results = trainer.fit()
+ print(results.metrics)
 
 
 # Horovod Class API.

diff --git a/python/ray/train/examples/mlflow_fashion_mnist_example.py b/python/ray/train/examples/mlflow_fashion_mnist_example.py
@@ -1,20 +1,23 @@
 import argparse
 
-from ray.train import Trainer
-from ray.train.examples.train_fashion_mnist_example import train_func
-from ray.train.callbacks.logging import MLflowLoggerCallback
+from ray.air import RunConfig
+from ray.train.examples.torch_fashion_mnist_example import train_func
+from ray.train.torch import TorchTrainer
+from ray.tune.integration.mlflow import MLflowLoggerCallback
 
 
 def main(num_workers=2, use_gpu=False):
- trainer = Trainer(backend="torch", num_workers=num_workers, use_gpu=use_gpu)
- trainer.start()
- final_results = trainer.run(
- train_func=train_func,
- config={"lr": 1e-3, "batch_size": 64, "epochs": 4},
- callbacks=[MLflowLoggerCallback(experiment_name="train_fashion_mnist")],
+ trainer = TorchTrainer(
+ train_func,
+ train_loop_config={"lr": 1e-3, "batch_size": 64, "epochs": 4},
+ scaling_config={"num_workers": num_workers, "use_gpu": use_gpu},
+ run_config=RunConfig(
+ callbacks=[MLflowLoggerCallback(experiment_name="train_fashion_mnist")]
+ ),
  )
+ final_results = trainer.fit()
 
- print("Full losses for rank 0 worker: ", final_results)
+ print("Final metrics: ", final_results.metrics)
 
 
 if __name__ == "__main__":
@@ -44,7 +47,7 @@ def main(num_workers=2, use_gpu=False):
  import ray
 
  if args.smoke_test:
- ray.init(num_cpus=2)
+ ray.init(num_cpus=4)
  args.num_workers = 2
  args.use_gpu = False
  else:

diff --git a/python/ray/train/examples/mlflow_simple_example.py b/python/ray/train/examples/mlflow_simple_example.py
@@ -1,40 +1,53 @@
 from ray import train
-from ray.train import Trainer
-from ray.train.callbacks import MLflowLoggerCallback, TBXLoggerCallback
+from ray.air import RunConfig
+from ray.train.torch import TorchTrainer
+from ray.tune.integration.mlflow import MLflowLoggerCallback
+from ray.tune.logger import TBXLoggerCallback
 
 
 def train_func():
  for i in range(3):
  train.report(epoch=i)
 
 
-trainer = Trainer(backend="torch", num_workers=2)
-trainer.start()
+trainer = TorchTrainer(
+ train_func,
+ scaling_config={"num_workers": 2},
+ run_config=RunConfig(
+ callbacks=[
+ MLflowLoggerCallback(experiment_name="train_experiment"),
+ TBXLoggerCallback(),
+ ],
+ ),
+)
 
 # Run the training function, logging all the intermediate results
 # to MLflow and Tensorboard.
-result = trainer.run(
- train_func,
- callbacks=[
- MLflowLoggerCallback(experiment_name="train_experiment"),
- TBXLoggerCallback(),
- ],
-)
+result = trainer.fit()
 
-# Print the latest run directory and keep note of it.
-# For example: /home/ray_results/train_2021-09-01_12-00-00/run_001
-print("Run directory:", trainer.latest_run_dir)
+# For MLflow logs:
+
+# MLflow logs will by default be saved in an `mlflow` directory
+# in the current working directory.
 
-trainer.shutdown()
+# $ cd mlflow
+# # View the MLflow UI.
+# $ mlflow ui
+
+# You can change the directory by setting the `tracking_uri` argument
+# in `MLflowLoggerCallback`.
+
+# For TensorBoard logs:
+
+# Print the latest run directory and keep note of it.
+# For example: /home/ubuntu/ray_results/TorchTrainer_2022-06-13_20-31-06
+print("Run directory:", result.log_dir.parent) # TensorBoard is saved in parent dir
 
 # How to visualize the logs
 
 # Navigate to the run directory of the trainer.
-# For example `cd /home/ray_results/train_2021-09-01_12-00-00/run_001`
+# For example `cd /home/ubuntu/ray_results/TorchTrainer_2022-06-13_20-31-06`
 # $ cd <TRAINER_RUN_DIR>
 #
-# # View the MLflow UI.
-# $ mlflow ui
-#
 # # View the tensorboard UI.
 # $ tensorboard --logdir .