[train] minor documentation improvements (ray-project#38391)

Signed-off-by: Matthew Deng <[email protected]>
angelinalg · Aug 15, 2023 · 7762af3 · 7762af3
1 parent 993c131
commit 7762af3
Show file tree

Hide file tree

Showing 10 changed files with 47 additions and 205 deletions.
diff --git a/doc/BUILD b/doc/BUILD
@@ -223,9 +223,7 @@ py_test(
 py_test_run_all_subdirectory(
  size = "large",
  include = ["source/train/doc_code/*.py"],
- exclude = [
- "source/train/doc_code/hf_trainer.py", # Too large
- ],
+ exclude = [],
  extra_srcs = [],
  tags = ["exclusive", "team:ml"],
 )

diff --git a/doc/source/_static/js/custom.js b/doc/source/_static/js/custom.js
@@ -46,9 +46,9 @@ document.addEventListener("DOMContentLoaded", function() {
  // Ray Data
  "Ray Data", "Ray Data API", "Integrations",
  // Ray Train
- "Ray Train", "Ray Train API",
- "Distributed PyTorch", "Advanced Topics", "More Frameworks",
- "Ray Train Internals",
+ "Ray Train", "More Frameworks",
+ "Advanced Topics", "Internals",
+ "Ray Train API",
  // Ray Tune
  "Ray Tune", "Ray Tune Examples", "Ray Tune API",
  // Ray Serve

diff --git a/doc/source/_toc.yml b/doc/source/_toc.yml
@@ -69,20 +69,15 @@ parts:
  title: Hugging Face Transformers
  - file: train/more-frameworks
  sections:
- - file: train/huggingface-transformers-accelerate
- title: Hugging Face Transformers & Accelerate
+ - file: train/huggingface-accelerate
+ title: Hugging Face Accelerate
  - file: train/distributed-tensorflow-keras
  title: TensorFlow & Keras
  - file: train/distributed-xgboost-lightgbm
  title: XGBoost & LightGBM
  - file: train/horovod
  - file: train/user-guides
  title: User Guides
- - file: train/internals/index
- sections:
- - file: train/internals/architecture
- - file: train/internals/benchmarks
- - file: train/internals/environment-variables
  - file: train/examples
  title: "Examples"
  sections:
@@ -114,8 +109,14 @@ parts:
  title: "PyTorch Finetuning ResNet Example"
  - file: train/examples/lightning/vicuna_13b_lightning_deepspeed_finetune
  title: "Fine-tune Vicuna-13B with DeepSpeed and PyTorch Lightning"
- - file: train/faq
+ - file: train/internals/index
+ title: "Internals"
+ sections:
+ - file: train/internals/architecture
+ - file: train/internals/benchmarks
+ - file: train/internals/environment-variables
  - file: train/api/api
+ - file: train/faq
 
  - file: tune/index
  title: Ray Tune

diff --git a/doc/source/train/api/api.rst b/doc/source/train/api/api.rst
@@ -32,16 +32,25 @@ Scale out your PyTorch, Lightning, Hugging Face code with Ray TorchTrainer.
 PyTorch
 *******
 
+**Basic**
+
 .. autosummary::
  :toctree: doc/
 
+ ~train.torch.get_device
  ~train.torch.prepare_model
- ~train.torch.prepare_optimizer
  ~train.torch.prepare_data_loader
- ~train.torch.get_device
+
+
+**Advanced**
+
+.. autosummary::
+ :toctree: doc/
+
+ ~train.torch.enable_reproducibility
  ~train.torch.accelerate
+ ~train.torch.prepare_optimizer
  ~train.torch.backward
- ~train.torch.enable_reproducibility
 
 .. _train-lightning-integration:
 
@@ -183,6 +192,7 @@ Ray Train Config
  ~train.ScalingConfig
  ~train.RunConfig
  ~train.CheckpointConfig
+ ~train.FailureConfig
  ~train.DataConfig
 
 .. _train-loop-api:
@@ -199,60 +209,20 @@ Ray Train Loop
  ~train.report
 
 
-Ray Train Checkpoints
----------------------
-
-.. autosummary::
- :toctree: doc/
-
- ~train.Checkpoint
-
-
-.. _trainer-restore:
-
-Ray Train Experiment Restoration
---------------------------------
+Ray Train Output
+----------------
 
 .. autosummary::
+ :template: autosummary/class_without_autosummary.rst
  :toctree: doc/
 
- train.trainer.BaseTrainer.restore
-
-.. note::
-
- All trainer classes have a `restore` method that takes in a path
- pointing to the directory of the experiment to be restored.
- `restore` also exposes a subset of construtor arguments that can be re-specified.
- See :ref:`train-framework-specific-restore`
- below for details on `restore` arguments for different trainer integrations.
-
-.. _train-framework-specific-restore:
-
-Restoration API for Built-in Trainers
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ ~train.Result
 
 .. autosummary::
  :toctree: doc/
 
- train.data_parallel_trainer.DataParallelTrainer.restore
-
-.. autosummary::
-
- train.torch.TorchTrainer.restore
- train.huggingface.TransformersTrainer.restore
-
-.. note::
-
- `TorchTrainer.restore`, `TensorflowTrainer.restore`, and `HorovodTrainer.restore`
- can take in the same parameters as their parent class's
- :meth:`DataParallelTrainer.restore <ray.train.data_parallel_trainer.DataParallelTrainer.restore>`.
-
- Unless otherwise specified, other trainers will accept the same parameters as
- :meth:`BaseTrainer.restore <ray.train.trainer.BaseTrainer.restore>`.
-
-.. seealso::
+ ~train.Checkpoint
 
- See :ref:`train-restore-guide` for more details on when and how trainer restore should be used.
 
 Ray Train Base Classes (Developer APIs)
 ---------------------------------------
@@ -269,18 +239,6 @@ Trainer Base Classes
  ~train.data_parallel_trainer.DataParallelTrainer
  ~train.gbdt_trainer.GBDTTrainer
 
-``BaseTrainer`` API
-*******************
-
-.. autosummary::
- :toctree: doc/
-
- ~train.trainer.BaseTrainer.fit
- ~train.trainer.BaseTrainer.setup
- ~train.trainer.BaseTrainer.preprocess_datasets
- ~train.trainer.BaseTrainer.training_loop
- ~train.trainer.BaseTrainer.as_trainable
-
 
 Train Backend Base Classes
 ~~~~~~~~~~~~~~~~~~~~~~~~~~

diff --git a/doc/source/train/distributed-xgboost-lightgbm.rst b/doc/source/train/distributed-xgboost-lightgbm.rst
@@ -79,8 +79,6 @@ save a checkpoint on every boosting round and will only keep the latest checkpoi
  Once checkpointing is enabled, you can follow :ref:`this guide <train-fault-tolerance>`
  to enable fault tolerance.
 
- See the :ref:`Trainer restore API reference <trainer-restore>` for more details.
-
 
 How to scale out training?
 --------------------------

diff --git a/doc/source/train/doc_code/hf_trainer.py b/doc/source/train/doc_code/hf_trainer.py
diff --git a/...n/huggingface-transformers-accelerate.rst → doc/source/train/huggingface-accelerate.rst b/...n/huggingface-transformers-accelerate.rst → doc/source/train/huggingface-accelerate.rst
@@ -1,36 +1,10 @@
-.. _train-hf-transformers-accelerate:
+.. _train-hf-accelerate:
 
-Training with HuggingFace Transformers and Accelerate
-=====================================================
+Training with HuggingFace Accelerate
+====================================
 
 .. TODO: Remove this guide later when the other guides are ready.
 
-TransformersTrainer
--------------------
-
-.. TODO: Move this into its own guide when the TorchTrainer API is ready.
-
-:class:`TransformersTrainer <ray.train.huggingface.TransformersTrainer>` further extends :class:`TorchTrainer <ray.train.torch.TorchTrainer>`, built
-for interoperability with the HuggingFace Transformers library.
-
-Users are required to provide a ``trainer_init_per_worker`` function which returns a
-``transformers.Trainer`` object. The ``trainer_init_per_worker`` function
-will have access to preprocessed train and evaluation datasets.
-
-Upon calling `TransformersTrainer.fit()`, multiple workers (ray actors) will be spawned,
-and each worker will create its own copy of a ``transformers.Trainer``.
-
-Each worker will then invoke ``transformers.Trainer.train()``, which will perform distributed
-training via Pytorch DDP.
-
-
-.. dropdown:: Code example
-
- .. literalinclude:: ./doc_code/hf_trainer.py
- :language: python
- :start-after: __hf_trainer_start__
- :end-before: __hf_trainer_end__
-
 AccelerateTrainer
 -----------------
 

diff --git a/doc/source/train/more-frameworks.rst b/doc/source/train/more-frameworks.rst
@@ -8,31 +8,31 @@ More Frameworks
  :class-container: container pb-3
 
  .. grid-item-card::
- :img-top: /ray-overview/images/ray_svg_logo.svg
+ :img-top: /images/hugging.png
  :class-img-top: pt-2 w-75 d-block mx-auto fixed-height-img
 
- .. button-ref:: train-hf-transformers-accelerate
+ .. button-ref:: train-hf-accelerate
 
- Hugging Face Transformers & Accelerate
+ Hugging Face Accelerate
 
  .. grid-item-card::
- :img-top: /ray-overview/images/ray_svg_logo.svg
+ :img-top: /images/tf_logo.png
  :class-img-top: pt-2 w-75 d-block mx-auto fixed-height-img
 
  .. button-ref:: distributed-tensorflow-keras
 
  TensorFlow & Keras
 
  .. grid-item-card::
- :img-top: /ray-overview/images/ray_svg_logo.svg
+ :img-top: /images/xgboost_logo.png
  :class-img-top: pt-2 w-75 d-block mx-auto fixed-height-img
 
  .. button-ref:: distributed-xgboost-lightgbm
 
  XGBoost & LightGBM
 
  .. grid-item-card::
- :img-top: /ray-overview/images/ray_svg_logo.svg
+ :img-top: /images/horovod.png
  :class-img-top: pt-2 w-75 d-block mx-auto fixed-height-img
 
  .. button-ref:: horovod

diff --git a/doc/source/train/user-guides/fault-tolerance.rst b/doc/source/train/user-guides/fault-tolerance.rst
@@ -37,7 +37,7 @@ passed to the ``Trainer``:
 Restore a Ray Train Experiment
 ------------------------------
 
-At the experiment level, :ref:`Trainer restoration <trainer-restore>`
+At the experiment level, Trainer restoration
 allows you to resume a previously interrupted experiment from where it left off.
 
 A Train experiment may be interrupted due to one of the following reasons:
@@ -90,7 +90,12 @@ If the experiment has been interrupted due to one of the reasons listed above, u
  Different trainers may allow more parameters to be optionally re-specified on restore.
  Only **datasets** are required to be re-specified on restore, if they were supplied originally.
 
- See :ref:`train-framework-specific-restore` for more details.
+ ``TorchTrainer.restore``, ``TensorflowTrainer.restore``, and ``HorovodTrainer.restore``
+ can take in the same parameters as their parent class's
+ :meth:`DataParallelTrainer.restore <ray.train.data_parallel_trainer.DataParallelTrainer.restore>`.
+
+ Unless otherwise specified, other trainers will accept the same parameters as
+ :meth:`BaseTrainer.restore <ray.train.trainer.BaseTrainer.restore>`.
 
 
 Auto-resume

diff --git a/doc/source/tune/api/result_grid.rst b/doc/source/tune/api/result_grid.rst
@@ -28,7 +28,6 @@ Result (air.Result)
 -------------------
 
 .. autosummary::
- :toctree: doc/
  :template: autosummary/class_without_autosummary.rst
 
  ~air.Result