From 9d793abd107358a221466ea2050c8c593874b6f9 Mon Sep 17 00:00:00 2001
From: angelinalg <122562471+angelinalg@users.noreply.github.com>
Date: Mon, 11 Sep 2023 16:49:13 -0700
Subject: [PATCH] polish examples: make titles more consistent, add links to
 guides

Signed-off-by: angelinalg <122562471+angelinalg@users.noreply.github.com>
---
 .github/styles/Vocab/Train/accept.txt         |   1 +
 doc/source/train/examples.rst                 |  24 ++--
 .../accelerate/accelerate_example.rst         |  21 +++-
 .../lightning/lightning_mnist_example.ipynb   |  33 +++--
 .../pytorch/dreambooth_finetuning.rst         | 119 +++++++++---------
 .../pytorch/torch_fashion_mnist_example.rst   |  16 ++-
 .../transformers_torch_trainer_basic.rst      |  18 ++-
 .../getting-started-pytorch-lightning.rst     |  12 +-
 doc/source/train/getting-started-pytorch.rst  |  26 ++--
 doc/source/train/huggingface-accelerate.rst   |  12 +-
 .../accelerate/accelerate_torch_trainer.py    |   4 +-
 .../transformers_torch_trainer_basic.py       |  10 +-
 python/ray/train/torch/torch_trainer.py       |   4 +-
 13 files changed, 182 insertions(+), 118 deletions(-)

diff --git a/.github/styles/Vocab/Train/accept.txt b/.github/styles/Vocab/Train/accept.txt
index d832f7f80e7ce..d0c7e09aaea0c 100644
--- a/.github/styles/Vocab/Train/accept.txt
+++ b/.github/styles/Vocab/Train/accept.txt
@@ -1,5 +1,6 @@
 Horovod
 Hugging Face
+hyperparameters?
 Keras
 LightGBM
 PyTorch
diff --git a/doc/source/train/examples.rst b/doc/source/train/examples.rst
index 3b2cf585618ce..ac1252e92e9e5 100644
--- a/doc/source/train/examples.rst
+++ b/doc/source/train/examples.rst
@@ -3,7 +3,7 @@
 Ray Train Examples
 ==================
 
-.. Example .rst files should be organized in the same manner as the
+.. Organize example .rst files in the same manner as the
    .py files in ray/python/ray/train/examples.
 
 Below are examples for using Ray Train with a variety of frameworks and use cases.
@@ -18,17 +18,17 @@ Beginner
   * - Framework
     - Example
   * - PyTorch
-    - :ref:`Training an Fashion MNIST Image Classifier with PyTorch <torch_fashion_mnist_ex>`
+    - :ref:`Train a Fashion MNIST Image Classifier with PyTorch <torch_fashion_mnist_ex>`
   * - Lightning
-    - :ref:`Training an MNIST Image Classifier with Lightning <lightning_mnist_example>`
+    - :ref:`Train an MNIST Image Classifier with Lightning <lightning_mnist_example>`
   * - Transformers
-    - :ref:`Fine-tuning a Text Classifier on Yelp Reviews Dataset with HF Transformers <transformers_torch_trainer_basic_example>`
+    - :ref:`Fine-tune a Text Classifier on the Yelp Reviews Dataset with HF Transformers <transformers_torch_trainer_basic_example>`
   * - Accelerate
     - :ref:`Distributed Data Parallel Training with HF Accelerate <accelerate_example>`
   * - DeepSpeed
-    - :ref:`Distributed Training with DeepSpeed ZeRO-3 <deepspeed_example>`
+    - :ref:`Train with DeepSpeed ZeRO-3 <deepspeed_example>`
   * - TensorFlow
-    - :ref:`TensorFlow MNIST Training Example <tensorflow_mnist_example>`
+    - :ref:`Train with TensorFlow MNIST <tensorflow_mnist_example>`
   * - Horovod
     - :ref:`End-to-end Horovod Training Example <horovod_example>`
 
@@ -42,11 +42,11 @@ Intermediate
   * - Framework
     - Example
   * - PyTorch
-    - `DreamBooth fine-tuning of Stable Diffusion with Ray Train <https://github.com/ray-project/ray/tree/master/doc/source/templates/05_dreambooth_finetuning>`_
+    - :ref:`Fine-tune Stable Diffusion with DreamBooth and Ray Train <torch_finetune_dreambooth_ex>`
   * - Lightning
     - :ref:`Model Training with PyTorch Lightning and Ray Data <lightning_advanced_example>`
   * - Accelerate
-    - :ref:`Fine-tuning a Text Classifier on GLUE Benchmark with HF Accelerate. <train_transformers_accelerate_example>`
+    - :ref:`Fine-tune a Text Classifier on the GLUE Benchmark with HF Accelerate <train_transformers_accelerate_example>`
 
 
 Advanced
@@ -59,10 +59,10 @@ Advanced
   * - Framework
     - Example
   * - Accelerate, DeepSpeed
-    - `Fine-tuning Llama-2 series models with Deepspeed, Accelerate, and Ray Train TorchTrainer <https://github.com/ray-project/ray/tree/master/doc/source/templates/04_finetuning_llms_with_deepspeed>`_
+    - `Fine-tune Llama-2 series models with DeepSpeed, Accelerate, and Ray Train TorchTrainer <https://github.com/ray-project/ray/tree/master/doc/source/templates/04_finetuning_llms_with_deepspeed>`_
   * - Transformers, DeepSpeed
-    - :ref:`Fine-tuning GPT-J-6B with Ray Train and DeepSpeed <gptj_deepspeed_finetune>`
+    - :ref:`Fine-tune GPT-J-6B with Ray Train and DeepSpeed <gptj_deepspeed_finetune>`
   * - Lightning, DeepSpeed
-    - :ref:`Fine-tuning vicuna-13b with PyTorch Lightning and DeepSpeed <vicuna_lightning_deepspeed_finetuning>`
+    - :ref:`Fine-tune vicuna-13b with PyTorch Lightning and DeepSpeed <vicuna_lightning_deepspeed_finetuning>`
   * - Lightning
-    - :ref:`Fine-tuning dolly-v2-7b with PyTorch Lightning and FSDP <dolly_lightning_fsdp_finetuning>`
+    - :ref:`Fine-tune dolly-v2-7b with PyTorch Lightning and FSDP <dolly_lightning_fsdp_finetuning>`
diff --git a/doc/source/train/examples/accelerate/accelerate_example.rst b/doc/source/train/examples/accelerate/accelerate_example.rst
index 6205add5ac48a..e082bf11f2a30 100644
--- a/doc/source/train/examples/accelerate/accelerate_example.rst
+++ b/doc/source/train/examples/accelerate/accelerate_example.rst
@@ -2,7 +2,24 @@
 
 .. _accelerate_example:
 
-Hugging Face Accelerate Distributed Training Example with Ray Train
-===================================================================
+Distributed Training Example with Hugging Face Accelerate
+=========================================================
+
+This example does distributed data parallel training
+with Hugging Face (HF) Accelerate, Ray Train, and Ray Data.
+It fine-tunes a BERT model and is adapted from
+https://github.com/huggingface/accelerate/blob/main/examples/nlp_example.py
+
+
+Code example
+------------
 
 .. literalinclude:: /../../python/ray/train/examples/accelerate/accelerate_torch_trainer.py
+
+See also
+--------
+
+For a tutorial on using Ray Train and HF Accelerate, 
+see :ref:`Training with Hugging Face Accelerate <train-hf-accelerate>`.
+
+For more Train examples, see :ref:`Ray Train Examples <train-examples>`.
diff --git a/doc/source/train/examples/lightning/lightning_mnist_example.ipynb b/doc/source/train/examples/lightning/lightning_mnist_example.ipynb
index f721884879873..6686b958f9827 100644
--- a/doc/source/train/examples/lightning/lightning_mnist_example.ipynb
+++ b/doc/source/train/examples/lightning/lightning_mnist_example.ipynb
@@ -51,7 +51,7 @@
             "source": [
                 "## Prepare a dataset and module\n",
                 "\n",
-                "The Pytorch Lightning Trainer takes either `torch.utils.data.DataLoader` or `pl.LightningDataModule` as data inputs. You can keep using them without any changes with Ray Train. "
+                "The PyTorch Lightning Trainer takes either `torch.utils.data.DataLoader` or `pl.LightningDataModule` as data inputs. You can continue using them without any changes with Ray Train. "
             ]
         },
         {
@@ -75,7 +75,7 @@
                 "                self.data_dir, train=True, download=True, transform=self.transform\n",
                 "            )\n",
                 "\n",
-                "            # split data into train and val sets\n",
+                "            # Split data into train and val sets\n",
                 "            self.mnist_train, self.mnist_val = random_split(mnist, [55000, 5000])\n",
                 "\n",
                 "    def train_dataloader(self):\n",
@@ -175,7 +175,7 @@
             "cell_type": "markdown",
             "metadata": {},
             "source": [
-                "You don't need to make any change to the definition of PyTorch Lightning model and datamodule."
+                "You don't need to modify the definition of the PyTorch Lightning model or datamodule."
             ]
         },
         {
@@ -183,18 +183,18 @@
             "cell_type": "markdown",
             "metadata": {},
             "source": [
-                "## Define the training loop\n",
+                "## Define a training function\n",
                 "\n",
-                "This code defines a training loop for each worker. Comparing the training loop with the original PyTorch Lightning code, there are 3 main differences:\n",
+                "This code defines a {ref}`training function <train-overview-training-function>` for each worker. Comparing the training function with the original PyTorch Lightning code, notice three main differences:\n",
                 "\n",
                 "- Distributed strategy: Use {class}`RayDDPStrategy <ray.train.lightning.RayDDPStrategy>`.\n",
                 "- Cluster environment: Use {class}`RayLightningEnvironment <ray.train.lightning.RayLightningEnvironment>`.\n",
-                "- Parallel devices: Always sets to `devices=\"auto\"` to use all available devices configured by ``TorchTrainer``.\n",
+                "- Parallel devices: Always set to `devices=\"auto\"` to use all available devices configured by ``TorchTrainer``.\n",
                 "\n",
                 "See {ref}`Getting Started with PyTorch Lightning <train-pytorch-lightning>` for more information.\n",
                 "\n",
                 "\n",
-                "For checkpoint reportining, Ray Train provides a minimal {class}`RayTrainReportCallback <ray.train.lightning.RayTrainReportCallback>` that reports metrics and checkpoint on each train epoch end. For more complex checkpoint logic, please implement custom callbacks as described in {ref}`Saving and Loading Checkpoint <train-checkpointing>` user guide."
+                "For checkpoint reporting, Ray Train provides a minimal {class}`RayTrainReportCallback <ray.train.lightning.RayTrainReportCallback>` class that reports metrics and checkpoints at the end of each train epoch. For more complex checkpoint logic, implement custom callbacks. See {ref}`Saving and Loading Checkpoint <train-checkpointing>`."
             ]
         },
         {
@@ -203,7 +203,7 @@
             "metadata": {},
             "outputs": [],
             "source": [
-                "use_gpu = True # Set it to False if you want to run without GPUs\n",
+                "use_gpu = True # Set to False if you want to run without GPUs\n",
                 "num_workers = 4"
             ]
         },
@@ -804,7 +804,7 @@
             "cell_type": "markdown",
             "metadata": {},
             "source": [
-                "## Check the Training Results and Checkpoints"
+                "## Check training results and checkpoints"
             ]
         },
         {
@@ -857,9 +857,9 @@
             "cell_type": "markdown",
             "metadata": {},
             "source": [
-                "As we can see, three checkpoints(`checkpoint_000007`, `checkpoint_000008`, `checkpoint_000009`) have been saved in the trial directory. To retrieve the latest checkpoint from the fit results and load it back into the model, follow these steps.\n",
+                "Ray Train saved three checkpoints (`checkpoint_000007`, `checkpoint_000008`, `checkpoint_000009`) in the trial directory. The following code retrieves the latest checkpoint from the fit results and loads it back into the model.\n",
                 "\n",
-                "If you lost the in-memory result object, you can also restore the model from the checkpoint file. Here the checkpoint path is: `/tmp/ray_results/ptl-mnist-example/TorchTrainer_eb925_00000_0_2023-08-07_23-15-06/checkpoint_000009/checkpoint.ckpt`."
+                "If you lost the in-memory result object, you can restore the model from the checkpoint file. The checkpoint path is: `/tmp/ray_results/ptl-mnist-example/TorchTrainer_eb925_00000_0_2023-08-07_23-15-06/checkpoint_000009/checkpoint.ckpt`."
             ]
         },
         {
@@ -903,6 +903,17 @@
                 "\n",
                 "best_model"
             ]
+        },
+        {
+            "cell_type": "markdown",
+            "metadata": {},
+            "source": [
+                "## See also\n",
+                "\n",
+                "For a tutorial on using Ray Train and PyTorch Lightning, see {ref}`Getting Started with PyTorch Lightning <train-pytorch-lightning>`.\n",
+                "\n",
+                "For more Train examples, see {ref}`Ray Train Examples <train-examples>`."
+            ]
         }
     ],
     "metadata": {
diff --git a/doc/source/train/examples/pytorch/dreambooth_finetuning.rst b/doc/source/train/examples/pytorch/dreambooth_finetuning.rst
index 96ea9a5a1f9d8..05bd3e37bf8a1 100644
--- a/doc/source/train/examples/pytorch/dreambooth_finetuning.rst
+++ b/doc/source/train/examples/pytorch/dreambooth_finetuning.rst
@@ -1,7 +1,9 @@
 :orphan:
 
-Fine-tuning DreamBooth with Ray Train
-=====================================
+.. _torch_finetune_dreambooth_ex:
+
+Fine-tune Stable Diffusion with DreamBooth and Ray Train
+========================================================
 
 This example shows how to do DreamBooth fine-tuning of a Stable Diffusion model using Ray Train.
 See the original `DreamBooth project homepage <https://dreambooth.github.io/>`_ for more details on what this fine-tuning method achieves.
@@ -10,41 +12,41 @@ See the original `DreamBooth project homepage <https://dreambooth.github.io/>`_
   :target: https://dreambooth.github.io
   :alt: DreamBooth fine-tuning overview
 
-This example is built on top of `this HuggingFace 🤗 tutorial <https://huggingface.co/docs/diffusers/training/dreambooth>`_.
-See the HuggingFace tutorial for useful explanations and suggestions on hyperparameters.
+This example builds on `this Hugging Face 🤗 tutorial <https://huggingface.co/docs/diffusers/training/dreambooth>`_.
+See the Hugging Face tutorial for useful explanations and suggestions on hyperparameters.
 **Adapting this example to Ray Train allows you to easily scale up the fine-tuning to an arbitrary number of distributed training workers.**
 
 **Compute requirements:**
 
-* Because of the large model sizes, you'll need a machine with at least 1 A10G GPU.
-* Each training worker uses 1 GPU. You can use multiple GPUs/workers to leverage data-parallel training to speed up training time.
+* Because of the large model sizes, you need a machine with at least 1 A10G GPU.
+* Each training worker uses 1 GPU. You can use multiple GPUs or workers to leverage data-parallel training to speed up training time.
 
-This example fine-tunes both the ``text_encoder`` and ``unet`` models used in the Stable Diffusion process, with respect to a prior preserving loss.
+This example fine-tunes both the ``text_encoder`` and ``unet`` models used in the stable diffusion process, with respect to a prior preserving loss.
 
 
 .. image:: /templates/05_dreambooth_finetuning/dreambooth/images/dreambooth_example.png
    :alt: DreamBooth overview
 
-The full code repository can be found here: `https://github.com/ray-project/ray/tree/master/doc/source/templates/05_dreambooth_finetuning <https://github.com/ray-project/ray/tree/master/doc/source/templates/05_dreambooth_finetuning>`_
+Find the full code repository at `https://github.com/ray-project/ray/tree/master/doc/source/templates/05_dreambooth_finetuning <https://github.com/ray-project/ray/tree/master/doc/source/templates/05_dreambooth_finetuning>`_
 
 
 How it works
 ------------
 
-This example leverages Ray Data for data loading and Ray Train for distributed training.
+This example uses Ray Data for data loading and Ray Train for distributed training.
 
 Data loading
 ^^^^^^^^^^^^
 
 .. note::
-    You can find the latest version of the code here: `dataset.py <https://github.com/ray-project/ray/tree/master/doc/source/templates/05_dreambooth_finetuning/dreambooth/dataset.py>`_
+    Find the latest version of the code at `dataset.py <https://github.com/ray-project/ray/tree/master/doc/source/templates/05_dreambooth_finetuning/dreambooth/dataset.py>`_
 
     The latest version might differ slightly from the code presented here.
 
 
-We use Ray Data for data loading. The code has three interesting parts.
+Use Ray Data for data loading. The code has three interesting parts.
 
-First, we load two datasets using :func:`ray.data.read_images`:
+First, load two datasets using :func:`ray.data.read_images`:
 
 .. literalinclude:: /templates/05_dreambooth_finetuning/dreambooth/dataset.py
   :language: python
@@ -52,7 +54,7 @@ First, we load two datasets using :func:`ray.data.read_images`:
   :end-at: class_dataset = read
   :dedent: 4
 
-Then, we tokenize the prompt that generated these images:
+Then, tokenize the prompt that generated these images:
 
 .. literalinclude:: /templates/05_dreambooth_finetuning/dreambooth/dataset.py
   :language: python
@@ -61,7 +63,7 @@ Then, we tokenize the prompt that generated these images:
   :dedent: 4
 
 
-And lastly, we apply a ``torchvision`` preprocessing pipeline to the images:
+And lastly, apply a ``torchvision`` preprocessing pipeline to the images:
 
 .. literalinclude:: /templates/05_dreambooth_finetuning/dreambooth/dataset.py
   :language: python
@@ -69,8 +71,7 @@ And lastly, we apply a ``torchvision`` preprocessing pipeline to the images:
   :end-before: END: image preprocessing
   :dedent: 4
 
-We apply all of this in final step:
-
+Apply all three parts in a final step:
 
 .. literalinclude:: /templates/05_dreambooth_finetuning/dreambooth/dataset.py
   :language: python
@@ -79,29 +80,28 @@ We apply all of this in final step:
   :dedent: 4
 
 
-
 Distributed training
 ^^^^^^^^^^^^^^^^^^^^
 
 
 .. note::
-    You can find the latest version of the code here: `train.py <https://github.com/ray-project/ray/tree/master/doc/source/templates/05_dreambooth_finetuning/dreambooth/train.py>`_
+    Find the latest version of the code at `train.py <https://github.com/ray-project/ray/tree/master/doc/source/templates/05_dreambooth_finetuning/dreambooth/train.py>`_
 
     The latest version might differ slightly from the code presented here.
 
 
-The central part of the training code is the *training function*. This function accepts a configuration dict that contains the hyperparameters. It then defines a regular PyTorch training loop.
+The central part of the training code is the :ref:`training function <train-overview-training-function>`. This function accepts a configuration dict that contains the hyperparameters. It then defines a regular PyTorch training loop.
 
-There are only a few locations where we interact with the Ray Train API. We marked them with in-line comments in the snippet below.
+You interact with the Ray Train API in only a few locations, which are marked with in-line comments in the following snippet.
 
-Remember that we want to do data-parallel training for all our models.
+Remember that you want to do data-parallel training for all the models.
 
 
-#. We load the data shard for each worker with session.get_dataset_shard("train")
-#. We iterate over the dataset with train_dataset.iter_torch_batches()
-#. We report results to Ray Train with session.report(results)
+#. Load the data shard for each worker with ``session.get_dataset_shard("train")``
+#. Iterate over the dataset with ``train_dataset.iter_torch_batches()``
+#. Report results to Ray Train with ``session.report(results)``
 
-The code was compacted for brevity. The `full code <https://github.com/ray-project/ray/tree/master/doc/source/templates/05_dreambooth_finetuning/dreambooth/train.py>`_ is more thoroughly annotated.
+The code is compacted for brevity. The `full code <https://github.com/ray-project/ray/tree/master/doc/source/templates/05_dreambooth_finetuning/dreambooth/train.py>`_ is more thoroughly annotated.
 
 
 .. literalinclude:: /templates/05_dreambooth_finetuning/dreambooth/train.py
@@ -109,7 +109,7 @@ The code was compacted for brevity. The `full code <https://github.com/ray-proje
   :start-at: def train_fn(config)
   :end-before: END: Training loop
 
-We can then run this training loop with Ray Train's TorchTrainer:
+You can then run this training function with Ray Train's TorchTrainer:
 
 
 .. literalinclude:: /templates/05_dreambooth_finetuning/dreambooth/train.py
@@ -121,11 +121,11 @@ We can then run this training loop with Ray Train's TorchTrainer:
 Configuring the scale
 ^^^^^^^^^^^^^^^^^^^^^
 
-In the TorchTrainer, we can easily configure our scale.
-The above example uses the ``num_workers`` argument to specify the number
-of workers. This defaults to 2 workers with 1 GPU each - so 2 GPUs in total.
+In the TorchTrainer, you can easily configure the scale.
+The preceding example uses the ``num_workers`` argument to specify the number
+of workers. This argument defaults to 2 workers with 1 GPU each, totaling 2 GPUs.
 
-To run the example on 4 GPUs, just set the number of workers to 4 using ``--num-workers=4``!
+To run the example on 4 GPUs, set the number of workers to 4 using ``--num-workers=4``.
 Or you can change the scaling config directly:
 
 .. code-block:: diff
@@ -136,16 +136,16 @@ Or you can change the scaling config directly:
     +    num_workers=4,
      )
 
-If you're running multi-node training, you should make sure that all nodes have access to a shared
-storage (e.g. via NFS or EFS). In the example script below, you can adjust this location with the
+If you're running multi-node training, make sure that all nodes have access to a shared
+storage like NFS or EFS. In the following example script, you can adjust the location with the
 ``DATA_PREFIX`` environment variable.
 
 Training throughput
 ~~~~~~~~~~~~~~~~~~~
 
-We ran training using 1,  2, and 4 workers/GPUs to compare throughput.
+Compare throughput of the preceding training runs that used 1, 2, and 4 workers or GPUs.
 
-Setup:
+Consider the following setup:
 
 * 1 GCE g2-standard-48-nvidia-l4-4 instance with 4 GPUs
 * Model as configured below
@@ -154,7 +154,7 @@ Setup:
 * Training for 4 epochs (local batch size = 2)
 * 3 runs per configuration
 
-We expect that the training time should benefit from scale and decreases when running with
+Expect the training time to benefit from scale and decrease when running with
 more workers and GPUs.
 
 .. image:: /templates/05_dreambooth_finetuning/dreambooth/images/dreambooth_training.png
@@ -173,26 +173,26 @@ more workers and GPUs.
      - 313.25
 
 
-While the training time decreases linearly with the amount of workers/GPUs, we observe some penalty.
-Specifically, with double the amount of workers we don't get half of the training time.
+While the training time decreases linearly with the number of workers or GPUs, you can observe some penalty.
+Specifically, with double the number of workers you don't get half of the training time.
 
-This is most likely due to additional communication between processes and the transfer of large model
-weights. We are also only training with a batch size of one because our GPU memory is limited. On larger
-GPUs with higher batch sizes we would expect a greater benefit from scaling out.
+This penalty is most likely due to additional communication between processes and the transfer of large model
+weights. You are also only training with a batch size of one because of the GPU memory limitation. On larger
+GPUs with higher batch sizes you would expect a greater benefit from scaling out.
 
 
 Run the example
 ---------------
 
-First, we download the pre-trained stable diffusion model as a starting point.
+First, download the pre-trained Stable Diffusion model as a starting point.
 
-We will then train this model with a few images of our subject.
+Then train this model with a few images of a subject.
 
-To achieve this, we choose a non-word as an identifier, e.g. ``unqtkn``. When fine-tuning the model with our subject, we will teach it that the prompt is ``A photo of a unqtkn <class>``.
+To achieve this, choose a non-word as an identifier, such as ``unqtkn``. When fine-tuning the model with this subject, you teach the model that the prompt is ``A photo of a unqtkn <class>``.
 
-After fine-tuning we can run inference with this specific prompt.
-For instance: ``A photo of a unqtkn <class>`` will create an image of our subject.
-Similarly, ``A photo of a unqtkn <class> at the beach`` will create an image of our subject at the beach.
+After fine-tuning, you can run inference with this specific prompt.
+For instance: ``A photo of a unqtkn <class>`` creates an image of the subject.
+Similarly, ``A photo of a unqtkn <class> at the beach`` creates an image of the subject at the beach.
 
 Step 0: Preparation
 ^^^^^^^^^^^^^^^^^^^
@@ -216,7 +216,7 @@ Prepare some directories and environment variables.
 Step 1: Download the pre-trained model
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
-Download and cache a pre-trained Stable-Diffusion model locally.
+Download and cache a pre-trained Stable Diffusion model locally.
 
 .. literalinclude:: /templates/05_dreambooth_finetuning/dreambooth_run.sh
   :language: bash
@@ -228,10 +228,10 @@ You can access the downloaded model checkpoint at the ``$ORIG_MODEL_PATH``.
 Step 2: Supply images of your subject
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
-Use one of the sample datasets (dog, lego car), or provide your own directory
+Use one of the sample datasets, like ``dog`` or ``lego car``, or provide your own directory
 of images, and specify the directory with the ``$INSTANCE_DIR`` environment variable.
 
-Then, we copy these images to ``$IMAGES_OWN_DIR``.
+Then, copy these images to ``$IMAGES_OWN_DIR``.
 
 .. literalinclude:: /templates/05_dreambooth_finetuning/dreambooth_run.sh
   :language: bash
@@ -247,7 +247,7 @@ Step 3: Create the regularization images
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
 Create a regularization image set for a class of subjects using the pre-trained
-Stable Diffusion model. This is used to regularize the fine-tuning by ensuring that
+Stable Diffusion model. This set regularizes the fine-tuning by ensuring that
 the model still produces decent images for random images of the same class,
 rather than just optimize for producing good images of the subject.
 
@@ -256,12 +256,12 @@ rather than just optimize for producing good images of the subject.
   :start-after: Step 3: START
   :end-before: Step 3: END
 
-We use Ray Data to do batch inference with 4 workers, so more images can be generated in parallel.
+Use Ray Data to do batch inference with 4 workers, to generate more images in parallel.
 
 Step 4: Fine-tune the model
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
-Save a few (4 to 5) images of the subject being fine-tuned
+Save a few, like 4 to 5, images of the subject being fine-tuned
 in a local directory. Then launch the training job with:
 
 .. literalinclude:: /templates/05_dreambooth_finetuning/dreambooth_run.sh
@@ -269,21 +269,28 @@ in a local directory. Then launch the training job with:
   :start-after: Step 4: START
   :end-before: Step 4: END
 
-Step 5: Generate images of our subject
+Step 5: Generate images of the subject
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
 Try your model with the same command line as Step 2, but point
-to your own model this time!
+to your own model this time.
 
 .. literalinclude:: /templates/05_dreambooth_finetuning/dreambooth_run.sh
   :language: bash
   :start-after: Step 5: START
   :end-before: Step 5: END
 
-Next, try replacing the prompt with something more interesting!
+Next, try replacing the prompt with something more interesting.
 
 For example, for the dog subject, you can try:
 
 - "photo of a unqtkn dog in a bucket"
 - "photo of a unqtkn dog sleeping"
-- "photo of a unqtkn dog in a doghouse"
\ No newline at end of file
+- "photo of a unqtkn dog in a doghouse"
+
+See also
+--------
+
+For more Train examples, see :ref:`Ray Train Examples <train-examples>`.
+
+For how-to guides, see :ref:`Ray Train User Guides <train-user-guides>`.
\ No newline at end of file
diff --git a/doc/source/train/examples/pytorch/torch_fashion_mnist_example.rst b/doc/source/train/examples/pytorch/torch_fashion_mnist_example.rst
index 2955441efaf08..c3006634b86d6 100644
--- a/doc/source/train/examples/pytorch/torch_fashion_mnist_example.rst
+++ b/doc/source/train/examples/pytorch/torch_fashion_mnist_example.rst
@@ -2,7 +2,19 @@
 
 .. _torch_fashion_mnist_ex:
 
-Running Distributed Training of a PyTorch Model on Fashion MNIST with Ray Train
-===============================================================================
+Train a PyTorch Model on Fashion MNIST
+======================================
+
+This example runs distributed training of a PyTorch model on Fashion MNIST with Ray Train.
+
+Code example
+------------
 
 .. literalinclude:: /../../python/ray/train/examples/pytorch/torch_fashion_mnist_example.py
+
+See also
+--------
+
+For a tutorial on using Ray Train and PyTorch, see :ref:`Getting Started with PyTorch <train-pytorch>`.
+
+For more Train examples, see :ref:`Ray Train Examples <train-examples>`.
diff --git a/doc/source/train/examples/transformers/transformers_torch_trainer_basic.rst b/doc/source/train/examples/transformers/transformers_torch_trainer_basic.rst
index d4bb78290cf5b..587fa1673dda7 100644
--- a/doc/source/train/examples/transformers/transformers_torch_trainer_basic.rst
+++ b/doc/source/train/examples/transformers/transformers_torch_trainer_basic.rst
@@ -2,7 +2,21 @@
 
 .. _transformers_torch_trainer_basic_example :
 
-Ray Train Basic Example for HuggingFace Transformers
-====================================================
+Fine-tune a Text Classifier with Hugging Face Transformers
+==========================================================
+
+This basic example of distributed training with Ray Train and Hugging Face (HF) Transformers
+fine-tunes a text classifier on the Yelp Reviews dataset.
+
+Code example
+------------
 
 .. literalinclude:: /../../python/ray/train/examples/transformers/transformers_torch_trainer_basic.py
+
+See also
+--------
+
+For a tutorial on using Ray Train and Transformers, 
+see :ref:`Getting Started with Hugging Face Transformers <train-pytorch-transformers>`.
+
+For more Train examples, see :ref:`Ray Train Examples <train-examples>`.
diff --git a/doc/source/train/getting-started-pytorch-lightning.rst b/doc/source/train/getting-started-pytorch-lightning.rst
index 00b8af39828e0..d9ea9ed540ffb 100644
--- a/doc/source/train/getting-started-pytorch-lightning.rst
+++ b/doc/source/train/getting-started-pytorch-lightning.rst
@@ -29,7 +29,7 @@ For reference, the final code follows:
     trainer = TorchTrainer(train_func, scaling_config=scaling_config)
     result = trainer.fit()
 
-1. Your `train_func` is the Python code that is executed on each distributed training worker.
+1. Your `train_func` is the Python code that each distributed training worker executes.
 2. Your `ScalingConfig` defines the number of distributed training workers and whether to use GPUs.
 3. Your `TorchTrainer` launches the distributed training job.
 
@@ -147,8 +147,8 @@ Compare a PyTorch Lightning training script with and without Ray Train.
             result = trainer.fit()            
 
 
-Setting up your training function
----------------------------------
+Set up the training function
+----------------------------
 
 First, update your training code to support distributed training. 
 Begin by wrapping your code in a function:
@@ -158,7 +158,7 @@ Begin by wrapping your code in a function:
     def train_func(config):
         # Your PyTorch Lightning training code here.
 
-This function is executed on each distributed training worker.
+Each distributed training worker executes this function.
 
 
 Ray Train sets up your distributed process group on each worker. You only need to 
@@ -364,7 +364,7 @@ information about the training run, including the metrics and checkpoints report
 Next steps
 ---------- 
 
-After you have converted your PyTorch Lightningtraining script to use Ray Train:
+After you have converted your PyTorch Lightning training script to use Ray Train:
 
 * See :ref:`User Guides <train-user-guides>` to learn more about how to perform specific tasks.
 * Browse the :ref:`Examples <train-examples>` for end-to-end examples of how to use Ray Train.
@@ -374,7 +374,7 @@ Version Compatibility
 ---------------------
 
 Ray Train is tested with `pytorch_lightning` versions `1.6.5` and `2.0.4`. For full compatibility, use ``pytorch_lightning>=1.6.5`` . 
-Earlier versions are not prohibited but may result in unexpected issues. If you run into any compatibility issues, consider upgrading your PyTorch Lightning version or 
+Earlier versions aren't prohibited but may result in unexpected issues. If you run into any compatibility issues, consider upgrading your PyTorch Lightning version or 
 `file an issue <https://github.com/ray-project/ray/issues>`_. 
 
 .. _lightning-trainer-migration-guide:
diff --git a/doc/source/train/getting-started-pytorch.rst b/doc/source/train/getting-started-pytorch.rst
index b903ac7937c13..08b78c6a8de43 100644
--- a/doc/source/train/getting-started-pytorch.rst
+++ b/doc/source/train/getting-started-pytorch.rst
@@ -7,11 +7,11 @@ This tutorial walks through the process of converting an existing PyTorch script
 
 Learn how to:
 
-1. Configure your model so that it runs distributed and is placed on the correct CPU/GPU device.
-2. Configure your dataloader so that it is sharded across the workers and place data on the correct CPU/GPU device.
-3. Configure your training function to report metrics and save checkpoints.
-4. Configure scale and CPU/GPU resource requirements for your training job.
-5. Launch your distributed training job with a :class:`~ray.train.torch.TorchTrainer`.
+1. Configure a model to run distributed and on the correct CPU/GPU device.
+2. Configure a dataloader to shard data across the workers and place data on the correct CPU/GPU device.
+3. Configure a training function to report metrics and save checkpoints.
+4. Configure scale and CPU/GPU resource requirements for a training job.
+5. Launch a distributed training job with a :class:`~ray.train.torch.TorchTrainer` class.
 
 Quickstart
 ----------
@@ -30,9 +30,9 @@ For reference, the final code follows:
     trainer = TorchTrainer(train_func, scaling_config=scaling_config)
     result = trainer.fit()
 
-1. Your `train_func` is the Python code that is executed on each distributed training worker.
-2. Your `ScalingConfig` defines the number of distributed training workers and whether to use GPUs.
-3. Your `TorchTrainer` launches the distributed training job.
+1. `train_func` is the Python code that executes on each distributed training worker.
+2. `ScalingConfig` defines the number of distributed training workers and whether to use GPUs.
+3. `TorchTrainer` launches the distributed training job.
 
 Compare a PyTorch training script with and without Ray Train.
 
@@ -135,19 +135,19 @@ Setting up your training function
 ---------------------------------
 
 First, update your training code to support distributed training. 
-You can begin by wrapping your code in a function:
+You can begin by wrapping your code in a :ref:`training function <train-overview-training-function>`:
 
 .. code-block:: python
 
     def train_func(config):
         # Your PyTorch training code here.
 
-This function is executed on each distributed training worker.
+Each distributed training worker executes this function.
 
 Setting up your model
 ^^^^^^^^^^^^^^^^^^^^^
 
-Use the :func:`ray.train.torch.prepare_model` utility function. This will:
+Use the :func:`ray.train.torch.prepare_model` utility function to:
 
 1. Move your model to the right device.
 2. Wrap it in ``DistributedDataParallel``.
@@ -182,8 +182,8 @@ Use the :func:`ray.train.torch.prepare_data_loader` utility function, which:
 1. Adds a ``DistributedSampler`` to your ``DataLoader``.
 2. Moves the batches to the right device. 
 
-Note that this step is not necessary if you are passing in Ray Data to your Trainer
-(see :ref:`data-ingest-torch`):
+Note that this step isn't necessary if you're passing in Ray Data to your Trainer.
+See :ref:`data-ingest-torch`.
 
 .. code-block:: diff
 
diff --git a/doc/source/train/huggingface-accelerate.rst b/doc/source/train/huggingface-accelerate.rst
index dd4e86dc65090..93dc096dda3ed 100644
--- a/doc/source/train/huggingface-accelerate.rst
+++ b/doc/source/train/huggingface-accelerate.rst
@@ -1,11 +1,11 @@
 .. _train-hf-accelerate:
 
-Training with HuggingFace Accelerate
-====================================
+Training with Hugging Face Accelerate
+=====================================
 
 The :class:`~ray.train.torch.TorchTrainer` can help you easily launch your `Accelelate <https://huggingface.co/docs/accelerate>`_  training across a distributed Ray cluster.
 
-All you need to do is run your existing training code with a TorchTrainer. You can expect the final code to look like this:
+You only need to run your existing training code with a TorchTrainer. You can expect the final code to look like this:
 
 .. code-block:: python
 
@@ -161,11 +161,11 @@ object in your training function. Below are starter examples for configuring Acc
             trainer.fit()
 
 Note that Accelerate also provides a CLI tool, `"accelerate config"`, to generate a configuration and launch your training 
-job with `"accelerate launch"`. However, it is not necessary here because Ray's `TorchTrainer` already sets up the Torch 
+job with `"accelerate launch"`. However, it's not necessary here because Ray's `TorchTrainer` already sets up the Torch 
 distributed environment and launches the training function on all workers.
 
 
-Next, check these end-to-end examples below for more details:
+Next, see the following end-to-end examples for more details:
 
 .. tabs::
 
@@ -211,6 +211,6 @@ Aside from that, the functionality of ``AccelerateTrainer`` is identical to ``To
 
 However, this caused confusion around whether this was the *only* way to run Accelerate code. 
 Because the full Accelerate functionality can be expressed with the ``Accelerator`` and ``TorchTrainer`` combination, the ``AccelerateTrainer`` will be deprecated in Ray 2.8, 
-and it is recommend to run your  Accelerate code directly with ``TorchTrainer``. 
+and it's recommended to run your Accelerate code directly with ``TorchTrainer``.
 
 
diff --git a/python/ray/train/examples/accelerate/accelerate_torch_trainer.py b/python/ray/train/examples/accelerate/accelerate_torch_trainer.py
index 41969a71f0210..64992f0bc2240 100644
--- a/python/ray/train/examples/accelerate/accelerate_torch_trainer.py
+++ b/python/ray/train/examples/accelerate/accelerate_torch_trainer.py
@@ -25,7 +25,7 @@
 
 
 def train_func(config):
-    """Your training function that will be launched on each worker."""
+    """Your training function that is launched on each worker."""
 
     # Unpack training configs
     lr = config["lr"]
@@ -116,7 +116,7 @@ def collate_fn(batch):
         eval_metric = metric.compute()
         accelerator.print(f"epoch {epoch}:", eval_metric)
 
-        # Report Checkpoint and metrics to Ray Train
+        # Report checkpoint and metrics to Ray Train
         # ==========================================
         with TemporaryDirectory() as tmpdir:
             if accelerator.is_main_process:
diff --git a/python/ray/train/examples/transformers/transformers_torch_trainer_basic.py b/python/ray/train/examples/transformers/transformers_torch_trainer_basic.py
index 630177424f28c..79d3f993f3d3b 100644
--- a/python/ray/train/examples/transformers/transformers_torch_trainer_basic.py
+++ b/python/ray/train/examples/transformers/transformers_torch_trainer_basic.py
@@ -14,8 +14,8 @@
 from ray.train.torch import TorchTrainer
 
 
-# [1] Define a training function that includes all your training logics
-# =====================================================================
+# [1] Define a training function that includes all your training logic
+# ====================================================================
 def train_func(config):
     # Datasets
     dataset = load_dataset("yelp_review_full")
@@ -34,7 +34,7 @@ def tokenize_function(examples):
         "bert-base-cased", num_labels=5
     )
 
-    # Evaluation Metrics
+    # Evaluation metrics
     metric = evaluate.load("accuracy")
 
     def compute_metrics(eval_pred):
@@ -42,7 +42,7 @@ def compute_metrics(eval_pred):
         predictions = np.argmax(logits, axis=-1)
         return metric.compute(predictions=predictions, references=labels)
 
-    # HuggingFace Trainer
+    # Hugging Face Trainer
     training_args = TrainingArguments(
         output_dir="test_trainer", evaluation_strategy="epoch", report_to="none"
     )
@@ -59,7 +59,7 @@ def compute_metrics(eval_pred):
     # ===============================================
     trainer.add_callback(RayTrainReportCallback())
 
-    # [3] Prepare your trainer for Ray Data Integration
+    # [3] Prepare your trainer for Ray Data integration
     # =================================================
     trainer = prepare_trainer(trainer)
 
diff --git a/python/ray/train/torch/torch_trainer.py b/python/ray/train/torch/torch_trainer.py
index e61d87e3386fd..735c9fad19665 100644
--- a/python/ray/train/torch/torch_trainer.py
+++ b/python/ray/train/torch/torch_trainer.py
@@ -25,7 +25,9 @@ class TorchTrainer(DataParallelTrainer):
     4. Runs the input ``train_loop_per_worker(train_loop_config)``
        on all workers.
 
-    For more details, see the :ref:`PyTorch User Guide <train-pytorch>`.
+    For more details, see the :ref:`PyTorch User Guide <train-pytorch>`,
+    :ref:`PyTorch Lightning User Guide <train-pytorch-lightning>`,
+    or :ref:`Hugging Face Transformers User Guide <train-pytorch-transformers>`.
 
     Example: