From 9d793abd107358a221466ea2050c8c593874b6f9 Mon Sep 17 00:00:00 2001 From: angelinalg <122562471+angelinalg@users.noreply.github.com> Date: Mon, 11 Sep 2023 16:49:13 -0700 Subject: [PATCH] polish examples: make titles more consistent, add links to guides Signed-off-by: angelinalg <122562471+angelinalg@users.noreply.github.com> --- .github/styles/Vocab/Train/accept.txt | 1 + doc/source/train/examples.rst | 24 ++-- .../accelerate/accelerate_example.rst | 21 +++- .../lightning/lightning_mnist_example.ipynb | 33 +++-- .../pytorch/dreambooth_finetuning.rst | 119 +++++++++--------- .../pytorch/torch_fashion_mnist_example.rst | 16 ++- .../transformers_torch_trainer_basic.rst | 18 ++- .../getting-started-pytorch-lightning.rst | 12 +- doc/source/train/getting-started-pytorch.rst | 26 ++-- doc/source/train/huggingface-accelerate.rst | 12 +- .../accelerate/accelerate_torch_trainer.py | 4 +- .../transformers_torch_trainer_basic.py | 10 +- python/ray/train/torch/torch_trainer.py | 4 +- 13 files changed, 182 insertions(+), 118 deletions(-) diff --git a/.github/styles/Vocab/Train/accept.txt b/.github/styles/Vocab/Train/accept.txt index d832f7f80e7ce..d0c7e09aaea0c 100644 --- a/.github/styles/Vocab/Train/accept.txt +++ b/.github/styles/Vocab/Train/accept.txt @@ -1,5 +1,6 @@ Horovod Hugging Face +hyperparameters? Keras LightGBM PyTorch diff --git a/doc/source/train/examples.rst b/doc/source/train/examples.rst index 3b2cf585618ce..ac1252e92e9e5 100644 --- a/doc/source/train/examples.rst +++ b/doc/source/train/examples.rst @@ -3,7 +3,7 @@ Ray Train Examples ================== -.. Example .rst files should be organized in the same manner as the +.. Organize example .rst files in the same manner as the .py files in ray/python/ray/train/examples. Below are examples for using Ray Train with a variety of frameworks and use cases. 
@@ -18,17 +18,17 @@ Beginner * - Framework - Example * - PyTorch - - :ref:`Training an Fashion MNIST Image Classifier with PyTorch ` + - :ref:`Train a Fashion MNIST Image Classifier with PyTorch ` * - Lightning - - :ref:`Training an MNIST Image Classifier with Lightning ` + - :ref:`Train an MNIST Image Classifier with Lightning ` * - Transformers - - :ref:`Fine-tuning a Text Classifier on Yelp Reviews Dataset with HF Transformers ` + - :ref:`Fine-tune a Text Classifier on the Yelp Reviews Dataset with HF Transformers ` * - Accelerate - :ref:`Distributed Data Parallel Training with HF Accelerate ` * - DeepSpeed - - :ref:`Distributed Training with DeepSpeed ZeRO-3 ` + - :ref:`Train with DeepSpeed ZeRO-3 ` * - TensorFlow - - :ref:`TensorFlow MNIST Training Example ` + - :ref:`Train with TensorFlow MNIST ` * - Horovod - :ref:`End-to-end Horovod Training Example ` @@ -42,11 +42,11 @@ Intermediate * - Framework - Example * - PyTorch - - `DreamBooth fine-tuning of Stable Diffusion with Ray Train `_ + - :ref:`Fine-tune Stable Diffusion with DreamBooth and Ray Train ` * - Lightning - :ref:`Model Training with PyTorch Lightning and Ray Data ` * - Accelerate - - :ref:`Fine-tuning a Text Classifier on GLUE Benchmark with HF Accelerate. 
` + - :ref:`Fine-tune a Text Classifier on the GLUE Benchmark with HF Accelerate ` Advanced @@ -59,10 +59,10 @@ Advanced * - Framework - Example * - Accelerate, DeepSpeed - - `Fine-tuning Llama-2 series models with Deepspeed, Accelerate, and Ray Train TorchTrainer `_ + - `Fine-tune Llama-2 series models with DeepSpeed, Accelerate, and Ray Train TorchTrainer `_ * - Transformers, DeepSpeed - - :ref:`Fine-tuning GPT-J-6B with Ray Train and DeepSpeed ` + - :ref:`Fine-tune GPT-J-6B with Ray Train and DeepSpeed ` * - Lightning, DeepSpeed - - :ref:`Fine-tuning vicuna-13b with PyTorch Lightning and DeepSpeed ` + - :ref:`Fine-tune vicuna-13b with PyTorch Lightning and DeepSpeed ` * - Lightning - - :ref:`Fine-tuning dolly-v2-7b with PyTorch Lightning and FSDP ` + - :ref:`Fine-tune dolly-v2-7b with PyTorch Lightning and FSDP ` diff --git a/doc/source/train/examples/accelerate/accelerate_example.rst b/doc/source/train/examples/accelerate/accelerate_example.rst index 6205add5ac48a..e082bf11f2a30 100644 --- a/doc/source/train/examples/accelerate/accelerate_example.rst +++ b/doc/source/train/examples/accelerate/accelerate_example.rst @@ -2,7 +2,24 @@ .. _accelerate_example: -Hugging Face Accelerate Distributed Training Example with Ray Train -=================================================================== +Distributed Training Example with Hugging Face Accelerate +========================================================= + +This example does distributed data parallel training +with Hugging Face (HF) Accelerate, Ray Train, and Ray Data. +It fine-tunes a BERT model and is adapted from +https://github.com/huggingface/accelerate/blob/main/examples/nlp_example.py + + +Code example +------------ .. literalinclude:: /../../python/ray/train/examples/accelerate/accelerate_torch_trainer.py + +See also +-------- + +For a tutorial on using Ray Train and HF Accelerate, +see :ref:`Training with Hugging Face Accelerate `. + +For more Train examples, see :ref:`Ray Train Examples `. 
diff --git a/doc/source/train/examples/lightning/lightning_mnist_example.ipynb b/doc/source/train/examples/lightning/lightning_mnist_example.ipynb index f721884879873..6686b958f9827 100644 --- a/doc/source/train/examples/lightning/lightning_mnist_example.ipynb +++ b/doc/source/train/examples/lightning/lightning_mnist_example.ipynb @@ -51,7 +51,7 @@ "source": [ "## Prepare a dataset and module\n", "\n", - "The Pytorch Lightning Trainer takes either `torch.utils.data.DataLoader` or `pl.LightningDataModule` as data inputs. You can keep using them without any changes with Ray Train. " + "The Pytorch Lightning Trainer takes either `torch.utils.data.DataLoader` or `pl.LightningDataModule` as data inputs. You can continue using them without any changes with Ray Train. " ] }, { @@ -75,7 +75,7 @@ " self.data_dir, train=True, download=True, transform=self.transform\n", " )\n", "\n", - " # split data into train and val sets\n", + " # Split data into train and val sets\n", " self.mnist_train, self.mnist_val = random_split(mnist, [55000, 5000])\n", "\n", " def train_dataloader(self):\n", @@ -175,7 +175,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "You don't need to make any change to the definition of PyTorch Lightning model and datamodule." + "You don't need to modify the definition of the PyTorch Lightning model or datamodule." ] }, { @@ -183,18 +183,18 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Define the training loop\n", + "## Define a training function\n", "\n", - "This code defines a training loop for each worker. Comparing the training loop with the original PyTorch Lightning code, there are 3 main differences:\n", + "This code defines a {ref}`training function ` for each worker. 
Comparing the training function with the original PyTorch Lightning code, notice three main differences:\n", "\n", "- Distributed strategy: Use {class}`RayDDPStrategy `.\n", "- Cluster environment: Use {class}`RayLightningEnvironment `.\n", - "- Parallel devices: Always sets to `devices=\"auto\"` to use all available devices configured by ``TorchTrainer``.\n", + "- Parallel devices: Always set to `devices=\"auto\"` to use all available devices configured by ``TorchTrainer``.\n", "\n", "See {ref}`Getting Started with PyTorch Lightning ` for more information.\n", "\n", "\n", - "For checkpoint reportining, Ray Train provides a minimal {class}`RayTrainReportCallback ` that reports metrics and checkpoint on each train epoch end. For more complex checkpoint logic, please implement custom callbacks as described in {ref}`Saving and Loading Checkpoint ` user guide." + "For checkpoint reporting, Ray Train provides a minimal {class}`RayTrainReportCallback ` class that reports metrics and checkpoints at the end of each train epoch. For more complex checkpoint logic, implement custom callbacks. See {ref}`Saving and Loading Checkpoint `." ] }, { @@ -203,7 +203,7 @@ "metadata": {}, "outputs": [], "source": [ - "use_gpu = True # Set it to False if you want to run without GPUs\n", + "use_gpu = True # Set to False if you want to run without GPUs\n", "num_workers = 4" ] }, @@ -804,7 +804,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Check the Training Results and Checkpoints" + "## Check training results and checkpoints" ] }, { @@ -857,9 +857,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "As we can see, three checkpoints(`checkpoint_000007`, `checkpoint_000008`, `checkpoint_000009`) have been saved in the trial directory. To retrieve the latest checkpoint from the fit results and load it back into the model, follow these steps.\n", "\n", - "If you lost the in-memory result object, you can also restore the model from the checkpoint file. Here the checkpoint path is: `/tmp/ray_results/ptl-mnist-example/TorchTrainer_eb925_00000_0_2023-08-07_23-15-06/checkpoint_000009/checkpoint.ckpt`." + "Ray Train saved three checkpoints (`checkpoint_000007`, `checkpoint_000008`, `checkpoint_000009`) in the trial directory. 
The following code retrieves the latest checkpoint from the fit results and loads it back into the model.\n", "\n", - "If you lost the in-memory result object, you can also restore the model from the checkpoint file. Here the checkpoint path is: `/tmp/ray_results/ptl-mnist-example/TorchTrainer_eb925_00000_0_2023-08-07_23-15-06/checkpoint_000009/checkpoint.ckpt`." + "If you lost the in-memory result object, you can restore the model from the checkpoint file. The checkpoint path is: `/tmp/ray_results/ptl-mnist-example/TorchTrainer_eb925_00000_0_2023-08-07_23-15-06/checkpoint_000009/checkpoint.ckpt`." ] }, { @@ -903,6 +903,17 @@ "\n", "best_model" ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## See also\n", + "\n", + "For a tutorial on using Ray Train and PyTorch Lightning, see {ref}`Getting Started with PyTorch Lightning `.\n", + "\n", + "For more Train examples, see {ref}`Ray Train Examples `." + ] } ], "metadata": { diff --git a/doc/source/train/examples/pytorch/dreambooth_finetuning.rst b/doc/source/train/examples/pytorch/dreambooth_finetuning.rst index 96ea9a5a1f9d8..05bd3e37bf8a1 100644 --- a/doc/source/train/examples/pytorch/dreambooth_finetuning.rst +++ b/doc/source/train/examples/pytorch/dreambooth_finetuning.rst @@ -1,7 +1,9 @@ :orphan: -Fine-tuning DreamBooth with Ray Train -===================================== +.. _torch_finetune_dreambooth_ex: + +Fine-tune Stable Diffusion with DreamBooth and Ray Train +======================================================== This example shows how to do DreamBooth fine-tuning of a Stable Diffusion model using Ray Train. See the original `DreamBooth project homepage `_ for more details on what this fine-tuning method achieves. @@ -10,41 +12,41 @@ See the original `DreamBooth project homepage `_ :target: https://dreambooth.github.io :alt: DreamBooth fine-tuning overview -This example is built on top of `this HuggingFace 🤗 tutorial `_. 
-See the HuggingFace tutorial for useful explanations and suggestions on hyperparameters. +This example builds on `this Hugging Face 🤗 tutorial `_. +See the Hugging Face tutorial for useful explanations and suggestions on hyperparameters. **Adapting this example to Ray Train allows you to easily scale up the fine-tuning to an arbitrary number of distributed training workers.** **Compute requirements:** -* Because of the large model sizes, you'll need a machine with at least 1 A10G GPU. -* Each training worker uses 1 GPU. You can use multiple GPUs/workers to leverage data-parallel training to speed up training time. +* Because of the large model sizes, you need a machine with at least 1 A10G GPU. +* Each training worker uses 1 GPU. You can use multiple GPUs or workers to leverage data-parallel training to speed up training time. -This example fine-tunes both the ``text_encoder`` and ``unet`` models used in the Stable Diffusion process, with respect to a prior preserving loss. +This example fine-tunes both the ``text_encoder`` and ``unet`` models used in the Stable Diffusion process, with respect to a prior preserving loss. .. image:: /templates/05_dreambooth_finetuning/dreambooth/images/dreambooth_example.png :alt: DreamBooth overview -The full code repository can be found here: `https://github.com/ray-project/ray/tree/master/doc/source/templates/05_dreambooth_finetuning `_ +Find the full code repository at `https://github.com/ray-project/ray/tree/master/doc/source/templates/05_dreambooth_finetuning `_ How it works ------------ -This example leverages Ray Data for data loading and Ray Train for distributed training. +This example uses Ray Data for data loading and Ray Train for distributed training. Data loading ^^^^^^^^^^^^ .. note:: - You can find the latest version of the code here: `dataset.py `_ + Find the latest version of the code at `dataset.py `_ The latest version might differ slightly from the code presented here. -We use Ray Data for data loading. 
The code has three interesting parts. +Use Ray Data for data loading. The code has three interesting parts. -First, we load two datasets using :func:`ray.data.read_images`: +First, load two datasets using :func:`ray.data.read_images`: .. literalinclude:: /templates/05_dreambooth_finetuning/dreambooth/dataset.py :language: python @@ -52,7 +54,7 @@ First, we load two datasets using :func:`ray.data.read_images`: :end-at: class_dataset = read :dedent: 4 -Then, we tokenize the prompt that generated these images: +Then, tokenize the prompt that generated these images: .. literalinclude:: /templates/05_dreambooth_finetuning/dreambooth/dataset.py :language: python @@ -61,7 +63,7 @@ Then, we tokenize the prompt that generated these images: :dedent: 4 -And lastly, we apply a ``torchvision`` preprocessing pipeline to the images: +And lastly, apply a ``torchvision`` preprocessing pipeline to the images: .. literalinclude:: /templates/05_dreambooth_finetuning/dreambooth/dataset.py :language: python @@ -69,8 +71,7 @@ And lastly, we apply a ``torchvision`` preprocessing pipeline to the images: :end-before: END: image preprocessing :dedent: 4 -We apply all of this in final step: - +Apply all three parts in a final step: .. literalinclude:: /templates/05_dreambooth_finetuning/dreambooth/dataset.py :language: python @@ -79,29 +80,28 @@ We apply all of this in final step: :dedent: 4 - Distributed training ^^^^^^^^^^^^^^^^^^^^ .. note:: - You can find the latest version of the code here: `train.py `_ + Find the latest version of the code at `train.py `_ The latest version might differ slightly from the code presented here. -The central part of the training code is the *training function*. This function accepts a configuration dict that contains the hyperparameters. It then defines a regular PyTorch training loop. +The central part of the training code is the :ref:`training function `. This function accepts a configuration dict that contains the hyperparameters. 
It then defines a regular PyTorch training loop. -There are only a few locations where we interact with the Ray Train API. We marked them with in-line comments in the snippet below. +You interact with the Ray Train API in only a few locations, which are marked with in-line comments in the snippet below. -Remember that we want to do data-parallel training for all our models. +Remember that you want to do data-parallel training for all the models. -#. We load the data shard for each worker with session.get_dataset_shard("train") -#. We iterate over the dataset with train_dataset.iter_torch_batches() -#. We report results to Ray Train with session.report(results) +#. Load the data shard for each worker with ``session.get_dataset_shard("train")`` +#. Iterate over the dataset with ``train_dataset.iter_torch_batches()`` +#. Report results to Ray Train with ``session.report(results)`` -The code was compacted for brevity. The `full code `_ is more thoroughly annotated. +The code is compacted for brevity. The `full code `_ is more thoroughly annotated. .. literalinclude:: /templates/05_dreambooth_finetuning/dreambooth/train.py @@ -109,7 +109,7 @@ The code was compacted for brevity. The `full code ``. +To achieve this, choose a non-word as an identifier, such as ``unqtkn``. When fine-tuning the model with this subject, you teach the model that the prompt is ``A photo of a unqtkn ``. -After fine-tuning we can run inference with this specific prompt. -For instance: ``A photo of a unqtkn `` will create an image of our subject. -Similarly, ``A photo of a unqtkn at the beach`` will create an image of our subject at the beach. +After fine-tuning, you can run inference with this specific prompt. +For instance: ``A photo of a unqtkn `` creates an image of the subject. +Similarly, ``A photo of a unqtkn at the beach`` creates an image of the subject at the beach. Step 0: Preparation ^^^^^^^^^^^^^^^^^^^ @@ -216,7 +216,7 @@ Prepare some directories and environment variables. 
Step 1: Download the pre-trained model ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Download and cache a pre-trained Stable-Diffusion model locally. +Download and cache a pre-trained Stable Diffusion model locally. .. literalinclude:: /templates/05_dreambooth_finetuning/dreambooth_run.sh :language: bash @@ -228,10 +228,10 @@ You can access the downloaded model checkpoint at the ``$ORIG_MODEL_PATH``. Step 2: Supply images of your subject ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Use one of the sample datasets (dog, lego car), or provide your own directory +Use one of the sample datasets, like `dog` or `lego car`, or provide your own directory of images, and specify the directory with the ``$INSTANCE_DIR`` environment variable. -Then, we copy these images to ``$IMAGES_OWN_DIR``. +Then, copy these images to ``$IMAGES_OWN_DIR``. .. literalinclude:: /templates/05_dreambooth_finetuning/dreambooth_run.sh :language: bash @@ -247,7 +247,7 @@ Step 3: Create the regularization images ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Create a regularization image set for a class of subjects using the pre-trained -Stable Diffusion model. This is used to regularize the fine-tuning by ensuring that +Stable Diffusion model. This set regularizes the fine-tuning by ensuring that the model still produces decent images for random images of the same class, rather than just optimize for producing good images of the subject. @@ -256,12 +256,12 @@ rather than just optimize for producing good images of the subject. :start-after: Step 3: START :end-before: Step 3: END -We use Ray Data to do batch inference with 4 workers, so more images can be generated in parallel. +Use Ray Data to do batch inference with 4 workers, to generate more images in parallel. Step 4: Fine-tune the model ^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Save a few (4 to 5) images of the subject being fine-tuned +Save a few, like 4 to 5, images of the subject being fine-tuned in a local directory. Then launch the training job with: .. 
literalinclude:: /templates/05_dreambooth_finetuning/dreambooth_run.sh @@ -269,21 +269,28 @@ in a local directory. Then launch the training job with: :start-after: Step 4: START :end-before: Step 4: END -Step 5: Generate images of our subject +Step 5: Generate images of the subject ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Try your model with the same command line as Step 2, but point -to your own model this time! +to your own model this time. .. literalinclude:: /templates/05_dreambooth_finetuning/dreambooth_run.sh :language: bash :start-after: Step 5: START :end-before: Step 5: END -Next, try replacing the prompt with something more interesting! +Next, try replacing the prompt with something more interesting. For example, for the dog subject, you can try: - "photo of a unqtkn dog in a bucket" - "photo of a unqtkn dog sleeping" -- "photo of a unqtkn dog in a doghouse" \ No newline at end of file +- "photo of a unqtkn dog in a doghouse" + +See also +-------- + +For more Train examples, see :ref:`Ray Train Examples `. + +For how-to guides, see :ref:`Ray Train User Guides `. \ No newline at end of file diff --git a/doc/source/train/examples/pytorch/torch_fashion_mnist_example.rst b/doc/source/train/examples/pytorch/torch_fashion_mnist_example.rst index 2955441efaf08..c3006634b86d6 100644 --- a/doc/source/train/examples/pytorch/torch_fashion_mnist_example.rst +++ b/doc/source/train/examples/pytorch/torch_fashion_mnist_example.rst @@ -2,7 +2,19 @@ .. _torch_fashion_mnist_ex: -Running Distributed Training of a PyTorch Model on Fashion MNIST with Ray Train -=============================================================================== +Train a PyTorch Model on Fashion MNIST +====================================== + +This example runs distributed training of a PyTorch model on Fashion MNIST with Ray Train. + +Code example +------------ .. 
literalinclude:: /../../python/ray/train/examples/pytorch/torch_fashion_mnist_example.py + +See also +-------- + +For a tutorial on using Ray Train and PyTorch, see :ref:`Getting Started with PyTorch `. + +For more Train examples, see :ref:`Ray Train Examples `. diff --git a/doc/source/train/examples/transformers/transformers_torch_trainer_basic.rst b/doc/source/train/examples/transformers/transformers_torch_trainer_basic.rst index d4bb78290cf5b..587fa1673dda7 100644 --- a/doc/source/train/examples/transformers/transformers_torch_trainer_basic.rst +++ b/doc/source/train/examples/transformers/transformers_torch_trainer_basic.rst @@ -2,7 +2,21 @@ .. _transformers_torch_trainer_basic_example : -Ray Train Basic Example for HuggingFace Transformers -==================================================== +Fine-tune a Text Classifier with Hugging Face Transformers +========================================================== + +This basic example of distributed training with Ray Train and Hugging Face (HF) Transformers +fine-tunes a text classifier on the Yelp review dataset. + +Code example +------------ .. literalinclude:: /../../python/ray/train/examples/transformers/transformers_torch_trainer_basic.py + +See also +-------- + +For a tutorial on using Ray Train and Transformers, +see :ref:`Getting Started with Hugging Face Transformers `. + +For more Train examples, see :ref:`Ray Train Examples `. diff --git a/doc/source/train/getting-started-pytorch-lightning.rst b/doc/source/train/getting-started-pytorch-lightning.rst index 00b8af39828e0..d9ea9ed540ffb 100644 --- a/doc/source/train/getting-started-pytorch-lightning.rst +++ b/doc/source/train/getting-started-pytorch-lightning.rst @@ -29,7 +29,7 @@ For reference, the final code follows: trainer = TorchTrainer(train_func, scaling_config=scaling_config) result = trainer.fit() -1. 
Your `train_func` is the Python code that each distributed training worker executes. 2. Your `ScalingConfig` defines the number of distributed training workers and whether to use GPUs. 3. Your `TorchTrainer` launches the distributed training job. @@ -147,8 +147,8 @@ Compare a PyTorch Lightning training script with and without Ray Train. result = trainer.fit() -Setting up your training function ---------------------------------- +Set up the training function +---------------------------- First, update your training code to support distributed training. Begin by wrapping your code in a function: @@ -158,7 +158,7 @@ Begin by wrapping your code in a function: def train_func(config): # Your PyTorch Lightning training code here. -This function is executed on each distributed training worker. +Each distributed training worker executes this function. Ray Train sets up your distributed process group on each worker. You only need to @@ -364,7 +364,7 @@ information about the training run, including the metrics and checkpoints report Next steps ---------- -After you have converted your PyTorch Lightningtraining script to use Ray Train: +After you have converted your PyTorch Lightning training script to use Ray Train: * See :ref:`User Guides ` to learn more about how to perform specific tasks. * Browse the :ref:`Examples ` for end-to-end examples of how to use Ray Train. @@ -374,7 +374,7 @@ Version Compatibility --------------------- Ray Train is tested with `pytorch_lightning` versions `1.6.5` and `2.0.4`. For full compatibility, use ``pytorch_lightning>=1.6.5`` . -Earlier versions are not prohibited but may result in unexpected issues. If you run into any compatibility issues, consider upgrading your PyTorch Lightning version or +Earlier versions aren't prohibited but may result in unexpected issues. If you run into any compatibility issues, consider upgrading your PyTorch Lightning version or `file an issue `_. .. 
_lightning-trainer-migration-guide: diff --git a/doc/source/train/getting-started-pytorch.rst b/doc/source/train/getting-started-pytorch.rst index b903ac7937c13..08b78c6a8de43 100644 --- a/doc/source/train/getting-started-pytorch.rst +++ b/doc/source/train/getting-started-pytorch.rst @@ -7,11 +7,11 @@ This tutorial walks through the process of converting an existing PyTorch script Learn how to: -1. Configure your model so that it runs distributed and is placed on the correct CPU/GPU device. -2. Configure your dataloader so that it is sharded across the workers and place data on the correct CPU/GPU device. -3. Configure your training function to report metrics and save checkpoints. -4. Configure scale and CPU/GPU resource requirements for your training job. -5. Launch your distributed training job with a :class:`~ray.train.torch.TorchTrainer`. +1. Configure a model to run distributed and on the correct CPU/GPU device. +2. Configure a dataloader to shard data across the workers and place data on the correct CPU/GPU device. +3. Configure a training function to report metrics and save checkpoints. +4. Configure scale and CPU/GPU resource requirements for a training job. +5. Launch a distributed training job with a :class:`~ray.train.torch.TorchTrainer` class. Quickstart ---------- @@ -30,9 +30,9 @@ For reference, the final code follows: trainer = TorchTrainer(train_func, scaling_config=scaling_config) result = trainer.fit() -1. Your `train_func` is the Python code that is executed on each distributed training worker. -2. Your `ScalingConfig` defines the number of distributed training workers and whether to use GPUs. -3. Your `TorchTrainer` launches the distributed training job. +1. `train_func` is the Python code that executes on each distributed training worker. +2. `ScalingConfig` defines the number of distributed training workers and whether to use GPUs. +3. `TorchTrainer` launches the distributed training job. 
Compare a PyTorch training script with and without Ray Train. @@ -135,19 +135,19 @@ Setting up your training function --------------------------------- First, update your training code to support distributed training. -You can begin by wrapping your code in a function: +You can begin by wrapping your code in a :ref:`training function `: .. code-block:: python def train_func(config): # Your PyTorch training code here. -This function is executed on each distributed training worker. +Each distributed training worker executes this function. Setting up your model ^^^^^^^^^^^^^^^^^^^^^ -Use the :func:`ray.train.torch.prepare_model` utility function. This will: +Use the :func:`ray.train.torch.prepare_model` utility function to: 1. Move your model to the right device. 2. Wrap it in ``DistributedDataParallel``. @@ -182,8 +182,8 @@ Use the :func:`ray.train.torch.prepare_data_loader` utility function, which: 1. Adds a ``DistributedSampler`` to your ``DataLoader``. 2. Moves the batches to the right device. -Note that this step is not necessary if you are passing in Ray Data to your Trainer -(see :ref:`data-ingest-torch`): +Note that this step isn't necessary if you're passing in Ray Data to your Trainer. +See :ref:`data-ingest-torch`. .. code-block:: diff diff --git a/doc/source/train/huggingface-accelerate.rst b/doc/source/train/huggingface-accelerate.rst index dd4e86dc65090..93dc096dda3ed 100644 --- a/doc/source/train/huggingface-accelerate.rst +++ b/doc/source/train/huggingface-accelerate.rst @@ -1,11 +1,11 @@ .. _train-hf-accelerate: -Training with HuggingFace Accelerate -==================================== +Training with Hugging Face Accelerate +===================================== The :class:`~ray.train.torch.TorchTrainer` can help you easily launch your `Accelelate `_ training across a distributed Ray cluster. -All you need to do is run your existing training code with a TorchTrainer. 
You can expect the final code to look like this: +You only need to run your existing training code with a TorchTrainer. You can expect the final code to look like this: .. code-block:: python @@ -161,11 +161,11 @@ object in your training function. Below are starter examples for configuring Acc trainer.fit() Note that Accelerate also provides a CLI tool, `"accelerate config"`, to generate a configuration and launch your training -job with `"accelerate launch"`. However, it is not necessary here because Ray's `TorchTrainer` already sets up the Torch +job with `"accelerate launch"`. However, it's not necessary here because Ray's `TorchTrainer` already sets up the Torch distributed environment and launches the training function on all workers. -Next, check these end-to-end examples below for more details: +Next, see these end-to-end examples below for more details: .. tabs:: @@ -211,6 +211,6 @@ Aside from that, the functionality of ``AccelerateTrainer`` is identical to ``To However, this caused confusion around whether this was the *only* way to run Accelerate code. Because the full Accelerate functionality can be expressed with the ``Accelerator`` and ``TorchTrainer`` combination, the ``AccelerateTrainer`` will be deprecated in Ray 2.8, -and it is recommend to run your Accelerate code directly with ``TorchTrainer``. +and it's recommended to run your Accelerate code directly with ``TorchTrainer``. 
diff --git a/python/ray/train/examples/accelerate/accelerate_torch_trainer.py b/python/ray/train/examples/accelerate/accelerate_torch_trainer.py index 41969a71f0210..64992f0bc2240 100644 --- a/python/ray/train/examples/accelerate/accelerate_torch_trainer.py +++ b/python/ray/train/examples/accelerate/accelerate_torch_trainer.py @@ -25,7 +25,7 @@ def train_func(config): - """Your training function that will be launched on each worker.""" + """Your training function that is launched on each worker.""" # Unpack training configs lr = config["lr"] @@ -116,7 +116,7 @@ def collate_fn(batch): eval_metric = metric.compute() accelerator.print(f"epoch {epoch}:", eval_metric) - # Report Checkpoint and metrics to Ray Train + # Report checkpoint and metrics to Ray Train # ========================================== with TemporaryDirectory() as tmpdir: if accelerator.is_main_process: diff --git a/python/ray/train/examples/transformers/transformers_torch_trainer_basic.py b/python/ray/train/examples/transformers/transformers_torch_trainer_basic.py index 630177424f28c..79d3f993f3d3b 100644 --- a/python/ray/train/examples/transformers/transformers_torch_trainer_basic.py +++ b/python/ray/train/examples/transformers/transformers_torch_trainer_basic.py @@ -14,8 +14,8 @@ from ray.train.torch import TorchTrainer -# [1] Define a training function that includes all your training logics -# ===================================================================== +# [1] Define a training function that includes all your training logic +# ==================================================================== def train_func(config): # Datasets dataset = load_dataset("yelp_review_full") @@ -34,7 +34,7 @@ def tokenize_function(examples): "bert-base-cased", num_labels=5 ) - # Evaluation Metrics + # Evaluation metrics metric = evaluate.load("accuracy") def compute_metrics(eval_pred): @@ -42,7 +42,7 @@ def compute_metrics(eval_pred): predictions = np.argmax(logits, axis=-1) return 
metric.compute(predictions=predictions, references=labels) - # HuggingFace Trainer + # Hugging Face Trainer training_args = TrainingArguments( output_dir="test_trainer", evaluation_strategy="epoch", report_to="none" ) @@ -59,7 +59,7 @@ def compute_metrics(eval_pred): # =============================================== trainer.add_callback(RayTrainReportCallback()) - # [3] Prepare your trainer for Ray Data Integration + # [3] Prepare your trainer for Ray Data integration # ================================================= trainer = prepare_trainer(trainer) diff --git a/python/ray/train/torch/torch_trainer.py b/python/ray/train/torch/torch_trainer.py index e61d87e3386fd..735c9fad19665 100644 --- a/python/ray/train/torch/torch_trainer.py +++ b/python/ray/train/torch/torch_trainer.py @@ -25,7 +25,9 @@ class TorchTrainer(DataParallelTrainer): 4. Runs the input ``train_loop_per_worker(train_loop_config)`` on all workers. - For more details, see the :ref:`PyTorch User Guide `. + For more details, see the :ref:`PyTorch User Guide `, + :ref:`PyTorch Lightning User Guide `, + or :ref:`PyTorch User Guide `. Example: