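[Train] Add support for handling multiple batch data types for prepare_data_loader (ray-project#26386)

When working with Ray Train, using the ray.train.torch.prepare_data_loader method with a dataset that returns a dictionary instead of a tuple from its __getitem__ method causes issues: the wrapped loader iterated over every batch as if it were a sequence and always rebuilt it as a tuple. This change moves mapping, tuple/list, and bare-tensor batches to the device while preserving the batch's container type. Co-authored-by: matthewdeng <[email protected]>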
krfricke · Jul 12, 2022 · 36229d1 · 36229d1
1 parent 8bb6742
commit 36229d1
Show file tree

Hide file tree

Showing 2 changed files with 37 additions and 9 deletions.
diff --git a/python/ray/train/tests/test_gpu.py b/python/ray/train/tests/test_gpu.py
@@ -42,6 +42,13 @@ def ray_start_1_cpu_1_gpu():
  ray.shutdown()
 
 
+class LinearDatasetDict(LinearDataset):
+ """Modifies the LinearDataset to return a Dict instead of a Tuple."""
+
+ def __getitem__(self, index):
+ return {"x": self.x[index, None], "y": self.y[index, None]}
+
+
 # TODO: Refactor as a backend test.
 @pytest.mark.parametrize("num_gpus_per_worker", [0.5, 1])
 def test_torch_get_device(ray_start_4_cpus_2_gpus, num_gpus_per_worker):
@@ -92,8 +99,9 @@ def train_fn():
 
 
 # TODO: Refactor as a backend test.
-def test_torch_prepare_dataloader(ray_start_4_cpus_2_gpus):
- data_loader = DataLoader(LinearDataset(a=1, b=2, size=10))
+@pytest.mark.parametrize("dataset", (LinearDataset, LinearDatasetDict))
+def test_torch_prepare_dataloader(ray_start_4_cpus_2_gpus, dataset):
+ data_loader = DataLoader(dataset(a=1, b=2, size=10))
 
  def train_fn():
  wrapped_data_loader = train.torch.prepare_data_loader(data_loader)
@@ -102,12 +110,20 @@ def train_fn():
  assert isinstance(wrapped_data_loader.sampler, DistributedSampler)
 
  # Make sure you can properly iterate through the DataLoader.
- for batch in wrapped_data_loader:
- X = batch[0]
- y = batch[1]
-
- # Make sure the data is on the correct device.
- assert X.is_cuda and y.is_cuda
+ # Case where the dataset returns a tuple or list from __getitem__.
+ if isinstance(wrapped_data_loader.dataset[0], (tuple, list)):
+ for batch in wrapped_data_loader:
+ x = batch[0]
+ y = batch[1]
+
+ # Make sure the data is on the correct device.
+ assert x.is_cuda and y.is_cuda
+ # Case where the dataset returns a dict from __getitem__.
+ elif isinstance(wrapped_data_loader.dataset[0], dict):
+ for batch in wrapped_data_loader:
+ for x, y in zip(batch["x"], batch["y"]):
+ # Make sure the data is on the correct device.
+ assert x.is_cuda and y.is_cuda
 
  trainer = Trainer("torch", num_workers=2, use_gpu=True)
  trainer.start()

diff --git a/python/ray/train/torch/train_loop_utils.py b/python/ray/train/torch/train_loop_utils.py
@@ -4,6 +4,7 @@
 import random
 import types
 import warnings
+import collections
 
 from pathlib import Path
 from typing import Any, Dict, Optional
@@ -548,7 +549,18 @@ def try_move_device(i):
  return i
 
  with torch.cuda.stream(self._memcpy_stream):
- return tuple(try_move_device(i) for i in item)
+ if isinstance(item, collections.abc.Mapping):
+ item_on_device = {k: self._move_to_device(v) for k, v in item.items()}
+ elif isinstance(item, (tuple, list)):
+ item_on_device = type(item)(self._move_to_device(i) for i in item)
+ elif isinstance(item, torch.Tensor):
+ item_on_device = try_move_device(item)
+ else:
+ logger.info(
+ f"Data type {type(item)} doesn't support being moved to device."
+ )
+
+ return item_on_device
 
  def _wait_for_batch(self, item):
  if self._memcpy_stream is None: