[Data] Improve docstring and warning message for from_huggingface (ray-project#35206)

Corrects the return type hint, adds a docstring example, and logs a warning message for from_huggingface, addressing confusion raised in user feedback.
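For context, a minimal sketch (not part of the commit) of the behavior the corrected type hint describes. It assumes ray and datasets are installed; the dataset name comes from the docstring example added in this diff.

# Hedged sketch: illustrates the corrected return type
# Union[MaterializedDataset, Dict[str, MaterializedDataset]].
import ray
import datasets

hf_dataset = datasets.load_dataset("tweet_eval", "emotion")  # a DatasetDict

# A DatasetDict converts to a dict of MaterializedDataset keyed by split name.
ray_datasets = ray.data.from_huggingface(hf_dataset)
print(sorted(ray_datasets))  # ['test', 'train', 'validation']

# A single split converts to one MaterializedDataset.
ray_train = ray.data.from_huggingface(hf_dataset["train"])
print(ray_train.count())  # 3257, per the docstring example below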

---------

Signed-off-by: amogkam <[email protected]>
amogkam committed May 15, 2023
1 parent 3ca8632 commit 5a5155c
Showing 2 changed files with 58 additions and 14 deletions.
1 change: 1 addition & 0 deletions doc/requirements-doc.txt
@@ -5,6 +5,7 @@ accelerate>=0.17.0
click
colorama
colorful
datasets
# Newer versions of fairscale do not support Python 3.6 even though they still have wheels for it.
# Have to manually pin it: https://github.com/facebookresearch/fairscale/issues/962
fairscale; python_version >= '3.7'
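The datasets entry is presumably required so the new from_huggingface doctest in read_api.py below can execute during the documentation build (an inference from this diff, not stated in the commit). A hypothetical sanity check for that environment:

import importlib.util

# Hypothetical check: the from_huggingface doctest added below imports
# `datasets` at doc-build time, so it must be importable.
assert importlib.util.find_spec("datasets") is not None, (
    "datasets is missing; the new from_huggingface doctest would fail"
)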
71 changes: 57 additions & 14 deletions python/ray/data/read_api.py
@@ -267,11 +267,12 @@ def range_tensor(n: int, *, shape: Tuple = (1,), parallelism: int = -1) -> Datas
Examples:
>>> import ray
>>> ds = ray.data.range_tensor(1000, shape=(2, 2))
>>> ds # doctest: +ellipsis
>>> ds # doctest: +ELLIPSIS
Dataset(
num_blocks=...,
num_rows=1000,
schema={data: numpy.ndarray(shape=(2, 2), dtype=int64)})
num_blocks=...,
num_rows=1000,
schema={data: numpy.ndarray(shape=(2, 2), dtype=int64)}
)
>>> ds.map_batches(lambda arr: arr * 2).take(2) # doctest: +SKIP
[array([[0, 0],
[0, 0]]),
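A side note on why the doctest directives change in this hunk and in the read_json and read_csv hunks below (general doctest behavior, not specific to this commit): option names are case-sensitive and must follow "+" with no intervening space, otherwise doctest rejects the directive as an invalid option when the example is parsed. A minimal illustration:

import doctest

# Minimal illustration of doctest directive parsing; general Python
# behavior, not code from this commit.
good = """
>>> 1 + 1  # doctest: +ELLIPSIS
2
"""
doctest.run_docstring_examples(good, {})  # parses and passes

bad = """
>>> 1 + 1  # doctest: +ellipsis
2
"""
try:
    doctest.run_docstring_examples(bad, {})
except ValueError as err:
    print(err)  # "... has an invalid option: '+ellipsis'"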
@@ -855,8 +856,8 @@ def read_json(
from file paths. If your data adheres to a different partitioning scheme, set
the ``partitioning`` parameter.
>>> ds = ray.data.read_json("example:https://year=2022/month=09/sales.json") # doctest: + SKIP
>>> ds.take(1) # doctest: + SKIP
>>> ds = ray.data.read_json("example:https://year=2022/month=09/sales.json") # doctest: +SKIP
>>> ds.take(1) # doctest: +SKIP
[{'order_number': 10107, 'quantity': 30, 'year': '2022', 'month': '09'}
Args:
@@ -950,8 +951,8 @@ def read_csv(
from file paths. If your data adheres to a different partitioning scheme, set
the ``partitioning`` parameter.
>>> ds = ray.data.read_csv("example:https://year=2022/month=09/sales.csv") # doctest: + SKIP
>>> ds.take(1) # doctest: + SKIP
>>> ds = ray.data.read_csv("example:https://year=2022/month=09/sales.csv") # doctest: +SKIP
>>> ds.take(1) # doctest: +SKIP
[{'order_number': 10107, 'quantity': 30, 'year': '2022', 'month': '09'}]
By default, ``read_csv`` reads all files from file paths. If you want to filter
@@ -1772,20 +1773,52 @@ def from_spark(
@PublicAPI
def from_huggingface(
dataset: Union["datasets.Dataset", "datasets.DatasetDict"],
) -> Union[MaterializedDataset]:
) -> Union[MaterializedDataset, Dict[str, MaterializedDataset]]:
"""Create a dataset from a Hugging Face Datasets Dataset.
This function is not parallelized, and is intended to be used
with Hugging Face Datasets that are loaded into memory (as opposed
to memory-mapped).
Example:
.. doctest::
:options: +ELLIPSIS
>>> import ray
>>> import datasets
>>> hf_dataset = datasets.load_dataset("tweet_eval", "emotion")
Downloading ...
>>> ray_ds = ray.data.from_huggingface(hf_dataset)
>>> ray_ds
{'train': MaterializedDataset(
num_blocks=1,
num_rows=3257,
schema={text: string, label: int64}
), 'test': MaterializedDataset(
num_blocks=1,
num_rows=1421,
schema={text: string, label: int64}
), 'validation': MaterializedDataset(
num_blocks=1,
num_rows=374,
schema={text: string, label: int64}
)}
>>> ray_ds = ray.data.from_huggingface(hf_dataset["train"])
>>> ray_ds
MaterializedDataset(
num_blocks=1,
num_rows=3257,
schema={text: string, label: int64}
)
Args:
dataset: A Hugging Face ``Dataset``, or ``DatasetDict``.
``IterableDataset`` is not supported.
dataset: A Hugging Face Dataset, or DatasetDict. IterableDataset is not
supported.
Returns:
MaterializedDataset holding Arrow records from the Hugging Face Dataset, or a
dict of MaterializedDataset in case ``dataset`` is a ``DatasetDict``.
Dataset holding Arrow records from the Hugging Face Dataset, or a dict of
datasets in case dataset is a DatasetDict.
"""
import datasets

@@ -1797,12 +1830,22 @@ def convert(ds: "datasets.Dataset") -> Dataset:
return ray_ds

if isinstance(dataset, datasets.DatasetDict):
available_keys = list(dataset.keys())
logger.warning(
"You provided a Huggingface DatasetDict which contains multiple "
"datasets. The output of `from_huggingface` is a dictionary of Ray "
"Datasets. To convert just a single Huggingface Dataset to a "
"Ray Dataset, specify a split. For example, "
"`ray.data.from_huggingface(my_dataset_dictionary"
f"['{available_keys[0]}'])`. "
f"Available splits are {available_keys}."
)
return {k: convert(ds) for k, ds in dataset.items()}
elif isinstance(dataset, datasets.Dataset):
return convert(dataset)
else:
raise TypeError(
"`dataset` must be a `datasets.Dataset` or `datasets.DatasetDict`, "
"`dataset` must be a `datasets.Dataset` or `datasets.DatasetDict`."
f"got {type(dataset)}"
)

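Taken together, a hedged usage sketch of the two new code paths, the DatasetDict warning and the TypeError branch (the warning wording is paraphrased from the diff; logging.basicConfig is only one way to surface the message):

import logging
import ray
import datasets

# Sketch, not part of the commit: exercises the new warning and error paths.
logging.basicConfig(level=logging.WARNING)

hf_dataset = datasets.load_dataset("tweet_eval", "emotion")

# Warns that the output is a dictionary of Ray Datasets, lists the available
# splits, and suggests e.g. ray.data.from_huggingface(hf_dataset['train']).
ray_datasets = ray.data.from_huggingface(hf_dataset)

# Converting a single split directly produces no warning.
ray_train = ray.data.from_huggingface(hf_dataset["train"])

# Anything that is neither a Dataset nor a DatasetDict raises TypeError.
try:
    ray.data.from_huggingface(["not", "a", "dataset"])
except TypeError as err:
    print(err)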
