[Data] Improve docstring and warning message for from_huggingface (ray-project#35206)

Corrects the return type hint, adds a docstring example, and logs a warning message for from_huggingface, addressing confusion raised in user feedback.
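For context, a minimal sketch (not part of the commit) of the behavior the corrected type hint describes. It assumes ray and datasets are installed; the dataset name comes from the docstring example added in this diff.

# Hedged sketch: illustrates the corrected return type
# Union[MaterializedDataset, Dict[str, MaterializedDataset]].
import ray
import datasets

hf_dataset = datasets.load_dataset("tweet_eval", "emotion")  # a DatasetDict

# A DatasetDict converts to a dict of MaterializedDataset keyed by split name.
ray_datasets = ray.data.from_huggingface(hf_dataset)
print(sorted(ray_datasets))  # ['test', 'train', 'validation']

# A single split converts to one MaterializedDataset.
ray_train = ray.data.from_huggingface(hf_dataset["train"])
print(ray_train.count())  # 3257, per the docstring example below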

---------

Signed-off-by: amogkam <[email protected]>
amogkam committed May 15, 2023
1 parent 3ca8632 commit 5a5155c
Showing 2 changed files with 58 additions and 14 deletions.
1 change: 1 addition & 0 deletions doc/requirements-doc.txt
@@ -5,6 +5,7 @@ accelerate>=0.17.0
click
colorama
colorful
datasets
# Newer versions of fairscale do not support Python 3.6 even though they still have wheels for it.
# Have to manually pin it: https://github.com/facebookresearch/fairscale/issues/962
fairscale; python_version >= '3.7'
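The datasets entry is presumably required so the new from_huggingface doctest in read_api.py below can execute during the documentation build (an inference from this diff, not stated in the commit). A hypothetical sanity check for that environment:

import importlib.util

# Hypothetical check: the from_huggingface doctest added below imports
# `datasets` at doc-build time, so it must be importable.
assert importlib.util.find_spec("datasets") is not None, (
    "datasets is missing; the new from_huggingface doctest would fail"
)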
71 changes: 57 additions & 14 deletions python/ray/data/read_api.py
@@ -267,11 +267,12 @@ def range_tensor(n: int, *, shape: Tuple = (1,), parallelism: int = -1) -> Datas
Examples:
>>> import ray
>>> ds = ray.data.range_tensor(1000, shape=(2, 2))
>>> ds # doctest: +ellipsis
>>> ds # doctest: +ELLIPSIS
Dataset(
num_blocks=...,
num_rows=1000,
schema={data: numpy.ndarray(shape=(2, 2), dtype=int64)})
num_blocks=...,
num_rows=1000,
schema={data: numpy.ndarray(shape=(2, 2), dtype=int64)}
)
>>> ds.map_batches(lambda arr: arr * 2).take(2) # doctest: +SKIP
[array([[0, 0],
[0, 0]]),
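A side note on why the doctest directives change in this hunk and in the read_json and read_csv hunks below (general doctest behavior, not specific to this commit): option names are case-sensitive and must follow "+" with no intervening space, otherwise doctest rejects the directive as an invalid option when the example is parsed. A minimal illustration:

import doctest

# Minimal illustration of doctest directive parsing; general Python
# behavior, not code from this commit.
good = """
>>> 1 + 1  # doctest: +ELLIPSIS
2
"""
doctest.run_docstring_examples(good, {})  # parses and passes

bad = """
>>> 1 + 1  # doctest: +ellipsis
2
"""
try:
    doctest.run_docstring_examples(bad, {})
except ValueError as err:
    print(err)  # "... has an invalid option: '+ellipsis'"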
@@ -855,8 +856,8 @@ def read_json(
from file paths. If your data adheres to a different partitioning scheme, set
the ``partitioning`` parameter.
>>> ds = ray.data.read_json("example:https://year=2022/month=09/sales.json") # doctest: + SKIP
>>> ds.take(1) # doctest: + SKIP
>>> ds = ray.data.read_json("example:https://year=2022/month=09/sales.json") # doctest: +SKIP
>>> ds.take(1) # doctest: +SKIP
[{'order_number': 10107, 'quantity': 30, 'year': '2022', 'month': '09'}
Args:
@@ -950,8 +951,8 @@ def read_csv(
from file paths. If your data adheres to a different partitioning scheme, set
the ``partitioning`` parameter.
>>> ds = ray.data.read_csv("example:https://year=2022/month=09/sales.csv") # doctest: + SKIP
>>> ds.take(1) # doctest: + SKIP
>>> ds = ray.data.read_csv("example:https://year=2022/month=09/sales.csv") # doctest: +SKIP
>>> ds.take(1) # doctest: +SKIP
[{'order_number': 10107, 'quantity': 30, 'year': '2022', 'month': '09'}]
By default, ``read_csv`` reads all files from file paths. If you want to filter
@@ -1772,20 +1773,52 @@ def from_spark(
@PublicAPI
def from_huggingface(
dataset: Union["datasets.Dataset", "datasets.DatasetDict"],
) -> Union[MaterializedDataset]:
) -> Union[MaterializedDataset, Dict[str, MaterializedDataset]]:
"""Create a dataset from a Hugging Face Datasets Dataset.
This function is not parallelized, and is intended to be used
with Hugging Face Datasets that are loaded into memory (as opposed
to memory-mapped).
Example:
.. doctest::
:options: +ELLIPSIS
>>> import ray
>>> import datasets
>>> hf_dataset = datasets.load_dataset("tweet_eval", "emotion")
Downloading ...
>>> ray_ds = ray.data.from_huggingface(hf_dataset)
>>> ray_ds
{'train': MaterializedDataset(
num_blocks=1,
num_rows=3257,
schema={text: string, label: int64}
), 'test': MaterializedDataset(
num_blocks=1,
num_rows=1421,
schema={text: string, label: int64}
), 'validation': MaterializedDataset(
num_blocks=1,
num_rows=374,
schema={text: string, label: int64}
)}
>>> ray_ds = ray.data.from_huggingface(hf_dataset["train"])
>>> ray_ds
MaterializedDataset(
num_blocks=1,
num_rows=3257,
schema={text: string, label: int64}
)
Args:
dataset: A Hugging Face ``Dataset``, or ``DatasetDict``.
``IterableDataset`` is not supported.
dataset: A Hugging Face Dataset, or DatasetDict. IterableDataset is not
supported.
Returns:
MaterializedDataset holding Arrow records from the Hugging Face Dataset, or a
dict of MaterializedDataset in case ``dataset`` is a ``DatasetDict``.
Dataset holding Arrow records from the Hugging Face Dataset, or a dict of
datasets in case dataset is a DatasetDict.
"""
import datasets

@@ -1797,12 +1830,22 @@ def convert(ds: "datasets.Dataset") -> Dataset:
return ray_ds

if isinstance(dataset, datasets.DatasetDict):
available_keys = list(dataset.keys())
logger.warning(
"You provided a Huggingface DatasetDict which contains multiple "
"datasets. The output of `from_huggingface` is a dictionary of Ray "
"Datasets. To convert just a single Huggingface Dataset to a "
"Ray Dataset, specify a split. For example, "
"`ray.data.from_huggingface(my_dataset_dictionary"
f"['{available_keys[0]}'])`. "
f"Available splits are {available_keys}."
)
return {k: convert(ds) for k, ds in dataset.items()}
elif isinstance(dataset, datasets.Dataset):
return convert(dataset)
else:
raise TypeError(
"`dataset` must be a `datasets.Dataset` or `datasets.DatasetDict`, "
"`dataset` must be a `datasets.Dataset` or `datasets.DatasetDict`."
f"got {type(dataset)}"
)

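Taken together, a hedged usage sketch of the two new code paths, the DatasetDict warning and the TypeError branch (the warning wording is paraphrased from the diff; logging.basicConfig is only one way to surface the message):

import logging
import ray
import datasets

# Sketch, not part of the commit: exercises the new warning and error paths.
logging.basicConfig(level=logging.WARNING)

hf_dataset = datasets.load_dataset("tweet_eval", "emotion")

# Warns that the output is a dictionary of Ray Datasets, lists the available
# splits, and suggests e.g. ray.data.from_huggingface(hf_dataset['train']).
ray_datasets = ray.data.from_huggingface(hf_dataset)

# Converting a single split directly produces no warning.
ray_train = ray.data.from_huggingface(hf_dataset["train"])

# Anything that is neither a Dataset nor a DatasetDict raises TypeError.
try:
    ray.data.from_huggingface(["not", "a", "dataset"])
except TypeError as err:
    print(err)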
