diff --git a/doc/requirements-doc.txt b/doc/requirements-doc.txt
index 05156ea26c2a56..8ea8d767da4e98 100644
--- a/doc/requirements-doc.txt
+++ b/doc/requirements-doc.txt
@@ -5,6 +5,7 @@ accelerate>=0.17.0
 click
 colorama
 colorful
+datasets
 # Newer versions of fairscale do not support Python 3.6 even though they still have wheels for it.
 # Have to manually pin it: https://github.com/facebookresearch/fairscale/issues/962
 fairscale; python_version >= '3.7'
diff --git a/python/ray/data/read_api.py b/python/ray/data/read_api.py
index 384c8998d60e3f..3cf4649c61fea1 100644
--- a/python/ray/data/read_api.py
+++ b/python/ray/data/read_api.py
@@ -267,11 +267,12 @@ def range_tensor(n: int, *, shape: Tuple = (1,), parallelism: int = -1) -> Datas
     Examples:
         >>> import ray
         >>> ds = ray.data.range_tensor(1000, shape=(2, 2))
-        >>> ds # doctest: +ellipsis
+        >>> ds # doctest: +ELLIPSIS
         Dataset(
-            num_blocks=...,
-            num_rows=1000,
-            schema={data: numpy.ndarray(shape=(2, 2), dtype=int64)})
+           num_blocks=...,
+           num_rows=1000,
+           schema={data: numpy.ndarray(shape=(2, 2), dtype=int64)}
+        )
         >>> ds.map_batches(lambda arr: arr * 2).take(2) # doctest: +SKIP
         [array([[0, 0],
                [0, 0]]),
@@ -855,8 +856,8 @@ def read_json(
     from file paths. If your data adheres to a different partitioning scheme, set
     the ``partitioning`` parameter.

-    >>> ds = ray.data.read_json("example://year=2022/month=09/sales.json") # doctest: + SKIP
-    >>> ds.take(1) # doctest: + SKIP
+    >>> ds = ray.data.read_json("example://year=2022/month=09/sales.json") # doctest: +SKIP
+    >>> ds.take(1) # doctest: +SKIP
     [{'order_number': 10107, 'quantity': 30, 'year': '2022', 'month': '09'}]

     Args:
@@ -950,8 +951,8 @@ def read_csv(
     from file paths. If your data adheres to a different partitioning scheme, set
     the ``partitioning`` parameter.

-    >>> ds = ray.data.read_csv("example://year=2022/month=09/sales.csv") # doctest: + SKIP
-    >>> ds.take(1) # doctest: + SKIP
+    >>> ds = ray.data.read_csv("example://year=2022/month=09/sales.csv") # doctest: +SKIP
+    >>> ds.take(1) # doctest: +SKIP
     [{'order_number': 10107, 'quantity': 30, 'year': '2022', 'month': '09'}]

     By default, ``read_csv`` reads all files from file paths. If you want to filter
@@ -1772,20 +1773,52 @@ def from_spark(

 @PublicAPI
 def from_huggingface(
     dataset: Union["datasets.Dataset", "datasets.DatasetDict"],
-) -> Union[MaterializedDataset]:
+) -> Union[MaterializedDataset, Dict[str, MaterializedDataset]]:
     """Create a dataset from a Hugging Face Datasets Dataset.

     This function is not parallelized, and is intended to be used
     with Hugging Face Datasets that are loaded into memory (as opposed
     to memory-mapped).

+    Example:
+
+        .. doctest::
+            :options: +ELLIPSIS
+
+            >>> import ray
+            >>> import datasets
+            >>> hf_dataset = datasets.load_dataset("tweet_eval", "emotion")
+            Downloading ...
+            >>> ray_ds = ray.data.from_huggingface(hf_dataset)
+            >>> ray_ds
+            {'train': MaterializedDataset(
+               num_blocks=1,
+               num_rows=3257,
+               schema={text: string, label: int64}
+            ), 'test': MaterializedDataset(
+               num_blocks=1,
+               num_rows=1421,
+               schema={text: string, label: int64}
+            ), 'validation': MaterializedDataset(
+               num_blocks=1,
+               num_rows=374,
+               schema={text: string, label: int64}
+            )}
+            >>> ray_ds = ray.data.from_huggingface(hf_dataset["train"])
+            >>> ray_ds
+            MaterializedDataset(
+               num_blocks=1,
+               num_rows=3257,
+               schema={text: string, label: int64}
+            )
+
     Args:
-        dataset: A Hugging Face ``Dataset``, or ``DatasetDict``.
-            ``IterableDataset`` is not supported.
+        dataset: A Hugging Face Dataset, or DatasetDict. IterableDataset is not
+            supported.

     Returns:
-        MaterializedDataset holding Arrow records from the Hugging Face Dataset, or a
-        dict of MaterializedDataset in case ``dataset`` is a ``DatasetDict``.
+        Dataset holding Arrow records from the Hugging Face Dataset, or a dict of
+        datasets in case dataset is a DatasetDict.
     """
     import datasets
@@ -1797,12 +1830,22 @@ def convert(ds: "datasets.Dataset") -> Dataset:
         return ray_ds

     if isinstance(dataset, datasets.DatasetDict):
+        available_keys = list(dataset.keys())
+        logger.warning(
+            "You provided a Huggingface DatasetDict which contains multiple "
+            "datasets. The output of `from_huggingface` is a dictionary of Ray "
+            "Datasets. To convert just a single Huggingface Dataset to a "
+            "Ray Dataset, specify a split. For example, "
+            "`ray.data.from_huggingface(my_dataset_dictionary"
+            f"['{available_keys[0]}'])`. "
+            f"Available splits are {available_keys}."
+        )
         return {k: convert(ds) for k, ds in dataset.items()}
     elif isinstance(dataset, datasets.Dataset):
         return convert(dataset)
     else:
         raise TypeError(
-            "`dataset` must be a `datasets.Dataset` or `datasets.DatasetDict`, "
+            "`dataset` must be a `datasets.Dataset` or `datasets.DatasetDict`. "
             f"got {type(dataset)}"
         )
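
A minimal usage sketch of the call patterns this patch documents (assuming `ray` and `datasets` are installed; the `tweet_eval`/`emotion` dataset and the 3257-row train split follow the docstring example above, and `load_dataset` downloads data on first use):

    import ray
    import datasets

    # A DatasetDict converts to a dict of MaterializedDataset keyed by split
    # name; with this change it also logs a warning suggesting a single split.
    hf_splits = datasets.load_dataset("tweet_eval", "emotion")
    ray_splits = ray.data.from_huggingface(hf_splits)
    print(sorted(ray_splits.keys()))  # ['test', 'train', 'validation']

    # Passing a single split returns one MaterializedDataset and no warning.
    train_ds = ray.data.from_huggingface(hf_splits["train"])
    print(train_ds.count())  # 3257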