diff --git a/.buildkite/pipeline.build.yml b/.buildkite/pipeline.build.yml index 5d58c27011597..139ac7196a693 100644 --- a/.buildkite/pipeline.build.yml +++ b/.buildkite/pipeline.build.yml @@ -577,7 +577,7 @@ # Horovod needs to be installed separately (needed for API ref imports) - ./ci/env/install-horovod.sh # See https://stackoverflow.com/questions/63383400/error-cannot-uninstall-ruamel-yaml-while-creating-docker-image-for-azure-ml-a - - pip install mosaicml==0.10.1 --ignore-installed + - pip install mosaicml==0.12.1 --ignore-installed - ./ci/ci.sh build - label: ":octopus: Tune multinode tests" diff --git a/.buildkite/pipeline.ml.yml b/.buildkite/pipeline.ml.yml index 4cdd62ea2b606..a0cc925feab67 100644 --- a/.buildkite/pipeline.ml.yml +++ b/.buildkite/pipeline.ml.yml @@ -287,6 +287,7 @@ conditions: ["NO_WHEELS_REQUIRED", "RAY_CI_PYTHON_AFFECTED", "RAY_CI_DATA_AFFECTED"] commands: - cleanup() { if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then ./ci/build/upload_build_info.sh; fi }; trap cleanup EXIT + - ./ci/env/install-java.sh - DATA_PROCESSING_TESTING=1 ARROW_VERSION=9.* ARROW_MONGO_VERSION=0.5.* ./ci/env/install-dependencies.sh - ./ci/env/env_info.sh - sudo apt-get purge -y mongodb* @@ -302,6 +303,7 @@ instance_size: medium commands: - cleanup() { if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then ./ci/build/upload_build_info.sh; fi }; trap cleanup EXIT + - ./ci/env/install-java.sh - DATA_PROCESSING_TESTING=1 ARROW_VERSION=7.* ./ci/env/install-dependencies.sh - ./ci/env/env_info.sh - bazel test --config=ci $(./ci/run/bazel_export_options) --action_env=RAY_DATASET_USE_STREAMING_EXECUTOR=1 --build_tests_only --test_tag_filters=-dataset_integration python/ray/data/... diff --git a/ci/env/install-dependencies.sh b/ci/env/install-dependencies.sh index 743279c42492a..6b99044207552 100755 --- a/ci/env/install-dependencies.sh +++ b/ci/env/install-dependencies.sh @@ -367,7 +367,6 @@ install_pip_packages() { # Additional Train test dependencies. 
if [ "${TRAIN_TESTING-}" = 1 ] || [ "${DOC_TESTING-}" = 1 ]; then - rm -rf "${SITE_PACKAGES}"/ruamel* # https://stackoverflow.com/questions/63383400/error-cannot-uninstall-ruamel-yaml-while-creating-docker-image-for-azure-ml-a pip install -U -c "${WORKSPACE_DIR}"/python/requirements.txt -r "${WORKSPACE_DIR}"/python/requirements/ml/requirements_train.txt fi diff --git a/cpp/BUILD.bazel b/cpp/BUILD.bazel index 52db9f78bc4dc..d09b9a3f0e240 100644 --- a/cpp/BUILD.bazel +++ b/cpp/BUILD.bazel @@ -21,13 +21,13 @@ cc_binary( linkstatic = 1, visibility = ["//visibility:public"], deps = [ - ":ray_api", + ":ray_api_lib", ":symbols/ray_api_exported_symbols_linux.lds", ], ) cc_library( - name = "ray_api", + name = "ray_api_lib", srcs = glob([ "src/ray/api.cc", "src/ray/api/*.cc", @@ -95,7 +95,7 @@ cc_binary( "@bazel_tools//src/conditions:windows": [ # TODO(SongGuyang): Change to use dynamic library # "ray_cpp_lib" when we make it work on Windows. - "ray_api", + "ray_api_lib", ], "//conditions:default": [ "ray_cpp_lib", @@ -111,7 +111,6 @@ genrule( name = "ray_cpp_pkg", srcs = [ "default_worker", - "ray_api", "libray_api.so", ], outs = ["ray_cpp_pkg.out"], @@ -175,7 +174,7 @@ cc_test( linkstatic = True, tags = ["team:core"], deps = [ - "ray_api", + "ray_api_lib", "@com_google_googletest//:gtest_main", ], ) @@ -205,7 +204,7 @@ cc_test( linkstatic = True, tags = ["team:core"], deps = [ - "ray_api", + "ray_api_lib", "@com_google_googletest//:gtest_main", ], ) @@ -228,7 +227,7 @@ cc_test( linkstatic = True, tags = ["team:core"], deps = [ - "ray_api", + "ray_api_lib", "@com_google_googletest//:gtest_main", ], ) @@ -293,7 +292,7 @@ cc_test( linkstatic = True, tags = ["team:core"], deps = [ - "ray_api", + "ray_api_lib", ], ) @@ -327,7 +326,7 @@ cc_binary( linkstatic = True, tags = ["team:core"], deps = [ - ":ray_api", + ":ray_api_lib", ], ) diff --git a/dashboard/agent.py b/dashboard/agent.py index 94c12a4b10110..f24b74582dfd4 100644 --- a/dashboard/agent.py +++ b/dashboard/agent.py @@ 
-243,7 +243,7 @@ async def _check_parent(): error = True except Exception as e: msg += f"Failed to read Raylet logs at {log_path}: {e}!" - logger.exception() + logger.exception(msg) error = True if error: logger.error(msg) diff --git a/dashboard/state_aggregator.py b/dashboard/state_aggregator.py index c7fa98ce4f10a..56a633b8c01f5 100644 --- a/dashboard/state_aggregator.py +++ b/dashboard/state_aggregator.py @@ -421,7 +421,7 @@ def _to_task_state(task_attempt: dict) -> dict: ], ), (task_attempt, ["task_id", "attempt_number", "job_id"]), - (state_updates, ["node_id", "worker_id"]), + (state_updates, ["node_id", "worker_id", "error_type"]), ] for src, keys in mappings: for key in keys: diff --git a/doc/BUILD b/doc/BUILD index 8417eb6c0e010..d4527dd6bf173 100644 --- a/doc/BUILD +++ b/doc/BUILD @@ -27,6 +27,13 @@ py_test( tags = ["exclusive", "team:ml"] ) +py_test( + name = "tensor", + size = "small", + srcs = ["source/data/doc_code/tensor.py"], + tags = ["exclusive", "team:ml"] +) + py_test( name = "big_data_ingestion", size = "small", diff --git a/doc/source/_toc.yml b/doc/source/_toc.yml index 7889f9cd676a5..fd9dbde9a9825 100644 --- a/doc/source/_toc.yml +++ b/doc/source/_toc.yml @@ -54,6 +54,7 @@ parts: - file: ray-air/examples/index sections: - file: ray-air/examples/torch_image_example + - file: ray-air/examples/torch_detection - file: ray-air/examples/convert_existing_pytorch_code_to_ray_air - file: ray-air/examples/convert_existing_tf_code_to_ray_air - file: ray-air/examples/tfx_tabular_train_to_serve @@ -74,7 +75,10 @@ parts: - file: ray-air/examples/batch_forecasting - file: ray-air/examples/pytorch_resnet_batch_prediction - file: ray-air/examples/stablediffusion_batch_prediction + - file: ray-air/examples/gptj_deepspeed_fine_tuning - file: ray-air/examples/gptj_batch_prediction + - file: ray-air/examples/gptj_serving + - file: ray-air/examples/dreambooth_finetuning - file: ray-air/api/api - file: ray-air/benchmarks diff --git 
a/doc/source/cluster/kubernetes/configs/static-ray-cluster.with-fault-tolerance.yaml b/doc/source/cluster/kubernetes/configs/static-ray-cluster.with-fault-tolerance.yaml index 7619cb0ed250f..3a941bf5f19d5 100644 --- a/doc/source/cluster/kubernetes/configs/static-ray-cluster.with-fault-tolerance.yaml +++ b/doc/source/cluster/kubernetes/configs/static-ray-cluster.with-fault-tolerance.yaml @@ -73,6 +73,7 @@ metadata: labels: app: ray-cluster-head spec: + clusterIP: None ports: - name: client protocol: TCP @@ -198,7 +199,7 @@ spec: imagePullPolicy: Always command: ["/bin/bash", "-c", "--"] args: - - "ray start --num-cpus=$MY_CPU_REQUEST --address=$SERVICE_RAY_CLUSTER_SERVICE_HOST:$SERVICE_RAY_CLUSTER_SERVICE_PORT_GCS_SERVER --object-manager-port=8076 --node-manager-port=8077 --dashboard-agent-grpc-port=8078 --dashboard-agent-listen-port=52365 --block" + - "ray start --num-cpus=$MY_CPU_REQUEST --address=service-ray-cluster:6380 --object-manager-port=8076 --node-manager-port=8077 --dashboard-agent-grpc-port=8078 --dashboard-agent-listen-port=52365 --block" # This volume allocates shared memory for Ray to use for its plasma # object store. If you do not provide this, Ray will fall back to diff --git a/doc/source/data/creating-datasets.rst b/doc/source/data/creating-datasets.rst index 2d1a8a702412c..199e3ea79b0e5 100644 --- a/doc/source/data/creating-datasets.rst +++ b/doc/source/data/creating-datasets.rst @@ -77,9 +77,8 @@ Supported File Formats Read Parquet files into a tabular ``Dataset``. The Parquet data will be read into `Arrow Table `__ blocks. Although this simple example demonstrates reading a single file, note that - Datasets can also read directories of Parquet files, with one tabular block created - per file. For Parquet in particular, we also support reading partitioned Parquet - datasets with partition column values pulled from the file paths. + Datasets can also read directories of Parquet files. 
We also support reading partitioned + Parquet datasets with partition column values pulled from the file paths. .. literalinclude:: ./doc_code/creating_datasets.py :language: python diff --git a/doc/source/data/dask-on-ray.rst b/doc/source/data/dask-on-ray.rst index d13c2bdb7d7a3..587175c68abf6 100644 --- a/doc/source/data/dask-on-ray.rst +++ b/doc/source/data/dask-on-ray.rst @@ -31,8 +31,10 @@ workload. Using the Dask-on-Ray scheduler, the entire Dask ecosystem can be exec * - Ray Version - Dask Version + * - ``2.3.0`` + - ``2022.10.1`` * - ``2.2.0`` - - ``2022.2.0`` + - ``2022.10.1`` * - ``2.1.0`` - ``2022.2.0`` * - ``2.0.0`` diff --git a/doc/source/data/doc_code/tensor.py b/doc/source/data/doc_code/tensor.py index 6b41c6bfbd358..fc2945ba836e3 100644 --- a/doc/source/data/doc_code/tensor.py +++ b/doc/source/data/doc_code/tensor.py @@ -489,17 +489,21 @@ def add_one(batch: Dict[str, Any]) -> Dict[str, Any]: # fmt: off # __create_variable_shaped_tensors_begin___ # Create a Dataset of variable-shaped tensors. -arr = np.array([np.ones((2, 2)), np.ones((3, 3))], dtype=object) -ds = ray.data.from_numpy([arr, arr]) +ragged_array = np.array([np.ones((2, 2)), np.ones((3, 3))], dtype=object) +df = pd.DataFrame({"feature": ragged_array, "label": [1, 1]}) +ds = ray.data.from_pandas([df, df]) # -> Dataset(num_blocks=2, num_rows=4, -# schema={__value__: ArrowVariableShapedTensorType(dtype=double)}) +# schema={feature: TensorDtype(shape=(None, None), dtype=float64), +# label: int64}) ds.take(2) -# -> [array([[1., 1.], -# [1., 1.]]), -# array([[1., 1., 1.], -# [1., 1., 1.], -# [1., 1., 1.]])] +# -> [{'feature': array([[1., 1.], +# [1., 1.]]), +# 'label': 1}, +# {'feature': array([[1., 1., 1.], +# [1., 1., 1.], +# [1., 1., 1.]]), +# 'label': 1}] # __create_variable_shaped_tensors_end__ # fmt: off @@ -507,13 +511,16 @@ def add_one(batch: Dict[str, Any]) -> Dict[str, Any]: # Convert Ray Dataset to a TensorFlow Dataset. 
tf_ds = ds.to_tf( batch_size=2, - output_signature=tf.RaggedTensorSpec(shape=(None, None, None), dtype=tf.float64), + feature_columns="feature", + label_columns="label" ) # Iterate through the tf.RaggedTensors. for ragged_tensor in tf_ds: print(ragged_tensor) -# -> -# +# -> (, +# ) +# (, +# ) # __tf_variable_shaped_tensors_end__ diff --git a/doc/source/data/getting-started.rst b/doc/source/data/getting-started.rst index 6ff45965180be..20839ed0207c8 100644 --- a/doc/source/data/getting-started.rst +++ b/doc/source/data/getting-started.rst @@ -62,7 +62,17 @@ transform datasets. Ray executes transformations in parallel for performance at .. testoutput:: MapBatches(transform_batch) - +- Dataset(num_blocks=1, num_rows=150, schema={sepal length (cm): double, sepal width (cm): double, petal length (cm): double, petal width (cm): double, target: int64}) + +- Dataset( + num_blocks=..., + num_rows=150, + schema={ + sepal length (cm): double, + sepal width (cm): double, + petal length (cm): double, + petal width (cm): double, + target: int64 + } + ) To learn more about transforming datasets, read :ref:`Transforming datasets `. diff --git a/doc/source/data/glossary.rst b/doc/source/data/glossary.rst index 067491ec9cd89..e1de9b0596605 100644 --- a/doc/source/data/glossary.rst +++ b/doc/source/data/glossary.rst @@ -116,7 +116,11 @@ Ray Datasets Glossary >>> import numpy as np >>> import ray >>> ray.data.from_numpy(np.zeros((100, 32, 32, 3))) - Dataset(num_blocks=1, num_rows=100, schema={__value__: ArrowTensorType(shape=(32, 32, 3), dtype=double)}) + Dataset( + num_blocks=1, + num_rows=100, + schema={__value__: ArrowTensorType(shape=(32, 32, 3), dtype=double)} + ) Tabular Dataset A Dataset that represents columnar data. 
@@ -125,7 +129,17 @@ Ray Datasets Glossary >>> import ray >>> ray.data.read_csv("s3://anonymous@air-example-data/iris.csv") - Dataset(num_blocks=1, num_rows=150, schema={sepal length (cm): double, sepal width (cm): double, petal length (cm): double, petal width (cm): double, target: int64}) + Dataset( + num_blocks=1, + num_rows=150, + schema={ + sepal length (cm): double, + sepal width (cm): double, + petal length (cm): double, + petal width (cm): double, + target: int64 + } + ) User-defined function (UDF) A callable that transforms batches or :term:`records ` of data. UDFs let you arbitrarily transform datasets. diff --git a/doc/source/images/detection.jpeg b/doc/source/images/detection.jpeg new file mode 100644 index 0000000000000..1692c91956d8b Binary files /dev/null and b/doc/source/images/detection.jpeg differ diff --git a/doc/source/ray-air/examples/BUILD b/doc/source/ray-air/examples/BUILD index 7e7cba21877fa..ff2f4930cdb59 100644 --- a/doc/source/ray-air/examples/BUILD +++ b/doc/source/ray-air/examples/BUILD @@ -46,8 +46,11 @@ py_test_run_all_notebooks( "feast_example.ipynb", # REGRESSION "upload_to_comet_ml.ipynb", # Needs credentials "upload_to_wandb.ipynb", # Needs credentials + "torch_detection.ipynb", # Requires GPUs "gptj_batch_prediction.ipynb", # Requires GPUs + "gptj_serving.ipynb", # Requires GPUs "stablediffusion_batch_prediction.ipynb", # Requires GPUs + "gptj_deepspeed_fine_tuning.ipynb", # Requires release test ], data = ["//doc/source/ray-air/examples:air_examples"], tags = ["exclusive", "team:ml", "ray_air"], @@ -79,7 +82,9 @@ py_test_run_all_notebooks( include = [ "huggingface_text_classification.ipynb", "pytorch_resnet_batch_prediction.ipynb", + "torch_detection.ipynb", "gptj_batch_prediction.ipynb", + "gptj_serving.ipynb", "stablediffusion_batch_prediction.ipynb", ], exclude = [], diff --git a/doc/source/ray-air/examples/dreambooth_finetuning.rst b/doc/source/ray-air/examples/dreambooth_finetuning.rst new file mode 100644 index 
0000000000000..4c8b58b49264b --- /dev/null +++ b/doc/source/ray-air/examples/dreambooth_finetuning.rst @@ -0,0 +1,183 @@ +Fine-tuning DreamBooth with Ray AIR +=================================== + +.. include:: ../../../../python/ray/air/examples/dreambooth/README.rst + :start-after: section_intro + :end-before: How it works + +How it works +------------ + +This example leverages Ray Data for data loading and Ray Train for distributed training. + +Data loading +^^^^^^^^^^^^ + +.. note:: + You can find the latest version of the code here: `dataset.py `_ + + The latest version might differ slightly from the code presented here. + + +We use Ray Data for data loading. The code has three interesting parts. + +First, we load two datasets using :func:`ray.data.read_images`: + +.. literalinclude:: ../../../../python/ray/air/examples/dreambooth/dataset.py + :language: python + :start-at: instance_dataset = read + :end-at: class_dataset = read + :dedent: 4 + +Then, we tokenize the prompt that generated these images: + +.. literalinclude:: ../../../../python/ray/air/examples/dreambooth/dataset.py + :language: python + :start-at: tokenizer = AutoTokenizer + :end-at: instance_prompt_ids = _tokenize + :dedent: 4 + + +And lastly, we apply a ``torchvision`` preprocessing pipeline to the images: + +.. literalinclude:: ../../../../python/ray/air/examples/dreambooth/dataset.py + :language: python + :start-at: transform = transforms.Compose + :end-at: preprocessor = TorchVisionPreprocessor + :dedent: 4 + +We apply all of this in the final step: + + +.. literalinclude:: ../../../../python/ray/air/examples/dreambooth/dataset.py + :language: python + :start-at: instance_dataset = preprocessor + :end-before: --- + :dedent: 4 + + + +Distributed training +^^^^^^^^^^^^^^^^^^^^ + + +.. note:: + You can find the latest version of the code here: `train.py `_ + + The latest version might differ slightly from the code presented here. 
+ + +The central part of the training code is the *training function*. This function accepts a configuration dict that contains the hyperparameters. It then defines a regular PyTorch training loop. + +There are only a few locations where we interact with the Ray AIR API. We marked them with in-line comments in the snippet below. + +Remember that we want to do data-parallel training for all our models. + + +#. We load the data shard for each worker with session.get_dataset_shard("train") +#. We iterate over the dataset with train_dataset.iter_torch_batches() +#. We report results to Ray AIR with session.report(results) + +The code was compacted for brevity. The `full code `_ is more thoroughly annotated. + + +.. literalinclude:: ../../../../python/ray/air/examples/dreambooth/train.py + :language: python + :start-at: def train_fn(config) + :end-at: session.report(results) + +We can then run this training loop with Ray AIR's TorchTrainer: + + +.. literalinclude:: ../../../../python/ray/air/examples/dreambooth/train.py + :language: python + :start-at: args = train_arguments + :end-at: trainer.fit() + :dedent: 4 + +Configuring the scale +^^^^^^^^^^^^^^^^^^^^^ + +In the TorchTrainer, we can easily configure our scale. +The above example uses the ``num_workers`` argument to specify the number +of workers. This defaults to 2 workers with 2 GPUs each - so 4 GPUs in total. + +To run the example on 8 GPUs, just set the number of workers to 4 using ``--num-workers=4``! +Or you can change the scaling config directly: + +.. code-block:: diff + + scaling_config=ScalingConfig( + use_gpu=True, + - num_workers=args.num_workers, + + num_workers=4, + resources_per_worker={ + "GPU": 2, + }, + ) + +If you're running multi-node training, you should make sure that all nodes have access to a shared +storage (e.g. via NFS or EFS). In the example script below, you can adjust this location with the +``DATA_PREFIX`` environment variable. 
+ +Training throughput +~~~~~~~~~~~~~~~~~~~ + +We ran training using 1, 2, 4, and 8 workers (and 2, 4, 8, and 16 GPUs, respectively) to compare throughput. + +Setup: + +* 2 x g5.12xlarge nodes with 4 A10G GPUs each +* Model as configured below +* Data from this example +* 200 regularization images +* Training for 4 epochs (800 steps) +* Use a mounted External File System to share data between nodes +* 3 runs per configuration + +Because network storage can be slow, we excluded the time it takes to save the final model from the training time. + +We expect that the training time should benefit from scale and decrease when running with +more workers and GPUs. + + +.. image:: images/dreambooth_training.png + :target: images/dreambooth_training.png + :alt: DreamBooth training times + + +.. list-table:: + :header-rows: 1 + + * - Number of workers + - Number of GPUs + - Training time + * - 1 + - 2 + - 458.16 (3.82) + * - 2 + - 4 + - 364.61 (1.65) + * - 4 + - 8 + - 252.37 (3.18) + * - 8 + - 16 + - 160.97 (1.36) + + +While the training time decreases linearly with the number of workers/GPUs, we observe some penalty. +Specifically, with double the number of workers we don't get half of the training time. + +This is most likely due to additional communication between processes and the transfer of large model +weights. We are also only training with a batch size of one because our GPU memory is limited. On larger +GPUs with higher batch sizes we would expect a greater benefit from scaling out. + + +Run the example +--------------- + +.. 
include:: ../../../../python/ray/air/examples/dreambooth/README.rst + :start-after: section_run_example + + diff --git a/doc/source/ray-air/examples/gptj_batch_prediction.ipynb b/doc/source/ray-air/examples/gptj_batch_prediction.ipynb index 9a9cf77b19b88..9ce17b1db9c46 100644 --- a/doc/source/ray-air/examples/gptj_batch_prediction.ipynb +++ b/doc/source/ray-air/examples/gptj_batch_prediction.ipynb @@ -5,14 +5,16 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# GPT-J6-B Batch Prediction with Ray AIR\n", + "# GPT-J-6B Batch Prediction with Ray AIR\n", "\n", - "In this example, we will showcase how to use the Ray AIR for **GPT-J batch inference**. GPT-J is a GPT-2-like causal language model trained on the Pile dataset. This particular model has 6 billion parameters.\n", + "This example showcases how to use the Ray AIR for **GPT-J batch inference**. GPT-J is a GPT-2-like causal language model trained on the Pile dataset. This model has 6 billion parameters. For more information on GPT-J, click [here](https://huggingface.co/docs/transformers/model_doc/gptj).\n", "\n", - "We will use Ray Data to carry out this task and a pretrained model from Hugging Face hub. Note that you can easily adapt this example to use other similar models.\n", + "We use Ray Data and a pretrained model from Hugging Face hub. Note that you can easily adapt this example to use other similar models.\n", "\n", "It is highly recommended to read [Ray AIR Key Concepts](air-key-concepts) and [Ray Data Key Concepts](data_key_concepts) before starting this example.\n", "\n", + "If you are interested in serving (online inference), see {doc}`/ray-air/examples/gptj_serving`.\n", + "\n", "```{note}\n", "In order to run this example, make sure your Ray cluster has access to at least one GPU with 16 or more GBs of memory. 
The amount of memory needed will depend on the model.\n", "```" diff --git a/doc/source/ray-air/examples/gptj_deepspeed_fine_tuning.ipynb b/doc/source/ray-air/examples/gptj_deepspeed_fine_tuning.ipynb new file mode 100644 index 0000000000000..8002a71dc9c8e --- /dev/null +++ b/doc/source/ray-air/examples/gptj_deepspeed_fine_tuning.ipynb @@ -0,0 +1,1148 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# GPT-J-6B Fine-Tuning with Ray AIR and DeepSpeed\n", + "\n", + "In this example, we will showcase how to use the Ray AIR for **GPT-J fine-tuning**. GPT-J is a GPT-2-like causal language model trained on the Pile dataset. This particular model has 6 billion parameters. For more information on GPT-J, click [here](https://huggingface.co/docs/transformers/model_doc/gptj).\n", + "\n", + "We will use Ray AIR (with the 🤗 Transformers integration) and a pretrained model from Hugging Face hub. Note that you can easily adapt this example to use other similar models.\n", + "\n", + "This example focuses more on the performance and distributed computing aspects of Ray AIR. If you are looking for a more beginner friendly introduction to Ray AIR 🤗 Transformers integration, see {doc}`this example `.\n", + "\n", + "It is highly recommended to read [Ray AIR Key Concepts](air-key-concepts) and [Ray Data Key Concepts](data_key_concepts) before starting this example.\n", + "\n", + "```{note}\n", + "In order to run this example, make sure your Ray cluster has access to at least one GPU with 16 or more GBs of memory. The amount of memory needed will depend on the model. This notebook is being tested with 16 g4dn.4xlarge instances.\n", + "```\n", + "\n", + "In this notebook, we will:\n", + "1. [Set up Ray](#setup)\n", + "2. [Load the dataset](#load)\n", + "3. [Preprocess the dataset with Ray AIR](#preprocess)\n", + "4. [Run the training with Ray AIR](#train)\n", + "5. 
[Generate text from prompt with Ray AIR](#predict)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Uncomment and run the following line in order to install all the necessary dependencies (this notebook is being tested with `transformers==4.26.0`):" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "#! pip install \"datasets\" \"evaluate\" \"accelerate>=0.16.0\" \"transformers>=4.26.0\" \"torch>=1.12.0\" \"deepspeed\"" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "import os" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Set up Ray \n", + "\n", + "First, let's set some global variables. We will use 16 workers, each being assigned 1 GPU and 8 CPUs." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "model_name = \"EleutherAI/gpt-j-6B\"\n", + "use_gpu = True\n", + "num_workers = 16\n", + "cpus_per_worker = 8" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We will use `ray.init()` to initialize a local cluster. By default, this cluster will be comprised of only the machine you are running this notebook on. You can also run this notebook on an Anyscale cluster.\n", + "\n", + "We define a {ref}`runtime environment ` to ensure that the Ray workers have access to all the necessary packages. You can omit the `runtime_env` argument if you have all of the packages already installed on each node in your cluster." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "
\n", + "

Ray

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + "\n", + "
Python version:3.8.16
Ray version: 3.0.0.dev0
Dashboard:http://console.anyscale-staging.com/api/v2/sessions/ses_sedlspnpy16naa5lm9kf2cmi2y/services?redirect_to=dashboard
\n", + "
\n", + "
\n" + ], + "text/plain": [ + "RayContext(dashboard_url='console.anyscale-staging.com/api/v2/sessions/ses_sedlspnpy16naa5lm9kf2cmi2y/services?redirect_to=dashboard', python_version='3.8.16', ray_version='3.0.0.dev0', ray_commit='4ddbbb3c4b19c2d27bbf54f8c5ffc100dceafbcf', address_info={'node_ip_address': '10.0.30.196', 'raylet_ip_address': '10.0.30.196', 'redis_address': None, 'object_store_address': '/tmp/ray/session_2023-03-06_15-55-37_997701_162/sockets/plasma_store', 'raylet_socket_name': '/tmp/ray/session_2023-03-06_15-55-37_997701_162/sockets/raylet', 'webui_url': 'console.anyscale-staging.com/api/v2/sessions/ses_sedlspnpy16naa5lm9kf2cmi2y/services?redirect_to=dashboard', 'session_dir': '/tmp/ray/session_2023-03-06_15-55-37_997701_162', 'metrics_export_port': 8085, 'gcs_address': '10.0.30.196:6379', 'address': '10.0.30.196:6379', 'dashboard_agent_listen_port': 52365, 'node_id': '77de483c435bf4987fd6f1e91d47602554e876fd41230d8d50c05333'})" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import ray\n", + "\n", + "ray.init(\n", + " runtime_env={\n", + " \"pip\": [\n", + " \"datasets\",\n", + " \"evaluate\",\n", + " \"accelerate>=0.16.0\",\n", + " \"transformers>=4.26.0\",\n", + " \"torch>=1.12.0\",\n", + " \"deepspeed\",\n", + " ]\n", + " }\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "tags": [ + "hide-cell" + ] + }, + "outputs": [], + "source": [ + "# THIS SHOULD BE HIDDEN IN DOCS AND ONLY RAN IN CI\n", + "# Download the model from our S3 mirror as it's faster\n", + "\n", + "import ray\n", + "import subprocess\n", + "import ray.util.scheduling_strategies\n", + "\n", + "\n", + "def force_on_node(node_id: str, remote_func_or_actor_class):\n", + " scheduling_strategy = ray.util.scheduling_strategies.NodeAffinitySchedulingStrategy(\n", + " node_id=node_id, soft=False\n", + " )\n", + " options = {\"scheduling_strategy\": scheduling_strategy}\n", + " return 
remote_func_or_actor_class.options(**options)\n", + "\n", + "\n", + "def run_on_every_node(remote_func_or_actor_class, **remote_kwargs):\n", + " refs = []\n", + " for node in ray.nodes():\n", + " if node[\"Alive\"] and node[\"Resources\"].get(\"GPU\", None):\n", + " refs.append(\n", + " force_on_node(node[\"NodeID\"], remote_func_or_actor_class).remote(\n", + " **remote_kwargs\n", + " )\n", + " )\n", + " return ray.get(refs)\n", + "\n", + "\n", + "@ray.remote(num_gpus=1)\n", + "def download_model():\n", + " from transformers.utils.hub import TRANSFORMERS_CACHE\n", + "\n", + " path = os.path.expanduser(\n", + " os.path.join(TRANSFORMERS_CACHE, \"models--EleutherAI--gpt-j-6B\")\n", + " )\n", + " subprocess.run([\"mkdir\", \"-p\", os.path.join(path, \"snapshots\", \"main\")])\n", + " subprocess.run([\"mkdir\", \"-p\", os.path.join(path, \"refs\")])\n", + " if os.path.exists(os.path.join(path, \"refs\", \"main\")):\n", + " return\n", + " subprocess.run(\n", + " [\n", + " \"aws\",\n", + " \"s3\",\n", + " \"sync\",\n", + " \"--quiet\",\n", + " \"s3://large-dl-models-mirror/models--EleutherAI--gpt-j-6B/main/\",\n", + " os.path.join(path, \"snapshots\", \"main\"),\n", + " ]\n", + " )\n", + " with open(os.path.join(path, \"snapshots\", \"main\", \"hash\"), \"r\") as f:\n", + " f_hash = f.read().strip()\n", + " with open(os.path.join(path, \"refs\", \"main\"), \"w\") as f:\n", + " f.write(f_hash)\n", + " os.rename(\n", + " os.path.join(path, \"snapshots\", \"main\"), os.path.join(path, \"snapshots\", f_hash)\n", + " )\n", + "\n", + "\n", + "_ = run_on_every_node(download_model)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Loading the dataset \n", + "\n", + "We will be fine-tuning the model on the [`tiny_shakespeare` dataset](https://huggingface.co/datasets/tiny_shakespeare), comprised of 40,000 lines of Shakespeare from a variety of Shakespeare's plays. 
The aim will be to make the GPT-J model better at generating text in the style of Shakespeare." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Loading tiny_shakespeare dataset\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Found cached dataset tiny_shakespeare (/home/ray/.cache/huggingface/datasets/tiny_shakespeare/default/1.0.0/b5b13969f09fe8707337f6cb296314fbe06960bd9a868dca39e713e163d27b5e)\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "65894225f3b84e5caa117c4d08d9f99d", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/3 [00:00 pd.DataFrame:\n", + " text = list(batch[\"text\"])\n", + " flat_text = \"\".join(text)\n", + " split_text = [\n", + " x.strip()\n", + " for x in flat_text.split(\"\\n\")\n", + " if x.strip() and not x.strip()[-1] == \":\"\n", + " ]\n", + " return pd.DataFrame(split_text, columns=[\"text\"])\n", + "\n", + "\n", + "def tokenize(batch: pd.DataFrame) -> dict:\n", + " tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)\n", + " tokenizer.pad_token = tokenizer.eos_token\n", + " ret = tokenizer(\n", + " list(batch[\"text\"]),\n", + " truncation=True,\n", + " max_length=block_size,\n", + " padding=\"max_length\",\n", + " return_tensors=\"np\",\n", + " )\n", + " ret[\"labels\"] = ret[\"input_ids\"].copy()\n", + " return dict(ret)\n", + "\n", + "\n", + "splitter = BatchMapper(split_text, batch_format=\"pandas\")\n", + "tokenizer = BatchMapper(tokenize, batch_format=\"pandas\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Fine-tuning the model with Ray AIR \n", + "\n", + "We can now configure Ray AIR's {class}`~ray.train.huggingface.huggingface_trainer.HuggingFaceTrainer` to perform distributed fine-tuning of the model. 
In order to do that, we specify a `trainer_init_per_worker` function, which creates a 🤗 Transformers `Trainer` that will be distributed by Ray using Distributed Data Parallelism (using PyTorch Distributed backend internally). This means that each worker will have its own copy of the model, but operate on different data. At the end of each step, all the workers will sync gradients.\n", + "\n", + "Because GPT-J is a relatively large model, it may not be possible to fit it on smaller GPU types (<=16 GB GRAM). To deal with that issue, we can use [DeepSpeed](https://github.com/microsoft/DeepSpeed), a library to optimize the training process and allow us to (among other things) offload and partition optimizer and parameter states, reducing GRAM usage. Furthermore, DeepSpeed ZeRO Stage 3 allows us to load large models without running out of memory.\n", + "\n", + "🤗 Transformers and Ray AIR's integration ({class}`~ray.train.huggingface.huggingface_trainer.HuggingFaceTrainer`) allow you to easily configure and use DDP and DeepSpeed. All you need to do is specify the DeepSpeed configuration in the [`TrainingArguments`](https://huggingface.co/docs/transformers/en/main_classes/trainer#transformers.TrainingArguments) object.\n", + "\n", + "```{tip}\n", + "There are many DeepSpeed settings that allow you to trade-off speed for memory usage. The settings used below are tailored to the cluster setup used (16 g4dn.4xlarge nodes) and per device batch size of 16. Some things to keep in mind:\n", + "- If your GPUs support bfloat16, use that instead of float16 mixed precision to get better performance and prevent overflows. 
Replace `fp16=True` with `bf16=True` in `TrainingArguments`.\n", + "- If you are running out of GRAM, try reducing the batch size (defined in the cell below the next one) or set `\"overlap_comm\": False` in the DeepSpeed config.\n", + "- If you are running out of RAM, add more nodes to your cluster, use nodes with more RAM, set `\"pin_memory\": False` in the DeepSpeed config, reduce the batch size, and remove `\"offload_param\"` from the DeepSpeed config.\n", + "\n", + "For more information on DeepSpeed configuration, refer to [Hugging Face documentation](https://huggingface.co/docs/transformers/main_classes/deepspeed) and [DeepSpeed documentation](https://www.deepspeed.ai/docs/config-json/).\n", + "\n", + "Additionally, if you prefer a lower-level API, the logic below can be expressed as an [Accelerate training loop](https://github.com/huggingface/accelerate/blob/main/examples/by_feature/deepspeed_with_config_support.py) distributed by a Ray AIR {class}`~ray.train.torch.torch_trainer.TorchTrainer`.\n", + "```\n", + "\n", + "#### Training speed\n", + "\n", + "As we are using data parallelism, each worker operates on its own shard of the data. The batch size set in `TrainingArguments` is the **per device batch size** (per worker batch size). By changing the number of workers, we can change the **effective batch size** and thus the time needed for training to complete. The effective batch size is then calculated as `per device batch size * number of workers * number of gradient accumulation steps`. As we add more workers, the effective batch size rises and thus we need less time to complete a full epoch. While the speedup is not exactly linear due to extra communication overheads, in many cases it can be close to linear.\n", + "\n", + "The preprocessed dataset has 1348 examples. We have set the per device batch size to 16.\n", + "\n", + "* With 16 g4dn.4xlarge nodes, the effective batch size was 256, which equals 85 steps per epoch. 
One epoch took **~2440 seconds** (including initialization time).\n", + "\n", + "* With 32 g4dn.4xlarge nodes, the effective batch size was 512, which equals 43 steps per epoch. One epoch took **~1280 seconds** (including initialization time)." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "import evaluate\n", + "from transformers import Trainer, TrainingArguments\n", + "from transformers import (\n", + " GPTJForCausalLM,\n", + " AutoTokenizer,\n", + " default_data_collator,\n", + ")\n", + "from transformers.utils.logging import disable_progress_bar, enable_progress_bar\n", + "import torch\n", + "\n", + "from ray.air import session\n", + "\n", + "\n", + "def trainer_init_per_worker(train_dataset, eval_dataset=None, **config):\n", + " # Use the actual number of CPUs assigned by Ray\n", + " os.environ[\"OMP_NUM_THREADS\"] = str(\n", + " session.get_trial_resources().bundles[-1].get(\"CPU\", 1)\n", + " )\n", + " # Enable tf32 for better performance\n", + " torch.backends.cuda.matmul.allow_tf32 = True\n", + "\n", + " batch_size = config.get(\"batch_size\", 4)\n", + " epochs = config.get(\"epochs\", 2)\n", + " warmup_steps = config.get(\"warmup_steps\", 0)\n", + " learning_rate = config.get(\"learning_rate\", 0.00002)\n", + " weight_decay = config.get(\"weight_decay\", 0.01)\n", + "\n", + " deepspeed = {\n", + " \"fp16\": {\n", + " \"enabled\": \"auto\",\n", + " \"initial_scale_power\": 8,\n", + " },\n", + " \"bf16\": {\"enabled\": \"auto\"},\n", + " \"optimizer\": {\n", + " \"type\": \"AdamW\",\n", + " \"params\": {\n", + " \"lr\": \"auto\",\n", + " \"betas\": \"auto\",\n", + " \"eps\": \"auto\",\n", + " },\n", + " },\n", + " \"zero_optimization\": {\n", + " \"stage\": 3,\n", + " \"offload_optimizer\": {\n", + " \"device\": \"cpu\",\n", + " \"pin_memory\": True,\n", + " },\n", + " \"offload_param\": {\n", + " \"device\": \"cpu\",\n", + " \"pin_memory\": True,\n", + " },\n", + " \"overlap_comm\": 
True,\n", + " \"contiguous_gradients\": True,\n", + " \"reduce_bucket_size\": \"auto\",\n", + " \"stage3_prefetch_bucket_size\": \"auto\",\n", + " \"stage3_param_persistence_threshold\": \"auto\",\n", + " \"gather_16bit_weights_on_model_save\": True,\n", + " \"round_robin_gradients\": True,\n", + " },\n", + " \"gradient_accumulation_steps\": \"auto\",\n", + " \"gradient_clipping\": \"auto\",\n", + " \"steps_per_print\": 10,\n", + " \"train_batch_size\": \"auto\",\n", + " \"train_micro_batch_size_per_gpu\": \"auto\",\n", + " \"wall_clock_breakdown\": False,\n", + " }\n", + "\n", + " print(\"Preparing training arguments\")\n", + " training_args = TrainingArguments(\n", + " \"output\",\n", + " per_device_train_batch_size=batch_size,\n", + " logging_steps=1,\n", + " save_strategy=\"no\",\n", + " per_device_eval_batch_size=batch_size,\n", + " learning_rate=learning_rate,\n", + " weight_decay=weight_decay,\n", + " warmup_steps=warmup_steps,\n", + " label_names=[\"input_ids\", \"attention_mask\"],\n", + " num_train_epochs=epochs,\n", + " push_to_hub=False,\n", + " disable_tqdm=True, # declutter the output a little\n", + " fp16=True,\n", + " gradient_checkpointing=True,\n", + " deepspeed=deepspeed,\n", + " )\n", + " disable_progress_bar()\n", + "\n", + " tokenizer = AutoTokenizer.from_pretrained(model_name)\n", + " tokenizer.pad_token = tokenizer.eos_token\n", + "\n", + " print(\"Loading model\")\n", + "\n", + " model = GPTJForCausalLM.from_pretrained(model_name, use_cache=False)\n", + " model.resize_token_embeddings(len(tokenizer))\n", + "\n", + " print(\"Model loaded\")\n", + "\n", + " enable_progress_bar()\n", + "\n", + " metric = evaluate.load(\"accuracy\")\n", + "\n", + " def compute_metrics(eval_pred):\n", + " logits, labels = eval_pred\n", + " predictions = np.argmax(logits, axis=-1)\n", + " return metric.compute(predictions=predictions, references=labels)\n", + "\n", + " trainer = Trainer(\n", + " model=model,\n", + " args=training_args,\n", + " 
train_dataset=train_dataset,\n", + " eval_dataset=eval_dataset,\n", + " compute_metrics=compute_metrics,\n", + " tokenizer=tokenizer,\n", + " data_collator=default_data_collator,\n", + " )\n", + " return trainer" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "With our `trainer_init_per_worker` complete, we can now instantiate the {class}`~ray.train.huggingface.huggingface_trainer.HuggingFaceTrainer`. Aside from the function, we set the `scaling_config`, controlling the number of workers and resources used, and the `datasets` we will use for training and evaluation.\n", + "\n", + "We pass the preprocessors we have defined earlier as an argument, wrapped in a {class}`~ray.data.preprocessors.chain.Chain`. The preprocessor will be included with the returned {class}`~ray.air.checkpoint.Checkpoint`, meaning it will also be applied during inference.\n", + "\n", + "```{note}\n", + "If you want to upload checkpoints to cloud storage (e.g., S3), use {class}`~ray.tune.syncer.SyncConfig` - see {ref}`train-config-sync` for an example. 
Using cloud storage is highly recommended, especially for production.\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "from ray.train.huggingface import HuggingFaceTrainer\n", + "from ray.air.config import ScalingConfig\n", + "from ray.data.preprocessors import Chain\n", + "\n", + "\n", + "trainer = HuggingFaceTrainer(\n", + " trainer_init_per_worker=trainer_init_per_worker,\n", + " trainer_init_config={\n", + " \"batch_size\": 16, # per device\n", + " \"epochs\": 1,\n", + " },\n", + " scaling_config=ScalingConfig(\n", + " num_workers=num_workers,\n", + " use_gpu=use_gpu,\n", + " resources_per_worker={\"GPU\": 1, \"CPU\": cpus_per_worker},\n", + " ),\n", + " datasets={\"train\": ray_datasets[\"train\"], \"evaluation\": ray_datasets[\"validation\"]},\n", + " preprocessor=Chain(splitter, tokenizer),\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Finally, we call the {meth}`~ray.train.huggingface.huggingface_trainer.HuggingFaceTrainer.fit` method to start training with Ray AIR. We will save the {class}`~ray.air.Result` object to a variable so we can access metrics and checkpoints." + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "
\n", + "
\n", + "

Tune Status

\n", + " \n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
Current time:2023-03-06 17:18:41
Running for: 00:43:11.46
Memory: 31.9/62.0 GiB
\n", + "
\n", + "
\n", + "
\n", + "

System Info

\n", + " Using FIFO scheduling algorithm.
Resources requested: 0/256 CPUs, 0/16 GPUs, 0.0/675.29 GiB heap, 0.0/291.99 GiB objects (0.0/16.0 accelerator_type:T4)\n", + "
\n", + " \n", + "
\n", + "
\n", + "
\n", + "

Trial Status

\n", + " \n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
Trial name status loc iter total time (s) loss learning_rate epoch
HuggingFaceTrainer_f623d_00000TERMINATED10.0.30.196:30861 85 2579.30.0715 4.70588e-07 1
\n", + "
\n", + "
\n", + "\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "(RayTrainWorker pid=31281) 2023-03-06 16:36:00,447\tINFO bulk_executor.py:41 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[BatchMapper]\n", + "(RayTrainWorker pid=1964, ip=10.0.26.83) /tmp/ray/session_2023-03-06_15-55-37_997701_162/runtime_resources/py_modules_files/_ray_pkg_f864ba6869d6802c/ray/train/_internal/dataset_iterator.py:64: UserWarning: session.get_dataset_shard returns a ray.data.DatasetIterator instead of a Dataset/DatasetPipeline as of Ray v2.3. Use iter_torch_batches(), to_tf(), or iter_batches() to iterate over one epoch. See https://docs.ray.io/en/latest/data/api/dataset_iterator.html for full DatasetIterator docs.\n", + "(RayTrainWorker pid=1964, ip=10.0.26.83) warnings.warn(\n", + "(RayTrainWorker pid=1964, ip=10.0.26.83) 2023-03-06 16:36:00,453\tINFO bulk_executor.py:41 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[BatchMapper]\n", + "(RayTrainWorker pid=1963, ip=10.0.54.163) /tmp/ray/session_2023-03-06_15-55-37_997701_162/runtime_resources/py_modules_files/_ray_pkg_f864ba6869d6802c/ray/train/_internal/dataset_iterator.py:64: UserWarning: session.get_dataset_shard returns a ray.data.DatasetIterator instead of a Dataset/DatasetPipeline as of Ray v2.3. Use iter_torch_batches(), to_tf(), or iter_batches() to iterate over one epoch. 
See https://docs.ray.io/en/latest/data/api/dataset_iterator.html for full DatasetIterator docs.\n", + "(RayTrainWorker pid=1963, ip=10.0.54.163) warnings.warn(\n", + "(RayTrainWorker pid=1963, ip=10.0.54.163) 2023-03-06 16:36:00,452\tINFO bulk_executor.py:41 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[BatchMapper]\n", + "(RayTrainWorker pid=1954, ip=10.0.15.115) /tmp/ray/session_2023-03-06_15-55-37_997701_162/runtime_resources/py_modules_files/_ray_pkg_f864ba6869d6802c/ray/train/_internal/dataset_iterator.py:64: UserWarning: session.get_dataset_shard returns a ray.data.DatasetIterator instead of a Dataset/DatasetPipeline as of Ray v2.3. Use iter_torch_batches(), to_tf(), or iter_batches() to iterate over one epoch. See https://docs.ray.io/en/latest/data/api/dataset_iterator.html for full DatasetIterator docs.\n", + "(RayTrainWorker pid=1954, ip=10.0.15.115) warnings.warn(\n", + "(RayTrainWorker pid=1954, ip=10.0.15.115) 2023-03-06 16:36:00,452\tINFO bulk_executor.py:41 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[BatchMapper]\n", + "(RayTrainWorker pid=1955, ip=10.0.58.255) /tmp/ray/session_2023-03-06_15-55-37_997701_162/runtime_resources/py_modules_files/_ray_pkg_f864ba6869d6802c/ray/train/_internal/dataset_iterator.py:64: UserWarning: session.get_dataset_shard returns a ray.data.DatasetIterator instead of a Dataset/DatasetPipeline as of Ray v2.3. Use iter_torch_batches(), to_tf(), or iter_batches() to iterate over one epoch. 
See https://docs.ray.io/en/latest/data/api/dataset_iterator.html for full DatasetIterator docs.\n", + "(RayTrainWorker pid=1955, ip=10.0.58.255) warnings.warn(\n", + "(RayTrainWorker pid=1955, ip=10.0.58.255) 2023-03-06 16:36:00,453\tINFO bulk_executor.py:41 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[BatchMapper]\n", + "(RayTrainWorker pid=1942, ip=10.0.57.85) 2023-03-06 16:36:00,452\tINFO bulk_executor.py:41 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[BatchMapper]\n", + "(RayTrainWorker pid=1963, ip=10.0.29.205) 2023-03-06 16:36:00,452\tINFO bulk_executor.py:41 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[BatchMapper]\n", + "(RayTrainWorker pid=1942, ip=10.0.51.113) 2023-03-06 16:36:00,454\tINFO bulk_executor.py:41 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[BatchMapper]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(RayTrainWorker pid=31281) Preparing training arguments\n", + "(RayTrainWorker pid=31281) Loading model\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:37:21,252] [INFO] [partition_parameters.py:415:__exit__] finished initializing model with 6.05B parameters\n", + "(RayTrainWorker pid=31281) Model loaded\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "(RayTrainWorker pid=31281) Using cuda_amp half precision backend\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(RayTrainWorker pid=31281) [2023-03-06 16:38:03,431] [INFO] [logging.py:75:log_dist] [Rank 0] DeepSpeed info: version=0.8.1, git-hash=unknown, git-branch=unknown\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:03,450] [INFO] [logging.py:75:log_dist] [Rank 0] DeepSpeed Flops Profiler Enabled: False\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "(RayTrainWorker pid=31281) ***** Running training *****\n", + "(RayTrainWorker pid=31281) Num examples = 1348\n", + "(RayTrainWorker pid=31281) Num 
Epochs = 1\n", + "(RayTrainWorker pid=31281) Instantaneous batch size per device = 16\n", + "(RayTrainWorker pid=31281) Total train batch size (w. parallel, distributed & accumulation) = 256\n", + "(RayTrainWorker pid=31281) Gradient Accumulation steps = 1\n", + "(RayTrainWorker pid=31281) Total optimization steps = 85\n", + "(RayTrainWorker pid=31281) Number of trainable parameters = 0\n", + "(RayTrainWorker pid=31281) /home/ray/anaconda3/lib/python3.8/site-packages/torch/distributed/distributed_c10d.py:2387: UserWarning: torch.distributed._all_gather_base is a private function and will be deprecated. Please use torch.distributed.all_gather_into_tensor instead.\n", + "(RayTrainWorker pid=31281) warnings.warn(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,024] [INFO] [logging.py:75:log_dist] [Rank 0] DeepSpeed Final Optimizer = adamw\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,024] [INFO] [logging.py:75:log_dist] [Rank 0] DeepSpeed using client callable to create LR scheduler\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,025] [INFO] [logging.py:75:log_dist] [Rank 0] DeepSpeed LR Scheduler = \n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,025] [INFO] [logging.py:75:log_dist] [Rank 0] step=0, skipped=0, lr=[2e-05], mom=[[0.9, 0.999]]\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,025] [INFO] [config.py:1009:print] DeepSpeedEngine configuration:\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,026] [INFO] [config.py:1013:print] activation_checkpointing_config {\n", + "(RayTrainWorker pid=31281) \"partition_activations\": false, \n", + "(RayTrainWorker pid=31281) \"contiguous_memory_optimization\": false, \n", + "(RayTrainWorker pid=31281) \"cpu_checkpointing\": false, \n", + "(RayTrainWorker pid=31281) \"number_checkpoints\": null, \n", + "(RayTrainWorker pid=31281) \"synchronize_checkpoint_boundary\": false, \n", + "(RayTrainWorker pid=31281) 
\"profile\": false\n", + "(RayTrainWorker pid=31281) }\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,026] [INFO] [config.py:1013:print] aio_config ................... {'block_size': 1048576, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': True}\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,026] [INFO] [config.py:1013:print] amp_enabled .................. False\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,026] [INFO] [config.py:1013:print] amp_params ................... False\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,027] [INFO] [config.py:1013:print] autotuning_config ............ {\n", + "(RayTrainWorker pid=31281) \"enabled\": false, \n", + "(RayTrainWorker pid=31281) \"start_step\": null, \n", + "(RayTrainWorker pid=31281) \"end_step\": null, \n", + "(RayTrainWorker pid=31281) \"metric_path\": null, \n", + "(RayTrainWorker pid=31281) \"arg_mappings\": null, \n", + "(RayTrainWorker pid=31281) \"metric\": \"throughput\", \n", + "(RayTrainWorker pid=31281) \"model_info\": null, \n", + "(RayTrainWorker pid=31281) \"results_dir\": \"autotuning_results\", \n", + "(RayTrainWorker pid=31281) \"exps_dir\": \"autotuning_exps\", \n", + "(RayTrainWorker pid=31281) \"overwrite\": true, \n", + "(RayTrainWorker pid=31281) \"fast\": true, \n", + "(RayTrainWorker pid=31281) \"start_profile_step\": 3, \n", + "(RayTrainWorker pid=31281) \"end_profile_step\": 5, \n", + "(RayTrainWorker pid=31281) \"tuner_type\": \"gridsearch\", \n", + "(RayTrainWorker pid=31281) \"tuner_early_stopping\": 5, \n", + "(RayTrainWorker pid=31281) \"tuner_num_trials\": 50, \n", + "(RayTrainWorker pid=31281) \"model_info_path\": null, \n", + "(RayTrainWorker pid=31281) \"mp_size\": 1, \n", + "(RayTrainWorker pid=31281) \"max_train_batch_size\": null, \n", + "(RayTrainWorker pid=31281) \"min_train_batch_size\": 1, \n", + "(RayTrainWorker pid=31281) \"max_train_micro_batch_size_per_gpu\": 1.024000e+03, \n", + "(RayTrainWorker pid=31281) 
\"min_train_micro_batch_size_per_gpu\": 1, \n", + "(RayTrainWorker pid=31281) \"num_tuning_micro_batch_sizes\": 3\n", + "(RayTrainWorker pid=31281) }\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,027] [INFO] [config.py:1013:print] bfloat16_enabled ............. False\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,027] [INFO] [config.py:1013:print] checkpoint_parallel_write_pipeline False\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,027] [INFO] [config.py:1013:print] checkpoint_tag_validation_enabled True\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,027] [INFO] [config.py:1013:print] checkpoint_tag_validation_fail False\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,027] [INFO] [config.py:1013:print] comms_config ................. \n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,027] [INFO] [config.py:1013:print] communication_data_type ...... None\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,027] [INFO] [config.py:1013:print] compression_config ........... 
{'weight_quantization': {'shared_parameters': {'enabled': False, 'quantizer_kernel': False, 'schedule_offset': 0, 'quantize_groups': 1, 'quantize_verbose': False, 'quantization_type': 'symmetric', 'quantize_weight_in_forward': False, 'rounding': 'nearest', 'fp16_mixed_quantize': False, 'quantize_change_ratio': 0.001}, 'different_groups': {}}, 'activation_quantization': {'shared_parameters': {'enabled': False, 'quantization_type': 'symmetric', 'range_calibration': 'dynamic', 'schedule_offset': 1000}, 'different_groups': {}}, 'sparse_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'row_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'head_pruning': {'shared_parameters': {'enabled': False, 'method': 'topk', 'schedule_offset': 1000}, 'different_groups': {}}, 'channel_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'layer_reduction': {'enabled': False}}\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,027] [INFO] [config.py:1013:print] curriculum_enabled_legacy .... False\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,027] [INFO] [config.py:1013:print] curriculum_params_legacy ..... False\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,027] [INFO] [config.py:1013:print] data_efficiency_config ....... {'enabled': False, 'seed': 1234, 'data_sampling': {'enabled': False, 'num_epochs': 1000, 'num_workers': 0, 'curriculum_learning': {'enabled': False}}, 'data_routing': {'enabled': False, 'random_ltd': {'enabled': False, 'layer_token_lr_schedule': {'enabled': False}}}}\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,027] [INFO] [config.py:1013:print] data_efficiency_enabled ...... False\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,027] [INFO] [config.py:1013:print] dataloader_drop_last ......... 
False\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,027] [INFO] [config.py:1013:print] disable_allgather ............ False\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,027] [INFO] [config.py:1013:print] dump_state ................... False\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,027] [INFO] [config.py:1013:print] dynamic_loss_scale_args ...... {'init_scale': 256, 'scale_window': 1000, 'delayed_shift': 2, 'min_scale': 1}\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,027] [INFO] [config.py:1013:print] eigenvalue_enabled ........... False\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,027] [INFO] [config.py:1013:print] eigenvalue_gas_boundary_resolution 1\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,027] [INFO] [config.py:1013:print] eigenvalue_layer_name ........ bert.encoder.layer\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,027] [INFO] [config.py:1013:print] eigenvalue_layer_num ......... 0\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,027] [INFO] [config.py:1013:print] eigenvalue_max_iter .......... 100\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,027] [INFO] [config.py:1013:print] eigenvalue_stability ......... 1e-06\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,027] [INFO] [config.py:1013:print] eigenvalue_tol ............... 0.01\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,027] [INFO] [config.py:1013:print] eigenvalue_verbose ........... False\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,027] [INFO] [config.py:1013:print] elasticity_enabled ........... False\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,027] [INFO] [config.py:1013:print] flops_profiler_config ........ 
{\n", + "(RayTrainWorker pid=31281) \"enabled\": false, \n", + "(RayTrainWorker pid=31281) \"profile_step\": 1, \n", + "(RayTrainWorker pid=31281) \"module_depth\": -1, \n", + "(RayTrainWorker pid=31281) \"top_modules\": 1, \n", + "(RayTrainWorker pid=31281) \"detailed\": true, \n", + "(RayTrainWorker pid=31281) \"output_file\": null\n", + "(RayTrainWorker pid=31281) }\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,027] [INFO] [config.py:1013:print] fp16_auto_cast ............... False\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,027] [INFO] [config.py:1013:print] fp16_enabled ................. True\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,027] [INFO] [config.py:1013:print] fp16_master_weights_and_gradients False\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,027] [INFO] [config.py:1013:print] global_rank .................. 0\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,027] [INFO] [config.py:1013:print] grad_accum_dtype ............. None\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,027] [INFO] [config.py:1013:print] gradient_accumulation_steps .. 1\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,028] [INFO] [config.py:1013:print] gradient_clipping ............ 1.0\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,028] [INFO] [config.py:1013:print] gradient_predivide_factor .... 1.0\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,028] [INFO] [config.py:1013:print] initial_dynamic_scale ........ 256\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,028] [INFO] [config.py:1013:print] load_universal_checkpoint .... False\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,028] [INFO] [config.py:1013:print] loss_scale ................... 0\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,028] [INFO] [config.py:1013:print] memory_breakdown ............. False\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,028] [INFO] [config.py:1013:print] monitor_config ............... 
tensorboard=TensorBoardConfig(enabled=False, output_path='', job_name='DeepSpeedJobName') wandb=WandbConfig(enabled=False, group=None, team=None, project='deepspeed') csv_monitor=CSVConfig(enabled=False, output_path='', job_name='DeepSpeedJobName') enabled=False\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,028] [INFO] [config.py:1013:print] nebula_config ................ {\n", + "(RayTrainWorker pid=31281) \"enabled\": false, \n", + "(RayTrainWorker pid=31281) \"persistent_storage_path\": null, \n", + "(RayTrainWorker pid=31281) \"persistent_time_interval\": 100, \n", + "(RayTrainWorker pid=31281) \"num_of_version_in_retention\": 2, \n", + "(RayTrainWorker pid=31281) \"enable_nebula_load\": true, \n", + "(RayTrainWorker pid=31281) \"load_path\": null\n", + "(RayTrainWorker pid=31281) }\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,028] [INFO] [config.py:1013:print] optimizer_legacy_fusion ...... False\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,028] [INFO] [config.py:1013:print] optimizer_name ............... adamw\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,028] [INFO] [config.py:1013:print] optimizer_params ............. {'lr': 2e-05, 'betas': [0.9, 0.999], 'eps': 1e-08}\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,028] [INFO] [config.py:1013:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0}\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,028] [INFO] [config.py:1013:print] pld_enabled .................. False\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,028] [INFO] [config.py:1013:print] pld_params ................... False\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,028] [INFO] [config.py:1013:print] prescale_gradients ........... False\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,028] [INFO] [config.py:1013:print] scheduler_name ............... 
None\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,028] [INFO] [config.py:1013:print] scheduler_params ............. None\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,028] [INFO] [config.py:1013:print] sparse_attention ............. None\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,028] [INFO] [config.py:1013:print] sparse_gradients_enabled ..... False\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,028] [INFO] [config.py:1013:print] steps_per_print .............. 10\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,028] [INFO] [config.py:1013:print] train_batch_size ............. 256\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,028] [INFO] [config.py:1013:print] train_micro_batch_size_per_gpu 16\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,028] [INFO] [config.py:1013:print] use_node_local_storage ....... False\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,028] [INFO] [config.py:1013:print] wall_clock_breakdown ......... False\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,028] [INFO] [config.py:1013:print] world_size ................... 16\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,028] [INFO] [config.py:1013:print] zero_allow_untested_optimizer False\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,028] [INFO] [config.py:1013:print] zero_config .................. 
stage=3 contiguous_gradients=True reduce_scatter=True reduce_bucket_size=16777216 allgather_partitions=True allgather_bucket_size=500,000,000 overlap_comm=True load_from_fp32_weights=True elastic_checkpoint=False offload_param=DeepSpeedZeroOffloadParamConfig(device='cpu', nvme_path=None, buffer_count=5, buffer_size=100,000,000, max_in_cpu=1,000,000,000, pin_memory=True) offload_optimizer=DeepSpeedZeroOffloadOptimizerConfig(device='cpu', nvme_path=None, buffer_count=4, pin_memory=True, pipeline=False, pipeline_read=False, pipeline_write=False, fast_init=False) sub_group_size=1,000,000,000 cpu_offload_param=None cpu_offload_use_pin_memory=None cpu_offload=None prefetch_bucket_size=15099494 param_persistence_threshold=40960 model_persistence_threshold=sys.maxsize max_live_parameters=1,000,000,000 max_reuse_distance=1,000,000,000 gather_16bit_weights_on_model_save=True stage3_gather_fp16_weights_on_model_save=False ignore_unused_parameters=True legacy_stage1=False round_robin_gradients=True\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,028] [INFO] [config.py:1013:print] zero_enabled ................. True\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,028] [INFO] [config.py:1013:print] zero_optimization_stage ...... 
3\n", + "(RayTrainWorker pid=31281) [2023-03-06 16:38:25,029] [INFO] [config.py:998:print_user_config] json = {\n", + "(RayTrainWorker pid=31281) \"fp16\": {\n", + "(RayTrainWorker pid=31281) \"enabled\": true, \n", + "(RayTrainWorker pid=31281) \"initial_scale_power\": 8\n", + "(RayTrainWorker pid=31281) }, \n", + "(RayTrainWorker pid=31281) \"bf16\": {\n", + "(RayTrainWorker pid=31281) \"enabled\": false\n", + "(RayTrainWorker pid=31281) }, \n", + "(RayTrainWorker pid=31281) \"optimizer\": {\n", + "(RayTrainWorker pid=31281) \"type\": \"AdamW\", \n", + "(RayTrainWorker pid=31281) \"params\": {\n", + "(RayTrainWorker pid=31281) \"lr\": 2e-05, \n", + "(RayTrainWorker pid=31281) \"betas\": [0.9, 0.999], \n", + "(RayTrainWorker pid=31281) \"eps\": 1e-08\n", + "(RayTrainWorker pid=31281) }\n", + "(RayTrainWorker pid=31281) }, \n", + "(RayTrainWorker pid=31281) \"zero_optimization\": {\n", + "(RayTrainWorker pid=31281) \"stage\": 3, \n", + "(RayTrainWorker pid=31281) \"offload_optimizer\": {\n", + "(RayTrainWorker pid=31281) \"device\": \"cpu\", \n", + "(RayTrainWorker pid=31281) \"pin_memory\": true\n", + "(RayTrainWorker pid=31281) }, \n", + "(RayTrainWorker pid=31281) \"offload_param\": {\n", + "(RayTrainWorker pid=31281) \"device\": \"cpu\", \n", + "(RayTrainWorker pid=31281) \"pin_memory\": true\n", + "(RayTrainWorker pid=31281) }, \n", + "(RayTrainWorker pid=31281) \"overlap_comm\": true, \n", + "(RayTrainWorker pid=31281) \"contiguous_gradients\": true, \n", + "(RayTrainWorker pid=31281) \"reduce_bucket_size\": 1.677722e+07, \n", + "(RayTrainWorker pid=31281) \"stage3_prefetch_bucket_size\": 1.509949e+07, \n", + "(RayTrainWorker pid=31281) \"stage3_param_persistence_threshold\": 4.096000e+04, \n", + "(RayTrainWorker pid=31281) \"gather_16bit_weights_on_model_save\": true, \n", + "(RayTrainWorker pid=31281) \"round_robin_gradients\": true\n", + "(RayTrainWorker pid=31281) }, \n", + "(RayTrainWorker pid=31281) \"gradient_accumulation_steps\": 1, \n", + 
"(RayTrainWorker pid=31281) \"gradient_clipping\": 1.0, \n", + "(RayTrainWorker pid=31281) \"steps_per_print\": 10, \n", + "(RayTrainWorker pid=31281) \"train_batch_size\": 256, \n", + "(RayTrainWorker pid=31281) \"train_micro_batch_size_per_gpu\": 16, \n", + "(RayTrainWorker pid=31281) \"wall_clock_breakdown\": false\n", + "(RayTrainWorker pid=31281) }\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "(RayTrainWorker pid=31281) Model weights saved in output/checkpoint-85/pytorch_model.bin\n", + "(RayTrainWorker pid=31281) tokenizer config file saved in output/checkpoint-85/tokenizer_config.json\n", + "(RayTrainWorker pid=31281) Special tokens file saved in output/checkpoint-85/special_tokens_map.json\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(RayTrainWorker pid=31281) [2023-03-06 17:18:13,320] [INFO] [engine.py:3516:save_16bit_model] Saving model weights to output/checkpoint-85/pytorch_model.bin\n", + "(RayTrainWorker pid=31281) [2023-03-06 17:18:13,320] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving output/checkpoint-85/pytorch_model.bin...\n", + "(RayTrainWorker pid=31281) [2023-03-06 17:18:29,075] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved output/checkpoint-85/pytorch_model.bin.\n", + "(RayTrainWorker pid=31281) [2023-03-06 17:18:29,087] [INFO] [logging.py:75:log_dist] [Rank 0] [Torch] Checkpoint global_step85 is begin to save!\n", + "(RayTrainWorker pid=31281) [2023-03-06 17:18:29,109] [INFO] [logging.py:75:log_dist] [Rank 0] Saving model checkpoint: output/checkpoint-85/global_step85/zero_pp_rank_0_mp_rank_00_model_states.pt\n", + "(RayTrainWorker pid=31281) [2023-03-06 17:18:29,109] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving output/checkpoint-85/global_step85/zero_pp_rank_0_mp_rank_00_model_states.pt...\n", + "(RayTrainWorker pid=31281) [2023-03-06 17:18:37,982] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved 
output/checkpoint-85/global_step85/zero_pp_rank_0_mp_rank_00_optim_states.pt.\n", + "(RayTrainWorker pid=31281) [2023-03-06 17:18:37,984] [INFO] [engine.py:3407:_save_zero_checkpoint] zero checkpoint saved output/checkpoint-85/global_step85/zero_pp_rank_0_mp_rank_00_optim_states.pt\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "(RayTrainWorker pid=31281) \n", + "(RayTrainWorker pid=31281) \n", + "(RayTrainWorker pid=31281) Training completed. Do not forget to share your model on huggingface.co/models =)\n", + "(RayTrainWorker pid=31281) \n", + "(RayTrainWorker pid=31281) \n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(RayTrainWorker pid=31281) [2023-03-06 17:18:38,143] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step85 is ready now!\n", + "(RayTrainWorker pid=31281) {'train_runtime': 2413.1243, 'train_samples_per_second': 0.559, 'train_steps_per_second': 0.035, 'train_loss': 0.32492108064539293, 'epoch': 1.0}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2023-03-06 17:18:41,018\tINFO tune.py:825 -- Total run time: 2591.59 seconds (2591.46 seconds for the tuning loop).\n" + ] + } + ], + "source": [ + "results = trainer.fit()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can use the returned {class}`~ray.air.Result` object to access metrics and the Ray AIR {class}`~ray.air.checkpoint.Checkpoint` associated with the last iteration." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "HuggingFaceCheckpoint(local_path=/home/ray/ray_results/HuggingFaceTrainer_2023-03-06_16-35-29/HuggingFaceTrainer_f623d_00000_0_2023-03-06_16-35-30/checkpoint_000000)" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "checkpoint = results.checkpoint\n", + "checkpoint" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Generate text from prompt\n", + "\n", + "We can use the {class}`~ray.train.huggingface.huggingface_predictor.HuggingFacePredictor` to generate predictions from our fine-tuned model.\n", + "\n", + "```{tip}\n", + "For large-scale batch inference, consider configuring cloud checkpointing and then passing the cloud-backed {class}`~ray.air.checkpoint.Checkpoint` to {class}`~ray.train.batch_predictor.BatchPredictor`. More information [here](air-predictors).\n", + "```\n", + "\n", + "Because the {class}`~ray.train.huggingface.huggingface_predictor.HuggingFacePredictor` uses a 🤗 Transformers [`pipeline`](https://huggingface.co/docs/transformers/en/main_classes/pipelines) under the hood, we disable the tokenizer AIR Preprocessor we have used for training and let the `pipeline` tokenize the data itself." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "checkpoint.set_preprocessor(None)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We also set `device_map=\"auto\"` so that the model is automatically placed on the right device and set the `task` to `\"text-generation\"`. The `predict` method passes the arguments to a 🤗 Transformers `pipeline` call." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "from ray.train.huggingface import HuggingFacePredictor\n", + "import pandas as pd\n", + "\n", + "prompts = pd.DataFrame([\"Romeo and Juliet\", \"Romeo\", \"Juliet\"], columns=[\"text\"])\n", + "\n", + "# Predict on the head node.\n", + "predictor = HuggingFacePredictor.from_checkpoint(\n", + " checkpoint=checkpoint,\n", + " task=\"text-generation\",\n", + " torch_dtype=torch.float16 if use_gpu else None,\n", + " device_map=\"auto\",\n", + " use_gpu=use_gpu,\n", + ")\n", + "prediction = predictor.predict(\n", + " prompts,\n", + " do_sample=True,\n", + " temperature=0.9,\n", + " min_length=32,\n", + " max_length=128,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
generated_text
0Romeo and Juliet, they are married: and it is ...
1Romeo, thou art Romeo and a Montague; for only...
2Juliet's name; but I do not sound an ear to na...
\n", + "
" + ], + "text/plain": [ + " generated_text\n", + "0 Romeo and Juliet, they are married: and it is ...\n", + "1 Romeo, thou art Romeo and a Montague; for only...\n", + "2 Juliet's name; but I do not sound an ear to na..." + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "prediction" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.16" + }, + "vscode": { + "interpreter": { + "hash": "3c0d54d489a08ae47a06eae2fd00ff032d6cddb527c382959b7b2575f6a8167f" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/doc/source/ray-air/examples/gptj_serving.ipynb b/doc/source/ray-air/examples/gptj_serving.ipynb new file mode 100644 index 0000000000000..fb84ec5d6a136 --- /dev/null +++ b/doc/source/ray-air/examples/gptj_serving.ipynb @@ -0,0 +1,247 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# GPT-J-6B Serving with Ray AIR\n", + "\n", + "In this example, we will showcase how to use the Ray AIR for **GPT-J serving (online inference)**. GPT-J is a GPT-2-like causal language model trained on the Pile dataset. This particular model has 6 billion parameters. For more information on GPT-J, click [here](https://huggingface.co/docs/transformers/model_doc/gptj).\n", + "\n", + "We will use Ray Serve for online inference and a pretrained model from Hugging Face hub. 
Note that you can easily adapt this example to use other similar models.\n", + "\n", + "It is highly recommended to read [Ray AIR Key Concepts](air-key-concepts) and [Ray Serve Key Concepts](serve-key-concepts) before starting this example.\n", + "\n", + "If you are interested in batch prediction (offline inference), see {doc}`/ray-air/examples/gptj_batch_prediction`.\n", + "\n", + "```{note}\n", + "In order to run this example, make sure your Ray cluster has access to at least one GPU with 16 or more GBs of memory. The amount of memory needed will depend on the model.\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "model_id = \"EleutherAI/gpt-j-6B\"\n", + "revision = \"float16\" # use float16 weights to fit in 16GB GPUs\n", + "prompt = (\n", + " \"In a shocking finding, scientists discovered a herd of unicorns living in a remote, \"\n", + " \"previously unexplored valley, in the Andes Mountains. Even more surprising to the \"\n", + " \"researchers was the fact that the unicorns spoke perfect English.\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import ray" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We define a {ref}`runtime environment ` to ensure that the Ray workers have access to all the necessary packages. You can omit the `runtime_env` argument if you have all of the packages already installed on each node in your cluster." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "ray.init(\n", + " runtime_env={\n", + " \"pip\": [\n", + " \"accelerate>=0.16.0\",\n", + " \"transformers>=4.26.0\",\n", + " \"torch\",\n", + " ]\n", + " }\n", + ")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Setting up basic serving with Ray Serve is very similar to {doc}`batch inference with Ray Data </ray-air/examples/gptj_batch_prediction>`. First, we define a callable class that will serve as the [Serve deployment](serve-key-concepts-deployment). At runtime, a deployment consists of a number of *replicas*, which are individual copies of the class or function that are started in separate Ray Actors (processes). The number of replicas can be scaled up or down (or even autoscaled) to match the incoming request load.\n", + "\n", + "We make sure to set the deployment to use 1 GPU by setting `\"num_gpus\"` in `ray_actor_options`. We load the model in `__init__`, which will allow us to save time by initializing a model just once and then use it to handle multiple requests.\n", + "\n", + "```{tip}\n", + "If you want to use intra-node model parallelism, you can also increase `num_gpus`. As we have created the model with `device_map=\"auto\"`, it will be automatically placed on the correct devices. 
Note that this requires nodes with multiple GPUs.\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "\n", + "from ray import serve\n", + "from starlette.requests import Request\n", + "\n", + "\n", + "@serve.deployment(ray_actor_options={\"num_gpus\": 1})\n", + "class PredictDeployment:\n", + " def __init__(self, model_id: str, revision: str = None):\n", + " from transformers import AutoModelForCausalLM, AutoTokenizer\n", + " import torch\n", + "\n", + " self.model = AutoModelForCausalLM.from_pretrained(\n", + " model_id,\n", + " revision=revision,\n", + " torch_dtype=torch.float16,\n", + " low_cpu_mem_usage=True,\n", + " device_map=\"auto\", # automatically makes use of all GPUs available to the Actor\n", + " )\n", + " self.tokenizer = AutoTokenizer.from_pretrained(model_id)\n", + "\n", + " def generate(self, text: str) -> pd.DataFrame:\n", + " input_ids = self.tokenizer(text, return_tensors=\"pt\").input_ids.to(\n", + " self.model.device\n", + " )\n", + "\n", + " gen_tokens = self.model.generate(\n", + " input_ids,\n", + " do_sample=True,\n", + " temperature=0.9,\n", + " max_length=100,\n", + " )\n", + " return pd.DataFrame(\n", + " self.tokenizer.batch_decode(gen_tokens), columns=[\"responses\"]\n", + " )\n", + "\n", + " async def __call__(self, http_request: Request) -> str:\n", + " json_request: str = await http_request.json()\n", + " prompts = []\n", + " for prompt in json_request:\n", + " text = prompt[\"text\"]\n", + " if isinstance(text, list):\n", + " prompts.extend(text)\n", + " else:\n", + " prompts.append(text)\n", + " return self.generate(prompts)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can now `bind` the deployment with our arguments, and use {meth}`~ray.serve.run` to start it.\n", + "\n", + "```{note}\n", + "If you were running this script outside of a Jupyter notebook, the recommended way is 
to use the [`serve run` CLI command](serve-cli). In this case, you would remove the `serve.run(deployment)` line, and instead start the deployment by calling `serve run FILENAME:deployment`.\n", + "\n", + "For more information, see [Serve Development Workflow](serve-dev-workflow).\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "RayServeSyncHandle(deployment='PredictDeployment')" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "deployment = PredictDeployment.bind(model_id=model_id, revision=revision)\n", + "serve.run(deployment)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's try submitting a request to our deployment. We will use the same prompt as before, and send a POST request. The deployment will generate a response and return it." + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "(ServeReplica:PredictDeployment pid=651, ip=10.0.8.161) The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n", + "(ServeReplica:PredictDeployment pid=651, ip=10.0.8.161) Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[{'responses': 'In a shocking finding, scientists discovered a herd of unicorns living in a remote, previously unexplored valley, in the Andes Mountains. Even more surprising to the researchers was the fact that the unicorns spoke perfect English.\\n\\nThe findings come from a recent expedition to the region of Cordillera del Divisor, in northern Peru. 
The region was previously known to have an unusually high number of native animals.\\n\\n\"Our team was conducting a population census of the region’'}]\n" + ] + } + ], + "source": [ + "import requests\n", + "\n", + "sample_input = {\"text\": prompt}\n", + "\n", + "output = requests.post(\"http://localhost:8000/\", json=[sample_input]).json()\n", + "print(output)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You may notice that we are not using an AIR {class}`Predictor ` here. This is because Predictors are mainly intended to be used with AIR {class}`Checkpoints `, which we don't for this example. See {ref}`air-predictors` for more information and usage examples." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.8.10 (default, Nov 14 2022, 12:59:47) \n[GCC 9.4.0]" + }, + "orig_nbformat": 4, + "vscode": { + "interpreter": { + "hash": "3c0d54d489a08ae47a06eae2fd00ff032d6cddb527c382959b7b2575f6a8167f" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/doc/source/ray-air/examples/images/dreambooth_example.png b/doc/source/ray-air/examples/images/dreambooth_example.png new file mode 120000 index 0000000000000..3b65acc893ea7 --- /dev/null +++ b/doc/source/ray-air/examples/images/dreambooth_example.png @@ -0,0 +1 @@ +../../../../../python/ray/air/examples/dreambooth/images/dreambooth_example.png \ No newline at end of file diff --git a/doc/source/ray-air/examples/images/dreambooth_training.png b/doc/source/ray-air/examples/images/dreambooth_training.png new file mode 120000 index 0000000000000..8aa963b347ce8 --- /dev/null +++ b/doc/source/ray-air/examples/images/dreambooth_training.png @@ -0,0 +1 @@ 
+../../../../../python/ray/air/examples/dreambooth/images/dreambooth_training.png \ No newline at end of file diff --git a/doc/source/ray-air/examples/index.rst b/doc/source/ray-air/examples/index.rst index 2917e588b2c79..252364456b638 100644 --- a/doc/source/ray-air/examples/index.rst +++ b/doc/source/ray-air/examples/index.rst @@ -25,12 +25,16 @@ Text/NLP -------- - :doc:`/ray-air/examples/huggingface_text_classification`: How to use Ray AIR to run Hugging Face Transformers fine-tuning on a text classification task. +- :doc:`/ray-air/examples/gptj_deepspeed_fine_tuning`: How to use Ray AIR to run Hugging Face Transformers with DeepSpeed for fine-tuning a large model. - :doc:`/ray-air/examples/gptj_batch_prediction`: How to use Ray AIR to do batch prediction with the Hugging Face Transformers GPT-J model. +- :doc:`/ray-air/examples/gptj_serving`: How to use Ray AIR to do online serving with the Hugging Face Transformers GPT-J model. +- :doc:`/ray-air/examples/dreambooth_finetuning`: How to fine-tune a DreamBooth text-to-image model with your own images. Image/CV -------- - :doc:`/ray-air/examples/torch_image_example` +- :doc:`/ray-air/examples/torch_detection` - :doc:`/ray-air/examples/pytorch_resnet_batch_prediction` - :doc:`/ray-air/examples/stablediffusion_batch_prediction`: How to use Ray AIR to do batch prediction with the Stable Diffusion text-to-image model. diff --git a/doc/source/ray-air/examples/torch_detection.ipynb b/doc/source/ray-air/examples/torch_detection.ipynb new file mode 100644 index 0000000000000..7b8de7f853f48 --- /dev/null +++ b/doc/source/ray-air/examples/torch_detection.ipynb @@ -0,0 +1,1011 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "da5b9b7e", + "metadata": {}, + "source": [ + "# Fine-tuning a Torch object detection model\n", + "\n", + "This tutorial explains how to fine-tune `fasterrcnn_resnet50_fpn` using the [Ray AI Runtime](air) for parallel data ingest and training.\n", + "\n", + "Here's what you'll do:\n", + "1. 
Load raw images and [VOC-style](http://host.robots.ox.ac.uk/pascal/VOC/) annotations into a Dataset\n", + "2. Fine-tune `fasterrcnn_resnet50_fpn` (the backbone is pre-trained on ImageNet)\n", + "3. Evaluate the model's accuracy\n", + "\n", + "You should be familiar with [PyTorch](https://pytorch.org/) before starting the\n", + "tutorial. If you need a refresher, read PyTorch's\n", + "[training a classifier](https://pytorch.org/tutorials/beginner/blitz/cifar10_tutorial.html)\n", + "tutorial.\n", + "\n", + "## Before you begin" + ] + }, + { + "cell_type": "markdown", + "id": "e9a6d043", + "metadata": {}, + "source": [ + "* Install the [Ray AI Runtime](air)." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "2d3ae999", + "metadata": { + "vscode": { + "languageId": "shellscript" + } + }, + "outputs": [], + "source": [ + "!pip install 'ray[air]'" + ] + }, + { + "cell_type": "markdown", + "id": "9b3d4302", + "metadata": {}, + "source": [ + "* Install `torch`, `torchmetrics`, `torchvision`, and `xmltodict`." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "8251d9d9", + "metadata": { + "vscode": { + "languageId": "shellscript" + } + }, + "outputs": [], + "source": [ + "!pip install torch torchmetrics torchvision xmltodict" + ] + }, + { + "cell_type": "markdown", + "id": "65bf13b8", + "metadata": {}, + "source": [ + "## Create a `Dataset`\n", + "\n", + "You'll work with a subset of [Pascal VOC](http://host.robots.ox.ac.uk/pascal/VOC/) that contains cats and dogs (the full dataset has 20 classes)." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "10df1b5d", + "metadata": {}, + "outputs": [], + "source": [ + "CLASS_TO_LABEL = {\n", + " \"background\": 0,\n", + " \"cat\": 1,\n", + " \"dog\": 2,\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "5567a6d6", + "metadata": {}, + "source": [ + "The dataset contains two subdirectories: `JPEGImages` and `Annotations`. 
`JPEGImages` contains raw images, and\n", + "`Annotations` contains XML annotations.\n", + "\n", + "```\n", + "AnimalDetection\n", + "├── Annotations\n", + "│ ├── 2007_000063.xml\n", + "│ ├── 2007_000528.xml\n", + "│ └── ...\n", + "└── JPEGImages\n", + " ├── 2007_000063.jpg\n", + " ├── 2007_000528.jpg\n", + " └── ...\n", + "```" + ] + }, + { + "cell_type": "markdown", + "id": "f821e93d", + "metadata": {}, + "source": [ + "### Define a custom datasource\n", + "\n", + "Each annotation describes the objects in an image.\n", + "\n", + "For example, view this image of a dog:" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "a29845a5", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import io\n", + "\n", + "from PIL import Image\n", + "import requests\n", + "\n", + "response = requests.get(\"https://s3-us-west-2.amazonaws.com/air-example-data/AnimalDetection/JPEGImages/2007_000063.jpg\")\n", + "image = Image.open(io.BytesIO(response.content))\n", + "image" + ] + }, + { + "cell_type": "markdown", + "id": "b8ab2cf1", + "metadata": {}, + "source": [ + "Then, print the image's annotation:" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "ee5e074a", + "metadata": { + "vscode": { + "languageId": "shellscript" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\n", + "\tVOC2012\n", + "\t2007_000063.jpg\n", + "\t\n", + "\t\tThe VOC2007 Database\n", + "\t\tPASCAL VOC2007\n", + "\t\tflickr\n", + "\t\n", + "\t\n", + "\t\t500\n", + "\t\t375\n", + "\t\t3\n", + "\t\n", + "\t1\n", + "\t\n", + "\t\tdog\n", + "\t\tUnspecified\n", + "\t\t0\n", + "\t\t0\n", + "\t\t\n", + "\t\t\t123\n", + "\t\t\t115\n", + "\t\t\t379\n", + "\t\t\t275\n", + "\t\t\n", + "\t\n", + "" + ] + } + ], + "source": [ + "!curl 
\"https://s3-us-west-2.amazonaws.com/air-example-data/AnimalDetection/Annotations/2007_000063.xml\"" + ] + }, + { + "cell_type": "markdown", + "id": "686f0885", + "metadata": {}, + "source": [ + "Notice how there's one object labeled \"dog\":\n", + "\n", + "```\n", + "<name>dog</name>\n", + "<pose>Unspecified</pose>\n", + "<truncated>0</truncated>\n", + "<difficult>0</difficult>\n", + "<bndbox>\n", + " <xmin>123</xmin>\n", + " <ymin>115</ymin>\n", + " <xmax>379</xmax>\n", + " <ymax>275</ymax>\n", + "</bndbox>\n", + "```\n", + "\n", + "[Ray Datasets](datasets) lets you read and preprocess data in parallel. Datasets doesn't\n", + "have built-in support for VOC-style annotations, so you'll need to define a custom\n", + "datasource.\n", + "\n", + "A Datasource is an object that reads data of a particular type. For example, Datasets\n", + "implements a Datasource that reads CSV files. Your datasource will parse labels and\n", + "bounding boxes from XML files. Later, you'll read the corresponding images.\n", + "\n", + "To implement the datasource, extend the built-in `FileBasedDatasource` class\n", + "and override the `_read_file` method." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "0a4b8820", + "metadata": {}, + "outputs": [], + "source": [ + "from typing import List, Tuple\n", + "\n", + "import xmltodict\n", + "import pandas as pd\n", + "import pyarrow as pa\n", + "\n", + "from ray.data.datasource import FileBasedDatasource\n", + "from ray.data.extensions import TensorArray\n", + "\n", + "\n", + "class VOCAnnotationDatasource(FileBasedDatasource):\n", + " def _read_file(self, f: pa.NativeFile, path: str, **reader_args) -> pd.DataFrame:\n", + " text = f.read().decode(\"utf-8\")\n", + " annotation = xmltodict.parse(text)[\"annotation\"]\n", + "\n", + " objects = annotation[\"object\"]\n", + " # If there's one object, `objects` is a `dict`; otherwise, it's a `list[dict]`.\n", + " if isinstance(objects, dict):\n", + " objects = [objects]\n", + "\n", + " boxes: List[Tuple] = []\n", + " for obj in objects:\n", + " x1 = float(obj[\"bndbox\"][\"xmin\"])\n", + " y1 = float(obj[\"bndbox\"][\"ymin\"])\n", + " x2 = float(obj[\"bndbox\"][\"xmax\"])\n", + " y2 = float(obj[\"bndbox\"][\"ymax\"])\n", + " boxes.append((x1, y1, x2, y2))\n", + "\n", + " labels: List[int] = [CLASS_TO_LABEL[obj[\"name\"]] for obj in objects]\n", + "\n", + " filename = annotation[\"filename\"]\n", + "\n", + " return pd.DataFrame(\n", + " {\n", + " \"boxes\": TensorArray([boxes]),\n", + " \"labels\": TensorArray([labels]),\n", + " \"filename\": [filename],\n", + " }\n", + " )\n", + "\n", + " def _rows_per_file(self):\n", + " return 1" + ] + }, + { + "cell_type": "markdown", + "id": "10d6ed44", + "metadata": {}, + "source": [ + "### Read annotations\n", + "\n", + "To load the annotations into a `Dataset`, call `ray.data.read_datasource` and pass\n", + "the custom datasource to the constructor. Ray will read the annotations in parallel." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "0a4717e2", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "find: ‘.git’: No such file or directory\n", + "2023-03-01 13:05:51,314\tINFO worker.py:1360 -- Connecting to existing Ray cluster at address: 10.0.26.109:6379...\n", + "2023-03-01 13:05:51,327\tINFO worker.py:1548 -- Connected to Ray cluster. View the dashboard at \u001b[1m\u001b[32mhttps://console.anyscale-staging.com/api/v2/sessions/ses_mf1limh36cs2yrh9wkf6h2a75k/services?redirect_to=dashboard \u001b[39m\u001b[22m\n", + "2023-03-01 13:05:52,269\tINFO packaging.py:330 -- Pushing file package 'gcs://_ray_pkg_00aff5a3a84ab6438be1961b97a5beaa.zip' (266.32MiB) to Ray cluster...\n", + "2023-03-01 13:05:58,529\tINFO packaging.py:343 -- Successfully pushed file package 'gcs://_ray_pkg_00aff5a3a84ab6438be1961b97a5beaa.zip'.\n" + ] + } + ], + "source": [ + "import os\n", + "import ray\n", + "\n", + "\n", + "annotations: ray.data.Dataset = ray.data.read_datasource(\n", + " VOCAnnotationDatasource(), paths=\"s3://anonymous@air-example-data/AnimalDetection/Annotations\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "db3d0ee6", + "metadata": {}, + "source": [ + "Look at the first two samples. `VOCAnnotationDatasource` should've correctly parsed\n", + "labels and bounding boxes." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "e0039edf", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[{'boxes': array([[123., 115., 379., 275.]]),\n", + " 'labels': 2,\n", + " 'filename': '2007_000063.jpg'},\n", + " {'boxes': array([[124., 68., 319., 310.]]),\n", + " 'labels': 1,\n", + " 'filename': '2007_000528.jpg'}]" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "annotations.take(2)" + ] + }, + { + "cell_type": "markdown", + "id": "5ff0097f", + "metadata": {}, + "source": [ + "### Load images into memory" + ] + }, + { + "cell_type": "markdown", + "id": "87846ae1", + "metadata": {}, + "source": [ + "Each row of `annotations` contains the filename of an image.\n", + "\n", + "Write a user-defined function that loads these images. For each annotation, your function will:\n", + "1. Open the image associated with the annotation.\n", + "2. Add the image to a new `\"image\"` column." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "494c71d6", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2023-03-01 13:06:08,005\tINFO bulk_executor.py:41 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[read->MapBatches(read_images)]\n", + "read->MapBatches(read_images): 100%|██████████| 128/128 [00:24<00:00, 5.25it/s]\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "016ad4fe729a4949bf6f59153b039ec7", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "VBox(children=(HTML(value='

Dataset

'), Tab(children=(HTML(value='
Dict[str, np.ndarray]:\n", + " images: List[np.ndarray] = []\n", + " for filename in batch[\"filename\"]:\n", + " url = os.path.join(\"https://s3-us-west-2.amazonaws.com/air-example-data/AnimalDetection/JPEGImages\", filename)\n", + " response = requests.get(url)\n", + " image = Image.open(io.BytesIO(response.content))\n", + " images.append(np.array(image))\n", + " batch[\"image\"] = np.array(images, dtype=object)\n", + " return batch\n", + "\n", + "\n", + "dataset = annotations.map_batches(read_images)\n", + "dataset" + ] + }, + { + "cell_type": "markdown", + "id": "e7cdc755", + "metadata": {}, + "source": [ + "### Split the dataset into train and test sets" + ] + }, + { + "cell_type": "markdown", + "id": "9cfddd49", + "metadata": {}, + "source": [ + "Once you've created a `Dataset`, split the dataset into train and test sets." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "f92ee5c1", + "metadata": {}, + "outputs": [], + "source": [ + "train_dataset, test_dataset = dataset.train_test_split(0.2)" + ] + }, + { + "cell_type": "markdown", + "id": "6b68209a", + "metadata": {}, + "source": [ + "## Define preprocessing logic" + ] + }, + { + "cell_type": "markdown", + "id": "9dbea4b4", + "metadata": {}, + "source": [ + "A `Preprocessor` is an object that defines preprocessing logic. It's the standard way\n", + "to preprocess data with Ray.\n", + "\n", + "Create two preprocessors: one to transpose and scale images (`ToTensor`), and another to\n", + "randomly augment images every epoch (`RandomHorizontalFlip`). 
You'll later pass these\n", + "preprocessors to a `Trainer`.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "bbba448e", + "metadata": {}, + "outputs": [], + "source": [ + "from torchvision import transforms\n", + "\n", + "from ray.data.preprocessors import TorchVisionPreprocessor\n", + "\n", + "transform = transforms.ToTensor()\n", + "preprocessor = TorchVisionPreprocessor(columns=[\"image\"], transform=transform)\n", + "\n", + "per_epoch_transform = transforms.RandomHorizontalFlip(p=0.5)\n", + "per_epoch_preprocessor = TorchVisionPreprocessor(columns=[\"image\"], transform=per_epoch_transform)" + ] + }, + { + "cell_type": "markdown", + "id": "1c647be8", + "metadata": {}, + "source": [ + "## Fine-tune the object detection model\n", + "\n", + "### Define the training loop\n", + "\n", + "Write a function that trains `fasterrcnn_resnet50_fpn`. Your code will look like\n", + "standard Torch code with a few changes.\n", + "\n", + "Here are a few things to point out:\n", + "1. Distribute the model with `ray.train.torch.prepare_model`. Don't use `DistributedDataParallel`.\n", + "2. Pass your Dataset to the Trainer. The Trainer automatically shards the data across workers.\n", + "3. Iterate over data with `DatasetIterator.iter_batches`. Don't use a Torch `DataLoader`.\n", + "4. Pass preprocessors to the Trainer.\n", + "\n", + "In addition, report metrics and checkpoints with `session.report`. `session.report` tracks these metrics in Ray AIR's internal bookkeeping, allowing you to monitor training and analyze training runs after they've finished." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "44ec65fc", + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "from torchvision import models\n", + "\n", + "from ray.air import Checkpoint\n", + "from ray.air import session\n", + "\n", + "\n", + "def train_one_epoch(*, model, optimizer, batch_size, epoch):\n", + " model.train()\n", + "\n", + " lr_scheduler = None\n", + " if epoch == 0:\n", + " warmup_factor = 1.0 / 1000\n", + " lr_scheduler = torch.optim.lr_scheduler.LinearLR(\n", + " optimizer, start_factor=warmup_factor, total_iters=250\n", + " )\n", + "\n", + " device = ray.train.torch.get_device()\n", + " train_dataset_shard = session.get_dataset_shard(\"train\")\n", + "\n", + " batches = train_dataset_shard.iter_batches(batch_size=batch_size)\n", + " for batch in batches:\n", + " inputs = [torch.as_tensor(image).to(device) for image in batch[\"image\"]]\n", + " targets = [\n", + " {\n", + " \"boxes\": torch.as_tensor(boxes).to(device),\n", + " \"labels\": torch.as_tensor(labels).to(device),\n", + " }\n", + " for boxes, labels in zip(batch[\"boxes\"], batch[\"labels\"])\n", + " ]\n", + " loss_dict = model(inputs, targets)\n", + " losses = sum(loss for loss in loss_dict.values())\n", + "\n", + " optimizer.zero_grad()\n", + " losses.backward()\n", + " optimizer.step()\n", + "\n", + " if lr_scheduler is not None:\n", + " lr_scheduler.step()\n", + "\n", + " session.report(\n", + " {\n", + " \"losses\": losses.item(),\n", + " \"epoch\": epoch,\n", + " \"lr\": optimizer.param_groups[0][\"lr\"],\n", + " **{key: value.item() for key, value in loss_dict.items()},\n", + " }\n", + " )\n", + "\n", + "\n", + "def train_loop_per_worker(config):\n", + " # By default, `fasterrcnn_resnet50_fpn`'s backbone is pre-trained on ImageNet.\n", + " model = models.detection.fasterrcnn_resnet50_fpn(num_classes=3)\n", + " model = ray.train.torch.prepare_model(model)\n", + " parameters = [p for p in model.parameters() if p.requires_grad]\n", + " 
optimizer = torch.optim.SGD(\n", + " parameters,\n", + " lr=config[\"lr\"],\n", + " momentum=config[\"momentum\"],\n", + " weight_decay=config[\"weight_decay\"],\n", + " )\n", + " lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(\n", + " optimizer, milestones=config[\"lr_steps\"], gamma=config[\"lr_gamma\"]\n", + " )\n", + "\n", + " for epoch in range(0, config[\"epochs\"]):\n", + " train_one_epoch(\n", + " model=model,\n", + " optimizer=optimizer,\n", + " batch_size=config[\"batch_size\"],\n", + " epoch=epoch,\n", + " )\n", + " lr_scheduler.step()\n", + " checkpoint = Checkpoint.from_dict(\n", + " {\n", + " \"model\": model.module.state_dict(),\n", + " \"optimizer\": optimizer.state_dict(),\n", + " \"lr_scheduler\": lr_scheduler.state_dict(),\n", + " \"config\": config,\n", + " \"epoch\": epoch,\n", + " }\n", + " )\n", + " session.report({}, checkpoint=checkpoint)" + ] + }, + { + "cell_type": "markdown", + "id": "0d68c97c", + "metadata": {}, + "source": [ + "### Fine-tune the model" + ] + }, + { + "cell_type": "markdown", + "id": "eef58891", + "metadata": {}, + "source": [ + "Once you've defined the training loop, create a `TorchTrainer` and pass the training\n", + "loop to the constructor. Then, call `TorchTrainer.fit` to train the model." + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "06a59e9b", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2023-03-01 13:06:39,486\tINFO instantiator.py:21 -- Created a temporary directory at /tmp/tmp1stz0z_r\n", + "2023-03-01 13:06:39,488\tINFO instantiator.py:76 -- Writing /tmp/tmp1stz0z_r/_remote_module_non_scriptable.py\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "
\n", + "
\n", + "

Tune Status

\n", + " \n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
Current time:2023-03-01 13:08:45
Running for: 00:02:05.37
Memory: 50.5/480.2 GiB
\n", + "
\n", + "
\n", + "
\n", + "

System Info

\n", + " Using FIFO scheduling algorithm.
Resources requested: 0/64 CPUs, 0/8 GPUs, 0.0/324.83 GiB heap, 0.0/143.21 GiB objects (0.0/1.0 accelerator_type:V100)\n", + "
\n", + " \n", + "
\n", + "
\n", + "
\n", + "

Trial Status

\n", + " \n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
Trial name status loc iter total time (s)
TorchTrainer_f5aa9_00000TERMINATED10.0.26.109:175347 244 108.703
\n", + "
\n", + "
\n", + "\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "(RayTrainWorker pid=175611) 2023-03-01 13:06:56,331\tINFO config.py:86 -- Setting up process group for: env:// [rank=0, world_size=4]\n", + "(TorchTrainer pid=175347) 2023-03-01 13:07:00,615\tINFO bulk_executor.py:41 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[TorchVisionPreprocessor] -> AllToAllOperator[randomize_block_order]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(autoscaler +1m25s) Tip: use `ray status` to view detailed cluster status. To disable these messages, set RAY_SCHEDULER_EVENTS=0.\n", + "(autoscaler +1m25s) Warning: The following resource request cannot be scheduled right now: {'CPU': 1.0}. This is likely due to all cluster resources being claimed by actors. Consider creating fewer actors or adding more nodes to this Ray cluster.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "(TorchTrainer pid=175347) /home/ray/anaconda3/lib/python3.9/site-packages/ray/train/_internal/dataset_iterator.py:64: UserWarning: session.get_dataset_shard returns a ray.data.DatasetIterator instead of a Dataset/DatasetPipeline as of Ray v2.3. Use iter_torch_batches(), to_tf(), or iter_batches() to iterate over one epoch. See https://docs.ray.io/en/latest/data/api/dataset_iterator.html for full DatasetIterator docs.\n", + "(TorchTrainer pid=175347) warnings.warn(\n", + "Stage 0: 0%| | 0/1 [00:00 TaskPoolMapOperator[TorchVisionPreprocessor]\n", + "(PipelineSplitExecutorCoordinator pid=191352) \n", + "Stage 0: : 2it [00:08, 4.31s/it] 2023-03-01 13:07:33,990\tINFO bulk_executor.py:41 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[TorchVisionPreprocessor]\n", + "(RayTrainWorker pid=175612) 2023-03-01 13:07:34,394\tWARNING plan.py:527 -- Warning: The Ray cluster currently does not have any available CPUs. 
The Dataset job will hang unless more CPUs are freed up. A common reason is that cluster resources are used by Actors or Tune trials; see the following link for more details: https://docs.ray.io/en/master/data/dataset-internals.html#datasets-and-tune\n", + "(PipelineSplitExecutorCoordinator pid=191352) \n", + "Stage 0: : 3it [00:13, 4.48s/it]2023-03-01 13:07:38,660\tINFO bulk_executor.py:41 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[TorchVisionPreprocessor]\n", + "(RayTrainWorker pid=175612) /tmp/ipykernel_160001/3839218723.py:23: UserWarning: The given NumPy array is not writable, and PyTorch does not support non-writable tensors. This means writing to this tensor will result in undefined behavior. You may want to copy the array to protect its data or make it writable before converting it to a tensor. This type of warning will be suppressed for the rest of this program. (Triggered internally at ../torch/csrc/utils/tensor_numpy.cpp:199.)\n", + "(RayTrainWorker pid=175614) /tmp/ipykernel_160001/3839218723.py:26: UserWarning: The given NumPy array is not writable, and PyTorch does not support non-writable tensors. This means writing to this tensor will result in undefined behavior. You may want to copy the array to protect its data or make it writable before converting it to a tensor. This type of warning will be suppressed for the rest of this program. (Triggered internally at ../torch/csrc/utils/tensor_numpy.cpp:199.)\n", + "(RayTrainWorker pid=175611) /tmp/ipykernel_160001/3839218723.py:26: UserWarning: The given NumPy array is not writable, and PyTorch does not support non-writable tensors. This means writing to this tensor will result in undefined behavior. You may want to copy the array to protect its data or make it writable before converting it to a tensor. This type of warning will be suppressed for the rest of this program. 
(Triggered internally at ../torch/csrc/utils/tensor_numpy.cpp:199.)\n", + "(RayTrainWorker pid=175613) /tmp/ipykernel_160001/3839218723.py:23: UserWarning: The given NumPy array is not writable, and PyTorch does not support non-writable tensors. This means writing to this tensor will result in undefined behavior. You may want to copy the array to protect its data or make it writable before converting it to a tensor. This type of warning will be suppressed for the rest of this program. (Triggered internally at ../torch/csrc/utils/tensor_numpy.cpp:199.)\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "

Trial Progress

\n", + " \n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
Trial name date done experiment_taghostname iterations_since_restorenode_ip pidshould_checkpoint time_since_restore time_this_iter_s time_total_s timestamp training_iterationtrial_id
TorchTrainer_f5aa9_000002023-03-01_13-08-41True 0ip-10-0-26-109 24410.0.26.109175347True 108.703 4.2088 108.703 1677704918 244f5aa9_00000
\n", + "
\n", + "\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "(RayTrainWorker pid=175612) 2023-03-01 13:07:41,980\tINFO distributed.py:1027 -- Reducer buckets have been rebuilt in this iteration.\n", + "(PipelineSplitExecutorCoordinator pid=191352) \n", + "Stage 0: : 4it [01:11, 25.77s/it]2023-03-01 13:08:37,068\tINFO bulk_executor.py:41 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[TorchVisionPreprocessor]\n", + "(RayTrainWorker pid=175614) 2023-03-01 13:08:37,464\tWARNING plan.py:527 -- Warning: The Ray cluster currently does not have any available CPUs. The Dataset job will hang unless more CPUs are freed up. A common reason is that cluster resources are used by Actors or Tune trials; see the following link for more details: https://docs.ray.io/en/master/data/dataset-internals.html#datasets-and-tune\n", + "2023-03-01 13:08:45,074\tINFO tune.py:825 -- Total run time: 125.51 seconds (125.36 seconds for the tuning loop).\n" + ] + } + ], + "source": [ + "from ray.air.config import DatasetConfig, ScalingConfig\n", + "from ray.train.torch import TorchTrainer\n", + "\n", + "\n", + "trainer = TorchTrainer(\n", + " train_loop_per_worker=train_loop_per_worker,\n", + " train_loop_config={\n", + " \"batch_size\": 2,\n", + " \"lr\": 0.02,\n", + " \"epochs\": 1, # You'd normally train for 26 epochs.\n", + " \"momentum\": 0.9,\n", + " \"weight_decay\": 1e-4,\n", + " \"lr_steps\": [16, 22],\n", + " \"lr_gamma\": 0.1,\n", + " },\n", + " scaling_config=ScalingConfig(num_workers=4, use_gpu=True),\n", + " datasets={\"train\": train_dataset},\n", + " dataset_config={\n", + " # Don't augment test images. 
Only apply `per_epoch_preprocessor` to the train\n", + " # set.\n", + " \"train\": DatasetConfig(\n", + " per_epoch_preprocessor=per_epoch_preprocessor\n", + " ),\n", + " },\n", + " preprocessor=preprocessor,\n", + ")\n", + "results = trainer.fit()" + ] + }, + { + "cell_type": "markdown", + "id": "224a1139", + "metadata": {}, + "source": [ + "## Evaluate the model on test data\n", + "\n", + "Now that you've fine-tuned the model, you'll evaluate it on the test data.\n", + "\n", + "### Generate predictions on the test data" + ] + }, + { + "cell_type": "markdown", + "id": "1fc9bac2", + "metadata": {}, + "source": [ + "`Predictors` let you perform scalable [batch prediction](batch-prediction) and\n", + "[online inference](air-serving-guide). To evaluate the model, you'll use\n", + "`BatchPredictor` to perform inference in a distributed fashion.\n", + "\n", + "Create a `BatchPredictor` and pass `TorchDetectionPredictor` to the constructor. Then,\n", + "call `BatchPredictor.predict` to detect objects in the test data." + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "cc3cc662", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2023-03-01 13:08:48,113\tINFO batch_predictor.py:214 -- `num_gpus_per_worker` is set for `BatchPreditor`.Automatically enabling GPU prediction for this predictor. 
To disable set `use_gpu` to `False` in `BatchPredictor.predict`.\n", + "2023-03-01 13:08:48,945\tINFO bulk_executor.py:41 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[TorchVisionPreprocessor] -> ActorPoolMapOperator[MapBatches(ScoringWrapper)]\n", + "TorchVisionPreprocessor: 100%|██████████| 26/26 [00:17<00:00, 1.49it/s]\n", + "MapBatches(ScoringWrapper), 0 actors [26 locality hits, 0 misses]: 100%|██████████| 26/26 [00:32<00:00, 1.25s/it] \n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "468b32006b5f440dae152b288d84d5d3", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "VBox(children=(HTML(value='

Dataset

'), Tab(children=(HTML(value='
pip install "ray[rllib]" tensorflow - rllib train --algo DQN --env CartPole-v1 + rllib train --algo DQN --env CartPole-v1 --stop '{"training_iteration": 30}'
.. margin:: @@ -43,7 +43,7 @@ RLlib supports any Farama-Foundation Gymnasium environment, as well as a number It also supports a large number of algorithms (see :ref:`rllib-algorithms-doc`) to choose from. -Running the above will return one of the `checkpoints` that get generated during training, +Running the above will return one of the `checkpoints` that get generated during training after 30 training iterations, as well as a command that you can use to evaluate the trained algorithm. You can evaluate the trained algorithm with the following command (assuming the checkpoint path is called ``checkpoint``): diff --git a/doc/source/serve/doc_code/production_fruit_example.py b/doc/source/serve/doc_code/production_fruit_example.py index 4ff5618cdad96..987ac186a8b21 100644 --- a/doc/source/serve/doc_code/production_fruit_example.py +++ b/doc/source/serve/doc_code/production_fruit_example.py @@ -9,8 +9,7 @@ from ray.serve.http_adapters import json_request # These imports are used only for type hints: -from typing import Dict, List -from starlette.requests import Request +from typing import Dict @serve.deployment(num_replicas=2) @@ -88,10 +87,6 @@ def check_price(self, amount: float) -> float: return self.price * amount -async def json_resolver(request: Request) -> List: - return await request.json() - - with InputNode() as query: fruit, amount = query[0], query[1] diff --git a/doc/source/serve/tutorials/batch.md b/doc/source/serve/tutorials/batch.md index 24d3983cb4410..cf367893eff53 100644 --- a/doc/source/serve/tutorials/batch.md +++ b/doc/source/serve/tutorials/batch.md @@ -186,4 +186,31 @@ Finally, let's run the script. $ python tutorial_batch.py ``` -You should get a similar output like before! \ No newline at end of file +You should get a similar output like before! + +## Troubleshooting + +If you see the following error: + +```console +TypeError: Descriptors cannot not be created directly. 
+ If this call came from a _pb2.py file, your generated code is out of date and must be regenerated with protoc >= 3.19.0. + If you cannot immediately regenerate your protos, some other possible workarounds are: + 1. Downgrade the protobuf package to 3.20.x or lower. + 2. Set PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python (but this will use pure-Python parsing and will be much slower). +``` + +You can downgrade the protobuf package to 3.20.x or lower in your Docker image, or tell Ray to do it at runtime by specifying a [runtime environment](runtime-environments): + +Open a new YAML file called `batch_env.yaml` for runtime environment. + +```yaml +pip: + - protobuf==3.20.3 +``` + +Then, run the following command to deploy the model with the runtime environment. + +```console +$ serve run --runtime-env batch_env.yaml tutorial_batch:generator +``` diff --git a/doc/source/serve/tutorials/serve-ml-models.md b/doc/source/serve/tutorials/serve-ml-models.md index 9a9e0c6f62e4f..5e63c4779c2d5 100644 --- a/doc/source/serve/tutorials/serve-ml-models.md +++ b/doc/source/serve/tutorials/serve-ml-models.md @@ -61,10 +61,39 @@ Now that we've defined our Serve deployment, let's prepare it so that it can be ::: Finally, we can deploy our model to Ray Serve through the terminal. + ```console $ serve run tutorial_tensorflow:mnist_model ``` +:::{note} +If you see the following error: + +```console +TypeError: Descriptors cannot not be created directly. + If this call came from a _pb2.py file, your generated code is out of date and must be regenerated with protoc >= 3.19.0. + If you cannot immediately regenerate your protos, some other possible workarounds are: + 1. Downgrade the protobuf package to 3.20.x or lower. + 2. Set PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python (but this will use pure-Python parsing and will be much slower). 
+``` + +You can downgrade the protobuf package to 3.20.x or lower in your Docker image, or tell Ray to do it at runtime by specifying a [runtime environment](runtime-environments): + +Open a new YAML file called `tf_env.yaml` for runtime environment. + +```yaml +pip: + - protobuf==3.20.3 +``` + +Then, run the following command to deploy the model with the runtime environment. + +```console +$ serve run --runtime-env tf_env.yaml tutorial_tensorflow:mnist_model +``` + +::: + Let's query it! While Serve is running, open a separate terminal window, and run the following in an interactive Python shell or a separate Python script: ```python diff --git a/doc/source/train/config_guide.rst b/doc/source/train/config_guide.rst index 0b6c0381ddcbf..b2a010024808a 100644 --- a/doc/source/train/config_guide.rst +++ b/doc/source/train/config_guide.rst @@ -52,6 +52,8 @@ are :ref:`not tunable `. :start-after: __failure_config_start__ :end-before: __failure_config_end__ +.. _train-config-sync: + Sync configurations in Train (``SyncConfig``) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/doc/source/train/dl_guide.rst b/doc/source/train/dl_guide.rst index 3fa2e6d812a3f..fd07a15c45919 100644 --- a/doc/source/train/dl_guide.rst +++ b/doc/source/train/dl_guide.rst @@ -61,7 +61,7 @@ training. and :func:`ray.train.torch.prepare_data_loader` utilities below, and instead handle the logic directly inside your training function. - First, use the :func:~ray.train.torch.prepare_model` function to automatically move your model to the right device and wrap it in + First, use the :func:`~ray.train.torch.prepare_model` function to automatically move your model to the right device and wrap it in ``DistributedDataParallel`` .. 
code-block:: diff diff --git a/doc/source/train/getting-started.rst b/doc/source/train/getting-started.rst index d071b2c44ee83..113eb6ff607ed 100644 --- a/doc/source/train/getting-started.rst +++ b/doc/source/train/getting-started.rst @@ -107,6 +107,7 @@ Here are examples for some of the commonly used trainers: :language: python :start-after: __torch_single_run_begin__ :end-before: __torch_single_run_end__ + :dedent: Now let's convert this to a distributed multi-worker training function! @@ -128,6 +129,7 @@ Here are examples for some of the commonly used trainers: :language: python :start-after: __torch_trainer_begin__ :end-before: __torch_trainer_end__ + :dedent: See :ref:`train-porting-code` for a more comprehensive example. @@ -156,6 +158,7 @@ Here are examples for some of the commonly used trainers: :language: python :start-after: __tf_single_run_begin__ :end-before: __tf_single_run_end__ + :dedent: Now let's convert this to a distributed multi-worker training function! All you need to do is: @@ -177,6 +180,7 @@ Here are examples for some of the commonly used trainers: :language: python :start-after: __tf_trainer_begin__ :end-before: __tf_trainer_end__ + :dedent: See :ref:`train-porting-code` for a more comprehensive example. diff --git a/doc/source/tune/examples/tune-xgboost.ipynb b/doc/source/tune/examples/tune-xgboost.ipynb index edf640b0b30cf..cde9b8ce92e06 100644 --- a/doc/source/tune/examples/tune-xgboost.ipynb +++ b/doc/source/tune/examples/tune-xgboost.ipynb @@ -127,6 +127,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "ec2a13f8", "metadata": {}, @@ -226,7 +227,7 @@ "\n", "To address this fact, XGBoost uses a parameter called *Eta*, which is sometimes called\n", "the *learning rate*. 
Don't confuse this with learning rates from gradient descent!\n", - "The original [paper on stochastic gradient boosting](https://www.sciencedirect.com/science/article/abs/pii/S0167947301000652)\n", + "The original [paper on stochastic gradient boosting](https://jerryfriedman.su.domains/ftp/stobst.pdf)\n", "introduces this parameter like so:\n", "\n", "$$\n", @@ -1252,4 +1253,4 @@ }, "nbformat": 4, "nbformat_minor": 5 -} \ No newline at end of file +} diff --git a/doc/source/tune/faq.rst b/doc/source/tune/faq.rst index bb02b09ddb060..bbefeef2f4cd9 100644 --- a/doc/source/tune/faq.rst +++ b/doc/source/tune/faq.rst @@ -797,13 +797,11 @@ The reasons for this are: 3. Concurrent jobs are harder to debug. If a trial of job A fills the disk, trials from job B on the same node are impacted. In practice, it's hard to reason about these conditions from the logs if something goes wrong. -4. Some internal implementations in Ray Tune assume that you only have one job - running at a time. This can lead to conflicts. -The fourth reason is especially problematic when you run concurrent tuning jobs. For instance, -a symptom is when trials from job A use parameters specified in job B, leading to unexpected -results. +Previously, some internal implementations in Ray Tune assumed that you only have one job +running at a time. A symptom was when trials from job A used parameters specified in job B, +leading to unexpected results. Please refer to [this github issue](https://github.com/ray-project/ray/issues/30091#issuecomment-1431676976) -for more context and a workaround. +for more context and a workaround if you run into this issue. 
diff --git a/docker/ray-ml/Dockerfile b/docker/ray-ml/Dockerfile index 80085327a5563..18454405bd523 100644 --- a/docker/ray-ml/Dockerfile +++ b/docker/ray-ml/Dockerfile @@ -24,7 +24,7 @@ RUN sudo apt-get update \ && $HOME/anaconda3/bin/pip --no-cache-dir install -U \ -r requirements.txt \ # Then, keep requirements bounds as constraints and install remaining test dependencies - && $HOME/anaconda3/bin/pip --use-deprecated=legacy-resolver --no-cache-dir install -U \ + && $HOME/anaconda3/bin/pip --no-cache-dir install -U \ -c requirements.txt \ -r requirements_rllib.txt \ -r requirements_train.txt \ diff --git a/python/ray/_private/state_api_test_utils.py b/python/ray/_private/state_api_test_utils.py index 95bb75dc0601d..8ac0f91007360 100644 --- a/python/ray/_private/state_api_test_utils.py +++ b/python/ray/_private/state_api_test_utils.py @@ -9,6 +9,7 @@ import time import traceback from typing import Callable, Dict, List, Optional +from ray.experimental.state.api import list_tasks import ray from ray.actor import ActorHandle @@ -306,3 +307,15 @@ def periodic_invoke_state_apis_with_actor(*args, **kwargs) -> ActorHandle: print("State api actor is ready now.") actor.start.remote() return actor + + +def verify_failed_task(name: str, error_type: str) -> bool: + """ + Check if a task with 'name' has failed with the exact error type 'error_type' + """ + tasks = list_tasks(filters=[("name", "=", name)]) + assert len(tasks) == 1, tasks + t = tasks[0] + assert t["state"] == "FAILED", t + assert t["error_type"] == error_type, t + return True diff --git a/python/ray/_private/test_utils.py b/python/ray/_private/test_utils.py index 8e5ccccb9ce4d..dd78ab87ab575 100644 --- a/python/ray/_private/test_utils.py +++ b/python/ray/_private/test_utils.py @@ -1797,7 +1797,7 @@ def wandb_populate_run_location_hook(): def safe_write_to_results_json( - result: str, + result: dict, default_file_name: str = "/tmp/release_test_output.json", env_var: Optional[str] = "TEST_OUTPUT_JSON", ): @@ 
-1810,3 +1810,5 @@ def safe_write_to_results_json( with open(test_output_json_tmp, "wt") as f: json.dump(result, f) os.replace(test_output_json_tmp, test_output_json) + logger.info(f"Wrote results to {test_output_json}") + logger.info(json.dumps(result)) diff --git a/python/ray/_private/worker.py b/python/ray/_private/worker.py index 99eef572a0b3e..0465302bee2d8 100644 --- a/python/ray/_private/worker.py +++ b/python/ray/_private/worker.py @@ -3008,8 +3008,7 @@ def remote( invocation. The default value is 1. Pass "dynamic" to allow the task to decide how many return values to return during execution, and the caller will - receive an ObjectRef[ObjectRefGenerator] (note, this setting is - experimental). + receive an ObjectRef[ObjectRefGenerator]. See :ref:`dynamic generators ` for more details. num_cpus: The quantity of CPU resources to reserve for this task or for the lifetime of the actor. diff --git a/python/ray/_raylet.pyx b/python/ray/_raylet.pyx index 416bf4d5a16c3..e931cb6845621 100644 --- a/python/ray/_raylet.pyx +++ b/python/ray/_raylet.pyx @@ -137,7 +137,6 @@ import ray.cloudpickle as ray_pickle from ray.core.generated.common_pb2 import ActorDiedErrorContext from ray._private.async_compat import sync_to_async, get_new_event_loop from ray._private.client_mode_hook import disable_client_hook -from ray._private.signature import DUMMY_TYPE import ray._private.gcs_utils as gcs_utils import ray._private.memory_monitor as memory_monitor import ray._private.profiling as profiling @@ -170,13 +169,6 @@ current_task_id_lock = threading.Lock() job_config_initialized = False job_config_initialization_lock = threading.Lock() -# The cached serialized dummy arg b`__RAY_DUMMY__`. -cdef dummy_type_serialized_arg = None -# The type of DUMMY_TYPE. -cdef dummy_type_type = type(DUMMY_TYPE) -# The value of DUMMY_TYPE, cdef DUMMY_TYPE to avoid global lookup. 
-cdef dummy_type_value = DUMMY_TYPE - class ObjectRefGenerator: def __init__(self, refs): @@ -464,35 +456,20 @@ cdef prepare_args_internal( unique_ptr[CTaskArg](new CTaskArgByReference( c_arg, c_owner_address, - (arg).call_site_data))) # Avoid calling Python function + arg.call_site()))) else: - # The type check is because some custom types may not implement __eq__ - # well. So, we only handle the args which type and value are exactly match - # the DUMMY_TYPE. - # TODO(fyrestone): Maybe we can remove the DUMMY_TYPE or make the - # DUMMY_TYPE None. - # https://github.com/ray-project/ray/pull/32478/ - if type(arg) is dummy_type_type and arg == dummy_type_value: - global dummy_type_serialized_arg - if dummy_type_serialized_arg is None: - # Cache the serialized dummy arg. - dummy_type_serialized_arg = serialized_arg = \ - worker.get_serialization_context().serialize(arg) - else: - serialized_arg = dummy_type_serialized_arg - else: - try: - serialized_arg = worker.get_serialization_context( - ).serialize(arg) - except TypeError as e: - msg = ( - "Could not serialize the argument " - f"{repr(arg)} for a task or actor " - f"{function_descriptor.repr}. Check " - "https://docs.ray.io/en/master/ray-core/objects/serialization.html#troubleshooting " # noqa - "for more information.") - raise TypeError(msg) from e + try: + serialized_arg = worker.get_serialization_context( + ).serialize(arg) + except TypeError as e: + msg = ( + "Could not serialize the argument " + f"{repr(arg)} for a task or actor " + f"{function_descriptor.repr}. 
Check " + "https://docs.ray.io/en/master/ray-core/objects/serialization.html#troubleshooting " # noqa + "for more information.") + raise TypeError(msg) from e metadata = serialized_arg.metadata if language != Language.PYTHON: metadata_fields = metadata.split(b",") @@ -1984,19 +1961,18 @@ cdef class CoreWorker: self.python_scheduling_strategy_to_c( scheduling_strategy, &c_scheduling_strategy) - if retry_exception_allowlist: - try: - serialized_retry_exception_allowlist = ray_pickle.dumps( - retry_exception_allowlist, - ) - except TypeError as e: - msg = ( - "Could not serialize the retry exception allowlist" - f"{retry_exception_allowlist} for task {function_descriptor.repr}. " - "Check " - "https://docs.ray.io/en/master/ray-core/objects/serialization.html#troubleshooting " # noqa - "for more information.") - raise TypeError(msg) from e + try: + serialized_retry_exception_allowlist = ray_pickle.dumps( + retry_exception_allowlist, + ) + except TypeError as e: + msg = ( + "Could not serialize the retry exception allowlist" + f"{retry_exception_allowlist} for task {function_descriptor.repr}. 
" + "Check " + "https://docs.ray.io/en/master/ray-core/objects/serialization.html#troubleshooting " # noqa + "for more information.") + raise TypeError(msg) from e with self.profile_event(b"submit_task"): prepare_resources(resources, &c_resources) @@ -2672,10 +2648,7 @@ cdef class CoreWorker: eventloop, async_thread = self.get_event_loop( function_descriptor, specified_cgname) coroutine = func(*args, **kwargs) - if threading.get_ident() == async_thread.ident: - future = asyncio.ensure_future(coroutine, eventloop) - else: - future = asyncio.run_coroutine_threadsafe(coroutine, eventloop) + future = asyncio.run_coroutine_threadsafe(coroutine, eventloop) future.add_done_callback(lambda _: event.Notify()) with nogil: (CCoreWorkerProcess.GetCoreWorker() diff --git a/python/ray/air/checkpoint.py b/python/ray/air/checkpoint.py index 17e1c425186d8..f103ec2b6ad6b 100644 --- a/python/ray/air/checkpoint.py +++ b/python/ray/air/checkpoint.py @@ -211,6 +211,7 @@ def __init__( self._data_dict: Optional[Dict[str, Any]] = data_dict self._uri: Optional[str] = uri self._override_preprocessor: Optional["Preprocessor"] = None + self._override_preprocessor_set = False self._uuid = uuid.uuid4() @@ -396,7 +397,7 @@ def to_dict(self) -> dict: checkpoint_data[_METADATA_KEY] = self._metadata # If override_preprocessor is specified, then set that in the output dict. - if self._override_preprocessor: + if self._override_preprocessor_set: checkpoint_data[PREPROCESSOR_KEY] = self._override_preprocessor return checkpoint_data @@ -426,16 +427,15 @@ def from_directory(cls, path: Union[str, os.PathLike]) -> "Checkpoint": return checkpoint - # TODO: Deprecate `from_checkpoint`. For context, see #29058. @classmethod + @DeveloperAPI def from_checkpoint(cls, other: "Checkpoint") -> "Checkpoint": - """Create a checkpoint from a generic :py:class:`Checkpoint`. + """Create a checkpoint from a generic :class:`Checkpoint`. 
This method can be used to create a framework-specific checkpoint from a - generic :py:class:`Checkpoint` object. + generic :class:`Checkpoint` object. Examples: - >>> result = TorchTrainer.fit(...) # doctest: +SKIP >>> checkpoint = TorchCheckpoint.from_checkpoint(result.checkpoint) # doctest: +SKIP # noqa: E501 >>> model = checkpoint.get_model() # doctest: +SKIP @@ -541,7 +541,7 @@ def _to_directory(self, path: str, move_instead_of_copy: bool = False) -> None: self._save_checkpoint_metadata_in_directory(path) - if self._override_preprocessor: + if self._override_preprocessor_set and self._override_preprocessor: save_preprocessor_to_dir(self._override_preprocessor, path) def _to_directory_safe(self, path: str, move_instead_of_copy: bool = False) -> None: @@ -764,7 +764,7 @@ def __fspath__(self): def get_preprocessor(self) -> Optional["Preprocessor"]: """Return the saved preprocessor, if one exists.""" - if self._override_preprocessor: + if self._override_preprocessor_set: return self._override_preprocessor # The preprocessor will either be stored in an in-memory dict or @@ -787,10 +787,11 @@ def get_preprocessor(self) -> Optional["Preprocessor"]: return preprocessor - def set_preprocessor(self, preprocessor: "Preprocessor"): + def set_preprocessor(self, preprocessor: Optional["Preprocessor"]): """Saves the provided preprocessor to this Checkpoint.""" self._override_preprocessor = preprocessor + self._override_preprocessor_set = True @classmethod def _get_checkpoint_type( diff --git a/python/ray/air/examples/dreambooth/README.rst b/python/ray/air/examples/dreambooth/README.rst new file mode 100644 index 0000000000000..90ff959e68bff --- /dev/null +++ b/python/ray/air/examples/dreambooth/README.rst @@ -0,0 +1,134 @@ +Fine-tuning DreamBooth with Ray AIR +=================================== + +.. + ATTN: This should be kept in sync with release/air_examples/dreambooth/dreambooth_run.sh + +.. 
+ section_intro + + +This example shows how to fine-tune a DreamBooth model using Ray AIR. + +Because of the large model sizes, you'll need 2 A10G GPUs per worker. + +The example can leverage data-parallel training to speed up training time. Of course, this will +require more GPUs. + +The demo tunes both the text_encoder and unet parts of Stable Diffusion, and utilizes the prior preserving loss function. + + +.. image:: images/dreambooth_example.png + :target: images/dreambooth_example.png + :alt: DreamBooth example + + +The full code repository can be found here: `https://github.com/ray-project/ray/blob/master/python/ray/air/examples/dreambooth/ `_ + +How it works +------------ + +For an up-to-date description of how the code works, +`please refer to the example in our documentation `_. + +Run the example +--------------- + +.. + section_run_example + +First, we download the pre-trained stable diffusion model as a starting point. + +We will then train this model with a few images of our subject. + +To achieve this, we choose a non-word as an identifier, e.g. ``unqtkn``. When fine-tuning the model with our subject, we will teach it that the prompt is ``A photo of a unqtkn ``. + +After fine-tuning we can run inference with this specific prompt. For instance: ``A photo of a unqtkn `` will create an image of our subject. + +Step 0: Preparation +^^^^^^^^^^^^^^^^^^^ + +Clone the Ray repository, go to the example directory, and install dependencies. + +.. code-block:: bash + + git clone https://github.com/ray-project/ray.git + cd ray/python/ray/air/examples/dreambooth + pip install -Ur requirements.txt + +Prepare some directories and environment variables. + +.. 
code-block:: bash + + export DATA_PREFIX="./" + export ORIG_MODEL_NAME="CompVis/stable-diffusion-v1-4" + export ORIG_MODEL_HASH="249dd2d739844dea6a0bc7fc27b3c1d014720b28" + export ORIG_MODEL_DIR="$DATA_PREFIX/model-orig" + export ORIG_MODEL_PATH="$ORIG_MODEL_DIR/models--${ORIG_MODEL_NAME/\//--}/snapshots/$ORIG_MODEL_HASH" + export TUNED_MODEL_DIR="$DATA_PREFIX/model-tuned" + export IMAGES_REG_DIR="$DATA_PREFIX/images-reg" + export IMAGES_OWN_DIR="$DATA_PREFIX/images-own" + export IMAGES_NEW_DIR="$DATA_PREFIX/images-new" + + export CLASS_NAME="toy car" + + mkdir -p $ORIG_MODEL_DIR $TUNED_MODEL_DIR $IMAGES_REG_DIR $IMAGES_OWN_DIR $IMAGES_NEW_DIR + +Copy some images for fine-tuning into ``$IMAGES_OWN_DIR``. + +Step 1: Download the pre-trained model +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Download and cache a pre-trained Stable-Diffusion model locally. +Default model and version are ``CompVis/stable-diffusion-v1-4`` +at git hash ``3857c45b7d4e78b3ba0f39d4d7f50a2a05aa23d4``. + +.. code-block:: + + python cache_model.py --model_dir=$ORIG_MODEL_DIR --model_name=$ORIG_MODEL_NAME --revision=$ORIG_MODEL_HASH + +Note that actual model files will be downloaded into +``<model_dir>/models--<model_name>/snapshots/<git_hash>/`` directory. + +Step 2: Create the regularization images +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Create a regularization image set for a class of subjects: + +.. code-block:: + + python run_model.py \ + --model_dir=$ORIG_MODEL_PATH \ + --output_dir=$IMAGES_REG_DIR \ + --prompts="photo of a $CLASS_NAME" \ + --num_samples_per_prompt=200 + +Step 3: Fine-tune the model +^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Save a few (4 to 5) images of the subject being fine-tuned +in a local directory. Then launch the training job with: + +.. 
code-block:: + + python train.py \ + --model_dir=$ORIG_MODEL_PATH \ + --output_dir=$TUNED_MODEL_DIR \ + --instance_images_dir=$IMAGES_OWN_DIR \ + --instance_prompt="a photo of unqtkn $CLASS_NAME" \ + --class_images_dir=$IMAGES_REG_DIR \ + --class_prompt="a photo of a $CLASS_NAME" + +Step 4: Generate images of our subject +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Try your model with the same commandline as Step 2, but point +to your own model this time! + +.. code-block:: + + python run_model.py \ + --model_dir=$TUNED_MODEL_DIR \ + --output_dir=$IMAGES_NEW_DIR \ + --prompts="photo of a unqtkn $CLASS_NAME" \ + --num_samples_per_prompt=20 diff --git a/python/ray/air/examples/dreambooth/cache_model.py b/python/ray/air/examples/dreambooth/cache_model.py new file mode 100644 index 0000000000000..2d8d62e43e371 --- /dev/null +++ b/python/ray/air/examples/dreambooth/cache_model.py @@ -0,0 +1,20 @@ +# Cache model files to a local directory + +import os + +from huggingface_hub import snapshot_download + +from flags import cache_model_flags + + +def cache(args): + os.makedirs(args.model_dir, exist_ok=True) + + snapshot_download( + repo_id=args.model_name, revision=args.revision, cache_dir=args.model_dir + ) + + +if __name__ == "__main__": + args = cache_model_flags().parse_args() + cache(args) diff --git a/python/ray/air/examples/dreambooth/dataset.py b/python/ray/air/examples/dreambooth/dataset.py new file mode 100644 index 0000000000000..c9f078b42bcea --- /dev/null +++ b/python/ray/air/examples/dreambooth/dataset.py @@ -0,0 +1,92 @@ +import pandas as pd +import torch + +from ray.data import read_images +from ray.data.preprocessors import TorchVisionPreprocessor +from torchvision import transforms +from transformers import AutoTokenizer + + +def get_train_dataset(args, image_resolution=512): + """Build a Ray Dataset for fine-tuning DreamBooth model.""" + # Load images into Ray Dataset + instance_dataset = read_images(args.instance_images_dir) + class_dataset = 
read_images(args.class_images_dir) + + # We now duplicate the instance images multiple times to make the + # two sets contain exactly the same number of images. + # This is so we can zip them up during training to compute the + # prior preserving loss in one pass. + dup_times = class_dataset.count() // instance_dataset.count() + instance_dataset = instance_dataset.map_batches( + lambda df: pd.concat([df] * dup_times) + ) + + # Load tokenizer for tokenizing the image prompts. + tokenizer = AutoTokenizer.from_pretrained( + pretrained_model_name_or_path=args.model_dir, + subfolder="tokenizer", + ) + + def _tokenize(prompt): + return tokenizer( + prompt, + truncation=True, + padding="max_length", + max_length=tokenizer.model_max_length, + return_tensors="pt", + ).input_ids.numpy() + + # Get the token ids for both prompts. + class_prompt_ids = _tokenize(args.class_prompt)[0] + instance_prompt_ids = _tokenize(args.instance_prompt)[0] + + # Image preprocessing. + # Instance and class images used by this example are in sizes 700x700 + # and 512x512 respectively. + # Depending on the sizes of actual training images, there may need to be a + # transforms.Resize() step as well. + transform = transforms.Compose( + [ + transforms.ToTensor(), + transforms.RandomCrop(image_resolution), + transforms.Normalize([0.5], [0.5]), + ] + ) + preprocessor = TorchVisionPreprocessor(["image"], transform=transform) + + instance_dataset = preprocessor.transform(instance_dataset).add_column( + "prompt_ids", lambda df: [instance_prompt_ids] * len(df) + ) + class_dataset = preprocessor.transform(class_dataset).add_column( + "prompt_ids", lambda df: [class_prompt_ids] * len(df) + ) + # --- + + # Now, zip the images up. 
+ final_size = min(instance_dataset.count(), class_dataset.count()) + train_dataset = ( + instance_dataset.limit(final_size) + .repartition(final_size) + .zip(class_dataset.limit(final_size).repartition(final_size)) + ) + + return train_dataset.random_shuffle() + + +def collate(batch, device, dtype): + """Build Torch training batch.""" + # Layout of the batch is that instance image data (pixels, prompt ids) occupy + # the top half of the batch. And class image data occupy the bottom half + # of the batch. + # During training, a batch will be chunked into 2 sub-batches for prior + # preserving loss calculation. + images = torch.squeeze(torch.stack([batch["image"], batch["image_1"]])) + images = images.to(memory_format=torch.contiguous_format).float() + + prompt_ids = torch.cat([batch["prompt_ids"], batch["prompt_ids_1"]], dim=0) + + return { + "prompt_ids": prompt_ids.to(device), # token ids should stay int. + "images": images.to(device, dtype=dtype), + } diff --git a/python/ray/air/examples/dreambooth/flags.py b/python/ray/air/examples/dreambooth/flags.py new file mode 100644 index 0000000000000..55c1d23930fba --- /dev/null +++ b/python/ray/air/examples/dreambooth/flags.py @@ -0,0 +1,136 @@ +import argparse + + +def train_arguments(): + """Commandline arguments for running DreamBooth training script.""" + parser = argparse.ArgumentParser() + + parser.add_argument( + "--model_dir", + type=str, + default=None, + required=True, + help="Path to a pretrained huggingface Stable Diffusion model.", + ) + parser.add_argument( + "--output_dir", + type=str, + default=None, + required=True, + help="Directory where trained models are saved.", + ) + parser.add_argument( + "--instance_images_dir", + type=str, + default=None, + required=True, + help=( + "Directory where a few images of the instance to be fine tuned " + "into the model are saved." 
+ ), + ) + parser.add_argument( + "--instance_prompt", + type=str, + default=None, + required=True, + help=("Prompt for creating the instance images."), + ) + parser.add_argument( + "--class_images_dir", + type=str, + default=None, + required=True, + help=( + "Directory where images of similar objects for preserving " + "model priors are saved." + ), + ) + parser.add_argument( + "--class_prompt", + type=str, + default=None, + required=True, + help=("Prompt for creating the class images."), + ) + parser.add_argument( + "--train_batch_size", type=int, default=1, help="Train batch size." + ) + parser.add_argument("--lr", type=float, default=5e-6, help="Train learning rate.") + parser.add_argument( + "--num_epochs", type=int, default=4, help="Number of epochs to train." + ) + parser.add_argument( + "--prior_loss_weight", + type=float, + default=1.0, + help="The weight for prior preservation loss.", + ) + parser.add_argument( + "--max_grad_norm", type=float, default=1.0, help="Maximum gradient norm." 
+ ) + parser.add_argument("--num_workers", type=int, default=2, help="Number of workers.") + + return parser + + +def cache_model_flags(): + """Commandline arguments for running local model caching script.""" + parser = argparse.ArgumentParser() + + parser.add_argument( + "--model_dir", + type=str, + default=None, + required=True, + help="Directory to write the cached model files.", + ) + parser.add_argument( + "--model_name", + type=str, + default="CompVis/stable-diffusion-v1-4", + help="Name of the huggingface model.", + ) + parser.add_argument( + "--revision", + type=str, + default="3857c45b7d4e78b3ba0f39d4d7f50a2a05aa23d4", + help="Revision of the huggingface model repo to cache.", + ) + + return parser + + +def run_model_flags(): + """Commandline arguments for running a tuned DreamBooth model.""" + parser = argparse.ArgumentParser() + + parser.add_argument( + "--model_dir", + type=str, + default=None, + required=True, + help="Directory of the tuned model files.", + ) + parser.add_argument( + "--output_dir", + type=str, + default=None, + required=True, + help="Directory to save the generated images.", + ) + parser.add_argument( + "--prompts", + type=str, + default=None, + required=True, + help="Comma separated prompt strings for generating the images.", + ) + parser.add_argument( + "--num_samples_per_prompt", + type=int, + default=1, + help="Number of images to generate for each prompt.", + ) + + return parser diff --git a/python/ray/air/examples/dreambooth/images/dreambooth_example.png b/python/ray/air/examples/dreambooth/images/dreambooth_example.png new file mode 100644 index 0000000000000..4909d8ed13b1a Binary files /dev/null and b/python/ray/air/examples/dreambooth/images/dreambooth_example.png differ diff --git a/python/ray/air/examples/dreambooth/images/dreambooth_training.png b/python/ray/air/examples/dreambooth/images/dreambooth_training.png new file mode 100644 index 0000000000000..8a759bcd51da9 Binary files /dev/null and 
b/python/ray/air/examples/dreambooth/images/dreambooth_training.png differ diff --git a/python/ray/air/examples/dreambooth/images/unqtkn/1.jpg b/python/ray/air/examples/dreambooth/images/unqtkn/1.jpg new file mode 100644 index 0000000000000..1d629ecb3b4c5 Binary files /dev/null and b/python/ray/air/examples/dreambooth/images/unqtkn/1.jpg differ diff --git a/python/ray/air/examples/dreambooth/images/unqtkn/2.jpg b/python/ray/air/examples/dreambooth/images/unqtkn/2.jpg new file mode 100644 index 0000000000000..017556bc8d4e4 Binary files /dev/null and b/python/ray/air/examples/dreambooth/images/unqtkn/2.jpg differ diff --git a/python/ray/air/examples/dreambooth/images/unqtkn/3.jpg b/python/ray/air/examples/dreambooth/images/unqtkn/3.jpg new file mode 100644 index 0000000000000..ecfe0a1903c82 Binary files /dev/null and b/python/ray/air/examples/dreambooth/images/unqtkn/3.jpg differ diff --git a/python/ray/air/examples/dreambooth/images/unqtkn/4.jpg b/python/ray/air/examples/dreambooth/images/unqtkn/4.jpg new file mode 100644 index 0000000000000..b425f40aa0dae Binary files /dev/null and b/python/ray/air/examples/dreambooth/images/unqtkn/4.jpg differ diff --git a/python/ray/air/examples/dreambooth/images/unqtkn/5.jpg b/python/ray/air/examples/dreambooth/images/unqtkn/5.jpg new file mode 100644 index 0000000000000..95ebf0001f420 Binary files /dev/null and b/python/ray/air/examples/dreambooth/images/unqtkn/5.jpg differ diff --git a/python/ray/air/examples/dreambooth/requirements.txt b/python/ray/air/examples/dreambooth/requirements.txt new file mode 100644 index 0000000000000..4d031ccc7c4fc --- /dev/null +++ b/python/ray/air/examples/dreambooth/requirements.txt @@ -0,0 +1,8 @@ +accelerate==0.15.0 +bitsandbytes +diffusers==0.11.1 +flax==0.6.4 +huggingface_hub +numpy==1.21 +torchvision +transformers>=4.25.1 diff --git a/python/ray/air/examples/dreambooth/run_model.py b/python/ray/air/examples/dreambooth/run_model.py new file mode 100644 index 0000000000000..e8d8a5d6da8ba --- 
/dev/null +++ b/python/ray/air/examples/dreambooth/run_model.py @@ -0,0 +1,33 @@ +import hashlib +from os import path + +from diffusers import DiffusionPipeline +import torch + +from flags import run_model_flags + + +def run(args): + print(f"Loading model from {args.model_dir}") + pipeline = DiffusionPipeline.from_pretrained( + args.model_dir, torch_dtype=torch.float16 + ) + pipeline.set_progress_bar_config(disable=True) + if torch.cuda.is_available(): + pipeline.to("cuda") + + prompts = args.prompts.split(",") + + # Generate 1 image to reduce memory consumption. + for prompt in prompts: + for i in range(args.num_samples_per_prompt): + for image in pipeline(prompt).images: + hash_image = hashlib.sha1(image.tobytes()).hexdigest() + image_filename = path.join(args.output_dir, f"{i}-{hash_image}.jpg") + image.save(image_filename) + print(f"Saved {image_filename}") + + +if __name__ == "__main__": + args = run_model_flags().parse_args() + run(args) diff --git a/python/ray/air/examples/dreambooth/train.py b/python/ray/air/examples/dreambooth/train.py new file mode 100644 index 0000000000000..643ab66cf2b7b --- /dev/null +++ b/python/ray/air/examples/dreambooth/train.py @@ -0,0 +1,228 @@ +import itertools + +from diffusers import ( + AutoencoderKL, + DDPMScheduler, + DiffusionPipeline, + UNet2DConditionModel, +) +from diffusers.utils.import_utils import is_xformers_available +from ray.air import session, ScalingConfig +from ray.train.torch import TorchTrainer +import torch +import torch.nn.functional as F +from torch.nn.parallel import DistributedDataParallel +from torch.nn.utils import clip_grad_norm_ +from transformers import CLIPTextModel + +from dataset import collate, get_train_dataset +from flags import train_arguments + + +def prior_preserving_loss(model_pred, target, weight): + # Chunk the noise and model_pred into two parts and compute + # the loss on each part separately. 
+ model_pred, model_pred_prior = torch.chunk(model_pred, 2, dim=0) + target, target_prior = torch.chunk(target, 2, dim=0) + + # Compute instance loss + loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean") + + # Compute prior loss + prior_loss = F.mse_loss( + model_pred_prior.float(), target_prior.float(), reduction="mean" + ) + + # Add the prior loss to the instance loss. + return loss + weight * prior_loss + + +def get_target(scheduler, noise, latents, timesteps): + """Get the target for loss depending on the prediction type.""" + pred_type = scheduler.config.prediction_type + if pred_type == "epsilon": + return noise + if pred_type == "v_prediction": + return scheduler.get_velocity(latents, noise, timesteps) + raise ValueError(f"Unknown prediction type {pred_type}") + + +def load_models(config, cuda): + """Load pre-trained Stable Diffusion models.""" + # Load all models in bfloat16 to save GRAM. + # For models that are only used for inferencing, + # full precision is also not required. + dtype = torch.bfloat16 + + text_encoder = CLIPTextModel.from_pretrained( + args.model_dir, + subfolder="text_encoder", + torch_dtype=dtype, + ) + text_encoder.to(cuda[1]) + text_encoder.train() + + noise_scheduler = DDPMScheduler.from_pretrained( + config["model_dir"], + subfolder="scheduler", + torch_dtype=dtype, + ) + + # VAE is only used for inference, keeping weights in full precision is not required. + vae = AutoencoderKL.from_pretrained( + config["model_dir"], + subfolder="vae", + torch_dtype=dtype, + ) + # We are not training VAE part of the model. + vae.requires_grad_(False) + vae.to(cuda[1]) + + # Convert unet to bf16 to save GRAM. + unet = UNet2DConditionModel.from_pretrained( + config["model_dir"], + subfolder="unet", + torch_dtype=dtype, + ) + if is_xformers_available(): + unet.enable_xformers_memory_efficient_attention() + # UNET is the largest component, occupying first GPU by itself. 
+ unet.to(cuda[0]) + unet.train() + + torch.cuda.empty_cache() + + return text_encoder, noise_scheduler, vae, unet + + +def get_cuda_devices(): + devices = [f"cuda:{i}" for i in range(torch.cuda.device_count())] + local_rank = session.get_local_rank() + assert len(devices) >= 2, "Require at least 2 GPU devices to work." + return devices[(local_rank * 2) : ((local_rank * 2) + 2)] + + +def train_fn(config): + cuda = get_cuda_devices() + + # Load pre-trained models. + text_encoder, noise_scheduler, vae, unet = load_models(config, cuda) + + # Wrap in DDP + text_encoder = DistributedDataParallel( + text_encoder, device_ids=[cuda[1]], output_device=cuda[1] + ) + unet = DistributedDataParallel(unet, device_ids=[cuda[0]], output_device=cuda[0]) + + # Use the regular AdamW optimizer to work with bfloat16 weights. + optimizer = torch.optim.AdamW( + itertools.chain(text_encoder.parameters(), unet.parameters()), + lr=config["lr"], + ) + + train_dataset = session.get_dataset_shard("train") + + # Train! + num_train_epochs = config["num_epochs"] + + print(f"Running {num_train_epochs} epochs.") + + global_step = 0 + for epoch in range(num_train_epochs): + for step, batch in enumerate( + train_dataset.iter_torch_batches( + batch_size=config["train_batch_size"], device=cuda[1] + ) + ): + # Load batch on GPU 2 because VAE and text encoder are there. 
+ batch = collate(batch, cuda[1], torch.bfloat16) + + optimizer.zero_grad() + + # Convert images to latent space + latents = vae.encode(batch["images"]).latent_dist.sample() * 0.18215 + + # Sample noise that we'll add to the latents + noise = torch.randn_like(latents) + bsz = latents.shape[0] + # Sample a random timestep for each image + timesteps = torch.randint( + 0, + noise_scheduler.config.num_train_timesteps, + (bsz,), + device=latents.device, + ) + timesteps = timesteps.long() + + # Add noise to the latents according to the noise magnitude at each timestep + # (this is the forward diffusion process) + noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps) + + # Get the text embedding for conditioning + encoder_hidden_states = text_encoder(batch["prompt_ids"])[0] + + # Predict the noise residual. We need to move all data bits to GPU 1. + model_pred = unet( + noisy_latents.to(cuda[0]), + timesteps.to(cuda[0]), + encoder_hidden_states.to(cuda[0]), + ).sample + target = get_target(noise_scheduler, noise, latents, timesteps).to(cuda[0]) + + # Now, move model prediction to GPU 2 for loss calculation. + loss = prior_preserving_loss( + model_pred, target, config["prior_loss_weight"] + ) + loss.backward() + + # Gradient clipping before optimizer stepping. + clip_grad_norm_( + itertools.chain(text_encoder.parameters(), unet.parameters()), + config["max_grad_norm"], + ) + + optimizer.step() # Step all optimizers. + + global_step += 1 + results = { + "step": global_step, + "loss": loss.detach().item(), + } + session.report(results) + + # Create pipeline using the trained modules and save it. + if session.get_world_rank() == 0: + pipeline = DiffusionPipeline.from_pretrained( + config["model_dir"], + text_encoder=text_encoder.module, + unet=unet.module, + ) + pipeline.save_pretrained(config["output_dir"]) + + +if __name__ == "__main__": + args = train_arguments().parse_args() + + # Build training dataset. 
+ train_dataset = get_train_dataset(args) + + print(f"Loaded training dataset (size: {train_dataset.count()})") + + # Train with Ray AIR TorchTrainer. + trainer = TorchTrainer( + train_fn, + train_loop_config=vars(args), + scaling_config=ScalingConfig( + use_gpu=True, + num_workers=args.num_workers, + resources_per_worker={ + "GPU": 2, + }, + ), + datasets={ + "train": train_dataset, + }, + ) + result = trainer.fit() + + print(result) diff --git a/python/ray/air/tests/execution/test_tracked_actor.py b/python/ray/air/tests/execution/test_tracked_actor.py index 924ee88f7f258..f3eadaed6450b 100644 --- a/python/ray/air/tests/execution/test_tracked_actor.py +++ b/python/ray/air/tests/execution/test_tracked_actor.py @@ -1,4 +1,5 @@ from collections import Counter +import gc from typing import Any, Optional, Type import pytest @@ -39,6 +40,14 @@ def ray_start_4_cpus(): ray.shutdown() +@pytest.fixture +def cleanup(): + # Garbage collect at the start + # This ensures that all resources are freed up for the upcoming test. + gc.collect() + yield + + class Actor: def __init__(self, **kwargs): self.kwargs = kwargs @@ -176,7 +185,7 @@ def stop_callback(tracked_actor): "resource_manager_cls", [FixedResourceManager, PlacementGroupResourceManager] ) @pytest.mark.parametrize("where", ["init", "fn"]) -def test_actor_fail(ray_start_4_cpus, resource_manager_cls, where): +def test_actor_fail(ray_start_4_cpus, cleanup, resource_manager_cls, where): """Test that actor failures are handled properly. 
- Start actor that either fails on init or in a task (RayActorError) diff --git a/python/ray/air/tests/test_checkpoints.py b/python/ray/air/tests/test_checkpoints.py index feecfc0d432d4..0b6aced414b6a 100644 --- a/python/ray/air/tests/test_checkpoints.py +++ b/python/ray/air/tests/test_checkpoints.py @@ -736,6 +736,11 @@ def testDictCheckpointSetPreprocessor(self): preprocessor = checkpoint.get_preprocessor() assert preprocessor.multiplier == 1 + # Check that we can set it to None + checkpoint.set_preprocessor(None) + preprocessor = checkpoint.get_preprocessor() + assert preprocessor is None + def testDictCheckpointSetPreprocessorAsDir(self): preprocessor = DummyPreprocessor(1) data = {"metric": 5} diff --git a/python/ray/data/__init__.py b/python/ray/data/__init__.py index 9a114f84490a2..f4894f0a3d985 100644 --- a/python/ray/data/__init__.py +++ b/python/ray/data/__init__.py @@ -1,3 +1,11 @@ +import sys + +# Short term workaround for https://github.com/ray-project/ray/issues/32435 +# Datasets currently has a hard dependency on pandas, so it doesn't need to be delayed. +# ray.data import is still eager for all ray imports for Python 3.6: +if sys.version_info >= (3, 7): + import pandas # noqa + from ray.data._internal.compute import ActorPoolStrategy from ray.data._internal.progress_bar import set_progress_bars from ray.data.dataset import Dataset diff --git a/python/ray/data/_internal/block_batching.py b/python/ray/data/_internal/block_batching.py index febee5a06b072..477d739a225cb 100644 --- a/python/ray/data/_internal/block_batching.py +++ b/python/ray/data/_internal/block_batching.py @@ -240,12 +240,31 @@ def _resolve_blocks( An iterator over resolved blocks. """ + hit = 0 + miss = 0 + unknown = 0 for block_ref in block_ref_iter: if block_ref is not None: stats_timer = stats.iter_get_s.timer() if stats else nullcontext() + # Count the number of blocks that we hit locally or miss (so have to + # fetch from remote node). 
This is to measure the effectiveness of + # prefetch. + loc = ray.experimental.get_object_locations([block_ref]) + nodes = loc[block_ref]["node_ids"] + if nodes: + current = ray.get_runtime_context().get_node_id() + if current in nodes: + hit += 1 + else: + miss += 1 + else: + unknown += 1 with stats_timer: block = ray.get(block_ref) yield block + stats.iter_blocks_local = hit + stats.iter_blocks_remote = miss + stats.iter_unknown_location = unknown def _prefetch_blocks( diff --git a/python/ray/data/_internal/bulk_dataset_iterator.py b/python/ray/data/_internal/dataset_iterator_impl.py similarity index 98% rename from python/ray/data/_internal/bulk_dataset_iterator.py rename to python/ray/data/_internal/dataset_iterator_impl.py index d7d19caaa906a..edcc1b6daae46 100644 --- a/python/ray/data/_internal/bulk_dataset_iterator.py +++ b/python/ray/data/_internal/dataset_iterator_impl.py @@ -11,7 +11,7 @@ from ray.data import Dataset -class BulkDatasetIterator(DatasetIterator): +class DatasetIteratorImpl(DatasetIterator): def __init__( self, base_dataset: "Dataset", diff --git a/python/ray/data/_internal/execution/bulk_executor.py b/python/ray/data/_internal/execution/bulk_executor.py index 53ca0e59b8023..e4e522b97dd8b 100644 --- a/python/ray/data/_internal/execution/bulk_executor.py +++ b/python/ray/data/_internal/execution/bulk_executor.py @@ -5,6 +5,7 @@ from ray.data._internal.execution.interfaces import ( Executor, ExecutionOptions, + OutputIterator, RefBundle, PhysicalOperator, ) @@ -81,7 +82,7 @@ def execute_recursive(op: PhysicalOperator) -> List[RefBundle]: ) return output - return execute_recursive(dag) + return OutputIterator(execute_recursive(dag)) def get_stats(self) -> DatasetStats: return self._stats diff --git a/python/ray/data/_internal/execution/interfaces.py b/python/ray/data/_internal/execution/interfaces.py index 2aee8c531eb93..6b52c90df210a 100644 --- a/python/ray/data/_internal/execution/interfaces.py +++ 
b/python/ray/data/_internal/execution/interfaces.py @@ -1,5 +1,5 @@ from dataclasses import dataclass -from typing import Dict, List, Optional, Iterable, Tuple, Callable +from typing import Dict, List, Optional, Iterable, Iterator, Tuple, Callable, Union import ray from ray.data._internal.logical.interfaces import Operator @@ -9,6 +9,9 @@ from ray.data.context import DatasetContext from ray.types import ObjectRef +# Node id string returned by `ray.get_runtime_context().get_node_id()`. +NodeIdStr = str + @dataclass class RefBundle: @@ -38,6 +41,9 @@ class RefBundle: # output splits. It is otherwise None. output_split_idx: Optional[int] = None + # Cached location, used for get_cached_location(). + _cached_location: Optional[NodeIdStr] = None + def __post_init__(self): for b in self.blocks: assert isinstance(b, tuple), b @@ -74,6 +80,28 @@ def destroy_if_owned(self) -> int: trace_deallocation(b[0], "RefBundle.destroy_if_owned", free=should_free) return self.size_bytes() if should_free else 0 + def get_cached_location(self) -> Optional[NodeIdStr]: + """Return a location for this bundle's data, if possible. + + Caches the resolved location so multiple calls to this are efficient. + """ + if self._cached_location is None: + # Only consider the first block in the bundle for now. TODO(ekl) consider + # taking into account other blocks. + ref = self.blocks[0][0] + # This call is pretty fast for owned objects (~5k/s), so we don't need to + # batch it for now. + locs = ray.experimental.get_object_locations([ref]) + nodes = locs[ref]["node_ids"] + if nodes: + self._cached_location = nodes[0] + else: + self._cached_location = "" + if self._cached_location: + return self._cached_location + else: + return None # Return None if cached location is "". 
+ def __eq__(self, other) -> bool: return self is other @@ -167,14 +195,16 @@ class ExecutionOptions: resource_limits: ExecutionResources = ExecutionResources() # Set this to prefer running tasks on the same node as the output - # node (node driving the execution). - locality_with_output: bool = False + # node (node driving the execution). It can also be set to a list of node ids + # to spread the outputs across those nodes. + locality_with_output: Union[bool, List[NodeIdStr]] = False # Set this to preserve the ordering between blocks processed by operators under the # streaming executor. The bulk executor always preserves order. preserve_order: bool = False - # Whether to enable locality-aware task dispatch to actors (on by default). + # Whether to enable locality-aware task dispatch to actors (on by default). This + # applies to both ActorPoolStrategy map and streaming_split operations. actor_locality_enabled: bool = True @@ -334,6 +364,15 @@ def get_work_refs(self) -> List[ray.ObjectRef]: """ return [] + def throttling_disabled(self) -> bool: + """Whether to disable resource throttling for this operator. + + This should return True for operators that only manipulate bundle metadata + (e.g., the OutputSplitter operator). This hints to the execution engine that + these operators should not be throttled based on resource usage. + """ + return False + def num_active_work_refs(self) -> int: """Return the number of active work refs. @@ -390,6 +429,34 @@ def incremental_resource_usage(self) -> ExecutionResources: return ExecutionResources() +class OutputIterator(Iterator[RefBundle]): + """Iterator used to access the output of an Executor execution. + + This is a blocking iterator. Datasets guarantees that all its iterators are + thread-safe (i.e., multiple threads can block on them at the same time). 
+ """ + + def __init__(self, base: Iterable[RefBundle]): + self._it = iter(base) + + def get_next(self, output_split_idx: Optional[int] = None) -> RefBundle: + """Can be used to pull outputs by a specified output index. + + This is used to support the streaming_split() API, where the output of a + streaming execution is to be consumed by multiple processes. + + Args: + output_split_idx: The output split index to get results for. This arg is + only allowed for iterators created by `Dataset.streaming_split()`. + """ + if output_split_idx is not None: + raise NotImplementedError() + return next(self._it) + + def __next__(self) -> RefBundle: + return self.get_next() + + class Executor: """Abstract class for executors, which implement physical operator execution. @@ -404,7 +471,7 @@ def __init__(self, options: ExecutionOptions): def execute( self, dag: PhysicalOperator, initial_stats: Optional[DatasetStats] = None - ) -> Iterable[RefBundle]: + ) -> OutputIterator: """Start execution. Args: diff --git a/python/ray/data/_internal/execution/legacy_compat.py b/python/ray/data/_internal/execution/legacy_compat.py index 9d01e88b0b188..ddacb80510305 100644 --- a/python/ray/data/_internal/execution/legacy_compat.py +++ b/python/ray/data/_internal/execution/legacy_compat.py @@ -41,21 +41,43 @@ def execute_to_legacy_block_iterator( allow_clear_input_blocks: bool, dataset_uuid: str, ) -> Iterator[ObjectRef[Block]]: - """Execute a plan with the new executor and return a block iterator. 
+ """Same as execute_to_legacy_bundle_iterator but returning blocks.""" + bundle_iter = execute_to_legacy_bundle_iterator( + executor, plan, allow_clear_input_blocks, dataset_uuid + ) + for bundle in bundle_iter: + for block, _ in bundle.blocks: + yield block + + +def execute_to_legacy_bundle_iterator( + executor: Executor, + plan: ExecutionPlan, + allow_clear_input_blocks: bool, + dataset_uuid: str, + dag_rewrite=None, +) -> Iterator[RefBundle]: + """Execute a plan with the new executor and return a bundle iterator. Args: executor: The executor to use. plan: The legacy plan to execute. allow_clear_input_blocks: Whether the executor may consider clearing blocks. dataset_uuid: UUID of the dataset for this execution. + dag_rewrite: Callback that can be used to mutate the DAG prior to execution. + This is currently used as a legacy hack to inject the OutputSplit operator + for `Dataset.streaming_split()`. Returns: - The output as a block iterator. + The output as a bundle iterator. """ + if DatasetContext.get_current().optimizer_enabled: dag, stats = get_execution_plan(plan._logical_plan).dag, None else: dag, stats = _to_operator_dag(plan, allow_clear_input_blocks) + if dag_rewrite: + dag = dag_rewrite(dag) # Enforce to preserve ordering if the plan has stages required to do so, such as # Zip and Sort. 
@@ -64,10 +86,7 @@ def execute_to_legacy_block_iterator( executor._options.preserve_order = True bundle_iter = executor.execute(dag, initial_stats=stats) - - for bundle in bundle_iter: - for block, _ in bundle.blocks: - yield block + return bundle_iter def execute_to_legacy_block_list( diff --git a/python/ray/data/_internal/execution/operators/actor_pool_map_operator.py b/python/ray/data/_internal/execution/operators/actor_pool_map_operator.py index cd91bdf9740da..286f16c1f228a 100644 --- a/python/ray/data/_internal/execution/operators/actor_pool_map_operator.py +++ b/python/ray/data/_internal/execution/operators/actor_pool_map_operator.py @@ -12,6 +12,7 @@ ExecutionOptions, PhysicalOperator, TaskContext, + NodeIdStr, ) from ray.data._internal.execution.operators.map_operator import ( MapOperator, @@ -21,9 +22,6 @@ from ray.types import ObjectRef from ray._raylet import ObjectRefGenerator -# Type alias for a node id. -NodeIdStr = str - # Higher values here are better for prefetching and locality. It's ok for this to be # fairly high since streaming backpressure prevents us from overloading actors. DEFAULT_MAX_TASKS_IN_FLIGHT = 4 @@ -656,14 +654,4 @@ def _get_location(self, bundle: RefBundle) -> Optional[NodeIdStr]: Returns: A node id associated with the bundle, or None if unknown. """ - # Only consider the first block in the bundle for now. TODO(ekl) consider - # taking into account other blocks. - ref = bundle.blocks[0][0] - # This call is pretty fast for owned objects (~5k/s), so we don't need to - # batch it for now. 
- locs = ray.experimental.get_object_locations([ref]) - nodes = locs[ref]["node_ids"] - if nodes: - return nodes[0] - else: - return None + return bundle.get_cached_location() diff --git a/python/ray/data/_internal/execution/operators/map_operator.py b/python/ray/data/_internal/execution/operators/map_operator.py index 3a55725a234a2..57fb4e7d0b4a1 100644 --- a/python/ray/data/_internal/execution/operators/map_operator.py +++ b/python/ray/data/_internal/execution/operators/map_operator.py @@ -1,4 +1,5 @@ from abc import ABC, abstractmethod +import copy from dataclasses import dataclass import itertools from typing import List, Iterator, Any, Dict, Optional, Union @@ -46,6 +47,7 @@ def __init__( self._transform_fn = transform_fn self._ray_remote_args = _canonicalize_ray_remote_args(ray_remote_args or {}) + self._ray_remote_args_factory = None # Bundles block references up to the min_rows_per_bundle target. self._block_ref_bundler = _BlockRefBundler(min_rows_per_bundle) @@ -132,14 +134,30 @@ def start(self, options: "ExecutionOptions"): self._output_queue = _OrderedOutputQueue() else: self._output_queue = _UnorderedOutputQueue() + if options.locality_with_output: - # Try to schedule tasks locally. 
- self._ray_remote_args[ - "scheduling_strategy" - ] = NodeAffinitySchedulingStrategy( - ray.get_runtime_context().get_node_id(), - soft=True, - ) + if isinstance(options.locality_with_output, list): + locs = options.locality_with_output + else: + locs = [ray.get_runtime_context().get_node_id()] + + class RoundRobinAssign: + def __init__(self, locs): + self.locs = locs + self.i = 0 + + def __call__(self, args): + args = copy.deepcopy(args) + args["scheduling_strategy"] = NodeAffinitySchedulingStrategy( + self.locs[self.i], + soft=True, + ) + self.i += 1 + self.i %= len(self.locs) + return args + + self._ray_remote_args_factory = RoundRobinAssign(locs) + # Put the function def in the object store to avoid repeated serialization # in case it's large (i.e., closure captures large objects). self._transform_fn_ref = ray.put(self._transform_fn) @@ -159,6 +177,11 @@ def add_input(self, refs: RefBundle, input_index: int): bundle = self._block_ref_bundler.get_next_bundle() self._add_bundled_input(bundle) + def _get_runtime_ray_remote_args(self) -> Dict[str, Any]: + if self._ray_remote_args_factory: + return self._ray_remote_args_factory(self._ray_remote_args) + return self._ray_remote_args + @abstractmethod def _add_bundled_input(self, refs: RefBundle): """Add a pre-bundled upstream output to this operator. 
diff --git a/python/ray/data/_internal/execution/operators/output_splitter.py b/python/ray/data/_internal/execution/operators/output_splitter.py index 41971bb21cb4c..b75fcce0e09f9 100644 --- a/python/ray/data/_internal/execution/operators/output_splitter.py +++ b/python/ray/data/_internal/execution/operators/output_splitter.py @@ -1,5 +1,5 @@ import math -from typing import List +from typing import List, Dict, Optional from ray.data.block import Block, BlockMetadata, BlockAccessor from ray.data._internal.remote_fn import cached_remote_fn @@ -7,7 +7,9 @@ from ray.data._internal.execution.interfaces import ( RefBundle, PhysicalOperator, + ExecutionOptions, ExecutionResources, + NodeIdStr, ) from ray.types import ObjectRef @@ -17,10 +19,12 @@ class OutputSplitter(PhysicalOperator): The output bundles of this operator will have a `bundle.output_split_idx` attr set to an integer from [0..n-1]. This operator tries to divide the rows evenly - across output splits. + across output splits. If the `equal` option is set, the operator will furthermore + guarantee an exact split of rows across outputs, truncating the Dataset as needed. - If the `equal` option is set, the operator will furthermore guarantee an exact - split of rows across outputs, truncating the Dataset as needed. + Implementation wise, this operator keeps an internal buffer of bundles. The buffer + has a minimum size calculated to enable a good locality hit rate, as well as ensure + we can satisfy the `equal` requirement. OutputSplitter does not provide any ordering guarantees. """ @@ -30,6 +34,7 @@ def __init__( input_op: PhysicalOperator, n: int, equal: bool, + locality_hints: Optional[List[NodeIdStr]] = None, ): super().__init__(f"split({n}, equal={equal})", [input_op]) self._equal = equal @@ -40,6 +45,40 @@ def __init__( # The number of rows output to each output split so far. 
self._num_output: List[int] = [0 for _ in range(n)] + if locality_hints is not None: + if n != len(locality_hints): + raise ValueError( + "Locality hints list must have length `n`: " + f"len({locality_hints}) != {n}" + ) + self._locality_hints = locality_hints + if locality_hints: + # To optimize locality, we should buffer a certain number of elements + # internally before dispatch to allow the locality algorithm a good chance + # of selecting a preferred location. We use a small multiple of `n` since + # it's reasonable to buffer a couple blocks per consumer. + self._min_buffer_size = 2 * n + else: + self._min_buffer_size = 0 + self._locality_hits = 0 + self._locality_misses = 0 + + def start(self, options: ExecutionOptions) -> None: + super().start(options) + # Force disable locality optimization. + if not options.actor_locality_enabled: + self._locality_hints = None + self._min_buffer_size = 0 + + def throttling_disabled(self) -> bool: + """Disables resource-based throttling. + + It doesn't make sense to throttle the inputs to this operator, since all that + would do is lower the buffer size and prevent us from emitting outputs / + reduce the locality hit rate. + """ + return True + def has_next(self) -> bool: return len(self._output_queue) > 0 @@ -47,6 +86,9 @@ def get_next(self) -> RefBundle: return self._output_queue.pop() def get_stats(self) -> StatsDict: + return {"split": []} # TODO(ekl) add split metrics? + + def get_metrics(self) -> Dict[str, int]: stats = {} for i, num in enumerate(self._num_output): stats[f"num_output_{i}"] = num @@ -59,9 +101,10 @@ def add_input(self, bundle, input_index) -> None: self._dispatch_bundles() def inputs_done(self) -> None: + super().inputs_done() if not self._equal: - # There shouldn't be any buffered data if we're not in equal split mode. - assert not self._buffer + self._dispatch_bundles(dispatch_all=True) + assert not self._buffer, "Should have dispatched all bundles." 
return # Otherwise: @@ -97,21 +140,31 @@ def current_resource_usage(self) -> ExecutionResources: ) def progress_str(self) -> str: - if self._equal: - return f"{len(self._buffer)} buffered" - assert not self._buffer - return "" + if self._locality_hints: + return ( + f"[{self._locality_hits} locality hits, {self._locality_misses} misses]" + ) + else: + return "[locality disabled]" - def _dispatch_bundles(self) -> None: + def _dispatch_bundles(self, dispatch_all: bool = False) -> None: # Dispatch all dispatchable bundles from the internal buffer. # This may not dispatch all bundles when equal=True. - while self._buffer: + while self._buffer and ( + dispatch_all or len(self._buffer) >= self._min_buffer_size + ): target_index = self._select_output_index() target_bundle = self._pop_bundle_to_dispatch(target_index) if self._can_safely_dispatch(target_index, target_bundle.num_rows()): target_bundle.output_split_idx = target_index self._num_output[target_index] += target_bundle.num_rows() self._output_queue.append(target_bundle) + if self._locality_hints: + preferred_loc = self._locality_hints[target_index] + if self._get_location(target_bundle) == preferred_loc: + self._locality_hits += 1 + else: + self._locality_misses += 1 else: # Put it back and abort. self._buffer.insert(0, target_bundle) @@ -123,7 +176,12 @@ def _select_output_index(self) -> int: return i def _pop_bundle_to_dispatch(self, target_index: int) -> RefBundle: - # TODO implement locality aware bundle selection. 
+ if self._locality_hints: + preferred_loc = self._locality_hints[target_index] + for bundle in self._buffer: + if self._get_location(bundle) == preferred_loc: + self._buffer.remove(bundle) + return bundle return self._buffer.pop(0) def _can_safely_dispatch(self, target_index: int, nrow: int) -> bool: @@ -160,6 +218,16 @@ def _split_from_buffer(self, nrow: int) -> List[RefBundle]: assert sum(b.num_rows() for b in output) == nrow, (acc, nrow) return output + def _get_location(self, bundle: RefBundle) -> Optional[NodeIdStr]: + """Ask Ray for the node id of the given bundle. + + This method may be overriden for testing. + + Returns: + A node id associated with the bundle, or None if unknown. + """ + return bundle.get_cached_location() + def _split(bundle: RefBundle, left_size: int) -> (RefBundle, RefBundle): left_blocks, left_meta = [], [] diff --git a/python/ray/data/_internal/execution/operators/task_pool_map_operator.py b/python/ray/data/_internal/execution/operators/task_pool_map_operator.py index abeec33e1904c..89d51d7857ac6 100644 --- a/python/ray/data/_internal/execution/operators/task_pool_map_operator.py +++ b/python/ray/data/_internal/execution/operators/task_pool_map_operator.py @@ -52,9 +52,9 @@ def _add_bundled_input(self, bundle: RefBundle): map_task = cached_remote_fn(_map_task, num_returns="dynamic") input_blocks = [block for block, _ in bundle.blocks] ctx = TaskContext(task_idx=self._next_task_idx) - ref = map_task.options(**self._ray_remote_args, name=self.name).remote( - self._transform_fn_ref, ctx, *input_blocks - ) + ref = map_task.options( + **self._get_runtime_ray_remote_args(), name=self.name + ).remote(self._transform_fn_ref, ctx, *input_blocks) self._next_task_idx += 1 task = _TaskState(bundle) self._tasks[ref] = task diff --git a/python/ray/data/_internal/execution/operators/zip_operator.py b/python/ray/data/_internal/execution/operators/zip_operator.py new file mode 100644 index 0000000000000..3238948ff3e42 --- /dev/null +++ 
b/python/ray/data/_internal/execution/operators/zip_operator.py @@ -0,0 +1,252 @@ +import itertools +from typing import Callable, List, Optional, Tuple + +import ray +from ray.data._internal.delegating_block_builder import DelegatingBlockBuilder +from ray.data._internal.remote_fn import cached_remote_fn +from ray.data._internal.split import _split_at_indices +from ray.data._internal.stats import StatsDict +from ray.data._internal.execution.interfaces import ( + RefBundle, + PhysicalOperator, +) +from ray.data.block import ( + Block, + BlockAccessor, + BlockExecStats, + BlockMetadata, + BlockPartition, +) + + +class ZipOperator(PhysicalOperator): + """An operator that zips its inputs together. + + NOTE: the implementation is bulk for now, which materializes all its inputs in + object store, before starting execution. Should re-implement it as a streaming + operator in the future. + """ + + def __init__( + self, + left_input_op: PhysicalOperator, + right_input_op: PhysicalOperator, + ): + """Create a ZipOperator. + + Args: + left_input_ops: The input operator at left hand side. + right_input_op: The input operator at right hand side. 
+ """ + self._left_buffer: List[RefBundle] = [] + self._right_buffer: List[RefBundle] = [] + self._output_buffer: List[RefBundle] = [] + self._stats: StatsDict = {} + super().__init__("Zip", [left_input_op, right_input_op]) + + def num_outputs_total(self) -> Optional[int]: + left_num_outputs = self.input_dependencies[0].num_outputs_total() + right_num_outputs = self.input_dependencies[1].num_outputs_total() + if left_num_outputs is not None and right_num_outputs is not None: + return max(left_num_outputs, right_num_outputs) + elif left_num_outputs is not None: + return left_num_outputs + else: + return right_num_outputs + + def add_input(self, refs: RefBundle, input_index: int) -> None: + assert not self.completed() + assert input_index == 0 or input_index == 1, input_index + if input_index == 0: + self._left_buffer.append(refs) + else: + self._right_buffer.append(refs) + + def inputs_done(self) -> None: + self._output_buffer, self._stats = self._zip( + self._left_buffer, self._right_buffer + ) + self._left_buffer.clear() + self._right_buffer.clear() + super().inputs_done() + + def has_next(self) -> bool: + return len(self._output_buffer) > 0 + + def get_next(self) -> RefBundle: + return self._output_buffer.pop(0) + + def get_stats(self) -> StatsDict: + return self._stats + + def get_transformation_fn(self) -> Callable: + return self._zip + + def _zip( + self, left_input: List[RefBundle], right_input: List[RefBundle] + ) -> Tuple[List[RefBundle], StatsDict]: + """Zip the RefBundles from `left_input` and `right_input` together. + + Zip is done in 2 steps: aligning blocks, and zipping blocks from + both sides. + + Aligning blocks (optional): check the blocks from `left_input` and + `right_input` are aligned or not, i.e. if having different number of blocks, or + having different number of rows in some blocks. If not aligned, repartition the + smaller input with `_split_at_indices` to align with larger input. 
+ + Zipping blocks: after blocks from both sides are aligned, zip + blocks from both sides together in parallel. + """ + left_blocks_with_metadata = [] + for bundle in left_input: + for block, meta in bundle.blocks: + left_blocks_with_metadata.append((block, meta)) + right_blocks_with_metadata = [] + for bundle in right_input: + for block, meta in bundle.blocks: + right_blocks_with_metadata.append((block, meta)) + + left_block_rows, left_block_bytes = self._calculate_blocks_rows_and_bytes( + left_blocks_with_metadata + ) + right_block_rows, right_block_bytes = self._calculate_blocks_rows_and_bytes( + right_blocks_with_metadata + ) + + # Check that both sides have the same number of rows. + # TODO(Clark): Support different number of rows via user-directed + # dropping/padding. + total_left_rows = sum(left_block_rows) + total_right_rows = sum(right_block_rows) + if total_left_rows != total_right_rows: + raise ValueError( + "Cannot zip datasets of different number of rows: " + f"{total_left_rows}, {total_right_rows}" + ) + + # Whether the left and right input sides are inverted + input_side_inverted = False + if sum(right_block_bytes) > sum(left_block_bytes): + # Make sure that right side is smaller, so we minimize splitting + # work when aligning both sides. + # TODO(Clark): Improve this heuristic for minimizing splitting work, + # e.g. by generating the splitting plans for each route (via + # _generate_per_block_split_indices) and choosing the plan that splits + # the least cumulative bytes. + left_blocks_with_metadata, right_blocks_with_metadata = ( + right_blocks_with_metadata, + left_blocks_with_metadata, + ) + left_block_rows, right_block_rows = right_block_rows, left_block_rows + input_side_inverted = True + + # Get the split indices that will align both sides. 
+ indices = list(itertools.accumulate(left_block_rows)) + indices.pop(-1) + + # Split other at the alignment indices, such that for every block from + # left side, we have a list of blocks from right side that have the same + # cumulative number of rows as that left block. + # NOTE: _split_at_indices has a no-op fastpath if the blocks are already + # aligned. + aligned_right_blocks_with_metadata = _split_at_indices( + right_blocks_with_metadata, + indices, + block_rows=right_block_rows, + ) + del right_blocks_with_metadata + + left_blocks = [b for b, _ in left_blocks_with_metadata] + right_blocks_list = aligned_right_blocks_with_metadata[0] + del left_blocks_with_metadata, aligned_right_blocks_with_metadata + + zip_one_block = cached_remote_fn(_zip_one_block, num_returns=2) + + output_blocks = [] + output_metadata = [] + for left_block, right_blocks in zip(left_blocks, right_blocks_list): + # For each block from left side, zip it together with 1 or more blocks from + # right side. We're guaranteed to have that left_block has the same number + # of rows as right_blocks has cumulatively. + res, meta = zip_one_block.remote( + left_block, *right_blocks, inverted=input_side_inverted + ) + output_blocks.append(res) + output_metadata.append(meta) + + # Early release memory. + del left_blocks, right_blocks_list + + # TODO(ekl) it might be nice to have a progress bar here. + output_metadata = ray.get(output_metadata) + output_refs = [] + input_owned = all(b.owns_blocks for b in left_input) + for block, meta in zip(output_blocks, output_metadata): + output_refs.append( + RefBundle( + [ + ( + block, + meta, + ) + ], + owns_blocks=input_owned, + ) + ) + stats = {self._name: output_metadata} + + # Clean up inputs. 
+ for ref in left_input: + ref.destroy_if_owned() + for ref in right_input: + ref.destroy_if_owned() + + return output_refs, stats + + def _calculate_blocks_rows_and_bytes( + self, + blocks_with_metadata: BlockPartition, + ) -> Tuple[List[int], List[int]]: + """Calculate the number of rows and size in bytes for a list of blocks with + metadata. + """ + get_num_rows_and_bytes = cached_remote_fn(_get_num_rows_and_bytes) + block_rows = [] + block_bytes = [] + for block, metadata in blocks_with_metadata: + if metadata.num_rows is None or metadata.size_bytes is None: + # Need to fetch number of rows or size in bytes, so just fetch both. + num_rows, size_bytes = ray.get(get_num_rows_and_bytes.remote(block)) + # Cache on the block metadata. + metadata.num_rows = num_rows + metadata.size_bytes = size_bytes + block_rows.append(metadata.num_rows) + block_bytes.append(metadata.size_bytes) + return block_rows, block_bytes + + +def _zip_one_block( + block: Block, *other_blocks: Block, inverted: bool = False +) -> Tuple[Block, BlockMetadata]: + """Zip together `block` with `other_blocks`.""" + stats = BlockExecStats.builder() + # Concatenate other blocks. + # TODO(Clark): Extend BlockAccessor.zip() to work with N other blocks, + # so we don't need to do this concatenation. + builder = DelegatingBlockBuilder() + for other_block in other_blocks: + builder.add_block(other_block) + other_block = builder.build() + if inverted: + # Swap blocks if ordering was inverted during block alignment splitting. + block, other_block = other_block, block + # Zip block and other blocks. 
+ result = BlockAccessor.for_block(block).zip(other_block) + br = BlockAccessor.for_block(result) + return result, br.get_metadata(input_files=[], exec_stats=stats.build()) + + +def _get_num_rows_and_bytes(block: Block) -> Tuple[int, int]: + block = BlockAccessor.for_block(block) + return block.num_rows(), block.size_bytes() diff --git a/python/ray/data/_internal/execution/streaming_executor.py b/python/ray/data/_internal/execution/streaming_executor.py index b22f14c435fe7..68b587fb8ceeb 100644 --- a/python/ray/data/_internal/execution/streaming_executor.py +++ b/python/ray/data/_internal/execution/streaming_executor.py @@ -10,6 +10,7 @@ Executor, ExecutionOptions, ExecutionResources, + OutputIterator, RefBundle, PhysicalOperator, ) @@ -83,18 +84,30 @@ def execute( self._output_node: OpState = self._topology[dag] self.start() - # Drain items from the runner thread until completion. - try: - item = self._output_node.get_output_blocking() - while item is not None: - if isinstance(item, Exception): - raise item - else: - self._output_info.update(1) - yield item - item = self._output_node.get_output_blocking() - finally: - self.shutdown() + class StreamIterator(OutputIterator): + def __init__(self, outer: Executor): + self._outer = outer + + def get_next(self, output_split_idx: Optional[int] = None) -> RefBundle: + try: + item = self._outer._output_node.get_output_blocking( + output_split_idx + ) + # Translate the special sentinel values for MaybeRefBundle into + # exceptions. + if item is None: + raise StopIteration + elif isinstance(item, Exception): + raise item + else: + # Otherwise return a concrete RefBundle. 
+ self._outer._output_info.update(1) + return item + except Exception: + self._outer.shutdown() + raise + + return StreamIterator(self) def shutdown(self): with self._shutdown_lock: diff --git a/python/ray/data/_internal/execution/streaming_executor_state.py b/python/ray/data/_internal/execution/streaming_executor_state.py index ed43f5f5d346b..34530ed145a66 100644 --- a/python/ray/data/_internal/execution/streaming_executor_state.py +++ b/python/ray/data/_internal/execution/streaming_executor_state.py @@ -144,7 +144,7 @@ def dispatch_next_task(self) -> None: return assert False, "Nothing to dispatch" - def get_output_blocking(self) -> MaybeRefBundle: + def get_output_blocking(self, output_split_idx: Optional[int]) -> MaybeRefBundle: """Get an item from this node's output queue, blocking as needed. Returns: @@ -152,9 +152,27 @@ def get_output_blocking(self) -> MaybeRefBundle: """ while True: try: - return self.outqueue.popleft() + # Non-split output case. + if output_split_idx is None: + return self.outqueue.popleft() + + # Scan the queue and look for outputs tagged for the given index. + for i in range(len(self.outqueue)): + bundle = self.outqueue[i] + if bundle is None or isinstance(bundle, Exception): + # End of stream for this index! Note that we + # do not remove the None, so that it can act + # as the termination signal for all indices. + return bundle + elif bundle.output_split_idx == output_split_idx: + self.outqueue.remove(bundle) + return bundle + + # Didn't find any outputs matching this index, repeat the loop until + # we find one or hit a None. except IndexError: - time.sleep(0.01) + pass + time.sleep(0.01) def outqueue_memory_usage(self) -> int: """Return the object store memory of this operator's outqueue. @@ -306,9 +324,14 @@ def select_operator_to_run( if not ops: return None - # Equally penalize outqueue length and num bundles processing for backpressure. + # Run metadata-only operators first. 
After that, equally penalize outqueue length + # and num bundles processing for backpressure. return min( - ops, key=lambda op: len(topology[op].outqueue) + topology[op].num_processing() + ops, + key=lambda op: ( + not op.throttling_disabled(), + len(topology[op].outqueue) + topology[op].num_processing(), + ), ) @@ -335,6 +358,10 @@ def _execution_allowed( Returns: Whether the op is allowed to run. """ + + if op.throttling_disabled(): + return True + assert isinstance(global_usage, TopologyResourceUsage), global_usage # To avoid starvation problems when dealing with fractional resource types, # convert all quantities to integer (0 or 1) for deciding admissibility. This diff --git a/python/ray/data/_internal/logical/operators/map_operator.py b/python/ray/data/_internal/logical/operators/map_operator.py index 413cfabbe6457..7fad1807ba22f 100644 --- a/python/ray/data/_internal/logical/operators/map_operator.py +++ b/python/ray/data/_internal/logical/operators/map_operator.py @@ -8,7 +8,6 @@ ) from ray.data.block import BatchUDF, RowUDF from ray.data.context import DEFAULT_BATCH_SIZE -from ray.data.datasource import Datasource if sys.version_info >= (3, 8): @@ -142,26 +141,6 @@ def __init__( ) -class Write(AbstractUDFMap): - """Logical operator for write.""" - - def __init__( - self, - input_op: LogicalOperator, - datasource: Datasource, - ray_remote_args: Optional[Dict[str, Any]] = None, - **write_args, - ): - super().__init__( - "Write", - input_op, - fn=lambda x: x, - ray_remote_args=ray_remote_args, - ) - self._datasource = datasource - self._write_args = write_args - - class Filter(AbstractUDFMap): """Logical operator for filter.""" diff --git a/python/ray/data/_internal/logical/operators/n_ary_operator.py b/python/ray/data/_internal/logical/operators/n_ary_operator.py new file mode 100644 index 0000000000000..145979f5ee3b5 --- /dev/null +++ b/python/ray/data/_internal/logical/operators/n_ary_operator.py @@ -0,0 +1,17 @@ +from ray.data._internal.logical.interfaces 
import LogicalOperator + + +class Zip(LogicalOperator): + """Logical operator for zip.""" + + def __init__( + self, + left_input_op: LogicalOperator, + right_input_op: LogicalOperator, + ): + """ + Args: + left_input_op: The input operator at left hand side. + right_input_op: The input operator at right hand side. + """ + super().__init__("Zip", [left_input_op, right_input_op]) diff --git a/python/ray/data/_internal/logical/optimizers.py b/python/ray/data/_internal/logical/optimizers.py index f1c0d83ff16bb..5d37da3952e0d 100644 --- a/python/ray/data/_internal/logical/optimizers.py +++ b/python/ray/data/_internal/logical/optimizers.py @@ -10,6 +10,7 @@ OperatorFusionRule, ReorderRandomizeBlocksRule, ) +from ray.data._internal.logical.util import record_operators_usage from ray.data._internal.planner.planner import Planner @@ -37,6 +38,8 @@ def get_execution_plan(logical_plan: LogicalPlan) -> PhysicalPlan: (2) planning: convert logical to physical operators. (3) physical optimization: optimize physical operators. """ + # Record usage of logical operators. + record_operators_usage(logical_plan.dag) logical_plan = LogicalOptimizer().optimize(logical_plan) physical_plan = Planner().plan(logical_plan) diff --git a/python/ray/data/_internal/logical/util.py b/python/ray/data/_internal/logical/util.py new file mode 100644 index 0000000000000..3a32f3b6c55f4 --- /dev/null +++ b/python/ray/data/_internal/logical/util.py @@ -0,0 +1,87 @@ +from typing import Dict +import json +import threading + +from ray._private.usage.usage_lib import TagKey, record_extra_usage_tag +from ray.data._internal.logical.interfaces import LogicalOperator +from ray.data._internal.logical.operators.read_operator import Read +from ray.data._internal.logical.operators.write_operator import Write + +# The dictionary for the operator name and count. +_recorded_operators = dict() +_recorded_operators_lock = threading.Lock() + +# The white list of operator names allowed to be recorded.
+_op_name_white_list = [ + # Read + "ReadRange", + "ReadMongo", + "ReadParquet", + "ReadParquetBulk", + "ReadImage", + "ReadJSON", + "ReadCSV", + "ReadText", + "ReadNumpy", + "ReadTFRecord", + "ReadBinary", + "ReadCustom", + # Write + "WriteParquet", + "WriteJSON", + "WriteCSV", + "WriteTFRecord", + "WriteNumpy", + "WriteMongo", + "WriteCustom", + # Map + "MapBatches", + "MapRows", + "Filter", + "FlatMap", + # All-to-all + "RandomizeBlocks", + "RandomShuffle", + "Repartition", + "Sort", + "Aggregate", +] + + +def record_operators_usage(op: LogicalOperator): + """Record logical operator usage with Ray telemetry.""" + ops_dict = dict() + _collect_operators_to_dict(op, ops_dict) + ops_json_str = "" + with _recorded_operators_lock: + for op, count in ops_dict.items(): + _recorded_operators.setdefault(op, 0) + _recorded_operators[op] += count + ops_json_str = json.dumps(_recorded_operators) + + record_extra_usage_tag(TagKey.DATA_LOGICAL_OPS, ops_json_str) + + +def _collect_operators_to_dict(op: LogicalOperator, ops_dict: Dict[str, int]): + """Collect the logical operator name and count into `ops_dict`.""" + for child in op.input_dependencies: + _collect_operators_to_dict(child, ops_dict) + + op_name = op.name + + # Check read and write operator, and anonymize user-defined data source. + if isinstance(op, Read): + op_name = f"Read{op._datasource.get_name()}" + if op_name not in _op_name_white_list: + op_name = "ReadCustom" + elif isinstance(op, Write): + op_name = f"Write{op._datasource.get_name()}" + if op_name not in _op_name_white_list: + op_name = "WriteCustom" + + # Anonymize any operator name if not in white list. 
+ if op_name not in _op_name_white_list: + op_name = "Unknown" + + ops_dict.setdefault(op_name, 0) + ops_dict[op_name] += 1 diff --git a/python/ray/data/_internal/plan.py b/python/ray/data/_internal/plan.py index 7b50da8129a9b..4976a63ab89bb 100644 --- a/python/ray/data/_internal/plan.py +++ b/python/ray/data/_internal/plan.py @@ -238,6 +238,51 @@ def get_plan_as_string(self) -> str: num_blocks, count, schema_str ) + # If the resulting string representation fits in one line, use it directly. + SCHEMA_LINE_CHAR_LIMIT = 80 + MIN_FIELD_LENGTH = 10 + INDENT_STR = " " * 3 + trailing_space = " " * (max(num_stages, 0) * 3) + if len(dataset_str) > SCHEMA_LINE_CHAR_LIMIT: + # If the resulting string representation exceeds the line char limit, + # first try breaking up each `Dataset` parameter into its own line + # and check if each line fits within the line limit. We check the + # `schema` param's length, since this is likely the longest string. + schema_str_on_new_line = f"{trailing_space}{INDENT_STR}schema={schema_str}" + if len(schema_str_on_new_line) > SCHEMA_LINE_CHAR_LIMIT: + # If the schema cannot fit on a single line, break up each field + # into its own line. + schema_str = [] + for n, t in zip(schema.names, schema.types): + if hasattr(t, "__name__"): + t = t.__name__ + col_str = f"{trailing_space}{INDENT_STR * 2}{n}: {t}" + # If the field line exceeds the char limit, abbreviate + # the field name to fit while maintaining the full type + if len(col_str) > SCHEMA_LINE_CHAR_LIMIT: + shortened_suffix = f"...: {str(t)}" + # Show at least 10 characters of the field name, even if + # we have already hit the line limit with the type. 
+ chars_left_for_col_name = max( + SCHEMA_LINE_CHAR_LIMIT - len(shortened_suffix), + MIN_FIELD_LENGTH, + ) + col_str = ( + f"{col_str[:chars_left_for_col_name]}{shortened_suffix}" + ) + schema_str.append(col_str) + schema_str = ",\n".join(schema_str) + schema_str = ( + "{\n" + schema_str + f"\n{trailing_space}{INDENT_STR}" + "}" + ) + dataset_str = ( + f"Dataset(" + f"\n{trailing_space}{INDENT_STR}num_blocks={num_blocks}," + f"\n{trailing_space}{INDENT_STR}num_rows={count}," + f"\n{trailing_space}{INDENT_STR}schema={schema_str}" + f"\n{trailing_space})" + ) + if num_stages == 0: plan_str = dataset_str else: diff --git a/python/ray/data/_internal/planner/plan_udf_map_op.py b/python/ray/data/_internal/planner/plan_udf_map_op.py index e02ab405ed10e..49a47ae76ffe8 100644 --- a/python/ray/data/_internal/planner/plan_udf_map_op.py +++ b/python/ray/data/_internal/planner/plan_udf_map_op.py @@ -14,13 +14,11 @@ FlatMap, MapBatches, MapRows, - Write, ) from ray.data._internal.planner.filter import generate_filter_fn from ray.data._internal.planner.flat_map import generate_flat_map_fn from ray.data._internal.planner.map_batches import generate_map_batches_fn from ray.data._internal.planner.map_rows import generate_map_rows_fn -from ray.data._internal.planner.write import generate_write_fn from ray.data.block import Block, CallableClass @@ -45,8 +43,6 @@ def _plan_udf_map_op( transform_fn = generate_flat_map_fn() elif isinstance(op, Filter): transform_fn = generate_filter_fn() - elif isinstance(op, Write): - transform_fn = generate_write_fn(op._datasource, **op._write_args) else: raise ValueError(f"Found unknown logical operator during planning: {op}") diff --git a/python/ray/data/_internal/planner/planner.py b/python/ray/data/_internal/planner/planner.py index af60ac1a4019b..8a27e968c614f 100644 --- a/python/ray/data/_internal/planner/planner.py +++ b/python/ray/data/_internal/planner/planner.py @@ -1,12 +1,14 @@ from typing import Dict from 
ray.data._internal.execution.interfaces import PhysicalOperator +from ray.data._internal.execution.operators.zip_operator import ZipOperator from ray.data._internal.logical.interfaces import ( LogicalOperator, LogicalPlan, PhysicalPlan, ) from ray.data._internal.logical.operators.all_to_all_operator import AbstractAllToAll +from ray.data._internal.logical.operators.n_ary_operator import Zip from ray.data._internal.logical.operators.read_operator import Read from ray.data._internal.logical.operators.write_operator import Write from ray.data._internal.logical.operators.map_operator import AbstractUDFMap @@ -49,6 +51,9 @@ def _plan(self, logical_op: LogicalOperator) -> PhysicalOperator: elif isinstance(logical_op, AbstractAllToAll): assert len(physical_children) == 1 physical_op = _plan_all_to_all_op(logical_op, physical_children[0]) + elif isinstance(logical_op, Zip): + assert len(physical_children) == 2 + physical_op = ZipOperator(physical_children[0], physical_children[1]) else: raise ValueError( f"Found unknown logical operator during planning: {logical_op}" diff --git a/python/ray/data/_internal/stats.py b/python/ray/data/_internal/stats.py index c4a490d56351f..270916b2c77e3 100644 --- a/python/ray/data/_internal/stats.py +++ b/python/ray/data/_internal/stats.py @@ -223,6 +223,16 @@ def __init__( self.iter_total_s: Timer = Timer() self.extra_metrics = {} + # Block fetch stats during iteration. + # These are stats about locations of blocks when the iterator is trying to + # consume them. The iteration performance will be affected depending on + # whether the block is in the local object store of the node where the + # iterator is running. + # This serves as an indicator of block prefetching effectiveness. 
+ self.iter_blocks_local: int = 0 + self.iter_blocks_remote: int = 0 + self.iter_unknown_location: int = 0 + @property def stats_actor(self): return _get_or_create_stats_actor() @@ -279,6 +289,9 @@ def to_summary(self) -> "DatasetStatsSummary": self.iter_format_batch_s, self.iter_user_s, self.iter_total_s, + self.iter_blocks_local, + self.iter_blocks_remote, + self.iter_unknown_location, ) stats_summary_parents = [] if self.parents is not None: @@ -616,6 +629,12 @@ class IterStatsSummary: user_time: Timer # Total time taken by Dataset iterator, in seconds total_time: Timer + # Num of blocks that are in local object store + iter_blocks_local: int + # Num of blocks that are in remote node and have to fetch locally + iter_blocks_remote: int + # Num of blocks with unknown locations + iter_unknown_location: int def __str__(self) -> str: out = "" @@ -629,6 +648,11 @@ def __str__(self) -> str: out += "\nDataset iterator time breakdown:\n" out += "* In ray.wait(): {}\n".format(fmt(self.wait_time.get())) out += "* In ray.get(): {}\n".format(fmt(self.get_time.get())) + out += "* Num blocks local: {}\n".format(self.iter_blocks_local) + out += "* Num blocks remote: {}\n".format(self.iter_blocks_remote) + out += "* Num blocks unknown location: {}\n".format( + self.iter_unknown_location + ) out += "* In next_batch(): {}\n".format(fmt(self.next_time.get())) out += "* In format_batch(): {}\n".format(fmt(self.format_time.get())) out += "* In user code: {}\n".format(fmt(self.user_time.get())) diff --git a/python/ray/data/_internal/stream_split_dataset_iterator.py b/python/ray/data/_internal/stream_split_dataset_iterator.py new file mode 100644 index 0000000000000..c7249d0c4db8f --- /dev/null +++ b/python/ray/data/_internal/stream_split_dataset_iterator.py @@ -0,0 +1,199 @@ +import copy +import logging +import sys +import threading +from typing import ( + List, + Dict, + Optional, + Iterator, + Callable, + Any, + Union, + TYPE_CHECKING, +) + +import ray + +from 
ray.data.dataset_iterator import DatasetIterator +from ray.data.block import Block, DataBatch +from ray.data.context import DatasetContext +from ray.data._internal.execution.streaming_executor import StreamingExecutor +from ray.data._internal.execution.legacy_compat import ( + execute_to_legacy_bundle_iterator, +) +from ray.data._internal.block_batching import batch_block_refs +from ray.data._internal.execution.operators.output_splitter import OutputSplitter +from ray.data._internal.execution.interfaces import NodeIdStr, RefBundle +from ray.types import ObjectRef +from ray.util.scheduling_strategies import NodeAffinitySchedulingStrategy + +if TYPE_CHECKING: + import pyarrow + from ray.data import Dataset + +if sys.version_info >= (3, 8): + from typing import Literal +else: + from typing_extensions import Literal + +logger = logging.getLogger(__name__) + + +class StreamSplitDatasetIterator(DatasetIterator): + """Implements a collection of iterators over a shared data stream.""" + + @staticmethod + def create( + base_dataset: "Dataset", + n: int, + equal: bool, + locality_hints: Optional[List[NodeIdStr]], + ) -> List["StreamSplitDatasetIterator"]: + """Create a split iterator from the given base Dataset and options. + + See also: `Dataset.streaming_split`. + """ + ctx = DatasetContext.get_current() + + # To avoid deadlock, the concurrency on this actor must be set to at least `n`. 
+ coord_actor = SplitCoordinator.options( + max_concurrency=n, + scheduling_strategy=NodeAffinitySchedulingStrategy( + ray.get_runtime_context().get_node_id(), soft=False + ), + ).remote(ctx, base_dataset, n, equal, locality_hints) + + return [ + StreamSplitDatasetIterator(base_dataset, coord_actor, i) for i in range(n) + ] + + def __init__( + self, + base_dataset: "Dataset", + coord_actor: ray.actor.ActorHandle, + output_split_idx: int, + ): + self._base_dataset = base_dataset + self._coord_actor = coord_actor + self._output_split_idx = output_split_idx + + def iter_batches( + self, + *, + prefetch_blocks: int = 0, + batch_size: int = 256, + batch_format: Literal["default", "numpy", "pandas"] = "default", + drop_last: bool = False, + local_shuffle_buffer_size: Optional[int] = None, + local_shuffle_seed: Optional[int] = None, + _collate_fn: Optional[Callable[[DataBatch], Any]] = None, + ) -> Iterator[DataBatch]: + """Implements DatasetIterator.""" + + def gen_blocks() -> Iterator[ObjectRef[Block]]: + future: ObjectRef[ + Optional[ObjectRef[Block]] + ] = self._coord_actor.get.remote(self._output_split_idx) + while True: + block_ref: Optional[ObjectRef[Block]] = ray.get(future) + if not block_ref: + break + else: + future = self._coord_actor.get.remote(self._output_split_idx) + yield block_ref + + yield from batch_block_refs( + gen_blocks(), + stats=None, + prefetch_blocks=prefetch_blocks, + batch_size=batch_size, + batch_format=batch_format, + drop_last=drop_last, + collate_fn=_collate_fn, + shuffle_buffer_min_size=local_shuffle_buffer_size, + shuffle_seed=local_shuffle_seed, + ) + + def stats(self) -> str: + """Implements DatasetIterator.""" + return self._base_dataset.stats() + + def schema(self) -> Union[type, "pyarrow.lib.Schema"]: + """Implements DatasetIterator.""" + return self._base_dataset.schema() + + +@ray.remote(num_cpus=0) +class SplitCoordinator: + """Coordinator actor for routing blocks to output splits. 
+ + This actor runs a streaming executor locally on its main thread. Clients can + retrieve results via actor calls running on other threads. + """ + + def __init__( + self, + ctx: DatasetContext, + dataset: "Dataset", + n: int, + equal: bool, + locality_hints: Optional[List[NodeIdStr]], + ): + # Automatically set locality with output to the specified location hints. + if locality_hints: + ctx.execution_options.locality_with_output = locality_hints + logger.info(f"Auto configuring locality_with_output={locality_hints}") + + DatasetContext._set_current(ctx) + self._base_dataset = dataset + self._n = n + self._equal = equal + self._locality_hints = locality_hints + self._finished = False + self._lock = threading.RLock() + # Guarded by self._lock. + self._next_bundle: Dict[int, RefBundle] = {} + + executor = StreamingExecutor(copy.deepcopy(ctx.execution_options)) + + def add_split_op(dag): + return OutputSplitter(dag, n, equal, locality_hints) + + self._output_iterator = execute_to_legacy_bundle_iterator( + executor, + dataset._plan, + True, + dataset._plan._dataset_uuid, + dag_rewrite=add_split_op, + ) + + def get(self, output_split_idx: int) -> Optional[ObjectRef[Block]]: + """Blocking get operation. + + This is intended to be called concurrently from multiple clients. + """ + try: + # Ensure there is at least one bundle. + with self._lock: + if output_split_idx in self._next_bundle: + next_bundle = self._next_bundle[output_split_idx] + else: + next_bundle = None + + # Fetch next bundle if needed. + if next_bundle is None: + # This is a BLOCKING call, so do it outside the lock. + next_bundle = self._output_iterator.get_next(output_split_idx) + + block = next_bundle.blocks.pop()[0] + + # Accumulate any remaining blocks in next_bundle map as needed. 
+ with self._lock: + self._next_bundle[output_split_idx] = next_bundle + if not next_bundle.blocks: + del self._next_bundle[output_split_idx] + + return block + except StopIteration: + return None diff --git a/python/ray/data/_internal/util.py b/python/ray/data/_internal/util.py index f947e44fd763f..8274423deaff8 100644 --- a/python/ray/data/_internal/util.py +++ b/python/ray/data/_internal/util.py @@ -343,7 +343,7 @@ def _consumption_api( """ base = ( " will trigger execution of the lazy transformations performed on " - "this dataset, and will block until execution completes." + "this dataset." ) if delegate: message = delegate + base diff --git a/python/ray/data/dataset.py b/python/ray/data/dataset.py index d29fe31169938..58314b05d4ce9 100644 --- a/python/ray/data/dataset.py +++ b/python/ray/data/dataset.py @@ -35,6 +35,7 @@ Repartition, Sort, ) +from ray.data._internal.logical.operators.n_ary_operator import Zip from ray.data._internal.logical.optimizers import LogicalPlan from ray.data._internal.logical.operators.map_operator import ( Filter, @@ -50,7 +51,8 @@ from ray.data._internal.planner.write import generate_write_fn from ray.data.dataset_iterator import DatasetIterator from ray.data._internal.block_list import BlockList -from ray.data._internal.bulk_dataset_iterator import BulkDatasetIterator +from ray.data._internal.dataset_iterator_impl import DatasetIteratorImpl +from ray.data._internal.stream_split_dataset_iterator import StreamSplitDatasetIterator from ray.data._internal.compute import ( ActorPoolStrategy, CallableClass, @@ -146,7 +148,7 @@ from ray.data.dataset_pipeline import DatasetPipeline from ray.data.grouped_dataset import GroupedDataset - from ray.data._internal.execution.interfaces import Executor + from ray.data._internal.execution.interfaces import Executor, NodeIdStr from ray.data._internal.torch_iterable_dataset import TorchTensorBatchType @@ -286,7 +288,7 @@ def map( ... 
[{"value": i} for i in range(1000)]) >>> ds.map(lambda record: {"v2": record["value"] * 2}) Map - +- Dataset(num_blocks=..., num_rows=1000, schema={value: int64}) + +- Dataset(num_blocks=200, num_rows=1000, schema={value: int64}) >>> # Define a callable class that persists state across >>> # function invocations for efficiency. >>> init_model = ... # doctest: +SKIP @@ -801,7 +803,11 @@ def select_columns( >>> ds = ds.select_columns(cols=["col1", "col2"]) >>> ds MapBatches() - +- Dataset(num_blocks=10, num_rows=10, schema={col1: int64, col2: int64, col3: int64}) + +- Dataset( + num_blocks=10, + num_rows=10, + schema={col1: int64, col2: int64, col3: int64} + ) Time complexity: O(dataset size / parallelism) @@ -1140,6 +1146,44 @@ def process_batch(batch): return self.map_batches(process_batch) + @ConsumptionAPI + def streaming_split( + self, + n: int, + *, + equal: bool = False, + locality_hints: Optional[List["NodeIdStr"]] = None, + ) -> List[DatasetIterator]: + """Returns ``n`` :class:`~ray.data.DatasetIterator`s that can be used to read + disjoint subsets of the dataset in parallel. + + This method is the recommended way to consume Datasets from multiple processes + (e.g., for distributed training). It requires streaming execution mode. + + The returned iterators are Ray-serializable and can be freely passed to any + Ray task or actor. + + Examples: + >>> import ray + >>> ds = ray.data.range(1000000) + >>> it1, it2 = ds.streaming_split(2, equal=True) + >>> list(it1.iter_batches()) # doctest: +SKIP + >>> list(it2.iter_batches()) # doctest: +SKIP + + Args: + n: Number of output iterators to return. + equal: If True, each output iterator will see an exactly equal number + of rows, dropping data if necessary. If False, some iterators may see + slightly more or less rows than other, but no data will be dropped. + locality_hints: Specify the node ids corresponding to each iterator + location. 
Datasets will try to minimize data movement based on the + iterator output locations. This list must have length ``n``. + + Returns: + The output iterator splits. + """ + return StreamSplitDatasetIterator.create(self, n, equal, locality_hints) + @ConsumptionAPI def split( self, n: int, *, equal: bool = False, locality_hints: Optional[List[Any]] = None @@ -1160,7 +1204,8 @@ def split( Time complexity: O(1) - See also: ``Dataset.split_at_indices``, ``Dataset.split_proportionately`` + See also: ``Dataset.split_at_indices``, ``Dataset.split_proportionately``, + and ``Dataset.streaming_split``. Args: n: Number of child datasets to return. @@ -1361,7 +1406,8 @@ def split_at_indices(self, indices: List[int]) -> List["Dataset[T]"]: Time complexity: O(num splits) - See also: ``Dataset.split``, ``Dataset.split_proportionately`` + See also: ``Dataset.split``, ``Dataset.split_proportionately``, + and ``Dataset.streaming_split``. Args: indices: List of sorted integers which indicate where the dataset @@ -2029,7 +2075,7 @@ def sort( ... [{"value": i} for i in range(1000)]) >>> ds.sort("value", descending=True) Sort - +- Dataset(num_blocks=..., num_rows=1000, schema={value: int64}) + +- Dataset(num_blocks=200, num_rows=1000, schema={value: int64}) >>> # Sort by a key function.
>>> ds.sort(lambda record: record["value"]) # doctest: +SKIP @@ -2097,7 +2143,13 @@ def zip(self, other: "Dataset[U]") -> "Dataset[(T, U)]": """ plan = self._plan.with_stage(ZipStage(other)) - return Dataset(plan, self._epoch, self._lazy) + + logical_plan = self._logical_plan + other_logical_plan = other._logical_plan + if logical_plan is not None and other_logical_plan is not None: + op = Zip(logical_plan.dag, other_logical_plan.dag) + logical_plan = LogicalPlan(op) + return Dataset(plan, self._epoch, self._lazy, logical_plan) @ConsumptionAPI def limit(self, limit: int) -> "Dataset[T]": @@ -2819,7 +2871,7 @@ def iterator(self) -> DatasetIterator: It is recommended to use ``DatasetIterator`` methods over directly calling methods such as ``iter_batches()``. """ - return BulkDatasetIterator(self) + return DatasetIteratorImpl(self) @ConsumptionAPI def iter_rows(self, *, prefetch_blocks: int = 0) -> Iterator[Union[T, TableRow]]: @@ -3221,7 +3273,17 @@ def to_tf( >>> import ray >>> ds = ray.data.read_csv("s3://anonymous@air-example-data/iris.csv") >>> ds - Dataset(num_blocks=1, num_rows=150, schema={sepal length (cm): double, sepal width (cm): double, petal length (cm): double, petal width (cm): double, target: int64}) + Dataset( + num_blocks=1, + num_rows=150, + schema={ + sepal length (cm): double, + sepal width (cm): double, + petal length (cm): double, + petal width (cm): double, + target: int64 + } + ) If your model accepts a single tensor as input, specify a single feature column. 
@@ -3242,7 +3304,17 @@ def to_tf( >>> ds = preprocessor.transform(ds) >>> ds Concatenator - +- Dataset(num_blocks=1, num_rows=150, schema={sepal length (cm): double, sepal width (cm): double, petal length (cm): double, petal width (cm): double, target: int64}) + +- Dataset( + num_blocks=1, + num_rows=150, + schema={ + sepal length (cm): double, + sepal width (cm): double, + petal length (cm): double, + petal width (cm): double, + target: int64 + } + ) >>> ds.to_tf("features", "target") # doctest: +SKIP <_OptionsDataset element_spec=(TensorSpec(shape=(None, 4), dtype=tf.float64, name='features'), TensorSpec(shape=(None,), dtype=tf.int64, name='target'))> diff --git a/python/ray/data/dataset_iterator.py b/python/ray/data/dataset_iterator.py index 47397fc728eb5..ea000b3f713e0 100644 --- a/python/ray/data/dataset_iterator.py +++ b/python/ray/data/dataset_iterator.py @@ -497,7 +497,17 @@ def to_tf( ... "s3://anonymous@air-example-data/iris.csv" ... ) >>> it = ds.iterator(); it - DatasetIterator(Dataset(num_blocks=1, num_rows=150, schema={sepal length (cm): double, sepal width (cm): double, petal length (cm): double, petal width (cm): double, target: int64})) + DatasetIterator(Dataset( + num_blocks=1, + num_rows=150, + schema={ + sepal length (cm): double, + sepal width (cm): double, + petal length (cm): double, + petal width (cm): double, + target: int64 + } + )) If your model accepts a single tensor as input, specify a single feature column. 
@@ -518,7 +528,17 @@ def to_tf( >>> it = preprocessor.transform(ds).iterator() >>> it DatasetIterator(Concatenator - +- Dataset(num_blocks=1, num_rows=150, schema={sepal length (cm): double, sepal width (cm): double, petal length (cm): double, petal width (cm): double, target: int64})) + +- Dataset( + num_blocks=1, + num_rows=150, + schema={ + sepal length (cm): double, + sepal width (cm): double, + petal length (cm): double, + petal width (cm): double, + target: int64 + } + )) >>> it.to_tf("features", "target") # doctest: +SKIP <_OptionsDataset element_spec=(TensorSpec(shape=(None, 4), dtype=tf.float64, name='features'), TensorSpec(shape=(None,), dtype=tf.int64, name='target'))> diff --git a/python/ray/data/datasource/file_based_datasource.py b/python/ray/data/datasource/file_based_datasource.py index 2badee7158f6f..1d13ad2016218 100644 --- a/python/ray/data/datasource/file_based_datasource.py +++ b/python/ray/data/datasource/file_based_datasource.py @@ -1,3 +1,4 @@ +import itertools import logging import pathlib import posixpath @@ -13,12 +14,17 @@ Optional, Tuple, Union, + TypeVar, ) +import numpy as np + from ray.data._internal.arrow_block import ArrowRow from ray.data._internal.delegating_block_builder import DelegatingBlockBuilder from ray.data._internal.execution.interfaces import TaskContext from ray.data._internal.output_buffer import BlockOutputBuffer +from ray.data._internal.progress_bar import ProgressBar +from ray.data._internal.remote_fn import cached_remote_fn from ray.data._internal.util import _check_pyarrow_version, _resolve_custom_scheme from ray.data.block import Block, BlockAccessor from ray.data.context import DatasetContext @@ -45,6 +51,13 @@ logger = logging.getLogger(__name__) +# We should parallelize file size fetch operations beyond this threshold. +FILE_SIZE_FETCH_PARALLELIZATION_THRESHOLD = 16 + +# 16 file size fetches from S3 takes ~1.5 seconds with Arrow's S3FileSystem. 
+PATHS_PER_FILE_SIZE_FETCH_TASK = 16 + + @DeveloperAPI class BlockWritePathProvider: """Abstract callable that provides concrete output paths when writing @@ -288,9 +301,7 @@ def write( def write_block(write_path: str, block: Block): logger.debug(f"Writing {write_path} file.") - fs = filesystem - if isinstance(fs, _S3FileSystemWrapper): - fs = fs.unwrap() + fs = _unwrap_s3_serialization_workaround(filesystem) if _block_udf is not None: block = _block_udf(block) @@ -373,8 +384,9 @@ def __init__( self._block_udf = _block_udf self._reader_args = reader_args paths, self._filesystem = _resolve_paths_and_filesystem(paths, filesystem) - self._paths, self._file_sizes = meta_provider.expand_paths( - paths, self._filesystem + self._paths, self._file_sizes = map( + list, + zip(*meta_provider.expand_paths(paths, self._filesystem, partitioning)), ) if self._partition_filter is not None: # Use partition filter to skip files which are not needed. @@ -418,8 +430,7 @@ def read_files( fs: Union["pyarrow.fs.FileSystem", _S3FileSystemWrapper], ) -> Iterable[Block]: logger.debug(f"Reading {len(read_paths)} files.") - if isinstance(fs, _S3FileSystemWrapper): - fs = fs.unwrap() + fs = _unwrap_s3_serialization_workaround(filesystem) ctx = DatasetContext.get_current() output_buffer = BlockOutputBuffer( block_udf=_block_udf, target_max_block_size=ctx.target_max_block_size @@ -672,48 +683,6 @@ def _resolve_paths_and_filesystem( return resolved_paths, filesystem -def _expand_directory( - path: str, - filesystem: "pyarrow.fs.FileSystem", - exclude_prefixes: Optional[List[str]] = None, -) -> List[str]: - """ - Expand the provided directory path to a list of file paths. - - Args: - path: The directory path to expand. - filesystem: The filesystem implementation that should be used for - reading these files. - exclude_prefixes: The file relative path prefixes that should be - excluded from the returned file set. Default excluded prefixes are - "." and "_". 
- - Returns: - A list of file paths contained in the provided directory. - """ - if exclude_prefixes is None: - exclude_prefixes = [".", "_"] - - from pyarrow.fs import FileSelector - - selector = FileSelector(path, recursive=True) - files = filesystem.get_file_info(selector) - base_path = selector.base_dir - filtered_paths = [] - for file_ in files: - if not file_.is_file: - continue - file_path = file_.path - if not file_path.startswith(base_path): - continue - relative = file_path[len(base_path) :] - if any(relative.startswith(prefix) for prefix in exclude_prefixes): - continue - filtered_paths.append((file_path, file_)) - # We sort the paths to guarantee a stable order. - return zip(*sorted(filtered_paths, key=lambda x: x[0])) - - def _is_url(path) -> bool: return urllib.parse.urlparse(path).scheme != "" @@ -752,6 +721,15 @@ def _wrap_s3_serialization_workaround(filesystem: "pyarrow.fs.FileSystem"): return filesystem +def _unwrap_s3_serialization_workaround( + filesystem: Union["pyarrow.fs.FileSystem", "_S3FileSystemWrapper"] +): + if isinstance(filesystem, _S3FileSystemWrapper): + return filesystem.unwrap() + else: + return filesystem + + class _S3FileSystemWrapper: def __init__(self, fs: "pyarrow.fs.S3FileSystem"): self._fs = fs @@ -792,3 +770,31 @@ def _resolve_kwargs( kwarg_overrides = kwargs_fn() kwargs.update(kwarg_overrides) return kwargs + + +Uri = TypeVar("Uri") +Meta = TypeVar("Meta") + + +def _fetch_metadata_parallel( + uris: List[Uri], + fetch_func: Callable[[List[Uri]], List[Meta]], + desired_uris_per_task: int, + **ray_remote_args, +) -> Iterator[Meta]: + """Fetch file metadata in parallel using Ray tasks.""" + remote_fetch_func = cached_remote_fn(fetch_func, num_cpus=0.5) + if ray_remote_args: + remote_fetch_func = remote_fetch_func.options(**ray_remote_args) + # Choose a parallelism that results in a # of metadata fetches per task that + # dominates the Ray task overhead while ensuring good parallelism. 
+ # Always launch at least 2 parallel fetch tasks. + parallelism = max(len(uris) // desired_uris_per_task, 2) + metadata_fetch_bar = ProgressBar("Metadata Fetch Progress", total=parallelism) + fetch_tasks = [] + for uri_chunk in np.array_split(uris, parallelism): + if len(uri_chunk) == 0: + continue + fetch_tasks.append(remote_fetch_func.remote(uri_chunk)) + results = metadata_fetch_bar.fetch_until_complete(fetch_tasks) + yield from itertools.chain.from_iterable(results) diff --git a/python/ray/data/datasource/file_meta_provider.py b/python/ray/data/datasource/file_meta_provider.py index f1fc90027b0b8..68a6bc14f1170 100644 --- a/python/ray/data/datasource/file_meta_provider.py +++ b/python/ray/data/datasource/file_meta_provider.py @@ -1,18 +1,23 @@ +import itertools import logging +import pathlib +import os import re from typing import ( List, Optional, Union, - TYPE_CHECKING, + Iterator, Tuple, Any, + TYPE_CHECKING, ) if TYPE_CHECKING: import pyarrow from ray.data.block import BlockMetadata +from ray.data.datasource.partitioning import Partitioning from ray.util.annotations import DeveloperAPI logger = logging.getLogger(__name__) @@ -97,7 +102,8 @@ def expand_paths( self, paths: List[str], filesystem: Optional["pyarrow.fs.FileSystem"], - ) -> Tuple[List[str], List[Optional[int]]]: + partitioning: Optional[Partitioning] = None, + ) -> Iterator[Tuple[str, int]]: """Expands all paths into concrete file paths by walking directories. Also returns a sidecar of file sizes. @@ -112,11 +118,9 @@ def expand_paths( expanding all paths and reading their files. Returns: - A tuple whose first item contains the list of file paths discovered, - and whose second item contains the size of each file. `None` may be - returned if a file size is either unknown or will be fetched later - by `_get_block_metadata()`, but the length of both lists must be - equal. + An iterator of (file_path, file_size) pairs. 
None may be returned for the + file size if it is either unknown or will be fetched later by + `_get_block_metadata()`, but the length of both lists must be equal. """ raise NotImplementedError @@ -154,10 +158,8 @@ def expand_paths( self, paths: List[str], filesystem: "pyarrow.fs.FileSystem", - ) -> Tuple[List[str], List[Optional[int]]]: - from pyarrow.fs import FileType - from ray.data.datasource.file_based_datasource import _expand_directory - + partitioning: Optional[Partitioning] = None, + ) -> Iterator[Tuple[str, int]]: if len(paths) > 1: logger.warning( f"Expanding {len(paths)} path(s). This may be a HIGH LATENCY " @@ -165,24 +167,8 @@ def expand_paths( f"all point to files and never directories, try rerunning this read " f"with `meta_provider=FastFileMetadataProvider()`." ) - expanded_paths = [] - file_infos = [] - for path in paths: - try: - file_info = filesystem.get_file_info(path) - except OSError as e: - _handle_read_os_error(e, path) - if file_info.type == FileType.Directory: - paths, file_infos_ = _expand_directory(path, filesystem) - expanded_paths.extend(paths) - file_infos.extend(file_infos_) - elif file_info.type == FileType.File: - expanded_paths.append(path) - file_infos.append(file_info) - else: - raise FileNotFoundError(path) - file_sizes = [file_info.size for file_info in file_infos] - return expanded_paths, file_sizes + + yield from _expand_paths(paths, filesystem, partitioning) @DeveloperAPI @@ -201,15 +187,15 @@ def expand_paths( self, paths: List[str], filesystem: "pyarrow.fs.FileSystem", - ) -> Tuple[List[str], List[Optional[int]]]: + partitioning: Optional[Partitioning] = None, + ) -> Iterator[Tuple[str, int]]: logger.warning( f"Skipping expansion of {len(paths)} path(s). If your paths contain " f"directories or if file size collection is required, try rerunning this " f"read with `meta_provider=DefaultFileMetadataProvider()`." 
) - import numpy as np - return paths, np.empty(len(paths), dtype=object) + yield from zip(paths, itertools.repeat(None, len(paths))) @DeveloperAPI @@ -322,12 +308,25 @@ def prefetch_file_metadata( ) -> Optional[List["pyarrow.parquet.FileMetaData"]]: from ray.data.datasource.parquet_datasource import ( PARALLELIZE_META_FETCH_THRESHOLD, - _fetch_metadata_remotely, + PIECES_PER_META_FETCH, + _SerializedPiece, + _fetch_metadata_serialization_wrapper, _fetch_metadata, ) + from ray.data.datasource.file_based_datasource import _fetch_metadata_parallel if len(pieces) > PARALLELIZE_META_FETCH_THRESHOLD: - return _fetch_metadata_remotely(pieces, **ray_remote_args) + # Wrap Parquet fragments in serialization workaround. + pieces = [_SerializedPiece(piece) for piece in pieces] + # Fetch Parquet metadata in parallel using Ray tasks. + return list( + _fetch_metadata_parallel( + pieces, + _fetch_metadata_serialization_wrapper, + PIECES_PER_META_FETCH, + **ray_remote_args, + ) + ) else: return _fetch_metadata(pieces) @@ -365,3 +364,161 @@ def _handle_read_os_error(error: OSError, paths: Union[str, List[str]]) -> str: ) else: raise error + + +def _expand_paths( + paths: List[str], + filesystem: "pyarrow.fs.FileSystem", + partitioning: Optional[Partitioning], +) -> Iterator[Tuple[str, int]]: + """Get the file sizes for all provided file paths.""" + from pyarrow.fs import LocalFileSystem + from ray.data.datasource.file_based_datasource import ( + FILE_SIZE_FETCH_PARALLELIZATION_THRESHOLD, + _unwrap_protocol, + ) + + # We break down our processing paths into a few key cases: + # 1. If len(paths) < threshold, fetch the file info for the individual files/paths + # serially. + # 2. If all paths are contained under the same parent directory (or base directory, + # if using partitioning), fetch all file infos at this prefix and filter to the + # provided paths on the client; this should be a single file info request. + # 3. 
If more than threshold requests required, parallelize them via Ray tasks. + + # 1. Small # of paths case. + if ( + len(paths) < FILE_SIZE_FETCH_PARALLELIZATION_THRESHOLD + # Local file systems are very fast to hit. + or isinstance(filesystem, LocalFileSystem) + ): + yield from _get_file_infos_serial(paths, filesystem) + else: + # 2. Common path prefix case. + # Get longest common path of all paths. + common_path = os.path.commonpath(paths) + # If parent directory (or base directory, if using partitioning) is common to + # all paths, fetch all file infos at that prefix and filter the response to the + # provided paths. + if ( + partitioning is not None + and common_path == _unwrap_protocol(partitioning.base_dir) + ) or all(str(pathlib.Path(path).parent) == common_path for path in paths): + yield from _get_file_infos_common_path_prefix( + paths, common_path, filesystem + ) + # 3. Parallelization case. + else: + # Parallelize requests via Ray tasks. + yield from _get_file_infos_parallel(paths, filesystem) + + +def _get_file_infos_serial( + paths: List[str], + filesystem: "pyarrow.fs.FileSystem", +) -> Iterator[Tuple[str, int]]: + for path in paths: + yield from _get_file_infos(path, filesystem) + + +def _get_file_infos_common_path_prefix( + paths: List[str], + common_path: str, + filesystem: "pyarrow.fs.FileSystem", +) -> Iterator[Tuple[str, int]]: + path_to_size = {path: None for path in paths} + for path, file_size in _get_file_infos(common_path, filesystem): + if path in path_to_size: + path_to_size[path] = file_size + # Dictionaries are insertion-ordered, so this path + size pairs should be + # yielded in the order of the original paths arg. 
+ for path, size in path_to_size.items(): + assert size is not None + yield path, size + + +def _get_file_infos_parallel( + paths: List[str], + filesystem: "pyarrow.fs.FileSystem", +) -> Iterator[Tuple[str, int]]: + from ray.data.datasource.file_based_datasource import ( + PATHS_PER_FILE_SIZE_FETCH_TASK, + _wrap_s3_serialization_workaround, + _unwrap_s3_serialization_workaround, + _fetch_metadata_parallel, + ) + + # Capture the filesystem in the fetcher func closure, but wrap it in our + # serialization workaround to make sure that the pickle roundtrip works as expected. + filesystem = _wrap_s3_serialization_workaround(filesystem) + + def _file_infos_fetcher(paths: List[str]) -> List[Tuple[str, int]]: + fs = _unwrap_s3_serialization_workaround(filesystem) + return list( + itertools.chain.from_iterable(_get_file_infos(path, fs) for path in paths) + ) + + yield from _fetch_metadata_parallel( + paths, _file_infos_fetcher, PATHS_PER_FILE_SIZE_FETCH_TASK + ) + + +def _get_file_infos( + path: str, + filesystem: "pyarrow.fs.FileSystem", +) -> Iterator[Tuple[str, int]]: + """Get the file info for all files at or under the provided path.""" + from pyarrow.fs import FileType + + try: + file_info = filesystem.get_file_info(path) + except OSError as e: + _handle_read_os_error(e, path) + if file_info.type == FileType.Directory: + yield from _expand_directory(path, filesystem) + elif file_info.type == FileType.File: + yield path, file_info.size + else: + raise FileNotFoundError(path) + + +def _expand_directory( + path: str, + filesystem: "pyarrow.fs.FileSystem", + exclude_prefixes: Optional[List[str]] = None, +) -> Iterator[Tuple[str, int]]: + """ + Expand the provided directory path to a list of file paths. + + Args: + path: The directory path to expand. + filesystem: The filesystem implementation that should be used for + reading these files. + exclude_prefixes: The file relative path prefixes that should be + excluded from the returned file set. 
Default excluded prefixes are + "." and "_". + + Returns: + An iterator of (file_path, file_size) tuples. + """ + if exclude_prefixes is None: + exclude_prefixes = [".", "_"] + + from pyarrow.fs import FileSelector + + selector = FileSelector(path, recursive=True) + files = filesystem.get_file_info(selector) + base_path = selector.base_dir + out = [] + for file_ in files: + if not file_.is_file: + continue + file_path = file_.path + if not file_path.startswith(base_path): + continue + relative = file_path[len(base_path) :] + if any(relative.startswith(prefix) for prefix in exclude_prefixes): + continue + out.append((file_path, file_.size)) + # We sort the paths to guarantee a stable order. + yield from sorted(out) diff --git a/python/ray/data/datasource/parquet_base_datasource.py b/python/ray/data/datasource/parquet_base_datasource.py index 69fa32df95f49..0956e1acd6f8c 100644 --- a/python/ray/data/datasource/parquet_base_datasource.py +++ b/python/ray/data/datasource/parquet_base_datasource.py @@ -24,7 +24,7 @@ class ParquetBaseDatasource(FileBasedDatasource): def get_name(self): """Return a human-readable name for this datasource. This will be used as the names of the read tasks. - Note: overrides the base `Datasource` method. + Note: overrides the base `FileBasedDatasource` method. 
""" return "ParquetBulk" diff --git a/python/ray/data/datasource/parquet_datasource.py b/python/ray/data/datasource/parquet_datasource.py index 674410a1c29b2..ef2a42246163d 100644 --- a/python/ray/data/datasource/parquet_datasource.py +++ b/python/ray/data/datasource/parquet_datasource.py @@ -1,4 +1,3 @@ -import itertools import logging from typing import TYPE_CHECKING, Callable, Iterator, List, Optional, Union @@ -18,7 +17,6 @@ _handle_read_os_error, ) from ray.data.datasource.parquet_base_datasource import ParquetBaseDatasource -from ray.types import ObjectRef from ray.util.annotations import PublicAPI import ray.cloudpickle as cloudpickle @@ -161,6 +159,13 @@ class ParquetDatasource(ParquetBaseDatasource): [{"a": 1, "b": "foo"}, ...] """ + def get_name(self): + """Return a human-readable name for this datasource. + This will be used as the names of the read tasks. + Note: overrides the base `ParquetBaseDatasource` method. + """ + return "Parquet" + def create_reader(self, **kwargs): return _ParquetDatasourceReader(**kwargs) @@ -403,29 +408,8 @@ def _read_pieces( yield output_buffer.next() -def _fetch_metadata_remotely( - pieces: List["pyarrow._dataset.ParquetFileFragment"], - **ray_remote_args, -) -> List[ObjectRef["pyarrow.parquet.FileMetaData"]]: - - remote_fetch_metadata = cached_remote_fn(_fetch_metadata_serialization_wrapper) - metas = [] - parallelism = min(len(pieces) // PIECES_PER_META_FETCH, 100) - meta_fetch_bar = ProgressBar("Metadata Fetch Progress", total=parallelism) - for pcs in np.array_split(pieces, parallelism): - if len(pcs) == 0: - continue - metas.append( - remote_fetch_metadata.options(**ray_remote_args).remote( - [_SerializedPiece(p) for p in pcs] - ) - ) - metas = meta_fetch_bar.fetch_until_complete(metas) - return list(itertools.chain.from_iterable(metas)) - - def _fetch_metadata_serialization_wrapper( - pieces: str, + pieces: _SerializedPiece, ) -> List["pyarrow.parquet.FileMetaData"]: pieces: List[ diff --git 
a/python/ray/data/read_api.py b/python/ray/data/read_api.py index bc25d2a92fb8d..7484d7c16e514 100644 --- a/python/ray/data/read_api.py +++ b/python/ray/data/read_api.py @@ -492,7 +492,17 @@ def read_parquet( ... ("variety", pa.string())] >>> ray.data.read_parquet("example://iris.parquet", ... schema=pa.schema(fields)) - Dataset(num_blocks=..., num_rows=150, schema={sepal.length: double, ...}) + Dataset( + num_blocks=1, + num_rows=150, + schema={ + sepal.length: double, + sepal.width: double, + petal.length: double, + petal.width: double, + variety: string + } + ) For further arguments you can pass to pyarrow as a keyword argument, see https://arrow.apache.org/docs/python/generated/pyarrow.dataset.Scanner.html#pyarrow.dataset.Scanner.from_fragment @@ -1268,6 +1278,14 @@ def from_pandas( if isinstance(dfs, pd.DataFrame): dfs = [dfs] + + from ray.air.util.data_batch_conversion import ( + _cast_ndarray_columns_to_tensor_extension, + ) + + context = DatasetContext.get_current() + if context.enable_tensor_extension_casting: + dfs = [_cast_ndarray_columns_to_tensor_extension(df.copy()) for df in dfs] return from_pandas_refs([ray.put(df) for df in dfs]) diff --git a/python/ray/data/tests/conftest.py b/python/ray/data/tests/conftest.py index 30e935ca0f995..ba67ec79ab704 100644 --- a/python/ray/data/tests/conftest.py +++ b/python/ray/data/tests/conftest.py @@ -185,10 +185,12 @@ def _write_partitioned_df( partition_keys, partition_path_encoder, file_writer_fn, + file_name_suffix="_1", ): import urllib.parse df_partitions = [df for _, df in df.groupby(partition_keys, as_index=False)] + paths = [] for df_partition in df_partitions: partition_values = [] for key in partition_keys: @@ -197,12 +199,15 @@ def _write_partitioned_df( partition_path_encoder.scheme.resolved_filesystem.create_dir(path) base_dir = partition_path_encoder.scheme.base_dir parsed_base_dir = urllib.parse.urlparse(base_dir) + file_name = f"test_{file_name_suffix}.tmp" if parsed_base_dir.scheme: # replace the 
protocol removed by the partition path generator - path = posixpath.join(f"{parsed_base_dir.scheme}://{path}", "test.tmp") + path = posixpath.join(f"{parsed_base_dir.scheme}://{path}", file_name) else: - path = os.path.join(path, "test.tmp") + path = os.path.join(path, file_name) file_writer_fn(df_partition, path) + paths.append(path) + return paths yield _write_partitioned_df @@ -246,21 +251,35 @@ def _assert_base_partitioned_ds( assert ds.schema() is not None actual_input_files = ds.input_files() assert len(actual_input_files) == num_input_files, actual_input_files - assert ( - str(ds) == f"Dataset(num_blocks={num_input_files}, num_rows={num_rows}, " - f"schema={schema})" - ), ds - assert ( - repr(ds) == f"Dataset(num_blocks={num_input_files}, num_rows={num_rows}, " - f"schema={schema})" - ), ds + + # For Datasets with long string representations, the format will include + # whitespace and newline characters, which is difficult to generalize + # without implementing the formatting logic again (from + # `ExecutionPlan.get_plan_as_string()`). Therefore, we remove whitespace + # characters to test the string contents regardless of the string repr length. + def _remove_whitespace(ds_str): + for c in ["\n", " ", " "]: + ds_str = ds_str.replace(c, "") + return ds_str + + assert "Dataset(num_blocks={},num_rows={},schema={})".format( + num_input_files, + num_rows, + _remove_whitespace(schema), + ) == _remove_whitespace(str(ds)), ds + assert "Dataset(num_blocks={},num_rows={},schema={})".format( + num_input_files, + num_rows, + _remove_whitespace(schema), + ) == _remove_whitespace(repr(ds)), ds + if num_computed is not None: assert ( ds._plan.execute()._num_computed() == num_computed ), f"{ds._plan.execute()._num_computed()} != {num_computed}" # Force a data read. 
- values = ds_take_transform_fn(ds.take()) + values = ds_take_transform_fn(ds.take_all()) if num_computed is not None: assert ( ds._plan.execute()._num_computed() == num_computed diff --git a/python/ray/data/tests/test_dataset.py b/python/ray/data/tests/test_dataset.py deleted file mode 100644 index aab3e64ef9211..0000000000000 --- a/python/ray/data/tests/test_dataset.py +++ /dev/null @@ -1,4620 +0,0 @@ -import itertools -import logging -import math -import os -import random -import signal -import time -from typing import Iterator -from unittest.mock import patch - -import numpy as np -import pandas as pd -import pyarrow as pa -import pyarrow.parquet as pq -import pytest - -import ray -from ray._private.test_utils import wait_for_condition -from ray.data._internal.dataset_logger import DatasetLogger -from ray.data._internal.stats import _StatsActor -from ray.data._internal.arrow_block import ArrowRow -from ray.data._internal.block_builder import BlockBuilder -from ray.data._internal.lazy_block_list import LazyBlockList -from ray.data._internal.pandas_block import PandasRow -from ray.data.aggregate import AggregateFn, Count, Max, Mean, Min, Std, Sum -from ray.data.block import BlockAccessor, BlockMetadata -from ray.data.context import DatasetContext -from ray.data.dataset import Dataset, _sliding_window -from ray.data.datasource.datasource import Datasource, ReadTask -from ray.data.datasource.csv_datasource import CSVDatasource -from ray.data.extensions.tensor_extension import ( - ArrowTensorArray, - ArrowTensorType, - TensorArray, - TensorDtype, -) -from ray.data.row import TableRow -from ray.data.tests.conftest import * # noqa -from ray.tests.conftest import * # noqa -from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy - - -def maybe_pipeline(ds, enabled): - if enabled: - return ds.window(blocks_per_window=1) - else: - return ds - - -class SlowCSVDatasource(CSVDatasource): - def _read_stream(self, f: "pa.NativeFile", path: str, 
**reader_args): - for block in CSVDatasource._read_stream(self, f, path, **reader_args): - time.sleep(3) - yield block - - -# Tests that we don't block on exponential rampup when doing bulk reads. -# https://github.com/ray-project/ray/issues/20625 -@pytest.mark.parametrize("block_split", [False, True]) -def test_bulk_lazy_eval_split_mode(shutdown_only, block_split, tmp_path): - # Defensively shutdown Ray for the first test here to make sure there - # is no existing Ray cluster. - ray.shutdown() - - ray.init(num_cpus=8) - ctx = ray.data.context.DatasetContext.get_current() - - try: - original = ctx.block_splitting_enabled - - ray.data.range(8, parallelism=8).write_csv(str(tmp_path)) - if not block_split: - # Setting infinite block size effectively disables block splitting. - ctx.target_max_block_size = float("inf") - ds = ray.data.read_datasource( - SlowCSVDatasource(), parallelism=8, paths=str(tmp_path) - ) - - start = time.time() - ds.map(lambda x: x) - delta = time.time() - start - - print("full read time", delta) - # Should run in ~3 seconds. It takes >9 seconds if bulk read is broken. - assert delta < 8, delta - finally: - ctx.block_splitting_enabled = original - - -@pytest.mark.parametrize("pipelined", [False, True]) -def test_basic_actors(shutdown_only, pipelined): - ray.init(num_cpus=6) - n = 5 - ds = ray.data.range(n) - ds = maybe_pipeline(ds, pipelined) - assert sorted(ds.map(lambda x: x + 1, compute="actors").take()) == list( - range(1, n + 1) - ) - - # Should still work even if num actors > num cpus. - ds = ray.data.range(n) - ds = maybe_pipeline(ds, pipelined) - assert sorted( - ds.map(lambda x: x + 1, compute=ray.data.ActorPoolStrategy(4, 4)).take() - ) == list(range(1, n + 1)) - - # Test setting custom max inflight tasks. 
- ds = ray.data.range(10, parallelism=5) - ds = maybe_pipeline(ds, pipelined) - assert sorted( - ds.map( - lambda x: x + 1, - compute=ray.data.ActorPoolStrategy(max_tasks_in_flight_per_actor=3), - ).take() - ) == list(range(1, 11)) - - # Test invalid max tasks inflight arg. - with pytest.raises(ValueError): - ray.data.range(10).map( - lambda x: x, - compute=ray.data.ActorPoolStrategy(max_tasks_in_flight_per_actor=0), - ) - - # Test min no more than max check. - with pytest.raises(ValueError): - ray.data.range(10).map(lambda x: x, compute=ray.data.ActorPoolStrategy(8, 4)) - - -@pytest.mark.parametrize("pipelined", [False, True]) -def test_avoid_placement_group_capture(shutdown_only, pipelined): - ray.init(num_cpus=2) - - @ray.remote - def run(): - ds0 = ray.data.range(5) - ds = maybe_pipeline(ds0, pipelined) - assert sorted(ds.map(lambda x: x + 1).take()) == [1, 2, 3, 4, 5] - ds = maybe_pipeline(ds0, pipelined) - assert ds.count() == 5 - ds = maybe_pipeline(ds0, pipelined) - assert sorted(ds.iter_rows()) == [0, 1, 2, 3, 4] - - pg = ray.util.placement_group([{"CPU": 1}]) - ray.get( - run.options( - scheduling_strategy=PlacementGroupSchedulingStrategy( - placement_group=pg, placement_group_capture_child_tasks=True - ) - ).remote() - ) - - -def test_callable_classes(shutdown_only): - ray.init(num_cpus=2) - ds = ray.data.range(10, parallelism=10) - - class StatefulFn: - def __init__(self): - self.num_reuses = 0 - - def __call__(self, x): - r = self.num_reuses - self.num_reuses += 1 - return r - - # Need to specify compute explicitly. - with pytest.raises(ValueError): - ds.map(StatefulFn).take() - - # Need to specify actor compute strategy. - with pytest.raises(ValueError): - ds.map(StatefulFn, compute="tasks").take() - - # Need to specify compute explicitly. - with pytest.raises(ValueError): - ds.flat_map(StatefulFn).take() - - # Need to specify actor compute strategy. 
- with pytest.raises(ValueError): - ds.flat_map(StatefulFn, compute="tasks") - - # Need to specify compute explicitly. - with pytest.raises(ValueError): - ds.filter(StatefulFn).take() - - # Need to specify actor compute strategy. - with pytest.raises(ValueError): - ds.filter(StatefulFn, compute="tasks") - - # map - actor_reuse = ds.map(StatefulFn, compute="actors").take() - assert sorted(actor_reuse) == list(range(10)), actor_reuse - - class StatefulFn: - def __init__(self): - self.num_reuses = 0 - - def __call__(self, x): - r = self.num_reuses - self.num_reuses += 1 - return [r] - - # flat map - actor_reuse = ds.flat_map(StatefulFn, compute="actors").take() - assert sorted(actor_reuse) == list(range(10)), actor_reuse - - # map batches - actor_reuse = ds.map_batches(StatefulFn, batch_size=1, compute="actors").take() - assert sorted(actor_reuse) == list(range(10)), actor_reuse - - class StatefulFn: - def __init__(self): - self.num_reuses = 0 - - def __call__(self, x): - r = self.num_reuses - self.num_reuses += 1 - return r > 0 - - # filter - actor_reuse = ds.filter(StatefulFn, compute="actors").take() - assert len(actor_reuse) == 9, actor_reuse - - -def test_transform_failure(shutdown_only): - ray.init(num_cpus=2) - ds = ray.data.from_items([0, 10], parallelism=2) - - def mapper(x): - time.sleep(x) - raise ValueError("oops") - return x - - with pytest.raises(ray.exceptions.RayTaskError): - ds.map(mapper).fully_executed() - - -def test_dataset_lineage_serialization(shutdown_only): - ray.init() - ds = ray.data.range(10) - ds = ds.map(lambda x: x + 1) - ds = ds.map(lambda x: x + 1) - ds = ds.random_shuffle() - epoch = ds._get_epoch() - uuid = ds._get_uuid() - plan_uuid = ds._plan._dataset_uuid - - serialized_ds = ds.serialize_lineage() - # Confirm that the original Dataset was properly copied before clearing/mutating. - in_blocks = ds._plan._in_blocks - # Should not raise. 
- in_blocks._check_if_cleared() - assert isinstance(in_blocks, LazyBlockList) - assert in_blocks._block_partition_refs[0] is None - - ray.shutdown() - ray.init() - - ds = Dataset.deserialize_lineage(serialized_ds) - # Check Dataset state. - assert ds._get_epoch() == epoch - assert ds._get_uuid() == uuid - assert ds._plan._dataset_uuid == plan_uuid - # Check Dataset content. - assert ds.count() == 10 - assert sorted(ds.take()) == list(range(2, 12)) - - -def test_dataset_lineage_serialization_unsupported(shutdown_only): - ray.init() - # In-memory data sources not supported. - ds = ray.data.from_items(list(range(10))) - ds = ds.map(lambda x: x + 1) - ds = ds.map(lambda x: x + 1) - - with pytest.raises(ValueError): - ds.serialize_lineage() - - # In-memory data source unions not supported. - ds = ray.data.from_items(list(range(10))) - ds1 = ray.data.from_items(list(range(10, 20))) - ds2 = ds.union(ds1) - - with pytest.raises(ValueError): - ds2.serialize_lineage() - - # Post-lazy-read unions not supported. - ds = ray.data.range(10).map(lambda x: x + 1) - ds1 = ray.data.range(20).map(lambda x: 2 * x) - ds2 = ds.union(ds1) - - with pytest.raises(ValueError): - ds2.serialize_lineage() - - # Lazy read unions supported. - ds = ray.data.range(10) - ds1 = ray.data.range(20) - ds2 = ds.union(ds1) - - serialized_ds = ds2.serialize_lineage() - ds3 = Dataset.deserialize_lineage(serialized_ds) - assert ds3.take(30) == list(range(10)) + list(range(20)) - - # Zips not supported. 
- ds = ray.data.from_items(list(range(10))) - ds1 = ray.data.from_items(list(range(10, 20))) - ds2 = ds.zip(ds1) - - with pytest.raises(ValueError): - ds2.serialize_lineage() - - -@pytest.mark.parametrize("pipelined", [False, True]) -def test_basic(ray_start_regular_shared, pipelined): - ds0 = ray.data.range(5) - ds = maybe_pipeline(ds0, pipelined) - assert sorted(ds.map(lambda x: x + 1).take()) == [1, 2, 3, 4, 5] - ds = maybe_pipeline(ds0, pipelined) - assert ds.count() == 5 - ds = maybe_pipeline(ds0, pipelined) - assert sorted(ds.iter_rows()) == [0, 1, 2, 3, 4] - - -def test_flat_map_generator(ray_start_regular_shared): - ds = ray.data.range(3) - - def map_generator(item: int) -> Iterator[int]: - for _ in range(2): - yield item + 1 - - assert sorted(ds.flat_map(map_generator).take()) == [1, 1, 2, 2, 3, 3] - - -def test_zip(ray_start_regular_shared): - ds1 = ray.data.range(5, parallelism=5) - ds2 = ray.data.range(5, parallelism=5).map(lambda x: x + 1) - ds = ds1.zip(ds2) - assert ds.schema() == tuple - assert ds.take() == [(0, 1), (1, 2), (2, 3), (3, 4), (4, 5)] - with pytest.raises(ValueError): - ds.zip(ray.data.range(3)).fully_executed() - - -@pytest.mark.parametrize( - "num_blocks1,num_blocks2", - list(itertools.combinations_with_replacement(range(1, 12), 2)), -) -def test_zip_different_num_blocks_combinations( - ray_start_regular_shared, num_blocks1, num_blocks2 -): - n = 12 - ds1 = ray.data.range(n, parallelism=num_blocks1) - ds2 = ray.data.range(n, parallelism=num_blocks2).map(lambda x: x + 1) - ds = ds1.zip(ds2) - assert ds.schema() == tuple - assert ds.take() == list(zip(range(n), range(1, n + 1))) - - -@pytest.mark.parametrize( - "num_cols1,num_cols2,should_invert", - [ - (1, 1, False), - (4, 1, False), - (1, 4, True), - (1, 10, True), - (10, 10, False), - ], -) -def test_zip_different_num_blocks_split_smallest( - ray_start_regular_shared, - num_cols1, - num_cols2, - should_invert, -): - n = 12 - num_blocks1 = 4 - num_blocks2 = 2 - ds1 = 
ray.data.from_items( - [{str(i): i for i in range(num_cols1)}] * n, parallelism=num_blocks1 - ) - ds2 = ray.data.from_items( - [{str(i): i for i in range(num_cols1, num_cols1 + num_cols2)}] * n, - parallelism=num_blocks2, - ) - ds = ds1.zip(ds2).fully_executed() - num_blocks = ds._plan._snapshot_blocks.executed_num_blocks() - assert ds.take() == [{str(i): i for i in range(num_cols1 + num_cols2)}] * n - if should_invert: - assert num_blocks == num_blocks2 - else: - assert num_blocks == num_blocks1 - - -def test_zip_pandas(ray_start_regular_shared): - ds1 = ray.data.from_pandas(pd.DataFrame({"col1": [1, 2], "col2": [4, 5]})) - ds2 = ray.data.from_pandas(pd.DataFrame({"col3": ["a", "b"], "col4": ["d", "e"]})) - ds = ds1.zip(ds2) - assert ds.count() == 2 - assert "{col1: int64, col2: int64, col3: object, col4: object}" in str(ds) - result = [r.as_pydict() for r in ds.take()] - assert result[0] == {"col1": 1, "col2": 4, "col3": "a", "col4": "d"} - - ds3 = ray.data.from_pandas(pd.DataFrame({"col2": ["a", "b"], "col4": ["d", "e"]})) - ds = ds1.zip(ds3) - assert ds.count() == 2 - assert "{col1: int64, col2: int64, col2_1: object, col4: object}" in str(ds) - result = [r.as_pydict() for r in ds.take()] - assert result[0] == {"col1": 1, "col2": 4, "col2_1": "a", "col4": "d"} - - -def test_zip_arrow(ray_start_regular_shared): - ds1 = ray.data.range_table(5).map(lambda r: {"id": r["value"]}) - ds2 = ray.data.range_table(5).map( - lambda r: {"a": r["value"] + 1, "b": r["value"] + 2} - ) - ds = ds1.zip(ds2) - assert ds.count() == 5 - assert "{id: int64, a: int64, b: int64}" in str(ds) - result = [r.as_pydict() for r in ds.take()] - assert result[0] == {"id": 0, "a": 1, "b": 2} - - # Test duplicate column names. 
- ds = ds1.zip(ds1).zip(ds1) - assert ds.count() == 5 - assert "{id: int64, id_1: int64, id_2: int64}" in str(ds) - result = [r.as_pydict() for r in ds.take()] - assert result[0] == {"id": 0, "id_1": 0, "id_2": 0} - - -def test_arrow_block_select(): - df = pd.DataFrame({"one": [10, 11, 12], "two": [11, 12, 13], "three": [14, 15, 16]}) - table = pa.Table.from_pandas(df) - block_accessor = BlockAccessor.for_block(table) - - block = block_accessor.select(["two"]) - assert block.schema == pa.schema([("two", pa.int64())]) - assert block.to_pandas().equals(df[["two"]]) - - block = block_accessor.select(["two", "one"]) - assert block.schema == pa.schema([("two", pa.int64()), ("one", pa.int64())]) - assert block.to_pandas().equals(df[["two", "one"]]) - - with pytest.raises(ValueError): - block = block_accessor.select([lambda x: x % 3, "two"]) - - -def test_pandas_block_select(): - df = pd.DataFrame({"one": [10, 11, 12], "two": [11, 12, 13], "three": [14, 15, 16]}) - block_accessor = BlockAccessor.for_block(df) - - block = block_accessor.select(["two"]) - assert block.equals(df[["two"]]) - - block = block_accessor.select(["two", "one"]) - assert block.equals(df[["two", "one"]]) - - with pytest.raises(ValueError): - block = block_accessor.select([lambda x: x % 3, "two"]) - - -def test_simple_block_select(): - xs = list(range(100)) - block_accessor = BlockAccessor.for_block(xs) - - block = block_accessor.select([lambda x: x % 3]) - assert block == [x % 3 for x in xs] - - with pytest.raises(ValueError): - block = block_accessor.select(["foo"]) - - with pytest.raises(ValueError): - block = block_accessor.select([]) - - -def test_arrow_block_slice_copy(): - # Test that ArrowBlock slicing properly copies the underlying Arrow - # table. 
- def check_for_copy(table1, table2, a, b, is_copy): - expected_slice = table1.slice(a, b - a) - assert table2.equals(expected_slice) - assert table2.schema == table1.schema - assert table1.num_columns == table2.num_columns - for col1, col2 in zip(table1.columns, table2.columns): - assert col1.num_chunks == col2.num_chunks - for chunk1, chunk2 in zip(col1.chunks, col2.chunks): - bufs1 = chunk1.buffers() - bufs2 = chunk2.buffers() - expected_offset = 0 if is_copy else a - assert chunk2.offset == expected_offset - assert len(chunk2) == b - a - if is_copy: - assert bufs2[1].address != bufs1[1].address - else: - assert bufs2[1].address == bufs1[1].address - - n = 20 - df = pd.DataFrame( - {"one": list(range(n)), "two": ["a"] * n, "three": [np.nan] + [1.5] * (n - 1)} - ) - table = pa.Table.from_pandas(df) - a, b = 5, 10 - block_accessor = BlockAccessor.for_block(table) - - # Test with copy. - table2 = block_accessor.slice(a, b, True) - check_for_copy(table, table2, a, b, is_copy=True) - - # Test without copy. - table2 = block_accessor.slice(a, b, False) - check_for_copy(table, table2, a, b, is_copy=False) - - -def test_arrow_block_slice_copy_empty(): - # Test that ArrowBlock slicing properly copies the underlying Arrow - # table when the table is empty. - df = pd.DataFrame({"one": []}) - table = pa.Table.from_pandas(df) - a, b = 0, 0 - expected_slice = table.slice(a, b - a) - block_accessor = BlockAccessor.for_block(table) - - # Test with copy. - table2 = block_accessor.slice(a, b, True) - assert table2.equals(expected_slice) - assert table2.schema == table.schema - assert table2.num_rows == 0 - - # Test without copy. 
- table2 = block_accessor.slice(a, b, False) - assert table2.equals(expected_slice) - assert table2.schema == table.schema - assert table2.num_rows == 0 - - -def test_range_table(ray_start_regular_shared): - ds = ray.data.range_table(10, parallelism=10) - assert ds.num_blocks() == 10 - assert ds.count() == 10 - assert ds.take() == [{"value": i} for i in range(10)] - - ds = ray.data.range_table(10, parallelism=2) - assert ds.num_blocks() == 2 - assert ds.count() == 10 - assert ds.take() == [{"value": i} for i in range(10)] - - -def test_empty_shuffle(ray_start_regular_shared): - ds = ray.data.range(100, parallelism=100) - ds = ds.filter(lambda x: x) - ds = ds.map_batches(lambda x: x) - ds = ds.random_shuffle() # Would prev. crash with AssertionError: pyarrow.Table. - ds.show() - - -def test_empty_dataset(ray_start_regular_shared): - ds = ray.data.range(0) - assert ds.count() == 0 - assert ds.size_bytes() is None - assert ds.schema() is None - - ds = ray.data.range(1) - ds = ds.filter(lambda x: x > 1) - ds.fully_executed() - assert str(ds) == "Dataset(num_blocks=1, num_rows=0, schema=Unknown schema)" - - # Test map on empty dataset. - ds = ray.data.from_items([]) - ds = ds.map(lambda x: x) - ds.fully_executed() - assert ds.count() == 0 - - # Test filter on empty dataset. 
- ds = ray.data.from_items([]) - ds = ds.filter(lambda: True) - ds.fully_executed() - assert ds.count() == 0 - - -def test_schema(ray_start_regular_shared): - ds = ray.data.range(10, parallelism=10) - ds2 = ray.data.range_table(10, parallelism=10) - ds3 = ds2.repartition(5) - ds3.fully_executed() - ds4 = ds3.map(lambda x: {"a": "hi", "b": 1.0}).limit(5).repartition(1) - ds4.fully_executed() - assert str(ds) == "Dataset(num_blocks=10, num_rows=10, schema=)" - assert str(ds2) == "Dataset(num_blocks=10, num_rows=10, schema={value: int64})" - assert str(ds3) == "Dataset(num_blocks=5, num_rows=10, schema={value: int64})" - assert ( - str(ds4) == "Dataset(num_blocks=1, num_rows=5, schema={a: string, b: double})" - ) - - -def test_schema_lazy(ray_start_regular_shared): - ds = ray.data.range(100, parallelism=10) - # We do not kick off the read task by default. - assert ds._plan._in_blocks._num_computed() == 0 - schema = ds.schema() - assert schema == int - assert ds._plan._in_blocks._num_computed() == 1 - # Fetching the schema should not trigger execution of extra read tasks. - assert ds._plan.execute()._num_computed() == 1 - - -def test_count_lazy(ray_start_regular_shared): - ds = ray.data.range(100, parallelism=10) - # We do not kick off the read task by default. - assert ds._plan._in_blocks._num_computed() == 0 - assert ds.count() == 100 - # Getting number of rows should not trigger execution of any read tasks - # for ray.data.range(), as the number of rows is known beforehand. - assert ds._plan._in_blocks._num_computed() == 0 - - -def test_lazy_loading_exponential_rampup(ray_start_regular_shared): - ds = ray.data.range(100, parallelism=20) - - def check_num_computed(expected): - if ray.data.context.DatasetContext.get_current().use_streaming_executor: - # In streaing executor, ds.take() will not invoke partial execution - # in LazyBlocklist. 
- assert ds._plan.execute()._num_computed() == 0 - else: - assert ds._plan.execute()._num_computed() == expected - - check_num_computed(0) - assert ds.take(10) == list(range(10)) - check_num_computed(2) - assert ds.take(20) == list(range(20)) - check_num_computed(4) - assert ds.take(30) == list(range(30)) - check_num_computed(8) - assert ds.take(50) == list(range(50)) - check_num_computed(16) - assert ds.take(100) == list(range(100)) - check_num_computed(20) - - -def test_dataset_repr(ray_start_regular_shared): - ds = ray.data.range(10, parallelism=10) - assert repr(ds) == "Dataset(num_blocks=10, num_rows=10, schema=)" - ds = ds.map_batches(lambda x: x) - assert repr(ds) == ( - "MapBatches()\n" - "+- Dataset(num_blocks=10, num_rows=10, schema=)" - ) - ds = ds.filter(lambda x: x > 0) - assert repr(ds) == ( - "Filter\n" - "+- MapBatches()\n" - " +- Dataset(num_blocks=10, num_rows=10, schema=)" - ) - ds = ds.random_shuffle() - assert repr(ds) == ( - "RandomShuffle\n" - "+- Filter\n" - " +- MapBatches()\n" - " +- Dataset(num_blocks=10, num_rows=10, schema=)" - ) - ds.fully_executed() - assert repr(ds) == "Dataset(num_blocks=10, num_rows=9, schema=)" - ds = ds.map_batches(lambda x: x) - assert repr(ds) == ( - "MapBatches()\n" - "+- Dataset(num_blocks=10, num_rows=9, schema=)" - ) - ds1, ds2 = ds.split(2) - assert ( - repr(ds1) - == f"Dataset(num_blocks=5, num_rows={ds1.count()}, schema=)" - ) - assert ( - repr(ds2) - == f"Dataset(num_blocks=5, num_rows={ds2.count()}, schema=)" - ) - ds3 = ds1.union(ds2) - assert repr(ds3) == "Dataset(num_blocks=10, num_rows=9, schema=)" - ds = ds.zip(ds3) - assert repr(ds) == ( - "Zip\n" "+- Dataset(num_blocks=10, num_rows=9, schema=)" - ) - - def my_dummy_fn(x): - return x - - ds = ray.data.range(10, parallelism=10) - ds = ds.map_batches(my_dummy_fn) - assert repr(ds) == ( - "MapBatches(my_dummy_fn)\n" - "+- Dataset(num_blocks=10, num_rows=10, schema=)" - ) - - -@pytest.mark.parametrize("lazy", [False, True]) -def 
test_limit(ray_start_regular_shared, lazy): - ds = ray.data.range(100, parallelism=20) - if not lazy: - ds = ds.fully_executed() - for i in range(100): - assert ds.limit(i).take(200) == list(range(i)) - - -# NOTE: We test outside the power-of-2 range in order to ensure that we're not reading -# redundant files due to exponential ramp-up. -@pytest.mark.parametrize("limit,expected", [(10, 1), (20, 2), (30, 3), (60, 6)]) -def test_limit_no_redundant_read(ray_start_regular_shared, limit, expected): - # Test that dataset truncation eliminates redundant reads. - @ray.remote - class Counter: - def __init__(self): - self.count = 0 - - def increment(self): - self.count += 1 - - def get(self): - return self.count - - def reset(self): - self.count = 0 - - class CountingRangeDatasource(Datasource): - def __init__(self): - self.counter = Counter.remote() - - def prepare_read(self, parallelism, n): - def range_(i): - ray.get(self.counter.increment.remote()) - return [list(range(parallelism * i, parallelism * i + n))] - - return [ - ReadTask( - lambda i=i: range_(i), - BlockMetadata( - num_rows=n, - size_bytes=None, - schema=None, - input_files=None, - exec_stats=None, - ), - ) - for i in range(parallelism) - ] - - source = CountingRangeDatasource() - - ds = ray.data.read_datasource( - source, - parallelism=10, - n=10, - ) - ds2 = ds.limit(limit) - # Check content. - assert ds2.take(limit) == list(range(limit)) - # Check number of read tasks launched. - assert ray.get(source.counter.get.remote()) == expected - - -def test_limit_no_num_row_info(ray_start_regular_shared): - # Test that datasources with no number-of-rows metadata available are still able to - # be truncated, falling back to kicking off all read tasks. 
- class DumbOnesDatasource(Datasource): - def prepare_read(self, parallelism, n): - return parallelism * [ - ReadTask( - lambda: [[1] * n], - BlockMetadata( - num_rows=None, - size_bytes=None, - schema=None, - input_files=None, - exec_stats=None, - ), - ) - ] - - ds = ray.data.read_datasource(DumbOnesDatasource(), parallelism=10, n=10) - for i in range(1, 100): - assert ds.limit(i).take(100) == [1] * i - - -def test_convert_types(ray_start_regular_shared): - plain_ds = ray.data.range(1) - arrow_ds = plain_ds.map(lambda x: {"a": x}) - assert arrow_ds.take() == [{"a": 0}] - assert "ArrowRow" in arrow_ds.map(lambda x: str(type(x))).take()[0] - - arrow_ds = ray.data.range_table(1) - assert arrow_ds.map(lambda x: "plain_{}".format(x["value"])).take() == ["plain_0"] - # In streaming, we set batch_format to "default" (because calling - # ds.dataset_format() will still invoke bulk execution and we want - # to avoid that). As a result, it's receiving PandasRow (the defaut - # batch format), which unwraps [0] to plain 0. - if ray.data.context.DatasetContext.get_current().use_streaming_executor: - assert arrow_ds.map(lambda x: {"a": (x["value"],)}).take() == [{"a": 0}] - else: - assert arrow_ds.map(lambda x: {"a": (x["value"],)}).take() == [{"a": [0]}] - - -def test_from_items(ray_start_regular_shared): - ds = ray.data.from_items(["hello", "world"]) - assert ds.take() == ["hello", "world"] - - -@pytest.mark.parametrize("parallelism", list(range(1, 21))) -def test_from_items_parallelism(ray_start_regular_shared, parallelism): - # Test that specifying parallelism yields the expected number of blocks. - n = 20 - records = [{"a": i} for i in range(n)] - ds = ray.data.from_items(records, parallelism=parallelism) - out = ds.take_all() - assert out == records - assert ds.num_blocks() == parallelism - - -def test_from_items_parallelism_truncated(ray_start_regular_shared): - # Test that specifying parallelism greater than the number of items is truncated to - # the number of items. 
- n = 10 - parallelism = 20 - records = [{"a": i} for i in range(n)] - ds = ray.data.from_items(records, parallelism=parallelism) - out = ds.take_all() - assert out == records - assert ds.num_blocks() == n - - -def test_repartition_shuffle(ray_start_regular_shared): - ds = ray.data.range(20, parallelism=10) - assert ds.num_blocks() == 10 - assert ds.sum() == 190 - assert ds._block_num_rows() == [2] * 10 - - ds2 = ds.repartition(5, shuffle=True) - assert ds2.num_blocks() == 5 - assert ds2.sum() == 190 - assert ds2._block_num_rows() == [10, 10, 0, 0, 0] - - ds3 = ds2.repartition(20, shuffle=True) - assert ds3.num_blocks() == 20 - assert ds3.sum() == 190 - assert ds3._block_num_rows() == [2] * 10 + [0] * 10 - - large = ray.data.range(10000, parallelism=10) - large = large.repartition(20, shuffle=True) - assert large._block_num_rows() == [500] * 20 - - -def test_repartition_noshuffle(ray_start_regular_shared): - ds = ray.data.range(20, parallelism=10) - assert ds.num_blocks() == 10 - assert ds.sum() == 190 - assert ds._block_num_rows() == [2] * 10 - - ds2 = ds.repartition(5, shuffle=False) - assert ds2.num_blocks() == 5 - assert ds2.sum() == 190 - assert ds2._block_num_rows() == [4, 4, 4, 4, 4] - - ds3 = ds2.repartition(20, shuffle=False) - assert ds3.num_blocks() == 20 - assert ds3.sum() == 190 - assert ds3._block_num_rows() == [1] * 20 - - # Test num_partitions > num_rows - ds4 = ds.repartition(40, shuffle=False) - assert ds4.num_blocks() == 40 - blocks = ray.get(ds4.get_internal_block_refs()) - assert all(isinstance(block, list) for block in blocks), blocks - assert ds4.sum() == 190 - assert ds4._block_num_rows() == [1] * 20 + [0] * 20 - - ds5 = ray.data.range(22).repartition(4) - assert ds5.num_blocks() == 4 - assert ds5._block_num_rows() == [5, 6, 5, 6] - - large = ray.data.range(10000, parallelism=10) - large = large.repartition(20) - assert large._block_num_rows() == [500] * 20 - - -def test_repartition_shuffle_arrow(ray_start_regular_shared): - ds = 
ray.data.range_table(20, parallelism=10) - assert ds.num_blocks() == 10 - assert ds.count() == 20 - assert ds._block_num_rows() == [2] * 10 - - ds2 = ds.repartition(5, shuffle=True) - assert ds2.num_blocks() == 5 - assert ds2.count() == 20 - assert ds2._block_num_rows() == [10, 10, 0, 0, 0] - - ds3 = ds2.repartition(20, shuffle=True) - assert ds3.num_blocks() == 20 - assert ds3.count() == 20 - assert ds3._block_num_rows() == [2] * 10 + [0] * 10 - - large = ray.data.range_table(10000, parallelism=10) - large = large.repartition(20, shuffle=True) - assert large._block_num_rows() == [500] * 20 - - -def test_take_all(ray_start_regular_shared): - assert ray.data.range(5).take_all() == [0, 1, 2, 3, 4] - - with pytest.raises(ValueError): - assert ray.data.range(5).take_all(4) - - -def test_convert_to_pyarrow(ray_start_regular_shared, tmp_path): - ds = ray.data.range(100) - assert ds.to_dask().sum().compute()[0] == 4950 - path = os.path.join(tmp_path, "test_parquet_dir") - os.mkdir(path) - ds.write_parquet(path) - assert ray.data.read_parquet(path).count() == 100 - - -def test_pyarrow(ray_start_regular_shared): - ds = ray.data.range_table(5) - assert ds.map(lambda x: {"b": x["value"] + 2}).take() == [ - {"b": 2}, - {"b": 3}, - {"b": 4}, - {"b": 5}, - {"b": 6}, - ] - assert ds.map(lambda x: {"b": x["value"] + 2}).filter( - lambda x: x["b"] % 2 == 0 - ).take() == [{"b": 2}, {"b": 4}, {"b": 6}] - assert ds.filter(lambda x: x["value"] == 0).flat_map( - lambda x: [{"b": x["value"] + 2}, {"b": x["value"] + 20}] - ).take() == [{"b": 2}, {"b": 20}] - - -def test_sliding_window(): - arr = list(range(10)) - - # Test all windows over this iterable. 
- window_sizes = list(range(1, len(arr) + 1)) - for window_size in window_sizes: - windows = list(_sliding_window(arr, window_size)) - assert len(windows) == len(arr) - window_size + 1 - assert all(len(window) == window_size for window in windows) - assert all( - list(window) == arr[i : i + window_size] for i, window in enumerate(windows) - ) - - # Test window size larger than iterable length. - windows = list(_sliding_window(arr, 15)) - assert len(windows) == 1 - assert list(windows[0]) == arr - - -def test_iter_rows(ray_start_regular_shared): - # Test simple rows. - n = 10 - ds = ray.data.range(n) - for row, k in zip(ds.iter_rows(), range(n)): - assert row == k - - # Test tabular rows. - t1 = pa.Table.from_pydict({"one": [1, 2, 3], "two": [2, 3, 4]}) - t2 = pa.Table.from_pydict({"one": [4, 5, 6], "two": [5, 6, 7]}) - t3 = pa.Table.from_pydict({"one": [7, 8, 9], "two": [8, 9, 10]}) - t4 = pa.Table.from_pydict({"one": [10, 11, 12], "two": [11, 12, 13]}) - ts = [t1, t2, t3, t4] - t = pa.concat_tables(ts) - ds = ray.data.from_arrow(ts) - - def to_pylist(table): - pydict = table.to_pydict() - names = table.schema.names - pylist = [ - {column: pydict[column][row] for column in names} - for row in range(table.num_rows) - ] - return pylist - - # Default ArrowRows. - for row, t_row in zip(ds.iter_rows(), to_pylist(t)): - assert isinstance(row, TableRow) - # In streaming, we set batch_format to "default" because calling - # ds.dataset_format() will still invoke bulk execution and we want - # to avoid that. As a result, it's receiving PandasRow (the defaut - # batch format). - if ray.data.context.DatasetContext.get_current().use_streaming_executor: - assert isinstance(row, PandasRow) - else: - assert isinstance(row, ArrowRow) - assert row == t_row - - # PandasRows after conversion. 
- pandas_ds = ds.map_batches(lambda x: x, batch_format="pandas") - df = t.to_pandas() - for row, (index, df_row) in zip(pandas_ds.iter_rows(), df.iterrows()): - assert isinstance(row, TableRow) - assert isinstance(row, PandasRow) - assert row == df_row.to_dict() - - # Prefetch. - for row, t_row in zip(ds.iter_rows(prefetch_blocks=1), to_pylist(t)): - assert isinstance(row, TableRow) - # In streaming, we set batch_format to "default" because calling - # ds.dataset_format() will still invoke bulk execution and we want - # to avoid that. As a result, it's receiving PandasRow (the defaut - # batch format). - if ray.data.context.DatasetContext.get_current().use_streaming_executor: - assert isinstance(row, PandasRow) - else: - assert isinstance(row, ArrowRow) - assert row == t_row - - -def test_iter_batches_basic(ray_start_regular_shared): - df1 = pd.DataFrame({"one": [1, 2, 3], "two": [2, 3, 4]}) - df2 = pd.DataFrame({"one": [4, 5, 6], "two": [5, 6, 7]}) - df3 = pd.DataFrame({"one": [7, 8, 9], "two": [8, 9, 10]}) - df4 = pd.DataFrame({"one": [10, 11, 12], "two": [11, 12, 13]}) - dfs = [df1, df2, df3, df4] - ds = ray.data.from_pandas(dfs) - - # Default. - for batch, df in zip(ds.iter_batches(batch_size=None, batch_format="pandas"), dfs): - assert isinstance(batch, pd.DataFrame) - assert batch.equals(df) - - # pyarrow.Table format. - for batch, df in zip(ds.iter_batches(batch_size=None, batch_format="pyarrow"), dfs): - assert isinstance(batch, pa.Table) - assert batch.equals(pa.Table.from_pandas(df)) - - # NumPy format. - for batch, df in zip(ds.iter_batches(batch_size=None, batch_format="numpy"), dfs): - assert isinstance(batch, dict) - assert list(batch.keys()) == ["one", "two"] - assert all(isinstance(col, np.ndarray) for col in batch.values()) - pd.testing.assert_frame_equal(pd.DataFrame(batch), df) - - # Numpy format (single column). 
- ds2 = ds.select_columns(["one"]) - for batch, df in zip(ds2.iter_batches(batch_size=None, batch_format="numpy"), dfs): - assert isinstance(batch, dict) - assert list(batch.keys()) == ["one"] - assert all(isinstance(col, np.ndarray) for col in batch.values()) - pd.testing.assert_frame_equal(pd.DataFrame(batch), df[["one"]]) - - # Test NumPy format on Arrow blocks. - ds2 = ds.map_batches(lambda b: b, batch_size=None, batch_format="pyarrow") - for batch, df in zip(ds2.iter_batches(batch_size=None, batch_format="numpy"), dfs): - assert isinstance(batch, dict) - assert list(batch.keys()) == ["one", "two"] - assert all(isinstance(col, np.ndarray) for col in batch.values()) - pd.testing.assert_frame_equal(pd.DataFrame(batch), df) - - # Test NumPy format on Arrow blocks (single column). - ds3 = ds2.select_columns(["one"]) - for batch, df in zip(ds3.iter_batches(batch_size=None, batch_format="numpy"), dfs): - assert isinstance(batch, dict) - assert list(batch.keys()) == ["one"] - assert all(isinstance(col, np.ndarray) for col in batch.values()) - pd.testing.assert_frame_equal(pd.DataFrame(batch), df[["one"]]) - - # Native format (deprecated). - for batch, df in zip(ds.iter_batches(batch_size=None, batch_format="native"), dfs): - assert BlockAccessor.for_block(batch).to_pandas().equals(df) - - # Default format. - for batch, df in zip(ds.iter_batches(batch_size=None, batch_format="default"), dfs): - assert BlockAccessor.for_block(batch).to_pandas().equals(df) - - # Batch size. - batch_size = 2 - batches = list(ds.iter_batches(batch_size=batch_size, batch_format="pandas")) - assert all(len(batch) == batch_size for batch in batches) - assert len(batches) == math.ceil( - (len(df1) + len(df2) + len(df3) + len(df4)) / batch_size - ) - assert pd.concat(batches, ignore_index=True).equals( - pd.concat(dfs, ignore_index=True) - ) - - # Batch size larger than block. 
- batch_size = 4 - batches = list(ds.iter_batches(batch_size=batch_size, batch_format="pandas")) - assert all(len(batch) == batch_size for batch in batches) - assert len(batches) == math.ceil( - (len(df1) + len(df2) + len(df3) + len(df4)) / batch_size - ) - assert pd.concat(batches, ignore_index=True).equals( - pd.concat(dfs, ignore_index=True) - ) - - # Batch size larger than dataset. - batch_size = 15 - batches = list(ds.iter_batches(batch_size=batch_size, batch_format="pandas")) - assert all(len(batch) == ds.count() for batch in batches) - assert len(batches) == 1 - assert pd.concat(batches, ignore_index=True).equals( - pd.concat(dfs, ignore_index=True) - ) - - # Batch size drop partial. - batch_size = 5 - batches = list( - ds.iter_batches(batch_size=batch_size, drop_last=True, batch_format="pandas") - ) - assert all(len(batch) == batch_size for batch in batches) - assert len(batches) == (len(df1) + len(df2) + len(df3) + len(df4)) // batch_size - assert pd.concat(batches, ignore_index=True).equals( - pd.concat(dfs, ignore_index=True)[:10] - ) - - # Batch size don't drop partial. - batch_size = 5 - batches = list( - ds.iter_batches(batch_size=batch_size, drop_last=False, batch_format="pandas") - ) - assert all(len(batch) == batch_size for batch in batches[:-1]) - assert len(batches[-1]) == (len(df1) + len(df2) + len(df3) + len(df4)) % batch_size - assert len(batches) == math.ceil( - (len(df1) + len(df2) + len(df3) + len(df4)) / batch_size - ) - assert pd.concat(batches, ignore_index=True).equals( - pd.concat(dfs, ignore_index=True) - ) - - # Prefetch. 
- batches = list( - ds.iter_batches(prefetch_blocks=1, batch_size=None, batch_format="pandas") - ) - assert len(batches) == len(dfs) - for batch, df in zip(batches, dfs): - assert isinstance(batch, pd.DataFrame) - assert batch.equals(df) - - batch_size = 2 - batches = list( - ds.iter_batches(prefetch_blocks=2, batch_size=batch_size, batch_format="pandas") - ) - assert all(len(batch) == batch_size for batch in batches) - assert len(batches) == math.ceil( - (len(df1) + len(df2) + len(df3) + len(df4)) / batch_size - ) - assert pd.concat(batches, ignore_index=True).equals( - pd.concat(dfs, ignore_index=True) - ) - - # Prefetch more than number of blocks. - batches = list( - ds.iter_batches( - prefetch_blocks=len(dfs), batch_size=None, batch_format="pandas" - ) - ) - assert len(batches) == len(dfs) - for batch, df in zip(batches, dfs): - assert isinstance(batch, pd.DataFrame) - assert batch.equals(df) - - # Prefetch with ray.wait. - context = DatasetContext.get_current() - old_config = context.actor_prefetcher_enabled - try: - context.actor_prefetcher_enabled = False - batches = list( - ds.iter_batches(prefetch_blocks=1, batch_size=None, batch_format="pandas") - ) - assert len(batches) == len(dfs) - for batch, df in zip(batches, dfs): - assert isinstance(batch, pd.DataFrame) - assert batch.equals(df) - finally: - context.actor_prefetcher_enabled = old_config - - -def test_iter_batches_empty_block(ray_start_regular_shared): - ds = ray.data.range(1).repartition(10) - assert list(ds.iter_batches(batch_size=None)) == [[0]] - assert list(ds.iter_batches(batch_size=1, local_shuffle_buffer_size=1)) == [[0]] - - -@pytest.mark.parametrize("pipelined", [False, True]) -@pytest.mark.parametrize("ds_format", ["arrow", "pandas", "simple"]) -def test_iter_batches_local_shuffle(shutdown_only, pipelined, ds_format): - # Input validation. - # Batch size must be given for local shuffle. 
- with pytest.raises(ValueError): - list( - ray.data.range(100).iter_batches( - batch_size=None, local_shuffle_buffer_size=10 - ) - ) - - def range(n, parallelism=200): - if ds_format == "simple": - ds = ray.data.range(n, parallelism=parallelism) - elif ds_format == "arrow": - ds = ray.data.range_table(n, parallelism=parallelism) - elif ds_format == "pandas": - ds = ray.data.range_table(n, parallelism=parallelism).map_batches( - lambda df: df, batch_size=None, batch_format="pandas" - ) - if pipelined: - pipe = ds.repeat(2) - return pipe - else: - return ds - - def to_row_dicts(batch): - if isinstance(batch, pd.DataFrame): - batch = batch.to_dict(orient="records") - return batch - - def unbatch(batches): - return [r for batch in batches for r in to_row_dicts(batch)] - - def sort(r): - if ds_format == "simple": - return sorted(r) - return sorted(r, key=lambda v: v["value"]) - - base = range(100).take_all() - - # Local shuffle. - r1 = unbatch( - range(100, parallelism=10).iter_batches( - batch_size=3, - local_shuffle_buffer_size=25, - ) - ) - r2 = unbatch( - range(100, parallelism=10).iter_batches( - batch_size=3, - local_shuffle_buffer_size=25, - ) - ) - # Check randomness of shuffle. - assert r1 != r2, (r1, r2) - assert r1 != base - assert r2 != base - # Check content. - assert sort(r1) == sort(base) - assert sort(r2) == sort(base) - - # Set seed. - r1 = unbatch( - range(100, parallelism=10).iter_batches( - batch_size=3, - local_shuffle_buffer_size=25, - local_shuffle_seed=0, - ) - ) - r2 = unbatch( - range(100, parallelism=10).iter_batches( - batch_size=3, - local_shuffle_buffer_size=25, - local_shuffle_seed=0, - ) - ) - # Check randomness of shuffle. - assert r1 == r2, (r1, r2) - assert r1 != base - # Check content. - assert sort(r1) == sort(base) - - # Single block. 
- r1 = unbatch( - range(100, parallelism=1).iter_batches( - batch_size=3, - local_shuffle_buffer_size=25, - ) - ) - r2 = unbatch( - range(100, parallelism=1).iter_batches( - batch_size=3, - local_shuffle_buffer_size=25, - ) - ) - # Check randomness of shuffle. - assert r1 != r2, (r1, r2) - assert r1 != base - assert r2 != base - # Check content. - assert sort(r1) == sort(base) - assert sort(r2) == sort(base) - - # Single-row blocks. - r1 = unbatch( - range(100, parallelism=100).iter_batches( - batch_size=3, - local_shuffle_buffer_size=25, - ) - ) - r2 = unbatch( - range(100, parallelism=100).iter_batches( - batch_size=3, - local_shuffle_buffer_size=25, - ) - ) - # Check randomness of shuffle. - assert r1 != r2, (r1, r2) - assert r1 != base - assert r2 != base - # Check content. - assert sort(r1) == sort(base) - assert sort(r2) == sort(base) - - # Buffer larger than dataset. - r1 = unbatch( - range(100, parallelism=10).iter_batches( - batch_size=3, - local_shuffle_buffer_size=200, - ) - ) - r2 = unbatch( - range(100, parallelism=10).iter_batches( - batch_size=3, - local_shuffle_buffer_size=200, - ) - ) - # Check randomness of shuffle. - assert r1 != r2, (r1, r2) - assert r1 != base - assert r2 != base - # Check content. - assert sort(r1) == sort(base) - assert sort(r2) == sort(base) - - # Batch size larger than block. - r1 = unbatch( - range(100, parallelism=20).iter_batches( - batch_size=12, - local_shuffle_buffer_size=25, - ) - ) - r2 = unbatch( - range(100, parallelism=20).iter_batches( - batch_size=12, - local_shuffle_buffer_size=25, - ) - ) - # Check randomness of shuffle. - assert r1 != r2, (r1, r2) - assert r1 != base - assert r2 != base - # Check content. - assert sort(r1) == sort(base) - assert sort(r2) == sort(base) - - # Batch size larger than dataset. 
- r1 = unbatch( - range(100, parallelism=10).iter_batches( - batch_size=200, - local_shuffle_buffer_size=400, - ) - ) - r2 = unbatch( - range(100, parallelism=10).iter_batches( - batch_size=200, - local_shuffle_buffer_size=400, - ) - ) - # Check randomness of shuffle. - assert r1 != r2, (r1, r2) - assert r1 != base - assert r2 != base - # Check content. - assert sort(r1) == sort(base) - assert sort(r2) == sort(base) - - # Drop partial batches. - r1 = unbatch( - range(100, parallelism=10).iter_batches( - batch_size=7, - local_shuffle_buffer_size=21, - drop_last=True, - ) - ) - r2 = unbatch( - range(100, parallelism=10).iter_batches( - batch_size=7, - local_shuffle_buffer_size=21, - drop_last=True, - ) - ) - # Check randomness of shuffle. - assert r1 != r2, (r1, r2) - assert r1 != base - assert r2 != base - # Check content. - # Check that partial batches were dropped. - assert len(r1) % 7 == 0 - assert len(r2) % 7 == 0 - tmp_base = base - if ds_format in ("arrow", "pandas"): - r1 = [tuple(r.items()) for r in r1] - r2 = [tuple(r.items()) for r in r2] - tmp_base = [tuple(r.items()) for r in base] - assert set(r1) <= set(tmp_base) - assert set(r2) <= set(tmp_base) - - # Test empty dataset. - ds = ray.data.from_items([]) - r1 = unbatch(ds.iter_batches(batch_size=2, local_shuffle_buffer_size=10)) - assert len(r1) == 0 - assert r1 == ds.take() - - -def test_iter_batches_grid(ray_start_regular_shared): - # Tests slicing, batch combining, and partial batch dropping logic over - # a grid of dataset, batching, and dropping configurations. - # Grid: num_blocks x num_rows_block_1 x ... 
x num_rows_block_N x - # batch_size x drop_last - seed = int(time.time()) - print(f"Seeding RNG for test_iter_batches_grid with: {seed}") - random.seed(seed) - max_num_blocks = 20 - max_num_rows_per_block = 20 - num_blocks_samples = 3 - block_sizes_samples = 3 - batch_size_samples = 3 - - for num_blocks in np.random.randint(1, max_num_blocks + 1, size=num_blocks_samples): - block_sizes_list = [ - np.random.randint(1, max_num_rows_per_block + 1, size=num_blocks) - for _ in range(block_sizes_samples) - ] - for block_sizes in block_sizes_list: - # Create the dataset with the given block sizes. - dfs = [] - running_size = 0 - for block_size in block_sizes: - dfs.append( - pd.DataFrame( - {"value": list(range(running_size, running_size + block_size))} - ) - ) - running_size += block_size - num_rows = running_size - ds = ray.data.from_pandas(dfs) - for batch_size in np.random.randint( - 1, num_rows + 1, size=batch_size_samples - ): - for drop_last in (False, True): - batches = list( - ds.iter_batches( - batch_size=batch_size, - drop_last=drop_last, - batch_format="pandas", - ) - ) - if num_rows % batch_size == 0 or not drop_last: - # Number of batches should be equal to - # num_rows / batch_size, rounded up. - assert len(batches) == math.ceil(num_rows / batch_size) - # Concatenated batches should equal the DataFrame - # representation of the entire dataset. - assert pd.concat(batches, ignore_index=True).equals( - ds.to_pandas() - ) - else: - # Number of batches should be equal to - # num_rows / batch_size, rounded down. - assert len(batches) == num_rows // batch_size - # Concatenated batches should equal the DataFrame - # representation of the dataset with the partial batch - # remainder sliced off. 
- assert pd.concat(batches, ignore_index=True).equals( - ds.to_pandas()[: batch_size * (num_rows // batch_size)] - ) - if num_rows % batch_size == 0 or drop_last: - assert all(len(batch) == batch_size for batch in batches) - else: - assert all(len(batch) == batch_size for batch in batches[:-1]) - assert len(batches[-1]) == num_rows % batch_size - - -def test_lazy_loading_iter_batches_exponential_rampup(ray_start_regular_shared): - ds = ray.data.range(32, parallelism=8) - expected_num_blocks = [1, 2, 4, 4, 8, 8, 8, 8] - for _, expected in zip(ds.iter_batches(batch_size=None), expected_num_blocks): - if ray.data.context.DatasetContext.get_current().use_streaming_executor: - # In streaming execution of ds.iter_batches(), there is no partial - # execution so _num_computed() in LazyBlocklist is 0. - assert ds._plan.execute()._num_computed() == 0 - else: - assert ds._plan.execute()._num_computed() == expected - - -def test_add_column(ray_start_regular_shared): - ds = ray.data.range(5).add_column("foo", lambda x: 1) - assert ds.take(1) == [{"value": 0, "foo": 1}] - - ds = ray.data.range_table(5).add_column("foo", lambda x: x["value"] + 1) - assert ds.take(1) == [{"value": 0, "foo": 1}] - - ds = ray.data.range_table(5).add_column("value", lambda x: x["value"] + 1) - assert ds.take(2) == [{"value": 1}, {"value": 2}] - - with pytest.raises(ValueError): - ds = ray.data.range(5).add_column("value", 0) - - -def test_drop_columns(ray_start_regular_shared, tmp_path): - df = pd.DataFrame({"col1": [1, 2, 3], "col2": [2, 3, 4], "col3": [3, 4, 5]}) - ds1 = ray.data.from_pandas(df) - ds1.write_parquet(str(tmp_path)) - ds2 = ray.data.read_parquet(str(tmp_path)) - - for ds in [ds1, ds2]: - assert ds.drop_columns(["col2"]).take(1) == [{"col1": 1, "col3": 3}] - assert ds.drop_columns(["col1", "col3"]).take(1) == [{"col2": 2}] - assert ds.drop_columns([]).take(1) == [{"col1": 1, "col2": 2, "col3": 3}] - assert ds.drop_columns(["col1", "col2", "col3"]).take(1) == [{}] - assert 
ds.drop_columns(["col1", "col1", "col2", "col1"]).take(1) == [ - {"col3": 3} - ] - # Test dropping non-existent column - with pytest.raises(KeyError): - ds.drop_columns(["dummy_col", "col1", "col2"]).fully_executed() - - -def test_select_columns(ray_start_regular_shared): - # Test pandas and arrow - df = pd.DataFrame({"col1": [1, 2, 3], "col2": [2, 3, 4], "col3": [3, 4, 5]}) - ds1 = ray.data.from_pandas(df) - assert ds1.dataset_format() == "pandas" - - ds2 = ds1.map_batches(lambda pa: pa, batch_size=1, batch_format="pyarrow") - assert ds2.dataset_format() == "arrow" - - for each_ds in [ds1, ds2]: - assert each_ds.select_columns(cols=[]).take(1) == [{}] - assert each_ds.select_columns(cols=["col1", "col2", "col3"]).take(1) == [ - {"col1": 1, "col2": 2, "col3": 3} - ] - assert each_ds.select_columns(cols=["col1", "col2"]).take(1) == [ - {"col1": 1, "col2": 2} - ] - assert each_ds.select_columns(cols=["col2", "col1"]).take(1) == [ - {"col1": 1, "col2": 2} - ] - # Test selecting columns with duplicates - assert each_ds.select_columns(cols=["col1", "col2", "col2"]).schema().names == [ - "col1", - "col2", - "col2", - ] - # Test selecting a column that is not in the dataset schema - with pytest.raises(KeyError): - each_ds.select_columns(cols=["col1", "col2", "dummy_col"]).fully_executed() - - # Test simple - ds3 = ray.data.range(10) - assert ds3.dataset_format() == "simple" - with pytest.raises(ValueError): - ds3.select_columns(cols=[]).fully_executed() - - -def test_map_batches_basic(ray_start_regular_shared, tmp_path, restore_dataset_context): - ctx = DatasetContext.get_current() - ctx.execution_options.preserve_order = True - - # Test input validation - ds = ray.data.range(5) - with pytest.raises(ValueError): - ds.map_batches(lambda x: x + 1, batch_format="pyarrow", batch_size=-1).take() - - # Set up. 
- df = pd.DataFrame({"one": [1, 2, 3], "two": [2, 3, 4]}) - table = pa.Table.from_pandas(df) - pq.write_table(table, os.path.join(tmp_path, "test1.parquet")) - - # Test pandas - ds = ray.data.read_parquet(str(tmp_path)) - ds2 = ds.map_batches(lambda df: df + 1, batch_size=1, batch_format="pandas") - assert ds2.dataset_format() == "pandas" - ds_list = ds2.take() - values = [s["one"] for s in ds_list] - assert values == [2, 3, 4] - values = [s["two"] for s in ds_list] - assert values == [3, 4, 5] - - # Test Pyarrow - ds = ray.data.read_parquet(str(tmp_path)) - ds2 = ds.map_batches(lambda pa: pa, batch_size=1, batch_format="pyarrow") - assert ds2.dataset_format() == "arrow" - ds_list = ds2.take() - values = [s["one"] for s in ds_list] - assert values == [1, 2, 3] - values = [s["two"] for s in ds_list] - assert values == [2, 3, 4] - - # Test batch - size = 300 - ds = ray.data.range(size) - ds2 = ds.map_batches(lambda df: df + 1, batch_size=17, batch_format="pandas") - assert ds2.dataset_format() == "pandas" - ds_list = ds2.take_all() - for i in range(size): - # The pandas column is "value", and it originally has rows from 0~299. - # After the map batch, it should have 1~300. - row = ds_list[i] - assert row["value"] == i + 1 - assert ds.count() == 300 - - # Test the lambda returns different types than the batch_format - # pandas => list block - ds = ray.data.read_parquet(str(tmp_path)) - ds2 = ds.map_batches(lambda df: [1], batch_size=1) - assert ds2.dataset_format() == "simple" - ds_list = ds2.take() - assert ds_list == [1, 1, 1] - assert ds.count() == 3 - - # pyarrow => list block - ds = ray.data.read_parquet(str(tmp_path)) - ds2 = ds.map_batches(lambda df: [1], batch_size=1, batch_format="pyarrow") - assert ds2.dataset_format() == "simple" - ds_list = ds2.take() - assert ds_list == [1, 1, 1] - assert ds.count() == 3 - - # Test the wrong return value raises an exception. 
- ds = ray.data.read_parquet(str(tmp_path)) - with pytest.raises(ValueError): - ds_list = ds.map_batches( - lambda df: 1, batch_size=2, batch_format="pyarrow" - ).take() - - -def test_map_batches_extra_args(ray_start_regular_shared, tmp_path): - def put(x): - # We only support automatic deref in the legacy backend. - if DatasetContext.get_current().new_execution_backend: - return x - else: - return ray.put(x) - - # Test input validation - ds = ray.data.range(5) - - class Foo: - def __call__(self, df): - return df - - with pytest.raises(ValueError): - # CallableClass not supported for task compute strategy, which is the default. - ds.map_batches(Foo) - - with pytest.raises(ValueError): - # CallableClass not supported for task compute strategy. - ds.map_batches(Foo, compute="tasks") - - with pytest.raises(ValueError): - # fn_constructor_args and fn_constructor_kwargs only supported for actor - # compute strategy. - ds.map_batches( - lambda x: x, - compute="tasks", - fn_constructor_args=(1,), - fn_constructor_kwargs={"a": 1}, - ) - - with pytest.raises(ValueError): - # fn_constructor_args and fn_constructor_kwargs only supported for callable - # class UDFs. - ds.map_batches( - lambda x: x, - compute="actors", - fn_constructor_args=(1,), - fn_constructor_kwargs={"a": 1}, - ) - - # Set up. - df = pd.DataFrame({"one": [1, 2, 3], "two": [2, 3, 4]}) - table = pa.Table.from_pandas(df) - pq.write_table(table, os.path.join(tmp_path, "test1.parquet")) - - # Test extra UDF args. - # Test positional. - def udf(batch, a): - assert a == 1 - return batch + a - - ds = ray.data.read_parquet(str(tmp_path)) - ds2 = ds.map_batches( - udf, - batch_size=1, - batch_format="pandas", - fn_args=(put(1),), - ) - assert ds2.dataset_format() == "pandas" - ds_list = ds2.take() - values = [s["one"] for s in ds_list] - assert values == [2, 3, 4] - values = [s["two"] for s in ds_list] - assert values == [3, 4, 5] - - # Test kwargs. 
- def udf(batch, b=None): - assert b == 2 - return b * batch - - ds = ray.data.read_parquet(str(tmp_path)) - ds2 = ds.map_batches( - udf, - batch_size=1, - batch_format="pandas", - fn_kwargs={"b": put(2)}, - ) - assert ds2.dataset_format() == "pandas" - ds_list = ds2.take() - values = [s["one"] for s in ds_list] - assert values == [2, 4, 6] - values = [s["two"] for s in ds_list] - assert values == [4, 6, 8] - - # Test both. - def udf(batch, a, b=None): - assert a == 1 - assert b == 2 - return b * batch + a - - ds = ray.data.read_parquet(str(tmp_path)) - ds2 = ds.map_batches( - udf, - batch_size=1, - batch_format="pandas", - fn_args=(put(1),), - fn_kwargs={"b": put(2)}, - ) - assert ds2.dataset_format() == "pandas" - ds_list = ds2.take() - values = [s["one"] for s in ds_list] - assert values == [3, 5, 7] - values = [s["two"] for s in ds_list] - assert values == [5, 7, 9] - - # Test constructor UDF args. - # Test positional. - class CallableFn: - def __init__(self, a): - assert a == 1 - self.a = a - - def __call__(self, x): - return x + self.a - - ds = ray.data.read_parquet(str(tmp_path)) - ds2 = ds.map_batches( - CallableFn, - batch_size=1, - batch_format="pandas", - compute="actors", - fn_constructor_args=(put(1),), - ) - assert ds2.dataset_format() == "pandas" - ds_list = ds2.take() - values = [s["one"] for s in ds_list] - assert values == [2, 3, 4] - values = [s["two"] for s in ds_list] - assert values == [3, 4, 5] - - # Test kwarg. - class CallableFn: - def __init__(self, b=None): - assert b == 2 - self.b = b - - def __call__(self, x): - return self.b * x - - ds = ray.data.read_parquet(str(tmp_path)) - ds2 = ds.map_batches( - CallableFn, - batch_size=1, - batch_format="pandas", - compute="actors", - fn_constructor_kwargs={"b": put(2)}, - ) - assert ds2.dataset_format() == "pandas" - ds_list = ds2.take() - values = [s["one"] for s in ds_list] - assert values == [2, 4, 6] - values = [s["two"] for s in ds_list] - assert values == [4, 6, 8] - - # Test both. 
- class CallableFn: - def __init__(self, a, b=None): - assert a == 1 - assert b == 2 - self.a = a - self.b = b - - def __call__(self, x): - return self.b * x + self.a - - ds = ray.data.read_parquet(str(tmp_path)) - ds2 = ds.map_batches( - CallableFn, - batch_size=1, - batch_format="pandas", - compute="actors", - fn_constructor_args=(put(1),), - fn_constructor_kwargs={"b": put(2)}, - ) - assert ds2.dataset_format() == "pandas" - ds_list = ds2.take() - values = [s["one"] for s in ds_list] - assert values == [3, 5, 7] - values = [s["two"] for s in ds_list] - assert values == [5, 7, 9] - - # Test callable chain. - ds = ray.data.read_parquet(str(tmp_path)) - fn_constructor_args = (put(1),) - fn_constructor_kwargs = {"b": put(2)} - ds2 = ( - ds.lazy() - .map_batches( - CallableFn, - batch_size=1, - batch_format="pandas", - compute="actors", - fn_constructor_args=fn_constructor_args, - fn_constructor_kwargs=fn_constructor_kwargs, - ) - .map_batches( - CallableFn, - batch_size=1, - batch_format="pandas", - compute="actors", - fn_constructor_args=fn_constructor_args, - fn_constructor_kwargs=fn_constructor_kwargs, - ) - ) - assert ds2.dataset_format() == "pandas" - ds_list = ds2.take() - values = [s["one"] for s in ds_list] - assert values == [7, 11, 15] - values = [s["two"] for s in ds_list] - assert values == [11, 15, 19] - - # Test function + callable chain. 
- ds = ray.data.read_parquet(str(tmp_path)) - fn_constructor_args = (put(1),) - fn_constructor_kwargs = {"b": put(2)} - ds2 = ( - ds.lazy() - .map_batches( - lambda df, a, b=None: b * df + a, - batch_size=1, - batch_format="pandas", - compute="actors", - fn_args=(put(1),), - fn_kwargs={"b": put(2)}, - ) - .map_batches( - CallableFn, - batch_size=1, - batch_format="pandas", - compute="actors", - fn_constructor_args=fn_constructor_args, - fn_constructor_kwargs=fn_constructor_kwargs, - ) - ) - assert ds2.dataset_format() == "pandas" - ds_list = ds2.take() - values = [s["one"] for s in ds_list] - assert values == [7, 11, 15] - values = [s["two"] for s in ds_list] - assert values == [11, 15, 19] - - -def test_map_batches_generator(ray_start_regular_shared, tmp_path): - # Set up. - df = pd.DataFrame({"one": [1, 2, 3], "two": [2, 3, 4]}) - table = pa.Table.from_pandas(df) - pq.write_table(table, os.path.join(tmp_path, "test1.parquet")) - - def pandas_generator(batch: pd.DataFrame) -> Iterator[pd.DataFrame]: - for i in range(len(batch)): - yield batch.iloc[[i]] + 1 - - ds = ray.data.read_parquet(str(tmp_path)) - ds2 = ds.map_batches(pandas_generator, batch_size=1, batch_format="pandas") - assert ds2.dataset_format() == "pandas" - ds_list = ds2.take() - values = sorted([s["one"] for s in ds_list]) - assert values == [2, 3, 4] - values = sorted([s["two"] for s in ds_list]) - assert values == [3, 4, 5] - - def fail_generator(batch): - for i in range(len(batch)): - yield i - - # Test the wrong return value raises an exception. - ds = ray.data.read_parquet(str(tmp_path)) - with pytest.raises(ValueError): - ds_list = ds.map_batches( - fail_generator, batch_size=2, batch_format="pyarrow" - ).take() - - -def test_map_batches_actors_preserves_order(ray_start_regular_shared): - # Test that actor compute model preserves block order. 
- ds = ray.data.range(10, parallelism=5) - assert ds.map_batches(lambda x: x, compute="actors").take() == list(range(10)) - - -@pytest.mark.parametrize( - "num_rows,num_blocks,batch_size", - [ - (10, 5, 2), - (10, 1, 10), - (12, 3, 2), - ], -) -def test_map_batches_batch_mutation( - ray_start_regular_shared, num_rows, num_blocks, batch_size, restore_dataset_context -): - ctx = DatasetContext.get_current() - ctx.execution_options.preserve_order = True - - # Test that batch mutation works without encountering a read-only error (e.g. if the - # batch is a zero-copy view on data in the object store). - def mutate(df): - df["value"] += 1 - return df - - ds = ray.data.range_table(num_rows, parallelism=num_blocks).repartition(num_blocks) - # Convert to Pandas blocks. - ds = ds.map_batches(lambda df: df, batch_format="pandas", batch_size=None) - - # Apply UDF that mutates the batches. - ds = ds.map_batches(mutate, batch_size=batch_size) - assert [row["value"] for row in ds.iter_rows()] == list(range(1, num_rows + 1)) - - -@pytest.mark.parametrize( - "num_rows,num_blocks,batch_size", - [ - (10, 5, 2), - (10, 1, 10), - (12, 3, 2), - ], -) -def test_map_batches_batch_zero_copy( - ray_start_regular_shared, num_rows, num_blocks, batch_size -): - # Test that batches are zero-copy read-only views when zero_copy_batch=True. - def mutate(df): - # Check that batch is read-only. - assert not df.values.flags.writeable - df["value"] += 1 - return df - - ds = ray.data.range_table(num_rows, parallelism=num_blocks).repartition(num_blocks) - # Convert to Pandas blocks. - ds = ds.map_batches(lambda df: df, batch_format="pandas", batch_size=None) - ds.fully_executed() - - # Apply UDF that mutates the batches, which should fail since the batch is - # read-only. 
- with pytest.raises(ValueError, match="tried to mutate a zero-copy read-only batch"): - ds = ds.map_batches(mutate, batch_size=batch_size, zero_copy_batch=True) - ds.fully_executed() - - -BLOCK_BUNDLING_TEST_CASES = [ - (block_size, batch_size) - for batch_size in range(1, 8) - for block_size in range(1, 2 * batch_size + 1) -] - - -@pytest.mark.parametrize("block_size,batch_size", BLOCK_BUNDLING_TEST_CASES) -def test_map_batches_block_bundling_auto( - ray_start_regular_shared, block_size, batch_size -): - # Ensure that we test at least 2 batches worth of blocks. - num_blocks = max(10, 2 * batch_size // block_size) - ds = ray.data.range(num_blocks * block_size, parallelism=num_blocks) - # Confirm that we have the expected number of initial blocks. - assert ds.num_blocks() == num_blocks - - # Blocks should be bundled up to the batch size. - ds1 = ds.map_batches(lambda x: x, batch_size=batch_size).fully_executed() - assert ds1.num_blocks() == math.ceil(num_blocks / max(batch_size // block_size, 1)) - - # Blocks should not be bundled up when batch_size is not specified. - ds2 = ds.map_batches(lambda x: x).fully_executed() - assert ds2.num_blocks() == num_blocks - - -@pytest.mark.parametrize( - "block_sizes,batch_size,expected_num_blocks", - [ - ([1, 2], 3, 1), - ([2, 2, 1], 3, 2), - ([1, 2, 3, 4], 4, 3), - ([3, 1, 1, 3], 4, 2), - ([2, 4, 1, 8], 4, 4), - ([1, 1, 1, 1], 4, 1), - ([1, 0, 3, 2], 4, 2), - ([4, 4, 4, 4], 4, 4), - ], -) -def test_map_batches_block_bundling_skewed_manual( - ray_start_regular_shared, block_sizes, batch_size, expected_num_blocks -): - num_blocks = len(block_sizes) - ds = ray.data.from_pandas( - [pd.DataFrame({"a": [1] * block_size}) for block_size in block_sizes] - ) - # Confirm that we have the expected number of initial blocks. - assert ds.num_blocks() == num_blocks - ds = ds.map_batches(lambda x: x, batch_size=batch_size).fully_executed() - - # Blocks should be bundled up to the batch size. 
- assert ds.num_blocks() == expected_num_blocks - - -BLOCK_BUNDLING_SKEWED_TEST_CASES = [ - (block_sizes, batch_size) - for batch_size in range(1, 4) - for num_blocks in range(1, batch_size + 1) - for block_sizes in itertools.product( - range(1, 2 * batch_size + 1), repeat=num_blocks - ) -] - - -@pytest.mark.parametrize("block_sizes,batch_size", BLOCK_BUNDLING_SKEWED_TEST_CASES) -def test_map_batches_block_bundling_skewed_auto( - ray_start_regular_shared, block_sizes, batch_size -): - num_blocks = len(block_sizes) - ds = ray.data.from_pandas( - [pd.DataFrame({"a": [1] * block_size}) for block_size in block_sizes] - ) - # Confirm that we have the expected number of initial blocks. - assert ds.num_blocks() == num_blocks - ds = ds.map_batches(lambda x: x, batch_size=batch_size).fully_executed() - curr = 0 - num_out_blocks = 0 - for block_size in block_sizes: - if curr > 0 and curr + block_size > batch_size: - num_out_blocks += 1 - curr = 0 - curr += block_size - if curr > 0: - num_out_blocks += 1 - - # Blocks should be bundled up to the batch size. - assert ds.num_blocks() == num_out_blocks - - -def test_map_with_mismatched_columns(ray_start_regular_shared): - def bad_fn(row): - if row > 5: - return {"a": "hello1"} - else: - return {"b": "hello1"} - - def good_fn(row): - if row > 5: - return {"a": "hello1", "b": "hello2"} - else: - return {"b": "hello2", "a": "hello1"} - - ds = ray.data.range(10, parallelism=1) - error_message = "Current row has different columns compared to previous rows." - with pytest.raises(ValueError) as e: - ds.map(bad_fn).fully_executed() - assert error_message in str(e.value) - ds_map = ds.map(good_fn) - assert ds_map.take() == [{"a": "hello1", "b": "hello2"} for _ in range(10)] - - -def test_union(ray_start_regular_shared): - ds = ray.data.range(20, parallelism=10) - - # Test lazy union. 
- ds = ds.union(ds, ds, ds, ds) - assert ds.num_blocks() == 50 - assert ds.count() == 100 - assert ds.sum() == 950 - - ds = ds.union(ds) - assert ds.count() == 200 - assert ds.sum() == (950 * 2) - - # Test materialized union. - ds2 = ray.data.from_items([1, 2, 3, 4, 5]) - assert ds2.count() == 5 - ds2 = ds2.union(ds2) - assert ds2.count() == 10 - ds2 = ds2.union(ds) - assert ds2.count() == 210 - - -def test_from_dask(ray_start_regular_shared): - import dask.dataframe as dd - - df = pd.DataFrame({"one": list(range(100)), "two": list(range(100))}) - ddf = dd.from_pandas(df, npartitions=10) - ds = ray.data.from_dask(ddf) - dfds = ds.to_pandas() - assert df.equals(dfds) - - -@pytest.mark.parametrize("ds_format", ["pandas", "arrow"]) -def test_to_dask(ray_start_regular_shared, ds_format): - from ray.util.dask import ray_dask_get - - df1 = pd.DataFrame({"one": [1, 2, 3], "two": ["a", "b", "c"]}) - df2 = pd.DataFrame({"one": [4, 5, 6], "two": ["e", "f", "g"]}) - df = pd.concat([df1, df2]) - ds = ray.data.from_pandas([df1, df2]) - if ds_format == "arrow": - ds = ds.map_batches(lambda df: df, batch_format="pyarrow", batch_size=None) - ddf = ds.to_dask() - meta = ddf._meta - # Check metadata. - assert isinstance(meta, pd.DataFrame) - assert meta.empty - assert list(meta.columns) == ["one", "two"] - assert list(meta.dtypes) == [np.int64, object] - # Explicit Dask-on-Ray - assert df.equals(ddf.compute(scheduler=ray_dask_get)) - # Implicit Dask-on-Ray. - assert df.equals(ddf.compute()) - - # Explicit metadata. - df1["two"] = df1["two"].astype(pd.StringDtype()) - df2["two"] = df2["two"].astype(pd.StringDtype()) - df = pd.concat([df1, df2]) - ds = ray.data.from_pandas([df1, df2]) - if ds_format == "arrow": - ds = ds.map_batches(lambda df: df, batch_format="pyarrow", batch_size=None) - ddf = ds.to_dask( - meta=pd.DataFrame( - {"one": pd.Series(dtype=np.int16), "two": pd.Series(dtype=pd.StringDtype())} - ), - ) - meta = ddf._meta - # Check metadata. 
- assert isinstance(meta, pd.DataFrame) - assert meta.empty - assert list(meta.columns) == ["one", "two"] - assert list(meta.dtypes) == [np.int16, pd.StringDtype()] - # Explicit Dask-on-Ray - assert df.equals(ddf.compute(scheduler=ray_dask_get)) - # Implicit Dask-on-Ray. - assert df.equals(ddf.compute()) - - -def test_to_dask_tensor_column_cast_pandas(ray_start_regular_shared): - # Check that tensor column casting occurs when converting a Dataset to a Dask - # DataFrame. - data = np.arange(12).reshape((3, 2, 2)) - ctx = ray.data.context.DatasetContext.get_current() - original = ctx.enable_tensor_extension_casting - try: - ctx.enable_tensor_extension_casting = True - in_df = pd.DataFrame({"a": TensorArray(data)}) - ds = ray.data.from_pandas(in_df) - dtypes = ds.schema().types - assert len(dtypes) == 1 - assert isinstance(dtypes[0], TensorDtype) - out_df = ds.to_dask().compute() - assert out_df["a"].dtype.type is np.object_ - expected_df = pd.DataFrame({"a": list(data)}) - pd.testing.assert_frame_equal(out_df, expected_df) - finally: - ctx.enable_tensor_extension_casting = original - - -def test_to_dask_tensor_column_cast_arrow(ray_start_regular_shared): - # Check that tensor column casting occurs when converting a Dataset to a Dask - # DataFrame. 
- data = np.arange(12).reshape((3, 2, 2)) - ctx = ray.data.context.DatasetContext.get_current() - original = ctx.enable_tensor_extension_casting - try: - ctx.enable_tensor_extension_casting = True - in_table = pa.table({"a": ArrowTensorArray.from_numpy(data)}) - ds = ray.data.from_arrow(in_table) - dtype = ds.schema().field(0).type - assert isinstance(dtype, ArrowTensorType) - out_df = ds.to_dask().compute() - assert out_df["a"].dtype.type is np.object_ - expected_df = pd.DataFrame({"a": list(data)}) - pd.testing.assert_frame_equal(out_df, expected_df) - finally: - ctx.enable_tensor_extension_casting = original - - -def test_from_modin(ray_start_regular_shared): - import modin.pandas as mopd - - df = pd.DataFrame( - {"one": list(range(100)), "two": list(range(100))}, - ) - modf = mopd.DataFrame(df) - ds = ray.data.from_modin(modf) - dfds = ds.to_pandas() - assert df.equals(dfds) - - -def test_to_modin(ray_start_regular_shared): - # create two modin dataframes - # one directly from a pandas dataframe, and - # another from ray.dataset created from the original pandas dataframe - # - import modin.pandas as mopd - - df = pd.DataFrame( - {"one": list(range(100)), "two": list(range(100))}, - ) - modf1 = mopd.DataFrame(df) - ds = ray.data.from_pandas([df]) - modf2 = ds.to_modin() - assert modf1.equals(modf2) - - -@pytest.mark.parametrize("pipelined", [False, True]) -def test_iter_tf_batches(ray_start_regular_shared, pipelined): - df1 = pd.DataFrame( - {"one": [1, 2, 3], "two": [1.0, 2.0, 3.0], "label": [1.0, 2.0, 3.0]} - ) - df2 = pd.DataFrame( - {"one": [4, 5, 6], "two": [4.0, 5.0, 6.0], "label": [4.0, 5.0, 6.0]} - ) - df3 = pd.DataFrame({"one": [7, 8], "two": [7.0, 8.0], "label": [7.0, 8.0]}) - df = pd.concat([df1, df2, df3]) - ds = ray.data.from_pandas([df1, df2, df3]) - ds = maybe_pipeline(ds, pipelined) - - num_epochs = 1 if pipelined else 2 - for _ in range(num_epochs): - iterations = [] - for batch in ds.iter_tf_batches(batch_size=3): - iterations.append( - 
np.stack((batch["one"], batch["two"], batch["label"]), axis=1) - ) - combined_iterations = np.concatenate(iterations) - np.testing.assert_array_equal(np.sort(df.values), np.sort(combined_iterations)) - - -@pytest.mark.parametrize("pipelined", [False, True]) -def test_iter_tf_batches_tensor_ds(ray_start_regular_shared, pipelined): - arr1 = np.arange(12).reshape((3, 2, 2)) - arr2 = np.arange(12, 24).reshape((3, 2, 2)) - arr = np.concatenate((arr1, arr2)) - ds = ray.data.from_numpy([arr1, arr2]) - ds = maybe_pipeline(ds, pipelined) - - num_epochs = 1 if pipelined else 2 - for _ in range(num_epochs): - iterations = [] - for batch in ds.iter_tf_batches(batch_size=2): - iterations.append(batch) - combined_iterations = np.concatenate(iterations) - np.testing.assert_array_equal(arr, combined_iterations) - - -def test_block_builder_for_block(ray_start_regular_shared): - # list - builder = BlockBuilder.for_block(list()) - builder.add_block([1, 2]) - assert builder.build() == [1, 2] - builder.add_block([3, 4]) - assert builder.build() == [1, 2, 3, 4] - - # pandas dataframe - builder = BlockBuilder.for_block(pd.DataFrame()) - b1 = pd.DataFrame({"A": [1], "B": ["a"]}) - builder.add_block(b1) - assert builder.build().equals(b1) - b2 = pd.DataFrame({"A": [2, 3], "B": ["c", "d"]}) - builder.add_block(b2) - expected = pd.DataFrame({"A": [1, 2, 3], "B": ["a", "c", "d"]}) - assert builder.build().equals(expected) - - # pyarrow table - builder = BlockBuilder.for_block(pa.Table.from_arrays(list())) - b1 = pa.Table.from_pydict({"A": [1], "B": ["a"]}) - builder.add_block(b1) - builder.build().equals(b1) - b2 = pa.Table.from_pydict({"A": [2, 3], "B": ["c", "d"]}) - builder.add_block(b2) - expected = pa.Table.from_pydict({"A": [1, 2, 3], "B": ["a", "c", "d"]}) - builder.build().equals(expected) - - # wrong type - with pytest.raises(TypeError): - BlockBuilder.for_block(str()) - - -def test_grouped_dataset_repr(ray_start_regular_shared): - ds = ray.data.from_items([{"key": "spam"}, {"key": 
"ham"}, {"key": "spam"}]) - assert repr(ds.groupby("key")) == f"GroupedDataset(dataset={ds!r}, key='key')" - - -def test_groupby_arrow(ray_start_regular_shared, use_push_based_shuffle): - # Test empty dataset. - agg_ds = ( - ray.data.range_table(10) - .filter(lambda r: r["value"] > 10) - .groupby("value") - .count() - ) - assert agg_ds.count() == 0 - - -def test_groupby_errors(ray_start_regular_shared): - ds = ray.data.range(100) - - ds.groupby(None).count().show() # OK - ds.groupby(lambda x: x % 2).count().show() # OK - with pytest.raises(ValueError): - ds.groupby("foo").count().show() - - ds = ray.data.range_table(100) - ds.groupby(None).count().show() # OK - with pytest.raises(ValueError): - ds.groupby(lambda x: x % 2).count().show() - - -def test_agg_errors(ray_start_regular_shared): - ds = ray.data.range(100) - from ray.data.aggregate import Max - - ds.aggregate(Max()) # OK - ds.aggregate(Max(lambda x: x)) # OK - with pytest.raises(ValueError): - ds.aggregate(Max("foo")) - - ds = ray.data.range_table(100) - ds.aggregate(Max("value")) # OK - with pytest.raises(ValueError): - ds.aggregate(Max()) - with pytest.raises(ValueError): - ds.aggregate(Max(lambda x: x)) - with pytest.raises(ValueError): - ds.aggregate(Max("bad_field")) - - -@pytest.mark.parametrize("num_parts", [1, 30]) -def test_groupby_agg_name_conflict(ray_start_regular_shared, num_parts): - # Test aggregation name conflict. 
- xs = list(range(100)) - grouped_ds = ( - ray.data.from_items([{"A": (x % 3), "B": x} for x in xs]) - .repartition(num_parts) - .groupby("A") - ) - agg_ds = grouped_ds.aggregate( - AggregateFn( - init=lambda k: [0, 0], - accumulate_row=lambda a, r: [a[0] + r["B"], a[1] + 1], - merge=lambda a1, a2: [a1[0] + a2[0], a1[1] + a2[1]], - finalize=lambda a: a[0] / a[1], - name="foo", - ), - AggregateFn( - init=lambda k: [0, 0], - accumulate_row=lambda a, r: [a[0] + r["B"], a[1] + 1], - merge=lambda a1, a2: [a1[0] + a2[0], a1[1] + a2[1]], - finalize=lambda a: a[0] / a[1], - name="foo", - ), - ) - assert agg_ds.count() == 3 - assert [row.as_pydict() for row in agg_ds.sort("A").iter_rows()] == [ - {"A": 0, "foo": 49.5, "foo_2": 49.5}, - {"A": 1, "foo": 49.0, "foo_2": 49.0}, - {"A": 2, "foo": 50.0, "foo_2": 50.0}, - ] - - -@pytest.mark.parametrize("num_parts", [1, 30]) -@pytest.mark.parametrize("ds_format", ["arrow", "pandas"]) -def test_groupby_tabular_count( - ray_start_regular_shared, ds_format, num_parts, use_push_based_shuffle -): - # Test built-in count aggregation - seed = int(time.time()) - print(f"Seeding RNG for test_groupby_arrow_count with: {seed}") - random.seed(seed) - xs = list(range(100)) - random.shuffle(xs) - - def _to_pandas(ds): - return ds.map_batches(lambda x: x, batch_size=None, batch_format="pandas") - - ds = ray.data.from_items([{"A": (x % 3), "B": x} for x in xs]).repartition( - num_parts - ) - if ds_format == "pandas": - ds = _to_pandas(ds) - agg_ds = ds.groupby("A").count() - assert agg_ds.count() == 3 - assert [row.as_pydict() for row in agg_ds.sort("A").iter_rows()] == [ - {"A": 0, "count()": 34}, - {"A": 1, "count()": 33}, - {"A": 2, "count()": 33}, - ] - - -@pytest.mark.parametrize("num_parts", [1, 30]) -@pytest.mark.parametrize("ds_format", ["arrow", "pandas"]) -def test_groupby_tabular_sum( - ray_start_regular_shared, ds_format, num_parts, use_push_based_shuffle -): - # Test built-in sum aggregation - seed = int(time.time()) - print(f"Seeding 
RNG for test_groupby_tabular_sum with: {seed}") - random.seed(seed) - xs = list(range(100)) - random.shuffle(xs) - - def _to_pandas(ds): - return ds.map_batches(lambda x: x, batch_size=None, batch_format="pandas") - - ds = ray.data.from_items([{"A": (x % 3), "B": x} for x in xs]).repartition( - num_parts - ) - if ds_format == "pandas": - ds = _to_pandas(ds) - - agg_ds = ds.groupby("A").sum("B") - assert agg_ds.count() == 3 - assert [row.as_pydict() for row in agg_ds.sort("A").iter_rows()] == [ - {"A": 0, "sum(B)": 1683}, - {"A": 1, "sum(B)": 1617}, - {"A": 2, "sum(B)": 1650}, - ] - - # Test built-in sum aggregation with nans - ds = ray.data.from_items( - [{"A": (x % 3), "B": x} for x in xs] + [{"A": 0, "B": None}] - ).repartition(num_parts) - if ds_format == "pandas": - ds = _to_pandas(ds) - nan_grouped_ds = ds.groupby("A") - nan_agg_ds = nan_grouped_ds.sum("B") - assert nan_agg_ds.count() == 3 - assert [row.as_pydict() for row in nan_agg_ds.sort("A").iter_rows()] == [ - {"A": 0, "sum(B)": 1683}, - {"A": 1, "sum(B)": 1617}, - {"A": 2, "sum(B)": 1650}, - ] - # Test ignore_nulls=False - nan_agg_ds = nan_grouped_ds.sum("B", ignore_nulls=False) - assert nan_agg_ds.count() == 3 - pd.testing.assert_frame_equal( - nan_agg_ds.sort("A").to_pandas(), - pd.DataFrame( - { - "A": [0, 1, 2], - "sum(B)": [None, 1617, 1650], - } - ), - check_dtype=False, - ) - # Test all nans - ds = ray.data.from_items([{"A": (x % 3), "B": None} for x in xs]).repartition( - num_parts - ) - if ds_format == "pandas": - ds = _to_pandas(ds) - nan_agg_ds = ds.groupby("A").sum("B") - assert nan_agg_ds.count() == 3 - pd.testing.assert_frame_equal( - nan_agg_ds.sort("A").to_pandas(), - pd.DataFrame( - { - "A": [0, 1, 2], - "sum(B)": [None, None, None], - } - ), - ) - - -@pytest.mark.parametrize("num_parts", [1, 30]) -@pytest.mark.parametrize("ds_format", ["arrow", "pandas"]) -def test_global_tabular_sum(ray_start_regular_shared, ds_format, num_parts): - seed = int(time.time()) - print(f"Seeding RNG for 
test_global_arrow_sum with: {seed}") - random.seed(seed) - xs = list(range(100)) - random.shuffle(xs) - - def _to_pandas(ds): - return ds.map_batches(lambda x: x, batch_size=None, batch_format="pandas") - - # Test built-in global sum aggregation - ds = ray.data.from_items([{"A": x} for x in xs]).repartition(num_parts) - if ds_format == "pandas": - ds = _to_pandas(ds) - assert ds.sum("A") == 4950 - - # Test empty dataset - ds = ray.data.range_table(10) - if ds_format == "pandas": - ds = _to_pandas(ds) - assert ds.filter(lambda r: r["value"] > 10).sum("value") is None - - # Test built-in global sum aggregation with nans - nan_ds = ray.data.from_items([{"A": x} for x in xs] + [{"A": None}]).repartition( - num_parts - ) - if ds_format == "pandas": - nan_ds = _to_pandas(nan_ds) - assert nan_ds.sum("A") == 4950 - # Test ignore_nulls=False - assert nan_ds.sum("A", ignore_nulls=False) is None - # Test all nans - nan_ds = ray.data.from_items([{"A": None}] * len(xs)).repartition(num_parts) - if ds_format == "pandas": - nan_ds = _to_pandas(nan_ds) - assert nan_ds.sum("A") is None - assert nan_ds.sum("A", ignore_nulls=False) is None - - -@pytest.mark.parametrize("num_parts", [1, 30]) -@pytest.mark.parametrize("ds_format", ["arrow", "pandas"]) -def test_groupby_tabular_min(ray_start_regular_shared, ds_format, num_parts): - # Test built-in min aggregation - seed = int(time.time()) - print(f"Seeding RNG for test_groupby_tabular_min with: {seed}") - random.seed(seed) - xs = list(range(100)) - random.shuffle(xs) - - def _to_pandas(ds): - return ds.map_batches(lambda x: x, batch_size=None, batch_format="pandas") - - ds = ray.data.from_items([{"A": (x % 3), "B": x} for x in xs]).repartition( - num_parts - ) - if ds_format == "pandas": - ds = _to_pandas(ds) - - agg_ds = ds.groupby("A").min("B") - assert agg_ds.count() == 3 - assert [row.as_pydict() for row in agg_ds.sort("A").iter_rows()] == [ - {"A": 0, "min(B)": 0}, - {"A": 1, "min(B)": 1}, - {"A": 2, "min(B)": 2}, - ] - - # Test 
built-in min aggregation with nans - ds = ray.data.from_items( - [{"A": (x % 3), "B": x} for x in xs] + [{"A": 0, "B": None}] - ).repartition(num_parts) - if ds_format == "pandas": - ds = _to_pandas(ds) - nan_grouped_ds = ds.groupby("A") - nan_agg_ds = nan_grouped_ds.min("B") - assert nan_agg_ds.count() == 3 - assert [row.as_pydict() for row in nan_agg_ds.sort("A").iter_rows()] == [ - {"A": 0, "min(B)": 0}, - {"A": 1, "min(B)": 1}, - {"A": 2, "min(B)": 2}, - ] - # Test ignore_nulls=False - nan_agg_ds = nan_grouped_ds.min("B", ignore_nulls=False) - assert nan_agg_ds.count() == 3 - pd.testing.assert_frame_equal( - nan_agg_ds.sort("A").to_pandas(), - pd.DataFrame( - { - "A": [0, 1, 2], - "min(B)": [None, 1, 2], - } - ), - check_dtype=False, - ) - # Test all nans - ds = ray.data.from_items([{"A": (x % 3), "B": None} for x in xs]).repartition( - num_parts - ) - if ds_format == "pandas": - ds = _to_pandas(ds) - nan_agg_ds = ds.groupby("A").min("B") - assert nan_agg_ds.count() == 3 - pd.testing.assert_frame_equal( - nan_agg_ds.sort("A").to_pandas(), - pd.DataFrame( - { - "A": [0, 1, 2], - "min(B)": [None, None, None], - } - ), - check_dtype=False, - ) - - -@pytest.mark.parametrize("num_parts", [1, 30]) -@pytest.mark.parametrize("ds_format", ["arrow", "pandas"]) -def test_global_tabular_min(ray_start_regular_shared, ds_format, num_parts): - seed = int(time.time()) - print(f"Seeding RNG for test_global_arrow_min with: {seed}") - random.seed(seed) - xs = list(range(100)) - random.shuffle(xs) - - def _to_pandas(ds): - return ds.map_batches(lambda x: x, batch_size=None, batch_format="pandas") - - # Test built-in global min aggregation - ds = ray.data.from_items([{"A": x} for x in xs]).repartition(num_parts) - if ds_format == "pandas": - ds = _to_pandas(ds) - assert ds.min("A") == 0 - - # Test empty dataset - ds = ray.data.range_table(10) - if ds_format == "pandas": - ds = _to_pandas(ds) - assert ds.filter(lambda r: r["value"] > 10).min("value") is None - - # Test built-in 
global min aggregation with nans - nan_ds = ray.data.from_items([{"A": x} for x in xs] + [{"A": None}]).repartition( - num_parts - ) - if ds_format == "pandas": - nan_ds = _to_pandas(nan_ds) - assert nan_ds.min("A") == 0 - # Test ignore_nulls=False - assert nan_ds.min("A", ignore_nulls=False) is None - # Test all nans - nan_ds = ray.data.from_items([{"A": None}] * len(xs)).repartition(num_parts) - if ds_format == "pandas": - nan_ds = _to_pandas(nan_ds) - assert nan_ds.min("A") is None - assert nan_ds.min("A", ignore_nulls=False) is None - - -@pytest.mark.parametrize("num_parts", [1, 30]) -@pytest.mark.parametrize("ds_format", ["arrow", "pandas"]) -def test_groupby_tabular_max(ray_start_regular_shared, ds_format, num_parts): - # Test built-in max aggregation - seed = int(time.time()) - print(f"Seeding RNG for test_groupby_tabular_max with: {seed}") - random.seed(seed) - xs = list(range(100)) - random.shuffle(xs) - - def _to_pandas(ds): - return ds.map_batches(lambda x: x, batch_size=None, batch_format="pandas") - - ds = ray.data.from_items([{"A": (x % 3), "B": x} for x in xs]).repartition( - num_parts - ) - if ds_format == "pandas": - ds = _to_pandas(ds) - - agg_ds = ds.groupby("A").max("B") - assert agg_ds.count() == 3 - assert [row.as_pydict() for row in agg_ds.sort("A").iter_rows()] == [ - {"A": 0, "max(B)": 99}, - {"A": 1, "max(B)": 97}, - {"A": 2, "max(B)": 98}, - ] - - # Test built-in min aggregation with nans - ds = ray.data.from_items( - [{"A": (x % 3), "B": x} for x in xs] + [{"A": 0, "B": None}] - ).repartition(num_parts) - if ds_format == "pandas": - ds = _to_pandas(ds) - nan_grouped_ds = ds.groupby("A") - nan_agg_ds = nan_grouped_ds.max("B") - assert nan_agg_ds.count() == 3 - assert [row.as_pydict() for row in nan_agg_ds.sort("A").iter_rows()] == [ - {"A": 0, "max(B)": 99}, - {"A": 1, "max(B)": 97}, - {"A": 2, "max(B)": 98}, - ] - # Test ignore_nulls=False - nan_agg_ds = nan_grouped_ds.max("B", ignore_nulls=False) - assert nan_agg_ds.count() == 3 - 
pd.testing.assert_frame_equal( - nan_agg_ds.sort("A").to_pandas(), - pd.DataFrame( - { - "A": [0, 1, 2], - "max(B)": [None, 97, 98], - } - ), - check_dtype=False, - ) - # Test all nans - ds = ray.data.from_items([{"A": (x % 3), "B": None} for x in xs]).repartition( - num_parts - ) - if ds_format == "pandas": - ds = _to_pandas(ds) - nan_agg_ds = ds.groupby("A").max("B") - assert nan_agg_ds.count() == 3 - pd.testing.assert_frame_equal( - nan_agg_ds.sort("A").to_pandas(), - pd.DataFrame( - { - "A": [0, 1, 2], - "max(B)": [None, None, None], - } - ), - check_dtype=False, - ) - - -@pytest.mark.parametrize("num_parts", [1, 30]) -@pytest.mark.parametrize("ds_format", ["arrow", "pandas"]) -def test_global_tabular_max(ray_start_regular_shared, ds_format, num_parts): - seed = int(time.time()) - print(f"Seeding RNG for test_global_arrow_max with: {seed}") - random.seed(seed) - xs = list(range(100)) - random.shuffle(xs) - - def _to_pandas(ds): - return ds.map_batches(lambda x: x, batch_size=None, batch_format="pandas") - - # Test built-in global max aggregation - ds = ray.data.from_items([{"A": x} for x in xs]).repartition(num_parts) - if ds_format == "pandas": - ds = _to_pandas(ds) - assert ds.max("A") == 99 - - # Test empty dataset - ds = ray.data.range_table(10) - if ds_format == "pandas": - ds = _to_pandas(ds) - assert ds.filter(lambda r: r["value"] > 10).max("value") is None - - # Test built-in global max aggregation with nans - nan_ds = ray.data.from_items([{"A": x} for x in xs] + [{"A": None}]).repartition( - num_parts - ) - if ds_format == "pandas": - nan_ds = _to_pandas(nan_ds) - assert nan_ds.max("A") == 99 - # Test ignore_nulls=False - assert nan_ds.max("A", ignore_nulls=False) is None - # Test all nans - nan_ds = ray.data.from_items([{"A": None}] * len(xs)).repartition(num_parts) - if ds_format == "pandas": - nan_ds = _to_pandas(nan_ds) - assert nan_ds.max("A") is None - assert nan_ds.max("A", ignore_nulls=False) is None - - -@pytest.mark.parametrize("num_parts", 
[1, 30]) -@pytest.mark.parametrize("ds_format", ["arrow", "pandas"]) -def test_groupby_tabular_mean(ray_start_regular_shared, ds_format, num_parts): - # Test built-in mean aggregation - seed = int(time.time()) - print(f"Seeding RNG for test_groupby_tabular_mean with: {seed}") - random.seed(seed) - xs = list(range(100)) - random.shuffle(xs) - - def _to_pandas(ds): - return ds.map_batches(lambda x: x, batch_size=None, batch_format="pandas") - - ds = ray.data.from_items([{"A": (x % 3), "B": x} for x in xs]).repartition( - num_parts - ) - if ds_format == "pandas": - ds = _to_pandas(ds) - - agg_ds = ds.groupby("A").mean("B") - assert agg_ds.count() == 3 - assert [row.as_pydict() for row in agg_ds.sort("A").iter_rows()] == [ - {"A": 0, "mean(B)": 49.5}, - {"A": 1, "mean(B)": 49.0}, - {"A": 2, "mean(B)": 50.0}, - ] - - # Test built-in mean aggregation with nans - ds = ray.data.from_items( - [{"A": (x % 3), "B": x} for x in xs] + [{"A": 0, "B": None}] - ).repartition(num_parts) - if ds_format == "pandas": - ds = _to_pandas(ds) - nan_grouped_ds = ds.groupby("A") - nan_agg_ds = nan_grouped_ds.mean("B") - assert nan_agg_ds.count() == 3 - assert [row.as_pydict() for row in nan_agg_ds.sort("A").iter_rows()] == [ - {"A": 0, "mean(B)": 49.5}, - {"A": 1, "mean(B)": 49.0}, - {"A": 2, "mean(B)": 50.0}, - ] - # Test ignore_nulls=False - nan_agg_ds = nan_grouped_ds.mean("B", ignore_nulls=False) - assert nan_agg_ds.count() == 3 - pd.testing.assert_frame_equal( - nan_agg_ds.sort("A").to_pandas(), - pd.DataFrame( - { - "A": [0, 1, 2], - "mean(B)": [None, 49.0, 50.0], - } - ), - check_dtype=False, - ) - # Test all nans - ds = ray.data.from_items([{"A": (x % 3), "B": None} for x in xs]).repartition( - num_parts - ) - if ds_format == "pandas": - ds = _to_pandas(ds) - nan_agg_ds = ds.groupby("A").mean("B") - assert nan_agg_ds.count() == 3 - pd.testing.assert_frame_equal( - nan_agg_ds.sort("A").to_pandas(), - pd.DataFrame( - { - "A": [0, 1, 2], - "mean(B)": [None, None, None], - } - ), - 
check_dtype=False, - ) - - -@pytest.mark.parametrize("num_parts", [1, 30]) -@pytest.mark.parametrize("ds_format", ["arrow", "pandas"]) -def test_global_tabular_mean(ray_start_regular_shared, ds_format, num_parts): - seed = int(time.time()) - print(f"Seeding RNG for test_global_arrow_mean with: {seed}") - random.seed(seed) - xs = list(range(100)) - random.shuffle(xs) - - def _to_pandas(ds): - return ds.map_batches(lambda x: x, batch_size=None, batch_format="pandas") - - # Test built-in global mean aggregation - ds = ray.data.from_items([{"A": x} for x in xs]).repartition(num_parts) - if ds_format == "pandas": - ds = _to_pandas(ds) - assert ds.mean("A") == 49.5 - - # Test empty dataset - ds = ray.data.range_table(10) - if ds_format == "pandas": - ds = _to_pandas(ds) - assert ds.filter(lambda r: r["value"] > 10).mean("value") is None - - # Test built-in global mean aggregation with nans - nan_ds = ray.data.from_items([{"A": x} for x in xs] + [{"A": None}]).repartition( - num_parts - ) - if ds_format == "pandas": - nan_ds = _to_pandas(nan_ds) - assert nan_ds.mean("A") == 49.5 - # Test ignore_nulls=False - assert nan_ds.mean("A", ignore_nulls=False) is None - # Test all nans - nan_ds = ray.data.from_items([{"A": None}] * len(xs)).repartition(num_parts) - if ds_format == "pandas": - nan_ds = _to_pandas(nan_ds) - assert nan_ds.mean("A") is None - assert nan_ds.mean("A", ignore_nulls=False) is None - - -@pytest.mark.parametrize("num_parts", [1, 30]) -@pytest.mark.parametrize("ds_format", ["arrow", "pandas"]) -def test_groupby_tabular_std(ray_start_regular_shared, ds_format, num_parts): - # Test built-in std aggregation - seed = int(time.time()) - print(f"Seeding RNG for test_groupby_tabular_std with: {seed}") - random.seed(seed) - xs = list(range(100)) - random.shuffle(xs) - - def _to_arrow(ds): - return ds.map_batches(lambda x: x, batch_size=None, batch_format="pyarrow") - - df = pd.DataFrame({"A": [x % 3 for x in xs], "B": xs}) - ds = 
ray.data.from_pandas(df).repartition(num_parts) - if ds_format == "arrow": - ds = _to_arrow(ds) - agg_ds = ds.groupby("A").std("B") - assert agg_ds.count() == 3 - result = agg_ds.to_pandas()["std(B)"].to_numpy() - expected = df.groupby("A")["B"].std().to_numpy() - np.testing.assert_array_almost_equal(result, expected) - # ddof of 0 - ds = ray.data.from_pandas(df).repartition(num_parts) - if ds_format == "arrow": - ds = _to_arrow(ds) - agg_ds = ds.groupby("A").std("B", ddof=0) - assert agg_ds.count() == 3 - result = agg_ds.to_pandas()["std(B)"].to_numpy() - expected = df.groupby("A")["B"].std(ddof=0).to_numpy() - np.testing.assert_array_almost_equal(result, expected) - - # Test built-in std aggregation with nans - nan_df = pd.DataFrame({"A": [x % 3 for x in xs] + [0], "B": xs + [None]}) - ds = ray.data.from_pandas(nan_df).repartition(num_parts) - if ds_format == "arrow": - ds = _to_arrow(ds) - nan_grouped_ds = ds.groupby("A") - nan_agg_ds = nan_grouped_ds.std("B") - assert nan_agg_ds.count() == 3 - result = nan_agg_ds.to_pandas()["std(B)"].to_numpy() - expected = nan_df.groupby("A")["B"].std().to_numpy() - np.testing.assert_array_almost_equal(result, expected) - # Test ignore_nulls=False - nan_agg_ds = nan_grouped_ds.std("B", ignore_nulls=False) - assert nan_agg_ds.count() == 3 - result = nan_agg_ds.to_pandas()["std(B)"].to_numpy() - expected = nan_df.groupby("A")["B"].std() - expected[0] = None - np.testing.assert_array_almost_equal(result, expected) - # Test all nans - nan_df = pd.DataFrame({"A": [x % 3 for x in xs], "B": [None] * len(xs)}) - ds = ray.data.from_pandas(nan_df).repartition(num_parts) - if ds_format == "arrow": - ds = _to_arrow(ds) - nan_agg_ds = ds.groupby("A").std("B", ignore_nulls=False) - assert nan_agg_ds.count() == 3 - result = nan_agg_ds.to_pandas()["std(B)"].to_numpy() - expected = pd.Series([None] * 3) - np.testing.assert_array_equal(result, expected) - - -@pytest.mark.parametrize("num_parts", [1, 30]) -@pytest.mark.parametrize("ds_format", 
["arrow", "pandas"]) -def test_global_tabular_std(ray_start_regular_shared, ds_format, num_parts): - seed = int(time.time()) - print(f"Seeding RNG for test_global_arrow_std with: {seed}") - random.seed(seed) - xs = list(range(100)) - random.shuffle(xs) - - def _to_arrow(ds): - return ds.map_batches(lambda x: x, batch_size=None, batch_format="pyarrow") - - def _to_pandas(ds): - return ds.map_batches(lambda x: x, batch_size=None, batch_format="pandas") - - # Test built-in global max aggregation - df = pd.DataFrame({"A": xs}) - ds = ray.data.from_pandas(df).repartition(num_parts) - if ds_format == "arrow": - ds = _to_arrow(ds) - assert math.isclose(ds.std("A"), df["A"].std()) - assert math.isclose(ds.std("A", ddof=0), df["A"].std(ddof=0)) - - # Test empty dataset - ds = ray.data.from_pandas(pd.DataFrame({"A": []})) - if ds_format == "arrow": - ds = _to_arrow(ds) - assert ds.std("A") is None - # Test edge cases - ds = ray.data.from_pandas(pd.DataFrame({"A": [3]})) - if ds_format == "arrow": - ds = _to_arrow(ds) - assert ds.std("A") == 0 - - # Test built-in global std aggregation with nans - nan_df = pd.DataFrame({"A": xs + [None]}) - nan_ds = ray.data.from_pandas(nan_df).repartition(num_parts) - if ds_format == "arrow": - nan_ds = _to_arrow(nan_ds) - assert math.isclose(nan_ds.std("A"), nan_df["A"].std()) - # Test ignore_nulls=False - assert nan_ds.std("A", ignore_nulls=False) is None - # Test all nans - nan_ds = ray.data.from_items([{"A": None}] * len(xs)).repartition(num_parts) - if ds_format == "pandas": - nan_ds = _to_pandas(nan_ds) - assert nan_ds.std("A") is None - assert nan_ds.std("A", ignore_nulls=False) is None - - -@pytest.mark.parametrize("num_parts", [1, 30]) -def test_groupby_arrow_multicolumn(ray_start_regular_shared, num_parts): - # Test built-in mean aggregation on multiple columns - seed = int(time.time()) - print(f"Seeding RNG for test_groupby_arrow_multicolumn with: {seed}") - random.seed(seed) - xs = list(range(100)) - random.shuffle(xs) - df = 
pd.DataFrame({"A": [x % 3 for x in xs], "B": xs, "C": [2 * x for x in xs]}) - agg_ds = ( - ray.data.from_pandas(df).repartition(num_parts).groupby("A").mean(["B", "C"]) - ) - assert agg_ds.count() == 3 - assert [row.as_pydict() for row in agg_ds.sort("A").iter_rows()] == [ - {"A": 0, "mean(B)": 49.5, "mean(C)": 99.0}, - {"A": 1, "mean(B)": 49.0, "mean(C)": 98.0}, - {"A": 2, "mean(B)": 50.0, "mean(C)": 100.0}, - ] - - # Test that unspecified agg column ==> agg on all columns except for - # groupby keys. - agg_ds = ray.data.from_pandas(df).repartition(num_parts).groupby("A").mean() - assert agg_ds.count() == 3 - assert [row.as_pydict() for row in agg_ds.sort("A").iter_rows()] == [ - {"A": 0, "mean(B)": 49.5, "mean(C)": 99.0}, - {"A": 1, "mean(B)": 49.0, "mean(C)": 98.0}, - {"A": 2, "mean(B)": 50.0, "mean(C)": 100.0}, - ] - - # Test built-in global mean aggregation - df = pd.DataFrame({"A": xs, "B": [2 * x for x in xs]}) - result_row = ray.data.from_pandas(df).repartition(num_parts).mean(["A", "B"]) - assert result_row["mean(A)"] == df["A"].mean() - assert result_row["mean(B)"] == df["B"].mean() - - -def test_groupby_agg_bad_on(ray_start_regular_shared): - # Test bad on for groupby aggregation - xs = list(range(100)) - df = pd.DataFrame({"A": [x % 3 for x in xs], "B": xs, "C": [2 * x for x in xs]}) - # Wrong type. - with pytest.raises(TypeError): - ray.data.from_pandas(df).groupby("A").mean(5).fully_executed() - with pytest.raises(TypeError): - ray.data.from_pandas(df).groupby("A").mean([5]).fully_executed() - # Empty list. - with pytest.raises(ValueError): - ray.data.from_pandas(df).groupby("A").mean([]).fully_executed() - # Nonexistent column. - with pytest.raises(ValueError): - ray.data.from_pandas(df).groupby("A").mean("D").fully_executed() - with pytest.raises(ValueError): - ray.data.from_pandas(df).groupby("A").mean(["B", "D"]).fully_executed() - # Columns for simple Dataset. 
- with pytest.raises(ValueError): - ray.data.from_items(xs).groupby(lambda x: x % 3 == 0).mean("A").fully_executed() - - # Test bad on for global aggregation - # Wrong type. - with pytest.raises(TypeError): - ray.data.from_pandas(df).mean(5).fully_executed() - with pytest.raises(TypeError): - ray.data.from_pandas(df).mean([5]).fully_executed() - # Empty list. - with pytest.raises(ValueError): - ray.data.from_pandas(df).mean([]).fully_executed() - # Nonexistent column. - with pytest.raises(ValueError): - ray.data.from_pandas(df).mean("D").fully_executed() - with pytest.raises(ValueError): - ray.data.from_pandas(df).mean(["B", "D"]).fully_executed() - # Columns for simple Dataset. - with pytest.raises(ValueError): - ray.data.from_items(xs).mean("A").fully_executed() - - -@pytest.mark.parametrize("num_parts", [1, 30]) -def test_groupby_arrow_multi_agg(ray_start_regular_shared, num_parts): - seed = int(time.time()) - print(f"Seeding RNG for test_groupby_arrow_multi_agg with: {seed}") - random.seed(seed) - xs = list(range(100)) - random.shuffle(xs) - df = pd.DataFrame({"A": [x % 3 for x in xs], "B": xs}) - agg_ds = ( - ray.data.from_pandas(df) - .repartition(num_parts) - .groupby("A") - .aggregate( - Count(), - Sum("B"), - Min("B"), - Max("B"), - Mean("B"), - Std("B"), - ) - ) - assert agg_ds.count() == 3 - agg_df = agg_ds.to_pandas() - expected_grouped = df.groupby("A")["B"] - np.testing.assert_array_equal(agg_df["count()"].to_numpy(), [34, 33, 33]) - for agg in ["sum", "min", "max", "mean", "std"]: - result = agg_df[f"{agg}(B)"].to_numpy() - expected = getattr(expected_grouped, agg)().to_numpy() - if agg == "std": - np.testing.assert_array_almost_equal(result, expected) - else: - np.testing.assert_array_equal(result, expected) - # Test built-in global std aggregation - df = pd.DataFrame({"A": xs}) - - result_row = ( - ray.data.from_pandas(df) - .repartition(num_parts) - .aggregate( - Sum("A"), - Min("A"), - Max("A"), - Mean("A"), - Std("A"), - ) - ) - for agg in 
["sum", "min", "max", "mean", "std"]: - result = result_row[f"{agg}(A)"] - expected = getattr(df["A"], agg)() - if agg == "std": - assert math.isclose(result, expected) - else: - assert result == expected - - -def test_groupby_simple(ray_start_regular_shared): - seed = int(time.time()) - print(f"Seeding RNG for test_groupby_simple with: {seed}") - random.seed(seed) - parallelism = 3 - xs = [ - ("A", 2), - ("A", 4), - ("A", 9), - ("B", 10), - ("B", 20), - ("C", 3), - ("C", 5), - ("C", 8), - ("C", 12), - ] - random.shuffle(xs) - ds = ray.data.from_items(xs, parallelism=parallelism) - - # Mean aggregation - agg_ds = ds.groupby(lambda r: r[0]).aggregate( - AggregateFn( - init=lambda k: (0, 0), - accumulate_row=lambda a, r: (a[0] + r[1], a[1] + 1), - merge=lambda a1, a2: (a1[0] + a2[0], a1[1] + a2[1]), - finalize=lambda a: a[0] / a[1], - ) - ) - assert agg_ds.count() == 3 - assert agg_ds.sort(key=lambda r: r[0]).take(3) == [("A", 5), ("B", 15), ("C", 7)] - - # Test None row - parallelism = 2 - xs = ["A", "A", "A", None, None, None, "B"] - random.shuffle(xs) - ds = ray.data.from_items(xs, parallelism=parallelism) - # Count aggregation - agg_ds = ds.groupby(lambda r: str(r)).aggregate( - AggregateFn( - init=lambda k: 0, - accumulate_row=lambda a, r: a + 1, - merge=lambda a1, a2: a1 + a2, - ) - ) - assert agg_ds.count() == 3 - assert agg_ds.sort(key=lambda r: str(r[0])).take(3) == [ - ("A", 3), - ("B", 1), - ("None", 3), - ] - - # Test empty dataset. 
- ds = ray.data.from_items([]) - agg_ds = ds.groupby(lambda r: r[0]).aggregate( - AggregateFn( - init=lambda k: 1 / 0, # should never reach here - accumulate_row=lambda a, r: 1 / 0, - merge=lambda a1, a2: 1 / 0, - finalize=lambda a: 1 / 0, - ) - ) - assert agg_ds.count() == 0 - assert agg_ds.take() == ds.take() - agg_ds = ray.data.range(10).filter(lambda r: r > 10).groupby(lambda r: r).count() - assert agg_ds.count() == 0 - - -@pytest.mark.parametrize("num_parts", [1, 30]) -def test_groupby_simple_count(ray_start_regular_shared, num_parts): - # Test built-in count aggregation - seed = int(time.time()) - print(f"Seeding RNG for test_groupby_simple_count with: {seed}") - random.seed(seed) - xs = list(range(100)) - random.shuffle(xs) - agg_ds = ( - ray.data.from_items(xs).repartition(num_parts).groupby(lambda x: x % 3).count() - ) - assert agg_ds.count() == 3 - assert agg_ds.sort(key=lambda r: r[0]).take(3) == [(0, 34), (1, 33), (2, 33)] - - -@pytest.mark.parametrize("num_parts", [1, 30]) -def test_groupby_simple_sum(ray_start_regular_shared, num_parts): - # Test built-in sum aggregation - seed = int(time.time()) - print(f"Seeding RNG for test_groupby_simple_sum with: {seed}") - random.seed(seed) - xs = list(range(100)) - random.shuffle(xs) - agg_ds = ( - ray.data.from_items(xs).repartition(num_parts).groupby(lambda x: x % 3).sum() - ) - assert agg_ds.count() == 3 - assert agg_ds.sort(key=lambda r: r[0]).take(3) == [(0, 1683), (1, 1617), (2, 1650)] - - # Test built-in sum aggregation with nans - nan_grouped_ds = ( - ray.data.from_items(xs + [None]) - .repartition(num_parts) - .groupby(lambda x: int(x or 0) % 3) - ) - nan_agg_ds = nan_grouped_ds.sum() - assert nan_agg_ds.count() == 3 - assert nan_agg_ds.sort(key=lambda r: r[0]).take(3) == [ - (0, 1683), - (1, 1617), - (2, 1650), - ] - # Test ignore_nulls=False - nan_agg_ds = nan_grouped_ds.sum(ignore_nulls=False) - assert nan_agg_ds.count() == 3 - assert nan_agg_ds.sort(key=lambda r: r[0]).take(3) == [ - (0, None), - 
(1, 1617), - (2, 1650), - ] - # Test all nans - nan_agg_ds = ( - ray.data.from_items([None] * len(xs)) - .repartition(num_parts) - .groupby(lambda x: 0) - .sum() - ) - assert nan_agg_ds.count() == 1 - assert nan_agg_ds.sort(key=lambda r: r[0]).take(1) == [(0, None)] - - # Test built-in global sum aggregation - assert ray.data.from_items(xs).repartition(num_parts).sum() == 4950 - assert ray.data.range(10).filter(lambda r: r > 10).sum() is None - - # Test built-in global sum aggregation with nans - nan_ds = ray.data.from_items(xs + [None]).repartition(num_parts) - assert nan_ds.sum() == 4950 - # Test ignore_nulls=False - assert nan_ds.sum(ignore_nulls=False) is None - # Test all nans - nan_ds = ray.data.from_items([None] * len(xs)).repartition(num_parts) - assert nan_ds.sum() is None - - -def test_map_batches_preserve_empty_blocks(ray_start_regular_shared): - ds = ray.data.range(10, parallelism=10) - ds = ds.map_batches(lambda x: []) - ds = ds.map_batches(lambda x: x) - assert ds.num_blocks() == 10, ds - - -def test_map_batches_combine_empty_blocks(ray_start_regular_shared): - xs = [x % 3 for x in list(range(100))] - - # ds1 has 1 block which contains 100 rows. - ds1 = ray.data.from_items(xs).repartition(1).sort().map_batches(lambda x: x) - assert ds1._block_num_rows() == [100] - - # ds2 has 30 blocks, but only 3 of them are non-empty - ds2 = ( - ray.data.from_items(xs) - .repartition(30) - .sort() - .map_batches(lambda x: x, batch_size=1) - ) - assert len(ds2._block_num_rows()) == 3 - count = sum(1 for x in ds2._block_num_rows() if x > 0) - assert count == 3 - - # The number of partitions should not affect the map_batches() result. 
- assert ds1.take_all() == ds2.take_all() - - -def test_groupby_map_groups_for_empty_dataset(ray_start_regular_shared): - ds = ray.data.from_items([]) - mapped = ds.groupby(lambda x: x % 3).map_groups(lambda x: [min(x) * min(x)]) - assert mapped.count() == 0 - assert mapped.take_all() == [] - - -def test_groupby_map_groups_merging_empty_result(ray_start_regular_shared): - ds = ray.data.from_items([1, 2, 3]) - # This needs to merge empty and non-empty results from different groups. - mapped = ds.groupby(lambda x: x).map_groups(lambda x: [] if x == [1] else x) - assert mapped.count() == 2 - assert mapped.take_all() == [2, 3] - - -def test_groupby_map_groups_merging_invalid_result(ray_start_regular_shared): - ds = ray.data.from_items([1, 2, 3]) - grouped = ds.groupby(lambda x: x) - - # The UDF returns None, which is invalid. - with pytest.raises(TypeError): - grouped.map_groups(lambda x: None if x == [1] else x).fully_executed() - - -@pytest.mark.parametrize("num_parts", [1, 2, 30]) -def test_groupby_map_groups_for_none_groupkey(ray_start_regular_shared, num_parts): - ds = ray.data.from_items(list(range(100))) - mapped = ( - ds.repartition(num_parts).groupby(None).map_groups(lambda x: [min(x) + max(x)]) - ) - assert mapped.count() == 1 - assert mapped.take_all() == [99] - - -@pytest.mark.parametrize("num_parts", [1, 2, 30]) -def test_groupby_map_groups_returning_empty_result(ray_start_regular_shared, num_parts): - xs = list(range(100)) - mapped = ( - ray.data.from_items(xs) - .repartition(num_parts) - .groupby(lambda x: x % 3) - .map_groups(lambda x: []) - ) - assert mapped.count() == 0 - assert mapped.take_all() == [] - - -def test_groupby_map_groups_perf(ray_start_regular_shared): - data_list = [x % 100 for x in range(5000000)] - ds = ray.data.from_pandas(pd.DataFrame({"A": data_list})) - start = time.perf_counter() - ds.groupby("A").map_groups(lambda df: df) - end = time.perf_counter() - # On a t3.2xlarge instance, it ran in about 5 seconds, so expecting it has to 
- # finish within about 10x of that time, unless something went wrong. - assert end - start < 60 - - -@pytest.mark.parametrize("num_parts", [1, 2, 3, 30]) -def test_groupby_map_groups_for_list(ray_start_regular_shared, num_parts): - seed = int(time.time()) - print(f"Seeding RNG for test_groupby_simple_count with: {seed}") - random.seed(seed) - xs = list(range(100)) - random.shuffle(xs) - mapped = ( - ray.data.from_items(xs) - .repartition(num_parts) - .groupby(lambda x: x % 3) - .map_groups(lambda x: [min(x) * min(x)]) - ) - assert mapped.count() == 3 - assert mapped.take_all() == [0, 1, 4] - - -@pytest.mark.parametrize("num_parts", [1, 2, 3, 30]) -def test_groupby_map_groups_for_pandas(ray_start_regular_shared, num_parts): - df = pd.DataFrame({"A": "a a b".split(), "B": [1, 1, 3], "C": [4, 6, 5]}) - grouped = ray.data.from_pandas(df).repartition(num_parts).groupby("A") - - # Normalize the numeric columns (i.e. B and C) for each group. - mapped = grouped.map_groups( - lambda g: g.apply( - lambda col: col / g[col.name].sum() if col.name in ["B", "C"] else col - ) - ) - - # The function (i.e. the normalization) performed on each group doesn't - # aggregate rows, so we still have 3 rows. - assert mapped.count() == 3 - expected = pd.DataFrame( - {"A": ["a", "a", "b"], "B": [0.5, 0.5, 1.000000], "C": [0.4, 0.6, 1.0]} - ) - assert mapped.to_pandas().equals(expected) - - -@pytest.mark.parametrize("num_parts", [1, 2, 3, 30]) -def test_groupby_map_groups_for_arrow(ray_start_regular_shared, num_parts): - at = pa.Table.from_pydict({"A": "a a b".split(), "B": [1, 1, 3], "C": [4, 6, 5]}) - grouped = ray.data.from_arrow(at).repartition(num_parts).groupby("A") - - # Normalize the numeric columns (i.e. B and C) for each group. 
- def normalize(at: pa.Table): - r = at.select("A") - sb = pa.compute.sum(at.column("B")).cast(pa.float64()) - r = r.append_column("B", pa.compute.divide(at.column("B"), sb)) - sc = pa.compute.sum(at.column("C")).cast(pa.float64()) - r = r.append_column("C", pa.compute.divide(at.column("C"), sc)) - return r - - mapped = grouped.map_groups(normalize, batch_format="pyarrow") - - # The function (i.e. the normalization) performed on each group doesn't - # aggregate rows, so we still have 3 rows. - assert mapped.count() == 3 - expected = pa.Table.from_pydict( - {"A": ["a", "a", "b"], "B": [0.5, 0.5, 1], "C": [0.4, 0.6, 1]} - ) - result = pa.Table.from_pandas(mapped.to_pandas()) - assert result.equals(expected) - - -def test_groupby_map_groups_for_numpy(ray_start_regular_shared): - ds = ray.data.from_items( - [ - {"group": 1, "value": 1}, - {"group": 1, "value": 2}, - {"group": 2, "value": 3}, - {"group": 2, "value": 4}, - ] - ) - - def func(group): - # Test output type is NumPy format. - return {"group": group["group"] + 1, "value": group["value"] + 1} - - ds = ds.groupby("group").map_groups(func, batch_format="numpy") - expected = pa.Table.from_pydict({"group": [2, 2, 3, 3], "value": [2, 3, 4, 5]}) - result = pa.Table.from_pandas(ds.to_pandas()) - assert result.equals(expected) - - -def test_groupby_map_groups_with_different_types(ray_start_regular_shared): - ds = ray.data.from_items( - [ - {"group": 1, "value": 1}, - {"group": 1, "value": 2}, - {"group": 2, "value": 3}, - {"group": 2, "value": 4}, - ] - ) - - def func(group): - # Test output type is Python list, different from input type. 
- return [group["value"][0]] - - ds = ds.groupby("group").map_groups(func) - assert sorted(ds.take()) == [1, 3] - - -@pytest.mark.parametrize("num_parts", [1, 30]) -def test_groupby_simple_min(ray_start_regular_shared, num_parts): - # Test built-in min aggregation - seed = int(time.time()) - print(f"Seeding RNG for test_groupby_simple_min with: {seed}") - random.seed(seed) - xs = list(range(100)) - random.shuffle(xs) - agg_ds = ( - ray.data.from_items(xs).repartition(num_parts).groupby(lambda x: x % 3).min() - ) - assert agg_ds.count() == 3 - assert agg_ds.sort(key=lambda r: r[0]).take(3) == [(0, 0), (1, 1), (2, 2)] - - # Test built-in min aggregation with nans - nan_grouped_ds = ( - ray.data.from_items(xs + [None]) - .repartition(num_parts) - .groupby(lambda x: int(x or 0) % 3) - ) - nan_agg_ds = nan_grouped_ds.min() - assert nan_agg_ds.count() == 3 - assert nan_agg_ds.sort(key=lambda r: r[0]).take(3) == [(0, 0), (1, 1), (2, 2)] - # Test ignore_nulls=False - nan_agg_ds = nan_grouped_ds.min(ignore_nulls=False) - assert nan_agg_ds.count() == 3 - assert nan_agg_ds.sort(key=lambda r: r[0]).take(3) == [(0, None), (1, 1), (2, 2)] - # Test all nans - nan_agg_ds = ( - ray.data.from_items([None] * len(xs)) - .repartition(num_parts) - .groupby(lambda x: 0) - .min() - ) - assert nan_agg_ds.count() == 1 - assert nan_agg_ds.sort(key=lambda r: r[0]).take(1) == [(0, None)] - - # Test built-in global min aggregation - assert ray.data.from_items(xs).repartition(num_parts).min() == 0 - assert ray.data.range(10).filter(lambda r: r > 10).min() is None - - # Test built-in global min aggregation with nans - nan_ds = ray.data.from_items(xs + [None]).repartition(num_parts) - assert nan_ds.min() == 0 - # Test ignore_nulls=False - assert nan_ds.min(ignore_nulls=False) is None - # Test all nans - nan_ds = ray.data.from_items([None] * len(xs)).repartition(num_parts) - assert nan_ds.min() is None - - -@pytest.mark.parametrize("num_parts", [1, 30]) -def 
test_groupby_simple_max(ray_start_regular_shared, num_parts): - # Test built-in max aggregation - seed = int(time.time()) - print(f"Seeding RNG for test_groupby_simple_max with: {seed}") - random.seed(seed) - xs = list(range(100)) - random.shuffle(xs) - agg_ds = ( - ray.data.from_items(xs).repartition(num_parts).groupby(lambda x: x % 3).max() - ) - assert agg_ds.count() == 3 - assert agg_ds.sort(key=lambda r: r[0]).take(3) == [(0, 99), (1, 97), (2, 98)] - - # Test built-in max aggregation with nans - nan_grouped_ds = ( - ray.data.from_items(xs + [None]) - .repartition(num_parts) - .groupby(lambda x: int(x or 0) % 3) - ) - nan_agg_ds = nan_grouped_ds.max() - assert nan_agg_ds.count() == 3 - assert nan_agg_ds.sort(key=lambda r: r[0]).take(3) == [(0, 99), (1, 97), (2, 98)] - # Test ignore_nulls=False - nan_agg_ds = nan_grouped_ds.max(ignore_nulls=False) - assert nan_agg_ds.count() == 3 - assert nan_agg_ds.sort(key=lambda r: r[0]).take(3) == [(0, None), (1, 97), (2, 98)] - # Test all nans - nan_agg_ds = ( - ray.data.from_items([None] * len(xs)) - .repartition(num_parts) - .groupby(lambda x: 0) - .max() - ) - assert nan_agg_ds.count() == 1 - assert nan_agg_ds.sort(key=lambda r: r[0]).take(1) == [(0, None)] - - # Test built-in global max aggregation - assert ray.data.from_items(xs).repartition(num_parts).max() == 99 - assert ray.data.range(10).filter(lambda r: r > 10).max() is None - - # Test built-in global max aggregation with nans - nan_ds = ray.data.from_items(xs + [None]).repartition(num_parts) - assert nan_ds.max() == 99 - # Test ignore_nulls=False - assert nan_ds.max(ignore_nulls=False) is None - # Test all nans - nan_ds = ray.data.from_items([None] * len(xs)).repartition(num_parts) - assert nan_ds.max() is None - - -@pytest.mark.parametrize("num_parts", [1, 30]) -def test_groupby_simple_mean(ray_start_regular_shared, num_parts): - # Test built-in mean aggregation - seed = int(time.time()) - print(f"Seeding RNG for test_groupby_simple_mean with: {seed}") - 
random.seed(seed) - xs = list(range(100)) - random.shuffle(xs) - agg_ds = ( - ray.data.from_items(xs).repartition(num_parts).groupby(lambda x: x % 3).mean() - ) - assert agg_ds.count() == 3 - assert agg_ds.sort(key=lambda r: r[0]).take(3) == [(0, 49.5), (1, 49.0), (2, 50.0)] - - # Test built-in mean aggregation with nans - nan_grouped_ds = ( - ray.data.from_items(xs + [None]) - .repartition(num_parts) - .groupby(lambda x: int(x or 0) % 3) - ) - nan_agg_ds = nan_grouped_ds.mean() - assert nan_agg_ds.count() == 3 - assert nan_agg_ds.sort(key=lambda r: r[0]).take(3) == [ - (0, 49.5), - (1, 49.0), - (2, 50.0), - ] - # Test ignore_nulls=False - nan_agg_ds = nan_grouped_ds.mean(ignore_nulls=False) - assert nan_agg_ds.count() == 3 - assert nan_agg_ds.sort(key=lambda r: r[0]).take(3) == [ - (0, None), - (1, 49.0), - (2, 50.0), - ] - # Test all nans - nan_agg_ds = ( - ray.data.from_items([None] * len(xs)) - .repartition(num_parts) - .groupby(lambda x: 0) - .mean() - ) - assert nan_agg_ds.count() == 1 - assert nan_agg_ds.sort(key=lambda r: r[0]).take(1) == [(0, None)] - - # Test built-in global mean aggregation - assert ray.data.from_items(xs).repartition(num_parts).mean() == 49.5 - # Test empty dataset - assert ray.data.range(10).filter(lambda r: r > 10).mean() is None - - # Test built-in global mean aggregation with nans - nan_ds = ray.data.from_items(xs + [None]).repartition(num_parts) - assert nan_ds.mean() == 49.5 - # Test ignore_nulls=False - assert nan_ds.mean(ignore_nulls=False) is None - # Test all nans - nan_ds = ray.data.from_items([None] * len(xs)).repartition(num_parts) - assert nan_ds.mean() is None - - -@pytest.mark.parametrize("num_parts", [1, 30]) -def test_groupby_simple_std(ray_start_regular_shared, num_parts): - # Test built-in std aggregation - seed = int(time.time()) - print(f"Seeding RNG for test_groupby_simple_std with: {seed}") - random.seed(seed) - xs = list(range(100)) - random.shuffle(xs) - agg_ds = ( - 
ray.data.from_items(xs).repartition(num_parts).groupby(lambda x: x % 3).std() - ) - assert agg_ds.count() == 3 - df = pd.DataFrame({"A": [x % 3 for x in xs], "B": xs}) - expected = df.groupby("A")["B"].std() - result = agg_ds.sort(key=lambda r: r[0]).take(3) - groups, stds = zip(*result) - result_df = pd.DataFrame({"A": list(groups), "B": list(stds)}) - result_df = result_df.set_index("A") - pd.testing.assert_series_equal(result_df["B"], expected) - # ddof of 0 - agg_ds = ( - ray.data.from_items(xs) - .repartition(num_parts) - .groupby(lambda x: x % 3) - .std(ddof=0) - ) - assert agg_ds.count() == 3 - df = pd.DataFrame({"A": [x % 3 for x in xs], "B": xs}) - expected = df.groupby("A")["B"].std(ddof=0) - result = agg_ds.sort(key=lambda r: r[0]).take(3) - groups, stds = zip(*result) - result_df = pd.DataFrame({"A": list(groups), "B": list(stds)}) - result_df = result_df.set_index("A") - pd.testing.assert_series_equal(result_df["B"], expected) - - # Test built-in std aggregation with nans - nan_grouped_ds = ( - ray.data.from_items(xs + [None]) - .repartition(num_parts) - .groupby(lambda x: int(x or 0) % 3) - ) - nan_agg_ds = nan_grouped_ds.std() - assert nan_agg_ds.count() == 3 - nan_df = pd.DataFrame({"A": [x % 3 for x in xs] + [0], "B": xs + [None]}) - expected = nan_df.groupby("A")["B"].std() - result = nan_agg_ds.sort(key=lambda r: r[0]).take(3) - groups, stds = zip(*result) - result_df = pd.DataFrame({"A": list(groups), "B": list(stds)}) - result_df = result_df.set_index("A") - pd.testing.assert_series_equal(result_df["B"], expected) - # Test ignore_nulls=False - nan_agg_ds = nan_grouped_ds.std(ignore_nulls=False) - assert nan_agg_ds.count() == 3 - expected = nan_df.groupby("A")["B"].std() - expected[0] = None - result = nan_agg_ds.sort(key=lambda r: r[0]).take(3) - groups, stds = zip(*result) - result_df = pd.DataFrame({"A": list(groups), "B": list(stds)}) - result_df = result_df.set_index("A") - pd.testing.assert_series_equal(result_df["B"], expected) - # Test 
all nans - nan_agg_ds = ( - ray.data.from_items([None] * len(xs)) - .repartition(num_parts) - .groupby(lambda x: 0) - .std(ignore_nulls=False) - ) - assert nan_agg_ds.count() == 1 - expected = pd.Series([None], name="B") - expected.index.rename("A", inplace=True) - result = nan_agg_ds.sort(key=lambda r: r[0]).take(1) - groups, stds = zip(*result) - result_df = pd.DataFrame({"A": list(groups), "B": list(stds)}) - result_df = result_df.set_index("A") - pd.testing.assert_series_equal(result_df["B"], expected) - - # Test built-in global std aggregation - assert math.isclose( - ray.data.from_items(xs).repartition(num_parts).std(), pd.Series(xs).std() - ) - # ddof of 0 - assert math.isclose( - ray.data.from_items(xs).repartition(num_parts).std(ddof=0), - pd.Series(xs).std(ddof=0), - ) - - # Test empty dataset - assert ray.data.from_items([]).std() is None - # Test edge cases - assert ray.data.from_items([3]).std() == 0 - - # Test built-in global std aggregation with nans - nan_ds = ray.data.from_items(xs + [None]).repartition(num_parts) - assert math.isclose(nan_ds.std(), pd.Series(xs).std()) - # Test ignore_nulls=False - assert nan_ds.std(ignore_nulls=False) is None - # Test all nans - nan_ds = ray.data.from_items([None] * len(xs)).repartition(num_parts) - assert nan_ds.std() is None - - -@pytest.mark.parametrize("num_parts", [1, 30]) -def test_groupby_simple_multilambda(ray_start_regular_shared, num_parts): - # Test built-in mean aggregation - seed = int(time.time()) - print(f"Seeding RNG for test_groupby_simple_multilambda with: {seed}") - random.seed(seed) - xs = list(range(100)) - random.shuffle(xs) - agg_ds = ( - ray.data.from_items([[x, 2 * x] for x in xs]) - .repartition(num_parts) - .groupby(lambda x: x[0] % 3) - .mean([lambda x: x[0], lambda x: x[1]]) - ) - assert agg_ds.count() == 3 - assert agg_ds.sort(key=lambda r: r[0]).take(3) == [ - (0, 49.5, 99.0), - (1, 49.0, 98.0), - (2, 50.0, 100.0), - ] - # Test built-in global mean aggregation - assert 
ray.data.from_items([[x, 2 * x] for x in xs]).repartition(num_parts).mean( - [lambda x: x[0], lambda x: x[1]] - ) == (49.5, 99.0) - assert ray.data.from_items([[x, 2 * x] for x in range(10)]).filter( - lambda r: r[0] > 10 - ).mean([lambda x: x[0], lambda x: x[1]]) == (None, None) - - -@pytest.mark.parametrize("num_parts", [1, 30]) -def test_groupby_simple_multi_agg(ray_start_regular_shared, num_parts): - seed = int(time.time()) - print(f"Seeding RNG for test_groupby_simple_multi_agg with: {seed}") - random.seed(seed) - xs = list(range(100)) - random.shuffle(xs) - df = pd.DataFrame({"A": [x % 3 for x in xs], "B": xs}) - agg_ds = ( - ray.data.from_items(xs) - .repartition(num_parts) - .groupby(lambda x: x % 3) - .aggregate( - Count(), - Sum(), - Min(), - Max(), - Mean(), - Std(), - ) - ) - assert agg_ds.count() == 3 - result = agg_ds.sort(key=lambda r: r[0]).take(3) - groups, counts, sums, mins, maxs, means, stds = zip(*result) - agg_df = pd.DataFrame( - { - "groups": list(groups), - "count": list(counts), - "sum": list(sums), - "min": list(mins), - "max": list(maxs), - "mean": list(means), - "std": list(stds), - } - ) - agg_df = agg_df.set_index("groups") - df = pd.DataFrame({"groups": [x % 3 for x in xs], "B": xs}) - expected_grouped = df.groupby("groups")["B"] - np.testing.assert_array_equal(agg_df["count"].to_numpy(), [34, 33, 33]) - for agg in ["sum", "min", "max", "mean", "std"]: - result = agg_df[agg].to_numpy() - expected = getattr(expected_grouped, agg)().to_numpy() - if agg == "std": - np.testing.assert_array_almost_equal(result, expected) - else: - np.testing.assert_array_equal(result, expected) - # Test built-in global multi-aggregation - result_row = ( - ray.data.from_items(xs) - .repartition(num_parts) - .aggregate( - Sum(), - Min(), - Max(), - Mean(), - Std(), - ) - ) - series = pd.Series(xs) - for idx, agg in enumerate(["sum", "min", "max", "mean", "std"]): - result = result_row[idx] - expected = getattr(series, agg)() - if agg == "std": - assert 
math.isclose(result, expected) - else: - assert result == expected - - -def test_column_name_type_check(ray_start_regular_shared): - df = pd.DataFrame({"1": np.random.rand(10), "a": np.random.rand(10)}) - ds = ray.data.from_pandas(df) - expected_str = "Dataset(num_blocks=1, num_rows=10, schema={1: float64, a: float64})" - assert str(ds) == expected_str, str(ds) - df = pd.DataFrame({1: np.random.rand(10), "a": np.random.rand(10)}) - with pytest.raises(ValueError): - ray.data.from_pandas(df) - - -def test_len(ray_start_regular_shared): - ds = ray.data.range(1) - with pytest.raises(AttributeError): - len(ds) - - -def test_random_sample(ray_start_regular_shared): - import math - - def ensure_sample_size_close(dataset, sample_percent=0.5): - r1 = ds.random_sample(sample_percent) - assert math.isclose( - r1.count(), int(ds.count() * sample_percent), rel_tol=2, abs_tol=2 - ) - - ds = ray.data.range(10, parallelism=2) - ensure_sample_size_close(ds) - - ds = ray.data.range_table(10, parallelism=2) - ensure_sample_size_close(ds) - - ds = ray.data.range_tensor(5, parallelism=2, shape=(2, 2)) - ensure_sample_size_close(ds) - - # imbalanced datasets - ds1 = ray.data.range(1, parallelism=1) - ds2 = ray.data.range(2, parallelism=1) - ds3 = ray.data.range(3, parallelism=1) - # noinspection PyTypeChecker - ds = ds1.union(ds2).union(ds3) - ensure_sample_size_close(ds) - # Small datasets - ds1 = ray.data.range(5, parallelism=5) - ensure_sample_size_close(ds1) - - -def test_random_sample_checks(ray_start_regular_shared): - with pytest.raises(ValueError): - # Cannot sample -1 - ray.data.range(1).random_sample(-1) - with pytest.raises(ValueError): - # Cannot sample from empty dataset - ray.data.range(0).random_sample(0.2) - with pytest.raises(ValueError): - # Cannot sample fraction > 1 - ray.data.range(1).random_sample(10) - - -def test_random_block_order_schema(ray_start_regular_shared): - df = pd.DataFrame({"a": np.random.rand(10), "b": np.random.rand(10)}) - ds = 
ray.data.from_pandas(df).randomize_block_order() - ds.schema().names == ["a", "b"] - - -def test_random_block_order(ray_start_regular_shared, restore_dataset_context): - ctx = DatasetContext.get_current() - ctx.execution_options.preserve_order = True - - # Test BlockList.randomize_block_order. - ds = ray.data.range(12).repartition(4) - ds = ds.randomize_block_order(seed=0) - - results = ds.take() - expected = [6, 7, 8, 0, 1, 2, 3, 4, 5, 9, 10, 11] - assert results == expected - - # Test LazyBlockList.randomize_block_order. - context = DatasetContext.get_current() - try: - original_optimize_fuse_read_stages = context.optimize_fuse_read_stages - context.optimize_fuse_read_stages = False - - lazy_blocklist_ds = ray.data.range(12, parallelism=4) - lazy_blocklist_ds = lazy_blocklist_ds.randomize_block_order(seed=0) - lazy_blocklist_results = lazy_blocklist_ds.take() - lazy_blocklist_expected = [6, 7, 8, 0, 1, 2, 3, 4, 5, 9, 10, 11] - assert lazy_blocklist_results == lazy_blocklist_expected - finally: - context.optimize_fuse_read_stages = original_optimize_fuse_read_stages - - -# NOTE: All tests above share a Ray cluster, while the tests below do not. These -# tests should only be carefully reordered to retain this invariant! 
- - -@pytest.mark.parametrize("pipelined", [False, True]) -def test_random_shuffle(shutdown_only, pipelined, use_push_based_shuffle): - def range(n, parallelism=200): - ds = ray.data.range(n, parallelism=parallelism) - if pipelined: - pipe = ds.repeat(2) - pipe.random_shuffle = pipe.random_shuffle_each_window - return pipe - else: - return ds - - r1 = range(100).random_shuffle().take(999) - r2 = range(100).random_shuffle().take(999) - assert r1 != r2, (r1, r2) - - r1 = range(100, parallelism=1).random_shuffle().take(999) - r2 = range(100, parallelism=1).random_shuffle().take(999) - assert r1 != r2, (r1, r2) - - # TODO(swang): fix this - if not use_push_based_shuffle: - if not pipelined: - assert range(100).random_shuffle(num_blocks=1).num_blocks() == 1 - r1 = range(100).random_shuffle(num_blocks=1).take(999) - r2 = range(100).random_shuffle(num_blocks=1).take(999) - assert r1 != r2, (r1, r2) - - r0 = range(100, parallelism=5).take(999) - r1 = range(100, parallelism=5).random_shuffle(seed=0).take(999) - r2 = range(100, parallelism=5).random_shuffle(seed=0).take(999) - r3 = range(100, parallelism=5).random_shuffle(seed=12345).take(999) - assert r1 == r2, (r1, r2) - assert r1 != r0, (r1, r0) - assert r1 != r3, (r1, r3) - - r0 = ray.data.range_table(100, parallelism=5).take(999) - r1 = ray.data.range_table(100, parallelism=5).random_shuffle(seed=0).take(999) - r2 = ray.data.range_table(100, parallelism=5).random_shuffle(seed=0).take(999) - assert r1 == r2, (r1, r2) - assert r1 != r0, (r1, r0) - - # Test move. - ds = range(100, parallelism=2) - r1 = ds.random_shuffle().take(999) - if pipelined: - with pytest.raises(RuntimeError): - ds = ds.map(lambda x: x).take(999) - else: - ds = ds.map(lambda x: x).take(999) - r2 = range(100).random_shuffle().take(999) - assert r1 != r2, (r1, r2) - - # Test empty dataset. 
- ds = ray.data.from_items([]) - r1 = ds.random_shuffle() - assert r1.count() == 0 - assert r1.take() == ds.take() - - -def test_random_shuffle_check_random(shutdown_only): - # Rows from the same input should not be contiguous in the final output. - num_files = 10 - num_rows = 100 - items = [i for i in range(num_files) for _ in range(num_rows)] - ds = ray.data.from_items(items, parallelism=num_files) - out = ds.random_shuffle().take(num_files * num_rows) - for i in range(num_files): - part = out[i * num_rows : (i + 1) * num_rows] - seen = set() - num_contiguous = 1 - prev = -1 - for x in part: - if prev != x: - prev = x - num_contiguous = 1 - else: - num_contiguous += 1 - assert num_contiguous < ( - num_rows / num_files - ), f"{part} contains too many contiguous rows from same input block" - seen.add(x) - assert ( - set(range(num_files)) == seen - ), f"{part} does not contain elements from all input blocks" - - # Rows from the same input should appear in a different order in the - # output. - num_files = 10 - num_rows = 100 - items = [j for i in range(num_files) for j in range(num_rows)] - ds = ray.data.from_items(items, parallelism=num_files) - out = ds.random_shuffle().take(num_files * num_rows) - for i in range(num_files): - part = out[i * num_rows : (i + 1) * num_rows] - num_increasing = 0 - prev = -1 - for x in part: - if x >= prev: - num_increasing += 1 - else: - assert num_increasing < ( - num_rows / num_files - ), f"{part} contains non-shuffled rows from input blocks" - num_increasing = 0 - prev = x - - -def test_unsupported_pyarrow_versions_check(shutdown_only, unsupported_pyarrow_version): - # Test that unsupported pyarrow versions cause an error to be raised upon the - # initial pyarrow use. - ray.init(runtime_env={"pip": [f"pyarrow=={unsupported_pyarrow_version}"]}) - - # Test Arrow-native creation APIs. - # Test range_table. - with pytest.raises(ImportError): - ray.data.range_table(10).take_all() - - # Test from_arrow. 
- with pytest.raises(ImportError): - ray.data.from_arrow(pa.table({"a": [1, 2]})) - - # Test read_parquet. - with pytest.raises(ImportError): - ray.data.read_parquet("example://iris.parquet").take_all() - - # Test from_numpy (we use Arrow for representing the tensors). - with pytest.raises(ImportError): - ray.data.from_numpy(np.arange(12).reshape((3, 2, 2))) - - -def test_unsupported_pyarrow_versions_check_disabled( - shutdown_only, - unsupported_pyarrow_version, - disable_pyarrow_version_check, -): - # Test that unsupported pyarrow versions DO NOT cause an error to be raised upon the - # initial pyarrow use when the version check is disabled. - ray.init( - runtime_env={ - "pip": [f"pyarrow=={unsupported_pyarrow_version}"], - "env_vars": {"RAY_DISABLE_PYARROW_VERSION_CHECK": "1"}, - }, - ) - - # Test Arrow-native creation APIs. - # Test range_table. - try: - ray.data.range_table(10).take_all() - except ImportError as e: - pytest.fail(f"_check_pyarrow_version failed unexpectedly: {e}") - - # Test from_arrow. - try: - ray.data.from_arrow(pa.table({"a": [1, 2]})) - except ImportError as e: - pytest.fail(f"_check_pyarrow_version failed unexpectedly: {e}") - - # Test read_parquet. - try: - ray.data.read_parquet("example://iris.parquet").take_all() - except ImportError as e: - pytest.fail(f"_check_pyarrow_version failed unexpectedly: {e}") - - # Test from_numpy (we use Arrow for representing the tensors). - try: - ray.data.from_numpy(np.arange(12).reshape((3, 2, 2))) - except ImportError as e: - pytest.fail(f"_check_pyarrow_version failed unexpectedly: {e}") - - -def test_random_shuffle_with_custom_resource(ray_start_cluster): - cluster = ray_start_cluster - # Create two nodes which have different custom resources. - cluster.add_node( - resources={"foo": 100}, - num_cpus=1, - ) - cluster.add_node(resources={"bar": 100}, num_cpus=1) - - ray.init(cluster.address) - - # Run dataset in "bar" nodes. 
- ds = ray.data.read_parquet( - "example://parquet_images_mini", - parallelism=2, - ray_remote_args={"resources": {"bar": 1}}, - ) - ds = ds.random_shuffle(resources={"bar": 1}).fully_executed() - assert "1 nodes used" in ds.stats() - assert "2 nodes used" not in ds.stats() - - -def test_read_write_local_node_ray_client(ray_start_cluster_enabled): - cluster = ray_start_cluster_enabled - cluster.add_node(num_cpus=4) - cluster.head_node._ray_params.ray_client_server_port = "10004" - cluster.head_node.start_ray_client_server() - address = "ray://localhost:10004" - - import tempfile - - data_path = tempfile.mkdtemp() - df = pd.DataFrame({"one": list(range(0, 10)), "two": list(range(10, 20))}) - path = os.path.join(data_path, "test.parquet") - df.to_parquet(path) - - # Read/write from Ray Client will result in error. - ray.init(address) - with pytest.raises(ValueError): - ds = ray.data.read_parquet("local://" + path).fully_executed() - ds = ray.data.from_pandas(df) - with pytest.raises(ValueError): - ds.write_parquet("local://" + data_path).fully_executed() - - -def test_read_warning_large_parallelism(ray_start_regular, propagate_logs, caplog): - with caplog.at_level(logging.WARNING, logger="ray.data.read_api"): - ray.data.range(5000, parallelism=5000).fully_executed() - assert ( - "The requested parallelism of 5000 is " - "more than 4x the number of available CPU slots in the cluster" in caplog.text - ), caplog.text - - -def test_read_write_local_node(ray_start_cluster): - cluster = ray_start_cluster - cluster.add_node( - resources={"bar:1": 100}, - num_cpus=10, - _system_config={"max_direct_call_object_size": 0}, - ) - cluster.add_node(resources={"bar:2": 100}, num_cpus=10) - cluster.add_node(resources={"bar:3": 100}, num_cpus=10) - - ray.init(cluster.address) - - import os - import tempfile - - data_path = tempfile.mkdtemp() - num_files = 5 - for idx in range(num_files): - df = pd.DataFrame( - {"one": list(range(idx, idx + 10)), "two": list(range(idx + 10, idx + 
20))} - ) - path = os.path.join(data_path, f"test{idx}.parquet") - df.to_parquet(path) - - ctx = ray.data.context.DatasetContext.get_current() - ctx.read_write_local_node = True - - def check_dataset_is_local(ds): - blocks = ds.get_internal_block_refs() - assert len(blocks) == num_files - ray.wait(blocks, num_returns=len(blocks), fetch_local=False) - location_data = ray.experimental.get_object_locations(blocks) - locations = [] - for block in blocks: - locations.extend(location_data[block]["node_ids"]) - assert set(locations) == {ray.get_runtime_context().get_node_id()} - - local_path = "local://" + data_path - # Plain read. - ds = ray.data.read_parquet(local_path).fully_executed() - check_dataset_is_local(ds) - - # SPREAD scheduling got overridden when read local scheme. - ds = ray.data.read_parquet( - local_path, ray_remote_args={"scheduling_strategy": "SPREAD"} - ).fully_executed() - check_dataset_is_local(ds) - - # With fusion. - ds = ray.data.read_parquet(local_path).map(lambda x: x).fully_executed() - check_dataset_is_local(ds) - - # Write back to local scheme. - output = os.path.join(local_path, "test_read_write_local_node") - ds.write_parquet(output) - assert "1 nodes used" in ds.stats(), ds.stats() - ray.data.read_parquet(output).take_all() == ds.take_all() - - # Mixing paths of local and non-local scheme is invalid. 
- with pytest.raises(ValueError): - ds = ray.data.read_parquet( - [local_path + "/test1.parquet", data_path + "/test2.parquet"] - ).fully_executed() - with pytest.raises(ValueError): - ds = ray.data.read_parquet( - [local_path + "/test1.parquet", "example://iris.parquet"] - ).fully_executed() - with pytest.raises(ValueError): - ds = ray.data.read_parquet( - ["example://iris.parquet", local_path + "/test1.parquet"] - ).fully_executed() - - -def test_random_shuffle_spread(ray_start_cluster, use_push_based_shuffle): - cluster = ray_start_cluster - cluster.add_node( - resources={"bar:1": 100}, - num_cpus=10, - _system_config={"max_direct_call_object_size": 0}, - ) - cluster.add_node(resources={"bar:2": 100}, num_cpus=10) - cluster.add_node(resources={"bar:3": 100}, num_cpus=0) - - ray.init(cluster.address) - - @ray.remote - def get_node_id(): - return ray.get_runtime_context().get_node_id() - - node1_id = ray.get(get_node_id.options(resources={"bar:1": 1}).remote()) - node2_id = ray.get(get_node_id.options(resources={"bar:2": 1}).remote()) - - ds = ray.data.range(100, parallelism=2).random_shuffle() - blocks = ds.get_internal_block_refs() - ray.wait(blocks, num_returns=len(blocks), fetch_local=False) - location_data = ray.experimental.get_object_locations(blocks) - locations = [] - for block in blocks: - locations.extend(location_data[block]["node_ids"]) - assert "2 nodes used" in ds.stats() - - if not use_push_based_shuffle: - # We don't check this for push-based shuffle since it will try to - # colocate reduce tasks to improve locality. 
- assert set(locations) == {node1_id, node2_id} - - -def test_parquet_read_spread(ray_start_cluster, tmp_path): - cluster = ray_start_cluster - cluster.add_node( - resources={"bar:1": 100}, - num_cpus=10, - _system_config={"max_direct_call_object_size": 0}, - ) - cluster.add_node(resources={"bar:2": 100}, num_cpus=10) - cluster.add_node(resources={"bar:3": 100}, num_cpus=0) - - ray.init(cluster.address) - - @ray.remote - def get_node_id(): - return ray.get_runtime_context().get_node_id() - - node1_id = ray.get(get_node_id.options(resources={"bar:1": 1}).remote()) - node2_id = ray.get(get_node_id.options(resources={"bar:2": 1}).remote()) - - data_path = str(tmp_path) - df1 = pd.DataFrame({"one": list(range(100)), "two": list(range(100, 200))}) - path1 = os.path.join(data_path, "test1.parquet") - df1.to_parquet(path1) - df2 = pd.DataFrame({"one": list(range(300, 400)), "two": list(range(400, 500))}) - path2 = os.path.join(data_path, "test2.parquet") - df2.to_parquet(path2) - - ds = ray.data.read_parquet(data_path) - - # Force reads. 
- blocks = ds.get_internal_block_refs() - assert len(blocks) == 2 - - ray.wait(blocks, num_returns=len(blocks), fetch_local=False) - location_data = ray.experimental.get_object_locations(blocks) - locations = [] - for block in blocks: - locations.extend(location_data[block]["node_ids"]) - assert set(locations) == {node1_id, node2_id} - - -def test_stats_actor_cap_num_stats(ray_start_cluster): - actor = _StatsActor.remote(3) - metadatas = [] - task_idx = 0 - for uuid in range(3): - metadatas.append( - BlockMetadata( - num_rows=uuid, - size_bytes=None, - schema=None, - input_files=None, - exec_stats=None, - ) - ) - num_stats = uuid + 1 - actor.record_start.remote(uuid) - assert ray.get(actor._get_stats_dict_size.remote()) == ( - num_stats, - num_stats - 1, - num_stats - 1, - ) - actor.record_task.remote(uuid, task_idx, [metadatas[-1]]) - assert ray.get(actor._get_stats_dict_size.remote()) == ( - num_stats, - num_stats, - num_stats, - ) - for uuid in range(3): - assert ray.get(actor.get.remote(uuid))[0][task_idx] == [metadatas[uuid]] - # Add the fourth stats to exceed the limit. - actor.record_start.remote(3) - # The first stats (with uuid=0) should have been purged. - assert ray.get(actor.get.remote(0))[0] == {} - # The start_time has 3 entries because we just added it above with record_start(). 
- assert ray.get(actor._get_stats_dict_size.remote()) == (3, 2, 2) - - -@ray.remote -class Counter: - def __init__(self): - self.value = 0 - - def increment(self): - self.value += 1 - return self.value - - -class FlakyCSVDatasource(CSVDatasource): - def __init__(self): - self.counter = Counter.remote() - - def _read_stream(self, f: "pa.NativeFile", path: str, **reader_args): - count = self.counter.increment.remote() - if ray.get(count) == 1: - raise ValueError("oops") - else: - for block in CSVDatasource._read_stream(self, f, path, **reader_args): - yield block - - def _write_block(self, f: "pa.NativeFile", block: BlockAccessor, **writer_args): - count = self.counter.increment.remote() - if ray.get(count) == 1: - raise ValueError("oops") - else: - CSVDatasource._write_block(self, f, block, **writer_args) - - -def test_dataset_retry_exceptions(ray_start_regular, local_path): - df1 = pd.DataFrame({"one": [1, 2, 3], "two": ["a", "b", "c"]}) - path1 = os.path.join(local_path, "test1.csv") - df1.to_csv(path1, index=False, storage_options={}) - ds1 = ray.data.read_datasource(FlakyCSVDatasource(), parallelism=1, paths=path1) - ds1.write_datasource(FlakyCSVDatasource(), path=local_path, dataset_uuid="data") - assert df1.equals( - pd.read_csv(os.path.join(local_path, "data_000000.csv"), storage_options={}) - ) - - counter = Counter.remote() - - def flaky_mapper(x): - count = counter.increment.remote() - if ray.get(count) == 1: - raise ValueError("oops") - else: - return ray.get(count) - - assert sorted(ds1.map(flaky_mapper).take()) == [2, 3, 4] - - with pytest.raises(ValueError): - ray.data.read_datasource( - FlakyCSVDatasource(), - parallelism=1, - paths=path1, - ray_remote_args={"retry_exceptions": False}, - ).take() - - -def test_split_is_not_disruptive(ray_start_regular): - ds = ray.data.range(100, parallelism=10).map_batches(lambda x: x).lazy() - - def verify_integrity(splits): - for dss in splits: - for batch in dss.iter_batches(): - pass - for batch in 
ds.iter_batches(): - pass - - # No block splitting invovled: split 10 even blocks into 2 groups. - verify_integrity(ds.split(2, equal=True)) - # Block splitting invovled: split 10 even blocks into 3 groups. - verify_integrity(ds.split(3, equal=True)) - - # Same as above but having tranforms post converting to lazy. - verify_integrity(ds.map_batches(lambda x: x).split(2, equal=True)) - verify_integrity(ds.map_batches(lambda x: x).split(3, equal=True)) - - # Same as above but having in-place tranforms post converting to lazy. - verify_integrity(ds.randomize_block_order().split(2, equal=True)) - verify_integrity(ds.randomize_block_order().split(3, equal=True)) - - -def test_datasource(ray_start_regular): - source = ray.data.datasource.RandomIntRowDatasource() - assert len(ray.data.read_datasource(source, n=10, num_columns=2).take()) == 10 - source = ray.data.datasource.RangeDatasource() - assert ray.data.read_datasource(source, n=10).take() == list(range(10)) - - -def test_polars_lazy_import(shutdown_only): - import sys - - ctx = ray.data.context.DatasetContext.get_current() - - try: - original_use_polars = ctx.use_polars - ctx.use_polars = True - - num_items = 100 - parallelism = 4 - ray.init(num_cpus=4) - - @ray.remote - def f(should_import_polars): - # Sleep to spread the tasks. - time.sleep(1) - polars_imported = "polars" in sys.modules.keys() - return polars_imported == should_import_polars - - # We should not use polars for non-Arrow sort. - _ = ray.data.range(num_items, parallelism=parallelism).sort() - assert all(ray.get([f.remote(False) for _ in range(parallelism)])) - - a = range(100) - dfs = [] - partition_size = num_items // parallelism - for i in range(parallelism): - dfs.append( - pd.DataFrame({"a": a[i * partition_size : (i + 1) * partition_size]}) - ) - # At least one worker should have imported polars. 
- _ = ( - ray.data.from_pandas(dfs) - .map_batches(lambda t: t, batch_format="pyarrow", batch_size=None) - .sort(key="a") - .fully_executed() - ) - assert any(ray.get([f.remote(True) for _ in range(parallelism)])) - - finally: - ctx.use_polars = original_use_polars - - -def test_actor_pool_strategy_apply_interrupt(shutdown_only): - """Test that _apply kills the actor pool if an interrupt is raised.""" - ray.init(include_dashboard=False, num_cpus=1) - - cpus = ray.available_resources()["CPU"] - ds = ray.data.range(5, parallelism=5) - aps = ray.data.ActorPoolStrategy(max_size=5) - blocks = ds._plan.execute() - - # Start some actors, the first one sends a SIGINT, emulating a KeyboardInterrupt - def test_func(block): - for i, _ in enumerate(BlockAccessor.for_block(block).iter_rows()): - if i == 0: - os.kill(os.getpid(), signal.SIGINT) - else: - time.sleep(1000) - return block - - # No need to test ActorPoolStrategy in new execution backend. - if not DatasetContext.get_current().new_execution_backend: - with pytest.raises(ray.exceptions.RayTaskError): - aps._apply(test_func, {}, blocks, False) - - # Check that all actors have been killed by counting the available CPUs - wait_for_condition(lambda: (ray.available_resources().get("CPU", 0) == cpus)) - - -def test_actor_pool_strategy_default_num_actors(shutdown_only): - def f(x): - import time - - time.sleep(1) - return x - - num_cpus = 5 - ray.init(num_cpus=num_cpus) - compute_strategy = ray.data.ActorPoolStrategy() - ray.data.range(10, parallelism=10).map_batches( - f, batch_size=1, compute=compute_strategy - ).fully_executed() - - # The new execution backend is not using the ActorPoolStrategy under - # the hood, so the expectation here applies only to the old backend. - # TODO(https://github.com/ray-project/ray/issues/31723): we should check - # the num of workers once we have autoscaling in new execution backend. 
- if not DatasetContext.get_current().new_execution_backend: - expected_max_num_workers = math.ceil( - num_cpus * (1 / compute_strategy.ready_to_total_workers_ratio) - ) - assert ( - compute_strategy.num_workers >= num_cpus - and compute_strategy.num_workers <= expected_max_num_workers - ), "Number of actors is out of the expected bound" - - -def test_actor_pool_strategy_bundles_to_max_actors(shutdown_only): - """Tests that blocks are bundled up to the specified max number of actors.""" - - def f(x): - return x - - max_size = 2 - compute_strategy = ray.data.ActorPoolStrategy(max_size=max_size) - ds = ( - ray.data.range(10, parallelism=10) - .map_batches(f, batch_size=None, compute=compute_strategy) - .fully_executed() - ) - - # TODO(https://github.com/ray-project/ray/issues/31723): implement the feature - # of capping bundle size by actor pool size, and then re-enable this test. - if not DatasetContext.get_current().new_execution_backend: - assert f"{max_size}/{max_size} blocks" in ds.stats() - - # Check batch size is still respected. 
- ds = ( - ray.data.range(10, parallelism=10) - .map_batches(f, batch_size=10, compute=compute_strategy) - .fully_executed() - ) - - assert "1/1 blocks" in ds.stats() - - -def test_default_batch_format(shutdown_only): - ds = ray.data.range(100) - assert ds.default_batch_format() == list - - ds = ray.data.range_tensor(100) - assert ds.default_batch_format() == np.ndarray - - df = pd.DataFrame({"foo": ["a", "b"], "bar": [0, 1]}) - ds = ray.data.from_pandas(df) - assert ds.default_batch_format() == pd.DataFrame - - -def test_dataset_schema_after_read_stats(ray_start_cluster): - cluster = ray_start_cluster - cluster.add_node(num_cpus=1) - ray.init(cluster.address) - cluster.add_node(num_cpus=1, resources={"foo": 1}) - ds = ray.data.read_csv( - "example://iris.csv", ray_remote_args={"resources": {"foo": 1}} - ) - schema = ds.schema() - ds.stats() - assert schema == ds.schema() - - -class LoggerWarningCalled(Exception): - """Custom exception used in test_warning_execute_with_no_cpu() and - test_nowarning_execute_with_cpu(). 
Raised when the `logger.warning` method - is called, so that we can kick out of `plan.execute()` by catching this Exception - and check logging was done properly.""" - - pass - - -def test_warning_execute_with_no_cpu(ray_start_cluster): - """Tests ExecutionPlan.execute() to ensure a warning is logged - when no CPU resources are available.""" - # Create one node with no CPUs to trigger the Dataset warning - cluster = ray_start_cluster - cluster.add_node(num_cpus=0) - - logger = DatasetLogger("ray.data._internal.plan").get_logger() - with patch.object( - logger, - "warning", - side_effect=LoggerWarningCalled, - ) as mock_logger: - try: - ds = ray.data.range(10) - ds = ds.map_batches(lambda x: x) - ds.take() - except Exception as e: - if ray.data.context.DatasetContext.get_current().use_streaming_executor: - assert isinstance(e, ValueError) - assert "exceeds the execution limits ExecutionResources(cpu=0.0" in str( - e - ) - else: - assert isinstance(e, LoggerWarningCalled) - logger_args, logger_kwargs = mock_logger.call_args - assert ( - "Warning: The Ray cluster currently does not have " - in logger_args[0] - ) - - -def test_nowarning_execute_with_cpu(ray_start_cluster_init): - """Tests ExecutionPlan.execute() to ensure no warning is logged - when there are available CPU resources.""" - # Create one node with CPUs to avoid triggering the Dataset warning - ray.init(ray_start_cluster_init.address) - - logger = DatasetLogger("ray.data._internal.plan").get_logger() - with patch.object( - logger, - "warning", - side_effect=LoggerWarningCalled, - ) as mock_logger: - ds = ray.data.range(10) - ds = ds.map_batches(lambda x: x) - ds.take() - mock_logger.assert_not_called() - - -if __name__ == "__main__": - import sys - - sys.exit(pytest.main(["-v", __file__])) diff --git a/python/ray/data/tests/test_dataset_all_to_all.py b/python/ray/data/tests/test_dataset_all_to_all.py new file mode 100644 index 0000000000000..e9150b52f2f24 --- /dev/null +++ 
b/python/ray/data/tests/test_dataset_all_to_all.py @@ -0,0 +1,1703 @@ +import itertools +import math +import random +import time + +import numpy as np +import pandas as pd +import pyarrow as pa +import pytest + +import ray +from ray.data.aggregate import AggregateFn, Count, Max, Mean, Min, Std, Sum +from ray.data.context import DatasetContext +from ray.data.tests.conftest import * # noqa +from ray.tests.conftest import * # noqa + + +def test_zip(ray_start_regular_shared): + ds1 = ray.data.range(5, parallelism=5) + ds2 = ray.data.range(5, parallelism=5).map(lambda x: x + 1) + ds = ds1.zip(ds2) + assert ds.schema() == tuple + assert ds.take() == [(0, 1), (1, 2), (2, 3), (3, 4), (4, 5)] + with pytest.raises(ValueError): + ds.zip(ray.data.range(3)).fully_executed() + + +@pytest.mark.parametrize( + "num_blocks1,num_blocks2", + list(itertools.combinations_with_replacement(range(1, 12), 2)), +) +def test_zip_different_num_blocks_combinations( + ray_start_regular_shared, num_blocks1, num_blocks2 +): + n = 12 + ds1 = ray.data.range(n, parallelism=num_blocks1) + ds2 = ray.data.range(n, parallelism=num_blocks2).map(lambda x: x + 1) + ds = ds1.zip(ds2) + assert ds.schema() == tuple + assert ds.take() == list(zip(range(n), range(1, n + 1))) + + +@pytest.mark.parametrize( + "num_cols1,num_cols2,should_invert", + [ + (1, 1, False), + (4, 1, False), + (1, 4, True), + (1, 10, True), + (10, 10, False), + ], +) +def test_zip_different_num_blocks_split_smallest( + ray_start_regular_shared, + num_cols1, + num_cols2, + should_invert, +): + n = 12 + num_blocks1 = 4 + num_blocks2 = 2 + ds1 = ray.data.from_items( + [{str(i): i for i in range(num_cols1)}] * n, parallelism=num_blocks1 + ) + ds2 = ray.data.from_items( + [{str(i): i for i in range(num_cols1, num_cols1 + num_cols2)}] * n, + parallelism=num_blocks2, + ) + ds = ds1.zip(ds2).fully_executed() + num_blocks = ds._plan._snapshot_blocks.executed_num_blocks() + assert ds.take() == [{str(i): i for i in range(num_cols1 + num_cols2)}] * n 
+ if should_invert: + assert num_blocks == num_blocks2 + else: + assert num_blocks == num_blocks1 + + +def test_zip_pandas(ray_start_regular_shared): + ds1 = ray.data.from_pandas(pd.DataFrame({"col1": [1, 2], "col2": [4, 5]})) + ds2 = ray.data.from_pandas(pd.DataFrame({"col3": ["a", "b"], "col4": ["d", "e"]})) + ds = ds1.zip(ds2) + assert ds.count() == 2 + assert "{col1: int64, col2: int64, col3: object, col4: object}" in str(ds) + result = [r.as_pydict() for r in ds.take()] + assert result[0] == {"col1": 1, "col2": 4, "col3": "a", "col4": "d"} + + ds3 = ray.data.from_pandas(pd.DataFrame({"col2": ["a", "b"], "col4": ["d", "e"]})) + ds = ds1.zip(ds3) + assert ds.count() == 2 + assert "{col1: int64, col2: int64, col2_1: object, col4: object}" in str(ds) + result = [r.as_pydict() for r in ds.take()] + assert result[0] == {"col1": 1, "col2": 4, "col2_1": "a", "col4": "d"} + + +def test_zip_arrow(ray_start_regular_shared): + ds1 = ray.data.range_table(5).map(lambda r: {"id": r["value"]}) + ds2 = ray.data.range_table(5).map( + lambda r: {"a": r["value"] + 1, "b": r["value"] + 2} + ) + ds = ds1.zip(ds2) + assert ds.count() == 5 + assert "{id: int64, a: int64, b: int64}" in str(ds) + result = [r.as_pydict() for r in ds.take()] + assert result[0] == {"id": 0, "a": 1, "b": 2} + + # Test duplicate column names. + ds = ds1.zip(ds1).zip(ds1) + assert ds.count() == 5 + assert "{id: int64, id_1: int64, id_2: int64}" in str(ds) + result = [r.as_pydict() for r in ds.take()] + assert result[0] == {"id": 0, "id_1": 0, "id_2": 0} + + +def test_empty_shuffle(ray_start_regular_shared): + ds = ray.data.range(100, parallelism=100) + ds = ds.filter(lambda x: x) + ds = ds.map_batches(lambda x: x) + ds = ds.random_shuffle() # Would prev. crash with AssertionError: pyarrow.Table. 
+ ds.show() + + +def test_repartition_shuffle(ray_start_regular_shared): + ds = ray.data.range(20, parallelism=10) + assert ds.num_blocks() == 10 + assert ds.sum() == 190 + assert ds._block_num_rows() == [2] * 10 + + ds2 = ds.repartition(5, shuffle=True) + assert ds2.num_blocks() == 5 + assert ds2.sum() == 190 + assert ds2._block_num_rows() == [10, 10, 0, 0, 0] + + ds3 = ds2.repartition(20, shuffle=True) + assert ds3.num_blocks() == 20 + assert ds3.sum() == 190 + assert ds3._block_num_rows() == [2] * 10 + [0] * 10 + + large = ray.data.range(10000, parallelism=10) + large = large.repartition(20, shuffle=True) + assert large._block_num_rows() == [500] * 20 + + +def test_repartition_noshuffle(ray_start_regular_shared): + ds = ray.data.range(20, parallelism=10) + assert ds.num_blocks() == 10 + assert ds.sum() == 190 + assert ds._block_num_rows() == [2] * 10 + + ds2 = ds.repartition(5, shuffle=False) + assert ds2.num_blocks() == 5 + assert ds2.sum() == 190 + assert ds2._block_num_rows() == [4, 4, 4, 4, 4] + + ds3 = ds2.repartition(20, shuffle=False) + assert ds3.num_blocks() == 20 + assert ds3.sum() == 190 + assert ds3._block_num_rows() == [1] * 20 + + # Test num_partitions > num_rows + ds4 = ds.repartition(40, shuffle=False) + assert ds4.num_blocks() == 40 + blocks = ray.get(ds4.get_internal_block_refs()) + assert all(isinstance(block, list) for block in blocks), blocks + assert ds4.sum() == 190 + assert ds4._block_num_rows() == [1] * 20 + [0] * 20 + + ds5 = ray.data.range(22).repartition(4) + assert ds5.num_blocks() == 4 + assert ds5._block_num_rows() == [5, 6, 5, 6] + + large = ray.data.range(10000, parallelism=10) + large = large.repartition(20) + assert large._block_num_rows() == [500] * 20 + + +def test_repartition_shuffle_arrow(ray_start_regular_shared): + ds = ray.data.range_table(20, parallelism=10) + assert ds.num_blocks() == 10 + assert ds.count() == 20 + assert ds._block_num_rows() == [2] * 10 + + ds2 = ds.repartition(5, shuffle=True) + assert 
ds2.num_blocks() == 5 + assert ds2.count() == 20 + assert ds2._block_num_rows() == [10, 10, 0, 0, 0] + + ds3 = ds2.repartition(20, shuffle=True) + assert ds3.num_blocks() == 20 + assert ds3.count() == 20 + assert ds3._block_num_rows() == [2] * 10 + [0] * 10 + + large = ray.data.range_table(10000, parallelism=10) + large = large.repartition(20, shuffle=True) + assert large._block_num_rows() == [500] * 20 + + +def test_grouped_dataset_repr(ray_start_regular_shared): + ds = ray.data.from_items([{"key": "spam"}, {"key": "ham"}, {"key": "spam"}]) + assert repr(ds.groupby("key")) == f"GroupedDataset(dataset={ds!r}, key='key')" + + +def test_groupby_arrow(ray_start_regular_shared, use_push_based_shuffle): + # Test empty dataset. + agg_ds = ( + ray.data.range_table(10) + .filter(lambda r: r["value"] > 10) + .groupby("value") + .count() + ) + assert agg_ds.count() == 0 + + +def test_groupby_errors(ray_start_regular_shared): + ds = ray.data.range(100) + + ds.groupby(None).count().show() # OK + ds.groupby(lambda x: x % 2).count().show() # OK + with pytest.raises(ValueError): + ds.groupby("foo").count().show() + + ds = ray.data.range_table(100) + ds.groupby(None).count().show() # OK + with pytest.raises(ValueError): + ds.groupby(lambda x: x % 2).count().show() + + +def test_agg_errors(ray_start_regular_shared): + ds = ray.data.range(100) + from ray.data.aggregate import Max + + ds.aggregate(Max()) # OK + ds.aggregate(Max(lambda x: x)) # OK + with pytest.raises(ValueError): + ds.aggregate(Max("foo")) + + ds = ray.data.range_table(100) + ds.aggregate(Max("value")) # OK + with pytest.raises(ValueError): + ds.aggregate(Max()) + with pytest.raises(ValueError): + ds.aggregate(Max(lambda x: x)) + with pytest.raises(ValueError): + ds.aggregate(Max("bad_field")) + + +@pytest.mark.parametrize("num_parts", [1, 30]) +def test_groupby_agg_name_conflict(ray_start_regular_shared, num_parts): + # Test aggregation name conflict. 
+ xs = list(range(100)) + grouped_ds = ( + ray.data.from_items([{"A": (x % 3), "B": x} for x in xs]) + .repartition(num_parts) + .groupby("A") + ) + agg_ds = grouped_ds.aggregate( + AggregateFn( + init=lambda k: [0, 0], + accumulate_row=lambda a, r: [a[0] + r["B"], a[1] + 1], + merge=lambda a1, a2: [a1[0] + a2[0], a1[1] + a2[1]], + finalize=lambda a: a[0] / a[1], + name="foo", + ), + AggregateFn( + init=lambda k: [0, 0], + accumulate_row=lambda a, r: [a[0] + r["B"], a[1] + 1], + merge=lambda a1, a2: [a1[0] + a2[0], a1[1] + a2[1]], + finalize=lambda a: a[0] / a[1], + name="foo", + ), + ) + assert agg_ds.count() == 3 + assert [row.as_pydict() for row in agg_ds.sort("A").iter_rows()] == [ + {"A": 0, "foo": 49.5, "foo_2": 49.5}, + {"A": 1, "foo": 49.0, "foo_2": 49.0}, + {"A": 2, "foo": 50.0, "foo_2": 50.0}, + ] + + +@pytest.mark.parametrize("num_parts", [1, 30]) +@pytest.mark.parametrize("ds_format", ["arrow", "pandas"]) +def test_groupby_tabular_count( + ray_start_regular_shared, ds_format, num_parts, use_push_based_shuffle +): + # Test built-in count aggregation + seed = int(time.time()) + print(f"Seeding RNG for test_groupby_arrow_count with: {seed}") + random.seed(seed) + xs = list(range(100)) + random.shuffle(xs) + + def _to_pandas(ds): + return ds.map_batches(lambda x: x, batch_size=None, batch_format="pandas") + + ds = ray.data.from_items([{"A": (x % 3), "B": x} for x in xs]).repartition( + num_parts + ) + if ds_format == "pandas": + ds = _to_pandas(ds) + agg_ds = ds.groupby("A").count() + assert agg_ds.count() == 3 + assert [row.as_pydict() for row in agg_ds.sort("A").iter_rows()] == [ + {"A": 0, "count()": 34}, + {"A": 1, "count()": 33}, + {"A": 2, "count()": 33}, + ] + + +@pytest.mark.parametrize("num_parts", [1, 30]) +@pytest.mark.parametrize("ds_format", ["arrow", "pandas"]) +def test_groupby_tabular_sum( + ray_start_regular_shared, ds_format, num_parts, use_push_based_shuffle +): + # Test built-in sum aggregation + seed = int(time.time()) + print(f"Seeding 
RNG for test_groupby_tabular_sum with: {seed}") + random.seed(seed) + xs = list(range(100)) + random.shuffle(xs) + + def _to_pandas(ds): + return ds.map_batches(lambda x: x, batch_size=None, batch_format="pandas") + + ds = ray.data.from_items([{"A": (x % 3), "B": x} for x in xs]).repartition( + num_parts + ) + if ds_format == "pandas": + ds = _to_pandas(ds) + + agg_ds = ds.groupby("A").sum("B") + assert agg_ds.count() == 3 + assert [row.as_pydict() for row in agg_ds.sort("A").iter_rows()] == [ + {"A": 0, "sum(B)": 1683}, + {"A": 1, "sum(B)": 1617}, + {"A": 2, "sum(B)": 1650}, + ] + + # Test built-in sum aggregation with nans + ds = ray.data.from_items( + [{"A": (x % 3), "B": x} for x in xs] + [{"A": 0, "B": None}] + ).repartition(num_parts) + if ds_format == "pandas": + ds = _to_pandas(ds) + nan_grouped_ds = ds.groupby("A") + nan_agg_ds = nan_grouped_ds.sum("B") + assert nan_agg_ds.count() == 3 + assert [row.as_pydict() for row in nan_agg_ds.sort("A").iter_rows()] == [ + {"A": 0, "sum(B)": 1683}, + {"A": 1, "sum(B)": 1617}, + {"A": 2, "sum(B)": 1650}, + ] + # Test ignore_nulls=False + nan_agg_ds = nan_grouped_ds.sum("B", ignore_nulls=False) + assert nan_agg_ds.count() == 3 + pd.testing.assert_frame_equal( + nan_agg_ds.sort("A").to_pandas(), + pd.DataFrame( + { + "A": [0, 1, 2], + "sum(B)": [None, 1617, 1650], + } + ), + check_dtype=False, + ) + # Test all nans + ds = ray.data.from_items([{"A": (x % 3), "B": None} for x in xs]).repartition( + num_parts + ) + if ds_format == "pandas": + ds = _to_pandas(ds) + nan_agg_ds = ds.groupby("A").sum("B") + assert nan_agg_ds.count() == 3 + pd.testing.assert_frame_equal( + nan_agg_ds.sort("A").to_pandas(), + pd.DataFrame( + { + "A": [0, 1, 2], + "sum(B)": [None, None, None], + } + ), + ) + + +@pytest.mark.parametrize("num_parts", [1, 30]) +@pytest.mark.parametrize("ds_format", ["arrow", "pandas"]) +def test_global_tabular_sum(ray_start_regular_shared, ds_format, num_parts): + seed = int(time.time()) + print(f"Seeding RNG for 
test_global_arrow_sum with: {seed}") + random.seed(seed) + xs = list(range(100)) + random.shuffle(xs) + + def _to_pandas(ds): + return ds.map_batches(lambda x: x, batch_size=None, batch_format="pandas") + + # Test built-in global sum aggregation + ds = ray.data.from_items([{"A": x} for x in xs]).repartition(num_parts) + if ds_format == "pandas": + ds = _to_pandas(ds) + assert ds.sum("A") == 4950 + + # Test empty dataset + ds = ray.data.range_table(10) + if ds_format == "pandas": + ds = _to_pandas(ds) + assert ds.filter(lambda r: r["value"] > 10).sum("value") is None + + # Test built-in global sum aggregation with nans + nan_ds = ray.data.from_items([{"A": x} for x in xs] + [{"A": None}]).repartition( + num_parts + ) + if ds_format == "pandas": + nan_ds = _to_pandas(nan_ds) + assert nan_ds.sum("A") == 4950 + # Test ignore_nulls=False + assert nan_ds.sum("A", ignore_nulls=False) is None + # Test all nans + nan_ds = ray.data.from_items([{"A": None}] * len(xs)).repartition(num_parts) + if ds_format == "pandas": + nan_ds = _to_pandas(nan_ds) + assert nan_ds.sum("A") is None + assert nan_ds.sum("A", ignore_nulls=False) is None + + +@pytest.mark.parametrize("num_parts", [1, 30]) +@pytest.mark.parametrize("ds_format", ["arrow", "pandas"]) +def test_groupby_tabular_min(ray_start_regular_shared, ds_format, num_parts): + # Test built-in min aggregation + seed = int(time.time()) + print(f"Seeding RNG for test_groupby_tabular_min with: {seed}") + random.seed(seed) + xs = list(range(100)) + random.shuffle(xs) + + def _to_pandas(ds): + return ds.map_batches(lambda x: x, batch_size=None, batch_format="pandas") + + ds = ray.data.from_items([{"A": (x % 3), "B": x} for x in xs]).repartition( + num_parts + ) + if ds_format == "pandas": + ds = _to_pandas(ds) + + agg_ds = ds.groupby("A").min("B") + assert agg_ds.count() == 3 + assert [row.as_pydict() for row in agg_ds.sort("A").iter_rows()] == [ + {"A": 0, "min(B)": 0}, + {"A": 1, "min(B)": 1}, + {"A": 2, "min(B)": 2}, + ] + + # Test 
built-in min aggregation with nans + ds = ray.data.from_items( + [{"A": (x % 3), "B": x} for x in xs] + [{"A": 0, "B": None}] + ).repartition(num_parts) + if ds_format == "pandas": + ds = _to_pandas(ds) + nan_grouped_ds = ds.groupby("A") + nan_agg_ds = nan_grouped_ds.min("B") + assert nan_agg_ds.count() == 3 + assert [row.as_pydict() for row in nan_agg_ds.sort("A").iter_rows()] == [ + {"A": 0, "min(B)": 0}, + {"A": 1, "min(B)": 1}, + {"A": 2, "min(B)": 2}, + ] + # Test ignore_nulls=False + nan_agg_ds = nan_grouped_ds.min("B", ignore_nulls=False) + assert nan_agg_ds.count() == 3 + pd.testing.assert_frame_equal( + nan_agg_ds.sort("A").to_pandas(), + pd.DataFrame( + { + "A": [0, 1, 2], + "min(B)": [None, 1, 2], + } + ), + check_dtype=False, + ) + # Test all nans + ds = ray.data.from_items([{"A": (x % 3), "B": None} for x in xs]).repartition( + num_parts + ) + if ds_format == "pandas": + ds = _to_pandas(ds) + nan_agg_ds = ds.groupby("A").min("B") + assert nan_agg_ds.count() == 3 + pd.testing.assert_frame_equal( + nan_agg_ds.sort("A").to_pandas(), + pd.DataFrame( + { + "A": [0, 1, 2], + "min(B)": [None, None, None], + } + ), + check_dtype=False, + ) + + +@pytest.mark.parametrize("num_parts", [1, 30]) +@pytest.mark.parametrize("ds_format", ["arrow", "pandas"]) +def test_groupby_tabular_max(ray_start_regular_shared, ds_format, num_parts): + # Test built-in max aggregation + seed = int(time.time()) + print(f"Seeding RNG for test_groupby_tabular_max with: {seed}") + random.seed(seed) + xs = list(range(100)) + random.shuffle(xs) + + def _to_pandas(ds): + return ds.map_batches(lambda x: x, batch_size=None, batch_format="pandas") + + ds = ray.data.from_items([{"A": (x % 3), "B": x} for x in xs]).repartition( + num_parts + ) + if ds_format == "pandas": + ds = _to_pandas(ds) + + agg_ds = ds.groupby("A").max("B") + assert agg_ds.count() == 3 + assert [row.as_pydict() for row in agg_ds.sort("A").iter_rows()] == [ + {"A": 0, "max(B)": 99}, + {"A": 1, "max(B)": 97}, + {"A": 2, 
"max(B)": 98}, + ] + + # Test built-in min aggregation with nans + ds = ray.data.from_items( + [{"A": (x % 3), "B": x} for x in xs] + [{"A": 0, "B": None}] + ).repartition(num_parts) + if ds_format == "pandas": + ds = _to_pandas(ds) + nan_grouped_ds = ds.groupby("A") + nan_agg_ds = nan_grouped_ds.max("B") + assert nan_agg_ds.count() == 3 + assert [row.as_pydict() for row in nan_agg_ds.sort("A").iter_rows()] == [ + {"A": 0, "max(B)": 99}, + {"A": 1, "max(B)": 97}, + {"A": 2, "max(B)": 98}, + ] + # Test ignore_nulls=False + nan_agg_ds = nan_grouped_ds.max("B", ignore_nulls=False) + assert nan_agg_ds.count() == 3 + pd.testing.assert_frame_equal( + nan_agg_ds.sort("A").to_pandas(), + pd.DataFrame( + { + "A": [0, 1, 2], + "max(B)": [None, 97, 98], + } + ), + check_dtype=False, + ) + # Test all nans + ds = ray.data.from_items([{"A": (x % 3), "B": None} for x in xs]).repartition( + num_parts + ) + if ds_format == "pandas": + ds = _to_pandas(ds) + nan_agg_ds = ds.groupby("A").max("B") + assert nan_agg_ds.count() == 3 + pd.testing.assert_frame_equal( + nan_agg_ds.sort("A").to_pandas(), + pd.DataFrame( + { + "A": [0, 1, 2], + "max(B)": [None, None, None], + } + ), + check_dtype=False, + ) + + +@pytest.mark.parametrize("num_parts", [1, 30]) +@pytest.mark.parametrize("ds_format", ["arrow", "pandas"]) +def test_groupby_tabular_mean(ray_start_regular_shared, ds_format, num_parts): + # Test built-in mean aggregation + seed = int(time.time()) + print(f"Seeding RNG for test_groupby_tabular_mean with: {seed}") + random.seed(seed) + xs = list(range(100)) + random.shuffle(xs) + + def _to_pandas(ds): + return ds.map_batches(lambda x: x, batch_size=None, batch_format="pandas") + + ds = ray.data.from_items([{"A": (x % 3), "B": x} for x in xs]).repartition( + num_parts + ) + if ds_format == "pandas": + ds = _to_pandas(ds) + + agg_ds = ds.groupby("A").mean("B") + assert agg_ds.count() == 3 + assert [row.as_pydict() for row in agg_ds.sort("A").iter_rows()] == [ + {"A": 0, "mean(B)": 49.5}, 
+ {"A": 1, "mean(B)": 49.0}, + {"A": 2, "mean(B)": 50.0}, + ] + + # Test built-in mean aggregation with nans + ds = ray.data.from_items( + [{"A": (x % 3), "B": x} for x in xs] + [{"A": 0, "B": None}] + ).repartition(num_parts) + if ds_format == "pandas": + ds = _to_pandas(ds) + nan_grouped_ds = ds.groupby("A") + nan_agg_ds = nan_grouped_ds.mean("B") + assert nan_agg_ds.count() == 3 + assert [row.as_pydict() for row in nan_agg_ds.sort("A").iter_rows()] == [ + {"A": 0, "mean(B)": 49.5}, + {"A": 1, "mean(B)": 49.0}, + {"A": 2, "mean(B)": 50.0}, + ] + # Test ignore_nulls=False + nan_agg_ds = nan_grouped_ds.mean("B", ignore_nulls=False) + assert nan_agg_ds.count() == 3 + pd.testing.assert_frame_equal( + nan_agg_ds.sort("A").to_pandas(), + pd.DataFrame( + { + "A": [0, 1, 2], + "mean(B)": [None, 49.0, 50.0], + } + ), + check_dtype=False, + ) + # Test all nans + ds = ray.data.from_items([{"A": (x % 3), "B": None} for x in xs]).repartition( + num_parts + ) + if ds_format == "pandas": + ds = _to_pandas(ds) + nan_agg_ds = ds.groupby("A").mean("B") + assert nan_agg_ds.count() == 3 + pd.testing.assert_frame_equal( + nan_agg_ds.sort("A").to_pandas(), + pd.DataFrame( + { + "A": [0, 1, 2], + "mean(B)": [None, None, None], + } + ), + check_dtype=False, + ) + + +@pytest.mark.parametrize("num_parts", [1, 30]) +@pytest.mark.parametrize("ds_format", ["arrow", "pandas"]) +def test_groupby_tabular_std(ray_start_regular_shared, ds_format, num_parts): + # Test built-in std aggregation + seed = int(time.time()) + print(f"Seeding RNG for test_groupby_tabular_std with: {seed}") + random.seed(seed) + xs = list(range(100)) + random.shuffle(xs) + + def _to_arrow(ds): + return ds.map_batches(lambda x: x, batch_size=None, batch_format="pyarrow") + + df = pd.DataFrame({"A": [x % 3 for x in xs], "B": xs}) + ds = ray.data.from_pandas(df).repartition(num_parts) + if ds_format == "arrow": + ds = _to_arrow(ds) + agg_ds = ds.groupby("A").std("B") + assert agg_ds.count() == 3 + result = 
agg_ds.to_pandas()["std(B)"].to_numpy() + expected = df.groupby("A")["B"].std().to_numpy() + np.testing.assert_array_almost_equal(result, expected) + # ddof of 0 + ds = ray.data.from_pandas(df).repartition(num_parts) + if ds_format == "arrow": + ds = _to_arrow(ds) + agg_ds = ds.groupby("A").std("B", ddof=0) + assert agg_ds.count() == 3 + result = agg_ds.to_pandas()["std(B)"].to_numpy() + expected = df.groupby("A")["B"].std(ddof=0).to_numpy() + np.testing.assert_array_almost_equal(result, expected) + + # Test built-in std aggregation with nans + nan_df = pd.DataFrame({"A": [x % 3 for x in xs] + [0], "B": xs + [None]}) + ds = ray.data.from_pandas(nan_df).repartition(num_parts) + if ds_format == "arrow": + ds = _to_arrow(ds) + nan_grouped_ds = ds.groupby("A") + nan_agg_ds = nan_grouped_ds.std("B") + assert nan_agg_ds.count() == 3 + result = nan_agg_ds.to_pandas()["std(B)"].to_numpy() + expected = nan_df.groupby("A")["B"].std().to_numpy() + np.testing.assert_array_almost_equal(result, expected) + # Test ignore_nulls=False + nan_agg_ds = nan_grouped_ds.std("B", ignore_nulls=False) + assert nan_agg_ds.count() == 3 + result = nan_agg_ds.to_pandas()["std(B)"].to_numpy() + expected = nan_df.groupby("A")["B"].std() + expected[0] = None + np.testing.assert_array_almost_equal(result, expected) + # Test all nans + nan_df = pd.DataFrame({"A": [x % 3 for x in xs], "B": [None] * len(xs)}) + ds = ray.data.from_pandas(nan_df).repartition(num_parts) + if ds_format == "arrow": + ds = _to_arrow(ds) + nan_agg_ds = ds.groupby("A").std("B", ignore_nulls=False) + assert nan_agg_ds.count() == 3 + result = nan_agg_ds.to_pandas()["std(B)"].to_numpy() + expected = pd.Series([None] * 3) + np.testing.assert_array_equal(result, expected) + + +@pytest.mark.parametrize("num_parts", [1, 30]) +def test_groupby_arrow_multicolumn(ray_start_regular_shared, num_parts): + # Test built-in mean aggregation on multiple columns + seed = int(time.time()) + print(f"Seeding RNG for test_groupby_arrow_multicolumn 
with: {seed}") + random.seed(seed) + xs = list(range(100)) + random.shuffle(xs) + df = pd.DataFrame({"A": [x % 3 for x in xs], "B": xs, "C": [2 * x for x in xs]}) + agg_ds = ( + ray.data.from_pandas(df).repartition(num_parts).groupby("A").mean(["B", "C"]) + ) + assert agg_ds.count() == 3 + assert [row.as_pydict() for row in agg_ds.sort("A").iter_rows()] == [ + {"A": 0, "mean(B)": 49.5, "mean(C)": 99.0}, + {"A": 1, "mean(B)": 49.0, "mean(C)": 98.0}, + {"A": 2, "mean(B)": 50.0, "mean(C)": 100.0}, + ] + + # Test that unspecified agg column ==> agg on all columns except for + # groupby keys. + agg_ds = ray.data.from_pandas(df).repartition(num_parts).groupby("A").mean() + assert agg_ds.count() == 3 + assert [row.as_pydict() for row in agg_ds.sort("A").iter_rows()] == [ + {"A": 0, "mean(B)": 49.5, "mean(C)": 99.0}, + {"A": 1, "mean(B)": 49.0, "mean(C)": 98.0}, + {"A": 2, "mean(B)": 50.0, "mean(C)": 100.0}, + ] + + # Test built-in global mean aggregation + df = pd.DataFrame({"A": xs, "B": [2 * x for x in xs]}) + result_row = ray.data.from_pandas(df).repartition(num_parts).mean(["A", "B"]) + assert result_row["mean(A)"] == df["A"].mean() + assert result_row["mean(B)"] == df["B"].mean() + + +def test_groupby_agg_bad_on(ray_start_regular_shared): + # Test bad on for groupby aggregation + xs = list(range(100)) + df = pd.DataFrame({"A": [x % 3 for x in xs], "B": xs, "C": [2 * x for x in xs]}) + # Wrong type. + with pytest.raises(TypeError): + ray.data.from_pandas(df).groupby("A").mean(5).fully_executed() + with pytest.raises(TypeError): + ray.data.from_pandas(df).groupby("A").mean([5]).fully_executed() + # Empty list. + with pytest.raises(ValueError): + ray.data.from_pandas(df).groupby("A").mean([]).fully_executed() + # Nonexistent column. + with pytest.raises(ValueError): + ray.data.from_pandas(df).groupby("A").mean("D").fully_executed() + with pytest.raises(ValueError): + ray.data.from_pandas(df).groupby("A").mean(["B", "D"]).fully_executed() + # Columns for simple Dataset. 
+ with pytest.raises(ValueError): + ray.data.from_items(xs).groupby(lambda x: x % 3 == 0).mean("A").fully_executed() + + # Test bad on for global aggregation + # Wrong type. + with pytest.raises(TypeError): + ray.data.from_pandas(df).mean(5).fully_executed() + with pytest.raises(TypeError): + ray.data.from_pandas(df).mean([5]).fully_executed() + # Empty list. + with pytest.raises(ValueError): + ray.data.from_pandas(df).mean([]).fully_executed() + # Nonexistent column. + with pytest.raises(ValueError): + ray.data.from_pandas(df).mean("D").fully_executed() + with pytest.raises(ValueError): + ray.data.from_pandas(df).mean(["B", "D"]).fully_executed() + # Columns for simple Dataset. + with pytest.raises(ValueError): + ray.data.from_items(xs).mean("A").fully_executed() + + +@pytest.mark.parametrize("num_parts", [1, 30]) +def test_groupby_arrow_multi_agg(ray_start_regular_shared, num_parts): + seed = int(time.time()) + print(f"Seeding RNG for test_groupby_arrow_multi_agg with: {seed}") + random.seed(seed) + xs = list(range(100)) + random.shuffle(xs) + df = pd.DataFrame({"A": [x % 3 for x in xs], "B": xs}) + agg_ds = ( + ray.data.from_pandas(df) + .repartition(num_parts) + .groupby("A") + .aggregate( + Count(), + Sum("B"), + Min("B"), + Max("B"), + Mean("B"), + Std("B"), + ) + ) + assert agg_ds.count() == 3 + agg_df = agg_ds.to_pandas() + expected_grouped = df.groupby("A")["B"] + np.testing.assert_array_equal(agg_df["count()"].to_numpy(), [34, 33, 33]) + for agg in ["sum", "min", "max", "mean", "std"]: + result = agg_df[f"{agg}(B)"].to_numpy() + expected = getattr(expected_grouped, agg)().to_numpy() + if agg == "std": + np.testing.assert_array_almost_equal(result, expected) + else: + np.testing.assert_array_equal(result, expected) + # Test built-in global std aggregation + df = pd.DataFrame({"A": xs}) + + result_row = ( + ray.data.from_pandas(df) + .repartition(num_parts) + .aggregate( + Sum("A"), + Min("A"), + Max("A"), + Mean("A"), + Std("A"), + ) + ) + for agg in 
["sum", "min", "max", "mean", "std"]: + result = result_row[f"{agg}(A)"] + expected = getattr(df["A"], agg)() + if agg == "std": + assert math.isclose(result, expected) + else: + assert result == expected + + +def test_groupby_simple(ray_start_regular_shared): + seed = int(time.time()) + print(f"Seeding RNG for test_groupby_simple with: {seed}") + random.seed(seed) + parallelism = 3 + xs = [ + ("A", 2), + ("A", 4), + ("A", 9), + ("B", 10), + ("B", 20), + ("C", 3), + ("C", 5), + ("C", 8), + ("C", 12), + ] + random.shuffle(xs) + ds = ray.data.from_items(xs, parallelism=parallelism) + + # Mean aggregation + agg_ds = ds.groupby(lambda r: r[0]).aggregate( + AggregateFn( + init=lambda k: (0, 0), + accumulate_row=lambda a, r: (a[0] + r[1], a[1] + 1), + merge=lambda a1, a2: (a1[0] + a2[0], a1[1] + a2[1]), + finalize=lambda a: a[0] / a[1], + ) + ) + assert agg_ds.count() == 3 + assert agg_ds.sort(key=lambda r: r[0]).take(3) == [("A", 5), ("B", 15), ("C", 7)] + + # Test None row + parallelism = 2 + xs = ["A", "A", "A", None, None, None, "B"] + random.shuffle(xs) + ds = ray.data.from_items(xs, parallelism=parallelism) + # Count aggregation + agg_ds = ds.groupby(lambda r: str(r)).aggregate( + AggregateFn( + init=lambda k: 0, + accumulate_row=lambda a, r: a + 1, + merge=lambda a1, a2: a1 + a2, + ) + ) + assert agg_ds.count() == 3 + assert agg_ds.sort(key=lambda r: str(r[0])).take(3) == [ + ("A", 3), + ("B", 1), + ("None", 3), + ] + + # Test empty dataset. 
+ ds = ray.data.from_items([]) + agg_ds = ds.groupby(lambda r: r[0]).aggregate( + AggregateFn( + init=lambda k: 1 / 0, # should never reach here + accumulate_row=lambda a, r: 1 / 0, + merge=lambda a1, a2: 1 / 0, + finalize=lambda a: 1 / 0, + ) + ) + assert agg_ds.count() == 0 + assert agg_ds.take() == ds.take() + agg_ds = ray.data.range(10).filter(lambda r: r > 10).groupby(lambda r: r).count() + assert agg_ds.count() == 0 + + +@pytest.mark.parametrize("num_parts", [1, 30]) +def test_groupby_simple_count(ray_start_regular_shared, num_parts): + # Test built-in count aggregation + seed = int(time.time()) + print(f"Seeding RNG for test_groupby_simple_count with: {seed}") + random.seed(seed) + xs = list(range(100)) + random.shuffle(xs) + agg_ds = ( + ray.data.from_items(xs).repartition(num_parts).groupby(lambda x: x % 3).count() + ) + assert agg_ds.count() == 3 + assert agg_ds.sort(key=lambda r: r[0]).take(3) == [(0, 34), (1, 33), (2, 33)] + + +@pytest.mark.parametrize("num_parts", [1, 30]) +def test_groupby_simple_sum(ray_start_regular_shared, num_parts): + # Test built-in sum aggregation + seed = int(time.time()) + print(f"Seeding RNG for test_groupby_simple_sum with: {seed}") + random.seed(seed) + xs = list(range(100)) + random.shuffle(xs) + agg_ds = ( + ray.data.from_items(xs).repartition(num_parts).groupby(lambda x: x % 3).sum() + ) + assert agg_ds.count() == 3 + assert agg_ds.sort(key=lambda r: r[0]).take(3) == [(0, 1683), (1, 1617), (2, 1650)] + + # Test built-in sum aggregation with nans + nan_grouped_ds = ( + ray.data.from_items(xs + [None]) + .repartition(num_parts) + .groupby(lambda x: int(x or 0) % 3) + ) + nan_agg_ds = nan_grouped_ds.sum() + assert nan_agg_ds.count() == 3 + assert nan_agg_ds.sort(key=lambda r: r[0]).take(3) == [ + (0, 1683), + (1, 1617), + (2, 1650), + ] + # Test ignore_nulls=False + nan_agg_ds = nan_grouped_ds.sum(ignore_nulls=False) + assert nan_agg_ds.count() == 3 + assert nan_agg_ds.sort(key=lambda r: r[0]).take(3) == [ + (0, None), + 
(1, 1617), + (2, 1650), + ] + # Test all nans + nan_agg_ds = ( + ray.data.from_items([None] * len(xs)) + .repartition(num_parts) + .groupby(lambda x: 0) + .sum() + ) + assert nan_agg_ds.count() == 1 + assert nan_agg_ds.sort(key=lambda r: r[0]).take(1) == [(0, None)] + + # Test built-in global sum aggregation + assert ray.data.from_items(xs).repartition(num_parts).sum() == 4950 + assert ray.data.range(10).filter(lambda r: r > 10).sum() is None + + # Test built-in global sum aggregation with nans + nan_ds = ray.data.from_items(xs + [None]).repartition(num_parts) + assert nan_ds.sum() == 4950 + # Test ignore_nulls=False + assert nan_ds.sum(ignore_nulls=False) is None + # Test all nans + nan_ds = ray.data.from_items([None] * len(xs)).repartition(num_parts) + assert nan_ds.sum() is None + + +def test_groupby_map_groups_for_empty_dataset(ray_start_regular_shared): + ds = ray.data.from_items([]) + mapped = ds.groupby(lambda x: x % 3).map_groups(lambda x: [min(x) * min(x)]) + assert mapped.count() == 0 + assert mapped.take_all() == [] + + +def test_groupby_map_groups_merging_empty_result(ray_start_regular_shared): + ds = ray.data.from_items([1, 2, 3]) + # This needs to merge empty and non-empty results from different groups. + mapped = ds.groupby(lambda x: x).map_groups(lambda x: [] if x == [1] else x) + assert mapped.count() == 2 + assert mapped.take_all() == [2, 3] + + +def test_groupby_map_groups_merging_invalid_result(ray_start_regular_shared): + ds = ray.data.from_items([1, 2, 3]) + grouped = ds.groupby(lambda x: x) + + # The UDF returns None, which is invalid. 
+ with pytest.raises(TypeError): + grouped.map_groups(lambda x: None if x == [1] else x).fully_executed() + + +@pytest.mark.parametrize("num_parts", [1, 2, 30]) +def test_groupby_map_groups_for_none_groupkey(ray_start_regular_shared, num_parts): + ds = ray.data.from_items(list(range(100))) + mapped = ( + ds.repartition(num_parts).groupby(None).map_groups(lambda x: [min(x) + max(x)]) + ) + assert mapped.count() == 1 + assert mapped.take_all() == [99] + + +@pytest.mark.parametrize("num_parts", [1, 2, 30]) +def test_groupby_map_groups_returning_empty_result(ray_start_regular_shared, num_parts): + xs = list(range(100)) + mapped = ( + ray.data.from_items(xs) + .repartition(num_parts) + .groupby(lambda x: x % 3) + .map_groups(lambda x: []) + ) + assert mapped.count() == 0 + assert mapped.take_all() == [] + + +def test_groupby_map_groups_perf(ray_start_regular_shared): + data_list = [x % 100 for x in range(5000000)] + ds = ray.data.from_pandas(pd.DataFrame({"A": data_list})) + start = time.perf_counter() + ds.groupby("A").map_groups(lambda df: df) + end = time.perf_counter() + # On a t3.2xlarge instance, it ran in about 5 seconds, so expecting it has to + # finish within about 10x of that time, unless something went wrong. 
+ assert end - start < 60 + + +@pytest.mark.parametrize("num_parts", [1, 2, 3, 30]) +def test_groupby_map_groups_for_list(ray_start_regular_shared, num_parts): + seed = int(time.time()) + print(f"Seeding RNG for test_groupby_simple_count with: {seed}") + random.seed(seed) + xs = list(range(100)) + random.shuffle(xs) + mapped = ( + ray.data.from_items(xs) + .repartition(num_parts) + .groupby(lambda x: x % 3) + .map_groups(lambda x: [min(x) * min(x)]) + ) + assert mapped.count() == 3 + assert mapped.take_all() == [0, 1, 4] + + +@pytest.mark.parametrize("num_parts", [1, 2, 3, 30]) +def test_groupby_map_groups_for_pandas(ray_start_regular_shared, num_parts): + df = pd.DataFrame({"A": "a a b".split(), "B": [1, 1, 3], "C": [4, 6, 5]}) + grouped = ray.data.from_pandas(df).repartition(num_parts).groupby("A") + + # Normalize the numeric columns (i.e. B and C) for each group. + mapped = grouped.map_groups( + lambda g: g.apply( + lambda col: col / g[col.name].sum() if col.name in ["B", "C"] else col + ) + ) + + # The function (i.e. the normalization) performed on each group doesn't + # aggregate rows, so we still have 3 rows. + assert mapped.count() == 3 + expected = pd.DataFrame( + {"A": ["a", "a", "b"], "B": [0.5, 0.5, 1.000000], "C": [0.4, 0.6, 1.0]} + ) + assert mapped.to_pandas().equals(expected) + + +@pytest.mark.parametrize("num_parts", [1, 2, 3, 30]) +def test_groupby_map_groups_for_arrow(ray_start_regular_shared, num_parts): + at = pa.Table.from_pydict({"A": "a a b".split(), "B": [1, 1, 3], "C": [4, 6, 5]}) + grouped = ray.data.from_arrow(at).repartition(num_parts).groupby("A") + + # Normalize the numeric columns (i.e. B and C) for each group. 
+ def normalize(at: pa.Table): + r = at.select("A") + sb = pa.compute.sum(at.column("B")).cast(pa.float64()) + r = r.append_column("B", pa.compute.divide(at.column("B"), sb)) + sc = pa.compute.sum(at.column("C")).cast(pa.float64()) + r = r.append_column("C", pa.compute.divide(at.column("C"), sc)) + return r + + mapped = grouped.map_groups(normalize, batch_format="pyarrow") + + # The function (i.e. the normalization) performed on each group doesn't + # aggregate rows, so we still have 3 rows. + assert mapped.count() == 3 + expected = pa.Table.from_pydict( + {"A": ["a", "a", "b"], "B": [0.5, 0.5, 1], "C": [0.4, 0.6, 1]} + ) + result = pa.Table.from_pandas(mapped.to_pandas()) + assert result.equals(expected) + + +def test_groupby_map_groups_for_numpy(ray_start_regular_shared): + ds = ray.data.from_items( + [ + {"group": 1, "value": 1}, + {"group": 1, "value": 2}, + {"group": 2, "value": 3}, + {"group": 2, "value": 4}, + ] + ) + + def func(group): + # Test output type is NumPy format. + return {"group": group["group"] + 1, "value": group["value"] + 1} + + ds = ds.groupby("group").map_groups(func, batch_format="numpy") + expected = pa.Table.from_pydict({"group": [2, 2, 3, 3], "value": [2, 3, 4, 5]}) + result = pa.Table.from_pandas(ds.to_pandas()) + assert result.equals(expected) + + +def test_groupby_map_groups_with_different_types(ray_start_regular_shared): + ds = ray.data.from_items( + [ + {"group": 1, "value": 1}, + {"group": 1, "value": 2}, + {"group": 2, "value": 3}, + {"group": 2, "value": 4}, + ] + ) + + def func(group): + # Test output type is Python list, different from input type. 
+ return [group["value"][0]] + + ds = ds.groupby("group").map_groups(func) + assert sorted(ds.take()) == [1, 3] + + +@pytest.mark.parametrize("num_parts", [1, 30]) +def test_groupby_simple_min(ray_start_regular_shared, num_parts): + # Test built-in min aggregation + seed = int(time.time()) + print(f"Seeding RNG for test_groupby_simple_min with: {seed}") + random.seed(seed) + xs = list(range(100)) + random.shuffle(xs) + agg_ds = ( + ray.data.from_items(xs).repartition(num_parts).groupby(lambda x: x % 3).min() + ) + assert agg_ds.count() == 3 + assert agg_ds.sort(key=lambda r: r[0]).take(3) == [(0, 0), (1, 1), (2, 2)] + + # Test built-in min aggregation with nans + nan_grouped_ds = ( + ray.data.from_items(xs + [None]) + .repartition(num_parts) + .groupby(lambda x: int(x or 0) % 3) + ) + nan_agg_ds = nan_grouped_ds.min() + assert nan_agg_ds.count() == 3 + assert nan_agg_ds.sort(key=lambda r: r[0]).take(3) == [(0, 0), (1, 1), (2, 2)] + # Test ignore_nulls=False + nan_agg_ds = nan_grouped_ds.min(ignore_nulls=False) + assert nan_agg_ds.count() == 3 + assert nan_agg_ds.sort(key=lambda r: r[0]).take(3) == [(0, None), (1, 1), (2, 2)] + # Test all nans + nan_agg_ds = ( + ray.data.from_items([None] * len(xs)) + .repartition(num_parts) + .groupby(lambda x: 0) + .min() + ) + assert nan_agg_ds.count() == 1 + assert nan_agg_ds.sort(key=lambda r: r[0]).take(1) == [(0, None)] + + # Test built-in global min aggregation + assert ray.data.from_items(xs).repartition(num_parts).min() == 0 + assert ray.data.range(10).filter(lambda r: r > 10).min() is None + + # Test built-in global min aggregation with nans + nan_ds = ray.data.from_items(xs + [None]).repartition(num_parts) + assert nan_ds.min() == 0 + # Test ignore_nulls=False + assert nan_ds.min(ignore_nulls=False) is None + # Test all nans + nan_ds = ray.data.from_items([None] * len(xs)).repartition(num_parts) + assert nan_ds.min() is None + + +@pytest.mark.parametrize("num_parts", [1, 30]) +def 
test_groupby_simple_max(ray_start_regular_shared, num_parts): + # Test built-in max aggregation + seed = int(time.time()) + print(f"Seeding RNG for test_groupby_simple_max with: {seed}") + random.seed(seed) + xs = list(range(100)) + random.shuffle(xs) + agg_ds = ( + ray.data.from_items(xs).repartition(num_parts).groupby(lambda x: x % 3).max() + ) + assert agg_ds.count() == 3 + assert agg_ds.sort(key=lambda r: r[0]).take(3) == [(0, 99), (1, 97), (2, 98)] + + # Test built-in max aggregation with nans + nan_grouped_ds = ( + ray.data.from_items(xs + [None]) + .repartition(num_parts) + .groupby(lambda x: int(x or 0) % 3) + ) + nan_agg_ds = nan_grouped_ds.max() + assert nan_agg_ds.count() == 3 + assert nan_agg_ds.sort(key=lambda r: r[0]).take(3) == [(0, 99), (1, 97), (2, 98)] + # Test ignore_nulls=False + nan_agg_ds = nan_grouped_ds.max(ignore_nulls=False) + assert nan_agg_ds.count() == 3 + assert nan_agg_ds.sort(key=lambda r: r[0]).take(3) == [(0, None), (1, 97), (2, 98)] + # Test all nans + nan_agg_ds = ( + ray.data.from_items([None] * len(xs)) + .repartition(num_parts) + .groupby(lambda x: 0) + .max() + ) + assert nan_agg_ds.count() == 1 + assert nan_agg_ds.sort(key=lambda r: r[0]).take(1) == [(0, None)] + + # Test built-in global max aggregation + assert ray.data.from_items(xs).repartition(num_parts).max() == 99 + assert ray.data.range(10).filter(lambda r: r > 10).max() is None + + # Test built-in global max aggregation with nans + nan_ds = ray.data.from_items(xs + [None]).repartition(num_parts) + assert nan_ds.max() == 99 + # Test ignore_nulls=False + assert nan_ds.max(ignore_nulls=False) is None + # Test all nans + nan_ds = ray.data.from_items([None] * len(xs)).repartition(num_parts) + assert nan_ds.max() is None + + +@pytest.mark.parametrize("num_parts", [1, 30]) +def test_groupby_simple_mean(ray_start_regular_shared, num_parts): + # Test built-in mean aggregation + seed = int(time.time()) + print(f"Seeding RNG for test_groupby_simple_mean with: {seed}") + 
random.seed(seed) + xs = list(range(100)) + random.shuffle(xs) + agg_ds = ( + ray.data.from_items(xs).repartition(num_parts).groupby(lambda x: x % 3).mean() + ) + assert agg_ds.count() == 3 + assert agg_ds.sort(key=lambda r: r[0]).take(3) == [(0, 49.5), (1, 49.0), (2, 50.0)] + + # Test built-in mean aggregation with nans + nan_grouped_ds = ( + ray.data.from_items(xs + [None]) + .repartition(num_parts) + .groupby(lambda x: int(x or 0) % 3) + ) + nan_agg_ds = nan_grouped_ds.mean() + assert nan_agg_ds.count() == 3 + assert nan_agg_ds.sort(key=lambda r: r[0]).take(3) == [ + (0, 49.5), + (1, 49.0), + (2, 50.0), + ] + # Test ignore_nulls=False + nan_agg_ds = nan_grouped_ds.mean(ignore_nulls=False) + assert nan_agg_ds.count() == 3 + assert nan_agg_ds.sort(key=lambda r: r[0]).take(3) == [ + (0, None), + (1, 49.0), + (2, 50.0), + ] + # Test all nans + nan_agg_ds = ( + ray.data.from_items([None] * len(xs)) + .repartition(num_parts) + .groupby(lambda x: 0) + .mean() + ) + assert nan_agg_ds.count() == 1 + assert nan_agg_ds.sort(key=lambda r: r[0]).take(1) == [(0, None)] + + # Test built-in global mean aggregation + assert ray.data.from_items(xs).repartition(num_parts).mean() == 49.5 + # Test empty dataset + assert ray.data.range(10).filter(lambda r: r > 10).mean() is None + + # Test built-in global mean aggregation with nans + nan_ds = ray.data.from_items(xs + [None]).repartition(num_parts) + assert nan_ds.mean() == 49.5 + # Test ignore_nulls=False + assert nan_ds.mean(ignore_nulls=False) is None + # Test all nans + nan_ds = ray.data.from_items([None] * len(xs)).repartition(num_parts) + assert nan_ds.mean() is None + + +@pytest.mark.parametrize("num_parts", [1, 30]) +def test_groupby_simple_std(ray_start_regular_shared, num_parts): + # Test built-in std aggregation + seed = int(time.time()) + print(f"Seeding RNG for test_groupby_simple_std with: {seed}") + random.seed(seed) + xs = list(range(100)) + random.shuffle(xs) + agg_ds = ( + 
ray.data.from_items(xs).repartition(num_parts).groupby(lambda x: x % 3).std() + ) + assert agg_ds.count() == 3 + df = pd.DataFrame({"A": [x % 3 for x in xs], "B": xs}) + expected = df.groupby("A")["B"].std() + result = agg_ds.sort(key=lambda r: r[0]).take(3) + groups, stds = zip(*result) + result_df = pd.DataFrame({"A": list(groups), "B": list(stds)}) + result_df = result_df.set_index("A") + pd.testing.assert_series_equal(result_df["B"], expected) + # ddof of 0 + agg_ds = ( + ray.data.from_items(xs) + .repartition(num_parts) + .groupby(lambda x: x % 3) + .std(ddof=0) + ) + assert agg_ds.count() == 3 + df = pd.DataFrame({"A": [x % 3 for x in xs], "B": xs}) + expected = df.groupby("A")["B"].std(ddof=0) + result = agg_ds.sort(key=lambda r: r[0]).take(3) + groups, stds = zip(*result) + result_df = pd.DataFrame({"A": list(groups), "B": list(stds)}) + result_df = result_df.set_index("A") + pd.testing.assert_series_equal(result_df["B"], expected) + + # Test built-in std aggregation with nans + nan_grouped_ds = ( + ray.data.from_items(xs + [None]) + .repartition(num_parts) + .groupby(lambda x: int(x or 0) % 3) + ) + nan_agg_ds = nan_grouped_ds.std() + assert nan_agg_ds.count() == 3 + nan_df = pd.DataFrame({"A": [x % 3 for x in xs] + [0], "B": xs + [None]}) + expected = nan_df.groupby("A")["B"].std() + result = nan_agg_ds.sort(key=lambda r: r[0]).take(3) + groups, stds = zip(*result) + result_df = pd.DataFrame({"A": list(groups), "B": list(stds)}) + result_df = result_df.set_index("A") + pd.testing.assert_series_equal(result_df["B"], expected) + # Test ignore_nulls=False + nan_agg_ds = nan_grouped_ds.std(ignore_nulls=False) + assert nan_agg_ds.count() == 3 + expected = nan_df.groupby("A")["B"].std() + expected[0] = None + result = nan_agg_ds.sort(key=lambda r: r[0]).take(3) + groups, stds = zip(*result) + result_df = pd.DataFrame({"A": list(groups), "B": list(stds)}) + result_df = result_df.set_index("A") + pd.testing.assert_series_equal(result_df["B"], expected) + # Test 
all nans + nan_agg_ds = ( + ray.data.from_items([None] * len(xs)) + .repartition(num_parts) + .groupby(lambda x: 0) + .std(ignore_nulls=False) + ) + assert nan_agg_ds.count() == 1 + expected = pd.Series([None], name="B") + expected.index.rename("A", inplace=True) + result = nan_agg_ds.sort(key=lambda r: r[0]).take(1) + groups, stds = zip(*result) + result_df = pd.DataFrame({"A": list(groups), "B": list(stds)}) + result_df = result_df.set_index("A") + pd.testing.assert_series_equal(result_df["B"], expected) + + # Test built-in global std aggregation + assert math.isclose( + ray.data.from_items(xs).repartition(num_parts).std(), pd.Series(xs).std() + ) + # ddof of 0 + assert math.isclose( + ray.data.from_items(xs).repartition(num_parts).std(ddof=0), + pd.Series(xs).std(ddof=0), + ) + + # Test empty dataset + assert ray.data.from_items([]).std() is None + # Test edge cases + assert ray.data.from_items([3]).std() == 0 + + # Test built-in global std aggregation with nans + nan_ds = ray.data.from_items(xs + [None]).repartition(num_parts) + assert math.isclose(nan_ds.std(), pd.Series(xs).std()) + # Test ignore_nulls=False + assert nan_ds.std(ignore_nulls=False) is None + # Test all nans + nan_ds = ray.data.from_items([None] * len(xs)).repartition(num_parts) + assert nan_ds.std() is None + + +@pytest.mark.parametrize("num_parts", [1, 30]) +def test_groupby_simple_multilambda(ray_start_regular_shared, num_parts): + # Test built-in mean aggregation + seed = int(time.time()) + print(f"Seeding RNG for test_groupby_simple_multilambda with: {seed}") + random.seed(seed) + xs = list(range(100)) + random.shuffle(xs) + agg_ds = ( + ray.data.from_items([[x, 2 * x] for x in xs]) + .repartition(num_parts) + .groupby(lambda x: x[0] % 3) + .mean([lambda x: x[0], lambda x: x[1]]) + ) + assert agg_ds.count() == 3 + assert agg_ds.sort(key=lambda r: r[0]).take(3) == [ + (0, 49.5, 99.0), + (1, 49.0, 98.0), + (2, 50.0, 100.0), + ] + # Test built-in global mean aggregation + assert 
ray.data.from_items([[x, 2 * x] for x in xs]).repartition(num_parts).mean( + [lambda x: x[0], lambda x: x[1]] + ) == (49.5, 99.0) + assert ray.data.from_items([[x, 2 * x] for x in range(10)]).filter( + lambda r: r[0] > 10 + ).mean([lambda x: x[0], lambda x: x[1]]) == (None, None) + + +@pytest.mark.parametrize("num_parts", [1, 30]) +def test_groupby_simple_multi_agg(ray_start_regular_shared, num_parts): + seed = int(time.time()) + print(f"Seeding RNG for test_groupby_simple_multi_agg with: {seed}") + random.seed(seed) + xs = list(range(100)) + random.shuffle(xs) + df = pd.DataFrame({"A": [x % 3 for x in xs], "B": xs}) + agg_ds = ( + ray.data.from_items(xs) + .repartition(num_parts) + .groupby(lambda x: x % 3) + .aggregate( + Count(), + Sum(), + Min(), + Max(), + Mean(), + Std(), + ) + ) + assert agg_ds.count() == 3 + result = agg_ds.sort(key=lambda r: r[0]).take(3) + groups, counts, sums, mins, maxs, means, stds = zip(*result) + agg_df = pd.DataFrame( + { + "groups": list(groups), + "count": list(counts), + "sum": list(sums), + "min": list(mins), + "max": list(maxs), + "mean": list(means), + "std": list(stds), + } + ) + agg_df = agg_df.set_index("groups") + df = pd.DataFrame({"groups": [x % 3 for x in xs], "B": xs}) + expected_grouped = df.groupby("groups")["B"] + np.testing.assert_array_equal(agg_df["count"].to_numpy(), [34, 33, 33]) + for agg in ["sum", "min", "max", "mean", "std"]: + result = agg_df[agg].to_numpy() + expected = getattr(expected_grouped, agg)().to_numpy() + if agg == "std": + np.testing.assert_array_almost_equal(result, expected) + else: + np.testing.assert_array_equal(result, expected) + # Test built-in global multi-aggregation + result_row = ( + ray.data.from_items(xs) + .repartition(num_parts) + .aggregate( + Sum(), + Min(), + Max(), + Mean(), + Std(), + ) + ) + series = pd.Series(xs) + for idx, agg in enumerate(["sum", "min", "max", "mean", "std"]): + result = result_row[idx] + expected = getattr(series, agg)() + if agg == "std": + assert 
math.isclose(result, expected) + else: + assert result == expected + + +def test_random_block_order_schema(ray_start_regular_shared): + df = pd.DataFrame({"a": np.random.rand(10), "b": np.random.rand(10)}) + ds = ray.data.from_pandas(df).randomize_block_order() + ds.schema().names == ["a", "b"] + + +def test_random_block_order(ray_start_regular_shared, restore_dataset_context): + ctx = DatasetContext.get_current() + ctx.execution_options.preserve_order = True + + # Test BlockList.randomize_block_order. + ds = ray.data.range(12).repartition(4) + ds = ds.randomize_block_order(seed=0) + + results = ds.take() + expected = [6, 7, 8, 0, 1, 2, 3, 4, 5, 9, 10, 11] + assert results == expected + + # Test LazyBlockList.randomize_block_order. + context = DatasetContext.get_current() + try: + original_optimize_fuse_read_stages = context.optimize_fuse_read_stages + context.optimize_fuse_read_stages = False + + lazy_blocklist_ds = ray.data.range(12, parallelism=4) + lazy_blocklist_ds = lazy_blocklist_ds.randomize_block_order(seed=0) + lazy_blocklist_results = lazy_blocklist_ds.take() + lazy_blocklist_expected = [6, 7, 8, 0, 1, 2, 3, 4, 5, 9, 10, 11] + assert lazy_blocklist_results == lazy_blocklist_expected + finally: + context.optimize_fuse_read_stages = original_optimize_fuse_read_stages + + +# NOTE: All tests above share a Ray cluster, while the tests below do not. These +# tests should only be carefully reordered to retain this invariant! 
+ + +@pytest.mark.parametrize("pipelined", [False, True]) +def test_random_shuffle(shutdown_only, pipelined, use_push_based_shuffle): + def range(n, parallelism=200): + ds = ray.data.range(n, parallelism=parallelism) + if pipelined: + pipe = ds.repeat(2) + pipe.random_shuffle = pipe.random_shuffle_each_window + return pipe + else: + return ds + + r1 = range(100).random_shuffle().take(999) + r2 = range(100).random_shuffle().take(999) + assert r1 != r2, (r1, r2) + + r1 = range(100, parallelism=1).random_shuffle().take(999) + r2 = range(100, parallelism=1).random_shuffle().take(999) + assert r1 != r2, (r1, r2) + + # TODO(swang): fix this + if not use_push_based_shuffle: + if not pipelined: + assert range(100).random_shuffle(num_blocks=1).num_blocks() == 1 + r1 = range(100).random_shuffle(num_blocks=1).take(999) + r2 = range(100).random_shuffle(num_blocks=1).take(999) + assert r1 != r2, (r1, r2) + + r0 = range(100, parallelism=5).take(999) + r1 = range(100, parallelism=5).random_shuffle(seed=0).take(999) + r2 = range(100, parallelism=5).random_shuffle(seed=0).take(999) + r3 = range(100, parallelism=5).random_shuffle(seed=12345).take(999) + assert r1 == r2, (r1, r2) + assert r1 != r0, (r1, r0) + assert r1 != r3, (r1, r3) + + r0 = ray.data.range_table(100, parallelism=5).take(999) + r1 = ray.data.range_table(100, parallelism=5).random_shuffle(seed=0).take(999) + r2 = ray.data.range_table(100, parallelism=5).random_shuffle(seed=0).take(999) + assert r1 == r2, (r1, r2) + assert r1 != r0, (r1, r0) + + # Test move. + ds = range(100, parallelism=2) + r1 = ds.random_shuffle().take(999) + if pipelined: + with pytest.raises(RuntimeError): + ds = ds.map(lambda x: x).take(999) + else: + ds = ds.map(lambda x: x).take(999) + r2 = range(100).random_shuffle().take(999) + assert r1 != r2, (r1, r2) + + # Test empty dataset. 
+ ds = ray.data.from_items([]) + r1 = ds.random_shuffle() + assert r1.count() == 0 + assert r1.take() == ds.take() + + +def test_random_shuffle_check_random(shutdown_only): + # Rows from the same input should not be contiguous in the final output. + num_files = 10 + num_rows = 100 + items = [i for i in range(num_files) for _ in range(num_rows)] + ds = ray.data.from_items(items, parallelism=num_files) + out = ds.random_shuffle().take(num_files * num_rows) + for i in range(num_files): + part = out[i * num_rows : (i + 1) * num_rows] + seen = set() + num_contiguous = 1 + prev = -1 + for x in part: + if prev != x: + prev = x + num_contiguous = 1 + else: + num_contiguous += 1 + assert num_contiguous < ( + num_rows / num_files + ), f"{part} contains too many contiguous rows from same input block" + seen.add(x) + assert ( + set(range(num_files)) == seen + ), f"{part} does not contain elements from all input blocks" + + # Rows from the same input should appear in a different order in the + # output. + num_files = 10 + num_rows = 100 + items = [j for i in range(num_files) for j in range(num_rows)] + ds = ray.data.from_items(items, parallelism=num_files) + out = ds.random_shuffle().take(num_files * num_rows) + for i in range(num_files): + part = out[i * num_rows : (i + 1) * num_rows] + num_increasing = 0 + prev = -1 + for x in part: + if x >= prev: + num_increasing += 1 + else: + assert num_increasing < ( + num_rows / num_files + ), f"{part} contains non-shuffled rows from input blocks" + num_increasing = 0 + prev = x + + +def test_random_shuffle_with_custom_resource(ray_start_cluster): + cluster = ray_start_cluster + # Create two nodes which have different custom resources. + cluster.add_node( + resources={"foo": 100}, + num_cpus=1, + ) + cluster.add_node(resources={"bar": 100}, num_cpus=1) + + ray.init(cluster.address) + + # Run dataset in "bar" nodes. 
+ ds = ray.data.read_parquet( + "example://parquet_images_mini", + parallelism=2, + ray_remote_args={"resources": {"bar": 1}}, + ) + ds = ds.random_shuffle(resources={"bar": 1}).fully_executed() + assert "1 nodes used" in ds.stats() + assert "2 nodes used" not in ds.stats() + + +def test_random_shuffle_spread(ray_start_cluster, use_push_based_shuffle): + cluster = ray_start_cluster + cluster.add_node( + resources={"bar:1": 100}, + num_cpus=10, + _system_config={"max_direct_call_object_size": 0}, + ) + cluster.add_node(resources={"bar:2": 100}, num_cpus=10) + cluster.add_node(resources={"bar:3": 100}, num_cpus=0) + + ray.init(cluster.address) + + @ray.remote + def get_node_id(): + return ray.get_runtime_context().get_node_id() + + node1_id = ray.get(get_node_id.options(resources={"bar:1": 1}).remote()) + node2_id = ray.get(get_node_id.options(resources={"bar:2": 1}).remote()) + + ds = ray.data.range(100, parallelism=2).random_shuffle() + blocks = ds.get_internal_block_refs() + ray.wait(blocks, num_returns=len(blocks), fetch_local=False) + location_data = ray.experimental.get_object_locations(blocks) + locations = [] + for block in blocks: + locations.extend(location_data[block]["node_ids"]) + assert "2 nodes used" in ds.stats() + + if not use_push_based_shuffle: + # We don't check this for push-based shuffle since it will try to + # colocate reduce tasks to improve locality. 
+ assert set(locations) == {node1_id, node2_id} + + +if __name__ == "__main__": + import sys + + sys.exit(pytest.main(["-v", __file__])) diff --git a/python/ray/data/tests/test_dataset_consumption.py b/python/ray/data/tests/test_dataset_consumption.py new file mode 100644 index 0000000000000..b3a5bbfd9e058 --- /dev/null +++ b/python/ray/data/tests/test_dataset_consumption.py @@ -0,0 +1,1716 @@ +import logging +import math +import os +import random +import time + +import numpy as np +import pandas as pd +import pyarrow as pa +import pytest +from unittest.mock import patch + +import ray +from ray.data._internal.arrow_block import ArrowRow +from ray.data._internal.block_builder import BlockBuilder +from ray.data._internal.dataset_logger import DatasetLogger +from ray.data._internal.lazy_block_list import LazyBlockList +from ray.data._internal.pandas_block import PandasRow +from ray.data.block import BlockAccessor, BlockMetadata +from ray.data.context import DatasetContext +from ray.data.dataset import Dataset, _sliding_window +from ray.data.datasource.datasource import Datasource, ReadTask +from ray.data.datasource.csv_datasource import CSVDatasource +from ray.data.row import TableRow +from ray.data.tests.conftest import * # noqa +from ray.tests.conftest import * # noqa +from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy + + +def maybe_pipeline(ds, enabled): + if enabled: + return ds.window(blocks_per_window=1) + else: + return ds + + +@pytest.mark.parametrize("pipelined", [False, True]) +def test_avoid_placement_group_capture(shutdown_only, pipelined): + ray.init(num_cpus=2) + + @ray.remote + def run(): + ds0 = ray.data.range(5) + ds = maybe_pipeline(ds0, pipelined) + assert sorted(ds.map(lambda x: x + 1).take()) == [1, 2, 3, 4, 5] + ds = maybe_pipeline(ds0, pipelined) + assert ds.count() == 5 + ds = maybe_pipeline(ds0, pipelined) + assert sorted(ds.iter_rows()) == [0, 1, 2, 3, 4] + + pg = ray.util.placement_group([{"CPU": 1}]) + ray.get( + 
run.options( + scheduling_strategy=PlacementGroupSchedulingStrategy( + placement_group=pg, placement_group_capture_child_tasks=True + ) + ).remote() + ) + + +def test_dataset_lineage_serialization(shutdown_only): + ray.init() + ds = ray.data.range(10) + ds = ds.map(lambda x: x + 1) + ds = ds.map(lambda x: x + 1) + ds = ds.random_shuffle() + epoch = ds._get_epoch() + uuid = ds._get_uuid() + plan_uuid = ds._plan._dataset_uuid + + serialized_ds = ds.serialize_lineage() + # Confirm that the original Dataset was properly copied before clearing/mutating. + in_blocks = ds._plan._in_blocks + # Should not raise. + in_blocks._check_if_cleared() + assert isinstance(in_blocks, LazyBlockList) + assert in_blocks._block_partition_refs[0] is None + + ray.shutdown() + ray.init() + + ds = Dataset.deserialize_lineage(serialized_ds) + # Check Dataset state. + assert ds._get_epoch() == epoch + assert ds._get_uuid() == uuid + assert ds._plan._dataset_uuid == plan_uuid + # Check Dataset content. + assert ds.count() == 10 + assert sorted(ds.take()) == list(range(2, 12)) + + +def test_dataset_lineage_serialization_unsupported(shutdown_only): + ray.init() + # In-memory data sources not supported. + ds = ray.data.from_items(list(range(10))) + ds = ds.map(lambda x: x + 1) + ds = ds.map(lambda x: x + 1) + + with pytest.raises(ValueError): + ds.serialize_lineage() + + # In-memory data source unions not supported. + ds = ray.data.from_items(list(range(10))) + ds1 = ray.data.from_items(list(range(10, 20))) + ds2 = ds.union(ds1) + + with pytest.raises(ValueError): + ds2.serialize_lineage() + + # Post-lazy-read unions not supported. + ds = ray.data.range(10).map(lambda x: x + 1) + ds1 = ray.data.range(20).map(lambda x: 2 * x) + ds2 = ds.union(ds1) + + with pytest.raises(ValueError): + ds2.serialize_lineage() + + # Lazy read unions supported. 
+ ds = ray.data.range(10) + ds1 = ray.data.range(20) + ds2 = ds.union(ds1) + + serialized_ds = ds2.serialize_lineage() + ds3 = Dataset.deserialize_lineage(serialized_ds) + assert ds3.take(30) == list(range(10)) + list(range(20)) + + # Zips not supported. + ds = ray.data.from_items(list(range(10))) + ds1 = ray.data.from_items(list(range(10, 20))) + ds2 = ds.zip(ds1) + + with pytest.raises(ValueError): + ds2.serialize_lineage() + + +@pytest.mark.parametrize("pipelined", [False, True]) +def test_basic(ray_start_regular_shared, pipelined): + ds0 = ray.data.range(5) + ds = maybe_pipeline(ds0, pipelined) + assert sorted(ds.map(lambda x: x + 1).take()) == [1, 2, 3, 4, 5] + ds = maybe_pipeline(ds0, pipelined) + assert ds.count() == 5 + ds = maybe_pipeline(ds0, pipelined) + assert sorted(ds.iter_rows()) == [0, 1, 2, 3, 4] + + +def test_range_table(ray_start_regular_shared): + ds = ray.data.range_table(10, parallelism=10) + assert ds.num_blocks() == 10 + assert ds.count() == 10 + assert ds.take() == [{"value": i} for i in range(10)] + + ds = ray.data.range_table(10, parallelism=2) + assert ds.num_blocks() == 2 + assert ds.count() == 10 + assert ds.take() == [{"value": i} for i in range(10)] + + +def test_empty_dataset(ray_start_regular_shared): + ds = ray.data.range(0) + assert ds.count() == 0 + assert ds.size_bytes() is None + assert ds.schema() is None + + ds = ray.data.range(1) + ds = ds.filter(lambda x: x > 1) + ds.fully_executed() + assert str(ds) == "Dataset(num_blocks=1, num_rows=0, schema=Unknown schema)" + + # Test map on empty dataset. + ds = ray.data.from_items([]) + ds = ds.map(lambda x: x) + ds.fully_executed() + assert ds.count() == 0 + + # Test filter on empty dataset. 
+ ds = ray.data.from_items([]) + ds = ds.filter(lambda: True) + ds.fully_executed() + assert ds.count() == 0 + + +def test_schema(ray_start_regular_shared): + ds = ray.data.range(10, parallelism=10) + ds2 = ray.data.range_table(10, parallelism=10) + ds3 = ds2.repartition(5) + ds3.fully_executed() + ds4 = ds3.map(lambda x: {"a": "hi", "b": 1.0}).limit(5).repartition(1) + ds4.fully_executed() + assert str(ds) == "Dataset(num_blocks=10, num_rows=10, schema=)" + assert str(ds2) == "Dataset(num_blocks=10, num_rows=10, schema={value: int64})" + assert str(ds3) == "Dataset(num_blocks=5, num_rows=10, schema={value: int64})" + assert ( + str(ds4) == "Dataset(num_blocks=1, num_rows=5, schema={a: string, b: double})" + ) + + +def test_schema_lazy(ray_start_regular_shared): + ds = ray.data.range(100, parallelism=10) + # We do not kick off the read task by default. + assert ds._plan._in_blocks._num_computed() == 0 + schema = ds.schema() + assert schema == int + assert ds._plan._in_blocks._num_computed() == 1 + # Fetching the schema should not trigger execution of extra read tasks. + assert ds._plan.execute()._num_computed() == 1 + + +def test_count_lazy(ray_start_regular_shared): + ds = ray.data.range(100, parallelism=10) + # We do not kick off the read task by default. + assert ds._plan._in_blocks._num_computed() == 0 + assert ds.count() == 100 + # Getting number of rows should not trigger execution of any read tasks + # for ray.data.range(), as the number of rows is known beforehand. + assert ds._plan._in_blocks._num_computed() == 0 + + +def test_lazy_loading_exponential_rampup(ray_start_regular_shared): + ds = ray.data.range(100, parallelism=20) + + def check_num_computed(expected): + if ray.data.context.DatasetContext.get_current().use_streaming_executor: + # In streaing executor, ds.take() will not invoke partial execution + # in LazyBlocklist. 
+ assert ds._plan.execute()._num_computed() == 0 + else: + assert ds._plan.execute()._num_computed() == expected + + check_num_computed(0) + assert ds.take(10) == list(range(10)) + check_num_computed(2) + assert ds.take(20) == list(range(20)) + check_num_computed(4) + assert ds.take(30) == list(range(30)) + check_num_computed(8) + assert ds.take(50) == list(range(50)) + check_num_computed(16) + assert ds.take(100) == list(range(100)) + check_num_computed(20) + + +def test_dataset_repr(ray_start_regular_shared): + ds = ray.data.range(10, parallelism=10) + assert repr(ds) == "Dataset(num_blocks=10, num_rows=10, schema=)" + ds = ds.map_batches(lambda x: x) + assert repr(ds) == ( + "MapBatches()\n" + "+- Dataset(num_blocks=10, num_rows=10, schema=)" + ) + ds = ds.filter(lambda x: x > 0) + assert repr(ds) == ( + "Filter\n" + "+- MapBatches()\n" + " +- Dataset(num_blocks=10, num_rows=10, schema=)" + ) + ds = ds.random_shuffle() + assert repr(ds) == ( + "RandomShuffle\n" + "+- Filter\n" + " +- MapBatches()\n" + " +- Dataset(num_blocks=10, num_rows=10, schema=)" + ) + ds.fully_executed() + assert repr(ds) == "Dataset(num_blocks=10, num_rows=9, schema=)" + ds = ds.map_batches(lambda x: x) + assert repr(ds) == ( + "MapBatches()\n" + "+- Dataset(num_blocks=10, num_rows=9, schema=)" + ) + ds1, ds2 = ds.split(2) + assert ( + repr(ds1) + == f"Dataset(num_blocks=5, num_rows={ds1.count()}, schema=)" + ) + assert ( + repr(ds2) + == f"Dataset(num_blocks=5, num_rows={ds2.count()}, schema=)" + ) + ds3 = ds1.union(ds2) + assert repr(ds3) == "Dataset(num_blocks=10, num_rows=9, schema=)" + ds = ds.zip(ds3) + assert repr(ds) == ( + "Zip\n" "+- Dataset(num_blocks=10, num_rows=9, schema=)" + ) + + def my_dummy_fn(x): + return x + + ds = ray.data.range(10, parallelism=10) + ds = ds.map_batches(my_dummy_fn) + assert repr(ds) == ( + "MapBatches(my_dummy_fn)\n" + "+- Dataset(num_blocks=10, num_rows=10, schema=)" + ) + + +@pytest.mark.parametrize("lazy", [False, True]) +def 
test_limit(ray_start_regular_shared, lazy): + ds = ray.data.range(100, parallelism=20) + if not lazy: + ds = ds.fully_executed() + for i in range(100): + assert ds.limit(i).take(200) == list(range(i)) + + +# NOTE: We test outside the power-of-2 range in order to ensure that we're not reading +# redundant files due to exponential ramp-up. +@pytest.mark.parametrize("limit,expected", [(10, 1), (20, 2), (30, 3), (60, 6)]) +def test_limit_no_redundant_read(ray_start_regular_shared, limit, expected): + # Test that dataset truncation eliminates redundant reads. + @ray.remote + class Counter: + def __init__(self): + self.count = 0 + + def increment(self): + self.count += 1 + + def get(self): + return self.count + + def reset(self): + self.count = 0 + + class CountingRangeDatasource(Datasource): + def __init__(self): + self.counter = Counter.remote() + + def prepare_read(self, parallelism, n): + def range_(i): + ray.get(self.counter.increment.remote()) + return [list(range(parallelism * i, parallelism * i + n))] + + return [ + ReadTask( + lambda i=i: range_(i), + BlockMetadata( + num_rows=n, + size_bytes=None, + schema=None, + input_files=None, + exec_stats=None, + ), + ) + for i in range(parallelism) + ] + + source = CountingRangeDatasource() + + ds = ray.data.read_datasource( + source, + parallelism=10, + n=10, + ) + ds2 = ds.limit(limit) + # Check content. + assert ds2.take(limit) == list(range(limit)) + # Check number of read tasks launched. + assert ray.get(source.counter.get.remote()) == expected + + +def test_limit_no_num_row_info(ray_start_regular_shared): + # Test that datasources with no number-of-rows metadata available are still able to + # be truncated, falling back to kicking off all read tasks. 
+ class DumbOnesDatasource(Datasource): + def prepare_read(self, parallelism, n): + return parallelism * [ + ReadTask( + lambda: [[1] * n], + BlockMetadata( + num_rows=None, + size_bytes=None, + schema=None, + input_files=None, + exec_stats=None, + ), + ) + ] + + ds = ray.data.read_datasource(DumbOnesDatasource(), parallelism=10, n=10) + for i in range(1, 100): + assert ds.limit(i).take(100) == [1] * i + + +def test_convert_types(ray_start_regular_shared): + plain_ds = ray.data.range(1) + arrow_ds = plain_ds.map(lambda x: {"a": x}) + assert arrow_ds.take() == [{"a": 0}] + assert "ArrowRow" in arrow_ds.map(lambda x: str(type(x))).take()[0] + + arrow_ds = ray.data.range_table(1) + assert arrow_ds.map(lambda x: "plain_{}".format(x["value"])).take() == ["plain_0"] + # In streaming, we set batch_format to "default" (because calling + # ds.dataset_format() will still invoke bulk execution and we want + # to avoid that). As a result, it's receiving PandasRow (the default + # batch format), which unwraps [0] to plain 0. + if ray.data.context.DatasetContext.get_current().use_streaming_executor: + assert arrow_ds.map(lambda x: {"a": (x["value"],)}).take() == [{"a": 0}] + else: + assert arrow_ds.map(lambda x: {"a": (x["value"],)}).take() == [{"a": [0]}] + + +def test_from_items(ray_start_regular_shared): + ds = ray.data.from_items(["hello", "world"]) + assert ds.take() == ["hello", "world"] + + +@pytest.mark.parametrize("parallelism", list(range(1, 21))) +def test_from_items_parallelism(ray_start_regular_shared, parallelism): + # Test that specifying parallelism yields the expected number of blocks. + n = 20 + records = [{"a": i} for i in range(n)] + ds = ray.data.from_items(records, parallelism=parallelism) + out = ds.take_all() + assert out == records + assert ds.num_blocks() == parallelism + + +def test_from_items_parallelism_truncated(ray_start_regular_shared): + # Test that specifying parallelism greater than the number of items is truncated to + # the number of items. 
+ n = 10 + parallelism = 20 + records = [{"a": i} for i in range(n)] + ds = ray.data.from_items(records, parallelism=parallelism) + out = ds.take_all() + assert out == records + assert ds.num_blocks() == n + + +def test_take_all(ray_start_regular_shared): + assert ray.data.range(5).take_all() == [0, 1, 2, 3, 4] + + with pytest.raises(ValueError): + assert ray.data.range(5).take_all(4) + + +def test_sliding_window(): + arr = list(range(10)) + + # Test all windows over this iterable. + window_sizes = list(range(1, len(arr) + 1)) + for window_size in window_sizes: + windows = list(_sliding_window(arr, window_size)) + assert len(windows) == len(arr) - window_size + 1 + assert all(len(window) == window_size for window in windows) + assert all( + list(window) == arr[i : i + window_size] for i, window in enumerate(windows) + ) + + # Test window size larger than iterable length. + windows = list(_sliding_window(arr, 15)) + assert len(windows) == 1 + assert list(windows[0]) == arr + + +def test_iter_rows(ray_start_regular_shared): + # Test simple rows. + n = 10 + ds = ray.data.range(n) + for row, k in zip(ds.iter_rows(), range(n)): + assert row == k + + # Test tabular rows. + t1 = pa.Table.from_pydict({"one": [1, 2, 3], "two": [2, 3, 4]}) + t2 = pa.Table.from_pydict({"one": [4, 5, 6], "two": [5, 6, 7]}) + t3 = pa.Table.from_pydict({"one": [7, 8, 9], "two": [8, 9, 10]}) + t4 = pa.Table.from_pydict({"one": [10, 11, 12], "two": [11, 12, 13]}) + ts = [t1, t2, t3, t4] + t = pa.concat_tables(ts) + ds = ray.data.from_arrow(ts) + + def to_pylist(table): + pydict = table.to_pydict() + names = table.schema.names + pylist = [ + {column: pydict[column][row] for column in names} + for row in range(table.num_rows) + ] + return pylist + + # Default ArrowRows. 
+ for row, t_row in zip(ds.iter_rows(), to_pylist(t)): + assert isinstance(row, TableRow) + # In streaming, we set batch_format to "default" because calling + # ds.dataset_format() will still invoke bulk execution and we want + # to avoid that. As a result, it's receiving PandasRow (the default + # batch format). + if ray.data.context.DatasetContext.get_current().use_streaming_executor: + assert isinstance(row, PandasRow) + else: + assert isinstance(row, ArrowRow) + assert row == t_row + + # PandasRows after conversion. + pandas_ds = ds.map_batches(lambda x: x, batch_format="pandas") + df = t.to_pandas() + for row, (index, df_row) in zip(pandas_ds.iter_rows(), df.iterrows()): + assert isinstance(row, TableRow) + assert isinstance(row, PandasRow) + assert row == df_row.to_dict() + + # Prefetch. + for row, t_row in zip(ds.iter_rows(prefetch_blocks=1), to_pylist(t)): + assert isinstance(row, TableRow) + # In streaming, we set batch_format to "default" because calling + # ds.dataset_format() will still invoke bulk execution and we want + # to avoid that. As a result, it's receiving PandasRow (the default + # batch format). + if ray.data.context.DatasetContext.get_current().use_streaming_executor: + assert isinstance(row, PandasRow) + else: + assert isinstance(row, ArrowRow) + assert row == t_row + + +def test_iter_batches_basic(ray_start_regular_shared): + df1 = pd.DataFrame({"one": [1, 2, 3], "two": [2, 3, 4]}) + df2 = pd.DataFrame({"one": [4, 5, 6], "two": [5, 6, 7]}) + df3 = pd.DataFrame({"one": [7, 8, 9], "two": [8, 9, 10]}) + df4 = pd.DataFrame({"one": [10, 11, 12], "two": [11, 12, 13]}) + dfs = [df1, df2, df3, df4] + ds = ray.data.from_pandas(dfs) + + # Default. + for batch, df in zip(ds.iter_batches(batch_size=None, batch_format="pandas"), dfs): + assert isinstance(batch, pd.DataFrame) + assert batch.equals(df) + + # pyarrow.Table format. 
+ for batch, df in zip(ds.iter_batches(batch_size=None, batch_format="pyarrow"), dfs): + assert isinstance(batch, pa.Table) + assert batch.equals(pa.Table.from_pandas(df)) + + # NumPy format. + for batch, df in zip(ds.iter_batches(batch_size=None, batch_format="numpy"), dfs): + assert isinstance(batch, dict) + assert list(batch.keys()) == ["one", "two"] + assert all(isinstance(col, np.ndarray) for col in batch.values()) + pd.testing.assert_frame_equal(pd.DataFrame(batch), df) + + # Numpy format (single column). + ds2 = ds.select_columns(["one"]) + for batch, df in zip(ds2.iter_batches(batch_size=None, batch_format="numpy"), dfs): + assert isinstance(batch, dict) + assert list(batch.keys()) == ["one"] + assert all(isinstance(col, np.ndarray) for col in batch.values()) + pd.testing.assert_frame_equal(pd.DataFrame(batch), df[["one"]]) + + # Test NumPy format on Arrow blocks. + ds2 = ds.map_batches(lambda b: b, batch_size=None, batch_format="pyarrow") + for batch, df in zip(ds2.iter_batches(batch_size=None, batch_format="numpy"), dfs): + assert isinstance(batch, dict) + assert list(batch.keys()) == ["one", "two"] + assert all(isinstance(col, np.ndarray) for col in batch.values()) + pd.testing.assert_frame_equal(pd.DataFrame(batch), df) + + # Test NumPy format on Arrow blocks (single column). + ds3 = ds2.select_columns(["one"]) + for batch, df in zip(ds3.iter_batches(batch_size=None, batch_format="numpy"), dfs): + assert isinstance(batch, dict) + assert list(batch.keys()) == ["one"] + assert all(isinstance(col, np.ndarray) for col in batch.values()) + pd.testing.assert_frame_equal(pd.DataFrame(batch), df[["one"]]) + + # Native format (deprecated). + for batch, df in zip(ds.iter_batches(batch_size=None, batch_format="native"), dfs): + assert BlockAccessor.for_block(batch).to_pandas().equals(df) + + # Default format. 
+ for batch, df in zip(ds.iter_batches(batch_size=None, batch_format="default"), dfs): + assert BlockAccessor.for_block(batch).to_pandas().equals(df) + + # Batch size. + batch_size = 2 + batches = list(ds.iter_batches(batch_size=batch_size, batch_format="pandas")) + assert all(len(batch) == batch_size for batch in batches) + assert len(batches) == math.ceil( + (len(df1) + len(df2) + len(df3) + len(df4)) / batch_size + ) + assert pd.concat(batches, ignore_index=True).equals( + pd.concat(dfs, ignore_index=True) + ) + + # Batch size larger than block. + batch_size = 4 + batches = list(ds.iter_batches(batch_size=batch_size, batch_format="pandas")) + assert all(len(batch) == batch_size for batch in batches) + assert len(batches) == math.ceil( + (len(df1) + len(df2) + len(df3) + len(df4)) / batch_size + ) + assert pd.concat(batches, ignore_index=True).equals( + pd.concat(dfs, ignore_index=True) + ) + + # Batch size larger than dataset. + batch_size = 15 + batches = list(ds.iter_batches(batch_size=batch_size, batch_format="pandas")) + assert all(len(batch) == ds.count() for batch in batches) + assert len(batches) == 1 + assert pd.concat(batches, ignore_index=True).equals( + pd.concat(dfs, ignore_index=True) + ) + + # Batch size drop partial. + batch_size = 5 + batches = list( + ds.iter_batches(batch_size=batch_size, drop_last=True, batch_format="pandas") + ) + assert all(len(batch) == batch_size for batch in batches) + assert len(batches) == (len(df1) + len(df2) + len(df3) + len(df4)) // batch_size + assert pd.concat(batches, ignore_index=True).equals( + pd.concat(dfs, ignore_index=True)[:10] + ) + + # Batch size don't drop partial. 
+ batch_size = 5 + batches = list( + ds.iter_batches(batch_size=batch_size, drop_last=False, batch_format="pandas") + ) + assert all(len(batch) == batch_size for batch in batches[:-1]) + assert len(batches[-1]) == (len(df1) + len(df2) + len(df3) + len(df4)) % batch_size + assert len(batches) == math.ceil( + (len(df1) + len(df2) + len(df3) + len(df4)) / batch_size + ) + assert pd.concat(batches, ignore_index=True).equals( + pd.concat(dfs, ignore_index=True) + ) + + # Prefetch. + batches = list( + ds.iter_batches(prefetch_blocks=1, batch_size=None, batch_format="pandas") + ) + assert len(batches) == len(dfs) + for batch, df in zip(batches, dfs): + assert isinstance(batch, pd.DataFrame) + assert batch.equals(df) + + batch_size = 2 + batches = list( + ds.iter_batches(prefetch_blocks=2, batch_size=batch_size, batch_format="pandas") + ) + assert all(len(batch) == batch_size for batch in batches) + assert len(batches) == math.ceil( + (len(df1) + len(df2) + len(df3) + len(df4)) / batch_size + ) + assert pd.concat(batches, ignore_index=True).equals( + pd.concat(dfs, ignore_index=True) + ) + + # Prefetch more than number of blocks. + batches = list( + ds.iter_batches( + prefetch_blocks=len(dfs), batch_size=None, batch_format="pandas" + ) + ) + assert len(batches) == len(dfs) + for batch, df in zip(batches, dfs): + assert isinstance(batch, pd.DataFrame) + assert batch.equals(df) + + # Prefetch with ray.wait. 
+ context = DatasetContext.get_current() + old_config = context.actor_prefetcher_enabled + try: + context.actor_prefetcher_enabled = False + batches = list( + ds.iter_batches(prefetch_blocks=1, batch_size=None, batch_format="pandas") + ) + assert len(batches) == len(dfs) + for batch, df in zip(batches, dfs): + assert isinstance(batch, pd.DataFrame) + assert batch.equals(df) + finally: + context.actor_prefetcher_enabled = old_config + + +def test_iter_batches_empty_block(ray_start_regular_shared): + ds = ray.data.range(1).repartition(10) + assert list(ds.iter_batches(batch_size=None)) == [[0]] + assert list(ds.iter_batches(batch_size=1, local_shuffle_buffer_size=1)) == [[0]] + + +@pytest.mark.parametrize("pipelined", [False, True]) +@pytest.mark.parametrize("ds_format", ["arrow", "pandas", "simple"]) +def test_iter_batches_local_shuffle(shutdown_only, pipelined, ds_format): + # Input validation. + # Batch size must be given for local shuffle. + with pytest.raises(ValueError): + list( + ray.data.range(100).iter_batches( + batch_size=None, local_shuffle_buffer_size=10 + ) + ) + + def range(n, parallelism=200): + if ds_format == "simple": + ds = ray.data.range(n, parallelism=parallelism) + elif ds_format == "arrow": + ds = ray.data.range_table(n, parallelism=parallelism) + elif ds_format == "pandas": + ds = ray.data.range_table(n, parallelism=parallelism).map_batches( + lambda df: df, batch_size=None, batch_format="pandas" + ) + if pipelined: + pipe = ds.repeat(2) + return pipe + else: + return ds + + def to_row_dicts(batch): + if isinstance(batch, pd.DataFrame): + batch = batch.to_dict(orient="records") + return batch + + def unbatch(batches): + return [r for batch in batches for r in to_row_dicts(batch)] + + def sort(r): + if ds_format == "simple": + return sorted(r) + return sorted(r, key=lambda v: v["value"]) + + base = range(100).take_all() + + # Local shuffle. 
+ r1 = unbatch( + range(100, parallelism=10).iter_batches( + batch_size=3, + local_shuffle_buffer_size=25, + ) + ) + r2 = unbatch( + range(100, parallelism=10).iter_batches( + batch_size=3, + local_shuffle_buffer_size=25, + ) + ) + # Check randomness of shuffle. + assert r1 != r2, (r1, r2) + assert r1 != base + assert r2 != base + # Check content. + assert sort(r1) == sort(base) + assert sort(r2) == sort(base) + + # Set seed. + r1 = unbatch( + range(100, parallelism=10).iter_batches( + batch_size=3, + local_shuffle_buffer_size=25, + local_shuffle_seed=0, + ) + ) + r2 = unbatch( + range(100, parallelism=10).iter_batches( + batch_size=3, + local_shuffle_buffer_size=25, + local_shuffle_seed=0, + ) + ) + # Check randomness of shuffle. + assert r1 == r2, (r1, r2) + assert r1 != base + # Check content. + assert sort(r1) == sort(base) + + # Single block. + r1 = unbatch( + range(100, parallelism=1).iter_batches( + batch_size=3, + local_shuffle_buffer_size=25, + ) + ) + r2 = unbatch( + range(100, parallelism=1).iter_batches( + batch_size=3, + local_shuffle_buffer_size=25, + ) + ) + # Check randomness of shuffle. + assert r1 != r2, (r1, r2) + assert r1 != base + assert r2 != base + # Check content. + assert sort(r1) == sort(base) + assert sort(r2) == sort(base) + + # Single-row blocks. + r1 = unbatch( + range(100, parallelism=100).iter_batches( + batch_size=3, + local_shuffle_buffer_size=25, + ) + ) + r2 = unbatch( + range(100, parallelism=100).iter_batches( + batch_size=3, + local_shuffle_buffer_size=25, + ) + ) + # Check randomness of shuffle. + assert r1 != r2, (r1, r2) + assert r1 != base + assert r2 != base + # Check content. + assert sort(r1) == sort(base) + assert sort(r2) == sort(base) + + # Buffer larger than dataset. 
+ r1 = unbatch( + range(100, parallelism=10).iter_batches( + batch_size=3, + local_shuffle_buffer_size=200, + ) + ) + r2 = unbatch( + range(100, parallelism=10).iter_batches( + batch_size=3, + local_shuffle_buffer_size=200, + ) + ) + # Check randomness of shuffle. + assert r1 != r2, (r1, r2) + assert r1 != base + assert r2 != base + # Check content. + assert sort(r1) == sort(base) + assert sort(r2) == sort(base) + + # Batch size larger than block. + r1 = unbatch( + range(100, parallelism=20).iter_batches( + batch_size=12, + local_shuffle_buffer_size=25, + ) + ) + r2 = unbatch( + range(100, parallelism=20).iter_batches( + batch_size=12, + local_shuffle_buffer_size=25, + ) + ) + # Check randomness of shuffle. + assert r1 != r2, (r1, r2) + assert r1 != base + assert r2 != base + # Check content. + assert sort(r1) == sort(base) + assert sort(r2) == sort(base) + + # Batch size larger than dataset. + r1 = unbatch( + range(100, parallelism=10).iter_batches( + batch_size=200, + local_shuffle_buffer_size=400, + ) + ) + r2 = unbatch( + range(100, parallelism=10).iter_batches( + batch_size=200, + local_shuffle_buffer_size=400, + ) + ) + # Check randomness of shuffle. + assert r1 != r2, (r1, r2) + assert r1 != base + assert r2 != base + # Check content. + assert sort(r1) == sort(base) + assert sort(r2) == sort(base) + + # Drop partial batches. + r1 = unbatch( + range(100, parallelism=10).iter_batches( + batch_size=7, + local_shuffle_buffer_size=21, + drop_last=True, + ) + ) + r2 = unbatch( + range(100, parallelism=10).iter_batches( + batch_size=7, + local_shuffle_buffer_size=21, + drop_last=True, + ) + ) + # Check randomness of shuffle. + assert r1 != r2, (r1, r2) + assert r1 != base + assert r2 != base + # Check content. + # Check that partial batches were dropped. 
+ assert len(r1) % 7 == 0 + assert len(r2) % 7 == 0 + tmp_base = base + if ds_format in ("arrow", "pandas"): + r1 = [tuple(r.items()) for r in r1] + r2 = [tuple(r.items()) for r in r2] + tmp_base = [tuple(r.items()) for r in base] + assert set(r1) <= set(tmp_base) + assert set(r2) <= set(tmp_base) + + # Test empty dataset. + ds = ray.data.from_items([]) + r1 = unbatch(ds.iter_batches(batch_size=2, local_shuffle_buffer_size=10)) + assert len(r1) == 0 + assert r1 == ds.take() + + +def test_iter_batches_grid(ray_start_regular_shared): + # Tests slicing, batch combining, and partial batch dropping logic over + # a grid of dataset, batching, and dropping configurations. + # Grid: num_blocks x num_rows_block_1 x ... x num_rows_block_N x + # batch_size x drop_last + seed = int(time.time()) + print(f"Seeding RNG for test_iter_batches_grid with: {seed}") + random.seed(seed) + max_num_blocks = 20 + max_num_rows_per_block = 20 + num_blocks_samples = 3 + block_sizes_samples = 3 + batch_size_samples = 3 + + for num_blocks in np.random.randint(1, max_num_blocks + 1, size=num_blocks_samples): + block_sizes_list = [ + np.random.randint(1, max_num_rows_per_block + 1, size=num_blocks) + for _ in range(block_sizes_samples) + ] + for block_sizes in block_sizes_list: + # Create the dataset with the given block sizes. + dfs = [] + running_size = 0 + for block_size in block_sizes: + dfs.append( + pd.DataFrame( + {"value": list(range(running_size, running_size + block_size))} + ) + ) + running_size += block_size + num_rows = running_size + ds = ray.data.from_pandas(dfs) + for batch_size in np.random.randint( + 1, num_rows + 1, size=batch_size_samples + ): + for drop_last in (False, True): + batches = list( + ds.iter_batches( + batch_size=batch_size, + drop_last=drop_last, + batch_format="pandas", + ) + ) + if num_rows % batch_size == 0 or not drop_last: + # Number of batches should be equal to + # num_rows / batch_size, rounded up. 
+ assert len(batches) == math.ceil(num_rows / batch_size) + # Concatenated batches should equal the DataFrame + # representation of the entire dataset. + assert pd.concat(batches, ignore_index=True).equals( + ds.to_pandas() + ) + else: + # Number of batches should be equal to + # num_rows / batch_size, rounded down. + assert len(batches) == num_rows // batch_size + # Concatenated batches should equal the DataFrame + # representation of the dataset with the partial batch + # remainder sliced off. + assert pd.concat(batches, ignore_index=True).equals( + ds.to_pandas()[: batch_size * (num_rows // batch_size)] + ) + if num_rows % batch_size == 0 or drop_last: + assert all(len(batch) == batch_size for batch in batches) + else: + assert all(len(batch) == batch_size for batch in batches[:-1]) + assert len(batches[-1]) == num_rows % batch_size + + +def test_lazy_loading_iter_batches_exponential_rampup(ray_start_regular_shared): + ds = ray.data.range(32, parallelism=8) + expected_num_blocks = [1, 2, 4, 4, 8, 8, 8, 8] + for _, expected in zip(ds.iter_batches(batch_size=None), expected_num_blocks): + if ray.data.context.DatasetContext.get_current().use_streaming_executor: + # In streaming execution of ds.iter_batches(), there is no partial + # execution so _num_computed() in LazyBlocklist is 0. + assert ds._plan.execute()._num_computed() == 0 + else: + assert ds._plan.execute()._num_computed() == expected + + +def test_union(ray_start_regular_shared): + ds = ray.data.range(20, parallelism=10) + + # Test lazy union. + ds = ds.union(ds, ds, ds, ds) + assert ds.num_blocks() == 50 + assert ds.count() == 100 + assert ds.sum() == 950 + + ds = ds.union(ds) + assert ds.count() == 200 + assert ds.sum() == (950 * 2) + + # Test materialized union. 
+ ds2 = ray.data.from_items([1, 2, 3, 4, 5]) + assert ds2.count() == 5 + ds2 = ds2.union(ds2) + assert ds2.count() == 10 + ds2 = ds2.union(ds) + assert ds2.count() == 210 + + +@pytest.mark.parametrize("pipelined", [False, True]) +def test_iter_tf_batches(ray_start_regular_shared, pipelined): + df1 = pd.DataFrame( + {"one": [1, 2, 3], "two": [1.0, 2.0, 3.0], "label": [1.0, 2.0, 3.0]} + ) + df2 = pd.DataFrame( + {"one": [4, 5, 6], "two": [4.0, 5.0, 6.0], "label": [4.0, 5.0, 6.0]} + ) + df3 = pd.DataFrame({"one": [7, 8], "two": [7.0, 8.0], "label": [7.0, 8.0]}) + df = pd.concat([df1, df2, df3]) + ds = ray.data.from_pandas([df1, df2, df3]) + ds = maybe_pipeline(ds, pipelined) + + num_epochs = 1 if pipelined else 2 + for _ in range(num_epochs): + iterations = [] + for batch in ds.iter_tf_batches(batch_size=3): + iterations.append( + np.stack((batch["one"], batch["two"], batch["label"]), axis=1) + ) + combined_iterations = np.concatenate(iterations) + np.testing.assert_array_equal(np.sort(df.values), np.sort(combined_iterations)) + + +@pytest.mark.parametrize("pipelined", [False, True]) +def test_iter_tf_batches_tensor_ds(ray_start_regular_shared, pipelined): + arr1 = np.arange(12).reshape((3, 2, 2)) + arr2 = np.arange(12, 24).reshape((3, 2, 2)) + arr = np.concatenate((arr1, arr2)) + ds = ray.data.from_numpy([arr1, arr2]) + ds = maybe_pipeline(ds, pipelined) + + num_epochs = 1 if pipelined else 2 + for _ in range(num_epochs): + iterations = [] + for batch in ds.iter_tf_batches(batch_size=2): + iterations.append(batch) + combined_iterations = np.concatenate(iterations) + np.testing.assert_array_equal(arr, combined_iterations) + + +def test_block_builder_for_block(ray_start_regular_shared): + # list + builder = BlockBuilder.for_block(list()) + builder.add_block([1, 2]) + assert builder.build() == [1, 2] + builder.add_block([3, 4]) + assert builder.build() == [1, 2, 3, 4] + + # pandas dataframe + builder = BlockBuilder.for_block(pd.DataFrame()) + b1 = pd.DataFrame({"A": 
[1], "B": ["a"]}) + builder.add_block(b1) + assert builder.build().equals(b1) + b2 = pd.DataFrame({"A": [2, 3], "B": ["c", "d"]}) + builder.add_block(b2) + expected = pd.DataFrame({"A": [1, 2, 3], "B": ["a", "c", "d"]}) + assert builder.build().equals(expected) + + # pyarrow table + builder = BlockBuilder.for_block(pa.Table.from_arrays(list())) + b1 = pa.Table.from_pydict({"A": [1], "B": ["a"]}) + builder.add_block(b1) + builder.build().equals(b1) + b2 = pa.Table.from_pydict({"A": [2, 3], "B": ["c", "d"]}) + builder.add_block(b2) + expected = pa.Table.from_pydict({"A": [1, 2, 3], "B": ["a", "c", "d"]}) + builder.build().equals(expected) + + # wrong type + with pytest.raises(TypeError): + BlockBuilder.for_block(str()) + + +@pytest.mark.parametrize("num_parts", [1, 30]) +@pytest.mark.parametrize("ds_format", ["arrow", "pandas"]) +def test_global_tabular_min(ray_start_regular_shared, ds_format, num_parts): + seed = int(time.time()) + print(f"Seeding RNG for test_global_arrow_min with: {seed}") + random.seed(seed) + xs = list(range(100)) + random.shuffle(xs) + + def _to_pandas(ds): + return ds.map_batches(lambda x: x, batch_size=None, batch_format="pandas") + + # Test built-in global min aggregation + ds = ray.data.from_items([{"A": x} for x in xs]).repartition(num_parts) + if ds_format == "pandas": + ds = _to_pandas(ds) + assert ds.min("A") == 0 + + # Test empty dataset + ds = ray.data.range_table(10) + if ds_format == "pandas": + ds = _to_pandas(ds) + assert ds.filter(lambda r: r["value"] > 10).min("value") is None + + # Test built-in global min aggregation with nans + nan_ds = ray.data.from_items([{"A": x} for x in xs] + [{"A": None}]).repartition( + num_parts + ) + if ds_format == "pandas": + nan_ds = _to_pandas(nan_ds) + assert nan_ds.min("A") == 0 + # Test ignore_nulls=False + assert nan_ds.min("A", ignore_nulls=False) is None + # Test all nans + nan_ds = ray.data.from_items([{"A": None}] * len(xs)).repartition(num_parts) + if ds_format == "pandas": + nan_ds = 
_to_pandas(nan_ds) + assert nan_ds.min("A") is None + assert nan_ds.min("A", ignore_nulls=False) is None + + +@pytest.mark.parametrize("num_parts", [1, 30]) +@pytest.mark.parametrize("ds_format", ["arrow", "pandas"]) +def test_global_tabular_max(ray_start_regular_shared, ds_format, num_parts): + seed = int(time.time()) + print(f"Seeding RNG for test_global_arrow_max with: {seed}") + random.seed(seed) + xs = list(range(100)) + random.shuffle(xs) + + def _to_pandas(ds): + return ds.map_batches(lambda x: x, batch_size=None, batch_format="pandas") + + # Test built-in global max aggregation + ds = ray.data.from_items([{"A": x} for x in xs]).repartition(num_parts) + if ds_format == "pandas": + ds = _to_pandas(ds) + assert ds.max("A") == 99 + + # Test empty dataset + ds = ray.data.range_table(10) + if ds_format == "pandas": + ds = _to_pandas(ds) + assert ds.filter(lambda r: r["value"] > 10).max("value") is None + + # Test built-in global max aggregation with nans + nan_ds = ray.data.from_items([{"A": x} for x in xs] + [{"A": None}]).repartition( + num_parts + ) + if ds_format == "pandas": + nan_ds = _to_pandas(nan_ds) + assert nan_ds.max("A") == 99 + # Test ignore_nulls=False + assert nan_ds.max("A", ignore_nulls=False) is None + # Test all nans + nan_ds = ray.data.from_items([{"A": None}] * len(xs)).repartition(num_parts) + if ds_format == "pandas": + nan_ds = _to_pandas(nan_ds) + assert nan_ds.max("A") is None + assert nan_ds.max("A", ignore_nulls=False) is None + + +@pytest.mark.parametrize("num_parts", [1, 30]) +@pytest.mark.parametrize("ds_format", ["arrow", "pandas"]) +def test_global_tabular_mean(ray_start_regular_shared, ds_format, num_parts): + seed = int(time.time()) + print(f"Seeding RNG for test_global_arrow_mean with: {seed}") + random.seed(seed) + xs = list(range(100)) + random.shuffle(xs) + + def _to_pandas(ds): + return ds.map_batches(lambda x: x, batch_size=None, batch_format="pandas") + + # Test built-in global mean aggregation + ds = 
ray.data.from_items([{"A": x} for x in xs]).repartition(num_parts) + if ds_format == "pandas": + ds = _to_pandas(ds) + assert ds.mean("A") == 49.5 + + # Test empty dataset + ds = ray.data.range_table(10) + if ds_format == "pandas": + ds = _to_pandas(ds) + assert ds.filter(lambda r: r["value"] > 10).mean("value") is None + + # Test built-in global mean aggregation with nans + nan_ds = ray.data.from_items([{"A": x} for x in xs] + [{"A": None}]).repartition( + num_parts + ) + if ds_format == "pandas": + nan_ds = _to_pandas(nan_ds) + assert nan_ds.mean("A") == 49.5 + # Test ignore_nulls=False + assert nan_ds.mean("A", ignore_nulls=False) is None + # Test all nans + nan_ds = ray.data.from_items([{"A": None}] * len(xs)).repartition(num_parts) + if ds_format == "pandas": + nan_ds = _to_pandas(nan_ds) + assert nan_ds.mean("A") is None + assert nan_ds.mean("A", ignore_nulls=False) is None + + +@pytest.mark.parametrize("num_parts", [1, 30]) +@pytest.mark.parametrize("ds_format", ["arrow", "pandas"]) +def test_global_tabular_std(ray_start_regular_shared, ds_format, num_parts): + seed = int(time.time()) + print(f"Seeding RNG for test_global_arrow_std with: {seed}") + random.seed(seed) + xs = list(range(100)) + random.shuffle(xs) + + def _to_arrow(ds): + return ds.map_batches(lambda x: x, batch_size=None, batch_format="pyarrow") + + def _to_pandas(ds): + return ds.map_batches(lambda x: x, batch_size=None, batch_format="pandas") + + # Test built-in global max aggregation + df = pd.DataFrame({"A": xs}) + ds = ray.data.from_pandas(df).repartition(num_parts) + if ds_format == "arrow": + ds = _to_arrow(ds) + assert math.isclose(ds.std("A"), df["A"].std()) + assert math.isclose(ds.std("A", ddof=0), df["A"].std(ddof=0)) + + # Test empty dataset + ds = ray.data.from_pandas(pd.DataFrame({"A": []})) + if ds_format == "arrow": + ds = _to_arrow(ds) + assert ds.std("A") is None + # Test edge cases + ds = ray.data.from_pandas(pd.DataFrame({"A": [3]})) + if ds_format == "arrow": + ds = 
_to_arrow(ds) + assert ds.std("A") == 0 + + # Test built-in global std aggregation with nans + nan_df = pd.DataFrame({"A": xs + [None]}) + nan_ds = ray.data.from_pandas(nan_df).repartition(num_parts) + if ds_format == "arrow": + nan_ds = _to_arrow(nan_ds) + assert math.isclose(nan_ds.std("A"), nan_df["A"].std()) + # Test ignore_nulls=False + assert nan_ds.std("A", ignore_nulls=False) is None + # Test all nans + nan_ds = ray.data.from_items([{"A": None}] * len(xs)).repartition(num_parts) + if ds_format == "pandas": + nan_ds = _to_pandas(nan_ds) + assert nan_ds.std("A") is None + assert nan_ds.std("A", ignore_nulls=False) is None + + +def test_column_name_type_check(ray_start_regular_shared): + df = pd.DataFrame({"1": np.random.rand(10), "a": np.random.rand(10)}) + ds = ray.data.from_pandas(df) + expected_str = "Dataset(num_blocks=1, num_rows=10, schema={1: float64, a: float64})" + assert str(ds) == expected_str, str(ds) + df = pd.DataFrame({1: np.random.rand(10), "a": np.random.rand(10)}) + with pytest.raises(ValueError): + ray.data.from_pandas(df) + + +def test_len(ray_start_regular_shared): + ds = ray.data.range(1) + with pytest.raises(AttributeError): + len(ds) + + +def test_simple_block_select(): + xs = list(range(100)) + block_accessor = BlockAccessor.for_block(xs) + + block = block_accessor.select([lambda x: x % 3]) + assert block == [x % 3 for x in xs] + + with pytest.raises(ValueError): + block = block_accessor.select(["foo"]) + + with pytest.raises(ValueError): + block = block_accessor.select([]) + + +def test_pandas_block_select(): + df = pd.DataFrame({"one": [10, 11, 12], "two": [11, 12, 13], "three": [14, 15, 16]}) + block_accessor = BlockAccessor.for_block(df) + + block = block_accessor.select(["two"]) + assert block.equals(df[["two"]]) + + block = block_accessor.select(["two", "one"]) + assert block.equals(df[["two", "one"]]) + + with pytest.raises(ValueError): + block = block_accessor.select([lambda x: x % 3, "two"]) + + +# NOTE: All tests above share 
a Ray cluster, while the tests below do not. These +# tests should only be carefully reordered to retain this invariant! + + +def test_unsupported_pyarrow_versions_check(shutdown_only, unsupported_pyarrow_version): + ray.shutdown() + + # Test that unsupported pyarrow versions cause an error to be raised upon the + # initial pyarrow use. + ray.init(runtime_env={"pip": [f"pyarrow=={unsupported_pyarrow_version}"]}) + + # Test Arrow-native creation APIs. + # Test range_table. + with pytest.raises(ImportError): + ray.data.range_table(10).take_all() + + # Test from_arrow. + with pytest.raises(ImportError): + ray.data.from_arrow(pa.table({"a": [1, 2]})) + + # Test read_parquet. + with pytest.raises(ImportError): + ray.data.read_parquet("example://iris.parquet").take_all() + + # Test from_numpy (we use Arrow for representing the tensors). + with pytest.raises(ImportError): + ray.data.from_numpy(np.arange(12).reshape((3, 2, 2))) + + +def test_unsupported_pyarrow_versions_check_disabled( + shutdown_only, + unsupported_pyarrow_version, + disable_pyarrow_version_check, +): + # Test that unsupported pyarrow versions DO NOT cause an error to be raised upon the + # initial pyarrow use when the version check is disabled. + ray.init( + runtime_env={ + "pip": [f"pyarrow=={unsupported_pyarrow_version}"], + "env_vars": {"RAY_DISABLE_PYARROW_VERSION_CHECK": "1"}, + }, + ) + + # Test Arrow-native creation APIs. + # Test range_table. + try: + ray.data.range_table(10).take_all() + except ImportError as e: + pytest.fail(f"_check_pyarrow_version failed unexpectedly: {e}") + + # Test from_arrow. + try: + ray.data.from_arrow(pa.table({"a": [1, 2]})) + except ImportError as e: + pytest.fail(f"_check_pyarrow_version failed unexpectedly: {e}") + + # Test read_parquet. + try: + ray.data.read_parquet("example://iris.parquet").take_all() + except ImportError as e: + pytest.fail(f"_check_pyarrow_version failed unexpectedly: {e}") + + # Test from_numpy (we use Arrow for representing the tensors). 
+ try: + ray.data.from_numpy(np.arange(12).reshape((3, 2, 2))) + except ImportError as e: + pytest.fail(f"_check_pyarrow_version failed unexpectedly: {e}") + + +def test_read_write_local_node_ray_client(ray_start_cluster_enabled): + cluster = ray_start_cluster_enabled + cluster.add_node(num_cpus=4) + cluster.head_node._ray_params.ray_client_server_port = "10004" + cluster.head_node.start_ray_client_server() + address = "ray://localhost:10004" + + import tempfile + + data_path = tempfile.mkdtemp() + df = pd.DataFrame({"one": list(range(0, 10)), "two": list(range(10, 20))}) + path = os.path.join(data_path, "test.parquet") + df.to_parquet(path) + + # Read/write from Ray Client will result in error. + ray.init(address) + with pytest.raises(ValueError): + ds = ray.data.read_parquet("local://" + path).fully_executed() + ds = ray.data.from_pandas(df) + with pytest.raises(ValueError): + ds.write_parquet("local://" + data_path).fully_executed() + + +def test_read_warning_large_parallelism(ray_start_regular, propagate_logs, caplog): + with caplog.at_level(logging.WARNING, logger="ray.data.read_api"): + ray.data.range(5000, parallelism=5000).fully_executed() + assert ( + "The requested parallelism of 5000 is " + "more than 4x the number of available CPU slots in the cluster" in caplog.text + ), caplog.text + + +def test_read_write_local_node(ray_start_cluster): + cluster = ray_start_cluster + cluster.add_node( + resources={"bar:1": 100}, + num_cpus=10, + _system_config={"max_direct_call_object_size": 0}, + ) + cluster.add_node(resources={"bar:2": 100}, num_cpus=10) + cluster.add_node(resources={"bar:3": 100}, num_cpus=10) + + ray.init(cluster.address) + + import os + import tempfile + + data_path = tempfile.mkdtemp() + num_files = 5 + for idx in range(num_files): + df = pd.DataFrame( + {"one": list(range(idx, idx + 10)), "two": list(range(idx + 10, idx + 20))} + ) + path = os.path.join(data_path, f"test{idx}.parquet") + df.to_parquet(path) + + ctx = 
ray.data.context.DatasetContext.get_current() + ctx.read_write_local_node = True + + def check_dataset_is_local(ds): + blocks = ds.get_internal_block_refs() + assert len(blocks) == num_files + ray.wait(blocks, num_returns=len(blocks), fetch_local=False) + location_data = ray.experimental.get_object_locations(blocks) + locations = [] + for block in blocks: + locations.extend(location_data[block]["node_ids"]) + assert set(locations) == {ray.get_runtime_context().get_node_id()} + + local_path = "local://" + data_path + # Plain read. + ds = ray.data.read_parquet(local_path).fully_executed() + check_dataset_is_local(ds) + + # SPREAD scheduling got overridden when read local scheme. + ds = ray.data.read_parquet( + local_path, ray_remote_args={"scheduling_strategy": "SPREAD"} + ).fully_executed() + check_dataset_is_local(ds) + + # With fusion. + ds = ray.data.read_parquet(local_path).map(lambda x: x).fully_executed() + check_dataset_is_local(ds) + + # Write back to local scheme. + output = os.path.join(local_path, "test_read_write_local_node") + ds.write_parquet(output) + assert "1 nodes used" in ds.stats(), ds.stats() + ray.data.read_parquet(output).take_all() == ds.take_all() + + # Mixing paths of local and non-local scheme is invalid. 
+ with pytest.raises(ValueError): + ds = ray.data.read_parquet( + [local_path + "/test1.parquet", data_path + "/test2.parquet"] + ).fully_executed() + with pytest.raises(ValueError): + ds = ray.data.read_parquet( + [local_path + "/test1.parquet", "example://iris.parquet"] + ).fully_executed() + with pytest.raises(ValueError): + ds = ray.data.read_parquet( + ["example://iris.parquet", local_path + "/test1.parquet"] + ).fully_executed() + + +@ray.remote +class Counter: + def __init__(self): + self.value = 0 + + def increment(self): + self.value += 1 + return self.value + + +class FlakyCSVDatasource(CSVDatasource): + def __init__(self): + self.counter = Counter.remote() + + def _read_stream(self, f: "pa.NativeFile", path: str, **reader_args): + count = self.counter.increment.remote() + if ray.get(count) == 1: + raise ValueError("oops") + else: + for block in CSVDatasource._read_stream(self, f, path, **reader_args): + yield block + + def _write_block(self, f: "pa.NativeFile", block: BlockAccessor, **writer_args): + count = self.counter.increment.remote() + if ray.get(count) == 1: + raise ValueError("oops") + else: + CSVDatasource._write_block(self, f, block, **writer_args) + + +def test_dataset_retry_exceptions(ray_start_regular, local_path): + df1 = pd.DataFrame({"one": [1, 2, 3], "two": ["a", "b", "c"]}) + path1 = os.path.join(local_path, "test1.csv") + df1.to_csv(path1, index=False, storage_options={}) + ds1 = ray.data.read_datasource(FlakyCSVDatasource(), parallelism=1, paths=path1) + ds1.write_datasource(FlakyCSVDatasource(), path=local_path, dataset_uuid="data") + assert df1.equals( + pd.read_csv(os.path.join(local_path, "data_000000.csv"), storage_options={}) + ) + + counter = Counter.remote() + + def flaky_mapper(x): + count = counter.increment.remote() + if ray.get(count) == 1: + raise ValueError("oops") + else: + return ray.get(count) + + assert sorted(ds1.map(flaky_mapper).take()) == [2, 3, 4] + + with pytest.raises(ValueError): + ray.data.read_datasource( + 
FlakyCSVDatasource(), + parallelism=1, + paths=path1, + ray_remote_args={"retry_exceptions": False}, + ).take() + + +def test_datasource(ray_start_regular): + source = ray.data.datasource.RandomIntRowDatasource() + assert len(ray.data.read_datasource(source, n=10, num_columns=2).take()) == 10 + source = ray.data.datasource.RangeDatasource() + assert ray.data.read_datasource(source, n=10).take() == list(range(10)) + + +def test_polars_lazy_import(shutdown_only): + import sys + + ctx = ray.data.context.DatasetContext.get_current() + + try: + original_use_polars = ctx.use_polars + ctx.use_polars = True + + num_items = 100 + parallelism = 4 + ray.init(num_cpus=4) + + @ray.remote + def f(should_import_polars): + # Sleep to spread the tasks. + time.sleep(1) + polars_imported = "polars" in sys.modules.keys() + return polars_imported == should_import_polars + + # We should not use polars for non-Arrow sort. + _ = ray.data.range(num_items, parallelism=parallelism).sort() + assert all(ray.get([f.remote(False) for _ in range(parallelism)])) + + a = range(100) + dfs = [] + partition_size = num_items // parallelism + for i in range(parallelism): + dfs.append( + pd.DataFrame({"a": a[i * partition_size : (i + 1) * partition_size]}) + ) + # At least one worker should have imported polars. 
+ _ = ( + ray.data.from_pandas(dfs) + .map_batches(lambda t: t, batch_format="pyarrow", batch_size=None) + .sort(key="a") + .fully_executed() + ) + assert any(ray.get([f.remote(True) for _ in range(parallelism)])) + + finally: + ctx.use_polars = original_use_polars + + +def test_default_batch_format(shutdown_only): + ds = ray.data.range(100) + assert ds.default_batch_format() == list + + ds = ray.data.range_tensor(100) + assert ds.default_batch_format() == np.ndarray + + df = pd.DataFrame({"foo": ["a", "b"], "bar": [0, 1]}) + ds = ray.data.from_pandas(df) + assert ds.default_batch_format() == pd.DataFrame + + +def test_dataset_schema_after_read_stats(ray_start_cluster): + cluster = ray_start_cluster + cluster.add_node(num_cpus=1) + ray.init(cluster.address) + cluster.add_node(num_cpus=1, resources={"foo": 1}) + ds = ray.data.read_csv( + "example://iris.csv", ray_remote_args={"resources": {"foo": 1}} + ) + schema = ds.schema() + ds.stats() + assert schema == ds.schema() + + +def test_dataset_plan_as_string(ray_start_cluster): + ds = ray.data.read_parquet("example://iris.parquet") + assert ds._plan.get_plan_as_string() == ( + "Dataset(\n" + " num_blocks=1,\n" + " num_rows=150,\n" + " schema={\n" + " sepal.length: double,\n" + " sepal.width: double,\n" + " petal.length: double,\n" + " petal.width: double,\n" + " variety: string\n" + " }\n" + ")" + ) + for _ in range(5): + ds = ds.map_batches(lambda x: x) + assert ds._plan.get_plan_as_string() == ( + "MapBatches()\n" + "+- MapBatches()\n" + " +- MapBatches()\n" + " +- MapBatches()\n" + " +- MapBatches()\n" + " +- Dataset(\n" + " num_blocks=1,\n" + " num_rows=150,\n" + " schema={\n" + " sepal.length: double,\n" + " sepal.width: double,\n" + " petal.length: double,\n" + " petal.width: double,\n" + " variety: string\n" + " }\n" + " )" + ) + + +class LoggerWarningCalled(Exception): + """Custom exception used in test_warning_execute_with_no_cpu() and + test_nowarning_execute_with_cpu(). 
Raised when the `logger.warning` method + is called, so that we can kick out of `plan.execute()` by catching this Exception + and check logging was done properly.""" + + pass + + +def test_warning_execute_with_no_cpu(ray_start_cluster): + """Tests ExecutionPlan.execute() to ensure a warning is logged + when no CPU resources are available.""" + # Create one node with no CPUs to trigger the Dataset warning + ray.init(ray_start_cluster.address) + cluster = ray_start_cluster + cluster.add_node(num_cpus=0) + + logger = DatasetLogger("ray.data._internal.plan").get_logger() + with patch.object( + logger, + "warning", + side_effect=LoggerWarningCalled, + ) as mock_logger: + try: + ds = ray.data.range(10) + ds = ds.map_batches(lambda x: x) + ds.take() + except Exception as e: + if ray.data.context.DatasetContext.get_current().use_streaming_executor: + assert isinstance(e, ValueError) + assert "exceeds the execution limits ExecutionResources(cpu=0.0" in str( + e + ) + else: + assert isinstance(e, LoggerWarningCalled) + logger_args, logger_kwargs = mock_logger.call_args + assert ( + "Warning: The Ray cluster currently does not have " + in logger_args[0] + ) + + +def test_nowarning_execute_with_cpu(ray_start_cluster): + """Tests ExecutionPlan.execute() to ensure no warning is logged + when there are available CPU resources.""" + # Create one node with CPUs to avoid triggering the Dataset warning + ray.init(ray_start_cluster.address) + + logger = DatasetLogger("ray.data._internal.plan").get_logger() + with patch.object( + logger, + "warning", + side_effect=LoggerWarningCalled, + ) as mock_logger: + ds = ray.data.range(10) + ds = ds.map_batches(lambda x: x) + ds.take() + mock_logger.assert_not_called() + + +if __name__ == "__main__": + import sys + + sys.exit(pytest.main(["-v", __file__])) diff --git a/python/ray/data/tests/test_dataset_csv.py b/python/ray/data/tests/test_dataset_csv.py index 3a2362d4f15f2..4ca6fa23bbf1e 100644 --- a/python/ray/data/tests/test_dataset_csv.py +++ 
b/python/ray/data/tests/test_dataset_csv.py @@ -1,3 +1,4 @@ +import itertools import os import shutil from functools import partial @@ -24,6 +25,7 @@ PathPartitionFilter, ) from ray.data.datasource.file_based_datasource import ( + FILE_SIZE_FETCH_PARALLELIZATION_THRESHOLD, FileExtensionFilter, _unwrap_protocol, ) @@ -123,7 +125,7 @@ def test_csv_read(ray_start_regular_shared, fs, data_path, endpoint_url): ds = ray.data.read_csv(path, filesystem=fs, partitioning=None) df = pd.concat([df1, df2], ignore_index=True) dsdf = ds.to_pandas() - assert df.equals(dsdf) + pd.testing.assert_frame_equal(df, dsdf) if fs is None: shutil.rmtree(path) else: @@ -259,6 +261,157 @@ def test_csv_read_meta_provider( ) +@pytest.mark.parametrize( + "fs,data_path,endpoint_url", + [ + (None, lazy_fixture("local_path"), None), + (lazy_fixture("local_fs"), lazy_fixture("local_path"), None), + (lazy_fixture("s3_fs"), lazy_fixture("s3_path"), lazy_fixture("s3_server")), + ], +) +def test_csv_read_many_files_basic( + ray_start_regular_shared, + fs, + data_path, + endpoint_url, +): + if endpoint_url is None: + storage_options = {} + else: + storage_options = dict(client_kwargs=dict(endpoint_url=endpoint_url)) + + paths = [] + dfs = [] + num_dfs = 4 * FILE_SIZE_FETCH_PARALLELIZATION_THRESHOLD + for i in range(num_dfs): + df = pd.DataFrame({"one": list(range(i * 3, (i + 1) * 3))}) + dfs.append(df) + path = os.path.join(data_path, f"test_{i}.csv") + paths.append(path) + df.to_csv(path, index=False, storage_options=storage_options) + ds = ray.data.read_csv(paths, filesystem=fs) + + dsdf = ds.to_pandas() + df = pd.concat(dfs).reset_index(drop=True) + pd.testing.assert_frame_equal(df, dsdf) + + +@pytest.mark.parametrize( + "fs,data_path,endpoint_url", + [ + (None, lazy_fixture("local_path"), None), + (lazy_fixture("local_fs"), lazy_fixture("local_path"), None), + (lazy_fixture("s3_fs"), lazy_fixture("s3_path"), lazy_fixture("s3_server")), + ], +) +def test_csv_read_many_files_partitioned( + 
ray_start_regular_shared, + fs, + data_path, + endpoint_url, + write_partitioned_df, + assert_base_partitioned_ds, +): + if endpoint_url is None: + storage_options = {} + else: + storage_options = dict(client_kwargs=dict(endpoint_url=endpoint_url)) + + partition_keys = ["one"] + partition_path_encoder = PathPartitionEncoder.of( + base_dir=data_path, + field_names=partition_keys, + filesystem=fs, + ) + paths = [] + dfs = [] + num_dfs = FILE_SIZE_FETCH_PARALLELIZATION_THRESHOLD + num_rows = 6 * num_dfs + num_files = 2 * num_dfs + for i in range(num_dfs): + df = pd.DataFrame( + {"one": [1, 1, 1, 3, 3, 3], "two": list(range(6 * i, 6 * (i + 1)))} + ) + df_paths = write_partitioned_df( + df, + partition_keys, + partition_path_encoder, + partial(df_to_csv, storage_options=storage_options, index=False), + file_name_suffix=i, + ) + dfs.append(df) + paths.extend(df_paths) + + ds = ray.data.read_csv( + paths, + filesystem=fs, + partitioning=partition_path_encoder.scheme, + parallelism=num_files, + ) + + assert_base_partitioned_ds( + ds, + count=num_rows, + num_input_files=num_files, + num_rows=num_rows, + schema="{one: int64, two: int64}", + num_computed=num_files, + sorted_values=sorted( + itertools.chain.from_iterable( + list( + map(list, zip([1, 1, 1, 3, 3, 3], list(range(6 * i, 6 * (i + 1))))) + ) + for i in range(num_dfs) + ) + ), + ) + + +@pytest.mark.parametrize( + "fs,data_path,endpoint_url", + [ + (None, lazy_fixture("local_path"), None), + (lazy_fixture("local_fs"), lazy_fixture("local_path"), None), + (lazy_fixture("s3_fs"), lazy_fixture("s3_path"), lazy_fixture("s3_server")), + ], +) +def test_csv_read_many_files_diff_dirs( + ray_start_regular_shared, + fs, + data_path, + endpoint_url, +): + if endpoint_url is None: + storage_options = {} + else: + storage_options = dict(client_kwargs=dict(endpoint_url=endpoint_url)) + + dir1 = os.path.join(data_path, "dir1") + dir2 = os.path.join(data_path, "dir2") + if fs is None: + os.mkdir(dir1) + os.mkdir(dir2) + else: + 
fs.create_dir(_unwrap_protocol(dir1)) + fs.create_dir(_unwrap_protocol(dir2)) + + paths = [] + dfs = [] + num_dfs = 2 * FILE_SIZE_FETCH_PARALLELIZATION_THRESHOLD + for i, dir_path in enumerate([dir1, dir2]): + for j in range(num_dfs * i, num_dfs * (i + 1)): + df = pd.DataFrame({"one": list(range(3 * j, 3 * (j + 1)))}) + dfs.append(df) + path = os.path.join(dir_path, f"test_{j}.csv") + paths.append(path) + df.to_csv(path, index=False, storage_options=storage_options) + ds = ray.data.read_csv(paths, filesystem=fs) + + dsdf = ds.to_pandas() + df = pd.concat(dfs).reset_index(drop=True) + pd.testing.assert_frame_equal(df, dsdf) + + @pytest.mark.parametrize( "fs,data_path,endpoint_url", [ diff --git a/python/ray/data/tests/test_dataset_ecosystem.py b/python/ray/data/tests/test_dataset_ecosystem.py new file mode 100644 index 0000000000000..e3c19d130f0f6 --- /dev/null +++ b/python/ray/data/tests/test_dataset_ecosystem.py @@ -0,0 +1,145 @@ +import numpy as np +import pandas as pd +import pyarrow as pa +import pytest + +import ray +from ray.data.extensions.tensor_extension import ( + ArrowTensorArray, + ArrowTensorType, + TensorArray, + TensorDtype, +) +from ray.data.tests.conftest import * # noqa +from ray.tests.conftest import * # noqa + + +def test_from_dask(ray_start_regular_shared): + import dask.dataframe as dd + + df = pd.DataFrame({"one": list(range(100)), "two": list(range(100))}) + ddf = dd.from_pandas(df, npartitions=10) + ds = ray.data.from_dask(ddf) + dfds = ds.to_pandas() + assert df.equals(dfds) + + +@pytest.mark.parametrize("ds_format", ["pandas", "arrow"]) +def test_to_dask(ray_start_regular_shared, ds_format): + from ray.util.dask import ray_dask_get + + df1 = pd.DataFrame({"one": [1, 2, 3], "two": ["a", "b", "c"]}) + df2 = pd.DataFrame({"one": [4, 5, 6], "two": ["e", "f", "g"]}) + df = pd.concat([df1, df2]) + ds = ray.data.from_pandas([df1, df2]) + if ds_format == "arrow": + ds = ds.map_batches(lambda df: df, batch_format="pyarrow", batch_size=None) + ddf 
= ds.to_dask() + meta = ddf._meta + # Check metadata. + assert isinstance(meta, pd.DataFrame) + assert meta.empty + assert list(meta.columns) == ["one", "two"] + assert list(meta.dtypes) == [np.int64, object] + # Explicit Dask-on-Ray + assert df.equals(ddf.compute(scheduler=ray_dask_get)) + # Implicit Dask-on-Ray. + assert df.equals(ddf.compute()) + + # Explicit metadata. + df1["two"] = df1["two"].astype(pd.StringDtype()) + df2["two"] = df2["two"].astype(pd.StringDtype()) + df = pd.concat([df1, df2]) + ds = ray.data.from_pandas([df1, df2]) + if ds_format == "arrow": + ds = ds.map_batches(lambda df: df, batch_format="pyarrow", batch_size=None) + ddf = ds.to_dask( + meta=pd.DataFrame( + {"one": pd.Series(dtype=np.int16), "two": pd.Series(dtype=pd.StringDtype())} + ), + ) + meta = ddf._meta + # Check metadata. + assert isinstance(meta, pd.DataFrame) + assert meta.empty + assert list(meta.columns) == ["one", "two"] + assert list(meta.dtypes) == [np.int16, pd.StringDtype()] + # Explicit Dask-on-Ray + assert df.equals(ddf.compute(scheduler=ray_dask_get)) + # Implicit Dask-on-Ray. + assert df.equals(ddf.compute()) + + +def test_to_dask_tensor_column_cast_pandas(ray_start_regular_shared): + # Check that tensor column casting occurs when converting a Dataset to a Dask + # DataFrame. 
+ data = np.arange(12).reshape((3, 2, 2)) + ctx = ray.data.context.DatasetContext.get_current() + original = ctx.enable_tensor_extension_casting + try: + ctx.enable_tensor_extension_casting = True + in_df = pd.DataFrame({"a": TensorArray(data)}) + ds = ray.data.from_pandas(in_df) + dtypes = ds.schema().types + assert len(dtypes) == 1 + assert isinstance(dtypes[0], TensorDtype) + out_df = ds.to_dask().compute() + assert out_df["a"].dtype.type is np.object_ + expected_df = pd.DataFrame({"a": list(data)}) + pd.testing.assert_frame_equal(out_df, expected_df) + finally: + ctx.enable_tensor_extension_casting = original + + +def test_to_dask_tensor_column_cast_arrow(ray_start_regular_shared): + # Check that tensor column casting occurs when converting a Dataset to a Dask + # DataFrame. + data = np.arange(12).reshape((3, 2, 2)) + ctx = ray.data.context.DatasetContext.get_current() + original = ctx.enable_tensor_extension_casting + try: + ctx.enable_tensor_extension_casting = True + in_table = pa.table({"a": ArrowTensorArray.from_numpy(data)}) + ds = ray.data.from_arrow(in_table) + dtype = ds.schema().field(0).type + assert isinstance(dtype, ArrowTensorType) + out_df = ds.to_dask().compute() + assert out_df["a"].dtype.type is np.object_ + expected_df = pd.DataFrame({"a": list(data)}) + pd.testing.assert_frame_equal(out_df, expected_df) + finally: + ctx.enable_tensor_extension_casting = original + + +def test_from_modin(ray_start_regular_shared): + import modin.pandas as mopd + + df = pd.DataFrame( + {"one": list(range(100)), "two": list(range(100))}, + ) + modf = mopd.DataFrame(df) + ds = ray.data.from_modin(modf) + dfds = ds.to_pandas() + assert df.equals(dfds) + + +def test_to_modin(ray_start_regular_shared): + # create two modin dataframes + # one directly from a pandas dataframe, and + # another from ray.dataset created from the original pandas dataframe + # + import modin.pandas as mopd + + df = pd.DataFrame( + {"one": list(range(100)), "two": list(range(100))}, + ) + 
modf1 = mopd.DataFrame(df) + ds = ray.data.from_pandas([df]) + modf2 = ds.to_modin() + assert modf1.equals(modf2) + + +if __name__ == "__main__": + import sys + + sys.exit(pytest.main(["-v", __file__])) diff --git a/python/ray/data/tests/test_dataset_image.py b/python/ray/data/tests/test_dataset_image.py index b22fa3b378477..8d495b02bf4a7 100644 --- a/python/ray/data/tests/test_dataset_image.py +++ b/python/ray/data/tests/test_dataset_image.py @@ -1,5 +1,4 @@ import os -import time from typing import Dict import numpy as np @@ -194,16 +193,9 @@ def test_dynamic_block_split(ray_start_regular_shared): # Verify dynamic block splitting taking effect to generate more blocks. assert ds.num_blocks() == 3 - # NOTE: Need to wait for 1 second before checking stats, because we report - # stats to stats actors asynchronously when returning the blocks metadata. - # TODO(chengsu): clean it up after refactoring lazy block list. - time.sleep(1) - assert "3 blocks executed" in ds.stats() - # Test union of same datasets union_ds = ds.union(ds, ds, ds).fully_executed() assert union_ds.num_blocks() == 12 - assert "3 blocks executed" in union_ds.stats() finally: ctx.target_max_block_size = target_max_block_size ctx.block_splitting_enabled = block_splitting_enabled diff --git a/python/ray/data/tests/test_dataset_map.py b/python/ray/data/tests/test_dataset_map.py new file mode 100644 index 0000000000000..b082182e08efe --- /dev/null +++ b/python/ray/data/tests/test_dataset_map.py @@ -0,0 +1,925 @@ +import itertools +import math +import os +import signal +import time +from typing import Iterator + +import pandas as pd +import pyarrow as pa +import pyarrow.parquet as pq +import pytest + +import ray +from ray._private.test_utils import wait_for_condition +from ray.data.block import BlockAccessor +from ray.data.context import DatasetContext +from ray.data.tests.conftest import * # noqa +from ray.tests.conftest import * # noqa + + +def maybe_pipeline(ds, enabled): + if enabled: + return 
ds.window(blocks_per_window=1) + else: + return ds + + +@pytest.mark.parametrize("pipelined", [False, True]) +def test_basic_actors(shutdown_only, pipelined): + ray.init(num_cpus=6) + n = 5 + ds = ray.data.range(n) + ds = maybe_pipeline(ds, pipelined) + assert sorted(ds.map(lambda x: x + 1, compute="actors").take()) == list( + range(1, n + 1) + ) + + # Should still work even if num actors > num cpus. + ds = ray.data.range(n) + ds = maybe_pipeline(ds, pipelined) + assert sorted( + ds.map(lambda x: x + 1, compute=ray.data.ActorPoolStrategy(4, 4)).take() + ) == list(range(1, n + 1)) + + # Test setting custom max inflight tasks. + ds = ray.data.range(10, parallelism=5) + ds = maybe_pipeline(ds, pipelined) + assert sorted( + ds.map( + lambda x: x + 1, + compute=ray.data.ActorPoolStrategy(max_tasks_in_flight_per_actor=3), + ).take() + ) == list(range(1, 11)) + + # Test invalid max tasks inflight arg. + with pytest.raises(ValueError): + ray.data.range(10).map( + lambda x: x, + compute=ray.data.ActorPoolStrategy(max_tasks_in_flight_per_actor=0), + ) + + # Test min no more than max check. + with pytest.raises(ValueError): + ray.data.range(10).map(lambda x: x, compute=ray.data.ActorPoolStrategy(8, 4)) + + +def test_callable_classes(shutdown_only): + ray.init(num_cpus=2) + ds = ray.data.range(10, parallelism=10) + + class StatefulFn: + def __init__(self): + self.num_reuses = 0 + + def __call__(self, x): + r = self.num_reuses + self.num_reuses += 1 + return r + + # Need to specify compute explicitly. + with pytest.raises(ValueError): + ds.map(StatefulFn).take() + + # Need to specify actor compute strategy. + with pytest.raises(ValueError): + ds.map(StatefulFn, compute="tasks").take() + + # Need to specify compute explicitly. + with pytest.raises(ValueError): + ds.flat_map(StatefulFn).take() + + # Need to specify actor compute strategy. + with pytest.raises(ValueError): + ds.flat_map(StatefulFn, compute="tasks") + + # Need to specify compute explicitly. 
+ with pytest.raises(ValueError): + ds.filter(StatefulFn).take() + + # Need to specify actor compute strategy. + with pytest.raises(ValueError): + ds.filter(StatefulFn, compute="tasks") + + # map + actor_reuse = ds.map(StatefulFn, compute="actors").take() + assert sorted(actor_reuse) == list(range(10)), actor_reuse + + class StatefulFn: + def __init__(self): + self.num_reuses = 0 + + def __call__(self, x): + r = self.num_reuses + self.num_reuses += 1 + return [r] + + # flat map + actor_reuse = ds.flat_map(StatefulFn, compute="actors").take() + assert sorted(actor_reuse) == list(range(10)), actor_reuse + + # map batches + actor_reuse = ds.map_batches(StatefulFn, batch_size=1, compute="actors").take() + assert sorted(actor_reuse) == list(range(10)), actor_reuse + + class StatefulFn: + def __init__(self): + self.num_reuses = 0 + + def __call__(self, x): + r = self.num_reuses + self.num_reuses += 1 + return r > 0 + + # filter + actor_reuse = ds.filter(StatefulFn, compute="actors").take() + assert len(actor_reuse) == 9, actor_reuse + + +def test_transform_failure(shutdown_only): + ray.init(num_cpus=2) + ds = ray.data.from_items([0, 10], parallelism=2) + + def mapper(x): + time.sleep(x) + raise ValueError("oops") + return x + + with pytest.raises(ray.exceptions.RayTaskError): + ds.map(mapper).fully_executed() + + +def test_flat_map_generator(ray_start_regular_shared): + ds = ray.data.range(3) + + def map_generator(item: int) -> Iterator[int]: + for _ in range(2): + yield item + 1 + + assert sorted(ds.flat_map(map_generator).take()) == [1, 1, 2, 2, 3, 3] + + +def test_add_column(ray_start_regular_shared): + ds = ray.data.range(5).add_column("foo", lambda x: 1) + assert ds.take(1) == [{"value": 0, "foo": 1}] + + ds = ray.data.range_table(5).add_column("foo", lambda x: x["value"] + 1) + assert ds.take(1) == [{"value": 0, "foo": 1}] + + ds = ray.data.range_table(5).add_column("value", lambda x: x["value"] + 1) + assert ds.take(2) == [{"value": 1}, {"value": 2}] + + with 
pytest.raises(ValueError): + ds = ray.data.range(5).add_column("value", 0) + + +def test_drop_columns(ray_start_regular_shared, tmp_path): + df = pd.DataFrame({"col1": [1, 2, 3], "col2": [2, 3, 4], "col3": [3, 4, 5]}) + ds1 = ray.data.from_pandas(df) + ds1.write_parquet(str(tmp_path)) + ds2 = ray.data.read_parquet(str(tmp_path)) + + for ds in [ds1, ds2]: + assert ds.drop_columns(["col2"]).take(1) == [{"col1": 1, "col3": 3}] + assert ds.drop_columns(["col1", "col3"]).take(1) == [{"col2": 2}] + assert ds.drop_columns([]).take(1) == [{"col1": 1, "col2": 2, "col3": 3}] + assert ds.drop_columns(["col1", "col2", "col3"]).take(1) == [{}] + assert ds.drop_columns(["col1", "col1", "col2", "col1"]).take(1) == [ + {"col3": 3} + ] + # Test dropping non-existent column + with pytest.raises(KeyError): + ds.drop_columns(["dummy_col", "col1", "col2"]).fully_executed() + + +def test_select_columns(ray_start_regular_shared): + # Test pandas and arrow + df = pd.DataFrame({"col1": [1, 2, 3], "col2": [2, 3, 4], "col3": [3, 4, 5]}) + ds1 = ray.data.from_pandas(df) + assert ds1.dataset_format() == "pandas" + + ds2 = ds1.map_batches(lambda pa: pa, batch_size=1, batch_format="pyarrow") + assert ds2.dataset_format() == "arrow" + + for each_ds in [ds1, ds2]: + assert each_ds.select_columns(cols=[]).take(1) == [{}] + assert each_ds.select_columns(cols=["col1", "col2", "col3"]).take(1) == [ + {"col1": 1, "col2": 2, "col3": 3} + ] + assert each_ds.select_columns(cols=["col1", "col2"]).take(1) == [ + {"col1": 1, "col2": 2} + ] + assert each_ds.select_columns(cols=["col2", "col1"]).take(1) == [ + {"col1": 1, "col2": 2} + ] + # Test selecting columns with duplicates + assert each_ds.select_columns(cols=["col1", "col2", "col2"]).schema().names == [ + "col1", + "col2", + "col2", + ] + # Test selecting a column that is not in the dataset schema + with pytest.raises(KeyError): + each_ds.select_columns(cols=["col1", "col2", "dummy_col"]).fully_executed() + + # Test simple + ds3 = ray.data.range(10) + 
assert ds3.dataset_format() == "simple" + with pytest.raises(ValueError): + ds3.select_columns(cols=[]).fully_executed() + + +def test_map_batches_basic(ray_start_regular_shared, tmp_path, restore_dataset_context): + ctx = DatasetContext.get_current() + ctx.execution_options.preserve_order = True + + # Test input validation + ds = ray.data.range(5) + with pytest.raises(ValueError): + ds.map_batches(lambda x: x + 1, batch_format="pyarrow", batch_size=-1).take() + + # Set up. + df = pd.DataFrame({"one": [1, 2, 3], "two": [2, 3, 4]}) + table = pa.Table.from_pandas(df) + pq.write_table(table, os.path.join(tmp_path, "test1.parquet")) + + # Test pandas + ds = ray.data.read_parquet(str(tmp_path)) + ds2 = ds.map_batches(lambda df: df + 1, batch_size=1, batch_format="pandas") + assert ds2.dataset_format() == "pandas" + ds_list = ds2.take() + values = [s["one"] for s in ds_list] + assert values == [2, 3, 4] + values = [s["two"] for s in ds_list] + assert values == [3, 4, 5] + + # Test Pyarrow + ds = ray.data.read_parquet(str(tmp_path)) + ds2 = ds.map_batches(lambda pa: pa, batch_size=1, batch_format="pyarrow") + assert ds2.dataset_format() == "arrow" + ds_list = ds2.take() + values = [s["one"] for s in ds_list] + assert values == [1, 2, 3] + values = [s["two"] for s in ds_list] + assert values == [2, 3, 4] + + # Test batch + size = 300 + ds = ray.data.range(size) + ds2 = ds.map_batches(lambda df: df + 1, batch_size=17, batch_format="pandas") + assert ds2.dataset_format() == "pandas" + ds_list = ds2.take_all() + for i in range(size): + # The pandas column is "value", and it originally has rows from 0~299. + # After the map batch, it should have 1~300. 
+ row = ds_list[i] + assert row["value"] == i + 1 + assert ds.count() == 300 + + # Test the lambda returns different types than the batch_format + # pandas => list block + ds = ray.data.read_parquet(str(tmp_path)) + ds2 = ds.map_batches(lambda df: [1], batch_size=1) + assert ds2.dataset_format() == "simple" + ds_list = ds2.take() + assert ds_list == [1, 1, 1] + assert ds.count() == 3 + + # pyarrow => list block + ds = ray.data.read_parquet(str(tmp_path)) + ds2 = ds.map_batches(lambda df: [1], batch_size=1, batch_format="pyarrow") + assert ds2.dataset_format() == "simple" + ds_list = ds2.take() + assert ds_list == [1, 1, 1] + assert ds.count() == 3 + + # Test the wrong return value raises an exception. + ds = ray.data.read_parquet(str(tmp_path)) + with pytest.raises(ValueError): + ds_list = ds.map_batches( + lambda df: 1, batch_size=2, batch_format="pyarrow" + ).take() + + +def test_map_batches_extra_args(shutdown_only, tmp_path): + ray.shutdown() + ray.init(num_cpus=2) + + def put(x): + # We only support automatic deref in the legacy backend. + if DatasetContext.get_current().new_execution_backend: + return x + else: + return ray.put(x) + + # Test input validation + ds = ray.data.range(5) + + class Foo: + def __call__(self, df): + return df + + with pytest.raises(ValueError): + # CallableClass not supported for task compute strategy, which is the default. + ds.map_batches(Foo) + + with pytest.raises(ValueError): + # CallableClass not supported for task compute strategy. + ds.map_batches(Foo, compute="tasks") + + with pytest.raises(ValueError): + # fn_constructor_args and fn_constructor_kwargs only supported for actor + # compute strategy. + ds.map_batches( + lambda x: x, + compute="tasks", + fn_constructor_args=(1,), + fn_constructor_kwargs={"a": 1}, + ) + + with pytest.raises(ValueError): + # fn_constructor_args and fn_constructor_kwargs only supported for callable + # class UDFs. 
+ ds.map_batches( + lambda x: x, + compute="actors", + fn_constructor_args=(1,), + fn_constructor_kwargs={"a": 1}, + ) + + # Set up. + df = pd.DataFrame({"one": [1, 2, 3], "two": [2, 3, 4]}) + table = pa.Table.from_pandas(df) + pq.write_table(table, os.path.join(tmp_path, "test1.parquet")) + + # Test extra UDF args. + # Test positional. + def udf(batch, a): + assert a == 1 + return batch + a + + ds = ray.data.read_parquet(str(tmp_path)) + ds2 = ds.map_batches( + udf, + batch_size=1, + batch_format="pandas", + fn_args=(put(1),), + ) + assert ds2.dataset_format() == "pandas" + ds_list = ds2.take() + values = [s["one"] for s in ds_list] + assert values == [2, 3, 4] + values = [s["two"] for s in ds_list] + assert values == [3, 4, 5] + + # Test kwargs. + def udf(batch, b=None): + assert b == 2 + return b * batch + + ds = ray.data.read_parquet(str(tmp_path)) + ds2 = ds.map_batches( + udf, + batch_size=1, + batch_format="pandas", + fn_kwargs={"b": put(2)}, + ) + assert ds2.dataset_format() == "pandas" + ds_list = ds2.take() + values = [s["one"] for s in ds_list] + assert values == [2, 4, 6] + values = [s["two"] for s in ds_list] + assert values == [4, 6, 8] + + # Test both. + def udf(batch, a, b=None): + assert a == 1 + assert b == 2 + return b * batch + a + + ds = ray.data.read_parquet(str(tmp_path)) + ds2 = ds.map_batches( + udf, + batch_size=1, + batch_format="pandas", + fn_args=(put(1),), + fn_kwargs={"b": put(2)}, + ) + assert ds2.dataset_format() == "pandas" + ds_list = ds2.take() + values = [s["one"] for s in ds_list] + assert values == [3, 5, 7] + values = [s["two"] for s in ds_list] + assert values == [5, 7, 9] + + # Test constructor UDF args. + # Test positional. 
+ class CallableFn: + def __init__(self, a): + assert a == 1 + self.a = a + + def __call__(self, x): + return x + self.a + + ds = ray.data.read_parquet(str(tmp_path)) + ds2 = ds.map_batches( + CallableFn, + batch_size=1, + batch_format="pandas", + compute="actors", + fn_constructor_args=(put(1),), + ) + assert ds2.dataset_format() == "pandas" + ds_list = ds2.take() + values = [s["one"] for s in ds_list] + assert values == [2, 3, 4] + values = [s["two"] for s in ds_list] + assert values == [3, 4, 5] + + # Test kwarg. + class CallableFn: + def __init__(self, b=None): + assert b == 2 + self.b = b + + def __call__(self, x): + return self.b * x + + ds = ray.data.read_parquet(str(tmp_path)) + ds2 = ds.map_batches( + CallableFn, + batch_size=1, + batch_format="pandas", + compute="actors", + fn_constructor_kwargs={"b": put(2)}, + ) + assert ds2.dataset_format() == "pandas" + ds_list = ds2.take() + values = [s["one"] for s in ds_list] + assert values == [2, 4, 6] + values = [s["two"] for s in ds_list] + assert values == [4, 6, 8] + + # Test both. + class CallableFn: + def __init__(self, a, b=None): + assert a == 1 + assert b == 2 + self.a = a + self.b = b + + def __call__(self, x): + return self.b * x + self.a + + ds = ray.data.read_parquet(str(tmp_path)) + ds2 = ds.map_batches( + CallableFn, + batch_size=1, + batch_format="pandas", + compute="actors", + fn_constructor_args=(put(1),), + fn_constructor_kwargs={"b": put(2)}, + ) + assert ds2.dataset_format() == "pandas" + ds_list = ds2.take() + values = [s["one"] for s in ds_list] + assert values == [3, 5, 7] + values = [s["two"] for s in ds_list] + assert values == [5, 7, 9] + + # Test callable chain. 
+ ds = ray.data.read_parquet(str(tmp_path)) + fn_constructor_args = (put(1),) + fn_constructor_kwargs = {"b": put(2)} + ds2 = ( + ds.lazy() + .map_batches( + CallableFn, + batch_size=1, + batch_format="pandas", + compute="actors", + fn_constructor_args=fn_constructor_args, + fn_constructor_kwargs=fn_constructor_kwargs, + ) + .map_batches( + CallableFn, + batch_size=1, + batch_format="pandas", + compute="actors", + fn_constructor_args=fn_constructor_args, + fn_constructor_kwargs=fn_constructor_kwargs, + ) + ) + assert ds2.dataset_format() == "pandas" + ds_list = ds2.take() + values = [s["one"] for s in ds_list] + assert values == [7, 11, 15] + values = [s["two"] for s in ds_list] + assert values == [11, 15, 19] + + # Test function + callable chain. + ds = ray.data.read_parquet(str(tmp_path)) + fn_constructor_args = (put(1),) + fn_constructor_kwargs = {"b": put(2)} + ds2 = ( + ds.lazy() + .map_batches( + lambda df, a, b=None: b * df + a, + batch_size=1, + batch_format="pandas", + compute="actors", + fn_args=(put(1),), + fn_kwargs={"b": put(2)}, + ) + .map_batches( + CallableFn, + batch_size=1, + batch_format="pandas", + compute="actors", + fn_constructor_args=fn_constructor_args, + fn_constructor_kwargs=fn_constructor_kwargs, + ) + ) + assert ds2.dataset_format() == "pandas" + ds_list = ds2.take() + values = [s["one"] for s in ds_list] + assert values == [7, 11, 15] + values = [s["two"] for s in ds_list] + assert values == [11, 15, 19] + + +def test_map_batches_generator(ray_start_regular_shared, tmp_path): + # Set up. 
+ df = pd.DataFrame({"one": [1, 2, 3], "two": [2, 3, 4]}) + table = pa.Table.from_pandas(df) + pq.write_table(table, os.path.join(tmp_path, "test1.parquet")) + + def pandas_generator(batch: pd.DataFrame) -> Iterator[pd.DataFrame]: + for i in range(len(batch)): + yield batch.iloc[[i]] + 1 + + ds = ray.data.read_parquet(str(tmp_path)) + ds2 = ds.map_batches(pandas_generator, batch_size=1, batch_format="pandas") + assert ds2.dataset_format() == "pandas" + ds_list = ds2.take() + values = sorted([s["one"] for s in ds_list]) + assert values == [2, 3, 4] + values = sorted([s["two"] for s in ds_list]) + assert values == [3, 4, 5] + + def fail_generator(batch): + for i in range(len(batch)): + yield i + + # Test the wrong return value raises an exception. + ds = ray.data.read_parquet(str(tmp_path)) + with pytest.raises(ValueError): + ds_list = ds.map_batches( + fail_generator, batch_size=2, batch_format="pyarrow" + ).take() + + +def test_map_batches_actors_preserves_order(shutdown_only): + ray.shutdown() + ray.init(num_cpus=2) + # Test that actor compute model preserves block order. + ds = ray.data.range(10, parallelism=5) + assert ds.map_batches(lambda x: x, compute="actors").take() == list(range(10)) + + +@pytest.mark.parametrize( + "num_rows,num_blocks,batch_size", + [ + (10, 5, 2), + (10, 1, 10), + (12, 3, 2), + ], +) +def test_map_batches_batch_mutation( + ray_start_regular_shared, num_rows, num_blocks, batch_size, restore_dataset_context +): + ctx = DatasetContext.get_current() + ctx.execution_options.preserve_order = True + + # Test that batch mutation works without encountering a read-only error (e.g. if the + # batch is a zero-copy view on data in the object store). + def mutate(df): + df["value"] += 1 + return df + + ds = ray.data.range_table(num_rows, parallelism=num_blocks).repartition(num_blocks) + # Convert to Pandas blocks. + ds = ds.map_batches(lambda df: df, batch_format="pandas", batch_size=None) + + # Apply UDF that mutates the batches. 
+ ds = ds.map_batches(mutate, batch_size=batch_size) + assert [row["value"] for row in ds.iter_rows()] == list(range(1, num_rows + 1)) + + +@pytest.mark.parametrize( + "num_rows,num_blocks,batch_size", + [ + (10, 5, 2), + (10, 1, 10), + (12, 3, 2), + ], +) +def test_map_batches_batch_zero_copy( + ray_start_regular_shared, num_rows, num_blocks, batch_size +): + # Test that batches are zero-copy read-only views when zero_copy_batch=True. + def mutate(df): + # Check that batch is read-only. + assert not df.values.flags.writeable + df["value"] += 1 + return df + + ds = ray.data.range_table(num_rows, parallelism=num_blocks).repartition(num_blocks) + # Convert to Pandas blocks. + ds = ds.map_batches(lambda df: df, batch_format="pandas", batch_size=None) + ds.fully_executed() + + # Apply UDF that mutates the batches, which should fail since the batch is + # read-only. + with pytest.raises(ValueError, match="tried to mutate a zero-copy read-only batch"): + ds = ds.map_batches(mutate, batch_size=batch_size, zero_copy_batch=True) + ds.fully_executed() + + +BLOCK_BUNDLING_TEST_CASES = [ + (block_size, batch_size) + for batch_size in range(1, 8) + for block_size in range(1, 2 * batch_size + 1) +] + + +@pytest.mark.parametrize("block_size,batch_size", BLOCK_BUNDLING_TEST_CASES) +def test_map_batches_block_bundling_auto( + ray_start_regular_shared, block_size, batch_size +): + # Ensure that we test at least 2 batches worth of blocks. + num_blocks = max(10, 2 * batch_size // block_size) + ds = ray.data.range(num_blocks * block_size, parallelism=num_blocks) + # Confirm that we have the expected number of initial blocks. + assert ds.num_blocks() == num_blocks + + # Blocks should be bundled up to the batch size. + ds1 = ds.map_batches(lambda x: x, batch_size=batch_size).fully_executed() + assert ds1.num_blocks() == math.ceil(num_blocks / max(batch_size // block_size, 1)) + + # Blocks should not be bundled up when batch_size is not specified. 
+ ds2 = ds.map_batches(lambda x: x).fully_executed() + assert ds2.num_blocks() == num_blocks + + +@pytest.mark.parametrize( + "block_sizes,batch_size,expected_num_blocks", + [ + ([1, 2], 3, 1), + ([2, 2, 1], 3, 2), + ([1, 2, 3, 4], 4, 3), + ([3, 1, 1, 3], 4, 2), + ([2, 4, 1, 8], 4, 4), + ([1, 1, 1, 1], 4, 1), + ([1, 0, 3, 2], 4, 2), + ([4, 4, 4, 4], 4, 4), + ], +) +def test_map_batches_block_bundling_skewed_manual( + ray_start_regular_shared, block_sizes, batch_size, expected_num_blocks +): + num_blocks = len(block_sizes) + ds = ray.data.from_pandas( + [pd.DataFrame({"a": [1] * block_size}) for block_size in block_sizes] + ) + # Confirm that we have the expected number of initial blocks. + assert ds.num_blocks() == num_blocks + ds = ds.map_batches(lambda x: x, batch_size=batch_size).fully_executed() + + # Blocks should be bundled up to the batch size. + assert ds.num_blocks() == expected_num_blocks + + +BLOCK_BUNDLING_SKEWED_TEST_CASES = [ + (block_sizes, batch_size) + for batch_size in range(1, 4) + for num_blocks in range(1, batch_size + 1) + for block_sizes in itertools.product( + range(1, 2 * batch_size + 1), repeat=num_blocks + ) +] + + +@pytest.mark.parametrize("block_sizes,batch_size", BLOCK_BUNDLING_SKEWED_TEST_CASES) +def test_map_batches_block_bundling_skewed_auto( + ray_start_regular_shared, block_sizes, batch_size +): + num_blocks = len(block_sizes) + ds = ray.data.from_pandas( + [pd.DataFrame({"a": [1] * block_size}) for block_size in block_sizes] + ) + # Confirm that we have the expected number of initial blocks. + assert ds.num_blocks() == num_blocks + ds = ds.map_batches(lambda x: x, batch_size=batch_size).fully_executed() + curr = 0 + num_out_blocks = 0 + for block_size in block_sizes: + if curr > 0 and curr + block_size > batch_size: + num_out_blocks += 1 + curr = 0 + curr += block_size + if curr > 0: + num_out_blocks += 1 + + # Blocks should be bundled up to the batch size. 
+ assert ds.num_blocks() == num_out_blocks + + +def test_map_with_mismatched_columns(ray_start_regular_shared): + def bad_fn(row): + if row > 5: + return {"a": "hello1"} + else: + return {"b": "hello1"} + + def good_fn(row): + if row > 5: + return {"a": "hello1", "b": "hello2"} + else: + return {"b": "hello2", "a": "hello1"} + + ds = ray.data.range(10, parallelism=1) + error_message = "Current row has different columns compared to previous rows." + with pytest.raises(ValueError) as e: + ds.map(bad_fn).fully_executed() + assert error_message in str(e.value) + ds_map = ds.map(good_fn) + assert ds_map.take() == [{"a": "hello1", "b": "hello2"} for _ in range(10)] + + +def test_map_batches_preserve_empty_blocks(ray_start_regular_shared): + ds = ray.data.range(10, parallelism=10) + ds = ds.map_batches(lambda x: []) + ds = ds.map_batches(lambda x: x) + assert ds.num_blocks() == 10, ds + + +def test_map_batches_combine_empty_blocks(ray_start_regular_shared): + xs = [x % 3 for x in list(range(100))] + + # ds1 has 1 block which contains 100 rows. + ds1 = ray.data.from_items(xs).repartition(1).sort().map_batches(lambda x: x) + assert ds1._block_num_rows() == [100] + + # ds2 has 30 blocks, but only 3 of them are non-empty + ds2 = ( + ray.data.from_items(xs) + .repartition(30) + .sort() + .map_batches(lambda x: x, batch_size=1) + ) + assert len(ds2._block_num_rows()) == 3 + count = sum(1 for x in ds2._block_num_rows() if x > 0) + assert count == 3 + + # The number of partitions should not affect the map_batches() result. 
+ assert ds1.take_all() == ds2.take_all() + + +def test_random_sample(ray_start_regular_shared): + import math + + def ensure_sample_size_close(dataset, sample_percent=0.5): + r1 = ds.random_sample(sample_percent) + assert math.isclose( + r1.count(), int(ds.count() * sample_percent), rel_tol=2, abs_tol=2 + ) + + ds = ray.data.range(10, parallelism=2) + ensure_sample_size_close(ds) + + ds = ray.data.range_table(10, parallelism=2) + ensure_sample_size_close(ds) + + ds = ray.data.range_tensor(5, parallelism=2, shape=(2, 2)) + ensure_sample_size_close(ds) + + # imbalanced datasets + ds1 = ray.data.range(1, parallelism=1) + ds2 = ray.data.range(2, parallelism=1) + ds3 = ray.data.range(3, parallelism=1) + # noinspection PyTypeChecker + ds = ds1.union(ds2).union(ds3) + ensure_sample_size_close(ds) + # Small datasets + ds1 = ray.data.range(5, parallelism=5) + ensure_sample_size_close(ds1) + + +def test_random_sample_checks(ray_start_regular_shared): + with pytest.raises(ValueError): + # Cannot sample -1 + ray.data.range(1).random_sample(-1) + with pytest.raises(ValueError): + # Cannot sample from empty dataset + ray.data.range(0).random_sample(0.2) + with pytest.raises(ValueError): + # Cannot sample fraction > 1 + ray.data.range(1).random_sample(10) + + +# NOTE: All tests above share a Ray cluster, while the tests below do not. These +# tests should only be carefully reordered to retain this invariant! 
+ + +def test_actor_pool_strategy_apply_interrupt(shutdown_only): + """Test that _apply kills the actor pool if an interrupt is raised.""" + ray.shutdown() + + ray.init(include_dashboard=False, num_cpus=1) + + cpus = ray.available_resources()["CPU"] + ds = ray.data.range(5, parallelism=5) + aps = ray.data.ActorPoolStrategy(max_size=5) + blocks = ds._plan.execute() + + # Start some actors, the first one sends a SIGINT, emulating a KeyboardInterrupt + def test_func(block): + for i, _ in enumerate(BlockAccessor.for_block(block).iter_rows()): + if i == 0: + os.kill(os.getpid(), signal.SIGINT) + else: + time.sleep(1000) + return block + + # No need to test ActorPoolStrategy in new execution backend. + if not DatasetContext.get_current().new_execution_backend: + with pytest.raises(ray.exceptions.RayTaskError): + aps._apply(test_func, {}, blocks, False) + + # Check that all actors have been killed by counting the available CPUs + wait_for_condition(lambda: (ray.available_resources().get("CPU", 0) == cpus)) + + +def test_actor_pool_strategy_default_num_actors(shutdown_only): + def f(x): + import time + + time.sleep(1) + return x + + num_cpus = 5 + ray.init(num_cpus=num_cpus) + compute_strategy = ray.data.ActorPoolStrategy() + ray.data.range(10, parallelism=10).map_batches( + f, batch_size=1, compute=compute_strategy + ).fully_executed() + + # The new execution backend is not using the ActorPoolStrategy under + # the hood, so the expectation here applies only to the old backend. + # TODO(https://github.com/ray-project/ray/issues/31723): we should check + # the num of workers once we have autoscaling in new execution backend. 
+ if not DatasetContext.get_current().new_execution_backend: + expected_max_num_workers = math.ceil( + num_cpus * (1 / compute_strategy.ready_to_total_workers_ratio) + ) + assert ( + compute_strategy.num_workers >= num_cpus + and compute_strategy.num_workers <= expected_max_num_workers + ), "Number of actors is out of the expected bound" + + +def test_actor_pool_strategy_bundles_to_max_actors(shutdown_only): + """Tests that blocks are bundled up to the specified max number of actors.""" + + def f(x): + return x + + max_size = 2 + compute_strategy = ray.data.ActorPoolStrategy(max_size=max_size) + ds = ( + ray.data.range(10, parallelism=10) + .map_batches(f, batch_size=None, compute=compute_strategy) + .fully_executed() + ) + + # TODO(https://github.com/ray-project/ray/issues/31723): implement the feature + # of capping bundle size by actor pool size, and then re-enable this test. + if not DatasetContext.get_current().new_execution_backend: + assert f"{max_size}/{max_size} blocks" in ds.stats() + + # Check batch size is still respected. 
+ ds = ( + ray.data.range(10, parallelism=10) + .map_batches(f, batch_size=10, compute=compute_strategy) + .fully_executed() + ) + + assert "1/1 blocks" in ds.stats() + + +if __name__ == "__main__": + import sys + + sys.exit(pytest.main(["-v", __file__])) diff --git a/python/ray/data/tests/test_dataset_numpy.py b/python/ray/data/tests/test_dataset_numpy.py index d882315ce72b2..b36c549812321 100644 --- a/python/ray/data/tests/test_dataset_numpy.py +++ b/python/ray/data/tests/test_dataset_numpy.py @@ -122,8 +122,11 @@ def test_numpy_roundtrip(ray_start_regular_shared, fs, data_path): ds.write_numpy(data_path, filesystem=fs) ds = ray.data.read_numpy(data_path, filesystem=fs) assert str(ds) == ( - "Dataset(num_blocks=2, num_rows=?, " - "schema={__value__: ArrowTensorType(shape=(1,), dtype=int64)})" + "Dataset(\n" + " num_blocks=2,\n" + " num_rows=?,\n" + " schema={__value__: ArrowTensorType(shape=(1,), dtype=int64)}\n" + ")" ) np.testing.assert_equal(ds.take(2), [np.array([0]), np.array([1])]) @@ -134,8 +137,11 @@ def test_numpy_read(ray_start_regular_shared, tmp_path): np.save(os.path.join(path, "test.npy"), np.expand_dims(np.arange(0, 10), 1)) ds = ray.data.read_numpy(path) assert str(ds) == ( - "Dataset(num_blocks=1, num_rows=10, " - "schema={__value__: ArrowTensorType(shape=(1,), dtype=int64)})" + "Dataset(\n" + " num_blocks=1,\n" + " num_rows=10,\n" + " schema={__value__: ArrowTensorType(shape=(1,), dtype=int64)}\n" + ")" ) np.testing.assert_equal(ds.take(2), [np.array([0]), np.array([1])]) @@ -147,8 +153,11 @@ def test_numpy_read(ray_start_regular_shared, tmp_path): assert ds.num_blocks() == 1 assert ds.count() == 10 assert str(ds) == ( - "Dataset(num_blocks=1, num_rows=10, " - "schema={__value__: ArrowTensorType(shape=(1,), dtype=int64)})" + "Dataset(\n" + " num_blocks=1,\n" + " num_rows=10,\n" + " schema={__value__: ArrowTensorType(shape=(1,), dtype=int64)}\n" + ")" ) assert [v.item() for v in ds.take(2)] == [0, 1] @@ -160,8 +169,11 @@ def 
test_numpy_read_meta_provider(ray_start_regular_shared, tmp_path): np.save(path, np.expand_dims(np.arange(0, 10), 1)) ds = ray.data.read_numpy(path, meta_provider=FastFileMetadataProvider()) assert str(ds) == ( - "Dataset(num_blocks=1, num_rows=10, " - "schema={__value__: ArrowTensorType(shape=(1,), dtype=int64)})" + "Dataset(\n" + " num_blocks=1,\n" + " num_rows=10,\n" + " schema={__value__: ArrowTensorType(shape=(1,), dtype=int64)}\n" + ")" ) np.testing.assert_equal(ds.take(2), [np.array([0]), np.array([1])]) diff --git a/python/ray/data/tests/test_dataset_pandas.py b/python/ray/data/tests/test_dataset_pandas.py index 75e33872b0e80..b8064e34354fd 100644 --- a/python/ray/data/tests/test_dataset_pandas.py +++ b/python/ray/data/tests/test_dataset_pandas.py @@ -7,7 +7,6 @@ from ray.data.extensions import ( TensorDtype, - TensorArray, ArrowTensorType, ArrowTensorArray, ) @@ -112,14 +111,18 @@ def test_to_pandas_tensor_column_cast_pandas(ray_start_regular_shared): original = ctx.enable_tensor_extension_casting try: ctx.enable_tensor_extension_casting = True - in_df = pd.DataFrame({"a": TensorArray(data)}) + in_df = pd.DataFrame({"a": [data]}) ds = ray.data.from_pandas(in_df) dtypes = ds.schema().types assert len(dtypes) == 1 + # Tensor column should be automatically cast to Tensor extension. assert isinstance(dtypes[0], TensorDtype) + # Original df should not be changed. + assert not isinstance(in_df.dtypes[0], TensorDtype) out_df = ds.to_pandas() + # Column should be cast back to object dtype when returning back to user. 
assert out_df["a"].dtype.type is np.object_ - expected_df = pd.DataFrame({"a": list(data)}) + expected_df = pd.DataFrame({"a": [data]}) pd.testing.assert_frame_equal(out_df, expected_df) finally: ctx.enable_tensor_extension_casting = original diff --git a/python/ray/data/tests/test_dataset_parquet.py b/python/ray/data/tests/test_dataset_parquet.py index 999cef8f56ed1..d0af09650484a 100644 --- a/python/ray/data/tests/test_dataset_parquet.py +++ b/python/ray/data/tests/test_dataset_parquet.py @@ -14,9 +14,11 @@ DefaultFileMetadataProvider, DefaultParquetMetadataProvider, ) +from ray.data.datasource.parquet_base_datasource import ParquetBaseDatasource from ray.data.datasource.parquet_datasource import ( PARALLELIZE_META_FETCH_THRESHOLD, _ParquetDatasourceReader, + ParquetDatasource, ) from ray.data.datasource.file_based_datasource import _unwrap_protocol from ray.data.datasource.parquet_datasource import ( @@ -444,15 +446,21 @@ def test_parquet_read_partitioned(ray_start_regular_shared, fs, data_path): assert ds.schema() is not None input_files = ds.input_files() assert len(input_files) == 2, input_files - assert ( - str(ds) == "Dataset(num_blocks=2, num_rows=6, " - "schema={two: string, " - "one: dictionary})" + assert str(ds) == ( + "Dataset(\n" + " num_blocks=2,\n" + " num_rows=6,\n" + " schema={two: string, " + "one: dictionary}\n" + ")" ), ds - assert ( - repr(ds) == "Dataset(num_blocks=2, num_rows=6, " - "schema={two: string, " - "one: dictionary})" + assert repr(ds) == ( + "Dataset(\n" + " num_blocks=2,\n" + " num_rows=6,\n" + " schema={two: string, " + "one: dictionary}\n" + ")" ), ds check_num_computed(ds, 1, 1) @@ -929,6 +937,57 @@ def test_parquet_reader_batch_size(ray_start_regular_shared, tmp_path): assert ds.count() == 1000 +def test_parquet_datasource_names(ray_start_regular_shared): + assert ParquetBaseDatasource().get_name() == "ParquetBulk" + assert ParquetDatasource().get_name() == "Parquet" + + +# NOTE: All tests above share a Ray cluster, while 
the tests below do not. These +# tests should only be carefully reordered to retain this invariant! + + +def test_parquet_read_spread(ray_start_cluster, tmp_path): + ray.shutdown() + cluster = ray_start_cluster + cluster.add_node( + resources={"bar:1": 100}, + num_cpus=10, + _system_config={"max_direct_call_object_size": 0}, + ) + cluster.add_node(resources={"bar:2": 100}, num_cpus=10) + cluster.add_node(resources={"bar:3": 100}, num_cpus=0) + + ray.init(cluster.address) + + @ray.remote + def get_node_id(): + return ray.get_runtime_context().get_node_id() + + node1_id = ray.get(get_node_id.options(resources={"bar:1": 1}).remote()) + node2_id = ray.get(get_node_id.options(resources={"bar:2": 1}).remote()) + + data_path = str(tmp_path) + df1 = pd.DataFrame({"one": list(range(100)), "two": list(range(100, 200))}) + path1 = os.path.join(data_path, "test1.parquet") + df1.to_parquet(path1) + df2 = pd.DataFrame({"one": list(range(300, 400)), "two": list(range(400, 500))}) + path2 = os.path.join(data_path, "test2.parquet") + df2.to_parquet(path2) + + ds = ray.data.read_parquet(data_path) + + # Force reads. 
+ blocks = ds.get_internal_block_refs() + assert len(blocks) == 2 + + ray.wait(blocks, num_returns=len(blocks), fetch_local=False) + location_data = ray.experimental.get_object_locations(blocks) + locations = [] + for block in blocks: + locations.extend(location_data[block]["node_ids"]) + assert set(locations) == {node1_id, node2_id} + + if __name__ == "__main__": import sys diff --git a/python/ray/data/tests/test_dataset_tensor.py b/python/ray/data/tests/test_dataset_tensor.py index e95066fae5079..4d0ca0f6d8eea 100644 --- a/python/ray/data/tests/test_dataset_tensor.py +++ b/python/ray/data/tests/test_dataset_tensor.py @@ -23,8 +23,11 @@ def test_tensors_basic(ray_start_regular_shared): tensor_shape = (3, 5) ds = ray.data.range_tensor(6, shape=tensor_shape, parallelism=6) assert str(ds) == ( - "Dataset(num_blocks=6, num_rows=6, " - "schema={__value__: ArrowTensorType(shape=(3, 5), dtype=int64)})" + "Dataset(\n" + " num_blocks=6,\n" + " num_rows=6,\n" + " schema={__value__: ArrowTensorType(shape=(3, 5), dtype=int64)}\n" + ")" ) assert ds.size_bytes() == 5 * 3 * 6 * 8 @@ -273,8 +276,11 @@ def test_tensors_inferred_from_map(ray_start_regular_shared): ds = ray.data.range(10, parallelism=10).map(lambda _: np.ones((4, 4))) ds.fully_executed() assert str(ds) == ( - "Dataset(num_blocks=10, num_rows=10, " - "schema={__value__: ArrowTensorType(shape=(4, 4), dtype=double)})" + "Dataset(\n" + " num_blocks=10,\n" + " num_rows=10,\n" + " schema={__value__: ArrowTensorType(shape=(4, 4), dtype=double)}\n" + ")" ) # Test map_batches. @@ -283,8 +289,11 @@ def test_tensors_inferred_from_map(ray_start_regular_shared): ) ds.fully_executed() assert str(ds) == ( - "Dataset(num_blocks=4, num_rows=24, " - "schema={__value__: ArrowTensorType(shape=(4, 4), dtype=double)})" + "Dataset(\n" + " num_blocks=4,\n" + " num_rows=24,\n" + " schema={__value__: ArrowTensorType(shape=(4, 4), dtype=double)}\n" + ")" ) # Test flat_map. 
@@ -293,8 +302,11 @@ def test_tensors_inferred_from_map(ray_start_regular_shared): ) ds.fully_executed() assert str(ds) == ( - "Dataset(num_blocks=10, num_rows=20, " - "schema={__value__: ArrowTensorType(shape=(4, 4), dtype=double)})" + "Dataset(\n" + " num_blocks=10,\n" + " num_rows=20,\n" + " schema={__value__: ArrowTensorType(shape=(4, 4), dtype=double)}\n" + ")" ) # Test map_batches ndarray column. @@ -303,8 +315,11 @@ def test_tensors_inferred_from_map(ray_start_regular_shared): ) ds.fully_executed() assert str(ds) == ( - "Dataset(num_blocks=4, num_rows=24, " - "schema={a: TensorDtype(shape=(4, 4), dtype=float64)})" + "Dataset(\n" + " num_blocks=4,\n" + " num_rows=24,\n" + " schema={a: TensorDtype(shape=(4, 4), dtype=float64)}\n" + ")" ) ds = ray.data.range(16, parallelism=4).map_batches( @@ -313,8 +328,11 @@ def test_tensors_inferred_from_map(ray_start_regular_shared): ) ds.fully_executed() assert str(ds) == ( - "Dataset(num_blocks=4, num_rows=16, " - "schema={a: TensorDtype(shape=(None, None), dtype=float64)})" + "Dataset(\n" + " num_blocks=4,\n" + " num_rows=16,\n" + " schema={a: TensorDtype(shape=(None, None), dtype=float64)}\n" + ")" ) diff --git a/python/ray/data/tests/test_dataset_tf.py b/python/ray/data/tests/test_dataset_tf.py index 2be4a34922cc7..799c595478990 100644 --- a/python/ray/data/tests/test_dataset_tf.py +++ b/python/ray/data/tests/test_dataset_tf.py @@ -7,7 +7,6 @@ from ray.air import session from ray.air.config import ScalingConfig from ray.air.constants import TENSOR_COLUMN_NAME -from ray.data.extensions import TensorArray from ray.data.preprocessors import Concatenator from ray.train.tensorflow import TensorflowTrainer @@ -139,7 +138,7 @@ def test_element_spec_pipeline(self): def test_element_spec_shape_with_ragged_tensors(self, batch_size): df = pd.DataFrame( { - "spam": TensorArray([np.zeros([32, 32, 3]), np.zeros([64, 64, 3])]), + "spam": [np.zeros([32, 32, 3]), np.zeros([64, 64, 3])], "ham": [0, 0], } ) diff --git 
a/python/ray/data/tests/test_dynamic_block_split.py b/python/ray/data/tests/test_dynamic_block_split.py index 252e27c30fa0c..1b58747d21116 100644 --- a/python/ray/data/tests/test_dynamic_block_split.py +++ b/python/ray/data/tests/test_dynamic_block_split.py @@ -1,5 +1,8 @@ +import time + import numpy as np import pandas as pd +import pyarrow as pa import pytest import ray @@ -7,6 +10,7 @@ from ray.data.block import BlockMetadata from ray.data.context import DatasetContext from ray.data.datasource import Datasource +from ray.data.datasource.csv_datasource import CSVDatasource from ray.data.datasource.datasource import ReadTask, Reader from ray.tests.conftest import * # noqa @@ -15,12 +19,14 @@ # Data source generates random bytes data class RandomBytesDatasource(Datasource): def create_reader(self, **read_args): - return RandomBytesReader(read_args["num_blocks"], read_args["block_size"]) + return RandomBytesReader( + read_args["num_blocks_per_task"], read_args["block_size"] + ) class RandomBytesReader(Reader): - def __init__(self, num_blocks: int, block_size: int): - self.num_blocks = num_blocks + def __init__(self, num_blocks_per_task: int, block_size: int): + self.num_blocks_per_task = num_blocks_per_task self.block_size = block_size def estimate_inmemory_data_size(self): @@ -28,15 +34,15 @@ def estimate_inmemory_data_size(self): def get_read_tasks(self, parallelism: int): def _blocks_generator(): - for _ in range(self.num_blocks): + for _ in range(self.num_blocks_per_task): yield pd.DataFrame({"one": [np.random.bytes(self.block_size)]}) return parallelism * [ ReadTask( lambda: _blocks_generator(), BlockMetadata( - num_rows=self.num_blocks, - size_bytes=self.num_blocks * self.block_size, + num_rows=self.num_blocks_per_task, + size_bytes=self.num_blocks_per_task * self.block_size, schema=None, input_files=None, exec_stats=None, @@ -45,6 +51,46 @@ def _blocks_generator(): ] +class SlowCSVDatasource(CSVDatasource): + def _read_stream(self, f: "pa.NativeFile", path: 
str, **reader_args): + for block in CSVDatasource._read_stream(self, f, path, **reader_args): + time.sleep(3) + yield block + + +# Tests that we don't block on exponential rampup when doing bulk reads. +# https://github.com/ray-project/ray/issues/20625 +@pytest.mark.parametrize("block_split", [False, True]) +def test_bulk_lazy_eval_split_mode(shutdown_only, block_split, tmp_path): + # Defensively shutdown Ray for the first test here to make sure there + # is no existing Ray cluster. + ray.shutdown() + + ray.init(num_cpus=8) + ctx = ray.data.context.DatasetContext.get_current() + + try: + original = ctx.block_splitting_enabled + + ray.data.range(8, parallelism=8).write_csv(str(tmp_path)) + if not block_split: + # Setting infinite block size effectively disables block splitting. + ctx.target_max_block_size = float("inf") + ds = ray.data.read_datasource( + SlowCSVDatasource(), parallelism=8, paths=str(tmp_path) + ) + + start = time.time() + ds.map(lambda x: x) + delta = time.time() - start + + print("full read time", delta) + # Should run in ~3 seconds. It takes >9 seconds if bulk read is broken. + assert delta < 8, delta + finally: + ctx.block_splitting_enabled = original + + def test_enable_in_ray_client(ray_start_cluster_enabled): cluster = ray_start_cluster_enabled cluster.add_node(num_cpus=4) @@ -82,65 +128,70 @@ def test_enable_in_ray_client(ray_start_cluster_enabled): ], ) def test_dataset( - ray_start_regular_shared, + shutdown_only, enable_dynamic_block_splitting, target_max_block_size, compute, ): - # Test 10 blocks from 10 tasks, each block is 1024 bytes. - num_blocks = 10 + ray.shutdown() + # We need at least 2 CPUs to run a actorpool streaming + ray.init(num_cpus=2) + # Test 10 tasks, each task returning 10 blocks, each block has 1 row and each + # row has 1024 bytes. 
+ num_blocks_per_task = 10 block_size = 1024 num_tasks = 10 ds = ray.data.read_datasource( RandomBytesDatasource(), parallelism=num_tasks, - num_blocks=num_blocks, + num_blocks_per_task=num_blocks_per_task, block_size=block_size, ) + # Note the following calls to ds will not fully execute it. assert ds.schema() is not None - assert ds.count() == num_blocks * num_tasks + assert ds.count() == num_blocks_per_task * num_tasks assert ds.num_blocks() == num_tasks - assert ds.size_bytes() >= 0.7 * block_size * num_blocks * num_tasks + assert ds.size_bytes() >= 0.7 * block_size * num_blocks_per_task * num_tasks map_ds = ds.map_batches(lambda x: x, compute=compute) map_ds.fully_executed() assert map_ds.num_blocks() == num_tasks map_ds = ds.map_batches( - lambda x: x, batch_size=num_blocks * num_tasks, compute=compute + lambda x: x, batch_size=num_blocks_per_task * num_tasks, compute=compute ) map_ds.fully_executed() assert map_ds.num_blocks() == 1 map_ds = ds.map(lambda x: x, compute=compute) map_ds.fully_executed() - assert map_ds.num_blocks() == num_blocks * num_tasks + assert map_ds.num_blocks() == num_blocks_per_task * num_tasks ds_list = ds.split(5) assert len(ds_list) == 5 for new_ds in ds_list: - assert new_ds.num_blocks() == num_blocks * num_tasks / 5 + assert new_ds.num_blocks() == num_blocks_per_task * num_tasks / 5 train, test = ds.train_test_split(test_size=0.25) - assert train.num_blocks() == num_blocks * num_tasks * 0.75 - assert test.num_blocks() == num_blocks * num_tasks * 0.25 + assert train.num_blocks() == num_blocks_per_task * num_tasks * 0.75 + assert test.num_blocks() == num_blocks_per_task * num_tasks * 0.25 new_ds = ds.union(ds, ds) assert new_ds.num_blocks() == num_tasks * 3 new_ds.fully_executed() - assert new_ds.num_blocks() == num_blocks * num_tasks * 3 + assert new_ds.num_blocks() == num_blocks_per_task * num_tasks * 3 new_ds = ds.random_shuffle() assert new_ds.num_blocks() == num_tasks new_ds = ds.randomize_block_order() assert 
new_ds.num_blocks() == num_tasks - assert ds.groupby("one").count().count() == num_blocks * num_tasks + assert ds.groupby("one").count().count() == num_blocks_per_task * num_tasks new_ds = ds.zip(ds) new_ds.fully_executed() - assert new_ds.num_blocks() == num_blocks * num_tasks + assert new_ds.num_blocks() == num_blocks_per_task * num_tasks assert len(ds.take(5)) == 5 - assert len(ds.take_all()) == num_blocks * num_tasks + assert len(ds.take_all()) == num_blocks_per_task * num_tasks for batch in ds.iter_batches(batch_size=10): assert len(batch) == 10 @@ -148,15 +199,16 @@ def test_dataset( def test_dataset_pipeline( ray_start_regular_shared, enable_dynamic_block_splitting, target_max_block_size ): - # Test 10 blocks from 10 tasks, each block is 1024 bytes. - num_blocks = 10 + # Test 10 tasks, each task returning 10 blocks, each block has 1 row and each + # row has 1024 bytes. + num_blocks_per_task = 10 block_size = 1024 num_tasks = 10 ds = ray.data.read_datasource( RandomBytesDatasource(), parallelism=num_tasks, - num_blocks=num_blocks, + num_blocks_per_task=num_blocks_per_task, block_size=block_size, ) dsp = ds.window(blocks_per_window=2) @@ -166,7 +218,7 @@ def test_dataset_pipeline( result_batches = list(ds.iter_batches(batch_size=5)) for batch in result_batches: assert len(batch) == 5 - assert len(result_batches) == num_blocks * num_tasks / 5 + assert len(result_batches) == num_blocks_per_task * num_tasks / 5 dsp = ds.window(blocks_per_window=2) assert dsp._length == num_tasks / 2 @@ -178,40 +230,42 @@ def test_dataset_pipeline( def test_filter( ray_start_regular_shared, enable_dynamic_block_splitting, target_max_block_size ): - # Test 10 blocks from 1 task, each block is 1024 bytes. - num_blocks = 10 + # Test 10 tasks, each task returning 10 blocks, each block has 1 row and each + # row has 1024 bytes. 
+ num_blocks_per_task = 10 block_size = 1024 ds = ray.data.read_datasource( RandomBytesDatasource(), parallelism=1, - num_blocks=num_blocks, + num_blocks_per_task=num_blocks_per_task, block_size=block_size, ) ds = ds.filter(lambda _: True) ds.fully_executed() - assert ds.count() == num_blocks - assert ds.num_blocks() == num_blocks + assert ds.count() == num_blocks_per_task + assert ds.num_blocks() == num_blocks_per_task ds = ds.filter(lambda _: False) ds.fully_executed() assert ds.count() == 0 - assert ds.num_blocks() == num_blocks + assert ds.num_blocks() == num_blocks_per_task def test_lazy_block_list( shutdown_only, enable_dynamic_block_splitting, target_max_block_size ): - # Test 10 blocks from 10 tasks, each block is 1024 bytes. - num_blocks = 10 + # Test 10 tasks, each task returning 10 blocks, each block has 1 row and each + # row has 1024 bytes. + num_blocks_per_task = 10 block_size = 1024 num_tasks = 10 ds = ray.data.read_datasource( RandomBytesDatasource(), parallelism=num_tasks, - num_blocks=num_blocks, + num_blocks_per_task=num_blocks_per_task, block_size=block_size, ) ds.schema() @@ -231,18 +285,18 @@ def test_lazy_block_list( assert len(cached_metadata) == num_tasks for i, block_metadata in enumerate(cached_metadata): if i == 0: - assert len(block_metadata) == num_blocks + assert len(block_metadata) == num_blocks_per_task for m in block_metadata: assert m.num_rows == 1 else: assert block_metadata is None - assert len(metadata) == num_tasks - 1 + num_blocks + assert len(metadata) == num_tasks - 1 + num_blocks_per_task for i, block_metadata in enumerate(metadata): - if i < num_blocks: + if i < num_blocks_per_task: assert block_metadata.num_rows == 1 assert block_metadata.schema is not None else: - assert block_metadata.num_rows == num_blocks + assert block_metadata.num_rows == num_blocks_per_task assert block_metadata.schema is None # Check APIs of LazyBlockList @@ -255,12 +309,12 @@ def test_lazy_block_list( assert 
len(block_lists[0]._block_partition_refs) == 2 assert len(block_lists[0]._cached_metadata) == 2 - block_lists = block_list.split_by_bytes(block_size * num_blocks * 2) + block_lists = block_list.split_by_bytes(block_size * num_blocks_per_task * 2) assert len(block_lists) == num_tasks / 2 assert len(block_lists[0]._block_partition_refs) == 2 assert len(block_lists[0]._cached_metadata) == 2 - new_block_list = block_list.truncate_by_rows(num_blocks * 3) + new_block_list = block_list.truncate_by_rows(num_blocks_per_task * 3) assert len(new_block_list._block_partition_refs) == 3 assert len(new_block_list._cached_metadata) == 3 @@ -275,7 +329,7 @@ def test_lazy_block_list( assert len(new_block_list._cached_metadata) == num_tasks output_blocks = block_list.get_blocks_with_metadata() - assert len(output_blocks) == num_tasks * num_blocks + assert len(output_blocks) == num_tasks * num_blocks_per_task for _, metadata in output_blocks: assert metadata.num_rows == 1 for _, metadata in block_list.iter_blocks_with_metadata(): @@ -291,10 +345,10 @@ def test_lazy_block_list( assert all(map(lambda ref: ref is None, block_list._block_partition_meta_refs)) assert len(cached_metadata) == num_tasks for block_metadata in cached_metadata: - assert len(block_metadata) == num_blocks + assert len(block_metadata) == num_blocks_per_task for m in block_metadata: assert m.num_rows == 1 - assert len(metadata) == num_tasks * num_blocks + assert len(metadata) == num_tasks * num_blocks_per_task for block_metadata in metadata: assert block_metadata.num_rows == 1 assert block_metadata.schema is not None @@ -302,7 +356,7 @@ def test_lazy_block_list( def test_read_large_data(ray_start_cluster, enable_dynamic_block_splitting): # Test 20G input with single task - num_blocks = 20 + num_blocks_per_task = 20 block_size = 1024 * 1024 * 1024 cluster = ray_start_cluster @@ -316,12 +370,12 @@ def foo(batch): ds = ray.data.read_datasource( RandomBytesDatasource(), parallelism=1, - num_blocks=num_blocks, + 
num_blocks_per_task=num_blocks_per_task, block_size=block_size, ) ds = ds.map_batches(foo, batch_size=None) - assert ds.count() == num_blocks + assert ds.count() == num_blocks_per_task if __name__ == "__main__": diff --git a/python/ray/data/tests/test_execution_optimizer.py b/python/ray/data/tests/test_execution_optimizer.py index e47ee3a16be5d..fe903813149a2 100644 --- a/python/ray/data/tests/test_execution_optimizer.py +++ b/python/ray/data/tests/test_execution_optimizer.py @@ -1,8 +1,10 @@ +import itertools import pytest import ray from ray.data._internal.execution.operators.map_operator import MapOperator from ray.data._internal.execution.operators.all_to_all_operator import AllToAllOperator +from ray.data._internal.execution.operators.zip_operator import ZipOperator from ray.data._internal.execution.operators.input_data_buffer import InputDataBuffer from ray.data._internal.logical.interfaces import LogicalPlan from ray.data._internal.logical.optimizers import PhysicalOptimizer @@ -20,6 +22,7 @@ Filter, FlatMap, ) +from ray.data._internal.logical.operators.n_ary_operator import Zip from ray.data._internal.planner.planner import Planner from ray.data.aggregate import Count from ray.data.datasource.parquet_datasource import ParquetDatasource @@ -594,6 +597,33 @@ def test_aggregate_e2e( assert row.as_pydict() == {"value": idx, "count()": 1} +def test_zip_operator(ray_start_regular_shared, enable_optimizer): + planner = Planner() + read_op1 = Read(ParquetDatasource()) + read_op2 = Read(ParquetDatasource()) + op = Zip(read_op1, read_op2) + plan = LogicalPlan(op) + physical_op = planner.plan(plan).dag + + assert op.name == "Zip" + assert isinstance(physical_op, ZipOperator) + assert len(physical_op.input_dependencies) == 2 + assert isinstance(physical_op.input_dependencies[0], MapOperator) + assert isinstance(physical_op.input_dependencies[1], MapOperator) + + +@pytest.mark.parametrize( + "num_blocks1,num_blocks2", + 
list(itertools.combinations_with_replacement(range(1, 12), 2)), +) +def test_zip_e2e(ray_start_regular_shared, enable_optimizer, num_blocks1, num_blocks2): + n = 12 + ds1 = ray.data.range(n, parallelism=num_blocks1) + ds2 = ray.data.range(n, parallelism=num_blocks2).map(lambda x: x + 1) + ds = ds1.zip(ds2) + assert ds.take() == list(zip(range(n), range(1, n + 1))) + + def test_streaming_executor( ray_start_regular_shared, enable_optimizer, diff --git a/python/ray/data/tests/test_metadata_provider.py b/python/ray/data/tests/test_metadata_provider.py index b9d0f9ef3713d..f8d7ab6c79688 100644 --- a/python/ray/data/tests/test_metadata_provider.py +++ b/python/ray/data/tests/test_metadata_provider.py @@ -1,14 +1,26 @@ +from functools import partial +import logging +import os import pytest import posixpath +from unittest.mock import patch import urllib.parse -import os -import logging import pyarrow as pa +from pyarrow.fs import LocalFileSystem import pandas as pd import pyarrow.parquet as pq from pytest_lazyfixture import lazy_fixture -from ray.data.datasource.file_based_datasource import _resolve_paths_and_filesystem +from ray.data.datasource.file_based_datasource import ( + FILE_SIZE_FETCH_PARALLELIZATION_THRESHOLD, + _resolve_paths_and_filesystem, + _unwrap_protocol, +) +from ray.data.datasource.file_meta_provider import ( + _get_file_infos_serial, + _get_file_infos_common_path_prefix, + _get_file_infos_parallel, +) from ray.tests.conftest import * # noqa from ray.data.datasource import ( @@ -18,11 +30,16 @@ DefaultFileMetadataProvider, DefaultParquetMetadataProvider, FastFileMetadataProvider, + PathPartitionEncoder, ) from ray.data.tests.conftest import * # noqa +def df_to_csv(dataframe, path, **kwargs): + dataframe.to_csv(path, **kwargs) + + def _get_parquet_file_meta_size_bytes(file_metas): return sum( sum(m.row_group(i).total_byte_size for i in range(m.num_row_groups)) @@ -125,7 +142,9 @@ def test_default_parquet_metadata_provider(fs, data_path): ), ], ) -def 
test_default_file_metadata_provider(caplog, fs, data_path, endpoint_url): +def test_default_file_metadata_provider( + propagate_logs, caplog, fs, data_path, endpoint_url +): storage_options = ( {} if endpoint_url is None @@ -144,8 +163,12 @@ def test_default_file_metadata_provider(caplog, fs, data_path, endpoint_url): df2.to_csv(path2, index=False, storage_options=storage_options) meta_provider = DefaultFileMetadataProvider() - with caplog.at_level(logging.WARNING): - file_paths, file_sizes = meta_provider.expand_paths(paths, fs) + with caplog.at_level(logging.WARNING), patch( + "ray.data.datasource.file_meta_provider._get_file_infos_serial", + wraps=_get_file_infos_serial, + ) as mock_get: + file_paths, file_sizes = map(list, zip(*meta_provider.expand_paths(paths, fs))) + mock_get.assert_called_once_with(paths, fs) assert "meta_provider=FastFileMetadataProvider()" in caplog.text assert file_paths == paths expected_file_sizes = _get_file_sizes_bytes(paths, fs) @@ -164,6 +187,195 @@ def test_default_file_metadata_provider(caplog, fs, data_path, endpoint_url): assert meta.schema is None +@pytest.mark.parametrize( + "fs,data_path,endpoint_url", + [ + (lazy_fixture("local_fs"), lazy_fixture("local_path"), None), + (lazy_fixture("s3_fs"), lazy_fixture("s3_path"), lazy_fixture("s3_server")), + ], +) +def test_default_file_metadata_provider_many_files_basic( + propagate_logs, + caplog, + fs, + data_path, + endpoint_url, +): + if endpoint_url is None: + storage_options = {} + else: + storage_options = dict(client_kwargs=dict(endpoint_url=endpoint_url)) + + paths = [] + dfs = [] + num_dfs = 4 * FILE_SIZE_FETCH_PARALLELIZATION_THRESHOLD + for i in range(num_dfs): + df = pd.DataFrame({"one": list(range(i * 3, (i + 1) * 3))}) + dfs.append(df) + path = os.path.join(data_path, f"test_{i}.csv") + paths.append(path) + df.to_csv(path, index=False, storage_options=storage_options) + paths, fs = _resolve_paths_and_filesystem(paths, fs) + + meta_provider = 
DefaultFileMetadataProvider() + if isinstance(fs, LocalFileSystem): + patcher = patch( + "ray.data.datasource.file_meta_provider._get_file_infos_serial", + wraps=_get_file_infos_serial, + ) + else: + patcher = patch( + "ray.data.datasource.file_meta_provider._get_file_infos_common_path_prefix", + wraps=_get_file_infos_common_path_prefix, + ) + with caplog.at_level(logging.WARNING), patcher as mock_get: + file_paths, file_sizes = map(list, zip(*meta_provider.expand_paths(paths, fs))) + if isinstance(fs, LocalFileSystem): + mock_get.assert_called_once_with(paths, fs) + else: + mock_get.assert_called_once_with(paths, _unwrap_protocol(data_path), fs) + assert "meta_provider=FastFileMetadataProvider()" in caplog.text + assert file_paths == paths + expected_file_sizes = _get_file_sizes_bytes(paths, fs) + assert file_sizes == expected_file_sizes + + +@pytest.mark.parametrize( + "fs,data_path,endpoint_url", + [ + (lazy_fixture("local_fs"), lazy_fixture("local_path"), None), + (lazy_fixture("s3_fs"), lazy_fixture("s3_path"), lazy_fixture("s3_server")), + ], +) +def test_default_file_metadata_provider_many_files_partitioned( + propagate_logs, + caplog, + fs, + data_path, + endpoint_url, + write_partitioned_df, + assert_base_partitioned_ds, +): + if endpoint_url is None: + storage_options = {} + else: + storage_options = dict(client_kwargs=dict(endpoint_url=endpoint_url)) + + partition_keys = ["one"] + partition_path_encoder = PathPartitionEncoder.of( + base_dir=data_path, + field_names=partition_keys, + filesystem=fs, + ) + paths = [] + dfs = [] + num_dfs = FILE_SIZE_FETCH_PARALLELIZATION_THRESHOLD + for i in range(num_dfs): + df = pd.DataFrame( + {"one": [1, 1, 1, 3, 3, 3], "two": list(range(6 * i, 6 * (i + 1)))} + ) + df_paths = write_partitioned_df( + df, + partition_keys, + partition_path_encoder, + partial(df_to_csv, storage_options=storage_options, index=False), + file_name_suffix=i, + ) + dfs.append(df) + paths.extend(df_paths) + paths, fs = 
_resolve_paths_and_filesystem(paths, fs) + partitioning = partition_path_encoder.scheme + + meta_provider = DefaultFileMetadataProvider() + if isinstance(fs, LocalFileSystem): + patcher = patch( + "ray.data.datasource.file_meta_provider._get_file_infos_serial", + wraps=_get_file_infos_serial, + ) + else: + patcher = patch( + "ray.data.datasource.file_meta_provider._get_file_infos_common_path_prefix", + wraps=_get_file_infos_common_path_prefix, + ) + with caplog.at_level(logging.WARNING), patcher as mock_get: + file_paths, file_sizes = map( + list, zip(*meta_provider.expand_paths(paths, fs, partitioning)) + ) + if isinstance(fs, LocalFileSystem): + mock_get.assert_called_once_with(paths, fs) + else: + mock_get.assert_called_once_with( + paths, + _unwrap_protocol(partitioning.base_dir), + fs, + ) + assert "meta_provider=FastFileMetadataProvider()" in caplog.text + assert file_paths == paths + expected_file_sizes = _get_file_sizes_bytes(paths, fs) + assert file_sizes == expected_file_sizes + + +@pytest.mark.parametrize( + "fs,data_path,endpoint_url", + [ + (lazy_fixture("local_fs"), lazy_fixture("local_path"), None), + (lazy_fixture("s3_fs"), lazy_fixture("s3_path"), lazy_fixture("s3_server")), + ], +) +def test_default_file_metadata_provider_many_files_diff_dirs( + ray_start_regular, + propagate_logs, + caplog, + fs, + data_path, + endpoint_url, +): + if endpoint_url is None: + storage_options = {} + else: + storage_options = dict(client_kwargs=dict(endpoint_url=endpoint_url)) + + dir1 = os.path.join(data_path, "dir1") + dir2 = os.path.join(data_path, "dir2") + if fs is None: + os.mkdir(dir1) + os.mkdir(dir2) + else: + fs.create_dir(_unwrap_protocol(dir1)) + fs.create_dir(_unwrap_protocol(dir2)) + + paths = [] + dfs = [] + num_dfs = 2 * FILE_SIZE_FETCH_PARALLELIZATION_THRESHOLD + for i, dir_path in enumerate([dir1, dir2]): + for j in range(num_dfs * i, num_dfs * (i + 1)): + df = pd.DataFrame({"one": list(range(3 * j, 3 * (j + 1)))}) + dfs.append(df) + path = 
os.path.join(dir_path, f"test_{j}.csv") + paths.append(path) + df.to_csv(path, index=False, storage_options=storage_options) + paths, fs = _resolve_paths_and_filesystem(paths, fs) + + meta_provider = DefaultFileMetadataProvider() + if isinstance(fs, LocalFileSystem): + patcher = patch( + "ray.data.datasource.file_meta_provider._get_file_infos_serial", + wraps=_get_file_infos_serial, + ) + else: + patcher = patch( + "ray.data.datasource.file_meta_provider._get_file_infos_parallel", + wraps=_get_file_infos_parallel, + ) + with caplog.at_level(logging.WARNING), patcher as mock_get: + file_paths, file_sizes = map(list, zip(*meta_provider.expand_paths(paths, fs))) + mock_get.assert_called_once_with(paths, fs) + assert "meta_provider=FastFileMetadataProvider()" in caplog.text + assert file_paths == paths + expected_file_sizes = _get_file_sizes_bytes(paths, fs) + assert file_sizes == expected_file_sizes + + @pytest.mark.parametrize( "fs,data_path,endpoint_url", [ @@ -182,7 +394,9 @@ def test_default_file_metadata_provider(caplog, fs, data_path, endpoint_url): ), ], ) -def test_fast_file_metadata_provider(caplog, fs, data_path, endpoint_url): +def test_fast_file_metadata_provider( + propagate_logs, caplog, fs, data_path, endpoint_url +): storage_options = ( {} if endpoint_url is None @@ -202,7 +416,7 @@ def test_fast_file_metadata_provider(caplog, fs, data_path, endpoint_url): meta_provider = FastFileMetadataProvider() with caplog.at_level(logging.WARNING): - file_paths, file_sizes = meta_provider.expand_paths(paths, fs) + file_paths, file_sizes = map(list, zip(*meta_provider.expand_paths(paths, fs))) assert "meta_provider=DefaultFileMetadataProvider()" in caplog.text assert file_paths == paths assert len(file_sizes) == len(file_paths) diff --git a/python/ray/data/tests/test_mongo_dataset.py b/python/ray/data/tests/test_mongo_dataset.py index d87d83421003f..92a7f1735fa75 100644 --- a/python/ray/data/tests/test_mongo_dataset.py +++ 
b/python/ray/data/tests/test_mongo_dataset.py @@ -78,8 +78,11 @@ def test_read_write_mongo(ray_start_regular_shared, start_mongo): ) assert ds._block_num_rows() == [3, 2] assert str(ds) == ( - "Dataset(num_blocks=2, num_rows=5, " - "schema={float_field: double, int_field: int32})" + "Dataset(\n" + " num_blocks=2,\n" + " num_rows=5,\n" + " schema={float_field: double, int_field: int32}\n" + ")" ) assert df.equals(ds.to_pandas()) @@ -93,8 +96,12 @@ def test_read_write_mongo(ray_start_regular_shared, start_mongo): ) assert ds._block_num_rows() == [3, 2] assert str(ds) == ( - "Dataset(num_blocks=2, num_rows=5, schema={_id: fixed_size_binary[12], " - "float_field: double, int_field: int32})" + "Dataset(\n" + " num_blocks=2,\n" + " num_rows=5,\n" + " schema={_id: fixed_size_binary[12], float_field: double, " + "int_field: int32}\n" + ")" ) assert df.equals(ds.drop_columns(["_id"]).to_pandas()) @@ -108,8 +115,12 @@ def test_read_write_mongo(ray_start_regular_shared, start_mongo): ) assert ds._block_num_rows() == [2, 1] assert str(ds) == ( - "Dataset(num_blocks=2, num_rows=3, schema={_id: fixed_size_binary[12], " - "float_field: double, int_field: int32})" + "Dataset(\n" + " num_blocks=2,\n" + " num_rows=3,\n" + " schema={_id: fixed_size_binary[12], float_field: double, " + "int_field: int32}\n" + ")" ) df[df["int_field"] < 3].equals(ds.drop_columns(["_id"]).to_pandas()) @@ -120,8 +131,12 @@ def test_read_write_mongo(ray_start_regular_shared, start_mongo): collection=foo_collection, ) assert str(ds) == ( - "Dataset(num_blocks=5, num_rows=5, schema={_id: fixed_size_binary[12], " - "float_field: double, int_field: int32})" + "Dataset(\n" + " num_blocks=5,\n" + " num_rows=5,\n" + " schema={_id: fixed_size_binary[12], float_field: double, " + "int_field: int32}\n" + ")" ) assert df.equals(ds.drop_columns(["_id"]).to_pandas()) @@ -133,8 +148,12 @@ def test_read_write_mongo(ray_start_regular_shared, start_mongo): parallelism=1000, ) assert str(ds) == ( - "Dataset(num_blocks=5, 
num_rows=5, schema={_id: fixed_size_binary[12], " - "float_field: double, int_field: int32})" + "Dataset(\n" + " num_blocks=5,\n" + " num_rows=5,\n" + " schema={_id: fixed_size_binary[12], float_field: double, " + "int_field: int32}\n" + ")" ) assert df.equals(ds.drop_columns(["_id"]).to_pandas()) @@ -192,8 +211,11 @@ def test_mongo_datasource(ray_start_regular_shared, start_mongo): ).fully_executed() assert ds._block_num_rows() == [3, 2] assert str(ds) == ( - "Dataset(num_blocks=2, num_rows=5, " - "schema={float_field: double, int_field: int32})" + "Dataset(\n" + " num_blocks=2,\n" + " num_rows=5,\n" + " schema={float_field: double, int_field: int32}\n" + ")" ) assert df.equals(ds.to_pandas()) @@ -208,8 +230,12 @@ def test_mongo_datasource(ray_start_regular_shared, start_mongo): ).fully_executed() assert ds._block_num_rows() == [3, 2] assert str(ds) == ( - "Dataset(num_blocks=2, num_rows=5, " - "schema={_id: fixed_size_binary[12], float_field: double, int_field: int32})" + "Dataset(\n" + " num_blocks=2,\n" + " num_rows=5,\n" + " schema={_id: fixed_size_binary[12], float_field: double, " + "int_field: int32}\n" + ")" ) assert df.equals(ds.drop_columns(["_id"]).to_pandas()) @@ -221,8 +247,12 @@ def test_mongo_datasource(ray_start_regular_shared, start_mongo): collection=foo_collection, ).fully_executed() assert str(ds) == ( - "Dataset(num_blocks=5, num_rows=5, schema={_id: fixed_size_binary[12], " - "float_field: double, int_field: int32})" + "Dataset(\n" + " num_blocks=5,\n" + " num_rows=5,\n" + " schema={_id: fixed_size_binary[12], float_field: double, " + "int_field: int32}\n" + ")" ) assert df.equals(ds.drop_columns(["_id"]).to_pandas()) @@ -235,8 +265,12 @@ def test_mongo_datasource(ray_start_regular_shared, start_mongo): collection=foo_collection, ) assert str(ds) == ( - "Dataset(num_blocks=5, num_rows=5, schema={_id: fixed_size_binary[12], " - "float_field: double, int_field: int32})" + "Dataset(\n" + " num_blocks=5,\n" + " num_rows=5,\n" + " schema={_id: 
fixed_size_binary[12], float_field: double, " + "int_field: int32}\n" + ")" ) assert df.equals(ds.drop_columns(["_id"]).to_pandas()) @@ -251,8 +285,12 @@ def test_mongo_datasource(ray_start_regular_shared, start_mongo): ) assert ds._block_num_rows() == [2, 1] assert str(ds) == ( - "Dataset(num_blocks=2, num_rows=3, schema={_id: fixed_size_binary[12], " - "float_field: double, int_field: int32})" + "Dataset(\n" + " num_blocks=2,\n" + " num_rows=3,\n" + " schema={_id: fixed_size_binary[12], float_field: double, " + "int_field: int32}\n" + ")" ) df[df["int_field"] < 3].equals(ds.drop_columns(["_id"]).to_pandas()) diff --git a/python/ray/data/tests/test_object_gc.py b/python/ray/data/tests/test_object_gc.py index 67c627ce3afc4..95697649d5c62 100644 --- a/python/ray/data/tests/test_object_gc.py +++ b/python/ray/data/tests/test_object_gc.py @@ -1,5 +1,3 @@ -import time - import pytest import ray @@ -7,12 +5,13 @@ from ray.tests.conftest import * # noqa -def check_no_spill(ctx, pipe, prefetch_blocks: int = 0): - # Run .iter_batches() for 10 secs, and we expect no object spilling. - end_time = time.time() + 10 - for batch in pipe.iter_batches(batch_size=None, prefetch_blocks=prefetch_blocks): - if time.time() > end_time: - break +def check_no_spill(ctx, pipe): + # Run up to 10 epochs of the pipeline to stress test that + # no spilling will happen. 
+ max_epoch = 10 + for p in pipe.iter_epochs(max_epoch): + for _ in p.iter_batches(batch_size=None): + pass meminfo = memory_summary(ctx.address_info["address"], stats_only=True) assert "Spilled" not in meminfo, meminfo @@ -24,10 +23,7 @@ def test_iter_batches_no_spilling_upon_no_transformation(shutdown_only): ds = ray.data.range_tensor(500, shape=(80, 80, 4), parallelism=100) check_no_spill(ctx, ds.repeat()) - check_no_spill(ctx, ds.repeat(), 5) - check_no_spill(ctx, ds.window(blocks_per_window=20)) - check_no_spill(ctx, ds.window(blocks_per_window=20), 5) def test_iter_batches_no_spilling_upon_rewindow(shutdown_only): @@ -39,9 +35,6 @@ def test_iter_batches_no_spilling_upon_rewindow(shutdown_only): check_no_spill( ctx, ds.window(blocks_per_window=20).repeat().rewindow(blocks_per_window=10) ) - check_no_spill( - ctx, ds.window(blocks_per_window=20).repeat().rewindow(blocks_per_window=10), 5 - ) def test_iter_batches_no_spilling_upon_prior_transformation(shutdown_only): @@ -52,11 +45,8 @@ def test_iter_batches_no_spilling_upon_prior_transformation(shutdown_only): # Repeat, with transformation prior to the pipeline. check_no_spill(ctx, ds.map_batches(lambda x: x).repeat()) - check_no_spill(ctx, ds.map_batches(lambda x: x).repeat(), 5) - # Window, with transformation prior to the pipeline. check_no_spill(ctx, ds.map_batches(lambda x: x).window(blocks_per_window=20)) - check_no_spill(ctx, ds.map_batches(lambda x: x).window(blocks_per_window=20), 5) def test_iter_batches_no_spilling_upon_post_transformation(shutdown_only): @@ -67,11 +57,8 @@ def test_iter_batches_no_spilling_upon_post_transformation(shutdown_only): # Repeat, with transformation post the pipeline creation. check_no_spill(ctx, ds.repeat().map_batches(lambda x: x, batch_size=5)) - check_no_spill(ctx, ds.repeat().map_batches(lambda x: x, batch_size=5), 5) - # Window, with transformation post the pipeline creation. 
check_no_spill(ctx, ds.window(blocks_per_window=20).map_batches(lambda x: x)) - check_no_spill(ctx, ds.window(blocks_per_window=20).map_batches(lambda x: x), 5) def test_iter_batches_no_spilling_upon_transformations(shutdown_only): @@ -87,14 +74,6 @@ def test_iter_batches_no_spilling_upon_transformations(shutdown_only): .repeat() .map_batches(lambda x: x, batch_size=5), ) - check_no_spill( - ctx, - ds.map_batches(lambda x: x, batch_size=5) - .repeat() - .map_batches(lambda x: x, batch_size=5), - 5, - ) - # Window, with transformation before and post the pipeline. check_no_spill( ctx, @@ -102,13 +81,6 @@ def test_iter_batches_no_spilling_upon_transformations(shutdown_only): .window(blocks_per_window=20) .map_batches(lambda x: x), ) - check_no_spill( - ctx, - ds.map_batches(lambda x: x) - .window(blocks_per_window=20) - .map_batches(lambda x: x), - 5, - ) def test_iter_batches_no_spilling_upon_shuffle(shutdown_only): @@ -118,10 +90,7 @@ def test_iter_batches_no_spilling_upon_shuffle(shutdown_only): ds = ray.data.range_tensor(500, shape=(80, 80, 4), parallelism=100) check_no_spill(ctx, ds.repeat().random_shuffle_each_window()) - check_no_spill(ctx, ds.repeat().random_shuffle_each_window(), 5) - check_no_spill(ctx, ds.window(blocks_per_window=20).random_shuffle_each_window()) - check_no_spill(ctx, ds.window(blocks_per_window=20).random_shuffle_each_window(), 5) def test_pipeline_splitting_has_no_spilling(shutdown_only): diff --git a/python/ray/data/tests/test_operators.py b/python/ray/data/tests/test_operators.py index 104e5d4223155..0fb9b8813d516 100644 --- a/python/ray/data/tests/test_operators.py +++ b/python/ray/data/tests/test_operators.py @@ -261,6 +261,48 @@ def test_split_operator_random(ray_start_regular_shared, equal, random_seed): assert sum(len(output_splits[i]) for i in range(3)) == num_inputs, output_splits +def test_split_operator_locality_hints(ray_start_regular_shared): + input_op = InputDataBuffer(make_ref_bundles([[i] for i in range(10)])) + op = 
OutputSplitter(input_op, 2, equal=False, locality_hints=["node1", "node2"]) + + def get_fake_loc(item): + if item in [0, 1, 4, 5, 8]: + return "node1" + else: + return "node2" + + def get_bundle_loc(bundle): + return get_fake_loc(ray.get(bundle.blocks[0][0])[0]) + + op._get_location = get_bundle_loc + + # Feed data and implement streaming exec. + output_splits = collections.defaultdict(list) + op.start(ExecutionOptions()) + while input_op.has_next(): + op.add_input(input_op.get_next(), 0) + op.inputs_done() + while op.has_next(): + ref = op.get_next() + assert ref.owns_blocks, ref + for block, _ in ref.blocks: + output_splits[ref.output_split_idx].extend(ray.get(block)) + + total = 0 + for i in range(2): + if i == 0: + node = "node1" + else: + node = "node2" + split = output_splits[i] + for item in split: + assert get_fake_loc(item) == node + total += 1 + + assert total == 10, total + assert "10 locality hits, 0 misses" in op.progress_str() + + def test_map_operator_actor_locality_stats(ray_start_regular_shared): # Create with inputs. 
input_op = InputDataBuffer(make_ref_bundles([[i] for i in range(100)])) diff --git a/python/ray/data/tests/test_optimize.py b/python/ray/data/tests/test_optimize.py index 1b74355edad81..27b12f8a4845b 100644 --- a/python/ray/data/tests/test_optimize.py +++ b/python/ray/data/tests/test_optimize.py @@ -612,7 +612,7 @@ def __call__(self, x): pipe, 1, [ - "ReadParquetBulk->MapBatches(CallableFn)->MapBatches(CallableFn)", + "ReadParquet->MapBatches(CallableFn)->MapBatches(CallableFn)", ], ) @@ -648,7 +648,7 @@ def __call__(self, x): pipe, 1, [ - "ReadParquetBulk->MapBatches()->MapBatches(CallableFn)", + "ReadParquet->MapBatches()->MapBatches(CallableFn)", ], ) diff --git a/python/ray/data/tests/test_raydp_dataset.py b/python/ray/data/tests/test_raydp_dataset.py index b759fc65ee148..ca9ae69dc7377 100644 --- a/python/ray/data/tests/test_raydp_dataset.py +++ b/python/ray/data/tests/test_raydp_dataset.py @@ -4,6 +4,7 @@ import torch +# RayDP tests require Ray Java. Make sure ray jar is built before running this test. @pytest.fixture(scope="function") def spark(request): ray.init(num_cpus=2, include_dashboard=False) diff --git a/python/ray/data/tests/test_split.py b/python/ray/data/tests/test_split.py index 8d3949d20c742..9c31835481536 100644 --- a/python/ray/data/tests/test_split.py +++ b/python/ray/data/tests/test_split.py @@ -779,6 +779,31 @@ def test_train_test_split(ray_start_regular_shared): ds.train_test_split(test_size=9) +def test_split_is_not_disruptive(ray_start_cluster): + ray.shutdown() + ds = ray.data.range(100, parallelism=10).map_batches(lambda x: x).lazy() + + def verify_integrity(splits): + for dss in splits: + for batch in dss.iter_batches(): + pass + for batch in ds.iter_batches(): + pass + + # No block splitting invovled: split 10 even blocks into 2 groups. + verify_integrity(ds.split(2, equal=True)) + # Block splitting invovled: split 10 even blocks into 3 groups. 
+ verify_integrity(ds.split(3, equal=True)) + + # Same as above but having tranforms post converting to lazy. + verify_integrity(ds.map_batches(lambda x: x).split(2, equal=True)) + verify_integrity(ds.map_batches(lambda x: x).split(3, equal=True)) + + # Same as above but having in-place tranforms post converting to lazy. + verify_integrity(ds.randomize_block_order().split(2, equal=True)) + verify_integrity(ds.randomize_block_order().split(3, equal=True)) + + if __name__ == "__main__": import sys diff --git a/python/ray/data/tests/test_stats.py b/python/ray/data/tests/test_stats.py index 8b458bc88e287..6fcb22b4259e0 100644 --- a/python/ray/data/tests/test_stats.py +++ b/python/ray/data/tests/test_stats.py @@ -5,8 +5,9 @@ import pytest import ray -from ray.data._internal.stats import DatasetStats +from ray.data._internal.stats import _StatsActor, DatasetStats from ray.data._internal.dataset_logger import DatasetLogger +from ray.data.block import BlockMetadata from ray.data.context import DatasetContext from ray.tests.conftest import * # noqa @@ -160,6 +161,9 @@ def test_dataset_stats_basic(ray_start_regular_shared, enable_auto_log_stats): Dataset iterator time breakdown: * In ray.wait(): T * In ray.get(): T +* Num blocks local: Z +* Num blocks remote: Z +* Num blocks unknown location: N * In next_batch(): T * In format_batch(): T * In user code: T @@ -288,7 +292,7 @@ def test_dataset_stats_read_parquet(ray_start_regular_shared, tmp_path): if context.new_execution_backend: assert ( stats - == """Stage N ReadParquetBulk->map: N/N blocks executed in T + == """Stage N ReadParquet->map: N/N blocks executed in T * Remote wall time: T min, T max, T mean, T total * Remote cpu time: T min, T max, T mean, T total * Peak heap memory usage (MiB): N min, N max, N mean @@ -912,6 +916,9 @@ def test_streaming_stats_full(ray_start_regular_shared, restore_dataset_context) Dataset iterator time breakdown: * In ray.wait(): T * In ray.get(): T +* Num blocks local: Z +* Num blocks remote: 
Z +* Num blocks unknown location: N * In next_batch(): T * In format_batch(): T * In user code: T @@ -920,6 +927,47 @@ def test_streaming_stats_full(ray_start_regular_shared, restore_dataset_context) ) +# NOTE: All tests above share a Ray cluster, while the tests below do not. These +# tests should only be carefully reordered to retain this invariant! + + +def test_stats_actor_cap_num_stats(ray_start_cluster): + actor = _StatsActor.remote(3) + metadatas = [] + task_idx = 0 + for uuid in range(3): + metadatas.append( + BlockMetadata( + num_rows=uuid, + size_bytes=None, + schema=None, + input_files=None, + exec_stats=None, + ) + ) + num_stats = uuid + 1 + actor.record_start.remote(uuid) + assert ray.get(actor._get_stats_dict_size.remote()) == ( + num_stats, + num_stats - 1, + num_stats - 1, + ) + actor.record_task.remote(uuid, task_idx, [metadatas[-1]]) + assert ray.get(actor._get_stats_dict_size.remote()) == ( + num_stats, + num_stats, + num_stats, + ) + for uuid in range(3): + assert ray.get(actor.get.remote(uuid))[0][task_idx] == [metadatas[uuid]] + # Add the fourth stats to exceed the limit. + actor.record_start.remote(3) + # The first stats (with uuid=0) should have been purged. + assert ray.get(actor.get.remote(0))[0] == {} + # The start_time has 3 entries because we just added it above with record_start(). + assert ray.get(actor._get_stats_dict_size.remote()) == (3, 2, 2) + + if __name__ == "__main__": import sys diff --git a/python/ray/data/tests/test_streaming_executor.py b/python/ray/data/tests/test_streaming_executor.py index c593bce813978..cf1f9f6d5eff4 100644 --- a/python/ray/data/tests/test_streaming_executor.py +++ b/python/ray/data/tests/test_streaming_executor.py @@ -125,7 +125,7 @@ def test_select_operator_to_run(): o3.num_active_work_refs = MagicMock(return_value=2) o3.internal_queue_size = MagicMock(return_value=0) assert select_operator_to_run(topo, NO_USAGE, ExecutionResources(), True) == o2 - # nternal queue size is added to num active tasks. 
+ # Internal queue size is added to num active tasks. o3.num_active_work_refs = MagicMock(return_value=0) o3.internal_queue_size = MagicMock(return_value=2) assert select_operator_to_run(topo, NO_USAGE, ExecutionResources(), True) == o2 @@ -136,6 +136,10 @@ def test_select_operator_to_run(): o2.internal_queue_size = MagicMock(return_value=2) assert select_operator_to_run(topo, NO_USAGE, ExecutionResources(), True) == o3 + # Test prioritization of nothrottle ops. + o2.throttling_disabled = MagicMock(return_value=True) + assert select_operator_to_run(topo, NO_USAGE, ExecutionResources(), True) == o2 + def test_dispatch_next_task(): inputs = make_ref_bundles([[x] for x in range(20)]) @@ -302,17 +306,36 @@ def test_configure_output_locality(): o2, compute_strategy=ray.data.ActorPoolStrategy(1, 1), ) + # No locality. build_streaming_topology(o3, ExecutionOptions(locality_with_output=False)) assert o2._ray_remote_args.get("scheduling_strategy") is None assert o3._ray_remote_args.get("scheduling_strategy") == "SPREAD" + + # Current node locality. build_streaming_topology(o3, ExecutionOptions(locality_with_output=True)) - assert isinstance( - o2._ray_remote_args["scheduling_strategy"], NodeAffinitySchedulingStrategy - ) - assert isinstance( - o3._ray_remote_args["scheduling_strategy"], - NodeAffinitySchedulingStrategy, + s1 = o2._get_runtime_ray_remote_args()["scheduling_strategy"] + assert isinstance(s1, NodeAffinitySchedulingStrategy) + assert s1.node_id == ray.get_runtime_context().get_node_id() + s2 = o3._get_runtime_ray_remote_args()["scheduling_strategy"] + assert isinstance(s2, NodeAffinitySchedulingStrategy) + assert s2.node_id == ray.get_runtime_context().get_node_id() + + # Multi node locality. 
+ build_streaming_topology( + o3, ExecutionOptions(locality_with_output=["node1", "node2"]) ) + s1a = o2._get_runtime_ray_remote_args()["scheduling_strategy"] + s1b = o2._get_runtime_ray_remote_args()["scheduling_strategy"] + s1c = o2._get_runtime_ray_remote_args()["scheduling_strategy"] + assert s1a.node_id == "node1" + assert s1b.node_id == "node2" + assert s1c.node_id == "node1" + s2a = o3._get_runtime_ray_remote_args()["scheduling_strategy"] + s2b = o3._get_runtime_ray_remote_args()["scheduling_strategy"] + s2c = o3._get_runtime_ray_remote_args()["scheduling_strategy"] + assert s2a.node_id == "node1" + assert s2b.node_id == "node2" + assert s2c.node_id == "node1" def test_calculate_topology_usage(): @@ -381,6 +404,31 @@ def test_execution_allowed_downstream_aware_memory_throttling(): ) +def test_execution_allowed_nothrottle(): + op = InputDataBuffer([]) + op.incremental_resource_usage = MagicMock(return_value=ExecutionResources()) + # Above global. + assert not _execution_allowed( + op, + TopologyResourceUsage( + ExecutionResources(object_store_memory=1000), + {op: DownstreamMemoryInfo(1, 1000)}, + ), + ExecutionResources(object_store_memory=900), + ) + + # Throttling disabled. 
+ op.throttling_disabled = MagicMock(return_value=True) + assert _execution_allowed( + op, + TopologyResourceUsage( + ExecutionResources(object_store_memory=1000), + {op: DownstreamMemoryInfo(1, 1000)}, + ), + ExecutionResources(object_store_memory=900), + ) + + if __name__ == "__main__": import sys diff --git a/python/ray/data/tests/test_streaming_integration.py b/python/ray/data/tests/test_streaming_integration.py index 0520f1409be90..69b8ae72c5770 100644 --- a/python/ray/data/tests/test_streaming_integration.py +++ b/python/ray/data/tests/test_streaming_integration.py @@ -1,4 +1,5 @@ import pytest +import threading import time from typing import List, Any @@ -16,6 +17,7 @@ from ray.data._internal.execution.operators.all_to_all_operator import AllToAllOperator from ray.data._internal.execution.operators.map_operator import MapOperator from ray.data._internal.execution.operators.input_data_buffer import InputDataBuffer +from ray.data._internal.execution.operators.output_splitter import OutputSplitter from ray.data._internal.execution.util import make_ref_bundles from ray._private.test_utils import wait_for_condition from ray.data.tests.conftest import * # noqa @@ -55,6 +57,82 @@ def reverse_sort(inputs: List[RefBundle], ctx): assert output == expected, (output, expected) +def test_output_split_e2e(ray_start_10_cpus_shared): + executor = StreamingExecutor(ExecutionOptions()) + inputs = make_ref_bundles([[x] for x in range(20)]) + o1 = InputDataBuffer(inputs) + o2 = OutputSplitter(o1, 2, equal=True) + it = executor.execute(o2) + + class Consume(threading.Thread): + def __init__(self, idx): + self.idx = idx + self.out = [] + super().__init__() + + def run(self): + while True: + try: + self.out.append(it.get_next(output_split_idx=self.idx)) + except Exception as e: + print(e) + raise + + c0 = Consume(0) + c1 = Consume(1) + c0.start() + c1.start() + c0.join() + c1.join() + assert len(c0.out) == 10, c0.out + assert len(c1.out) == 10, c0.out + + +def 
test_streaming_split_e2e(ray_start_10_cpus_shared): + def get_lengths(*iterators): + lengths = [] + for it in iterators: + x = 0 + for batch in it.iter_batches(): + x += len(batch) + lengths.append(x) + lengths.sort() + return lengths + + ds = ray.data.range(1000) + ( + i1, + i2, + ) = ds.streaming_split(2, equal=True) + lengths = get_lengths(i1, i2) + assert lengths == [500, 500], lengths + + ds = ray.data.range(1) + ( + i1, + i2, + ) = ds.streaming_split(2, equal=True) + lengths = get_lengths(i1, i2) + assert lengths == [0, 0], lengths + + ds = ray.data.range(1) + ( + i1, + i2, + ) = ds.streaming_split(2, equal=False) + lengths = get_lengths(i1, i2) + assert lengths == [0, 1], lengths + + ds = ray.data.range(1000, parallelism=10) + i1, i2, i3 = ds.streaming_split(3, equal=True) + lengths = get_lengths(i1, i2, i3) + assert lengths == [333, 333, 333], lengths + + i1, i2, i3 = ds.streaming_split(3, equal=False) + lengths = get_lengths(i1, i2, i3) + assert lengths == [300, 300, 400], lengths + + def test_e2e_option_propagation(ray_start_10_cpus_shared, restore_dataset_context): DatasetContext.get_current().new_execution_backend = True DatasetContext.get_current().use_streaming_executor = True diff --git a/python/ray/data/tests/test_transform_pyarrow.py b/python/ray/data/tests/test_transform_pyarrow.py index 68b5a91594a0f..3ff1e29d32632 100644 --- a/python/ray/data/tests/test_transform_pyarrow.py +++ b/python/ray/data/tests/test_transform_pyarrow.py @@ -1,7 +1,12 @@ +import os + import numpy as np +import pandas as pd import pyarrow as pa import pytest +import ray +from ray.data.block import BlockAccessor from ray.data.extensions import ( ArrowTensorArray, ArrowTensorType, @@ -286,6 +291,109 @@ def test_unify_schemas(): ) +def test_arrow_block_select(): + df = pd.DataFrame({"one": [10, 11, 12], "two": [11, 12, 13], "three": [14, 15, 16]}) + table = pa.Table.from_pandas(df) + block_accessor = BlockAccessor.for_block(table) + + block = block_accessor.select(["two"]) + 
assert block.schema == pa.schema([("two", pa.int64())]) + assert block.to_pandas().equals(df[["two"]]) + + block = block_accessor.select(["two", "one"]) + assert block.schema == pa.schema([("two", pa.int64()), ("one", pa.int64())]) + assert block.to_pandas().equals(df[["two", "one"]]) + + with pytest.raises(ValueError): + block = block_accessor.select([lambda x: x % 3, "two"]) + + +def test_arrow_block_slice_copy(): + # Test that ArrowBlock slicing properly copies the underlying Arrow + # table. + def check_for_copy(table1, table2, a, b, is_copy): + expected_slice = table1.slice(a, b - a) + assert table2.equals(expected_slice) + assert table2.schema == table1.schema + assert table1.num_columns == table2.num_columns + for col1, col2 in zip(table1.columns, table2.columns): + assert col1.num_chunks == col2.num_chunks + for chunk1, chunk2 in zip(col1.chunks, col2.chunks): + bufs1 = chunk1.buffers() + bufs2 = chunk2.buffers() + expected_offset = 0 if is_copy else a + assert chunk2.offset == expected_offset + assert len(chunk2) == b - a + if is_copy: + assert bufs2[1].address != bufs1[1].address + else: + assert bufs2[1].address == bufs1[1].address + + n = 20 + df = pd.DataFrame( + {"one": list(range(n)), "two": ["a"] * n, "three": [np.nan] + [1.5] * (n - 1)} + ) + table = pa.Table.from_pandas(df) + a, b = 5, 10 + block_accessor = BlockAccessor.for_block(table) + + # Test with copy. + table2 = block_accessor.slice(a, b, True) + check_for_copy(table, table2, a, b, is_copy=True) + + # Test without copy. + table2 = block_accessor.slice(a, b, False) + check_for_copy(table, table2, a, b, is_copy=False) + + +def test_arrow_block_slice_copy_empty(): + # Test that ArrowBlock slicing properly copies the underlying Arrow + # table when the table is empty. + df = pd.DataFrame({"one": []}) + table = pa.Table.from_pandas(df) + a, b = 0, 0 + expected_slice = table.slice(a, b - a) + block_accessor = BlockAccessor.for_block(table) + + # Test with copy. 
+ table2 = block_accessor.slice(a, b, True) + assert table2.equals(expected_slice) + assert table2.schema == table.schema + assert table2.num_rows == 0 + + # Test without copy. + table2 = block_accessor.slice(a, b, False) + assert table2.equals(expected_slice) + assert table2.schema == table.schema + assert table2.num_rows == 0 + + +def test_convert_to_pyarrow(ray_start_regular_shared, tmp_path): + ds = ray.data.range(100) + assert ds.to_dask().sum().compute()[0] == 4950 + path = os.path.join(tmp_path, "test_parquet_dir") + os.mkdir(path) + ds.write_parquet(path) + assert ray.data.read_parquet(path).count() == 100 + + +def test_pyarrow(ray_start_regular_shared): + ds = ray.data.range_table(5) + assert ds.map(lambda x: {"b": x["value"] + 2}).take() == [ + {"b": 2}, + {"b": 3}, + {"b": 4}, + {"b": 5}, + {"b": 6}, + ] + assert ds.map(lambda x: {"b": x["value"] + 2}).filter( + lambda x: x["b"] % 2 == 0 + ).take() == [{"b": 2}, {"b": 4}, {"b": 6}] + assert ds.filter(lambda x: x["value"] == 0).flat_map( + lambda x: [{"b": x["value"] + 2}, {"b": x["value"] + 20}] + ).take() == [{"b": 2}, {"b": 20}] + + if __name__ == "__main__": import sys diff --git a/python/ray/experimental/state/common.py b/python/ray/experimental/state/common.py index 4042312a735fd..c6fc8a8eaf0a4 100644 --- a/python/ray/experimental/state/common.py +++ b/python/ray/experimental/state/common.py @@ -564,6 +564,8 @@ class TaskState(StateSchema): start_time_ms: Optional[int] = state_column(detail=True, filterable=False) #: The time when the task is finished or failed. A Unix timestamp in ms. end_time_ms: Optional[int] = state_column(detail=True, filterable=False) + #: Task error type. 
+ error_type: Optional[str] = state_column(detail=False, filterable=False) @dataclass(init=True) diff --git a/python/ray/serve/_private/common.py b/python/ray/serve/_private/common.py index 69f0f570c7691..c14493b488d38 100644 --- a/python/ray/serve/_private/common.py +++ b/python/ray/serve/_private/common.py @@ -311,7 +311,18 @@ def __post_init__(self): ] ) ) + + # RunningReplicaInfo class set frozen=True, this is the hacky way to set + # new attribute for the class. object.__setattr__(self, "_hash", hash_val) def __hash__(self): return self._hash + + def __eq__(self, other): + return all( + [ + isinstance(other, RunningReplicaInfo), + self._hash == other._hash, + ] + ) diff --git a/python/ray/serve/schema.py b/python/ray/serve/schema.py index a5672fa4cca9a..1e3d9703df5e8 100644 --- a/python/ray/serve/schema.py +++ b/python/ray/serve/schema.py @@ -251,7 +251,7 @@ class ServeApplicationSchema(BaseModel, extra=Extra.forbid): "Application name, the name should be unique within the serve instance" ), ) - route_prefix: str = Field( + route_prefix: Optional[str] = Field( default="/", description=( "Route prefix for HTTP requests. If not provided, it will use" @@ -260,7 +260,6 @@ class ServeApplicationSchema(BaseModel, extra=Extra.forbid): ), ) import_path: str = Field( - default=None, description=( "An import path to a bound deployment node. Should be of the " 'form "module.submodule_1...submodule_n.' @@ -477,6 +476,35 @@ class ServeDeploySchema(BaseModel, extra=Extra.forbid): description=("The set of Serve applications to run on the Ray cluster."), ) + @validator("applications") + def application_names_unique(cls, v): + # Ensure there are no duplicate applications listed + names = [app.name for app in v] + duplicates = {f'"{name}"' for name in names if names.count(name) > 1} + if len(duplicates): + apps_str = ("application " if len(duplicates) == 1 else "applications ") + ( + ", ".join(duplicates) + ) + raise ValueError( + f"Found multiple configs for {apps_str}. 
Please remove all duplicates." + ) + return v + + @validator("applications") + def application_routes_unique(cls, v): + # Ensure each application with a non-null route prefix has unique route prefixes + routes = [app.route_prefix for app in v if app.route_prefix is not None] + duplicates = {f'"{route}"' for route in routes if routes.count(route) > 1} + if len(duplicates): + routes_str = ( + "route prefix " if len(duplicates) == 1 else "route prefixes " + ) + (", ".join(duplicates)) + raise ValueError( + f"Found duplicate applications for {routes_str}. Please ensure each " + "application's route_prefix is unique." + ) + return v + @staticmethod def get_empty_schema_dict() -> Dict: """Returns an empty deploy schema dictionary. diff --git a/python/ray/serve/tests/test_router.py b/python/ray/serve/tests/test_router.py index 80520a064a848..422a245acfd7c 100644 --- a/python/ray/serve/tests/test_router.py +++ b/python/ray/serve/tests/test_router.py @@ -3,6 +3,7 @@ controller or the actual replica wrapper, use mock if necessary. """ import asyncio +import copy import pytest @@ -118,6 +119,23 @@ async def num_queries(self): await asyncio.sleep(0.2) assert not third_ref_pending_task.done() + # Let's make sure in flight queries is 1 for each replica. 
+ assert len(rs.in_flight_queries[replicas[0]]) == 1 + assert len(rs.in_flight_queries[replicas[1]]) == 1 + + # Let's copy a new RunningReplicaInfo object and update the router + cur_replicas_info = list(rs.in_flight_queries.keys()) + replicas = copy.deepcopy(cur_replicas_info) + assert id(replicas[0].actor_handle) != id(cur_replicas_info[0].actor_handle) + assert replicas[0].replica_tag == cur_replicas_info[0].replica_tag + assert id(replicas[1].actor_handle) != id(cur_replicas_info[1].actor_handle) + assert replicas[1].replica_tag == cur_replicas_info[1].replica_tag + rs.update_running_replicas(replicas) + + # Let's make sure in flight queries is 1 for each replica even if replicas update + assert len(rs.in_flight_queries[replicas[0]]) == 1 + assert len(rs.in_flight_queries[replicas[1]]) == 1 + # Let's unblock the two replicas await signal.send.remote() assert await first_ref == "DONE" diff --git a/python/ray/serve/tests/test_schema.py b/python/ray/serve/tests/test_schema.py index 86bde26856f48..05da6629b8cae 100644 --- a/python/ray/serve/tests/test_schema.py +++ b/python/ray/serve/tests/test_schema.py @@ -18,6 +18,7 @@ DeploymentSchema, ServeApplicationSchema, ServeStatusSchema, + ServeDeploySchema, serve_status_to_schema, ) from ray.util.accelerators.accelerators import NVIDIA_TESLA_V100, NVIDIA_TESLA_P4 @@ -617,6 +618,11 @@ def test_remove_app_name_from_deployment_names(self): with pytest.raises(AssertionError): config.remove_app_name_from_deployment_names() + def test_serve_application_import_path_required(self): + # If no import path is specified, this should not parse successfully + with pytest.raises(ValidationError): + ServeApplicationSchema.parse_obj({"host": "127.0.0.1", "port": 8000}) + class TestServeDeploySchema: def test_serve_application_to_deploy_config(self): @@ -647,6 +653,90 @@ def test_serve_application_to_deploy_config(self): "applications": [app_config_dict], } + def test_deploy_config_duplicate_apps(self): + deploy_config_dict = { + "host": 
"127.0.0.1", + "port": 8000, + "applications": [ + { + "name": "app1", + "route_prefix": "/alice", + "import_path": "module.graph", + }, + { + "name": "app2", + "route_prefix": "/charlie", + "import_path": "module.graph", + }, + ], + } + ServeDeploySchema.parse_obj(deploy_config_dict) + + # Duplicate app1 + deploy_config_dict["applications"].append( + {"name": "app1", "route_prefix": "/bob", "import_path": "module.graph"}, + ) + with pytest.raises(ValidationError) as e: + ServeDeploySchema.parse_obj(deploy_config_dict) + assert "app1" in str(e.value) and "app2" not in str(e.value) + + # Duplicate app2 + deploy_config_dict["applications"].append( + {"name": "app2", "route_prefix": "/david", "import_path": "module.graph"} + ) + with pytest.raises(ValidationError) as e: + ServeDeploySchema.parse_obj(deploy_config_dict) + assert "app1" in str(e.value) and "app2" in str(e.value) + + def test_deploy_config_duplicate_routes1(self): + """Test that apps with duplicate route prefixes raises validation error""" + deploy_config_dict = { + "host": "127.0.0.1", + "port": 8000, + "applications": [ + { + "name": "app1", + "route_prefix": "/alice", + "import_path": "module.graph", + }, + {"name": "app2", "route_prefix": "/bob", "import_path": "module.graph"}, + ], + } + ServeDeploySchema.parse_obj(deploy_config_dict) + + # Duplicate route prefix /alice + deploy_config_dict["applications"].append( + {"name": "app3", "route_prefix": "/alice", "import_path": "module.graph"}, + ) + with pytest.raises(ValidationError) as e: + ServeDeploySchema.parse_obj(deploy_config_dict) + assert "alice" in str(e.value) and "bob" not in str(e.value) + + # Duplicate route prefix /bob + deploy_config_dict["applications"].append( + {"name": "app4", "route_prefix": "/bob", "import_path": "module.graph"}, + ) + with pytest.raises(ValidationError) as e: + ServeDeploySchema.parse_obj(deploy_config_dict) + assert "alice" in str(e.value) and "bob" in str(e.value) + + def 
test_deploy_config_duplicate_routes2(self): + """Test that multiple apps with route_prefix set to None parses with no error""" + deploy_config_dict = { + "host": "127.0.0.1", + "port": 8000, + "applications": [ + { + "name": "app1", + "route_prefix": "/app1", + "import_path": "module.graph", + }, + {"name": "app2", "route_prefix": None, "import_path": "module.graph"}, + {"name": "app3", "route_prefix": None, "import_path": "module.graph"}, + ], + } + ServeDeploySchema.parse_obj(deploy_config_dict) + class TestServeStatusSchema: def get_valid_serve_status_schema(self): diff --git a/python/ray/serve/tests/test_standalone2.py b/python/ray/serve/tests/test_standalone2.py index 94136b23b3f83..96ea3b21f6585 100644 --- a/python/ray/serve/tests/test_standalone2.py +++ b/python/ray/serve/tests/test_standalone2.py @@ -1204,6 +1204,72 @@ async def waiter(*args): serve.shutdown() +@pytest.mark.parametrize( + "ray_instance", + [ + { + "LISTEN_FOR_CHANGE_REQUEST_TIMEOUT_S_LOWER_BOUND": "1", + "LISTEN_FOR_CHANGE_REQUEST_TIMEOUT_S_UPPER_BOUND": "2", + }, + ], + indirect=True, +) +def test_long_poll_timeout_with_max_concurrent_queries(ray_instance): + """Test max_concurrent_queries can be honorded with long poll timeout + + issue: https://github.com/ray-project/ray/issues/32652 + """ + + signal_actor = SignalActor.remote() + + @serve.deployment(max_concurrent_queries=1) + async def f(): + await signal_actor.wait.remote() + return "hello" + + handle = serve.run(f.bind()) + first_ref = handle.remote() + + # Clear all the internal longpoll client objects within handle + # long poll client will receive new updates from long poll host, + # this is to simulate the longpoll timeout + object_snapshots1 = handle.router.long_poll_client.object_snapshots + handle.router.long_poll_client._reset() + wait_for_condition( + lambda: len(handle.router.long_poll_client.object_snapshots) > 0, timeout=10 + ) + object_snapshots2 = handle.router.long_poll_client.object_snapshots + + # Check object 
snapshots between timeout interval + assert object_snapshots1.keys() == object_snapshots2.keys() + assert len(object_snapshots1.keys()) == 1 + key = list(object_snapshots1.keys())[0] + assert ( + object_snapshots1[key][0].actor_handle != object_snapshots2[key][0].actor_handle + ) + assert ( + object_snapshots1[key][0].actor_handle._actor_id + == object_snapshots2[key][0].actor_handle._actor_id + ) + + # Make sure the inflight queries still one + assert len(handle.router._replica_set.in_flight_queries) == 1 + key = list(handle.router._replica_set.in_flight_queries.keys())[0] + assert len(handle.router._replica_set.in_flight_queries[key]) == 1 + + # Make sure the first request is being run. + replicas = list(handle.router._replica_set.in_flight_queries.keys()) + assert len(handle.router._replica_set.in_flight_queries[replicas[0]]) == 1 + # First ref should be still ongoing + with pytest.raises(ray.exceptions.GetTimeoutError): + ray.get(first_ref, timeout=1) + # Unblock the first request. + signal_actor.send.remote() + assert ray.get(first_ref) == "hello" + + serve.shutdown() + + def test_shutdown_remote(start_and_shutdown_ray_cli_function): """Check that serve.shutdown() works on a remote Ray cluster.""" diff --git a/python/ray/tests/spark/test_utils.py b/python/ray/tests/spark/test_utils.py index d9c7e570483f6..864e71ca48327 100644 --- a/python/ray/tests/spark/test_utils.py +++ b/python/ray/tests/spark/test_utils.py @@ -8,6 +8,7 @@ _calc_mem_per_ray_worker_node, _get_avail_mem_per_ray_worker_node, ) +from ray.util.spark.cluster_init import _convert_ray_node_options pytestmark = pytest.mark.skipif( not sys.platform.startswith("linux"), @@ -86,6 +87,16 @@ def test_get_avail_mem_per_ray_worker_node(monkeypatch): ) == (280000, 120000, None, None) +def test_convert_ray_node_options(): + assert _convert_ray_node_options( + { + "cluster_name": "aBc", + "disable_usage_stats": None, + "include_dashboard": False, + } + ) == ["--cluster-name=aBc", "--disable-usage-stats", 
"--include-dashboard=False"] + + if __name__ == "__main__": if os.environ.get("PARALLEL_CI"): sys.exit(pytest.main(["-n", "auto", "--boxed", "-vs", __file__])) diff --git a/python/ray/tests/test_failure_4.py b/python/ray/tests/test_failure_4.py index ab1394b3ac6aa..70e1c292847b9 100644 --- a/python/ray/tests/test_failure_4.py +++ b/python/ray/tests/test_failure_4.py @@ -7,6 +7,7 @@ import psutil import pytest from grpc._channel import _InactiveRpcError +from ray._private.state_api_test_utils import verify_failed_task import ray import ray._private.ray_constants as ray_constants @@ -555,13 +556,20 @@ def func(): # The lease request should wait inside raylet # since there is no available resources. - ret = func.remote() + ret = func.options(name="task-local-raylet-dead").remote() # Waiting for the lease request to reach raylet. time.sleep(1) head.kill_raylet() with pytest.raises(LocalRayletDiedError): ray.get(ret) + # Check the task failure states for observability. + wait_for_condition( + verify_failed_task, + name="task-local-raylet-dead", + error_type="LOCAL_RAYLET_DIED", + ) + def test_locality_aware_scheduling_for_dead_nodes(shutdown_only): """Test that locality-ware scheduling can handle dead nodes.""" diff --git a/python/ray/tests/test_memory_pressure.py b/python/ray/tests/test_memory_pressure.py index 9b3eb86430237..0148d2cd727ad 100644 --- a/python/ray/tests/test_memory_pressure.py +++ b/python/ray/tests/test_memory_pressure.py @@ -14,6 +14,7 @@ import numpy as np from ray._private.utils import get_system_memory from ray._private.utils import get_used_memory +from ray.experimental.state.api import list_tasks from ray.experimental.state.state_manager import StateDataSourceClient @@ -339,7 +340,7 @@ async def test_task_oom_logs_error(ray_with_memory_monitor): bytes_to_alloc = get_additional_bytes_to_reach_memory_usage_pct(1) with pytest.raises(ray.exceptions.OutOfMemoryError) as _: ray.get( - allocate_memory.options(max_retries=0).remote( + 
allocate_memory.options(max_retries=0, name="allocate_memory").remote( allocate_bytes=bytes_to_alloc, allocate_interval_s=0, post_allocate_sleep_s=1000, @@ -355,8 +356,16 @@ async def test_task_oom_logs_error(ray_with_memory_monitor): verified = True assert verified - # TODO(clarng): verify task info once state_api_client.get_task_info - # returns the crashed task. + def verify_oom_task_error(): + tasks = list_tasks(filters=[("name", "=", "allocate_memory")]) + print(tasks) + assert len(tasks) == 1, "no retries should be expected." + assert tasks[0]["state"] == "FAILED" + assert tasks[0]["error_type"] == "OUT_OF_MEMORY" + return True + + wait_for_condition(verify_oom_task_error) + # TODO(clarng): verify log info once state api can dump log info diff --git a/python/ray/tests/test_multiprocessing.py b/python/ray/tests/test_multiprocessing.py index 07051b1ef3659..7a82f39933f5c 100644 --- a/python/ray/tests/test_multiprocessing.py +++ b/python/ray/tests/test_multiprocessing.py @@ -6,7 +6,6 @@ import time import random from collections import defaultdict -import warnings import queue import math @@ -509,26 +508,17 @@ def f(args): result_iter.next() -@pytest.mark.filterwarnings( - "default:Passing a non-iterable argument:ray.util.annotations.RayDeprecationWarning" -) -def test_warn_on_non_iterable_imap_or_imap_unordered(pool): +def test_imap_fail_on_non_iterable(pool): def fn(_): pass non_iterable = 3 - with warnings.catch_warnings(record=True) as w: + with pytest.raises(TypeError, match="object is not iterable"): pool.imap(fn, non_iterable) - assert any( - "Passing a non-iterable argument" in str(warning.message) for warning in w - ) - with warnings.catch_warnings(record=True) as w: + with pytest.raises(TypeError, match="object is not iterable"): pool.imap_unordered(fn, non_iterable) - assert any( - "Passing a non-iterable argument" in str(warning.message) for warning in w - ) @pytest.mark.parametrize("use_iter", [True, False]) diff --git 
a/python/ray/tests/test_task_events.py b/python/ray/tests/test_task_events.py index 6311c9111ac38..cb3e88c1876a5 100644 --- a/python/ray/tests/test_task_events.py +++ b/python/ray/tests/test_task_events.py @@ -3,7 +3,9 @@ import pytest import threading import time - +from ray._private.state_api_test_utils import verify_failed_task +from ray.exceptions import RuntimeEnvSetupError +from ray.runtime_env import RuntimeEnv import ray from ray.experimental.state.common import ListApiOptions, StateResource from ray._private.test_utils import ( @@ -82,6 +84,172 @@ def verify(): ) +def test_failed_task_error(shutdown_only): + ray.init(_system_config=_SYSTEM_CONFIG) + + # Test failed task with TASK_EXECUTION_EXCEPTION + @ray.remote + def fail(x=None): + if x is not None: + time.sleep(x) + raise ValueError("fail is expected to failed") + + with pytest.raises(ray.exceptions.RayTaskError): + ray.get(fail.options(name="fail").remote()) + + wait_for_condition( + verify_failed_task, name="fail", error_type="TASK_EXECUTION_EXCEPTION" + ) + + # Test canceled tasks with TASK_CANCELLED + @ray.remote + def sleep(): + time.sleep(999) + + with pytest.raises(ray.exceptions.TaskCancelledError): + t = sleep.options(name="sleep-cancel").remote() + ray.cancel(t) + ray.get(t) + + wait_for_condition( + verify_failed_task, name="sleep-cancel", error_type="TASK_CANCELLED" + ) + + # Test task failed when worker killed :WORKER_DIED + @ray.remote(max_retries=0) + def die(): + exit(1) + + with pytest.raises(ray.exceptions.WorkerCrashedError): + ray.get(die.options(name="die-worker").remote()) + + wait_for_condition(verify_failed_task, name="die-worker", error_type="WORKER_DIED") + + # Test actor task failed with actor dead: ACTOR_DIED + @ray.remote + class Actor: + def f(self): + time.sleep(999) + + a = Actor.remote() + with pytest.raises(ray.exceptions.RayActorError): + ray.kill(a) + ray.get(a.f.options(name="actor-killed").remote()) + + wait_for_condition(verify_failed_task, name="actor-killed", 
error_type="ACTOR_DIED") + + +def test_failed_task_failed_due_to_node_failure(ray_start_cluster): + cluster = ray_start_cluster + cluster.add_node(num_cpus=1) + ray.init(address=cluster.address) + node = cluster.add_node(num_cpus=2) + + driver_script = """ +import ray +ray.init("auto") + +@ray.remote(num_cpus=2, max_retries=0) +def sleep(): + import time + time.sleep(999) + +x = sleep.options(name="node-killed").remote() +ray.get(x) + """ + + run_string_as_driver_nonblocking(driver_script) + + def driver_running(): + t = list_tasks(filters=[("name", "=", "node-killed")]) + return len(t) > 0 + + wait_for_condition(driver_running) + + # Kill the node + cluster.remove_node(node) + + wait_for_condition(verify_failed_task, name="node-killed", error_type="NODE_DIED") + + +def test_failed_task_unschedulable(shutdown_only): + ray.init(num_cpus=1, _system_config=_SYSTEM_CONFIG) + + node_id = ray.get_runtime_context().get_node_id() + policy = ray.util.scheduling_strategies.NodeAffinitySchedulingStrategy( + node_id=node_id, + soft=False, + ) + + @ray.remote + def task(): + pass + + task.options( + scheduling_strategy=policy, + name="task-unschedulable", + num_cpus=2, + ).remote() + + wait_for_condition( + verify_failed_task, + name="task-unschedulable", + error_type="TASK_UNSCHEDULABLE_ERROR", + ) + + +def test_failed_task_removed_placement_group(shutdown_only, monkeypatch): + ray.init(num_cpus=2, _system_config=_SYSTEM_CONFIG) + from ray.util.placement_group import placement_group, remove_placement_group + from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy + + pg = placement_group([{"CPU": 2}]) + ray.get(pg.ready()) + + @ray.remote(num_cpus=2) + def sleep(): + time.sleep(999) + + with monkeypatch.context() as m: + m.setenv( + "RAY_testing_asio_delay_us", + "NodeManagerService.grpc_server.RequestWorkerLease=3000000:3000000", + ) + + sleep.options( + scheduling_strategy=PlacementGroupSchedulingStrategy(placement_group=pg), + name="task-pg-removed", + 
max_retries=0, + ).remote() + + remove_placement_group(pg) + + wait_for_condition( + verify_failed_task, + name="task-pg-removed", + error_type="TASK_PLACEMENT_GROUP_REMOVED", + ) + + +def test_failed_task_runtime_env_setup(shutdown_only): + @ray.remote + def f(): + pass + + bad_env = RuntimeEnv(conda={"dependencies": ["_this_does_not_exist"]}) + with pytest.raises( + RuntimeEnvSetupError, + match="ResolvePackageNotFound", + ): + ray.get(f.options(runtime_env=bad_env, name="task-runtime-env-failed").remote()) + + wait_for_condition( + verify_failed_task, + name="task-runtime-env-failed", + error_type="RUNTIME_ENV_SETUP_FAILED", + ) + + def test_fault_tolerance_parent_failed(shutdown_only): ray.init(num_cpus=4, _system_config=_SYSTEM_CONFIG) diff --git a/python/ray/train/_internal/backend_executor.py b/python/ray/train/_internal/backend_executor.py index 6cebbc9331f70..6cc713fd67e0e 100644 --- a/python/ray/train/_internal/backend_executor.py +++ b/python/ray/train/_internal/backend_executor.py @@ -111,6 +111,15 @@ def start( actor_cls_kwargs=train_cls_kwargs, placement_group=placement_group, ) + # Hack to avoid OOMs. + # This is just a temporary solution for Train loading entire checkpoints + # into memory by ensuring that the rank 0 worker is on the same node as + # trainable, thus allowing for lazy checkpoint transfer to be used. + # See https://github.com/ray-project/ray/issues/33073 + # for more context. 
+ # TODO remove + if self._trial_info and self._trial_info.driver_ip: + self.worker_group._move_workers_with_ip_to_front(self._trial_info.driver_ip) try: if initialization_hook: self._initialization_hook = initialization_hook diff --git a/python/ray/train/_internal/worker_group.py b/python/ray/train/_internal/worker_group.py index 07bdff2807011..ea4ee58be129d 100644 --- a/python/ray/train/_internal/worker_group.py +++ b/python/ray/train/_internal/worker_group.py @@ -345,5 +345,26 @@ def add_workers(self, num_workers: int): for i in range(len(new_actors)): self.workers.append(Worker(actor=new_actors[i], metadata=metadata[i])) + def _move_workers_with_ip_to_front(self, ip): + # Hack to avoid OOMs. + # This is just a temporary solution for Train loading entire checkpoints + # into memory by ensuring that the rank 0 worker is on the same node as + # trainable, thus allowing for lazy checkpoint transfer to be used. + # See https://github.com/ray-project/ray/issues/33073 + # for more context. 
+ # TODO remove + workers_with_ip = [] + indices_to_remove = set() + for i, worker in enumerate(self.workers): + if worker.metadata.node_ip == ip: + workers_with_ip.append(worker) + indices_to_remove.add(i) + if workers_with_ip: + self.workers = workers_with_ip + [ + worker + for i, worker in enumerate(self.workers) + if i not in indices_to_remove + ] + def __len__(self): return len(self.workers) diff --git a/python/ray/train/batch_predictor.py b/python/ray/train/batch_predictor.py index ea1219f0fac97..6b0457919e56d 100644 --- a/python/ray/train/batch_predictor.py +++ b/python/ray/train/batch_predictor.py @@ -36,6 +36,7 @@ def __init__( self._predictor_cls = predictor_cls self._predictor_kwargs = predictor_kwargs self._override_preprocessor: Optional[Preprocessor] = None + self._override_preprocessor_set = False def __repr__(self): return ( @@ -98,7 +99,7 @@ def _predict_pandas(self, df, **kwargs) -> "pd.DataFrame": def get_preprocessor(self) -> Preprocessor: """Get the preprocessor to use prior to executing predictions.""" - if self._override_preprocessor: + if self._override_preprocessor_set: return self._override_preprocessor return self._checkpoint.get_preprocessor() @@ -106,6 +107,7 @@ def get_preprocessor(self) -> Preprocessor: def set_preprocessor(self, preprocessor: Preprocessor) -> None: """Set the preprocessor to use prior to executing predictions.""" self._override_preprocessor = preprocessor + self._override_preprocessor_set = True def predict( self, diff --git a/python/ray/train/examples/pytorch/torch_quick_start.py b/python/ray/train/examples/pytorch/torch_quick_start.py index 9f10e6333acbb..dd966cefc8f6a 100644 --- a/python/ray/train/examples/pytorch/torch_quick_start.py +++ b/python/ray/train/examples/pytorch/torch_quick_start.py @@ -5,79 +5,93 @@ # __torch_setup_begin__ import torch import torch.nn as nn - -num_samples = 20 -input_size = 10 -layer_size = 15 -output_size = 5 +from torch.utils.data import DataLoader +from torchvision import datasets 
+from torchvision.transforms import ToTensor + +def get_dataset(): + return datasets.FashionMNIST( + root="/tmp/data", + train=True, + download=True, + transform=ToTensor(), + ) class NeuralNetwork(nn.Module): def __init__(self): - super(NeuralNetwork, self).__init__() - self.layer1 = nn.Linear(input_size, layer_size) - self.relu = nn.ReLU() - self.layer2 = nn.Linear(layer_size, output_size) - - def forward(self, input): - return self.layer2(self.relu(self.layer1(input))) - -# In this example we use a randomly generated dataset. -input = torch.randn(num_samples, input_size) -labels = torch.randn(num_samples, output_size) - + super().__init__() + self.flatten = nn.Flatten() + self.linear_relu_stack = nn.Sequential( + nn.Linear(28 * 28, 512), + nn.ReLU(), + nn.Linear(512, 512), + nn.ReLU(), + nn.Linear(512, 10), + ) + + def forward(self, inputs): + inputs = self.flatten(inputs) + logits = self.linear_relu_stack(inputs) + return logits # __torch_setup_end__ # __torch_single_begin__ - -import torch.optim as optim - def train_func(): num_epochs = 3 + batch_size = 64 + + dataset = get_dataset() + dataloader = DataLoader(dataset, batch_size=batch_size) + model = NeuralNetwork() - loss_fn = nn.MSELoss() - optimizer = optim.SGD(model.parameters(), lr=0.1) + + criterion = nn.CrossEntropyLoss() + optimizer = torch.optim.SGD(model.parameters(), lr=0.01) for epoch in range(num_epochs): - output = model(input) - loss = loss_fn(output, labels) - optimizer.zero_grad() - loss.backward() - optimizer.step() + for inputs, labels in dataloader: + optimizer.zero_grad() + pred = model(inputs) + loss = criterion(pred, labels) + loss.backward() + optimizer.step() print(f"epoch: {epoch}, loss: {loss.item()}") - # __torch_single_end__ # __torch_distributed_begin__ - from ray import train def train_func_distributed(): num_epochs = 3 + batch_size = 64 + + dataset = get_dataset() + dataloader = DataLoader(dataset, batch_size=batch_size) + dataloader = train.torch.prepare_data_loader(dataloader) 
+ model = NeuralNetwork() model = train.torch.prepare_model(model) - loss_fn = nn.MSELoss() - optimizer = optim.SGD(model.parameters(), lr=0.1) + + criterion = nn.CrossEntropyLoss() + optimizer = torch.optim.SGD(model.parameters(), lr=0.01) for epoch in range(num_epochs): - output = model(input) - loss = loss_fn(output, labels) - optimizer.zero_grad() - loss.backward() - optimizer.step() + for inputs, labels in dataloader: + optimizer.zero_grad() + pred = model(inputs) + loss = criterion(pred, labels) + loss.backward() + optimizer.step() print(f"epoch: {epoch}, loss: {loss.item()}") - # __torch_distributed_end__ if __name__ == "__main__": # __torch_single_run_begin__ - train_func() - # __torch_single_run_end__ # __torch_trainer_begin__ - from ray.train.torch import TorchTrainer from ray.air.config import ScalingConfig @@ -86,10 +100,8 @@ def train_func_distributed(): trainer = TorchTrainer( train_func_distributed, - scaling_config=ScalingConfig( - num_workers=4, use_gpu=use_gpu) + scaling_config=ScalingConfig(num_workers=4, use_gpu=use_gpu) ) results = trainer.fit() - # __torch_trainer_end__ diff --git a/python/ray/train/examples/tf/tensorflow_quick_start.py b/python/ray/train/examples/tf/tensorflow_quick_start.py index 15a8086c1a276..fea8983bb5a8a 100644 --- a/python/ray/train/examples/tf/tensorflow_quick_start.py +++ b/python/ray/train/examples/tf/tensorflow_quick_start.py @@ -3,7 +3,6 @@ # isort: skip_file # __tf_setup_begin__ - import numpy as np import tensorflow as tf @@ -32,21 +31,17 @@ def build_and_compile_cnn_model(): optimizer=tf.keras.optimizers.SGD(learning_rate=0.001), metrics=['accuracy']) return model - # __tf_setup_end__ # __tf_single_begin__ - def train_func(): batch_size = 64 single_worker_dataset = mnist_dataset(batch_size) single_worker_model = build_and_compile_cnn_model() single_worker_model.fit(single_worker_dataset, epochs=3, steps_per_epoch=70) - # __tf_single_end__ # __tf_distributed_begin__ - import json import os @@ -66,18 +61,14 @@ def 
train_func_distributed(): multi_worker_model = build_and_compile_cnn_model() multi_worker_model.fit(multi_worker_dataset, epochs=3, steps_per_epoch=70) - # __tf_distributed_end__ if __name__ == "__main__": # __tf_single_run_begin__ - train_func() - # __tf_single_run_end__ # __tf_trainer_begin__ - from ray.train.tensorflow import TensorflowTrainer from ray.air.config import ScalingConfig @@ -87,5 +78,4 @@ def train_func_distributed(): trainer = TensorflowTrainer(train_func_distributed, scaling_config=ScalingConfig(num_workers=4, use_gpu=use_gpu)) trainer.fit() - # __tf_trainer_end__ diff --git a/python/ray/train/huggingface/_huggingface_utils.py b/python/ray/train/huggingface/_huggingface_utils.py index ba886a6a9ff02..4254b78538369 100644 --- a/python/ray/train/huggingface/_huggingface_utils.py +++ b/python/ray/train/huggingface/_huggingface_utils.py @@ -7,7 +7,7 @@ from transformers.trainer_utils import IntervalStrategy from ray.air import session -from ray.data.dataset import Dataset +from ray.data import DatasetIterator from ray.train.huggingface.huggingface_checkpoint import HuggingFaceCheckpoint if TYPE_CHECKING: @@ -63,7 +63,7 @@ def get_train_dataloader(self): class RayDatasetHFIterable(datasets.iterable_dataset.ExamplesIterable): """HF ExamplesIterable backed by a Ray Dataset.""" - def __init__(self, dataset: Dataset) -> None: + def __init__(self, dataset: DatasetIterator) -> None: self.dataset = dataset self.generate_examples_fn = self.dataset.iter_rows @@ -75,7 +75,7 @@ def __iter__(self): yield (0, {k: v for k, v in row.as_pydict().items()}) -def process_dataset_for_hf(dataset: Dataset) -> "IterableDataset": +def process_dataset_for_hf(dataset: DatasetIterator) -> "IterableDataset": """Converts a Ray Dataset into a HF IterableDataset.""" hf_iterable = RayDatasetHFIterable(dataset) @@ -84,8 +84,8 @@ def process_dataset_for_hf(dataset: Dataset) -> "IterableDataset": ).with_format("torch") try: - dataset_length = dataset.count() - except ValueError: + 
dataset_length = dataset._base_dataset.count() + except (ValueError, AttributeError): # pipeline case dataset_length = None @@ -94,8 +94,8 @@ def process_dataset_for_hf(dataset: Dataset) -> "IterableDataset": def process_datasets( - train_dataset: Dataset, - eval_dataset: Dataset, + train_dataset: DatasetIterator, + eval_dataset: DatasetIterator, ) -> Tuple["IterableDataset", "IterableDataset"]: """Convert Ray train and validation to HF IterableDatasets.""" train_torch_dataset = process_dataset_for_hf(train_dataset) diff --git a/python/ray/train/huggingface/huggingface_predictor.py b/python/ray/train/huggingface/huggingface_predictor.py index b469dcaf819d7..aef519970df53 100644 --- a/python/ray/train/huggingface/huggingface_predictor.py +++ b/python/ray/train/huggingface/huggingface_predictor.py @@ -74,7 +74,8 @@ def __init__( "prediction will only use CPU. Please consider explicitly " "setting `HuggingFacePredictor(use_gpu=True)` or " "`batch_predictor.predict(ds, num_gpus_per_worker=1)` to " - "enable GPU prediction." + "enable GPU prediction. Ignore if you have set `device` or " + "`device_map` arguments in the `pipeline` manually." ) super().__init__(preprocessor) @@ -98,6 +99,11 @@ def from_checkpoint( The checkpoint is expected to be a result of ``HuggingFaceTrainer``. + Note that the Transformers ``pipeline`` used internally expects to + recieve raw text. If you have any Preprocessors in Checkpoint + that tokenize the data, remove them by calling + ``Checkpoint.set_preprocessor(None)`` beforehand. + Args: checkpoint: The checkpoint to load the model, tokenizer and preprocessor from. It is expected to be from the result of a @@ -111,13 +117,14 @@ def from_checkpoint( initialization. If ``pipeline`` is None, this must contain the 'task' argument. Cannot contain 'model'. Can be used to override the tokenizer with 'tokenizer'. If ``use_gpu`` is - True, 'device' will be set to 0 by default. 
+ True, 'device' will be set to 0 by default, unless 'device_map' is + passed. """ if not pipeline_cls and "task" not in pipeline_kwargs: raise ValueError( "If `pipeline_cls` is not specified, 'task' must be passed as a kwarg." ) - if use_gpu: + if use_gpu and "device_map" not in pipeline_kwargs: # default to using the GPU with the first index pipeline_kwargs.setdefault("device", 0) pipeline_cls = pipeline_cls or pipeline_factory diff --git a/python/ray/train/huggingface/huggingface_trainer.py b/python/ray/train/huggingface/huggingface_trainer.py index 7c548d808b106..f58e2cc993d6f 100644 --- a/python/ray/train/huggingface/huggingface_trainer.py +++ b/python/ray/train/huggingface/huggingface_trainer.py @@ -63,16 +63,6 @@ sys.modules[spec.name] = datasets_modules spec.loader.exec_module(datasets_modules) -# This trainer uses a special checkpoint syncing logic. -# Because HF checkpoints are very large dirs (at least several GBs), -# we use directory checkpoints that are synced between nodes when -# required instead of serializing the checkpoints and sending -# bytes over nodes. This is a much more performant solution for -# large directory checkpoints. The current implementation -# is special for HuggingFaceTrainer, but can and should be -# made generic. -# TODO(ml-team): Make dir syncing checkpoint logic generic. 
- TRAINER_INIT_FN_KEY = "_trainer_init_per_worker" diff --git a/python/ray/train/tests/test_batch_predictor.py b/python/ray/train/tests/test_batch_predictor.py index c361b8c415c90..402211d35c303 100644 --- a/python/ray/train/tests/test_batch_predictor.py +++ b/python/ray/train/tests/test_batch_predictor.py @@ -504,6 +504,18 @@ def test_get_and_set_preprocessor(): 12.0, ] + # Remove preprocessor + batch_predictor.set_preprocessor(None) + assert batch_predictor.get_preprocessor() is None + + output_ds = batch_predictor.predict(test_dataset) + assert output_ds.to_pandas().to_numpy().squeeze().tolist() == [ + 0.0, + 2.0, + 4.0, + 6.0, + ] + def test_batch_prediction_large_predictor_kwarg(): class StubPredictor(Predictor): diff --git a/python/ray/train/tests/test_huggingface_trainer.py b/python/ray/train/tests/test_huggingface_trainer.py index 5292cca60bee6..2698d7524a8ff 100644 --- a/python/ray/train/tests/test_huggingface_trainer.py +++ b/python/ray/train/tests/test_huggingface_trainer.py @@ -56,6 +56,9 @@ def ray_start_8_cpus(): def train_function(train_dataset, eval_dataset=None, **config): + # Check that train_dataset has len + assert len(train_dataset) + model_config = AutoConfig.from_pretrained(model_checkpoint) model = AutoModelForCausalLM.from_config(model_config) evaluation_strategy = ( diff --git a/python/ray/train/tests/test_huggingface_trainer_steps.py b/python/ray/train/tests/test_huggingface_trainer_steps.py index c1d98f257a242..efa504f75f6a4 100644 --- a/python/ray/train/tests/test_huggingface_trainer_steps.py +++ b/python/ray/train/tests/test_huggingface_trainer_steps.py @@ -38,6 +38,9 @@ def ray_start_4_cpus(): def train_function(train_dataset, eval_dataset=None, **config): + # Check that train_dataset has len + assert len(train_dataset) + model_config = AutoConfig.from_pretrained(model_checkpoint) model = AutoModelForCausalLM.from_config(model_config) training_args = TrainingArguments( diff --git a/python/ray/train/tests/test_mosaic_trainer.py 
b/python/ray/train/tests/test_mosaic_trainer.py index 59d1c88a0284b..7a670d664e58c 100644 --- a/python/ray/train/tests/test_mosaic_trainer.py +++ b/python/ray/train/tests/test_mosaic_trainer.py @@ -258,7 +258,7 @@ def test_monitor_callbacks(ray_start_4_cpus): from ray.train.mosaic import MosaicTrainer # Test Callbacks involving logging (SpeedMonitor, LRMonitor) - from composer.callbacks import SpeedMonitor, LRMonitor, GradMonitor + from composer.callbacks import SpeedMonitor, LRMonitor trainer_init_config = { "max_duration": "1ep", @@ -270,7 +270,6 @@ def test_monitor_callbacks(ray_start_4_cpus): trainer_init_config["callbacks"] = [ SpeedMonitor(window_size=3), LRMonitor(), - GradMonitor(), ] trainer = MosaicTrainer( @@ -289,7 +288,6 @@ def test_monitor_callbacks(ray_start_4_cpus): "wall_clock/val", "wall_clock/total", "lr-DecoupledSGDW/group0", - "grad_l2_norm/step", ] for column in columns_to_check: assert column in metrics_columns, column + " is not found" diff --git a/python/ray/train/tests/test_worker_group.py b/python/ray/train/tests/test_worker_group.py index 249d2b3b4c396..e40be2ff16a00 100644 --- a/python/ray/train/tests/test_worker_group.py +++ b/python/ray/train/tests/test_worker_group.py @@ -3,7 +3,9 @@ import pytest import ray -from ray.train._internal.worker_group import WorkerGroup +from ray.train._internal.worker_group import WorkerGroup, Worker, WorkerMetadata +from copy import deepcopy +from random import seed, shuffle @pytest.fixture @@ -81,6 +83,29 @@ def test_execute_args(ray_start_2_cpus): assert all(o == 1 for o in outputs) +def test_move_workers_with_ip_to_front(ray_start_2_cpus): + wg = WorkerGroup(num_workers=2) + wg.workers = [ + Worker( + actor=None, + metadata=WorkerMetadata( + node_id="dummy", node_ip=f"10.1.10.{i}", hostname="dummy", gpu_ids=None + ), + ) + for i in range(1, 17) + ] + wg.workers += deepcopy(wg.workers) + workers_pre_move = deepcopy(wg.workers) + seed(1) + shuffle(wg.workers) + 
wg._move_workers_with_ip_to_front("10.1.10.1") + assert wg.workers[0].metadata.node_ip == "10.1.10.1" + assert wg.workers[1].metadata.node_ip == "10.1.10.1" + assert sorted([w.metadata.node_ip for w in workers_pre_move]) == sorted( + [w.metadata.node_ip for w in wg.workers] + ) + + def test_execute_single(ray_start_2_cpus): wg = WorkerGroup(num_workers=2) diff --git a/python/ray/tune/BUILD b/python/ray/tune/BUILD index b9030d0d3880b..e3d69b87c5058 100644 --- a/python/ray/tune/BUILD +++ b/python/ray/tune/BUILD @@ -296,6 +296,14 @@ py_test( tags = ["team:ml", "exclusive"], ) +py_test( + name = "test_util_object_cache", + size = "small", + srcs = ["tests/test_util_object_cache.py"], + deps = [":tune_lib"], + tags = ["team:ml", "exclusive"], +) + py_test( name = "test_syncer", size = "medium", diff --git a/python/ray/tune/execution/ray_trial_executor.py b/python/ray/tune/execution/ray_trial_executor.py index 31a566f3f3541..18c32a38d0b9b 100644 --- a/python/ray/tune/execution/ray_trial_executor.py +++ b/python/ray/tune/execution/ray_trial_executor.py @@ -6,15 +6,15 @@ import random import time import traceback -from collections import deque, defaultdict, Counter +from collections import deque from contextlib import contextmanager from enum import Enum from functools import partial -from typing import Callable, Dict, Iterable, List, Optional, Set, Union, Tuple +from typing import Callable, Dict, Iterable, Optional, Set, Union import ray from ray.actor import ActorHandle -from ray.air import Checkpoint, AcquiredResources, ResourceRequest +from ray.air import Checkpoint, AcquiredResources from ray.air._internal.checkpoint_manager import CheckpointStorage, _TrackedCheckpoint from ray.air.constants import ( COPY_DIRECTORY_CHECKPOINTS_INSTEAD_OF_MOVING_ENV, @@ -35,6 +35,7 @@ from ray.tune.result import STDERR_FILE, STDOUT_FILE, TRIAL_INFO from ray.tune.experiment.trial import Trial, _Location, _TrialInfo from ray.tune.utils import warn_if_slow +from 
ray.tune.utils.object_cache import _ObjectCache from ray.tune.utils.resource_updater import _ResourceUpdater from ray.tune.trainable.util import TrainableUtil from ray.util import log_once @@ -234,13 +235,10 @@ def __init__( # Actor re-use. # For details, see docstring of `_maybe_cache_trial_actor()` self._reuse_actors = reuse_actors - self._resource_request_to_cached_actors: Dict[ - ResourceRequest, List[Tuple[ray.actor.ActorHandle, AcquiredResources]] - ] = defaultdict(list) + self._actor_cache = _ObjectCache(may_keep_one=True) # Trials for which we requested resources self._staged_trials = set() # Staged trials - self._staged_resources = Counter() # Resources of staged trials self._trial_to_acquired_resources: Dict[Trial, AcquiredResources] = {} # Result buffer @@ -261,7 +259,7 @@ def __init__( def setup( self, max_pending_trials: int, trainable_kwargs: Optional[Dict] = None ) -> None: - if len(self._resource_request_to_cached_actors) > 0: + if self._actor_cache.num_cached_objects: logger.warning( "Cannot update maximum number of queued actors for reuse " "during a run." 
@@ -327,7 +325,7 @@ def _stage_and_update_status(self, trials: Iterable[Trial]): resource_request = trial.placement_group_factory self._staged_trials.add(trial) - self._staged_resources[trial.placement_group_factory] += 1 + self._actor_cache.increase_max(resource_request) self._resource_manager.request_resources(resource_request=resource_request) self._resource_manager.update_state() @@ -344,7 +342,7 @@ def get_ready_trial(self) -> Optional[Trial]: for trial in self._staged_trials: resource_request = trial.placement_group_factory # If we have a cached actor for these resources, return - if self._resource_request_to_cached_actors[resource_request]: + if self._actor_cache.has_cached_object(resource_request): return trial # If the resources are available from the resource manager, return @@ -360,12 +358,13 @@ def _maybe_use_cached_actor(self, trial, logger_creator) -> Optional[ActorHandle return None resource_request = trial.placement_group_factory - if not self._resource_request_to_cached_actors[resource_request]: + + if not self._actor_cache.has_cached_object(resource_request): return None - actor, acquired_resources = self._resource_request_to_cached_actors[ + actor, acquired_resources = self._actor_cache.pop_cached_object( resource_request - ].pop(0) + ) logger.debug(f"Trial {trial}: Reusing cached actor " f"{actor}") @@ -541,7 +540,7 @@ def _unstage_trial_with_resources(self, trial: Trial): # Case 1: The trial we started was staged. 
Just remove it if trial in self._staged_trials: self._staged_trials.remove(trial) - self._staged_resources[trial.placement_group_factory] -= 1 + self._actor_cache.decrease_max(trial.placement_group_factory) return # Case 2: We staged a trial "A" with the same resources, but our trial "B" @@ -560,7 +559,7 @@ def _unstage_trial_with_resources(self, trial: Trial): if candidate_trial: self._staged_trials.remove(candidate_trial) - self._staged_resources[candidate_trial.placement_group_factory] -= 1 + self._actor_cache.decrease_max(candidate_trial.placement_group_factory) return raise RuntimeError( @@ -593,16 +592,8 @@ def _maybe_cache_trial_actor(self, trial: Trial) -> bool: acquired_resources = self._trial_to_acquired_resources[trial] cached_resource_request = acquired_resources.resource_request - staged_resource_count = self._count_staged_resources() - if ( - # If we have at least one cached actor already - any(v for v in self._resource_request_to_cached_actors.values()) - # and we haven't requested resources for an actor with the - # same resources as the actor we want to cache - and len(self._resource_request_to_cached_actors[cached_resource_request]) - >= staged_resource_count[cached_resource_request] - # then we don't have an immediate need for the actor and don't - # want to cache it. 
+ if not self._actor_cache.cache_object( + cached_resource_request, (trial.runner, acquired_resources) ): logger.debug( f"Could not cache actor of trial {trial} for " @@ -613,9 +604,6 @@ def _maybe_cache_trial_actor(self, trial: Trial) -> bool: logger.debug(f"Caching actor of trial {trial} for re-use") - self._resource_request_to_cached_actors[cached_resource_request].append( - (trial.runner, acquired_resources) - ) self._trial_to_acquired_resources.pop(trial) trial.set_runner(None) @@ -833,7 +821,7 @@ def has_resources_for_trial(self, trial: Trial) -> bool: return ( trial in self._staged_trials - or self._resource_request_to_cached_actors[resource_request] + or self._actor_cache.has_cached_object(resource_request) or len(self._staged_trials) < self._max_staged_actors or self._resource_manager.has_resources_ready(resource_request) ) @@ -861,9 +849,6 @@ def on_step_end(self, search_ended: bool = False) -> None: self._cleanup_cached_actors(search_ended=search_ended) self._do_force_trial_cleanup() - def _count_staged_resources(self): - return self._staged_resources - def _cleanup_cached_actors( self, search_ended: bool = False, force_all: bool = False ): @@ -902,21 +887,16 @@ def _cleanup_cached_actors( # (if the search ended). 
return - staged_resources = self._count_staged_resources() - - for resource_request, actors in self._resource_request_to_cached_actors.items(): - while len(actors) > staged_resources.get(resource_request, 0) or ( - force_all and len(actors) - ): - actor, acquired_resources = actors[-1] - actors.pop() - future = actor.stop.remote() - self._futures[future] = ( - _ExecutorEventType.STOP_RESULT, - acquired_resources, - ) - if self._trial_cleanup: # force trial cleanup within a deadline - self._trial_cleanup.add(future) + for (actor, acquired_resources) in self._actor_cache.flush_cached_objects( + force_all=force_all + ): + future = actor.stop.remote() + self._futures[future] = ( + _ExecutorEventType.STOP_RESULT, + acquired_resources, + ) + if self._trial_cleanup: # force trial cleanup within a deadline + self._trial_cleanup.add(future) def _resolve_stop_event( self, @@ -1196,18 +1176,12 @@ def get_next_executor_event( # when next_trial_exists and there are cached resources ################################################################### # There could be existing PGs from either - # `self._resource_request_to_cached_actors` + # `self._actor_cache` # or from ready trials. If so and if there is indeed # a next trial to run, we return `PG_READY` future for trial # runner. The next trial can then be scheduled on this PG. if next_trial_exists: - if ( - sum( - len(cached) - for cached in self._resource_request_to_cached_actors.values() - ) - > 0 - ): + if self._actor_cache.num_cached_objects > 0: return _ExecutorEvent(_ExecutorEventType.PG_READY) # TODO(xwjiang): Expose proper API when we decide to do # ActorPool abstraction. 
diff --git a/python/ray/tune/registry.py b/python/ray/tune/registry.py index 069689ae5d495..be868ce3cad72 100644 --- a/python/ray/tune/registry.py +++ b/python/ray/tune/registry.py @@ -1,5 +1,5 @@ +import atexit import logging -import uuid from functools import partial from types import FunctionType from typing import Callable, Optional, Type, Union @@ -10,6 +10,7 @@ _internal_kv_get, _internal_kv_initialized, _internal_kv_put, + _internal_kv_del, ) from ray.tune.error import TuneError from ray.util.annotations import DeveloperAPI @@ -111,6 +112,10 @@ def register_trainable(name: str, trainable: Union[Callable, Type], warn: bool = _global_registry.register(TRAINABLE_CLASS, name, trainable) +def _unregister_trainables(): + _global_registry.unregister_all(TRAINABLE_CLASS) + + @DeveloperAPI def register_env(name: str, env_creator: Callable): """Register a custom environment for use with RLlib. @@ -128,6 +133,10 @@ def register_env(name: str, env_creator: Callable): _global_registry.register(ENV_CREATOR, name, env_creator) +def _unregister_envs(): + _global_registry.unregister_all(ENV_CREATOR) + + @DeveloperAPI def register_input(name: str, input_creator: Callable): """Register a custom input api for RLlib. 
@@ -142,6 +151,10 @@ def register_input(name: str, input_creator: Callable): _global_registry.register(RLLIB_INPUT, name, input_creator) +def _unregister_inputs(): + _global_registry.unregister_all(RLLIB_INPUT) + + @DeveloperAPI def registry_contains_input(name: str) -> bool: return _global_registry.contains(RLLIB_INPUT, name) @@ -152,6 +165,12 @@ def registry_get_input(name: str) -> Callable: return _global_registry.get(RLLIB_INPUT, name) +def _unregister_all(): + _unregister_inputs() + _unregister_envs() + _unregister_trainables() + + def _check_serializability(key, value): _global_registry.register(TEST, key, value) @@ -179,8 +198,29 @@ def _make_key(prefix: str, category: str, key: str): class _Registry: def __init__(self, prefix: Optional[str] = None): + """If no prefix is given, use runtime context job ID.""" self._to_flush = {} - self._prefix = prefix or uuid.uuid4().hex[:8] + self._prefix = prefix + self._registered = set() + self._atexit_handler_registered = False + + @property + def prefix(self): + if not self._prefix: + self._prefix = ray.get_runtime_context().get_job_id() + return self._prefix + + def _register_atexit(self): + if self._atexit_handler_registered: + # Already registered + return + + if ray._private.worker.global_worker.mode != ray.SCRIPT_MODE: + # Only cleanup on the driver + return + + atexit.register(_unregister_all) + self._atexit_handler_registered = True def register(self, category, key, value): """Registers the value with the global registry. 
@@ -198,16 +238,31 @@ def register(self, category, key, value): if _internal_kv_initialized(): self.flush_values() + def unregister(self, category, key): + if _internal_kv_initialized(): + _internal_kv_del(_make_key(self.prefix, category, key)) + else: + self._to_flush.pop((category, key), None) + + def unregister_all(self, category: Optional[str] = None): + remaining = set() + for (cat, key) in self._registered: + if category and category == cat: + self.unregister(cat, key) + else: + remaining.add((cat, key)) + self._registered = remaining + def contains(self, category, key): if _internal_kv_initialized(): - value = _internal_kv_get(_make_key(self._prefix, category, key)) + value = _internal_kv_get(_make_key(self.prefix, category, key)) return value is not None else: return (category, key) in self._to_flush def get(self, category, key): if _internal_kv_initialized(): - value = _internal_kv_get(_make_key(self._prefix, category, key)) + value = _internal_kv_get(_make_key(self.prefix, category, key)) if value is None: raise ValueError( "Registry value for {}/{} doesn't exist.".format(category, key) @@ -217,14 +272,16 @@ def get(self, category, key): return pickle.loads(self._to_flush[(category, key)]) def flush_values(self): + self._register_atexit() for (category, key), value in self._to_flush.items(): _internal_kv_put( - _make_key(self._prefix, category, key), value, overwrite=True + _make_key(self.prefix, category, key), value, overwrite=True ) + self._registered.add((category, key)) self._to_flush.clear() -_global_registry = _Registry(prefix="global") +_global_registry = _Registry() ray._private.worker._post_init_hooks.append(_global_registry.flush_values) diff --git a/python/ray/tune/tests/_test_multi_tenancy_run.py b/python/ray/tune/tests/_test_multi_tenancy_run.py index 2b42a96c0f127..74da15b283289 100644 --- a/python/ray/tune/tests/_test_multi_tenancy_run.py +++ b/python/ray/tune/tests/_test_multi_tenancy_run.py @@ -33,9 +33,6 @@ # are tracked by the driver, 
not the trainable. VALS = [int(os.environ["VAL_1"]), int(os.environ["VAL_2"])] -# If 1, use workaround, if 0, just run (and fail in job 1). -USE_WORKAROUND = bool(int(os.environ["WORKAROUND"])) - # Wait for HANG_RUN_MARKER while HANG_RUN_MARKER and Path(HANG_RUN_MARKER).exists(): time.sleep(0.1) @@ -56,13 +53,6 @@ def train_func(config): session.report({"param": config["param"], "fixed": config["fixed"]}) -# Workaround: Just use a unique name per trainer/trainable -if USE_WORKAROUND: - import uuid - - DataParallelTrainer.__name__ = "DataParallelTrainer_" + uuid.uuid4().hex[:8] - - trainer = DataParallelTrainer( train_loop_per_worker=train_func, train_loop_config={ @@ -97,9 +87,3 @@ def train_func(config): # Put assertions last, so we don't finish early because of failures assert sorted([result.metrics["param"] for result in results]) == VALS assert [result.metrics["fixed"] for result in results] == [FIXED_VAL, FIXED_VAL] - -if USE_WORKAROUND: - from ray.experimental.internal_kv import _internal_kv_del - from ray.tune.registry import _make_key, TRAINABLE_CLASS - - _internal_kv_del(_make_key("global", TRAINABLE_CLASS, DataParallelTrainer.__name__)) diff --git a/python/ray/tune/tests/test_experiment_analysis.py b/python/ray/tune/tests/test_experiment_analysis.py index c89202b50d2b4..b99d3fbbce57b 100644 --- a/python/ray/tune/tests/test_experiment_analysis.py +++ b/python/ray/tune/tests/test_experiment_analysis.py @@ -390,9 +390,7 @@ def testPickling(self): self.assertTrue(analysis.get_best_trial(metric=self.metric, mode="max")) ray.shutdown() - ray.tune.registry._global_registry = ray.tune.registry._Registry( - prefix="global" - ) + ray.tune.registry._global_registry = ray.tune.registry._Registry() with open(pickle_path, "rb") as f: analysis = pickle.load(f) @@ -406,9 +404,7 @@ def testFromPath(self): self.assertTrue(analysis.get_best_trial(metric=self.metric, mode="max")) ray.shutdown() - ray.tune.registry._global_registry = ray.tune.registry._Registry( - 
prefix="global" - ) + ray.tune.registry._global_registry = ray.tune.registry._Registry() analysis = ExperimentAnalysis(self.test_path) diff --git a/python/ray/tune/tests/test_multi_tenancy.py b/python/ray/tune/tests/test_multi_tenancy.py index 71062c8595cc4..b363d7dff7b67 100644 --- a/python/ray/tune/tests/test_multi_tenancy.py +++ b/python/ray/tune/tests/test_multi_tenancy.py @@ -15,14 +15,13 @@ def ray_start_4_cpus(): ray.shutdown() -@pytest.mark.parametrize("use_workaround", [False, True]) @pytest.mark.parametrize("exit_same", [False, True]) -def test_registry_conflict(ray_start_4_cpus, tmpdir, use_workaround, exit_same): +def test_registry_conflict(ray_start_4_cpus, tmpdir, exit_same): """Two concurrent Tune runs can conflict with each other when they use a trainable with the same name. - This test starts two runs in parallel and asserts that a workaround used - in the docs can alleviate the problem. + This test starts two runs in parallel and asserts that our fix in + https://github.com/ray-project/ray/pull/33095 resolves the issue. This is how we schedule the runs: @@ -42,10 +41,6 @@ def test_registry_conflict(ray_start_4_cpus, tmpdir, use_workaround, exit_same): - Run 1 finally finishes, and we compare the expected results with the actual results. - When you don't use the workaround, expect an assertion error (if ``exit_same=True``, - see below), otherwise a KeyError (because a trial failed). - When the workaround is used, we expect everything to run without error. - NOTE: Two errors can occur with registry conflicts. First, the trainable can be overwritten and captured, for example, when a fixed value is included in the trainable. The second trial of run 1 then has a wrong @@ -57,10 +52,6 @@ def test_registry_conflict(ray_start_4_cpus, tmpdir, use_workaround, exit_same): removed already. Note that these objects are registered with ``tune.with_parameters()`` (not the global registry store). We test both scenarios using the ``exit_same`` parameter. 
- - NOTE: If we resolve the registry issue (for example, with unique keys) - you can remove the test that expects the assertion error. We can remove - the parametrization and the workaround and assert that no conflict occurs. """ # Create file markers run_1_running = tmpdir / "run_1_running" @@ -75,7 +66,6 @@ def test_registry_conflict(ray_start_4_cpus, tmpdir, use_workaround, exit_same): run_1_env = { "RAY_ADDRESS": ray_address, - "WORKAROUND": str(int(use_workaround)), "FIXED_VAL": str(1), "VAL_1": str(2), "VAL_2": str(3), @@ -93,7 +83,6 @@ def test_registry_conflict(ray_start_4_cpus, tmpdir, use_workaround, exit_same): run_2_env = { "RAY_ADDRESS": ray_address, - "WORKAROUND": str(int(use_workaround)), "FIXED_VAL": str(4), "VAL_1": str(5), "VAL_2": str(6), @@ -123,18 +112,7 @@ def test_registry_conflict(ray_start_4_cpus, tmpdir, use_workaround, exit_same): print("Started run 2:", run_2.pid) assert run_2.wait() == 0 - - if use_workaround: - assert run_1.wait() == 0 - else: - assert run_1.wait() != 0 - - stderr = run_1.stderr.read().decode() - - if not exit_same: - assert "OwnerDiedError" in stderr, stderr - else: - assert "AssertionError" in stderr, stderr + assert run_1.wait() == 0 if __name__ == "__main__": diff --git a/python/ray/tune/tests/test_ray_trial_executor.py b/python/ray/tune/tests/test_ray_trial_executor.py index 66e9152f84068..ad8a99d09b47e 100644 --- a/python/ray/tune/tests/test_ray_trial_executor.py +++ b/python/ray/tune/tests/test_ray_trial_executor.py @@ -648,14 +648,7 @@ def train(config): executor._stage_and_update_status([trial1, trial2, trial3]) executor.pause_trial(trial1) # Caches the PG - assert ( - len( - executor._resource_request_to_cached_actors[ - trial1.placement_group_factory - ] - ) - == 1 - ) + assert executor._actor_cache.num_cached_objects == 1 # Second trial remains staged, it will only be removed from staging when it # is started diff --git a/python/ray/tune/tests/test_trial_runner_pg.py 
b/python/ray/tune/tests/test_trial_runner_pg.py index 0349188b1f0d6..6af195c90932f 100644 --- a/python/ray/tune/tests/test_trial_runner_pg.py +++ b/python/ray/tune/tests/test_trial_runner_pg.py @@ -112,10 +112,7 @@ def on_step_end(self, iteration, trials, **info): len(s) for s in resource_manager._request_to_ready_pgs.values() ) num_in_use = len(resource_manager._acquired_pgs) - num_cached = sum( - len(a) - for a in trial_executor._resource_request_to_cached_actors.values() - ) + num_cached = trial_executor._actor_cache.num_cached_objects total_num_tracked = num_staging + num_ready + num_in_use + num_cached diff --git a/python/ray/tune/tests/test_util_object_cache.py b/python/ray/tune/tests/test_util_object_cache.py new file mode 100644 index 0000000000000..dd61c03b8de46 --- /dev/null +++ b/python/ray/tune/tests/test_util_object_cache.py @@ -0,0 +1,124 @@ +import pytest + +from ray.tune.utils.object_cache import _ObjectCache + + +@pytest.mark.parametrize("eager", [False, True]) +def test_no_may_keep_one(eager): + """Test object caching. 
+ + - After init, no objects are cached (as max cached is 0), except when eager caching + - After increasing max to 2, up to 2 objects are cached + - Decreasing max objects will evict them on flush + """ + cache = _ObjectCache(may_keep_one=eager) + + # max(A) = 0, so we we only cache when eager caching + assert cache.cache_object("A", 1) == eager + assert cache.num_cached_objects == int(eager) + + # Set max(A) = 2 + cache.increase_max("A", 2) + + # max(A) = 2, so we cache up to two objects + if not eager: + assert cache.cache_object("A", 1) + + assert cache.cache_object("A", 2) + assert not cache.cache_object("A", 3) + + assert cache.num_cached_objects == 2 + + # Nothing has to be evicted + assert not list(cache.flush_cached_objects()) + + # Set max(A) = 1, so we have one object too much + cache.decrease_max("A", 1) + + # First cached object is evicted + assert list(cache.flush_cached_objects()) == [1] + assert cache.num_cached_objects == 1 + + # Set max(A) = 0 + cache.decrease_max("A", 1) + + # Second cached object is evicted if not eager caching + assert list(cache.flush_cached_objects()) == ([2] if not eager else []) + assert cache.num_cached_objects == (0 if not eager else 1) + + +@pytest.mark.parametrize("eager", [False, True]) +def test_multi(eager): + """Test caching with multiple objects""" + cache = _ObjectCache(may_keep_one=eager) + + # max(A) = 0, so we we only cache when eager caching + assert cache.cache_object("A", 1) == eager + assert cache.num_cached_objects == int(eager) + + # max(B) = 0, so no caching + assert not cache.cache_object("B", 5) + assert cache.num_cached_objects == int(eager) + + # Increase maximums levels + cache.increase_max("A", 1) + cache.increase_max("B", 1) + + # Cache objects (A is already cached if eager) + assert cache.cache_object("A", 1) != eager + assert cache.cache_object("B", 5) + + # No further objects can be cached + assert not cache.cache_object("A", 2) + assert not cache.cache_object("B", 6) + + assert 
cache.num_cached_objects == 2 + + # Decrease + cache.decrease_max("A", 1) + + # Evict A object + assert list(cache.flush_cached_objects()) == [1] + + cache.decrease_max("B", 1) + + # If eager, keep B object, otherwise, evict B + assert list(cache.flush_cached_objects()) == ([5] if not eager else []) + assert cache.num_cached_objects == (0 if not eager else 1) + + +def test_multi_eager_other(): + """On eager caching, only cache an object if no other object is expected. + + - Expect up to one cached A object + - Try to cache object B --> doesn't get cached + - Remove expectation for A object + - Try to cache object B --> get's cached + """ + cache = _ObjectCache(may_keep_one=True) + + cache.increase_max("A", 1) + assert not cache.cache_object("B", 2) + + cache.decrease_max("A", 1) + assert cache.cache_object("B", 3) + + +@pytest.mark.parametrize("eager", [False, True]) +def test_force_all(eager): + """Assert that force_all=True will always evict all object.""" + cache = _ObjectCache(may_keep_one=eager) + + cache.increase_max("A", 2) + + assert cache.cache_object("A", 1) + assert cache.cache_object("A", 2) + + assert list(cache.flush_cached_objects(force_all=True)) == [1, 2] + assert cache.num_cached_objects == 0 + + +if __name__ == "__main__": + import sys + + sys.exit(pytest.main(["-v", __file__])) diff --git a/python/ray/tune/trainable/util.py b/python/ray/tune/trainable/util.py index 2c62278571ad2..44714da403768 100644 --- a/python/ray/tune/trainable/util.py +++ b/python/ray/tune/trainable/util.py @@ -148,9 +148,10 @@ def get_checkpoints_paths(logdir): iter_chkpt_pairs = [] for marker_path in marker_paths: chkpt_dir = os.path.dirname(marker_path) + basename = os.path.basename(chkpt_dir) # Skip temporary checkpoints - if os.path.basename(chkpt_dir).startswith("checkpoint_tmp"): + if basename.startswith("checkpoint_tmp"): continue metadata_file = glob.glob( @@ -162,9 +163,21 @@ def get_checkpoints_paths(logdir): os.path.join(glob.escape(chkpt_dir), 
_TUNE_METADATA_FILENAME) ) metadata_file = list(set(metadata_file)) # avoid duplication - if len(metadata_file) != 1: + if len(metadata_file) == 0: + logger.warning( + f"The checkpoint {basename} does not have a metadata file. " + f"This usually means that the training process was interrupted " + f"while the checkpoint was being written. The checkpoint will be " + f"excluded from analysis. Consider deleting the directory. " + f"Full path: {chkpt_dir}" + ) + continue + elif len(metadata_file) > 1: raise ValueError( - "{} has zero or more than one tune_metadata.".format(chkpt_dir) + f"The checkpoint {basename} contains more than one metadata file. " + f"If this happened without manual intervention, please file an " + f"issue at https://github.com/ray-project/ray/issues. " + f"Full path: {chkpt_dir}" ) metadata_file = metadata_file[0] diff --git a/python/ray/tune/utils/object_cache.py b/python/ray/tune/utils/object_cache.py new file mode 100644 index 0000000000000..a93af2f37b274 --- /dev/null +++ b/python/ray/tune/utils/object_cache.py @@ -0,0 +1,169 @@ +from collections import defaultdict, Counter + +from typing import Dict, Generator, List, Optional, TypeVar + +# Grouping key - must be hashable +T = TypeVar("T") +# Objects to cache +U = TypeVar("U") + + +class _ObjectCache: + """Cache up to some maximum count given a grouping key. + + This object cache can e.g. be used to cache Ray Tune trainable actors + given their resource requirements (reuse_actors=True). + + If the max number of cached objects for a grouping key is reached, + no more objects for this group will be cached. + + However, if `may_keep_one=True`, one object (globally across all grouping + keys) may be cached, even if the max number of objects is 0. This is to + allow to cache an object if the max number of objects of this key + will increase shortly after (as is the case e.g. in the Ray Tune control + loop). 
+ + Args: + may_keep_one: If True, one object (globally) may be cached if no desired + maximum objects are defined. + + """ + + def __init__(self, may_keep_one: bool = True): + self._num_cached_objects: int = 0 + self._cached_objects: Dict[T, List[U]] = defaultdict(list) + self._max_num_objects: Counter[T] = Counter() + + self._may_keep_one = may_keep_one + + @property + def num_cached_objects(self): + return self._num_cached_objects + + def increase_max(self, key: T, by: int = 1) -> None: + """Increase number of max objects for this key. + + Args: + key: Group key. + by: Decrease by this amount. + """ + self._max_num_objects[key] += by + + def decrease_max(self, key: T, by: int = 1) -> None: + """Decrease number of max objects for this key. + + Args: + key: Group key. + by: Decrease by this amount. + """ + self._max_num_objects[key] -= by + + def has_cached_object(self, key: T) -> bool: + """Return True if at least one cached object exists for this key. + + Args: + key: Group key. + + Returns: + True if at least one cached object exists for this key. + """ + return bool(self._cached_objects[key]) + + def cache_object(self, key: T, obj: U) -> bool: + """Cache object for a given key. + + This will put the object into a cache, assuming the number + of cached objects for this key is less than the number of + max objects for this key. + + An exception is made if `max_keep_one=True` and no other + objects are cached globally. In that case, the object can + still be cached. + + Args: + key: Group key. + obj: Object to cache. + + Returns: + True if the object has been cached. False otherwise. 
+ + """ + # If we have more objects cached already than we desire + if len(self._cached_objects[key]) >= self._max_num_objects[key]: + # If may_keep_one is False, never cache + if not self._may_keep_one: + return False + + # If we have more than one other cached object, don't cache + if self._num_cached_objects > 0: + return False + + # If any other objects are expected to be cached, don't cache + if any(v for v in self._max_num_objects.values()): + return False + + # Otherwise, cache (for now). + + self._cached_objects[key].append(obj) + self._num_cached_objects += 1 + return True + + def pop_cached_object(self, key: T) -> Optional[U]: + """Get one cached object for a key. + + This will remove the object from the cache. + + Args: + key: Group key. + + Returns: + Cached object. + """ + if not self.has_cached_object(key): + return None + + self._num_cached_objects -= 1 + return self._cached_objects[key].pop(0) + + def flush_cached_objects(self, force_all: bool = False) -> Generator[U, None, None]: + """Return a generator over cached objects evicted from the cache. + + This method yields all cached objects that should be evicted from the + cache for cleanup by the caller. + + If the number of max objects is lower than the number of + cached objects for a given key, objects are evicted until + the numbers are equal. + + If `max_keep_one=True` (and ``force_all=False``), one cached object + may be retained. + + Objects are evicted FIFO. + + If ``force_all=True``, all objects are evicted. + + Args: + force_all: If True, all objects are flushed. This takes precedence + over ``keep_one``. + + Yields: + Evicted objects to be cleaned up by caller. + + """ + # If force_all=True, don't keep one. 
+ keep_one = self._may_keep_one and not force_all + + for key, objs in self._cached_objects.items(): + max = self._max_num_objects[key] if not force_all else 0 + + if ( + self._num_cached_objects == 1 + and keep_one + # Only keep this object if we don't expect a different one + and not any(v for v in self._max_num_objects.values()) + ): + break + + while len(objs) > max: + self._num_cached_objects -= 1 + yield objs.pop(0) diff --git a/python/ray/tune/utils/util.py b/python/ray/tune/utils/util.py index 4f2953555d033..f6979cdca6654 100644 --- a/python/ray/tune/utils/util.py +++ b/python/ray/tune/utils/util.py @@ -250,17 +250,31 @@ def __init__(self, stream1, stream2): self.stream1 = stream1 self.stream2 = stream2 + def _warn(self, op, s, args, kwargs): + msg = f"ValueError when calling '{op}' on stream ({s}). " + msg += f"args: {args} kwargs: {kwargs}" + logger.warning(msg) + def seek(self, *args, **kwargs): - self.stream1.seek(*args, **kwargs) - self.stream2.seek(*args, **kwargs) + for s in [self.stream1, self.stream2]: + try: + s.seek(*args, **kwargs) + except ValueError: + self._warn("seek", s, args, kwargs) def write(self, *args, **kwargs): - self.stream1.write(*args, **kwargs) - self.stream2.write(*args, **kwargs) + for s in [self.stream1, self.stream2]: + try: + s.write(*args, **kwargs) + except ValueError: + self._warn("write", s, args, kwargs) def flush(self, *args, **kwargs): - self.stream1.flush(*args, **kwargs) - self.stream2.flush(*args, **kwargs) + for s in [self.stream1, self.stream2]: + try: + s.flush(*args, **kwargs) + except ValueError: + self._warn("flush", s, args, kwargs) @property def encoding(self): diff --git a/python/ray/util/multiprocessing/pool.py b/python/ray/util/multiprocessing/pool.py index fa659a5b4ae55..299b271ca1869 100644 --- a/python/ray/util/multiprocessing/pool.py +++ b/python/ray/util/multiprocessing/pool.py @@ -7,14 +7,12 @@ import queue import sys import threading -import warnings import time from multiprocessing import 
TimeoutError from typing import Any, Callable, Dict, Hashable, Iterable, List, Optional, Tuple import ray from ray.util import log_once -from ray.util.annotations import RayDeprecationWarning try: from joblib._parallel_backends import SafeFunction @@ -390,21 +388,7 @@ def __init__(self, pool, func, iterable, chunksize=None): # submitted chunks. Ordering mirrors that in the in the ResultThread. self._submitted_chunks = [] self._ready_objects = collections.deque() - try: - self._iterator = iter(iterable) - except TypeError: - warnings.warn( - "Passing a non-iterable argument to the " - "ray.util.multiprocessing.Pool imap and imap_unordered " - "methods is deprecated as of Ray 2.3 and " - " will be removed in a future release. See " - "https://github.com/ray-project/ray/issues/24237 for more " - "information.", - category=RayDeprecationWarning, - stacklevel=3, - ) - iterable = [iterable] - self._iterator = iter(iterable) + self._iterator = iter(iterable) if isinstance(iterable, collections.abc.Iterator): # Got iterator (which has no len() function). # Make default chunksize 1 instead of using _calculate_chunksize(). diff --git a/python/ray/util/spark/cluster_init.py b/python/ray/util/spark/cluster_init.py index 100ec69e09c80..ef344f9a92ceb 100644 --- a/python/ray/util/spark/cluster_init.py +++ b/python/ray/util/spark/cluster_init.py @@ -213,7 +213,12 @@ def _convert_ray_node_option_key(key): def _convert_ray_node_options(options): - return [f"{_convert_ray_node_option_key(k)}={str(v)}" for k, v in options.items()] + return [ + f"{_convert_ray_node_option_key(k)}" + if v is None + else f"{_convert_ray_node_option_key(k)}={str(v)}" + for k, v in options.items() + ] _RAY_HEAD_STARTUP_TIMEOUT = 5 @@ -835,10 +840,16 @@ def setup_ray_cluster( head_node_options: A dict representing Ray head node extra options, these options will be passed to `ray start` script. Note you need to convert `ray start` options key from `--foo-bar` format to `foo_bar` format. 
+ For flag options (e.g. '--disable-usage-stats'), you should set the value + to None in the option dict, like `{"disable_usage_stats": None}`. + Note: Short name options (e.g. '-v') are not supported. worker_node_options: A dict representing Ray worker node extra options, these options will be passed to `ray start` script. Note you need to convert `ray start` options key from `--foo-bar` format to `foo_bar` format. + For flag options (e.g. '--disable-usage-stats'), you should set the value + to None in the option dict, like `{"disable_usage_stats": None}`. + Note: Short name options (e.g. '-v') are not supported. ray_temp_root_dir: A local disk path to store the ray temporary data. The created cluster will create a subdirectory "ray-{head_port}-{random_suffix}" beneath this path. diff --git a/python/requirements/ml/requirements_dl.txt b/python/requirements/ml/requirements_dl.txt index 7ab1796596d1a..5549414a9f69d 100644 --- a/python/requirements/ml/requirements_dl.txt +++ b/python/requirements/ml/requirements_dl.txt @@ -1,7 +1,8 @@ # These requirements are used for the CI and CPU-only Docker images so we install CPU only versions of torch. # For GPU Docker images, you should install requirements_ml_docker.txt afterwards. -tensorflow==2.11.0 +tensorflow==2.11.0; sys_platform != 'darwin' or platform_machine != 'arm64' +tensorflow-macos==2.11.0; sys_platform == 'darwin' and platform_machine == 'arm64' tensorflow-probability==0.19.0 # If you make changes below this line, please also make the corresponding changes to `requirements_ml_docker.txt`! 
@@ -11,7 +12,7 @@ tensorflow-probability==0.19.0 torch==1.13.0 torchvision==0.14.0 torch-scatter==2.1.0 -torch-sparse==0.6.15 +torch-sparse==0.6.16 torch-cluster==1.6.0 torch-spline-conv==1.2.1 torch-geometric==2.1.0 diff --git a/python/requirements/ml/requirements_rllib.txt b/python/requirements/ml/requirements_rllib.txt index 8432397dbe9d5..1d5fea8705430 100644 --- a/python/requirements/ml/requirements_rllib.txt +++ b/python/requirements/ml/requirements_rllib.txt @@ -24,7 +24,8 @@ supersuit==3.7.0; python_version >= '3.7' # For tests on minigrid. minigrid==2.1.1 # For tests on RecSim and Kaggle envs. -recsim==0.2.4 +# Explicitly depends on `tensorflow` and doesn't accept `tensorflow-macos` +recsim==0.2.4; sys_platform != 'darwin' or platform_machine != 'arm64' tensorflow_estimator==2.11.0 # DeepMind's OpenSpiel open-spiel==1.2 @@ -37,8 +38,10 @@ higher==0.2.1 pyglet==1.5.15 imageio-ffmpeg==0.4.5 # ONNX -onnx==1.12.0 -onnxruntime==1.14.0 -tf2onnx==1.13.0 +# ONNX 1.13.0 depends on protobuf > 3.20, conflicting with tensorflow. +# ONNX 1.12.0 is not published for mac arm64, so we exclude it for now. 
+onnx==1.12.0; sys_platform != 'darwin' or platform_machine != 'arm64' +onnxruntime==1.14.1; sys_platform != 'darwin' or platform_machine != 'arm64' +tf2onnx==1.13.0; sys_platform != 'darwin' or platform_machine != 'arm64' typer==0.6.1 rich==12.0.1 diff --git a/python/requirements/ml/requirements_train.txt b/python/requirements/ml/requirements_train.txt index 0c10b988a1f5b..f539e7c2b1a94 100644 --- a/python/requirements/ml/requirements_train.txt +++ b/python/requirements/ml/requirements_train.txt @@ -2,7 +2,7 @@ -r requirements_dl.txt -mosaicml==0.10.1 +mosaicml==0.12.1 mlflow==1.30.0 tensorboardX==2.4.1 diff --git a/python/requirements/ml/requirements_tune.txt b/python/requirements/ml/requirements_tune.txt index 839c940b75087..8891aad90bbde 100644 --- a/python/requirements/ml/requirements_tune.txt +++ b/python/requirements/ml/requirements_tune.txt @@ -9,18 +9,21 @@ ConfigSpace==0.4.18 dragonfly-opt==0.1.6 flaml==1.1.1 freezegun==1.1.0 -gluoncv==0.10.1.post0 +# Requires decord which is unavailable for arm64 +gluoncv==0.10.1.post0; platform_machine != "arm64" gpy==1.10.0 -autorom[accept-rom-license] +# Requires libtorrent which is unavailable for arm64 +autorom[accept-rom-license]; platform_machine != "arm64" h5py==3.7.0 hpbandster==0.7.4 HEBO==0.3.2 hyperopt==0.2.5 jupyterlab==3.6.1 -lightgbm==3.2.1 +lightgbm==3.3.5 matplotlib!=3.4.3 mlflow==1.30.0 -mxnet==1.8.0.post0 +# Unavailable for arm64 in more recent versions +mxnet==1.8.0.post0; platform_machine != "arm64" nevergrad==0.4.3.post7 optuna==2.10.0 # For HEBO compatibility @@ -36,5 +39,6 @@ timm==0.4.5 transformers==4.18.0; python_version <= '3.6' transformers==4.19.1; python_version > '3.6' wandb==0.13.4 -xgboost==1.3.3 +xgboost==1.6.2; python_version <= '3.7' +xgboost==1.7.4; python_version > '3.7' zoopt==0.4.1 diff --git a/python/setup.py b/python/setup.py index a4e5c4c721f05..8330995949769 100644 --- a/python/setup.py +++ b/python/setup.py @@ -18,6 +18,15 @@ from enum import Enum from itertools import chain 
+# Workaround for setuptools_scm (used on macos) adding junk files +# https://stackoverflow.com/a/61274968/8162137 +try: + import setuptools_scm.integration + + setuptools_scm.integration.find_files = lambda _: [] +except ImportError: + pass + logger = logging.getLogger(__name__) SUPPORTED_PYTHONS = [(3, 6), (3, 7), (3, 8), (3, 9), (3, 10), (3, 11)] @@ -101,7 +110,7 @@ def __init__( def get_packages(self): if self.type == SetupType.RAY: - return setuptools.find_packages() + return setuptools.find_packages(exclude=("tests", "*.tests", "*.tests.*")) else: return [] @@ -789,6 +798,11 @@ def has_ext_modules(self): "ray": ["includes/*.pxd", "*.pxd"], }, include_package_data=True, + exclude_package_data={ + # Empty string means "any package". + # Therefore, exclude BUILD from every package: + "": ["BUILD"], + }, zip_safe=False, license="Apache 2.0", ) if __name__ == "__main__" else None diff --git a/release/air_examples/dreambooth/dreambooth b/release/air_examples/dreambooth/dreambooth new file mode 120000 index 0000000000000..362a68d08d3f5 --- /dev/null +++ b/release/air_examples/dreambooth/dreambooth @@ -0,0 +1 @@ +../../../python/ray/air/examples/dreambooth \ No newline at end of file diff --git a/release/air_examples/dreambooth/dreambooth_compute.yaml b/release/air_examples/dreambooth/dreambooth_compute.yaml new file mode 100644 index 0000000000000..ce81b50f9fc64 --- /dev/null +++ b/release/air_examples/dreambooth/dreambooth_compute.yaml @@ -0,0 +1,10 @@ +cloud_id: {{env["ANYSCALE_CLOUD_ID"]}} +region: us-west-2 + +max_workers: 0 + +head_node_type: + name: head_node + instance_type: g5.12xlarge + +worker_node_types: [] diff --git a/release/air_examples/dreambooth/dreambooth_env.yaml b/release/air_examples/dreambooth/dreambooth_env.yaml new file mode 100644 index 0000000000000..e11714bc99f75 --- /dev/null +++ b/release/air_examples/dreambooth/dreambooth_env.yaml @@ -0,0 +1,12 @@ +base_image: {{ env["RAY_IMAGE_ML_NIGHTLY_GPU"] | 
default("anyscale/ray-ml:nightly-py37-gpu") }} +env_vars: {} +debian_packages: + - curl + +python: + pip_packages: [] + conda_packages: [] + +post_build_cmds: + - pip uninstall -y ray || true && pip3 install -U {{ env["RAY_WHEELS"] | default("ray") }} + - {{ env["RAY_WHEELS_SANITY_CHECK"] | default("echo No Ray wheels sanity check") }} diff --git a/release/air_examples/dreambooth/dreambooth_run.sh b/release/air_examples/dreambooth/dreambooth_run.sh new file mode 100644 index 0000000000000..561f01e9ff4e0 --- /dev/null +++ b/release/air_examples/dreambooth/dreambooth_run.sh @@ -0,0 +1,68 @@ +#!/bin/bash + +# ATTN: This should be kept in sync with python/ray/air/examples/dreambooth/README.md + +set -xe + +# Step 0 +pushd dreambooth || true +pip install -Ur requirements.txt + +# Step 0 cont +export DATA_PREFIX="/tmp" +export ORIG_MODEL_NAME="CompVis/stable-diffusion-v1-4" +export ORIG_MODEL_HASH="249dd2d739844dea6a0bc7fc27b3c1d014720b28" +export ORIG_MODEL_DIR="$DATA_PREFIX/model-orig" +export ORIG_MODEL_PATH="$ORIG_MODEL_DIR/models--${ORIG_MODEL_NAME/\//--}/snapshots/$ORIG_MODEL_HASH" +export TUNED_MODEL_DIR="$DATA_PREFIX/model-tuned" +export IMAGES_REG_DIR="$DATA_PREFIX/images-reg" +export IMAGES_OWN_DIR="$DATA_PREFIX/images-own" +export IMAGES_NEW_DIR="$DATA_PREFIX/images-new" + +export CLASS_NAME="lego car" + +mkdir -p $ORIG_MODEL_DIR $TUNED_MODEL_DIR $IMAGES_REG_DIR $IMAGES_OWN_DIR $IMAGES_NEW_DIR + +# Copy own images into IMAGES_OWN_DIR +cp -rf ./images/unqtkn/*.jpg "$IMAGES_OWN_DIR/" + +# Step 1 +python cache_model.py --model_dir=$ORIG_MODEL_DIR --model_name=$ORIG_MODEL_NAME --revision=$ORIG_MODEL_HASH + +# Clear reg dir +rm -rf "$IMAGES_REG_DIR"/*.jpg + +# Step 2 +# ATTN: Reduced the number of samples per prompt for faster testing +python run_model.py \ + --model_dir=$ORIG_MODEL_PATH \ + --output_dir=$IMAGES_REG_DIR \ + --prompts="photo of a $CLASS_NAME" \ + --num_samples_per_prompt=20 + +# Step 3 +python train.py \ + --model_dir=$ORIG_MODEL_PATH \ + 
--output_dir=$TUNED_MODEL_DIR \ + --instance_images_dir=$IMAGES_OWN_DIR \ + --instance_prompt="a photo of unqtkn $CLASS_NAME" \ + --class_images_dir=$IMAGES_REG_DIR \ + --class_prompt="a photo of a $CLASS_NAME" + +# Clear new dir +rm -rf "$IMAGES_NEW_DIR"/*.jpg + +# Step 4 +# ATTN: Reduced the number of samples per prompt for faster testing +python run_model.py \ + --model_dir=$TUNED_MODEL_DIR \ + --output_dir=$IMAGES_NEW_DIR \ + --prompts="photo of a unqtkn $CLASS_NAME" \ + --num_samples_per_prompt=5 + +# Save artifact +mkdir -p /tmp/artifacts +cp -f "$IMAGES_NEW_DIR"/0-*.jpg /tmp/artifacts/example_out.jpg + +# Exit +popd || true diff --git a/release/air_examples/gptj_deepspeed_finetuning/gptj_deepspeed_compute.yaml b/release/air_examples/gptj_deepspeed_finetuning/gptj_deepspeed_compute.yaml new file mode 100644 index 0000000000000..f5c461c1618b1 --- /dev/null +++ b/release/air_examples/gptj_deepspeed_finetuning/gptj_deepspeed_compute.yaml @@ -0,0 +1,20 @@ +cloud_id: {{env["ANYSCALE_CLOUD_ID"]}} +region: us-west-2 + +head_node_type: + name: head_node + instance_type: g4dn.4xlarge + +worker_node_types: + - name: worker_node + instance_type: g4dn.4xlarge + min_workers: 15 + max_workers: 15 + use_spot: false + +aws: + TagSpecifications: + - ResourceType: "instance" + Tags: + - Key: ttl-hours + Value: '24' diff --git a/release/air_examples/gptj_deepspeed_finetuning/gptj_deepspeed_env.yaml b/release/air_examples/gptj_deepspeed_finetuning/gptj_deepspeed_env.yaml new file mode 100644 index 0000000000000..cbecd450c8496 --- /dev/null +++ b/release/air_examples/gptj_deepspeed_finetuning/gptj_deepspeed_env.yaml @@ -0,0 +1,21 @@ +base_image: {{ env["RAY_IMAGE_ML_NIGHTLY_GPU"] | default("anyscale/ray-ml:nightly-py37-gpu") }} +env_vars: {} +debian_packages: + - curl + +python: + pip_packages: + - "datasets" + - "evaluate" + - "accelerate>=0.16.0" + - "transformers>=4.26.0" + - "torch>=1.12.0" + - "deepspeed" + - myst-parser==0.15.2 + - myst-nb==0.13.1 + - jupytext==1.13.6 + 
conda_packages: [] + +post_build_cmds: + - pip uninstall -y ray || true && pip3 install -U {{ env["RAY_WHEELS"] | default("ray") }} + - {{ env["RAY_WHEELS_SANITY_CHECK"] | default("echo No Ray wheels sanity check") }} \ No newline at end of file diff --git a/release/air_examples/gptj_deepspeed_finetuning/gptj_deepspeed_fine_tuning.ipynb b/release/air_examples/gptj_deepspeed_finetuning/gptj_deepspeed_fine_tuning.ipynb new file mode 120000 index 0000000000000..a65044dfacf95 --- /dev/null +++ b/release/air_examples/gptj_deepspeed_finetuning/gptj_deepspeed_fine_tuning.ipynb @@ -0,0 +1 @@ +../../../doc/source/ray-air/examples/gptj_deepspeed_fine_tuning.ipynb \ No newline at end of file diff --git a/release/air_examples/gptj_deepspeed_finetuning/test_myst_doc.py b/release/air_examples/gptj_deepspeed_finetuning/test_myst_doc.py new file mode 120000 index 0000000000000..c265ccc7b062b --- /dev/null +++ b/release/air_examples/gptj_deepspeed_finetuning/test_myst_doc.py @@ -0,0 +1 @@ +../../../doc/test_myst_doc.py \ No newline at end of file diff --git a/release/benchmarks/distributed/test_many_actors.py b/release/benchmarks/distributed/test_many_actors.py index f56dd0e910b71..a40d8a583b095 100644 --- a/release/benchmarks/distributed/test_many_actors.py +++ b/release/benchmarks/distributed/test_many_actors.py @@ -1,4 +1,3 @@ -import json import os import ray import ray._private.test_utils as test_utils @@ -63,23 +62,21 @@ def no_resource_leaks(): f"{end_time - start_time}s. 
({rate} actors/s)" ) -if "TEST_OUTPUT_JSON" in os.environ: - out_file = open(os.environ["TEST_OUTPUT_JSON"], "w") - results = { - "actors_per_second": rate, - "num_actors": MAX_ACTORS_IN_CLUSTER, - "time": end_time - start_time, - "success": "1", - "_peak_memory": round(used_gb, 2), - "_peak_process_memory": usage, - } - if not is_smoke_test: - results["perf_metrics"] = [ - { - "perf_metric_name": "actors_per_second", - "perf_metric_value": rate, - "perf_metric_type": "THROUGHPUT", - } - ] - dashboard_test.update_release_test_result(results) - json.dump(results, out_file) +results = { + "actors_per_second": rate, + "num_actors": MAX_ACTORS_IN_CLUSTER, + "time": end_time - start_time, + "success": "1", + "_peak_memory": round(used_gb, 2), + "_peak_process_memory": usage, +} +if not is_smoke_test: + results["perf_metrics"] = [ + { + "perf_metric_name": "actors_per_second", + "perf_metric_value": rate, + "perf_metric_type": "THROUGHPUT", + } + ] +dashboard_test.update_release_test_result(results) +test_utils.safe_write_to_results_json(results) diff --git a/release/benchmarks/distributed/test_many_pgs.py b/release/benchmarks/distributed/test_many_pgs.py index e6993a002e2cb..40029bcf601d0 100644 --- a/release/benchmarks/distributed/test_many_pgs.py +++ b/release/benchmarks/distributed/test_many_pgs.py @@ -1,4 +1,3 @@ -import json import os import ray import ray._private.test_utils as test_utils @@ -100,23 +99,21 @@ def no_resource_leaks(): f"{end_time - start_time}s. 
({rate} pgs/s)" ) -if "TEST_OUTPUT_JSON" in os.environ: - out_file = open(os.environ["TEST_OUTPUT_JSON"], "w") - results = { - "pgs_per_second": rate, - "num_pgs": MAX_PLACEMENT_GROUPS, - "time": end_time - start_time, - "success": "1", - "_peak_memory": round(used_gb, 2), - "_peak_process_memory": usage, - } - if not is_smoke_test: - results["perf_metrics"] = [ - { - "perf_metric_name": "pgs_per_second", - "perf_metric_value": rate, - "perf_metric_type": "THROUGHPUT", - } - ] - dashboard_test.update_release_test_result(results) - json.dump(results, out_file) +results = { + "pgs_per_second": rate, + "num_pgs": MAX_PLACEMENT_GROUPS, + "time": end_time - start_time, + "success": "1", + "_peak_memory": round(used_gb, 2), + "_peak_process_memory": usage, +} +if not is_smoke_test: + results["perf_metrics"] = [ + { + "perf_metric_name": "pgs_per_second", + "perf_metric_value": rate, + "perf_metric_type": "THROUGHPUT", + } + ] +dashboard_test.update_release_test_result(results) +test_utils.safe_write_to_results_json(results) diff --git a/release/benchmarks/distributed/test_many_tasks.py b/release/benchmarks/distributed/test_many_tasks.py index c11ad908d4652..365d27e355ab3 100644 --- a/release/benchmarks/distributed/test_many_tasks.py +++ b/release/benchmarks/distributed/test_many_tasks.py @@ -1,6 +1,4 @@ import click -import json -import os import ray import ray._private.test_utils as test_utils import time @@ -67,14 +65,7 @@ def no_resource_leaks(): @click.command() @click.option("--num-tasks", required=True, type=int, help="Number of tasks to launch.") -@click.option( - "--smoke-test", - is_flag=True, - type=bool, - default=False, - help="If set, it's a smoke test", -) -def test(num_tasks, smoke_test): +def test(num_tasks): addr = ray.init(address="auto") test_utils.wait_for_condition(no_resource_leaks) @@ -109,32 +100,30 @@ def not_none(res): f"({rate} tasks/s)" ) - if "TEST_OUTPUT_JSON" in os.environ: - out_file = open(os.environ["TEST_OUTPUT_JSON"], "w") - results = 
{ - "tasks_per_second": rate, - "num_tasks": num_tasks, - "time": end_time - start_time, - "used_cpus": used_cpus, - "success": "1", - "_peak_memory": round(used_gb, 2), - "_peak_process_memory": usage, - } - if not smoke_test: - results["perf_metrics"] = [ - { - "perf_metric_name": "tasks_per_second", - "perf_metric_value": rate, - "perf_metric_type": "THROUGHPUT", - }, - { - "perf_metric_name": "used_cpus_by_deadline", - "perf_metric_value": used_cpus, - "perf_metric_type": "THROUGHPUT", - }, - ] - dashboard_test.update_release_test_result(results) - json.dump(results, out_file) + results = { + "tasks_per_second": rate, + "num_tasks": num_tasks, + "time": end_time - start_time, + "used_cpus": used_cpus, + "success": "1", + "_peak_memory": round(used_gb, 2), + "_peak_process_memory": usage, + "perf_metrics": [ + { + "perf_metric_name": "tasks_per_second", + "perf_metric_value": rate, + "perf_metric_type": "THROUGHPUT", + }, + { + "perf_metric_name": "used_cpus_by_deadline", + "perf_metric_value": used_cpus, + "perf_metric_type": "THROUGHPUT", + }, + ], + } + + dashboard_test.update_release_test_result(results) + test_utils.safe_write_to_results_json(results) if __name__ == "__main__": diff --git a/release/benchmarks/distributed/test_scheduling.py b/release/benchmarks/distributed/test_scheduling.py index 0ea75f2a9ed80..2e4dd138d7a7f 100644 --- a/release/benchmarks/distributed/test_scheduling.py +++ b/release/benchmarks/distributed/test_scheduling.py @@ -2,8 +2,7 @@ import argparse from time import time, sleep from math import floor -import os -import json +from ray._private.test_utils import safe_write_to_results_json @ray.remote @@ -105,8 +104,6 @@ def start_actor(num_actors, num_actors_per_nodes, job): args.total_num_actors, args.num_actors_per_nodes, job ) - output = os.environ.get("TEST_OUTPUT_JSON") - result = { "total_num_task": args.total_num_task, "num_cpu_per_task": args.num_cpu_per_task, @@ -121,10 +118,6 @@ def start_actor(num_actors, 
num_actors_per_nodes, job): "_runtime": submission_cost + ready_cost + actor_job_cost, } - if output is not None: - from pathlib import Path - - p = Path(output) - p.write_text(json.dumps(result)) + safe_write_to_results_json(result) print(result) diff --git a/release/long_running_tests/app_config.yaml b/release/long_running_tests/app_config.yaml index 6a6bf3aba6af2..5d13555ff713a 100755 --- a/release/long_running_tests/app_config.yaml +++ b/release/long_running_tests/app_config.yaml @@ -1,5 +1,5 @@ base_image: {{ env["RAY_IMAGE_NIGHTLY_CPU"] | default("anyscale/ray:nightly-py37") }} -env_vars: {} +env_vars: {"RLLIB_TEST_NO_JAX_IMPORT": "1"} debian_packages: - curl @@ -7,15 +7,13 @@ debian_packages: python: pip_packages: - # These dependencies should be handled by requirements_rllib.txt and - # requirements_ml_docker.txt and removed here + # These dependencies should be handled by requirements_rllib.txt and requirements_ml_docker.txt and removed here - gym>=0.21.0,<0.24.1 - ale-py==0.7.5 - pytest - tensorflow - # AutoROM downloads ROMs via torrent when they are built. The torrent is unreliable, - # so we built it for py3 and use that instead. This wheel was tested for python 3.7, 3.8, - # and 3.9. + # AutoROM downloads ROMs via torrent when they are built. The torrent is unreliable, so we built it for py3 and + # use that instead. This wheel was tested for python 3.7, 3.8, and 3.9. 
- https://ray-ci-deps-wheels.s3.us-west-2.amazonaws.com/AutoROM.accept_rom_license-0.5.4-py3-none-any.whl conda_packages: [] diff --git a/release/ml_user_tests/horovod/horovod_user_test.py b/release/ml_user_tests/horovod/horovod_user_test.py index 3355d3e21826e..69dcf71b8fffd 100644 --- a/release/ml_user_tests/horovod/horovod_user_test.py +++ b/release/ml_user_tests/horovod/horovod_user_test.py @@ -22,6 +22,7 @@ num_workers=6, use_gpu=True, placement_group_timeout_s=2000, + timeout_s=120, kwargs={"num_epochs": 20}, ) diff --git a/release/nightly_tests/dataset/pipelined_training.py b/release/nightly_tests/dataset/pipelined_training.py index 49fb50014f8d9..69adfc7dfa7fb 100644 --- a/release/nightly_tests/dataset/pipelined_training.py +++ b/release/nightly_tests/dataset/pipelined_training.py @@ -327,7 +327,7 @@ def consume(split, rank=None, batch_size=None): ray.get(tasks) else: print("Create Ray executor") - settings = RayExecutor.create_settings(timeout_s=30) + settings = RayExecutor.create_settings(timeout_s=120) executor = RayExecutor(settings, num_workers=args.num_workers, use_gpu=True) executor.start() executor.run(train_main, args=[args, splits]) diff --git a/release/release_logs/2.3.0/benchmarks/many_actors.json b/release/release_logs/2.3.0/benchmarks/many_actors.json new file mode 100644 index 0000000000000..4f63c20bdc67d --- /dev/null +++ b/release/release_logs/2.3.0/benchmarks/many_actors.json @@ -0,0 +1,32 @@ +{ + "_dashboard_memory_usage_mb": 518.770688, + "_dashboard_test_success": true, + "_peak_memory": 3.58, + "_peak_process_memory": "PID\tMEM\tCOMMAND\n291\t1.75GiB\t/home/ray/anaconda3/lib/python3.7/site-packages/ray/core/src/ray/gcs/gcs_server --log_dir=/tmp/ray/s\n2427\t0.85GiB\tpython distributed/test_many_actors.py\n464\t0.31GiB\t/home/ray/anaconda3/bin/python /home/ray/anaconda3/lib/python3.7/site-packages/ray/dashboard/dashboa\n47\t0.09GiB\t/home/ray/anaconda3/bin/python /home/ray/anaconda3/bin/anyscale session web_terminal_server 
--deploy\n764\t0.08GiB\t/home/ray/anaconda3/bin/python -u /home/ray/anaconda3/lib/python3.7/site-packages/ray/dashboard/agen\n2648\t0.07GiB\tray::DashboardTester.run\n44\t0.06GiB\t/home/ray/anaconda3/bin/python /home/ray/anaconda3/bin/jupyter-notebook --NotebookApp.token=aph0_CkY\n2564\t0.06GiB\tray::MemoryMonitorActor.run\n391\t0.05GiB\t/home/ray/anaconda3/bin/python -m ray.util.client.server --address=172.31.206.170:9031 --host=0.0.0.\n663\t0.05GiB\t/home/ray/anaconda3/bin/python -u /home/ray/anaconda3/lib/python3.7/site-packages/ray/_private/log_m", + "actors_per_second": 777.2062125146554, + "num_actors": 10000, + "perf_metrics": [ + { + "perf_metric_name": "actors_per_second", + "perf_metric_type": "THROUGHPUT", + "perf_metric_value": 777.2062125146554 + }, + { + "perf_metric_name": "dashboard_p50_latency_ms", + "perf_metric_type": "LATENCY", + "perf_metric_value": 30.356 + }, + { + "perf_metric_name": "dashboard_p95_latency_ms", + "perf_metric_type": "LATENCY", + "perf_metric_value": 1313.017 + }, + { + "perf_metric_name": "dashboard_p99_latency_ms", + "perf_metric_type": "LATENCY", + "perf_metric_value": 4925.524 + } + ], + "success": "1", + "time": 12.866598129272461 +} diff --git a/release/release_logs/2.3.0/benchmarks/many_nodes.json b/release/release_logs/2.3.0/benchmarks/many_nodes.json new file mode 100644 index 0000000000000..3717336d2fbd1 --- /dev/null +++ b/release/release_logs/2.3.0/benchmarks/many_nodes.json @@ -0,0 +1,38 @@ +{ + "_dashboard_memory_usage_mb": 253.00992, + "_dashboard_test_success": true, + "_peak_memory": 1.92, + "_peak_process_memory": "PID\tMEM\tCOMMAND\n291\t0.74GiB\t/home/ray/anaconda3/lib/python3.7/site-packages/ray/core/src/ray/gcs/gcs_server --log_dir=/tmp/ray/s\n464\t0.22GiB\t/home/ray/anaconda3/bin/python /home/ray/anaconda3/lib/python3.7/site-packages/ray/dashboard/dashboa\n3086\t0.2GiB\tpython distributed/test_many_tasks.py --num-tasks=1000\n47\t0.09GiB\t/home/ray/anaconda3/bin/python /home/ray/anaconda3/bin/anyscale 
session web_terminal_server --deploy\n764\t0.08GiB\t/home/ray/anaconda3/bin/python -u /home/ray/anaconda3/lib/python3.7/site-packages/ray/dashboard/agen\n3386\t0.07GiB\tray::StateAPIGeneratorActor.start\n3308\t0.07GiB\tray::DashboardTester.run\n3221\t0.07GiB\tray::MemoryMonitorActor.run\n44\t0.06GiB\t/home/ray/anaconda3/bin/python /home/ray/anaconda3/bin/jupyter-notebook --NotebookApp.token=agh0_Ckg\n391\t0.05GiB\t/home/ray/anaconda3/bin/python -m ray.util.client.server --address=172.31.167.22:9031 --host=0.0.0.0", + "num_tasks": 1000, + "perf_metrics": [ + { + "perf_metric_name": "tasks_per_second", + "perf_metric_type": "THROUGHPUT", + "perf_metric_value": 219.0605760959142 + }, + { + "perf_metric_name": "used_cpus_by_deadline", + "perf_metric_type": "THROUGHPUT", + "perf_metric_value": 250.0 + }, + { + "perf_metric_name": "dashboard_p50_latency_ms", + "perf_metric_type": "LATENCY", + "perf_metric_value": 3.356 + }, + { + "perf_metric_name": "dashboard_p95_latency_ms", + "perf_metric_type": "LATENCY", + "perf_metric_value": 44.061 + }, + { + "perf_metric_name": "dashboard_p99_latency_ms", + "perf_metric_type": "LATENCY", + "perf_metric_value": 152.793 + } + ], + "success": "1", + "tasks_per_second": 219.0605760959142, + "time": 304.5649473667145, + "used_cpus": 250.0 +} diff --git a/release/release_logs/2.3.0/benchmarks/many_pgs.json b/release/release_logs/2.3.0/benchmarks/many_pgs.json new file mode 100644 index 0000000000000..1c78ccfa44d5b --- /dev/null +++ b/release/release_logs/2.3.0/benchmarks/many_pgs.json @@ -0,0 +1,32 @@ +{ + "_dashboard_memory_usage_mb": 241.78688, + "_dashboard_test_success": true, + "_peak_memory": 2.37, + "_peak_process_memory": "PID\tMEM\tCOMMAND\n291\t1.07GiB\t/home/ray/anaconda3/lib/python3.7/site-packages/ray/core/src/ray/gcs/gcs_server --log_dir=/tmp/ray/s\n2413\t0.4GiB\tpython distributed/test_many_pgs.py\n464\t0.21GiB\t/home/ray/anaconda3/bin/python 
/home/ray/anaconda3/lib/python3.7/site-packages/ray/dashboard/dashboa\n47\t0.09GiB\t/home/ray/anaconda3/bin/python /home/ray/anaconda3/bin/anyscale session web_terminal_server --deploy\n590\t0.08GiB\t/home/ray/anaconda3/lib/python3.7/site-packages/ray/core/src/ray/raylet/raylet --raylet_socket_name=\n764\t0.08GiB\t/home/ray/anaconda3/bin/python -u /home/ray/anaconda3/lib/python3.7/site-packages/ray/dashboard/agen\n2551\t0.06GiB\tray::MemoryMonitorActor.run\n2634\t0.06GiB\tray::DashboardTester.run\n44\t0.06GiB\t/home/ray/anaconda3/bin/python /home/ray/anaconda3/bin/jupyter-notebook --NotebookApp.token=aph0_Ckc\n391\t0.05GiB\t/home/ray/anaconda3/bin/python -m ray.util.client.server --address=172.31.255.206:9031 --host=0.0.0.", + "num_pgs": 1000, + "perf_metrics": [ + { + "perf_metric_name": "pgs_per_second", + "perf_metric_type": "THROUGHPUT", + "perf_metric_value": 17.89601570164703 + }, + { + "perf_metric_name": "dashboard_p50_latency_ms", + "perf_metric_type": "LATENCY", + "perf_metric_value": 3.192 + }, + { + "perf_metric_name": "dashboard_p95_latency_ms", + "perf_metric_type": "LATENCY", + "perf_metric_value": 12.196 + }, + { + "perf_metric_name": "dashboard_p99_latency_ms", + "perf_metric_type": "LATENCY", + "perf_metric_value": 149.699 + } + ], + "pgs_per_second": 17.89601570164703, + "success": "1", + "time": 55.87835955619812 +} diff --git a/release/release_logs/2.3.0/benchmarks/many_tasks.json b/release/release_logs/2.3.0/benchmarks/many_tasks.json new file mode 100644 index 0000000000000..133326ff16c85 --- /dev/null +++ b/release/release_logs/2.3.0/benchmarks/many_tasks.json @@ -0,0 +1,38 @@ +{ + "_dashboard_memory_usage_mb": 498.393088, + "_dashboard_test_success": true, + "_peak_memory": 4.39, + "_peak_process_memory": "PID\tMEM\tCOMMAND\n291\t2.09GiB\t/home/ray/anaconda3/lib/python3.7/site-packages/ray/core/src/ray/gcs/gcs_server --log_dir=/tmp/ray/s\n2422\t0.86GiB\tpython distributed/test_many_tasks.py 
--num-tasks=10000\n464\t0.59GiB\t/home/ray/anaconda3/bin/python /home/ray/anaconda3/lib/python3.7/site-packages/ray/dashboard/dashboa\n2644\t0.1GiB\tray::DashboardTester.run\n47\t0.09GiB\t/home/ray/anaconda3/bin/python /home/ray/anaconda3/bin/anyscale session web_terminal_server --deploy\n765\t0.08GiB\t/home/ray/anaconda3/bin/python -u /home/ray/anaconda3/lib/python3.7/site-packages/ray/dashboard/agen\n2710\t0.07GiB\tray::StateAPIGeneratorActor.start\n2560\t0.06GiB\tray::MemoryMonitorActor.run\n44\t0.06GiB\t/home/ray/anaconda3/bin/python /home/ray/anaconda3/bin/jupyter-notebook --NotebookApp.token=aph0_CkY\n391\t0.05GiB\t/home/ray/anaconda3/bin/python -m ray.util.client.server --address=172.31.223.72:9031 --host=0.0.0.0", + "num_tasks": 10000, + "perf_metrics": [ + { + "perf_metric_name": "tasks_per_second", + "perf_metric_type": "THROUGHPUT", + "perf_metric_value": 210.67184928063165 + }, + { + "perf_metric_name": "used_cpus_by_deadline", + "perf_metric_type": "THROUGHPUT", + "perf_metric_value": 2500.0 + }, + { + "perf_metric_name": "dashboard_p50_latency_ms", + "perf_metric_type": "LATENCY", + "perf_metric_value": 3.985 + }, + { + "perf_metric_name": "dashboard_p95_latency_ms", + "perf_metric_type": "LATENCY", + "perf_metric_value": 665.858 + }, + { + "perf_metric_name": "dashboard_p99_latency_ms", + "perf_metric_type": "LATENCY", + "perf_metric_value": 1404.251 + } + ], + "success": "1", + "tasks_per_second": 210.67184928063165, + "time": 347.46718668937683, + "used_cpus": 2500.0 +} diff --git a/release/release_logs/2.3.0/microbenchmark.json b/release/release_logs/2.3.0/microbenchmark.json new file mode 100644 index 0000000000000..6da062760215f --- /dev/null +++ b/release/release_logs/2.3.0/microbenchmark.json @@ -0,0 +1,283 @@ +{ + "1_1_actor_calls_async": [ + 8266.227716717382, + 154.83431912796019 + ], + "1_1_actor_calls_concurrent": [ + 4822.515544345499, + 69.03417417619077 + ], + "1_1_actor_calls_sync": [ + 2326.919630897046, + 18.048639582599087 + ], + 
"1_1_async_actor_calls_async": [ + 3472.864053257764, + 18.492804155673895 + ], + "1_1_async_actor_calls_sync": [ + 1600.088775357745, + 41.89765647230385 + ], + "1_1_async_actor_calls_with_args_async": [ + 2350.1130733425607, + 169.642459588056 + ], + "1_n_actor_calls_async": [ + 10819.31324573512, + 556.3564152930876 + ], + "1_n_async_actor_calls_async": [ + 10741.070943872623, + 74.14284598217124 + ], + "client__1_1_actor_calls_async": [ + 916.2458814581453, + 20.970867830230226 + ], + "client__1_1_actor_calls_concurrent": [ + 925.777467637641, + 25.193859971158158 + ], + "client__1_1_actor_calls_sync": [ + 521.8726144223089, + 7.962424357055347 + ], + "client__get_calls": [ + 1149.982480593273, + 18.207879410048996 + ], + "client__put_calls": [ + 861.7294897745508, + 20.886660586339037 + ], + "client__put_gigabytes": [ + 0.04579083755671009, + 0.0011758702466256328 + ], + "client__tasks_and_get_batch": [ + 0.85332224821338, + 0.009728477126712553 + ], + "client__tasks_and_put_batch": [ + 10926.28867356008, + 112.00041581027524 + ], + "multi_client_put_calls_Plasma_Store": [ + 11907.518691049167, + 140.67822025073386 + ], + "multi_client_put_gigabytes": [ + 38.34929639975205, + 3.8021163438852286 + ], + "multi_client_tasks_async": [ + 29213.877508157526, + 1402.303220638465 + ], + "n_n_actor_calls_async": [ + 32206.022554465333, + 1389.5479846707906 + ], + "n_n_actor_calls_with_arg_async": [ + 2569.4536725239686, + 27.598753015224556 + ], + "n_n_async_actor_calls_async": [ + 27823.30349707046, + 169.7830788090935 + ], + "perf_metrics": [ + { + "perf_metric_name": "single_client_get_calls_Plasma_Store", + "perf_metric_type": "THROUGHPUT", + "perf_metric_value": 6507.325737010699 + }, + { + "perf_metric_name": "single_client_put_calls_Plasma_Store", + "perf_metric_type": "THROUGHPUT", + "perf_metric_value": 5657.281266012957 + }, + { + "perf_metric_name": "multi_client_put_calls_Plasma_Store", + "perf_metric_type": "THROUGHPUT", + "perf_metric_value": 
11907.518691049167 + }, + { + "perf_metric_name": "single_client_put_gigabytes", + "perf_metric_type": "THROUGHPUT", + "perf_metric_value": 20.68898688869432 + }, + { + "perf_metric_name": "single_client_tasks_and_get_batch", + "perf_metric_type": "THROUGHPUT", + "perf_metric_value": 11.039401889292254 + }, + { + "perf_metric_name": "multi_client_put_gigabytes", + "perf_metric_type": "THROUGHPUT", + "perf_metric_value": 38.34929639975205 + }, + { + "perf_metric_name": "single_client_get_object_containing_10k_refs", + "perf_metric_type": "THROUGHPUT", + "perf_metric_value": 13.146682237214058 + }, + { + "perf_metric_name": "single_client_wait_1k_refs", + "perf_metric_type": "THROUGHPUT", + "perf_metric_value": 5.498008946220131 + }, + { + "perf_metric_name": "single_client_tasks_sync", + "perf_metric_type": "THROUGHPUT", + "perf_metric_value": 1282.1050579108664 + }, + { + "perf_metric_name": "single_client_tasks_async", + "perf_metric_type": "THROUGHPUT", + "perf_metric_value": 10781.5375768007 + }, + { + "perf_metric_name": "multi_client_tasks_async", + "perf_metric_type": "THROUGHPUT", + "perf_metric_value": 29213.877508157526 + }, + { + "perf_metric_name": "1_1_actor_calls_sync", + "perf_metric_type": "THROUGHPUT", + "perf_metric_value": 2326.919630897046 + }, + { + "perf_metric_name": "1_1_actor_calls_async", + "perf_metric_type": "THROUGHPUT", + "perf_metric_value": 8266.227716717382 + }, + { + "perf_metric_name": "1_1_actor_calls_concurrent", + "perf_metric_type": "THROUGHPUT", + "perf_metric_value": 4822.515544345499 + }, + { + "perf_metric_name": "1_n_actor_calls_async", + "perf_metric_type": "THROUGHPUT", + "perf_metric_value": 10819.31324573512 + }, + { + "perf_metric_name": "n_n_actor_calls_async", + "perf_metric_type": "THROUGHPUT", + "perf_metric_value": 32206.022554465333 + }, + { + "perf_metric_name": "n_n_actor_calls_with_arg_async", + "perf_metric_type": "THROUGHPUT", + "perf_metric_value": 2569.4536725239686 + }, + { + "perf_metric_name": 
"1_1_async_actor_calls_sync", + "perf_metric_type": "THROUGHPUT", + "perf_metric_value": 1600.088775357745 + }, + { + "perf_metric_name": "1_1_async_actor_calls_async", + "perf_metric_type": "THROUGHPUT", + "perf_metric_value": 3472.864053257764 + }, + { + "perf_metric_name": "1_1_async_actor_calls_with_args_async", + "perf_metric_type": "THROUGHPUT", + "perf_metric_value": 2350.1130733425607 + }, + { + "perf_metric_name": "1_n_async_actor_calls_async", + "perf_metric_type": "THROUGHPUT", + "perf_metric_value": 10741.070943872623 + }, + { + "perf_metric_name": "n_n_async_actor_calls_async", + "perf_metric_type": "THROUGHPUT", + "perf_metric_value": 27823.30349707046 + }, + { + "perf_metric_name": "placement_group_create/removal", + "perf_metric_type": "THROUGHPUT", + "perf_metric_value": 989.8353575694138 + }, + { + "perf_metric_name": "client__get_calls", + "perf_metric_type": "THROUGHPUT", + "perf_metric_value": 1149.982480593273 + }, + { + "perf_metric_name": "client__put_calls", + "perf_metric_type": "THROUGHPUT", + "perf_metric_value": 861.7294897745508 + }, + { + "perf_metric_name": "client__put_gigabytes", + "perf_metric_type": "THROUGHPUT", + "perf_metric_value": 0.04579083755671009 + }, + { + "perf_metric_name": "client__tasks_and_put_batch", + "perf_metric_type": "THROUGHPUT", + "perf_metric_value": 10926.28867356008 + }, + { + "perf_metric_name": "client__1_1_actor_calls_sync", + "perf_metric_type": "THROUGHPUT", + "perf_metric_value": 521.8726144223089 + }, + { + "perf_metric_name": "client__1_1_actor_calls_async", + "perf_metric_type": "THROUGHPUT", + "perf_metric_value": 916.2458814581453 + }, + { + "perf_metric_name": "client__1_1_actor_calls_concurrent", + "perf_metric_type": "THROUGHPUT", + "perf_metric_value": 925.777467637641 + }, + { + "perf_metric_name": "client__tasks_and_get_batch", + "perf_metric_type": "THROUGHPUT", + "perf_metric_value": 0.85332224821338 + } + ], + "placement_group_create/removal": [ + 989.8353575694138, + 
8.999610265563307 + ], + "single_client_get_calls_Plasma_Store": [ + 6507.325737010699, + 169.10049512002783 + ], + "single_client_get_object_containing_10k_refs": [ + 13.146682237214058, + 0.40574499297791244 + ], + "single_client_put_calls_Plasma_Store": [ + 5657.281266012957, + 65.81470332190243 + ], + "single_client_put_gigabytes": [ + 20.68898688869432, + 6.118551705446652 + ], + "single_client_tasks_and_get_batch": [ + 11.039401889292254, + 0.07053691123976474 + ], + "single_client_tasks_async": [ + 10781.5375768007, + 80.41585546336292 + ], + "single_client_tasks_sync": [ + 1282.1050579108664, + 25.53646612884218 + ], + "single_client_wait_1k_refs": [ + 5.498008946220131, + 0.019177007797266794 + ] +} diff --git a/release/release_logs/2.3.0/scalability/object_store.json b/release/release_logs/2.3.0/scalability/object_store.json new file mode 100644 index 0000000000000..5d16a4b8ee474 --- /dev/null +++ b/release/release_logs/2.3.0/scalability/object_store.json @@ -0,0 +1,13 @@ +{ + "broadcast_time": 92.42755234200001, + "num_nodes": 50, + "object_size": 1073741824, + "perf_metrics": [ + { + "perf_metric_name": "time_to_broadcast_1073741824_bytes_to_50_nodes", + "perf_metric_type": "LATENCY", + "perf_metric_value": 92.42755234200001 + } + ], + "success": "1" +} diff --git a/release/release_logs/2.3.0/scalability/single_node.json b/release/release_logs/2.3.0/scalability/single_node.json new file mode 100644 index 0000000000000..bababa5f97d9d --- /dev/null +++ b/release/release_logs/2.3.0/scalability/single_node.json @@ -0,0 +1,40 @@ +{ + "args_time": 16.49447203899996, + "get_time": 24.665624744000013, + "large_object_size": 107374182400, + "large_object_time": 325.56598805499993, + "num_args": 10000, + "num_get_args": 10000, + "num_queued": 1000000, + "num_returns": 3000, + "perf_metrics": [ + { + "perf_metric_name": "10000_args_time", + "perf_metric_type": "LATENCY", + "perf_metric_value": 16.49447203899996 + }, + { + "perf_metric_name": "3000_returns_time", + 
"perf_metric_type": "LATENCY", + "perf_metric_value": 6.200411088999999 + }, + { + "perf_metric_name": "10000_get_time", + "perf_metric_type": "LATENCY", + "perf_metric_value": 24.665624744000013 + }, + { + "perf_metric_name": "1000000_queued_time", + "perf_metric_type": "LATENCY", + "perf_metric_value": 190.882864026 + }, + { + "perf_metric_name": "107374182400_large_object_time", + "perf_metric_type": "LATENCY", + "perf_metric_value": 325.56598805499993 + } + ], + "queued_time": 190.882864026, + "returns_time": 6.200411088999999, + "success": "1" +} diff --git a/release/release_logs/2.3.0/stress_tests/stress_test_dead_actors.json b/release/release_logs/2.3.0/stress_tests/stress_test_dead_actors.json new file mode 100644 index 0000000000000..ad110b1a960c7 --- /dev/null +++ b/release/release_logs/2.3.0/stress_tests/stress_test_dead_actors.json @@ -0,0 +1,14 @@ +{ + "avg_iteration_time": 1.9298896980285645, + "max_iteration_time": 7.609254598617554, + "min_iteration_time": 0.8152294158935547, + "perf_metrics": [ + { + "perf_metric_name": "avg_iteration_time", + "perf_metric_type": "LATENCY", + "perf_metric_value": 1.9298896980285645 + } + ], + "success": 1, + "total_time": 192.9892761707306 +} diff --git a/release/release_logs/2.3.0/stress_tests/stress_test_many_tasks.json b/release/release_logs/2.3.0/stress_tests/stress_test_many_tasks.json new file mode 100644 index 0000000000000..6e03dbd3e78c9 --- /dev/null +++ b/release/release_logs/2.3.0/stress_tests/stress_test_many_tasks.json @@ -0,0 +1,47 @@ +{ + "perf_metrics": [ + { + "perf_metric_name": "stage_0_time", + "perf_metric_type": "LATENCY", + "perf_metric_value": 10.636847257614136 + }, + { + "perf_metric_name": "stage_1_avg_iteration_time", + "perf_metric_type": "LATENCY", + "perf_metric_value": 22.78462884426117 + }, + { + "perf_metric_name": "stage_2_avg_iteration_time", + "perf_metric_type": "LATENCY", + "perf_metric_value": 63.28760423660278 + }, + { + "perf_metric_name": "stage_3_creation_time", + 
"perf_metric_type": "LATENCY", + "perf_metric_value": 0.05330848693847656 + }, + { + "perf_metric_name": "stage_3_time", + "perf_metric_type": "LATENCY", + "perf_metric_value": 2655.176714658737 + }, + { + "perf_metric_name": "stage_4_spread", + "perf_metric_type": "LATENCY", + "perf_metric_value": 0.7394418476633471 + } + ], + "stage_0_time": 10.636847257614136, + "stage_1_avg_iteration_time": 22.78462884426117, + "stage_1_max_iteration_time": 23.30313539505005, + "stage_1_min_iteration_time": 21.94757604598999, + "stage_1_time": 227.84637260437012, + "stage_2_avg_iteration_time": 63.28760423660278, + "stage_2_max_iteration_time": 64.29542255401611, + "stage_2_min_iteration_time": 61.036946058273315, + "stage_2_time": 316.43891644477844, + "stage_3_creation_time": 0.05330848693847656, + "stage_3_time": 2655.176714658737, + "stage_4_spread": 0.7394418476633471, + "success": 1 +} diff --git a/release/release_logs/2.3.0/stress_tests/stress_test_placement_group.json b/release/release_logs/2.3.0/stress_tests/stress_test_placement_group.json new file mode 100644 index 0000000000000..8c12e0c86876b --- /dev/null +++ b/release/release_logs/2.3.0/stress_tests/stress_test_placement_group.json @@ -0,0 +1,17 @@ +{ + "avg_pg_create_time_ms": 0.8553733138129204, + "avg_pg_remove_time_ms": 0.8286827042024993, + "perf_metrics": [ + { + "perf_metric_name": "avg_pg_create_time_ms", + "perf_metric_type": "LATENCY", + "perf_metric_value": 0.8553733138129204 + }, + { + "perf_metric_name": "avg_pg_remove_time_ms", + "perf_metric_type": "LATENCY", + "perf_metric_value": 0.8286827042024993 + } + ], + "success": 1 +} diff --git a/release/release_tests.yaml b/release/release_tests.yaml index 5e0c9149e95ec..eb19c3cf0166b 100644 --- a/release/release_tests.yaml +++ b/release/release_tests.yaml @@ -570,6 +570,46 @@ timeout: 1800 script: bash heterogeneity_benchmark.sh 2 + +####################### +# AIR examples +####################### + + +# Test additional CPU nodes for preprocessing. 
+- name: air_example_dreambooth_finetuning + group: AIR examples + working_dir: air_examples/dreambooth + + stable: false + + frequency: weekly + team: ml + cluster: + cluster_env: dreambooth_env.yaml + cluster_compute: dreambooth_compute.yaml + + run: + timeout: 1800 + script: bash dreambooth_run.sh + + +- name: air_example_gptj_deepspeed_fine_tuning + group: AIR examples + working_dir: air_examples/gptj_deepspeed_finetuning + + python: "3.9" + + frequency: weekly + team: ml + cluster: + cluster_env: gptj_deepspeed_env.yaml + cluster_compute: gptj_deepspeed_compute.yaml + + run: + timeout: 3600 + script: python test_myst_doc.py --path gptj_deepspeed_fine_tuning.ipynb + ####################### # XGBoost release tests ####################### @@ -3421,18 +3461,6 @@ num_nodes: 65 - smoke_test: - frequency: nightly - cluster: - cluster_env: app_config.yaml - cluster_compute: distributed_smoke_test.yaml - run: - timeout: 3600 - script: python distributed/test_many_tasks.py --num-tasks=100 - wait_for_nodes: - num_nodes: 2 - - - name: many_pgs group: core-scalability-test working_dir: benchmarks diff --git a/release/rllib_tests/app_config.yaml b/release/rllib_tests/app_config.yaml index 2dae5b40912ca..c6a77e18d845a 100755 --- a/release/rllib_tests/app_config.yaml +++ b/release/rllib_tests/app_config.yaml @@ -1,5 +1,5 @@ base_image: {{ env["RAY_IMAGE_ML_NIGHTLY_GPU"] | default("anyscale/ray-ml:nightly-py37-gpu") }} -env_vars: {"LD_LIBRARY_PATH": "$LD_LIBRARY_PATH:/home/ray/.mujoco/mujoco210/bin"} +env_vars: {"LD_LIBRARY_PATH": "$LD_LIBRARY_PATH:/home/ray/.mujoco/mujoco210/bin", "RLLIB_TEST_NO_JAX_IMPORT": "1"} debian_packages: - unzip - zip diff --git a/rllib/BUILD b/rllib/BUILD index 69d187b6a8fe0..3806403a56f90 100644 --- a/rllib/BUILD +++ b/rllib/BUILD @@ -1848,7 +1848,7 @@ py_test( py_test( name = "test_catalog", tags = ["team:rllib", "core"], - size = "small", + size = "medium", srcs = ["core/models/tests/test_catalog.py"] ) diff --git a/rllib/algorithms/algorithm.py 
b/rllib/algorithms/algorithm.py index bbfcdfa5623ae..44c69355c8712 100644 --- a/rllib/algorithms/algorithm.py +++ b/rllib/algorithms/algorithm.py @@ -2690,7 +2690,7 @@ def _create_local_replay_buffer_if_necessary( None, if local replay buffer is not needed. """ if not config.get("replay_buffer_config") or config["replay_buffer_config"].get( - "no_local_replay_buffer" or config.get("no_local_replay_buffer") + "no_local_replay_buffer" ): return diff --git a/rllib/algorithms/algorithm_config.py b/rllib/algorithms/algorithm_config.py index 3b0eb2146363d..0888cb428950a 100644 --- a/rllib/algorithms/algorithm_config.py +++ b/rllib/algorithms/algorithm_config.py @@ -116,6 +116,15 @@ def _resolve_class_path(module) -> Type: return getattr(module, class_name) +def _check_rl_module_spec(module_spec: ModuleSpec) -> None: + if not isinstance(module_spec, (SingleAgentRLModuleSpec, MultiAgentRLModuleSpec)): + raise ValueError( + "rl_module_spec must be an instance of " + "SingleAgentRLModuleSpec or MultiAgentRLModuleSpec." + f"Got {type(module_spec)} instead." + ) + + class AlgorithmConfig(_Config): """A RLlib AlgorithmConfig builds an RLlib Algorithm from a given configuration. @@ -901,17 +910,26 @@ def validate(self) -> None: # compatibility for now. User only needs to set num_rollout_workers. self.input_config["parallelism"] = self.num_rollout_workers or 1 - # resolve rl_module_spec class - if self._enable_rl_module_api and self.rl_module_spec is None: - self.rl_module_spec = self.get_default_rl_module_spec() - if not isinstance( - self.rl_module_spec, (SingleAgentRLModuleSpec, MultiAgentRLModuleSpec) - ): - raise ValueError( - "rl_module_spec must be an instance of " - "SingleAgentRLModuleSpec or MultiAgentRLModuleSpec." - f"Got {type(self.rl_module_spec)} instead." 
- ) + if self._enable_rl_module_api: + default_rl_module_spec = self.get_default_rl_module_spec() + _check_rl_module_spec(default_rl_module_spec) + + if self.rl_module_spec is not None: + # Merge provided RL Module spec class with defaults + _check_rl_module_spec(self.rl_module_spec) + # We can only merge if we have SingleAgentRLModuleSpecs. + # TODO(Artur): Support merging for MultiAgentRLModuleSpecs. + if isinstance(self.rl_module_spec, SingleAgentRLModuleSpec): + if isinstance(default_rl_module_spec, SingleAgentRLModuleSpec): + default_rl_module_spec.update(self.rl_module_spec) + self.rl_module_spec = default_rl_module_spec + elif isinstance(default_rl_module_spec, MultiAgentRLModuleSpec): + raise ValueError( + "Cannot merge MultiAgentRLModuleSpec with " + "SingleAgentRLModuleSpec!" + ) + else: + self.rl_module_spec = default_rl_module_spec # make sure the resource requirements for learner_group is valid if self.num_learner_workers == 0 and self.num_gpus_per_worker > 1: @@ -2283,9 +2301,9 @@ def rl_module( Args: rl_module_spec: The RLModule spec to use for this config. It can be either a SingleAgentRLModuleSpec or a MultiAgentRLModuleSpec. If the - observation_space, action_space, or the model_config is not specified - it will be inferred from the env and other parts of the algorithm - config object. + observation_space, action_space, catalog_class, or the model_config is + not specified it will be inferred from the env and other parts of the + algorithm config object. _enable_rl_module_api: Whether to enable the RLModule API for this config. By default if you call `config.rl_module(...)`, the RLModule API will NOT be enabled. 
If you want to enable it, you can call @@ -2833,8 +2851,8 @@ def get_marl_module_spec( module_spec.observation_space = policy_spec.observation_space if module_spec.action_space is None: module_spec.action_space = policy_spec.action_space - if module_spec.model_config is None: - module_spec.model_config = policy_spec.config.get("model", {}) + if module_spec.model_config_dict is None: + module_spec.model_config_dict = policy_spec.config.get("model", {}) return marl_module_spec @@ -3060,7 +3078,7 @@ def _resolve_tf_settings(self, _tf1, _tfv): # Recommend setting tracing to True for speedups. logger.info( f"Executing eagerly (framework='{self.framework_str}')," - f" with eager_tracing={self.framework_str}. For " + f" with eager_tracing={self.eager_tracing}. For " "production workloads, make sure to set eager_tracing=True" " in order to match the speed of tf-static-graph " "(framework='tf'). For debugging purposes, " diff --git a/rllib/algorithms/impala/impala.py b/rllib/algorithms/impala/impala.py index 2f1db2ed74f2f..c8d95bb931411 100644 --- a/rllib/algorithms/impala/impala.py +++ b/rllib/algorithms/impala/impala.py @@ -14,6 +14,7 @@ ImpalaHPs, _reduce_impala_results, ) +from ray.rllib.algorithms.ppo.ppo_catalog import PPOCatalog from ray.rllib.core.learner.learner_group_config import ( LearnerGroupConfig, ModuleSpec, @@ -429,7 +430,9 @@ def get_default_rl_module_spec(self) -> SingleAgentRLModuleSpec: if self.framework_str == "tf2": from ray.rllib.algorithms.ppo.tf.ppo_tf_rl_module import PPOTfRLModule - return SingleAgentRLModuleSpec(module_class=PPOTfRLModule) + return SingleAgentRLModuleSpec( + module_class=PPOTfRLModule, catalog_class=PPOCatalog + ) else: raise ValueError(f"The framework {self.framework_str} is not supported.") @@ -831,9 +834,9 @@ def get_samples_from_workers( timeout_seconds=self._timeout_s_sampler_manager, return_obj_refs=return_object_refs, ) - elif self.workers.local_worker() and ( - self.config.create_env_on_local_worker - or 
self.config.num_rollout_workers == 0 + elif ( + self.workers.local_worker() + and self.workers.local_worker().async_env is not None ): # Sampling from the local worker sample_batch = self.workers.local_worker().sample() diff --git a/rllib/algorithms/impala/tests/tf/test_impala_learner.py b/rllib/algorithms/impala/tests/tf/test_impala_learner.py index bef4067dd124f..61f9c4e6bb923 100644 --- a/rllib/algorithms/impala/tests/tf/test_impala_learner.py +++ b/rllib/algorithms/impala/tests/tf/test_impala_learner.py @@ -89,7 +89,8 @@ def test_impala_loss(self): module_class=algo_config.rl_module_spec.module_class, observation_space=policy.observation_space, action_space=policy.action_space, - model_config=policy.config["model"], + model_config_dict=policy.config["model"], + catalog_class=algo_config.rl_module_spec.catalog_class, ) ) learner_group_config.num_learner_workers = 0 diff --git a/rllib/algorithms/ppo/ppo.py b/rllib/algorithms/ppo/ppo.py index 8ff7e4d106024..dd53cc52b022c 100644 --- a/rllib/algorithms/ppo/ppo.py +++ b/rllib/algorithms/ppo/ppo.py @@ -17,6 +17,7 @@ from ray.rllib.algorithms.algorithm_config import AlgorithmConfig, NotProvided from ray.rllib.algorithms.pg import PGConfig from ray.rllib.algorithms.ppo.ppo_learner_config import PPOLearnerHPs +from ray.rllib.algorithms.ppo.ppo_catalog import PPOCatalog from ray.rllib.core.rl_module.rl_module import SingleAgentRLModuleSpec from ray.rllib.execution.rollout_ops import ( standardize_fields, @@ -129,11 +130,15 @@ def get_default_rl_module_spec(self) -> SingleAgentRLModuleSpec: PPOTorchRLModule, ) - return SingleAgentRLModuleSpec(module_class=PPOTorchRLModule) + return SingleAgentRLModuleSpec( + module_class=PPOTorchRLModule, catalog_class=PPOCatalog + ) elif self.framework_str == "tf2": from ray.rllib.algorithms.ppo.tf.ppo_tf_rl_module import PPOTfRLModule - return SingleAgentRLModuleSpec(module_class=PPOTfRLModule) + return SingleAgentRLModuleSpec( + module_class=PPOTfRLModule, catalog_class=PPOCatalog + ) 
else: raise ValueError(f"The framework {self.framework_str} is not supported.") diff --git a/rllib/algorithms/ppo/ppo_base_rl_module.py b/rllib/algorithms/ppo/ppo_base_rl_module.py index 6bed5d37ecfe5..235384e9ecc44 100644 --- a/rllib/algorithms/ppo/ppo_base_rl_module.py +++ b/rllib/algorithms/ppo/ppo_base_rl_module.py @@ -3,14 +3,11 @@ """ import abc -from typing import Mapping, Any import gymnasium as gym -from ray.rllib.algorithms.ppo.ppo_catalog import PPOCatalog -from ray.rllib.algorithms.ppo.ppo_rl_module_config import PPOModuleConfig from ray.rllib.core.rl_module.rl_module import RLModule, RLModuleConfig -from ray.rllib.utils.annotations import override, ExperimentalAPI +from ray.rllib.utils.annotations import ExperimentalAPI from ray.rllib.utils.gym import convert_old_gym_space_to_gymnasium_space from ray.rllib.core.models.base import ActorCriticEncoder @@ -20,42 +17,18 @@ class PPORLModuleBase(RLModule, abc.ABC): framework = None def __init__(self, config: RLModuleConfig): - super().__init__() - self.config = config - catalog = config.catalog - - assert isinstance(catalog, PPOCatalog), "A PPOCatalog is required for PPO." 
+ super().__init__(config) + catalog = self.config.get_catalog() # Build models from catalog self.encoder = catalog.build_actor_critic_encoder(framework=self.framework) self.pi = catalog.build_pi_head(framework=self.framework) self.vf = catalog.build_vf_head(framework=self.framework) + self.action_dist_cls = catalog.get_action_dist_cls(framework=self.framework) + self._is_discrete = isinstance( convert_old_gym_space_to_gymnasium_space(self.config.action_space), gym.spaces.Discrete, ) assert isinstance(self.encoder, ActorCriticEncoder) - - @classmethod - @override(RLModule) - def from_model_config( - cls, - observation_space: gym.Space, - action_space: gym.Space, - *, - model_config_dict: Mapping[str, Any], - ) -> "PPORLModuleBase": - catalog = PPOCatalog( - observation_space=observation_space, - action_space=action_space, - model_config_dict=model_config_dict, - ) - - config = PPOModuleConfig( - observation_space=observation_space, - action_space=action_space, - catalog=catalog, - ) - - return config.build(framework=cls.framework) diff --git a/rllib/algorithms/ppo/ppo_catalog.py b/rllib/algorithms/ppo/ppo_catalog.py index d1c929cd7070a..57cc9934e6407 100644 --- a/rllib/algorithms/ppo/ppo_catalog.py +++ b/rllib/algorithms/ppo/ppo_catalog.py @@ -2,6 +2,7 @@ from ray.rllib.core.models.catalog import Catalog from ray.rllib.core.models.configs import ActorCriticEncoderConfig, MLPHeadConfig +from ray.rllib.core.models.base import Encoder, ActorCriticEncoder, Model from ray.rllib.utils import override @@ -54,7 +55,8 @@ def __init__( assert len(observation_space.shape) in ( 1, - ), "This simple PPO Module only supports 1D observation spaces." + 3, + ), "This simple PPO Module only supports 1D and 3D observation spaces." 
assert isinstance(action_space, (gym.spaces.Discrete, gym.spaces.Box)), ( "This simple PPO Module only supports Discrete and Box action spaces.", @@ -66,40 +68,27 @@ def __init__( shared=self.model_config_dict["vf_share_layers"], ) - if isinstance(action_space, gym.spaces.Discrete): - pi_output_dim = action_space.n - else: - pi_output_dim = action_space.shape[0] * 2 - post_fcnet_hiddens = self.model_config_dict["post_fcnet_hiddens"] post_fcnet_activation = self.model_config_dict["post_fcnet_activation"] self.pi_head_config = MLPHeadConfig( - input_dim=self.encoder_config.output_dim, + input_dims=self.latent_dims, hidden_layer_dims=post_fcnet_hiddens, hidden_layer_activation=post_fcnet_activation, output_activation="linear", - output_dim=pi_output_dim, + output_dims=None, # We don't know the output dimension yet, because it + # depends on the action distribution input dimension ) self.vf_head_config = MLPHeadConfig( - input_dim=self.encoder_config.output_dim, + input_dims=self.latent_dims, hidden_layer_dims=post_fcnet_hiddens, hidden_layer_activation=post_fcnet_activation, output_activation="linear", - output_dim=1, + output_dims=[1], ) - # Set input- and output dimensions to fit PPO's needs. - self.encoder_config.input_dim = observation_space.shape[0] - self.pi_head_config.input_dim = self.encoder_config.output_dim - if isinstance(action_space, gym.spaces.Discrete): - self.pi_head_config.output_dim = int(action_space.n) - else: - self.pi_head_config.output_dim = int(action_space.shape[0] * 2) - self.vf_head_config.output_dim = 1 - - def build_actor_critic_encoder(self, framework: str): + def build_actor_critic_encoder(self, framework: str) -> ActorCriticEncoder: """Builds the ActorCriticEncoder. The default behavior is to build the encoder from the encoder_config. 
@@ -115,7 +104,7 @@ def build_actor_critic_encoder(self, framework: str): return self.actor_critic_encoder_config.build(framework=framework) @override(Catalog) - def build_encoder(self, framework: str): + def build_encoder(self, framework: str) -> Encoder: """Builds the encoder. Since PPO uses an ActorCriticEncoder, this method should not be implemented. @@ -124,7 +113,7 @@ def build_encoder(self, framework: str): "Use PPOCatalog.build_actor_critic_encoder() instead." ) - def build_pi_head(self, framework: str): + def build_pi_head(self, framework: str) -> Model: """Builds the policy head. The default behavior is to build the head from the pi_head_config. @@ -137,9 +126,16 @@ def build_pi_head(self, framework: str): Returns: The policy head. """ + # Get action_distribution_cls to find out about the output dimension for pi_head + action_distribution_cls = self.get_action_dist_cls(framework=framework) + self.pi_head_config.output_dims = ( + action_distribution_cls.required_model_output_shape( + space=self.action_space, model_config=self.model_config_dict + ) + ) return self.pi_head_config.build(framework=framework) - def build_vf_head(self, framework: str): + def build_vf_head(self, framework: str) -> Model: """Builds the value function head. The default behavior is to build the head from the vf_head_config. diff --git a/rllib/algorithms/ppo/ppo_rl_module_config.py b/rllib/algorithms/ppo/ppo_rl_module_config.py deleted file mode 100644 index f07a87d1d99e4..0000000000000 --- a/rllib/algorithms/ppo/ppo_rl_module_config.py +++ /dev/null @@ -1,43 +0,0 @@ -from dataclasses import dataclass - -import gymnasium as gym - -from ray.rllib.core.models.catalog import Catalog -from ray.rllib.core.rl_module.rl_module import RLModuleConfig -from ray.rllib.utils.annotations import ExperimentalAPI - - -@ExperimentalAPI -@dataclass -class PPOModuleConfig(RLModuleConfig): - """Configuration for the PPORLModule. 
- - Attributes: - observation_space: The observation space of the environment. - action_space: The action space of the environment. - catalog: The PPOCatalog object to use for building the models. - """ - - observation_space: gym.Space = None - action_space: gym.Space = None - catalog: Catalog = None - - def build(self, framework: str): - """Builds a PPORLModule. - - Args: - framework: The framework to use for the module. - - Returns: - PPORLModule: The module. - """ - if framework == "torch": - from ray.rllib.algorithms.ppo.torch.ppo_torch_rl_module import ( - PPOTorchRLModule, - ) - - return PPOTorchRLModule(self) - else: - from ray.rllib.algorithms.ppo.tf.ppo_tf_rl_module import PPOTfRLModule - - return PPOTfRLModule(self) diff --git a/rllib/algorithms/ppo/tests/test_ppo_learner.py b/rllib/algorithms/ppo/tests/test_ppo_learner.py index a5a3482e84334..12e910ed85996 100644 --- a/rllib/algorithms/ppo/tests/test_ppo_learner.py +++ b/rllib/algorithms/ppo/tests/test_ppo_learner.py @@ -6,6 +6,7 @@ import tree # pip install dm-tree import ray.rllib.algorithms.ppo as ppo +from ray.rllib.algorithms.ppo.ppo_catalog import PPOCatalog from ray.rllib.core.rl_module.rl_module import SingleAgentRLModuleSpec from ray.rllib.policy.sample_batch import SampleBatch @@ -102,7 +103,8 @@ def test_loss(self): module_class=algo_config.rl_module_spec.module_class, observation_space=policy.observation_space, action_space=policy.action_space, - model_config=policy.config["model"], + model_config_dict=policy.config["model"], + catalog_class=PPOCatalog, ) ) learner_group = learner_group_config.build() diff --git a/rllib/algorithms/ppo/tests/test_ppo_rl_module.py b/rllib/algorithms/ppo/tests/test_ppo_rl_module.py index 0eb56db3b32fc..d648d99c5b5d9 100644 --- a/rllib/algorithms/ppo/tests/test_ppo_rl_module.py +++ b/rllib/algorithms/ppo/tests/test_ppo_rl_module.py @@ -6,17 +6,18 @@ import tensorflow as tf import torch import tree -from ray.rllib.algorithms.ppo.ppo_catalog import PPOCatalog -from 
ray.rllib.algorithms.ppo.ppo_rl_module_config import PPOModuleConfig import ray from ray.rllib import SampleBatch +from ray.rllib.algorithms.ppo.ppo_catalog import PPOCatalog from ray.rllib.algorithms.ppo.tf.ppo_tf_rl_module import ( PPOTfRLModule, ) from ray.rllib.algorithms.ppo.torch.ppo_torch_rl_module import ( PPOTorchRLModule, ) +from ray.rllib.core.rl_module.rl_module import RLModuleConfig +from ray.rllib.models.preprocessors import get_preprocessor from ray.rllib.utils.numpy import convert_to_numpy from ray.rllib.utils.torch_utils import convert_to_torch_tensor @@ -24,8 +25,8 @@ def get_expected_module_config( env: gym.Env, model_config_dict: dict, - observation_space: gym.Space, -) -> PPOModuleConfig: + observation_space: gym.spaces.Space, +) -> RLModuleConfig: """Get a PPOModuleConfig that we would expect from the catalog otherwise. Args: @@ -36,17 +37,14 @@ def get_expected_module_config( Returns: A PPOModuleConfig containing the relevant configs to build PPORLModule """ - catalog = PPOCatalog( + config = RLModuleConfig( observation_space=observation_space, action_space=env.action_space, model_config_dict=model_config_dict, + catalog_class=PPOCatalog, ) - return PPOModuleConfig( - observation_space=env.observation_space, - action_space=env.action_space, - catalog=catalog, - ) + return config def dummy_torch_ppo_loss(batch, fwd_out): @@ -124,9 +122,9 @@ def tearDownClass(cls): ray.shutdown() def test_rollouts(self): - # TODO: Add ALE/Breakout-v5 to cover a 3D obs space + # TODO: Add FrozenLake-v1 to cover LSTM case. frameworks = ["torch", "tf2"] - env_names = ["CartPole-v1", "Pendulum-v1"] + env_names = ["CartPole-v1", "Pendulum-v1", "ALE/Breakout-v5"] fwd_fns = ["forward_exploration", "forward_inference"] # TODO(Artur): Re-enable LSTM lstm = [False] @@ -136,15 +134,26 @@ def test_rollouts(self): if lstm and fw == "tf2": # LSTM not implemented in TF2 yet continue + if env_name == "ALE/Breakout-v5" and fw == "tf2": + # TODO(Artur): Implement CNN in TF2. 
+ continue print(f"[FW={fw} | [ENV={env_name}] | [FWD={fwd_fn}] | LSTM" f"={lstm}") - env = gym.make(env_name) + if env_name.startswith("ALE/"): + env = gym.make("GymV26Environment-v0", env_id=env_name) + else: + env = gym.make(env_name) + + preprocessor_cls = get_preprocessor(env.observation_space) + preprocessor = preprocessor_cls(env.observation_space) + module = _get_ppo_module( framework=fw, env=env, lstm=lstm, - observation_space=env.observation_space, + observation_space=preprocessor.observation_space, ) obs, _ = env.reset() + obs = preprocessor.transform(obs) batch = _get_input_batch_from_obs(fw, obs) @@ -162,9 +171,9 @@ def test_rollouts(self): module.forward_inference(batch) def test_forward_train(self): - # TODO: Add ALE/Breakout-v5 to cover a 3D obs space + # TODO: Add FrozenLake-v1 to cover LSTM case. frameworks = ["torch", "tf2"] - env_names = ["CartPole-v1", "Pendulum-v1"] + env_names = ["CartPole-v1", "Pendulum-v1", "ALE/Breakout-v5"] # TODO(Artur): Re-enable LSTM lstm = [False] config_combinations = [frameworks, env_names, lstm] @@ -174,21 +183,29 @@ def test_forward_train(self): # LSTM not implemented in TF2 yet continue if env_name == "ALE/Breakout-v5" and fw == "tf2": - # CNN not implement in TF2 yet + # TODO(Artur): Implement CNN in TF2. continue print(f"[FW={fw} | [ENV={env_name}] | LSTM={lstm}") - env = gym.make(env_name) + # TODO(Artur): Figure out why this is needed and fix it. 
+ if env_name.startswith("ALE/"): + env = gym.make("GymV26Environment-v0", env_id=env_name) + else: + env = gym.make(env_name) + + preprocessor_cls = get_preprocessor(env.observation_space) + preprocessor = preprocessor_cls(env.observation_space) module = _get_ppo_module( framework=fw, env=env, lstm=lstm, - observation_space=env.observation_space, + observation_space=preprocessor.observation_space, ) # collect a batch of data batches = [] obs, _ = env.reset() + obs = preprocessor.transform(obs) tstep = 0 # TODO (Artur): Un-uncomment once Policy supports RNN # state_in = module.get_initial_state() @@ -206,6 +223,7 @@ def test_forward_train(self): fwd_out = module.forward_exploration(input_batch) action = convert_to_numpy(fwd_out["action_dist"].sample()[0]) new_obs, reward, terminated, truncated, _ = env.step(action) + new_obs = preprocessor.transform(new_obs) output_batch = { SampleBatch.OBS: obs, SampleBatch.NEXT_OBS: new_obs, diff --git a/rllib/algorithms/ppo/tests/test_ppo_with_rl_module.py b/rllib/algorithms/ppo/tests/test_ppo_with_rl_module.py index ab32f7460f6e0..5d39e836ae679 100644 --- a/rllib/algorithms/ppo/tests/test_ppo_with_rl_module.py +++ b/rllib/algorithms/ppo/tests/test_ppo_with_rl_module.py @@ -115,8 +115,11 @@ def test_ppo_compilation_and_schedule_mixins(self): for fw in framework_iterator( config, frameworks=("torch", "tf2"), with_eager_tracing=True ): - # TODO (Kourosh) Bring back "FrozenLake-v1" and "MsPacmanNoFrameskip-v4" - for env in ["CartPole-v1", "Pendulum-v1"]: + # TODO (Kourosh) Bring back "FrozenLake-v1" + for env in ["CartPole-v1", "Pendulum-v1", "ALE/Breakout-v5"]: + if env == "ALE/Breakout-v5" and fw == "tf2": + # TODO(Artur): Implement CNN in TF2. 
+ continue print("Env={}".format(env)) # TODO (Kourosh, Avnishn): for now just do lstm=False for lstm in [False]: diff --git a/rllib/core/learner/learner.py b/rllib/core/learner/learner.py index 468fef74eb0c6..a3ea1cb91c7aa 100644 --- a/rllib/core/learner/learner.py +++ b/rllib/core/learner/learner.py @@ -135,7 +135,7 @@ class Learner: module_class=MyModule, observation_space=env.observation_space, action_space=env.action_space, - model_config = {"hidden": [128, 128]} + model_config_dict = {"hidden": [128, 128]} ) # create a learner instance that will train the module @@ -154,7 +154,7 @@ class Learner: module_class=NewPlayerModule, observation_space=env.observation_space, action_space=env.action_space, - model_config = {"hidden": [128, 128]} + model_config_dict = {"hidden": [128, 128]} ) ) diff --git a/rllib/core/learner/tests/test_learner.py b/rllib/core/learner/tests/test_learner.py index 04b7e8fbe76cb..4b6932c2b5347 100644 --- a/rllib/core/learner/tests/test_learner.py +++ b/rllib/core/learner/tests/test_learner.py @@ -23,7 +23,7 @@ def get_learner() -> Learner: module_class=DiscreteBCTFModule, observation_space=env.observation_space, action_space=env.action_space, - model_config={"fcnet_hiddens": [32]}, + model_config_dict={"fcnet_hiddens": [32]}, ), optimizer_config={"lr": 1e-3}, learner_scaling_config=LearnerGroupScalingConfig(), @@ -128,7 +128,7 @@ def set_optimizer_fn(module): module_class=DiscreteBCTFModule, observation_space=env.observation_space, action_space=env.action_space, - model_config={"fcnet_hiddens": [16]}, + model_config_dict={"fcnet_hiddens": [16]}, ), set_optimizer_fn=set_optimizer_fn, ) diff --git a/rllib/core/learner/tests/test_learner_group_config.py b/rllib/core/learner/tests/test_learner_group_config.py index 476ba86f2cec9..7bf543647b069 100644 --- a/rllib/core/learner/tests/test_learner_group_config.py +++ b/rllib/core/learner/tests/test_learner_group_config.py @@ -46,7 +46,7 @@ def test_learner_group_build_from_algorithm_config(self): 
module_class=DiscreteBCTFModule, observation_space=env.observation_space, action_space=env.action_space, - model_config={"fcnet_hiddens": [32]}, + model_config_dict={"fcnet_hiddens": [32]}, ) ) learner_group_config.build() diff --git a/rllib/core/learner/torch/tests/test_torch_learner.py b/rllib/core/learner/torch/tests/test_torch_learner.py index ef12b17437137..8eb1ed1b62b52 100644 --- a/rllib/core/learner/torch/tests/test_torch_learner.py +++ b/rllib/core/learner/torch/tests/test_torch_learner.py @@ -114,7 +114,7 @@ def set_optimizer_fn(module): module_class=DiscreteBCTorchModule, observation_space=env.observation_space, action_space=env.action_space, - model_config={"fcnet_hiddens": [16]}, + model_config_dict={"fcnet_hiddens": [16]}, ), set_optimizer_fn=set_optimizer_fn, ) diff --git a/rllib/core/models/base.py b/rllib/core/models/base.py index 88a3f79e86290..6d45a1ea8613e 100644 --- a/rllib/core/models/base.py +++ b/rllib/core/models/base.py @@ -1,6 +1,6 @@ import abc from dataclasses import dataclass -from typing import List, Union +from typing import List, Union, Tuple from ray.rllib import SampleBatch from ray.rllib.models.specs.specs_base import Spec @@ -30,8 +30,15 @@ class ModelConfig(abc.ABC): It is therefore a means of configuration for RLModules. However, ModelConfigs are not restricted to be used only with Catalog or RLModules. A usage Example together with a Model can be found in the Model. + + Args: + input_dims: The input dimensions of the network + output_dims: The output dimensions of the network. """ + input_dims: Union[List[int], Tuple[int]] = None + output_dims: Union[List[int], Tuple[int]] = None + @abc.abstractmethod def build(self, framework: str): """Builds the model. @@ -195,7 +202,8 @@ class Encoder(Model, abc.ABC): Similarly, their output_spec contains the latent space dimensions. Encoders can be recurrent, in which case the state should be part of input- and output_specs. 
The latents that are produced by an encoder are fed into subsequent - heads. + heads. Any implementation of Encoder should also be callable. This should be done + by also inheriting from a framework-specific model base-class, s.a. TorchModel. Abstract illustration of typical flow of tensors: diff --git a/rllib/core/models/catalog.py b/rllib/core/models/catalog.py index 991244a6c5307..4230d28f5b113 100644 --- a/rllib/core/models/catalog.py +++ b/rllib/core/models/catalog.py @@ -1,13 +1,21 @@ +from typing import Optional, Mapping, Any +import functools + +import numpy as np import gymnasium as gym +from gymnasium.spaces import Box, Dict, Discrete, MultiDiscrete, Tuple + +from ray.rllib.core.models.base import ModelConfig +from ray.rllib.core.models.base import Encoder from ray.rllib.core.models.configs import ( MLPEncoderConfig, LSTMEncoderConfig, CNNEncoderConfig, ) -from ray.rllib.core.models.base import ModelConfig from ray.rllib.models import MODEL_DEFAULTS -from gymnasium.spaces import Box from ray.rllib.models.utils import get_filter_config +from ray.rllib.utils.error import UnsupportedSpaceException +from ray.rllib.utils.spaces.simplex import Simplex class Catalog: @@ -40,8 +48,8 @@ def __init__( super().__init__(observation_space, action_space, model_config_dict) self.my_model_config_dict = MLPHeadConfig( hidden_layer_dims=[64, 32], - input_dim=self.observation_space.shape[0], - output_dim=1, + input_dims=[self.observation_space.shape[0]], + output_dims=[1], ) def build_my_head(self, framework: str): @@ -81,18 +89,56 @@ def __init__( self.model_config_dict = {**MODEL_DEFAULTS, **model_config_dict} self.view_requirements = view_requirements - # Produce a basic encoder config. + self._latent_dims = None + + # Overwrite this post-init hook in subclasses + self.__post_init__() + + @property + def latent_dims(self): + """Returns the latent dimensions of the encoder. + + This establishes an agreement between encoder and heads about the latent + dimensions. 
Encoders can be built to output a latent tensor with + `latent_dims` dimensions, and heads can be built with tensors of + `latent_dims` dimensions as inputs. This can be safely ignored if this + agreement is not needed in case of modifications to the Catalog. + + Returns: + The latent dimensions of the encoder. + """ + return self._latent_dims + + @latent_dims.setter + def latent_dims(self, value): + self._latent_dims = value + + def __post_init__(self): + """Post-init hook for subclasses to override. + + This makes it so that subclasses are not forced to create an encoder config + if the rest of their catalog is not dependent on it or if it breaks. + At the end of Catalog initialization, an attribute `Catalog.latent_dims` + should be set so that heads can be built using that information. + """ self.encoder_config = self.get_encoder_config( - observation_space=observation_space, - action_space=action_space, - model_config_dict=model_config_dict, - view_requirements=view_requirements, + observation_space=self.observation_space, + action_space=self.action_space, + model_config_dict=self.model_config_dict, + view_requirements=self.view_requirements, ) + + # Create a function that can be called when framework is known to retrieve the + # class type for action distributions + self.action_dist_class_fn = functools.partial( + self.get_dist_cls_from_action_space, action_space=self.action_space + ) + # The dimensions of the latent vector that is output by the encoder and fed # to the heads. - self.latent_dim = self.encoder_config.output_dim + self.latent_dims = self.encoder_config.output_dims - def build_encoder(self, framework: str): + def build_encoder(self, framework: str) -> Encoder: """Builds the encoder. By default this method builds an encoder instance from Catalog.encoder_config. @@ -103,8 +149,35 @@ def build_encoder(self, framework: str): Returns: The encoder. 
""" + assert hasattr(self, "encoder_config"), ( + "You must define a `Catalog.encoder_config` attribute in your Catalog " + "subclass or override the `Catalog.build_encoder` method. By default, " + "an encoder_config is created in the __post_init__ method." + ) return self.encoder_config.build(framework=framework) + def get_action_dist_cls(self, framework: str): + """Get the action distribution class. + + The default behavior is to get the action distribution from the + `Catalog.action_dist_class_fn`. This can be overridden to build a custom action + distribution as a means of configuring the behavior of a PPORLModuleBase + implementation. + + Args: + framework: The framework to use. Either "torch" or "tf". + + Returns: + The action distribution. + """ + assert hasattr(self, "action_dist_class_fn"), ( + "You must define a `Catalog.action_dist_class_fn` attribute in your " + "Catalog subclass or override the `Catalog.action_dist_class_fn` method. " + "By default, an action_dist_class_fn is created in the __post_init__ " + "method." 
+ ) + return self.action_dist_class_fn(framework=framework) + @classmethod def get_encoder_config( cls, @@ -149,20 +222,22 @@ def get_encoder_config( encoder_latent_dim = ( model_config_dict["encoder_latent_dim"] or fcnet_hiddens[-1] ) + use_lstm = model_config_dict["use_lstm"] + use_attention = model_config_dict["use_attention"] - if model_config_dict["use_lstm"]: + if use_lstm: encoder_config = LSTMEncoderConfig( hidden_dim=model_config_dict["lstm_cell_size"], batch_first=not model_config_dict["_time_major"], num_layers=1, - output_dim=model_config_dict["lstm_cell_size"], + output_dims=[model_config_dict["lstm_cell_size"]], output_activation=output_activation, observation_space=observation_space, action_space=action_space, view_requirements_dict=view_requirements, get_tokenizer_config=cls.get_tokenizer_config, ) - elif model_config_dict["use_attention"]: + elif use_attention: raise NotImplementedError else: # TODO (Artur): Maybe check for original spaces here @@ -176,10 +251,10 @@ def get_encoder_config( else: hidden_layer_dims = model_config_dict["fcnet_hiddens"][:-1] encoder_config = MLPEncoderConfig( - input_dim=observation_space.shape[0], + input_dims=[observation_space.shape[0]], hidden_layer_dims=hidden_layer_dims, hidden_layer_activation=activation, - output_dim=encoder_latent_dim, + output_dims=[encoder_latent_dim], output_activation=output_activation, ) @@ -197,12 +272,18 @@ def get_encoder_config( filter_specifiers=model_config_dict["conv_filters"], filter_layer_activation=activation, output_activation=output_activation, - output_dim=encoder_latent_dim, + output_dims=[encoder_latent_dim], ) # input_space is a possibly nested structure of spaces. else: # NestedModelConfig - raise NotImplementedError("No default config for complex spaces yet!") + raise ValueError( + f"No default encoder config for " + f"obs space={observation_space}," + f" lstm={use_lstm} and " + f"attention={use_attention} " + f"found." 
+ ) return encoder_config @@ -224,3 +305,107 @@ def get_tokenizer_config( **{"use_lstm": False, "use_attention": False}, }, ) + + @classmethod + def get_dist_cls_from_action_space( + cls, + action_space: gym.Space, + *, + framework: Optional[str] = None, + deterministic: Optional[bool] = False, + ) -> Mapping[str, Any]: + """Returns a distribution class for the given action space. + + You can get the required input dimension for the distribution by calling + `action_dict_cls.required_model_output_shape(action_space, model_config_dict)` + on the retrieved class. This is useful, because the Catalog needs to find out + about the required input dimension for the distribution before the model that + outputs these inputs is configured. + + Args: + action_space: Action space of the target gym env. + framework: The framework to use. + deterministic: Whether to return a Deterministic distribution on input + logits instead of a stochastic distributions. For example for Discrete + spaces, the stochastic is a Categorical distribution with output logits, + while the deterministic distribution will be to output the argmax of + logits directly. + + + Returns: + The distribution class for the given action space. + """ + + if framework == "torch": + from ray.rllib.models.torch.torch_distributions import ( + TorchCategorical, + TorchDeterministic, + TorchDiagGaussian, + ) + + distribution_dicts = { + "deterministic": TorchDeterministic, + "gaussian": TorchDiagGaussian, + "categorical": TorchCategorical, + } + elif framework == "tf": + from ray.rllib.models.tf.tf_distributions import ( + TfCategorical, + TfDeterministic, + TfDiagGaussian, + ) + + distribution_dicts = { + "deterministic": TfDeterministic, + "gaussian": TfDiagGaussian, + "categorical": TfCategorical, + } + else: + raise ValueError( + f"Unknown framework: {framework}. Only 'torch' and 'tf2' are " + "supported for RLModule Catalogs." + ) + + # Box space -> DiagGaussian OR Deterministic. 
+ if isinstance(action_space, Box): + if action_space.dtype.char in np.typecodes["AllInteger"]: + raise ValueError( + "Box(..., `int`) action spaces are not supported. " + "Use MultiDiscrete or Box(..., `float`)." + ) + else: + if len(action_space.shape) > 1: + raise UnsupportedSpaceException( + "Action space has multiple dimensions " + "{}. ".format(action_space.shape) + + "Consider reshaping this into a single dimension, " + "using a custom action distribution, " + "using a Tuple action space, or the multi-agent API." + ) + if deterministic: + return distribution_dicts["deterministic"] + else: + return distribution_dicts["gaussian"] + + # Discrete Space -> Categorical. + elif isinstance(action_space, Discrete): + return distribution_dicts["categorical"] + + # Tuple/Dict Spaces -> MultiAction. + elif isinstance(action_space, (Tuple, Dict)): + # TODO(Artur): Supported Tuple/Dict. + raise NotImplementedError("Tuple/Dict spaces not yet supported.") + + # Simplex -> Dirichlet. + elif isinstance(action_space, Simplex): + # TODO(Artur): Supported Simplex (in torch). + raise NotImplementedError("Simplex action space not yet supported.") + + # MultiDiscrete -> MultiCategorical. + elif isinstance(action_space, MultiDiscrete): + # TODO(Artur): Support multi-discrete. + raise NotImplementedError("MultiDiscrete spaces not yet supported.") + + # Unknown type -> Error. + else: + raise NotImplementedError(f"Unsupported action space: `{action_space}`") diff --git a/rllib/core/models/configs.py b/rllib/core/models/configs.py index eccb998e61568..5135e7e950e87 100644 --- a/rllib/core/models/configs.py +++ b/rllib/core/models/configs.py @@ -71,19 +71,19 @@ class MLPHeadConfig(ModelConfig): The configured MLP encodes 1D-observations into a latent space. The stack of layers is composed of a sequence of linear layers. The first layer - has `input_dim` inputs and the last layer has `output_dim` outputs. 
The number of + has `input_dims` inputs and the last layer has `output_dims` outputs. The number of units inbetween is determined by `hidden_layer_dims`. If `hidden_layer_dims` is - None, there is only one linear layer with `input_dim` inputs and `output_dim` + None, there is only one linear layer with `input_dims` inputs and `output_dims` outputs. Each layer is followed by an activation function as per this config. See ModelConfig for usage details. Example: Configuration: - input_dim = 4 + input_dims = [4] hidden_layer_dims = [8, 8] hidden_layer_activation = "relu" - output_dim = 2 + output_dims = [2] output_activation = "linear" Resulting stack in pseudocode: @@ -94,19 +94,15 @@ class MLPHeadConfig(ModelConfig): Linear(8, 2) Attributes: - input_dim: The input dimension of the network. It cannot be None. hidden_layer_dims: The sizes of the hidden layers. hidden_layer_activation: The activation function to use after each layer ( except for the output). output_activation: The activation function to use for the output layer. - output_dim: The output dimension of the network. """ - input_dim: int = None hidden_layer_dims: List[int] = field(default_factory=lambda: [256, 256]) hidden_layer_activation: str = "relu" output_activation: str = "linear" - output_dim: int = None @_framework_implemented() def build(self, framework: str = "torch") -> Model: @@ -137,7 +133,7 @@ class CNNEncoderConfig(ModelConfig): The stack of layers is composed of a sequence of convolutional layers. `input_dims` describes the shape of the input tensor. Beyond that, each layer specified by `filter_specifiers` is followed by an activation function according - to `filter_activation`. The `output_dim` is reached by flattening a final + to `filter_activation`. `output_dims` is reached by flattening a final convolutional layer and applying a linear layer with `output_activation`. See ModelConfig for usage details. 
@@ -150,7 +146,7 @@ class CNNEncoderConfig(ModelConfig): [32, [4, 4], 2], ] filter_activation = "relu" - output_dim = 256 + output_dims = [256] output_activation = "linear" Resulting stack in pseudocode: @@ -172,9 +168,6 @@ class CNNEncoderConfig(ModelConfig): filter_layer_activation: The activation function to use after each layer ( except for the output). output_activation: The activation function to use for the output layer. - output_dim: The output dimension. We append a final convolutional layer - depth-only filters that is flattened and a final linear layer to achieve - this dimension regardless of the previous filters. """ input_dims: Union[List[int], Tuple[int]] = None @@ -183,7 +176,6 @@ class CNNEncoderConfig(ModelConfig): ) filter_layer_activation: str = "relu" output_activation: str = "linear" - output_dim: int = None @_framework_implemented(tf2=False) def build(self, framework: str = "torch") -> Model: @@ -263,7 +255,6 @@ class LSTMEncoderConfig(ModelConfig): action_space: gym.Space = None view_requirements_dict: ViewRequirementsDict = None get_tokenizer_config: Callable[[gym.Space, Dict], ModelConfig] = None - output_dim: int = None @_framework_implemented(tf2=False) def build(self, framework: str = "torch") -> Encoder: diff --git a/rllib/core/models/tests/test_catalog.py b/rllib/core/models/tests/test_catalog.py index b9926083c1ec2..73668020e5812 100644 --- a/rllib/core/models/tests/test_catalog.py +++ b/rllib/core/models/tests/test_catalog.py @@ -1,18 +1,38 @@ import itertools import unittest +import functools +from collections import namedtuple -import gym +import gymnasium as gym import numpy as np import tree -from gymnasium.spaces import Box +from gymnasium.spaces import Box, Discrete +from ray.rllib.algorithms.ppo.ppo import PPOConfig +from ray.rllib.core.models.torch.base import TorchModel +from ray.rllib.core.models.base import ModelConfig, Encoder +from ray.rllib.algorithms.ppo.torch.ppo_torch_rl_module import PPOTorchRLModule +from 
ray.rllib.algorithms.ppo.ppo_catalog import PPOCatalog from ray.rllib.core.models.base import STATE_IN, ENCODER_OUT, STATE_OUT -from ray.rllib.core.models.catalog import Catalog from ray.rllib.core.models.configs import MLPEncoderConfig, CNNEncoderConfig +from ray.rllib.core.models.catalog import Catalog +from ray.rllib.core.rl_module.rl_module import SingleAgentRLModuleSpec +from ray.rllib.models import MODEL_DEFAULTS +from ray.rllib.models.tf.tf_distributions import ( + TfCategorical, + TfDeterministic, + TfDiagGaussian, +) +from ray.rllib.models.torch.torch_distributions import ( + TorchCategorical, + TorchDeterministic, + TorchDiagGaussian, +) from ray.rllib.policy.sample_batch import SampleBatch from ray.rllib.utils.framework import try_import_tf from ray.rllib.utils.framework import try_import_torch from ray.rllib.utils.spaces.space_utils import get_dummy_batch_for_space +from ray.rllib.utils.test_utils import framework_iterator from ray.rllib.utils.torch_utils import convert_to_torch_tensor _, tf, _ = try_import_tf() @@ -51,9 +71,9 @@ def _check_model_outputs(self, model, framework, model_config_dict, input_space) } outputs = model(inputs) - assert outputs[ENCODER_OUT].shape == (32, latent_dim) + self.assertEqual(outputs[ENCODER_OUT].shape, (32, latent_dim)) tree.map_structure_with_path( - lambda p, v: self.assertTrue(v.shape == states[p].shape), + lambda p, v: self.assertEqual(v.shape, states[p].shape), outputs[STATE_OUT], ) @@ -164,7 +184,7 @@ def test_get_encoder_config(self): model_config = catalog.get_encoder_config( observation_space=input_space, model_config_dict=model_config_dict ) - assert type(model_config) == model_config_type + self.assertEqual(type(model_config), model_config_type) model = model_config.build(framework=framework) # Do a forward pass and check if the output has the correct shape @@ -173,6 +193,164 @@ def test_get_encoder_config(self): # TODO(Artur): Add support for composite spaces and test here # Today, Catalog does not handle 
composite spaces, so we can't test them + def test_get_dist_cls_from_action_space(self): + """Tests if we can create a bunch of action distributions. + + Action distributions are created from the base catalog class. Things this + test checks: + - Whether we output the correct action distributions classes. + - Whether we can instantiate the action distributions, query their + required input dimensions and sample from them. + + """ + TestConfig = namedtuple( + "TestConfig", ("action_space", "deterministic", "expected_dist_cls_dict") + ) + test_configs = [ + TestConfig( + Box(-np.inf, np.inf, (7,), dtype=np.float32), + False, + {"torch": TorchDiagGaussian, "tf": TfDiagGaussian}, + ), + TestConfig( + Box(-np.inf, np.inf, (7,), dtype=np.float32), + True, + {"torch": TorchDeterministic, "tf": TfDeterministic}, + ), + TestConfig( + Discrete(5), None, {"torch": TorchCategorical, "tf": TfCategorical} + ), + ] + + for ( + action_space, + deterministic, + expected_cls_dict, + ) in test_configs: + print( + f"Testing action space: {action_space} and deterministic:" + f" {deterministic}" + ) + catalog = Catalog( + observation_space=Box(-1.0, 1.0, (84, 84, 1), dtype=np.float32), + action_space=action_space, + model_config_dict=MODEL_DEFAULTS.copy(), + ) + + for framework in framework_iterator(frameworks=["tf2", "torch"]): + + if framework == "tf2": + framework = "tf" + + dist_cls = catalog.get_dist_cls_from_action_space( + action_space=action_space, + deterministic=deterministic, + framework=framework, + ) + + # Check if we can query the required input dimensions + input_shape = expected_cls_dict[framework].required_model_output_shape( + action_space, model_config=MODEL_DEFAULTS.copy() + ) + logits = np.ones((32, *input_shape), dtype=np.float32) + if framework == "torch": + logits = torch.from_numpy(logits) + else: + logits = tf.convert_to_tensor(logits) + # We don't need a model if we input tensors + dist = dist_cls.from_logits(logits=logits) + self.assertTrue(isinstance(dist, 
expected_cls_dict[framework])) + actions = dist.sample() + self.assertTrue(action_space.contains(actions.numpy()[0])) + + def test_customize_catalog_from_algorithm_config(self): + """Test if we can pass catalog to algorithm config and it ends up inside + RLModule and is used to build models there.""" + + class MyCatalog(PPOCatalog): + def build_vf_head(self, framework): + return torch.nn.Linear(self.latent_dims[0], 1) + + config = ( + PPOConfig() + .rl_module(rl_module_spec=SingleAgentRLModuleSpec(catalog_class=MyCatalog)) + .framework("torch") + ) + + algo = config.build(env="CartPole-v0") + self.assertEqual( + algo.get_policy("default_policy").model.config.catalog_class, MyCatalog + ) + + # Test if we can pass custom catalog to algorithm config and train with it. + + config = ( + PPOConfig() + .rl_module( + rl_module_spec=SingleAgentRLModuleSpec( + module_class=PPOTorchRLModule, catalog_class=MyCatalog + ) + ) + .framework("torch") + ) + + algo = config.build(env="CartPole-v0") + algo.train() + + def test_post_init_overwrite(self): + """Test if we can overwrite post_init method of a catalog class. + + This tests: + - Defines a custom encoder and its config. + - Defines a custom catalog class that uses the custom encoder by + overwriting the __post_init__ method and defining a custom + Catalog.encoder_config. + - Defines a custom RLModule that uses the custom catalog. + - Runs a forward pass through the custom RLModule to check if + everything is working together as expected. 
+ + """ + env = gym.make("CartPole-v0") + + class MyCostumTorchEncoderConfig(ModelConfig): + def build(self, framework): + return MyCostumTorchEncoder() + + class MyCostumTorchEncoder(TorchModel, Encoder): + def __init__(self): + super().__init__({}) + self.net = torch.nn.Linear(env.observation_space.shape[0], 10) + + def _forward(self, input_dict, **kwargs): + return { + ENCODER_OUT: (self.net(input_dict["obs"])), + STATE_OUT: None, + } + + class MyCustomCatalog(PPOCatalog): + def __post_init__(self): + self.action_dist_class_fn = functools.partial( + self.get_dist_cls_from_action_space, action_space=self.action_space + ) + self.latent_dims = (10,) + self.encoder_config = MyCostumTorchEncoderConfig( + input_dims=self.observation_space.shape, + output_dims=self.latent_dims, + ) + + spec = SingleAgentRLModuleSpec( + module_class=PPOTorchRLModule, + observation_space=env.observation_space, + action_space=env.action_space, + model_config_dict=MODEL_DEFAULTS.copy(), + catalog_class=MyCustomCatalog, + ) + module = spec.build() + + module.forward_inference( + input_data={"obs": torch.ones((32, *env.observation_space.shape))} + ) + if __name__ == "__main__": import pytest diff --git a/rllib/core/models/tf/encoder.py b/rllib/core/models/tf/encoder.py index 0ca9c4e285b1a..15332244bb267 100644 --- a/rllib/core/models/tf/encoder.py +++ b/rllib/core/models/tf/encoder.py @@ -1,7 +1,5 @@ from typing import Union -import tree - from ray.rllib.core.models.base import ( Encoder, ActorCriticEncoder, @@ -15,7 +13,6 @@ from ray.rllib.models.specs.specs_base import Spec from ray.rllib.models.specs.specs_dict import SpecDict from ray.rllib.models.specs.specs_tf import TFTensorSpecs -from ray.rllib.policy.rnn_sequencing import add_time_dimension from ray.rllib.policy.sample_batch import SampleBatch from ray.rllib.utils.annotations import override from ray.rllib.utils.framework import try_import_torch @@ -31,9 +28,9 @@ def __init__(self, config: ModelConfig) -> None: # Create the neural 
networks self.net = TfMLP( - input_dim=config.input_dim, + input_dim=config.input_dims[0], hidden_layer_dims=config.hidden_layer_dims, - output_dim=config.output_dim, + output_dim=config.output_dims[0], hidden_layer_activation=config.hidden_layer_activation, ) @@ -41,7 +38,7 @@ def __init__(self, config: ModelConfig) -> None: def get_input_spec(self) -> Union[Spec, None]: return SpecDict( { - SampleBatch.OBS: TFTensorSpecs("b, h", h=self.config.input_dim), + SampleBatch.OBS: TFTensorSpecs("b, h", h=self.config.input_dims[0]), STATE_IN: None, SampleBatch.SEQ_LENS: None, } @@ -51,7 +48,7 @@ def get_input_spec(self) -> Union[Spec, None]: def get_output_spec(self) -> Union[Spec, None]: return SpecDict( { - ENCODER_OUT: TFTensorSpecs("b, h", h=self.config.output_dim), + ENCODER_OUT: TFTensorSpecs("b, h", h=self.config.output_dims[0]), STATE_OUT: None, } ) @@ -66,91 +63,6 @@ def _forward(self, inputs: NestedDict) -> NestedDict: ) -class LSTMEncoder(Encoder, TfModel): - """An encoder that uses an LSTM cell and a linear layer.""" - - def __init__(self, config: ModelConfig) -> None: - TfModel.__init__(self, config) - Encoder.__init__(self, config) - - # Create the neural networks - self.lstm = nn.LSTM( - config.input_dim, - config.hidden_dim, - config.num_layers, - batch_first=config.batch_first, - ) - self.linear = nn.Linear(config.hidden_dim, config.output_dim) - - @override(Model) - def get_input_spec(self) -> Union[Spec, None]: - return SpecDict( - { - # bxt is just a name for better readability to indicated padded batch - SampleBatch.OBS: TFTensorSpecs("bxt, h", h=self.config.input_dim), - STATE_IN: { - "h": TFTensorSpecs( - "b, l, h", h=self.config.hidden_dim, l=self.config.num_layers - ), - "c": TFTensorSpecs( - "b, l, h", h=self.config.hidden_dim, l=self.config.num_layers - ), - }, - SampleBatch.SEQ_LENS: None, - } - ) - - @override(Model) - def get_output_spec(self) -> Union[Spec, None]: - return SpecDict( - { - ENCODER_OUT: TFTensorSpecs("bxt, h", 
h=self.config.output_dim), - STATE_OUT: { - "h": TFTensorSpecs( - "b, l, h", h=self.config.hidden_dim, l=self.config.num_layers - ), - "c": TFTensorSpecs( - "b, l, h", h=self.config.hidden_dim, l=self.config.num_layers - ), - }, - } - ) - - @override(Model) - def get_initial_state(self): - config = self.config - return { - "h": torch.zeros(config.num_layers, config.hidden_dim), - "c": torch.zeros(config.num_layers, config.hidden_dim), - } - - @override(Model) - def _forward(self, inputs: NestedDict, **kwargs) -> NestedDict: - x = inputs[SampleBatch.OBS] - states = inputs[STATE_IN] - # states are batch-first when coming in - states = tree.map_structure(lambda x: x.transpose(0, 1), states) - - x = add_time_dimension( - x, - seq_lens=inputs[SampleBatch.SEQ_LENS], - framework="tf", - time_major=not self.config.batch_first, - ) - states_o = {} - x, (states_o["h"], states_o["c"]) = self.lstm(x, (states["h"], states["c"])) - - x = self.linear(x) - x = x.view(-1, x.shape[-1]) - - return NestedDict( - { - ENCODER_OUT: x, - STATE_OUT: tree.map_structure(lambda x: x.transpose(0, 1), states_o), - } - ) - - class TfActorCriticEncoder(TfModel, ActorCriticEncoder): """An encoder that can hold two encoders.""" diff --git a/rllib/core/models/tf/mlp.py b/rllib/core/models/tf/mlp.py index d5929c91c46aa..96056fc71a34c 100644 --- a/rllib/core/models/tf/mlp.py +++ b/rllib/core/models/tf/mlp.py @@ -17,20 +17,20 @@ def __init__(self, config: ModelConfig) -> None: TfModel.__init__(self, config) self.net = TfMLP( - input_dim=config.input_dim, + input_dim=config.input_dims[0], hidden_layer_dims=config.hidden_layer_dims, - output_dim=config.output_dim, + output_dim=config.output_dims[0], hidden_layer_activation=config.hidden_layer_activation, output_activation=config.output_activation, ) @override(Model) def get_input_spec(self) -> Union[Spec, None]: - return TFTensorSpecs("b, h", h=self.config.input_dim) + return TFTensorSpecs("b, h", h=self.config.input_dims[0]) @override(Model) def 
get_output_spec(self) -> Union[Spec, None]: - return TFTensorSpecs("b, h", h=self.config.output_dim) + return TFTensorSpecs("b, h", h=self.config.output_dims[0]) @override(Model) def _forward(self, inputs: NestedDict, **kwargs) -> NestedDict: diff --git a/rllib/core/models/tf/tests/test_tf_mlp_head.py b/rllib/core/models/tf/tests/test_tf_mlp_head.py index 1599c449dc012..727ae76e787bb 100644 --- a/rllib/core/models/tf/tests/test_tf_mlp_head.py +++ b/rllib/core/models/tf/tests/test_tf_mlp_head.py @@ -9,26 +9,25 @@ class TestTfMLPHead(unittest.TestCase): def test_tf_mlp_head(self): - - inputs_dims = [1, 2, 1000] + inputs_dims_configs = [[1], [2], [1000]] list_of_hidden_layer_dims = [[], [1], [64, 64], [1000, 1000, 1000, 1000]] hidden_layer_activations = [None, "linear", "relu", "tanh", "elu", "swish"] - output_dims = inputs_dims + output_dims_configs = inputs_dims_configs output_activations = hidden_layer_activations for permutation in itertools.product( - inputs_dims, + inputs_dims_configs, list_of_hidden_layer_dims, hidden_layer_activations, output_activations, - output_dims, + output_dims_configs, ): ( - inputs_dim, + inputs_dims, hidden_layer_dims, hidden_layer_activation, output_activation, @@ -37,7 +36,7 @@ def test_tf_mlp_head(self): print( f"Testing ...\n" - f"inputs_dim: {inputs_dim}\n" + f"inputs_dim: {inputs_dims}\n" f"hidden_layer_dims: {hidden_layer_dims}\n" f"hidden_layer_activation: {hidden_layer_activation}\n" f"output_activation: {output_activation}\n" @@ -45,20 +44,20 @@ def test_tf_mlp_head(self): ) config = MLPHeadConfig( - input_dim=inputs_dim, + input_dims=inputs_dims, hidden_layer_dims=hidden_layer_dims, - output_dim=output_dims, + output_dims=output_dims, hidden_layer_activation=hidden_layer_activation, output_activation=output_activation, ) model = config.build(framework="tf") - inputs = tf.random.uniform((1, inputs_dim)) + inputs = tf.random.uniform((1, inputs_dims[0])) outputs = model(inputs) - self.assertEqual(outputs.shape, (1, 
output_dims)) + self.assertEqual(outputs.shape, (1, output_dims[0])) if __name__ == "__main__": diff --git a/rllib/core/models/torch/encoder.py b/rllib/core/models/torch/encoder.py index 87fb2ccd23f9d..fb18670b18150 100644 --- a/rllib/core/models/torch/encoder.py +++ b/rllib/core/models/torch/encoder.py @@ -1,8 +1,7 @@ from typing import Union -import torch -import torch.nn as nn import tree + from ray.rllib.core.models.base import ( Encoder, ActorCriticEncoder, @@ -13,7 +12,6 @@ from ray.rllib.core.models.base import ModelConfig, Model from ray.rllib.core.models.torch.base import TorchModel from ray.rllib.core.models.torch.primitives import TorchMLP, TorchCNN - from ray.rllib.models.specs.specs_base import Spec from ray.rllib.models.specs.specs_dict import SpecDict from ray.rllib.models.specs.specs_torch import TorchTensorSpec @@ -21,8 +19,11 @@ from ray.rllib.policy.rnn_sequencing import add_time_dimension from ray.rllib.policy.sample_batch import SampleBatch from ray.rllib.utils.annotations import override +from ray.rllib.utils.framework import try_import_torch from ray.rllib.utils.nested_dict import NestedDict +torch, nn = try_import_torch() + class TorchMLPEncoder(TorchModel, Encoder): def __init__(self, config: ModelConfig) -> None: @@ -31,9 +32,9 @@ def __init__(self, config: ModelConfig) -> None: # Create the neural networks self.net = TorchMLP( - input_dim=config.input_dim, + input_dim=config.input_dims[0], hidden_layer_dims=config.hidden_layer_dims, - output_dim=config.output_dim, + output_dim=config.output_dims[0], hidden_layer_activation=config.hidden_layer_activation, ) @@ -41,7 +42,7 @@ def __init__(self, config: ModelConfig) -> None: def get_input_spec(self) -> Union[Spec, None]: return SpecDict( { - SampleBatch.OBS: TorchTensorSpec("b, h", h=self.config.input_dim), + SampleBatch.OBS: TorchTensorSpec("b, h", h=self.config.input_dims[0]), STATE_IN: None, SampleBatch.SEQ_LENS: None, } @@ -51,7 +52,7 @@ def get_input_spec(self) -> Union[Spec, None]: def 
get_output_spec(self) -> Union[Spec, None]: return SpecDict( { - ENCODER_OUT: TorchTensorSpec("b, h", h=self.config.output_dim), + ENCODER_OUT: TorchTensorSpec("b, h", h=self.config.output_dims[0]), STATE_OUT: None, } ) @@ -89,7 +90,9 @@ def __init__(self, config: ModelConfig) -> None: # Add a final linear layer to make sure that the outputs have the correct # dimensionality. layers.append( - nn.Linear(int(cnn.output_width) * int(cnn.output_height), config.output_dim) + nn.Linear( + int(cnn.output_width) * int(cnn.output_height), config.output_dims[0] + ) ) if output_activation is not None: layers.append(output_activation()) @@ -115,7 +118,7 @@ def get_input_spec(self) -> Union[Spec, None]: def get_output_spec(self) -> Union[Spec, None]: return SpecDict( { - ENCODER_OUT: TorchTensorSpec("b, h", h=self.config.output_dim), + ENCODER_OUT: TorchTensorSpec("b, h", h=self.config.output_dims[0]), STATE_OUT: None, } ) @@ -143,14 +146,14 @@ def __init__(self, config: ModelConfig) -> None: config.num_layers, batch_first=config.batch_first, ) - self.linear = nn.Linear(config.hidden_dim, config.output_dim) + self.linear = nn.Linear(config.hidden_dim, config.output_dims[0]) @override(Model) def get_input_spec(self) -> Union[Spec, None]: return SpecDict( { # bxt is just a name for better readability to indicated padded batch - SampleBatch.OBS: TorchTensorSpec("bxt, h", h=self.config.input_dim), + SampleBatch.OBS: TorchTensorSpec("bxt, h", h=self.config.input_dims[0]), STATE_IN: { "h": TorchTensorSpec( "b, l, h", h=self.config.hidden_dim, l=self.config.num_layers @@ -167,7 +170,7 @@ def get_input_spec(self) -> Union[Spec, None]: def get_output_spec(self) -> Union[Spec, None]: return SpecDict( { - ENCODER_OUT: TorchTensorSpec("bxt, h", h=self.config.output_dim), + ENCODER_OUT: TorchTensorSpec("bxt, h", h=self.config.output_dims[0]), STATE_OUT: { "h": TorchTensorSpec( "b, l, h", h=self.config.hidden_dim, l=self.config.num_layers @@ -189,7 +192,7 @@ def get_initial_state(self): 
@override(Model) def _forward(self, inputs: NestedDict, **kwargs) -> NestedDict: - x = inputs[SampleBatch.OBS] + x = inputs[SampleBatch.OBS].float() states = inputs[STATE_IN] # states are batch-first when coming in states = tree.map_structure(lambda x: x.transpose(0, 1), states) diff --git a/rllib/core/models/torch/mlp.py b/rllib/core/models/torch/mlp.py index f2570f70fe3ab..80423014ad432 100644 --- a/rllib/core/models/torch/mlp.py +++ b/rllib/core/models/torch/mlp.py @@ -18,20 +18,20 @@ def __init__(self, config: ModelConfig) -> None: TorchModel.__init__(self, config) self.net = TorchMLP( - input_dim=config.input_dim, + input_dim=config.input_dims[0], hidden_layer_dims=config.hidden_layer_dims, - output_dim=config.output_dim, + output_dim=config.output_dims[0], hidden_layer_activation=config.hidden_layer_activation, output_activation=config.output_activation, ) @override(Model) def get_input_spec(self) -> Union[Spec, None]: - return TorchTensorSpec("b, h", h=self.config.input_dim) + return TorchTensorSpec("b, h", h=self.config.input_dims[0]) @override(Model) def get_output_spec(self) -> Union[Spec, None]: - return TorchTensorSpec("b, h", h=self.config.output_dim) + return TorchTensorSpec("b, h", h=self.config.output_dims[0]) @override(Model) def _forward(self, inputs: torch.Tensor, **kwargs) -> torch.Tensor: diff --git a/rllib/core/models/torch/primitives.py b/rllib/core/models/torch/primitives.py index 8a08dbe663d96..c7bf9af5fc760 100644 --- a/rllib/core/models/torch/primitives.py +++ b/rllib/core/models/torch/primitives.py @@ -60,8 +60,10 @@ def __init__( self.output_dim = dims[-1] self.mlp = nn.Sequential(*layers) + self.expected_input_dtype = torch.float32 + def forward(self, x): - return self.mlp(x) + return self.mlp(x.type(self.expected_input_dtype)) class TorchCNN(nn.Module): @@ -69,7 +71,7 @@ class TorchCNN(nn.Module): def __init__( self, - input_dims: Union[List, Tuple] = None, + input_dims: Union[List[int], Tuple[int]] = None, filter_specifiers: 
List[List[Union[int, List]]] = None, filter_layer_activation: str = "relu", output_activation: str = "linear", @@ -152,7 +154,9 @@ def __init__( # Create the cnn that potentially includes a flattened layer self.cnn = nn.Sequential(*layers) + self.expected_input_dtype = torch.float32 + def forward(self, x): # Permute b/c data comes in as [B, dim, dim, channels]: inputs = x.permute(0, 3, 1, 2) - return self.cnn(inputs) + return self.cnn(inputs.type(self.expected_input_dtype)) diff --git a/rllib/core/models/torch/tests/test_torch_cnn_encoder.py b/rllib/core/models/torch/tests/test_torch_cnn_encoder.py index 4802f96daf025..358b3079aa367 100644 --- a/rllib/core/models/torch/tests/test_torch_cnn_encoder.py +++ b/rllib/core/models/torch/tests/test_torch_cnn_encoder.py @@ -30,7 +30,7 @@ def test_torch_cnn_encoder(self): filter_layer_activation = [None, "linear", "relu"] - output_dims = [1, 100] + output_dims_configs = [[1], [100]] output_activations = filter_layer_activation @@ -38,7 +38,7 @@ def test_torch_cnn_encoder(self): inputs_dimss, filter_layer_activation, output_activations, - output_dims, + output_dims_configs, ): ( inputs_dims, @@ -63,7 +63,7 @@ def test_torch_cnn_encoder(self): filter_specifiers=filter_specifiers, filter_layer_activation=filter_layer_activation, output_activation=output_activation, - output_dim=output_dims, + output_dims=output_dims, ) model = config.build(framework="torch") @@ -77,7 +77,7 @@ def test_torch_cnn_encoder(self): {SampleBatch.OBS: obs, SampleBatch.SEQ_LENS: seq_lens, STATE_IN: state} ) - self.assertEqual(outputs[ENCODER_OUT].shape, (1, output_dims)) + self.assertEqual(outputs[ENCODER_OUT].shape, (1, output_dims[0])) self.assertEqual(outputs[STATE_OUT], None) diff --git a/rllib/core/models/torch/tests/test_torch_mlp_encoder.py b/rllib/core/models/torch/tests/test_torch_mlp_encoder.py index b43b9b21b1511..b1be558892bee 100644 --- a/rllib/core/models/torch/tests/test_torch_mlp_encoder.py +++ 
b/rllib/core/models/torch/tests/test_torch_mlp_encoder.py @@ -12,25 +12,25 @@ class TestTorchMLPEncoder(unittest.TestCase): def test_torch_mlp_encoder(self): - inputs_dims = [1, 2, 1000] + inputs_dims_configs = [[1], [2], [1000]] list_of_hidden_layer_dims = [[], [1], [64, 64], [1000, 1000, 1000, 1000]] hidden_layer_activations = [None, "linear", "relu", "tanh", "elu", "swish"] - output_dims = inputs_dims + output_dims_configs = inputs_dims_configs output_activations = hidden_layer_activations for permutation in itertools.product( - inputs_dims, + inputs_dims_configs, list_of_hidden_layer_dims, hidden_layer_activations, output_activations, - output_dims, + output_dims_configs, ): ( - inputs_dim, + inputs_dims, hidden_layer_dims, hidden_layer_activation, output_activation, @@ -39,7 +39,7 @@ def test_torch_mlp_encoder(self): print( f"Testing ...\n" - f"inputs_dim: {inputs_dim}\n" + f"inputs_dim: {inputs_dims}\n" f"hidden_layer_dims: {hidden_layer_dims}\n" f"hidden_layer_activation: {hidden_layer_activation}\n" f"output_activation: {output_activation}\n" @@ -47,16 +47,16 @@ def test_torch_mlp_encoder(self): ) config = MLPEncoderConfig( - input_dim=inputs_dim, + input_dims=inputs_dims, hidden_layer_dims=hidden_layer_dims, - output_dim=output_dims, + output_dims=output_dims, hidden_layer_activation=hidden_layer_activation, output_activation=output_activation, ) model = config.build(framework="torch") - obs = torch.randn(1, inputs_dim) + obs = torch.randn(1, inputs_dims[0]) seq_lens = torch.tensor([1]) state = None @@ -64,7 +64,7 @@ def test_torch_mlp_encoder(self): {SampleBatch.OBS: obs, SampleBatch.SEQ_LENS: seq_lens, STATE_IN: state} ) - self.assertEqual(outputs[ENCODER_OUT].shape, (1, output_dims)) + self.assertEqual(outputs[ENCODER_OUT].shape, (1, output_dims[0])) self.assertEqual(outputs[STATE_OUT], None) diff --git a/rllib/core/models/torch/tests/test_torch_mlp_head.py b/rllib/core/models/torch/tests/test_torch_mlp_head.py index b748fe8f9966e..266c3e552a9a5 100644 
--- a/rllib/core/models/torch/tests/test_torch_mlp_head.py +++ b/rllib/core/models/torch/tests/test_torch_mlp_head.py @@ -10,25 +10,25 @@ class TestTorchMLPHead(unittest.TestCase): def test_torch_mlp_head(self): - inputs_dims = [1, 2, 1000] + inputs_dims_configs = [[1], [2], [1000]] list_of_hidden_layer_dims = [[], [1], [64, 64], [1000, 1000, 1000, 1000]] hidden_layer_activations = [None, "linear", "relu", "tanh", "elu", "swish"] - output_dims = inputs_dims + output_dims_configs = inputs_dims_configs output_activations = hidden_layer_activations for permutation in itertools.product( - inputs_dims, + inputs_dims_configs, list_of_hidden_layer_dims, hidden_layer_activations, output_activations, - output_dims, + output_dims_configs, ): ( - inputs_dim, + inputs_dims, hidden_layer_dims, hidden_layer_activation, output_activation, @@ -37,7 +37,7 @@ def test_torch_mlp_head(self): print( f"Testing ...\n" - f"inputs_dim: {inputs_dim}\n" + f"inputs_dim: {inputs_dims}\n" f"hidden_layer_dims: {hidden_layer_dims}\n" f"hidden_layer_activation: {hidden_layer_activation}\n" f"output_activation: {output_activation}\n" @@ -45,20 +45,20 @@ def test_torch_mlp_head(self): ) config = MLPHeadConfig( - input_dim=inputs_dim, + input_dims=inputs_dims, hidden_layer_dims=hidden_layer_dims, - output_dim=output_dims, + output_dims=output_dims, hidden_layer_activation=hidden_layer_activation, output_activation=output_activation, ) model = config.build(framework="torch") - inputs = torch.randn(1, inputs_dim) + inputs = torch.randn(1, inputs_dims[0]) outputs = model(inputs) - self.assertEqual(outputs.shape, (1, output_dims)) + self.assertEqual(outputs.shape, (1, output_dims[0])) if __name__ == "__main__": diff --git a/rllib/core/rl_module/marl_module.py b/rllib/core/rl_module/marl_module.py index 312e09535a29f..2970b16b5fa7b 100644 --- a/rllib/core/rl_module/marl_module.py +++ b/rllib/core/rl_module/marl_module.py @@ -1,5 +1,4 @@ -import copy -from dataclasses import dataclass +from dataclasses 
import dataclass, field import pprint from typing import Iterator, Mapping, Any, Union, Dict, Optional, Type @@ -17,22 +16,6 @@ ModuleID = str -def _get_module_configs(config: Dict[str, Any]): - """Constructs a mapping from module_id to module config. - - It takes care of the inheritance of common configs to individual module configs. - See `from_multi_agent_config` for more details. - """ - config = copy.deepcopy(config) - module_specs = config.pop("modules", {}) - for common_spec in config: - for module_spec in module_specs.values(): - if getattr(module_spec, common_spec) is None: - setattr(module_spec, common_spec, config[common_spec]) - - return module_specs - - @PublicAPI(stability="alpha") class MultiAgentRLModule(RLModule): """Base class for multi-agent RLModules. @@ -58,79 +41,21 @@ class MultiAgentRLModule(RLModule): `MultiAgentRLModule`. """ - def __init__(self, rl_modules: Mapping[ModuleID, RLModule] = None) -> None: - super().__init__() - self._rl_modules: Mapping[ModuleID, RLModule] = rl_modules or {} + def __init__(self, config: "MultiAgentRLModuleConfig" = None) -> None: + if config is None: + config = MultiAgentRLModuleConfig() - @classmethod - def from_multi_agent_config(cls, config: Mapping[str, Any]) -> "MultiAgentRLModule": - """Creates a MultiAgentRLModule from a multi-agent config. - - The input config should contain "modules" key that is a mapping from module_id - to the module spec for each RLModule which is a SingleAgentRLModuleSpec object. - If there are multiple modules that do share the same - `observation_space`, `action_space`, or `model_config`, you can specify these - keys at the top level of the config, and the module spec will inherit the - values from the top level config. - - Examples: - - .. 
code-block:: python - - config = { - "modules": { - "module_1": SingleAgentRLModuleSpec( - module_class="RLModule1", - observation_space=gym.spaces.Box(...), - action_space=gym.spaces.Discrete(...), - model_config={hidden_dim: 256} - ) - "module_2": SingleAgentRLModuleSpec( - module_class="RLModule2", - observation_space=gym.spaces.Box(...), - ) - }, - "action_space": gym.spaces.Box(...), - "model_config": {hidden_dim: 32} - } + super().__init__(config) - # This is equivalent to the following config: - - config = { - "modules": { - "module_1": SingleAgentRLModuleSpec( - module_class="RLModule1", - observation_space=gym.spaces.Box(...), - action_space=gym.spaces.Discrete(...), - model_config={hidden_dim: 256} - ) - "module_2": SingleAgentRLModuleSpec( - module_class="RLModule2", - observation_space=gym.spaces.Box(...), - action_space=gym.spaces.Box(...), # Inherited - model_config={hidden_dim: 32} # Inherited - } - }, - } + # self.build() will abstract the construction of rl_modules + self._rl_modules = {} + self.build() - Args: - config: A config dict that contains the module configs. See above for the - format required. - - Returns: - The MultiAgentRLModule. 
- """ - - module_configs: Dict[ModuleID, Any] = _get_module_configs(config) - cls.__check_module_configs(module_configs) - - multiagent_module = cls() - - for module_id, module_spec in module_configs.items(): - module = module_spec.build() - multiagent_module.add_module(module_id, module) - - return multiagent_module + def build(self): + """Builds the underlying RLModules.""" + self.__check_module_configs(self.config.modules) + for module_id, module_spec in self.config.modules.items(): + self._rl_modules[module_id] = module_spec.build() @classmethod def __check_module_configs(cls, module_configs: Dict[ModuleID, Any]): @@ -335,47 +260,6 @@ def set_state(self, state_dict: Mapping[str, Any]) -> None: for module_id, module in self._rl_modules.items(): module.set_state(state_dict[module_id]) - def serialize(self) -> Mapping[str, Any]: - """Return the serialized state of the module. - - NOTE: This method needs to be implemented in order to support - checkpointing and fault tolerance. - - """ - return { - "class": self.__class__, - "rl_modules": { - module_id: module.serialize() - for module_id, module in self._rl_modules.items() - }, - } - - @classmethod - def deserialize(cls, state: Mapping[str, Any]) -> "MultiAgentRLModule": - """Construct a module from a serialized state. - - Args: - state: The serialized state of the module. - The state should contain the keys "class", "kwargs", and "state". - - - "class" is the class of the RLModule to be constructed. - - "rl_modules" is a dict mapping module ids of the RLModules to - their serialized states. The serialized states can be obtained - from `RLModule.serialize()`. - - NOTE: this state is typically obtained from `serialize()`. - - NOTE: This method needs to be implemented in order to support - checkpointing and fault tolerance. - - Returns: - A deserialized MultiAgentRLModule. 
- """ - rl_modules = {} - for module_id, module_state in state["rl_modules"].items(): - rl_modules[module_id] = RLModule.deserialize(module_state) - return cls(rl_modules) - def __repr__(self) -> str: return f"MARL({pprint.pformat(self._rl_modules)})" @@ -421,7 +305,7 @@ def _check_module_exists(self, module_id: ModuleID) -> None: ) -@ExperimentalAPI +@PublicAPI(stability="alpha") @dataclass class MultiAgentRLModuleSpec: """A utility spec class to make it constructing MARL modules easier. @@ -454,6 +338,9 @@ def __post_init__(self): "SingleAgentRLModuleSpecs for each individual module." ) + def get_marl_config(self) -> "MultiAgentRLModuleConfig": + return MultiAgentRLModuleConfig(modules=self.module_specs) + def build( self, module_id: Optional[ModuleID] = None ) -> Union[SingleAgentRLModuleSpec, "MultiAgentRLModule"]: @@ -478,27 +365,100 @@ def build( if module_id: return self.module_specs[module_id].build() - return self.marl_module_class.from_multi_agent_config( - {"modules": self.module_specs} - ) + + module_config = self.get_marl_config() + return self.marl_module_class(module_config) def add_modules( - self, module_specs: Dict[ModuleID, SingleAgentRLModuleSpec] + self, + module_specs: Dict[ModuleID, SingleAgentRLModuleSpec], + overwrite: bool = True, ) -> None: - """Add new module specs to the spec. + """Add new module specs to the spec or updates existing ones. Args: module_specs: The mapping for the module_id to the single-agent module specs to be added to this multi-agent module spec. + overwrite: Whether to overwrite the existing module specs if they already + exist. If False, they will be updated only. 
""" if self.module_specs is None: self.module_specs = {} - self.module_specs.update(module_specs) + for module_id, module_spec in module_specs.items(): + if overwrite or module_id not in self.module_specs: + self.module_specs[module_id] = module_spec + else: + self.module_specs[module_id].update(module_spec) + + @classmethod + def from_module(self, module: MultiAgentRLModule) -> "MultiAgentRLModuleSpec": + """Creates a MultiAgentRLModuleSpec from a MultiAgentRLModule. + + Args: + module: The MultiAgentRLModule to create the spec from. + + Returns: + The MultiAgentRLModuleSpec. + """ + module_specs = { + module_id: SingleAgentRLModuleSpec.from_module(rl_module) + for module_id, rl_module in module._rl_modules.items() + } + marl_module_class = module.__class__ + return MultiAgentRLModuleSpec( + marl_module_class=marl_module_class, module_specs=module_specs + ) def _check_before_build(self): if not isinstance(self.module_specs, dict): raise ValueError( - f"When build() is called on {self.__class__} the module_specs " + f"When build() is called on {self.__class__}, the module_specs " "should be a dictionary mapping from module IDs to " "SingleAgentRLModuleSpecs for each individual module." ) + + def update(self, other: "MultiAgentRLModuleSpec", overwrite=False) -> None: + """Updates this spec with the other spec. + + Traverses this MultiAgentRLModuleSpec's module_specs and updates them with + the module specs from the other MultiAgentRLModuleSpec. + + Args: + other: The other spec to update this spec with. + overwrite: Whether to overwrite the existing module specs if they already + exist. If False, they will be updated only. 
+ """ + assert type(other) is MultiAgentRLModuleSpec + + if isinstance(other.module_specs, dict): + self.add_modules(other.module_specs, overwrite=overwrite) + else: + if not self.module_specs: + self.module_specs = other.module_specs + else: + self.module_specs.update(other.module_specs) + + +@ExperimentalAPI +@dataclass +class MultiAgentRLModuleConfig: + + modules: Mapping[ModuleID, SingleAgentRLModuleSpec] = field(default_factory=dict) + + def to_dict(self): + + return { + "modules": { + module_id: module_spec.to_dict() + for module_id, module_spec in self.modules.items() + } + } + + @classmethod + def from_dict(cls, d) -> "MultiAgentRLModuleConfig": + return cls( + modules={ + module_id: SingleAgentRLModuleSpec.from_dict(module_spec) + for module_id, module_spec in d["modules"].items() + } + ) diff --git a/rllib/core/rl_module/rl_module.py b/rllib/core/rl_module/rl_module.py index d73d052314ffe..53220a41e0205 100644 --- a/rllib/core/rl_module/rl_module.py +++ b/rllib/core/rl_module/rl_module.py @@ -1,16 +1,20 @@ import abc from dataclasses import dataclass +import datetime import gymnasium as gym -from typing import Mapping, Any, TYPE_CHECKING, Optional, Type, Dict +import json +import pathlib +from typing import Any, Dict, Mapping, Optional, Type, TYPE_CHECKING, Union if TYPE_CHECKING: from ray.rllib.core.rl_module.marl_module import MultiAgentRLModule + from ray.rllib.core.models.catalog import Catalog +import ray from ray.rllib.utils.annotations import ( ExperimentalAPI, OverrideToImplementCustomLogic_CallToSuperRecommended, ) -from ray.rllib.utils.serialization import check_if_args_kwargs_serializable from ray.rllib.models.specs.typing import SpecType from ray.rllib.models.specs.checker import ( @@ -22,9 +26,16 @@ from ray.rllib.policy.sample_batch import DEFAULT_POLICY_ID, SampleBatch from ray.rllib.utils.nested_dict import NestedDict from ray.rllib.utils.typing import SampleBatchType +from ray.rllib.utils.serialization import ( + gym_space_from_dict, + 
gym_space_to_dict, + serialize_type, + deserialize_type, +) ModuleID = str +METADATA_FILE_NAME = "rl_module_metadata.json" @ExperimentalAPI @@ -33,50 +44,147 @@ class SingleAgentRLModuleSpec: """A utility spec class to make it constructing RLModules (in single-agent case) easier. Args: - module_class: ... - observation_space: ... - action_space: ... - model_config: ... + module_class: The RLModule class to use. + observation_space: The observation space of the RLModule. + action_space: The action space of the RLModule. + model_config_dict: The model config dict to use. + catalog_class: The Catalog class to use. """ module_class: Optional[Type["RLModule"]] = None - observation_space: Optional["gym.Space"] = None - action_space: Optional["gym.Space"] = None - model_config: Optional[Dict[str, Any]] = None + observation_space: Optional[gym.Space] = None + action_space: Optional[gym.Space] = None + model_config_dict: Optional[Mapping[str, Any]] = None + catalog_class: Optional[Type["Catalog"]] = None + + def get_rl_module_config(self) -> "RLModuleConfig": + """Returns the RLModule config for this spec.""" + return RLModuleConfig( + observation_space=self.observation_space, + action_space=self.action_space, + model_config_dict=self.model_config_dict, + catalog_class=self.catalog_class, + ) def build(self) -> "RLModule": - + if self.module_class is None: + raise ValueError("RLModule class is not set.") if self.observation_space is None: - raise ValueError("Observation space must be specified.") + raise ValueError("Observation space is not set.") if self.action_space is None: - raise ValueError("Action space must be specified.") - if self.model_config is None: - raise ValueError("Model config must be specified.") + raise ValueError("Action space is not set.") + if self.model_config_dict is None: + raise ValueError("Model config is not set.") - return self.module_class.from_model_config( - observation_space=self.observation_space, - action_space=self.action_space, - 
model_config_dict=self.model_config, + module_config = self.get_rl_module_config() + return self.module_class(module_config) + + @classmethod + def from_module(cls, module: "RLModule") -> "SingleAgentRLModuleSpec": + from ray.rllib.core.rl_module.marl_module import MultiAgentRLModule + + if isinstance(module, MultiAgentRLModule): + raise ValueError( + "MultiAgentRLModule cannot be converted to SingleAgentRLModuleSpec." + ) + + return SingleAgentRLModuleSpec( + module_class=type(module), + observation_space=module.config.observation_space, + action_space=module.config.action_space, + model_config_dict=module.config.model_config_dict, + catalog_class=module.config.catalog_class, + ) + + def to_dict(self): + """Returns a serialized representation of the spec.""" + + return { + "module_class": serialize_type(self.module_class), + "module_config": self.get_rl_module_config().to_dict(), + } + + @classmethod + def from_dict(cls, d): + """Returns a single agent RLModule spec from a serialized representation.""" + module_class = deserialize_type(d["module_class"]) + + module_config = RLModuleConfig.from_dict(d["module_config"]) + observation_space = module_config.observation_space + action_space = module_config.action_space + model_config_dict = module_config.model_config_dict + catalog_class = module_config.catalog_class + + return SingleAgentRLModuleSpec( + module_class=module_class, + observation_space=observation_space, + action_space=action_space, + model_config_dict=model_config_dict, + catalog_class=catalog_class, ) + def update(self, other) -> None: + """Updates this spec with the given other spec. Works like dict.update().""" + if not isinstance(other, SingleAgentRLModuleSpec): + raise ValueError("Can only update with another SingleAgentRLModuleSpec.") + + # If the field is None in the other, keep the current field, otherwise update + # with the new value. 
+ self.module_class = other.module_class or self.module_class + self.observation_space = other.observation_space or self.observation_space + self.action_space = other.action_space or self.action_space + self.model_config_dict = other.model_config_dict or self.model_config_dict + self.catalog_class = other.catalog_class or self.catalog_class + @ExperimentalAPI @dataclass class RLModuleConfig: - """Configuration for the PPO module. - # TODO (Kourosh): Whether we need this or not really depends on how the catalog - # design end up being. - Attributes: - observation_space: The observation space of the environment. - action_space: The action space of the environment. - max_seq_len: Max seq len for training an RNN model. - (TODO (Kourosh) having max_seq_len here seems a bit unnatural, can we rethink - this design?) - """ observation_space: gym.Space = None action_space: gym.Space = None - max_seq_len: int = None + model_config_dict: Mapping[str, Any] = None + catalog_class: Type["Catalog"] = None + + def get_catalog(self) -> "Catalog": + """Returns the catalog for this config.""" + return self.catalog_class( + observation_space=self.observation_space, + action_space=self.action_space, + model_config_dict=self.model_config_dict, + ) + + def to_dict(self): + """Returns a serialized representation of the config. + + NOTE: This should be JSON-able. Users can test this by calling + json.dumps(config.to_dict()). 
+ + """ + catalog_class_path = ( + serialize_type(type(self.catalog_class)) if self.catalog_class else "" + ) + return { + "observation_space": gym_space_to_dict(self.observation_space), + "action_space": gym_space_to_dict(self.action_space), + "model_config_dict": self.model_config_dict, + "catalog_class_path": catalog_class_path, + } + + @classmethod + def from_dict(cls, d: Dict[str, Any]): + """Creates a config from a serialized representation.""" + catalog_class = ( + None + if d["catalog_class_path"] == "" + else deserialize_type(d["catalog_class_path"]) + ) + return cls( + observation_space=gym_space_from_dict(d["observation_space"]), + action_space=gym_space_from_dict(d["action_space"]), + model_config_dict=d["model_config_dict"], + catalog_class=catalog_class, + ) @ExperimentalAPI @@ -143,9 +251,8 @@ class RLModule(abc.ABC): More details here: https://github.com/pytorch/pytorch/issues/49726. """ - def __init__(self, *args, **kwargs): - check_if_args_kwargs_serializable(args, kwargs) - self._args_and_kwargs = {"args": args, "kwargs": kwargs} + def __init__(self, config: RLModuleConfig): + self.config = config def __init_subclass__(cls, **kwargs): # Automatically add a __post_init__ method to all subclasses of RLModule. @@ -187,58 +294,6 @@ def __post_init__(self): self.output_specs_inference() ) - @classmethod - def from_model_config( - cls, - observation_space: gym.Space, - action_space: gym.Space, - *, - model_config: Mapping[str, Any], - ) -> "RLModule": - """Creates a RLModule instance from a model config dict and spaces. - - The model config dict is the same as the one passed to the AlgorithmConfig - object that contains global model configurations parameters. - - This method can also be used to create a config dict for the module constructor - so it can be re-used to create multiple instances of the module. - - Example: - - .. 
code-block:: python - - class MyModule(RLModule): - def __init__(self, input_dim, output_dim): - self.input_dim, self.output_dim = input_dim, output_dim - - @classmethod - def from_model_config( - cls, - observation_space: gym.Space, - action_space: gym.Space, - model_config: Mapping[str, Any], - ): - return cls( - input_dim=observation_space.shape[0], - output_dim=action_space.n - ) - - module = MyModule.from_model_config( - observation_space=gym.spaces.Box(low=0, high=1, shape=(4,)), - action_space=gym.spaces.Discrete(2), - model_config={}, - ) - - - Args: - observation_space: The observation space of the env. - action_space: The action space of the env. - model_config: The model config dict. - """ - raise NotImplementedError - - # TODO: (Artur) Add a method `from_catalog` that creates RLModule from Catalog - def get_initial_state(self) -> NestedDict: """Returns the initial state of the module. @@ -361,39 +416,126 @@ def get_state(self) -> Mapping[str, Any]: def set_state(self, state_dict: Mapping[str, Any]) -> None: """Sets the state dict of the module.""" - def serialize(self) -> Mapping[str, Any]: - """Return the serialized state of the module.""" - return { - "class": self.__class__, - "args": self._args_and_kwargs["args"], - "kwargs": self._args_and_kwargs["kwargs"], - "state": self.get_state(), - } + def _save_module_metadata( + self, + checkpoint_dir: Union[str, pathlib.Path], + module_state_path: Union[str, pathlib.Path], + ): + """Saves the metadata of the module to checkpoint_dir. 
+ + Includes: + - module class path + - module state path + - the module config + - the ray version used + - the ray commit hash used + - the date and time of the checkpoint was created + + """ + if isinstance(checkpoint_dir, str): + checkpoint_dir = pathlib.Path(checkpoint_dir) + if isinstance(module_state_path, str): + module_state_path = pathlib.Path(module_state_path) + gmt_time = datetime.datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S GMT") + metadata = {} + # TODO (Avnishn): Find a way to incorporate the tune registry here. + metadata["module_class"] = serialize_type(self.__class__) + metadata["module_config"] = self.config.to_dict() + metadata["ray_version"] = ray.__version__ + metadata["ray_commit_hash"] = ray.__commit__ + metadata["checkpoint_date_time"] = gmt_time + metadata["module_state_path"] = str(module_state_path) + metadata_path = checkpoint_dir / METADATA_FILE_NAME + with open(metadata_path, "w") as f: + json.dump(metadata, f) @classmethod - def deserialize(cls, state: Mapping[str, Any]) -> "RLModule": - """Construct a module from a serialized state. + def _from_metadata_file(cls, metadata_path: Union[str, pathlib.Path]) -> "RLModule": + """Constructs a module from the metadata. Args: - state: The serialized state of the module. + metadata_path: The path to the metadata json file for a module. - NOTE: this state is typically obtained from `serialize()`. + Returns: + The module. 
+ """ + if isinstance(metadata_path, str): + metadata_path = pathlib.Path(metadata_path) + if not metadata_path.exists(): + raise ValueError("The metadata path was not found.") + if not metadata_path.exists(): + raise ValueError( + "While constructing the module from the metadata, the " + f"metadata file was not found at {str(metadata_path)}" + ) + with open(metadata_path, "r") as f: + metadata = json.load(f) + module_class = deserialize_type(metadata["module_class"]) + module_config = RLModuleConfig.from_dict(metadata["module_config"]) + module = module_class(module_config) + return module - NOTE: This method needs to be implemented in order to support - checkpointing and fault tolerance. + def save_state_to_file(self, path: Union[str, pathlib.Path]) -> str: + """Saves the weights of this RLmodule to path. + + Args: + path: The directory to save the checkpoint to. Returns: - A deserialized RLModule. + The path to the saved checkpoint. """ - for key in ["class", "args", "kwargs", "state"]: - if key not in state: - raise ValueError( - "By default, the serialized state must contain the following " - f"keys: 'class', 'args', 'args', and 'kwargs'. Got: {state.keys()}" - ) - constructor = state["class"] - module = constructor(*state["args"], **state["kwargs"]) - module.set_state(state["state"]) + raise NotImplementedError + + def load_state_from_file(self, path: Union[str, pathlib.Path]) -> None: + """Loads the weights of an RLmodule from path. + + Args: + path: The directory to load the checkpoint from. + """ + raise NotImplementedError + + def save_to_checkpoint(self, checkpoint_dir_path: str) -> None: + """Saves the module to a checkpoint directory. + + Args: + dir_path: The directory to save the checkpoint to. + + Raises: + ValueError: If dir_path is not an absolute path. 
+ """ + path = pathlib.Path(checkpoint_dir_path) + if not path.is_absolute(): + raise ValueError("dir_path must be an absolute path.") + path.mkdir(parents=True, exist_ok=True) + module_state_path = self.save_state_to_file(path) + self._save_module_metadata(path, module_state_path) + + @classmethod + def from_checkpoint(cls, checkpoint_dir_path: str) -> None: + """Loads the module from a checkpoint directory. + + Args: + dir_path: The directory to load the checkpoint from. + """ + path = pathlib.Path(checkpoint_dir_path) + if not path.exists(): + raise ValueError( + "While loading from checkpoint there was no directory" + " found at {}".format(checkpoint_dir_path) + ) + if not path.is_absolute(): + raise ValueError("dir_path must be an absolute path.") + if not path.is_dir(): + raise ValueError( + "While loading from checkpoint the checkpoint_dir_path " + "provided was not a directory." + ) + metadata_path = path / METADATA_FILE_NAME + with open(metadata_path, "r") as f: + metadata = json.load(f) + state_path = metadata["module_state_path"] + module = cls._from_metadata_file(metadata_path) + module.load_state_from_file(state_path) return module @abc.abstractmethod @@ -408,4 +550,6 @@ def as_multi_agent(self) -> "MultiAgentRLModule": """Returns a multi-agent wrapper around this module.""" from ray.rllib.core.rl_module.marl_module import MultiAgentRLModule - return MultiAgentRLModule({DEFAULT_POLICY_ID: self}) + marl_module = MultiAgentRLModule() + marl_module.add_module(DEFAULT_POLICY_ID, self) + return marl_module diff --git a/rllib/core/rl_module/tests/test_marl_module.py b/rllib/core/rl_module/tests/test_marl_module.py index 17aeccea41d1b..93e9880879ce9 100644 --- a/rllib/core/rl_module/tests/test_marl_module.py +++ b/rllib/core/rl_module/tests/test_marl_module.py @@ -1,8 +1,11 @@ import unittest -from ray.rllib.core.rl_module.rl_module import SingleAgentRLModuleSpec -from ray.rllib.core.rl_module.marl_module import MultiAgentRLModule, _get_module_configs +from 
ray.rllib.core.rl_module.rl_module import SingleAgentRLModuleSpec, RLModuleConfig +from ray.rllib.core.rl_module.marl_module import ( + MultiAgentRLModule, + MultiAgentRLModuleConfig, +) from ray.rllib.core.testing.torch.bc_module import DiscreteBCTorchModule from ray.rllib.env.multi_agent_env import make_multi_agent from ray.rllib.utils.test_utils import check @@ -16,45 +19,24 @@ def test_from_config(self): env_class = make_multi_agent("CartPole-v0") env = env_class({"num_agents": 2}) - module1 = DiscreteBCTorchModule.from_model_config( - env.observation_space, - env.action_space, + module1 = SingleAgentRLModuleSpec( + module_class=DiscreteBCTorchModule, + observation_space=env.observation_space, + action_space=env.action_space, model_config_dict={"fcnet_hiddens": [32]}, ) - module2 = DiscreteBCTorchModule.from_model_config( - env.observation_space, - env.action_space, + + module2 = SingleAgentRLModuleSpec( + module_class=DiscreteBCTorchModule, + observation_space=env.observation_space, + action_space=env.action_space, model_config_dict={"fcnet_hiddens": [32]}, ) - multi_agent_dict = {"module1": module1, "module2": module2} - marl_module = MultiAgentRLModule(multi_agent_dict) - - self.assertEqual(set(marl_module.keys()), {"module1", "module2"}) - self.assertIsInstance(marl_module["module1"], DiscreteBCTorchModule) - self.assertIsInstance(marl_module["module2"], DiscreteBCTorchModule) - - def test_from_multi_agent_config(self): - - env_class = make_multi_agent("CartPole-v0") - env = env_class({"num_agents": 2}) - - multi_agent_dict = { - "modules": { - "module1": SingleAgentRLModuleSpec( - module_class=DiscreteBCTorchModule, - model_config={"fcnet_hiddens": [64]}, - ), - "module2": SingleAgentRLModuleSpec( - module_class=DiscreteBCTorchModule, - model_config={"fcnet_hiddens": [32]}, - ), - }, - "observation_space": env.observation_space, # this is common - "action_space": env.action_space, # this is common - } - - marl_module = 
MultiAgentRLModule.from_multi_agent_config(multi_agent_dict) + config = MultiAgentRLModuleConfig( + modules={"module1": module1, "module2": module2} + ) + marl_module = MultiAgentRLModule(config) self.assertEqual(set(marl_module.keys()), {"module1", "module2"}) self.assertIsInstance(marl_module["module1"], DiscreteBCTorchModule) @@ -65,10 +47,12 @@ def test_as_multi_agent(self): env_class = make_multi_agent("CartPole-v0") env = env_class({"num_agents": 2}) - marl_module = DiscreteBCTorchModule.from_model_config( - env.observation_space, - env.action_space, - model_config_dict={"fcnet_hiddens": [32]}, + marl_module = DiscreteBCTorchModule( + config=RLModuleConfig( + env.observation_space, + env.action_space, + model_config_dict={"fcnet_hiddens": [32]}, + ) ).as_multi_agent() self.assertNotIsInstance(marl_module, DiscreteBCTorchModule) @@ -84,10 +68,12 @@ def test_get_set_state(self): env_class = make_multi_agent("CartPole-v0") env = env_class({"num_agents": 2}) - module = DiscreteBCTorchModule.from_model_config( - env.observation_space, - env.action_space, - model_config_dict={"fcnet_hiddens": [32]}, + module = DiscreteBCTorchModule( + config=RLModuleConfig( + env.observation_space, + env.action_space, + model_config_dict={"fcnet_hiddens": [32]}, + ) ).as_multi_agent() state = module.get_state() @@ -98,10 +84,12 @@ def test_get_set_state(self): set(module[DEFAULT_POLICY_ID].get_state().keys()), ) - module2 = DiscreteBCTorchModule.from_model_config( - env.observation_space, - env.action_space, - model_config_dict={"fcnet_hiddens": [32]}, + module2 = DiscreteBCTorchModule( + config=RLModuleConfig( + env.observation_space, + env.action_space, + model_config_dict={"fcnet_hiddens": [32]}, + ) ).as_multi_agent() state2 = module2.get_state() check(state, state2, false=True) @@ -116,18 +104,22 @@ def test_add_remove_modules(self): env_class = make_multi_agent("CartPole-v0") env = env_class({"num_agents": 2}) - module = DiscreteBCTorchModule.from_model_config( - 
env.observation_space, - env.action_space, - model_config_dict={"fcnet_hiddens": [32]}, + module = DiscreteBCTorchModule( + config=RLModuleConfig( + env.observation_space, + env.action_space, + model_config_dict={"fcnet_hiddens": [32]}, + ) ).as_multi_agent() module.add_module( "test", - DiscreteBCTorchModule.from_model_config( - env.observation_space, - env.action_space, - model_config_dict={"fcnet_hiddens": [32]}, + DiscreteBCTorchModule( + config=RLModuleConfig( + env.observation_space, + env.action_space, + model_config_dict={"fcnet_hiddens": [32]}, + ) ), ) self.assertEqual(set(module.keys()), {DEFAULT_POLICY_ID, "test"}) @@ -139,124 +131,28 @@ def test_add_remove_modules(self): ValueError, lambda: module.add_module( DEFAULT_POLICY_ID, - DiscreteBCTorchModule.from_model_config( - env.observation_space, - env.action_space, - model_config_dict={"fcnet_hiddens": [32]}, + DiscreteBCTorchModule( + config=RLModuleConfig( + env.observation_space, + env.action_space, + model_config_dict={"fcnet_hiddens": [32]}, + ) ), ), ) module.add_module( DEFAULT_POLICY_ID, - DiscreteBCTorchModule.from_model_config( - env.observation_space, - env.action_space, - model_config_dict={"fcnet_hiddens": [32]}, + DiscreteBCTorchModule( + config=RLModuleConfig( + env.observation_space, + env.action_space, + model_config_dict={"fcnet_hiddens": [32]}, + ) ), override=True, ) - def test_get_module_configs(self): - """Tests the method for getting the module configs from multi-agent config.""" - - config = { - "modules": { - "1": SingleAgentRLModuleSpec( - **{"module_class": "foo", "model_config": "bar"} - ), - "2": SingleAgentRLModuleSpec( - **{"module_class": "foo2", "model_config": "bar2"} - ), - }, - "observation_space": "obs_space", - "action_space": "action_space", - } - - expected_config = { - "1": SingleAgentRLModuleSpec( - **{ - "module_class": "foo", - "model_config": "bar", - "observation_space": "obs_space", - "action_space": "action_space", - } - ), - "2": SingleAgentRLModuleSpec( 
- **{ - "module_class": "foo2", - "model_config": "bar2", - "observation_space": "obs_space", - "action_space": "action_space", - } - ), - } - - self.assertDictEqual(_get_module_configs(config), expected_config) - - config = { - "modules": { - "1": SingleAgentRLModuleSpec( - **{ - "module_class": "foo", - "model_config": "bar", - "observation_space": "obs_space1", # won't get overwritten - "action_space": "action_space1", # won't get overwritten - } - ), - "2": SingleAgentRLModuleSpec( - **{"module_class": "foo2", "model_config": "bar2"} - ), - }, - "observation_space": "obs_space", - "action_space": "action_space", - } - - expected_config = { - "1": SingleAgentRLModuleSpec( - **{ - "module_class": "foo", - "model_config": "bar", - "observation_space": "obs_space1", - "action_space": "action_space1", - } - ), - "2": SingleAgentRLModuleSpec( - **{ - "module_class": "foo2", - "model_config": "bar2", - "observation_space": "obs_space", - "action_space": "action_space", - } - ), - } - - self.assertDictEqual(_get_module_configs(config), expected_config) - - def test_serialize_deserialize(self): - env_class = make_multi_agent("CartPole-v0") - env = env_class({"num_agents": 2}) - module1 = DiscreteBCTorchModule.from_model_config( - env.observation_space, - env.action_space, - model_config_dict={"fcnet_hiddens": [32]}, - ) - module2 = DiscreteBCTorchModule.from_model_config( - env.observation_space, - env.action_space, - model_config_dict={"fcnet_hiddens": [32]}, - ) - - multi_agent_dict = {"module1": module1, "module2": module2} - marl_module = MultiAgentRLModule(multi_agent_dict) - new_marl_module = marl_module.deserialize(marl_module.serialize()) - - self.assertNotEqual(id(marl_module), id(new_marl_module)) - self.assertEqual(set(marl_module.keys()), set(new_marl_module.keys())) - for key in marl_module.keys(): - self.assertNotEqual(id(marl_module[key]), id(new_marl_module[key])) - check(marl_module[key].get_state(), new_marl_module[key].get_state()) - if __name__ == 
"__main__": import pytest diff --git a/rllib/core/rl_module/tests/test_rl_module_specs.py b/rllib/core/rl_module/tests/test_rl_module_specs.py index 3f4230e0f5ad0..11f94aada99b0 100644 --- a/rllib/core/rl_module/tests/test_rl_module_specs.py +++ b/rllib/core/rl_module/tests/test_rl_module_specs.py @@ -10,12 +10,12 @@ from ray.rllib.core.testing.torch.bc_module import ( DiscreteBCTorchModule, BCTorchRLModuleWithSharedGlobalEncoder, - BCTorchMultiAgentSpec, + BCTorchMultiAgentModuleWithSharedEncoder, ) from ray.rllib.core.testing.tf.bc_module import ( DiscreteBCTFModule, BCTfRLModuleWithSharedGlobalEncoder, - BCTfMultiAgentSpec, + BCTfMultiAgentModuleWithSharedEncoder, ) MODULES = [DiscreteBCTorchModule, DiscreteBCTFModule] @@ -23,20 +23,10 @@ "torch": BCTorchRLModuleWithSharedGlobalEncoder, "tf": BCTfRLModuleWithSharedGlobalEncoder, } -CUSTOM_MARL_SPECS = {"torch": BCTorchMultiAgentSpec, "tf": BCTfMultiAgentSpec} - - -class BCRLModuleSpecCustom(SingleAgentRLModuleSpec): - """A customized SingleAgentRLModuleSpec.""" - - def build(self): - # this handles all implementation details - config = { - "input_dim": self.observation_space.shape[0], - "hidden_dim": self.model_config["fcnet_hiddens"][0], - "output_dim": self.action_space.n, - } - return self.module_class(**config) +CUSTOM_MARL_MODULES = { + "torch": BCTorchMultiAgentModuleWithSharedEncoder, + "tf": BCTfMultiAgentModuleWithSharedEncoder, +} class TestRLModuleSpecs(unittest.TestCase): @@ -48,47 +38,30 @@ def test_single_agent_spec(self): module_class=module_class, observation_space=env.observation_space, action_space=env.action_space, - model_config={"fcnet_hiddens": [64]}, + model_config_dict={"fcnet_hiddens": [64]}, ) module = spec.build() self.assertIsInstance(module, module_class) - def test_customized_single_agent_spec(self): - """Tests the a customized SingleAgentRLModuleSpec.""" - env = gym.make("CartPole-v1") - for module_class in MODULES: - - spec = BCRLModuleSpecCustom( - module_class=module_class, - 
observation_space=env.observation_space, - action_space=env.action_space, - model_config={"fcnet_hiddens": [64]}, - ) - module = spec.build() - self.assertIsInstance(module, module_class) - def test_multi_agent_spec(self): env = gym.make("CartPole-v1") num_agents = 2 - # make sure I use both default and cutomized single agent specs - single_agent_spec_classes = [SingleAgentRLModuleSpec, BCRLModuleSpecCustom] for module_class in MODULES: module_specs = {} for i in range(num_agents): - module_spec_cls = single_agent_spec_classes[i % num_agents] - module_specs[f"module_{i}"] = module_spec_cls( + module_specs[f"module_{i}"] = SingleAgentRLModuleSpec( module_class=module_class, observation_space=env.observation_space, action_space=env.action_space, - model_config={"fcnet_hiddens": [32 * (i + 1)]}, + model_config_dict={"fcnet_hiddens": [32 * (i + 1)]}, ) spec = MultiAgentRLModuleSpec(module_specs=module_specs) module = spec.build() self.assertIsInstance(module, MultiAgentRLModule) - def test_customized_multi_agent_spec(self): + def test_customized_multi_agent_module(self): """Tests creating a customized MARL BC module that owns a shared encoder.""" global_dim = 10 @@ -97,14 +70,14 @@ def test_customized_multi_agent_spec(self): # TODO (Kourosh): add tf support for fw in ["torch"]: - spec_cls = CUSTOM_MARL_SPECS[fw] - module_cls = CUSTOM_MODULES[fw] + marl_module_cls = CUSTOM_MARL_MODULES[fw] + rl_module_cls = CUSTOM_MODULES[fw] - spec = spec_cls( - marl_module_class=MultiAgentRLModule, + spec = MultiAgentRLModuleSpec( + marl_module_class=marl_module_cls, module_specs={ "agent_1": SingleAgentRLModuleSpec( - module_class=module_cls, + module_class=rl_module_cls, observation_space=gym.spaces.Dict( { "global": gym.spaces.Box( @@ -116,10 +89,10 @@ def test_customized_multi_agent_spec(self): } ), action_space=gym.spaces.Discrete(action_dims[0]), - model_config={"fcnet_hiddens": [128]}, + model_config_dict={"fcnet_hiddens": [128]}, ), "agent_2": SingleAgentRLModuleSpec( - 
module_class=module_cls, + module_class=rl_module_cls, observation_space=gym.spaces.Dict( { "global": gym.spaces.Box( @@ -131,7 +104,7 @@ def test_customized_multi_agent_spec(self): } ), action_space=gym.spaces.Discrete(action_dims[1]), - model_config={"fcnet_hiddens": [128]}, + model_config_dict={"fcnet_hiddens": [128]}, ), }, ) @@ -145,6 +118,156 @@ def test_customized_multi_agent_spec(self): foo.data = torch.ones_like(foo.data) self.assertTrue(torch.allclose(model["agent_2"].encoder[0].bias, foo)) + def test_get_spec_from_module_multi_agent(self): + """Tests wether MultiAgentRLModuleSpec.from_module() works.""" + env = gym.make("CartPole-v1") + num_agents = 2 + for module_class in MODULES: + module_specs = {} + for i in range(num_agents): + module_specs[f"module_{i}"] = SingleAgentRLModuleSpec( + module_class=module_class, + observation_space=env.observation_space, + action_space=env.action_space, + model_config_dict={"fcnet_hiddens": [32 * (i + 1)]}, + ) + + spec = MultiAgentRLModuleSpec(module_specs=module_specs) + module = spec.build() + + spec_from_module = MultiAgentRLModuleSpec.from_module(module) + self.assertEqual(spec, spec_from_module) + + def test_get_spec_from_module_single_agent(self): + """Tests wether SingleAgentRLModuleSpec.from_module() works.""" + env = gym.make("CartPole-v1") + for module_class in MODULES: + spec = SingleAgentRLModuleSpec( + module_class=module_class, + observation_space=env.observation_space, + action_space=env.action_space, + model_config_dict={"fcnet_hiddens": [32]}, + ) + + module = spec.build() + spec_from_module = SingleAgentRLModuleSpec.from_module(module) + self.assertEqual(spec, spec_from_module) + + def test_update_specs(self): + """Tests wether SingleAgentRLModuleSpec.update() works.""" + env = gym.make("CartPole-v0") + + # Test if SingleAgentRLModuleSpec.update() works. 
+ module_spec_1 = SingleAgentRLModuleSpec( + module_class=DiscreteBCTorchModule, + observation_space=env.observation_space, + action_space=env.action_space, + model_config_dict="Update me!", + ) + module_spec_2 = SingleAgentRLModuleSpec( + model_config_dict={"fcnet_hiddens": [32]} + ) + self.assertEqual(module_spec_1.model_config_dict, "Update me!") + module_spec_1.update(module_spec_2) + self.assertEqual(module_spec_1.model_config_dict, {"fcnet_hiddens": [32]}) + + def test_update_specs_multi_agent(self): + """Test if updating a SingleAgentRLModuleSpec in MultiAgentRLModuleSpec works. + + This tests if we can update a `model_config_dict` field through different + kinds of updates: + - Create a SingleAgentRLModuleSpec and update its model_config_dict. + - Create two MultiAgentRLModuleSpecs and update the first one with the + second one without overwriting it. + - Check if the updated MultiAgentRLModuleSpec does not(!) have the + updated model_config_dict. + - Create two MultiAgentRLModuleSpecs and update the first one with the + second one with overwriting it. + - Check if the updated MultiAgentRLModuleSpec has(!) the updated + model_config_dict. + + """ + env = gym.make("CartPole-v0") + + # Test if SingleAgentRLModuleSpec.update() works. + module_spec_1 = SingleAgentRLModuleSpec( + module_class=DiscreteBCTorchModule, + observation_space="Do not update me!", + action_space=env.action_space, + model_config_dict="Update me!", + ) + module_spec_2 = SingleAgentRLModuleSpec( + model_config_dict={"fcnet_hiddens": [32]}, + ) + + self.assertEqual(module_spec_1.model_config_dict, "Update me!") + module_spec_1.update(module_spec_2) + self.assertEqual(module_spec_1.module_class, DiscreteBCTorchModule) + self.assertEqual(module_spec_1.observation_space, "Do not update me!") + self.assertEqual(module_spec_1.action_space, env.action_space) + self.assertEqual( + module_spec_1.model_config_dict, module_spec_2.model_config_dict + ) + + # Redefine module_spec_1 for following tests. 
+ module_spec_1 = SingleAgentRLModuleSpec( + module_class=DiscreteBCTorchModule, + observation_space="Do not update me!", + action_space=env.action_space, + model_config_dict="Update me!", + ) + + marl_spec_1 = MultiAgentRLModuleSpec( + marl_module_class=BCTorchMultiAgentModuleWithSharedEncoder, + module_specs={"agent_1": module_spec_1}, + ) + marl_spec_2 = MultiAgentRLModuleSpec( + marl_module_class=BCTorchMultiAgentModuleWithSharedEncoder, + module_specs={"agent_1": module_spec_2}, + ) + + # Test if updating MultiAgentRLModuleSpec with overwriting works. This means + # that the single agent specs should be overwritten + self.assertEqual( + marl_spec_1.module_specs["agent_1"].model_config_dict, "Update me!" + ) + marl_spec_1.update(marl_spec_2, overwrite=True) + self.assertEqual(marl_spec_1.module_specs["agent_1"], module_spec_2) + + # Test if updating MultiAgentRLModuleSpec without overwriting works. This + # means that the single agent specs should not be overwritten + marl_spec_3 = MultiAgentRLModuleSpec( + marl_module_class=BCTorchMultiAgentModuleWithSharedEncoder, + module_specs={"agent_1": module_spec_1}, + ) + + self.assertEqual( + marl_spec_3.module_specs["agent_1"].observation_space, "Do not update me!" + ) + marl_spec_3.update(marl_spec_2, overwrite=False) + # If we would overwrite, we would replace the observation space even though + # it was None. This is not the case here. + self.assertEqual( + marl_spec_3.module_specs["agent_1"].observation_space, "Do not update me!" + ) + + # Test if updating with an additional SingleAgentRLModuleSpec works. 
+ module_spec_3 = SingleAgentRLModuleSpec( + module_class=DiscreteBCTorchModule, + observation_space=env.observation_space, + action_space=env.action_space, + model_config_dict="I'm new!", + ) + marl_spec_3 = MultiAgentRLModuleSpec( + marl_module_class=BCTorchMultiAgentModuleWithSharedEncoder, + module_specs={"agent_2": module_spec_3}, + ) + self.assertEqual(marl_spec_1.module_specs.get("agent_2"), None) + marl_spec_1.update(marl_spec_3) + self.assertEqual( + marl_spec_1.module_specs["agent_2"].model_config_dict, "I'm new!" + ) + if __name__ == "__main__": import pytest diff --git a/rllib/core/rl_module/tf/tests/test_tf_rl_module.py b/rllib/core/rl_module/tf/tests/test_tf_rl_module.py index 78164d37cd06d..98fe5e71765fb 100644 --- a/rllib/core/rl_module/tf/tests/test_tf_rl_module.py +++ b/rllib/core/rl_module/tf/tests/test_tf_rl_module.py @@ -1,13 +1,13 @@ import gymnasium as gym import tensorflow as tf import tensorflow_probability as tfp -import threading +import tempfile from typing import Mapping import unittest +from ray.rllib.core.rl_module.rl_module import RLModuleConfig from ray.rllib.core.rl_module.tf.tf_rl_module import TfRLModule from ray.rllib.core.testing.tf.bc_module import DiscreteBCTFModule -from ray.rllib.utils.error import NotSerializable from ray.rllib.utils.test_utils import check @@ -15,10 +15,12 @@ class TestRLModule(unittest.TestCase): def test_compilation(self): env = gym.make("CartPole-v1") - module = DiscreteBCTFModule.from_model_config( - env.observation_space, - env.action_space, - model_config_dict={"fcnet_hiddens": [32]}, + module = DiscreteBCTFModule( + config=RLModuleConfig( + env.observation_space, + env.action_space, + model_config_dict={"fcnet_hiddens": [32]}, + ) ) self.assertIsInstance(module, TfRLModule) @@ -27,12 +29,13 @@ def test_forward_train(self): bsize = 1024 env = gym.make("CartPole-v1") - module = DiscreteBCTFModule.from_model_config( - env.observation_space, - env.action_space, - model_config_dict={"fcnet_hiddens": 
[32]}, + module = DiscreteBCTFModule( + config=RLModuleConfig( + env.observation_space, + env.action_space, + model_config_dict={"fcnet_hiddens": [32]}, + ) ) - obs_shape = env.observation_space.shape obs = tf.random.uniform((bsize,) + obs_shape) actions = tf.stack( @@ -59,10 +62,12 @@ def test_forward(self): """Test forward inference and exploration of""" env = gym.make("CartPole-v1") - module = DiscreteBCTFModule.from_model_config( - env.observation_space, - env.action_space, - model_config_dict={"fcnet_hiddens": [32]}, + module = DiscreteBCTFModule( + config=RLModuleConfig( + env.observation_space, + env.action_space, + model_config_dict={"fcnet_hiddens": [32]}, + ) ) obs_shape = env.observation_space.shape @@ -75,19 +80,23 @@ def test_forward(self): def test_get_set_state(self): env = gym.make("CartPole-v1") - module = DiscreteBCTFModule.from_model_config( - env.observation_space, - env.action_space, - model_config_dict={"fcnet_hiddens": [32]}, + module = DiscreteBCTFModule( + config=RLModuleConfig( + env.observation_space, + env.action_space, + model_config_dict={"fcnet_hiddens": [32]}, + ) ) state = module.get_state() self.assertIsInstance(state, dict) - module2 = DiscreteBCTFModule.from_model_config( - env.observation_space, - env.action_space, - model_config_dict={"fcnet_hiddens": [32]}, + module2 = DiscreteBCTFModule( + config=RLModuleConfig( + env.observation_space, + env.action_space, + model_config_dict={"fcnet_hiddens": [32]}, + ) ) state2 = module2.get_state() check(state["policy"][0], state2["policy"][0], false=True) @@ -96,42 +105,22 @@ def test_get_set_state(self): state2_after = module2.get_state() check(state, state2_after) - def test_serialize_deserialize(self): + def test_checkpointing(self): env = gym.make("CartPole-v1") - module = DiscreteBCTFModule.from_model_config( - env.observation_space, - env.action_space, - model_config_dict={"fcnet_hiddens": [32]}, + module = DiscreteBCTFModule( + config=RLModuleConfig( + env.observation_space, + 
env.action_space, + model_config_dict={"fcnet_hiddens": [32]}, + ) ) + with tempfile.TemporaryDirectory() as tmpdir: + module.save_to_checkpoint(tmpdir) + new_module = DiscreteBCTFModule.from_checkpoint(tmpdir) - # create a new module from the old module - new_module = module.deserialize(module.serialize()) - - # check that the new module is the same type - self.assertIsInstance(new_module, type(module)) - - # check that a parameter of their's is the same - self.assertEqual(new_module._input_dim, module._input_dim) - - # check that their states are the same check(module.get_state(), new_module.get_state()) - - # check that these 2 objects are not the same object self.assertNotEqual(id(module), id(new_module)) - # check that unpickleable parameters are not allowed by the RL Module - # constructor - unpickleable_param = threading.Thread() - - def bad_constructor(): - return DiscreteBCTFModule( - input_dim=unpickleable_param, - hidden_dim=unpickleable_param, - output_dim=unpickleable_param, - ) - - self.assertRaises(NotSerializable, bad_constructor) - if __name__ == "__main__": import pytest diff --git a/rllib/core/rl_module/tf/tf_rl_module.py b/rllib/core/rl_module/tf/tf_rl_module.py index b746242c4ae37..3322b2fb7f1f2 100644 --- a/rllib/core/rl_module/tf/tf_rl_module.py +++ b/rllib/core/rl_module/tf/tf_rl_module.py @@ -1,4 +1,5 @@ -from typing import Any, Mapping +import pathlib +from typing import Any, Mapping, Union from ray.rllib.core.rl_module import RLModule from ray.rllib.utils.annotations import override @@ -42,6 +43,20 @@ def get_state(self) -> Mapping[str, Any]: def set_state(self, state_dict: Mapping[str, Any]) -> None: self.set_weights(state_dict) + @override(RLModule) + def save_state_to_file(self, path: Union[str, pathlib.Path]) -> str: + if isinstance(path, str): + path = pathlib.Path(path) + module_state_dir = path / "module_state" + module_state_dir.mkdir(parents=True, exist_ok=True) + module_state_path = module_state_dir / "module_state" + 
self.save_weights(str(module_state_path), save_format="tf") + return str(module_state_path) + + @override(RLModule) + def load_state_from_file(self, path: Union[str, pathlib.Path]) -> None: + self.load_weights(path) + @override(RLModule) def make_distributed(self, dist_config: Mapping[str, Any] = None) -> None: """Makes the module distributed.""" diff --git a/rllib/core/rl_module/torch/tests/test_torch_rl_module.py b/rllib/core/rl_module/torch/tests/test_torch_rl_module.py index 7f2232f16c1d8..61747b906627a 100644 --- a/rllib/core/rl_module/torch/tests/test_torch_rl_module.py +++ b/rllib/core/rl_module/torch/tests/test_torch_rl_module.py @@ -1,12 +1,12 @@ -import threading import gymnasium as gym +import tempfile import torch from typing import Mapping import unittest +from ray.rllib.core.rl_module.rl_module import RLModuleConfig from ray.rllib.core.rl_module.torch import TorchRLModule from ray.rllib.core.testing.torch.bc_module import DiscreteBCTorchModule -from ray.rllib.utils.error import NotSerializable from ray.rllib.utils.test_utils import check @@ -14,10 +14,12 @@ class TestRLModule(unittest.TestCase): def test_compilation(self): env = gym.make("CartPole-v1") - module = DiscreteBCTorchModule.from_model_config( - env.observation_space, - env.action_space, - model_config_dict={"fcnet_hiddens": [32]}, + module = DiscreteBCTorchModule( + config=RLModuleConfig( + env.observation_space, + env.action_space, + model_config_dict={"fcnet_hiddens": [32]}, + ) ) self.assertIsInstance(module, TorchRLModule) @@ -26,10 +28,12 @@ def test_forward_train(self): bsize = 1024 env = gym.make("CartPole-v1") - module = DiscreteBCTorchModule.from_model_config( - env.observation_space, - env.action_space, - model_config_dict={"fcnet_hiddens": [32]}, + module = DiscreteBCTorchModule( + config=RLModuleConfig( + env.observation_space, + env.action_space, + model_config_dict={"fcnet_hiddens": [32]}, + ) ) obs_shape = env.observation_space.shape @@ -54,10 +58,12 @@ def 
test_forward(self): """Test forward inference and exploration of""" env = gym.make("CartPole-v1") - module = DiscreteBCTorchModule.from_model_config( - env.observation_space, - env.action_space, - model_config_dict={"fcnet_hiddens": [32]}, + module = DiscreteBCTorchModule( + config=RLModuleConfig( + env.observation_space, + env.action_space, + model_config_dict={"fcnet_hiddens": [32]}, + ) ) obs_shape = env.observation_space.shape @@ -70,19 +76,23 @@ def test_forward(self): def test_get_set_state(self): env = gym.make("CartPole-v1") - module = DiscreteBCTorchModule.from_model_config( - env.observation_space, - env.action_space, - model_config_dict={"fcnet_hiddens": [32]}, + module = DiscreteBCTorchModule( + config=RLModuleConfig( + env.observation_space, + env.action_space, + model_config_dict={"fcnet_hiddens": [32]}, + ) ) state = module.get_state() self.assertIsInstance(state, dict) - module2 = DiscreteBCTorchModule.from_model_config( - env.observation_space, - env.action_space, - model_config_dict={"fcnet_hiddens": [32]}, + module2 = DiscreteBCTorchModule( + config=RLModuleConfig( + env.observation_space, + env.action_space, + model_config_dict={"fcnet_hiddens": [32]}, + ) ) state2 = module2.get_state() check(state, state2, false=True) @@ -91,42 +101,22 @@ def test_get_set_state(self): state2_after = module2.get_state() check(state, state2_after) - def test_serialize_deserialize(self): + def test_checkpointing(self): env = gym.make("CartPole-v1") - module = DiscreteBCTorchModule.from_model_config( - env.observation_space, - env.action_space, - model_config_dict={"fcnet_hiddens": [32]}, + module = DiscreteBCTorchModule( + config=RLModuleConfig( + env.observation_space, + env.action_space, + model_config_dict={"fcnet_hiddens": [32]}, + ) ) + with tempfile.TemporaryDirectory() as tmpdir: + module.save_to_checkpoint(tmpdir) + new_module = DiscreteBCTorchModule.from_checkpoint(tmpdir) - # create a new module from the old module - new_module = 
module.deserialize(module.serialize()) - - # check that the new module is the same type - self.assertIsInstance(new_module, type(module)) - - # check that a parameter of their's is the same - self.assertEqual(new_module.input_dim, module.input_dim) - - # check that their states are the same check(module.get_state(), new_module.get_state()) - - # check that these 2 objects are not the same object self.assertNotEqual(id(module), id(new_module)) - # check that unpickleable parameters are not allowed by the RL Module - # constructor - unpickleable_param = threading.Thread() - - def bad_constructor(): - return DiscreteBCTorchModule( - input_dim=unpickleable_param, - hidden_dim=unpickleable_param, - output_dim=unpickleable_param, - ) - - self.assertRaises(NotSerializable, bad_constructor) - if __name__ == "__main__": import pytest diff --git a/rllib/core/rl_module/torch/torch_rl_module.py b/rllib/core/rl_module/torch/torch_rl_module.py index dcf4e5759678e..03741f8a015f3 100644 --- a/rllib/core/rl_module/torch/torch_rl_module.py +++ b/rllib/core/rl_module/torch/torch_rl_module.py @@ -1,7 +1,9 @@ -from typing import Any, Mapping +import pathlib +from typing import Any, Mapping, Union + +from ray.rllib.core.rl_module import RLModule from ray.rllib.utils.annotations import override from ray.rllib.utils.framework import try_import_torch -from ray.rllib.core.rl_module import RLModule torch, nn = try_import_torch() @@ -27,6 +29,24 @@ def get_state(self) -> Mapping[str, Any]: def set_state(self, state_dict: Mapping[str, Any]) -> None: self.load_state_dict(state_dict) + @override(RLModule) + def save_state_to_file(self, path: Union[str, pathlib.Path]) -> str: + if isinstance(path, str): + path = pathlib.Path(path) + module_state_path = path / "module_state.pt" + torch.save(self.state_dict(), str(module_state_path)) + return str(module_state_path) + + @override(RLModule) + def load_state_from_file(self, path: Union[str, pathlib.Path]) -> None: + if isinstance(path, str): + path = 
pathlib.Path(path) + if not path.exists(): + raise ValueError( + f"While loading state from path, the path does not exist: {path}" + ) + self.set_state(torch.load(str(path))) + @override(RLModule) def make_distributed(self, dist_config: Mapping[str, Any] = None) -> None: """Makes the module distributed.""" @@ -40,10 +60,10 @@ def is_distributed(self) -> bool: return False -class TorchDDPRLModule(nn.parallel.DistributedDataParallel, RLModule): +class TorchDDPRLModule(RLModule, nn.parallel.DistributedDataParallel): def __init__(self, *args, **kwargs) -> None: nn.parallel.DistributedDataParallel.__init__(self, *args, **kwargs) - # we do not want to call RLModule.__init__ here because it will all we need is + # we do not want to call RLModule.__init__ here because all we need is # the interface of that base-class not the actual implementation. @override(RLModule) diff --git a/rllib/core/testing/tests/test_bc_algorithm.py b/rllib/core/testing/tests/test_bc_algorithm.py index 591889699dba1..b3eeb21b61cb0 100644 --- a/rllib/core/testing/tests/test_bc_algorithm.py +++ b/rllib/core/testing/tests/test_bc_algorithm.py @@ -4,15 +4,16 @@ import ray from ray.rllib.core.testing.torch.bc_module import ( DiscreteBCTorchModule, - BCTorchMultiAgentSpec, BCTorchRLModuleWithSharedGlobalEncoder, + BCTorchMultiAgentModuleWithSharedEncoder, ) from ray.rllib.core.testing.tf.bc_module import ( DiscreteBCTFModule, - BCTfMultiAgentSpec, BCTfRLModuleWithSharedGlobalEncoder, + BCTfMultiAgentModuleWithSharedEncoder, ) from ray.rllib.core.rl_module.rl_module import SingleAgentRLModuleSpec +from ray.rllib.core.rl_module.marl_module import MultiAgentRLModuleSpec from ray.rllib.core.testing.bc_algorithm import BCConfigTest from ray.rllib.utils.test_utils import framework_iterator from ray.rllib.examples.env.multi_agent import MultiAgentCartPole @@ -79,16 +80,18 @@ def test_bc_algorithm_w_custom_marl_module(self): for fw in ["torch"]: if fw == "torch": - spec = BCTorchMultiAgentSpec( + spec = 
MultiAgentRLModuleSpec( + marl_module_class=BCTorchMultiAgentModuleWithSharedEncoder, module_specs=SingleAgentRLModuleSpec( module_class=BCTorchRLModuleWithSharedGlobalEncoder - ) + ), ) else: - spec = BCTfMultiAgentSpec( + spec = MultiAgentRLModuleSpec( + marl_module_class=BCTfMultiAgentModuleWithSharedEncoder, module_specs=SingleAgentRLModuleSpec( module_class=BCTfRLModuleWithSharedGlobalEncoder - ) + ), ) policies = {"policy_1", "policy_2"} diff --git a/rllib/core/testing/tf/bc_module.py b/rllib/core/testing/tf/bc_module.py index 6147642cb9253..e776ab469072e 100644 --- a/rllib/core/testing/tf/bc_module.py +++ b/rllib/core/testing/tf/bc_module.py @@ -1,10 +1,12 @@ -import gymnasium as gym import tensorflow as tf import tensorflow_probability as tfp -from typing import Any, Mapping, Optional +from typing import Any, Mapping -from ray.rllib.core.rl_module.rl_module import RLModule -from ray.rllib.core.rl_module.marl_module import MultiAgentRLModuleSpec, ModuleID +from ray.rllib.core.rl_module.rl_module import RLModule, RLModuleConfig +from ray.rllib.core.rl_module.marl_module import ( + MultiAgentRLModule, + MultiAgentRLModuleConfig, +) from ray.rllib.core.rl_module.tf.tf_rl_module import TfRLModule from ray.rllib.models.specs.typing import SpecType from ray.rllib.policy.sample_batch import SampleBatch @@ -13,15 +15,12 @@ class DiscreteBCTFModule(TfRLModule): - def __init__( - self, - input_dim: int, - hidden_dim: int, - output_dim: int, - ) -> None: - super().__init__( - input_dim=input_dim, output_dim=output_dim, hidden_dim=hidden_dim - ) + def __init__(self, config: RLModuleConfig) -> None: + super().__init__(config) + + input_dim = self.config.observation_space.shape[0] + hidden_dim = self.config.model_config_dict["fcnet_hiddens"][0] + output_dim = self.config.action_space.n layers = [] layers.append(tf.keras.Input(shape=(input_dim,))) @@ -72,24 +71,6 @@ def get_state(self) -> Mapping[str, Any]: def set_state(self, state: Mapping[str, Any]) -> None: 
self.policy.set_weights(state["policy"]) - @classmethod - @override(RLModule) - def from_model_config( - cls, - observation_space: "gym.Space", - action_space: "gym.Space", - *, - model_config_dict: Mapping[str, Any], - ) -> "DiscreteBCTFModule": - - config = { - "input_dim": observation_space.shape[0], - "hidden_dim": model_config_dict["fcnet_hiddens"][0], - "output_dim": action_space.n, - } - - return cls(**config) - class BCTfRLModuleWithSharedGlobalEncoder(TfRLModule): def __init__(self, encoder, local_dim, hidden_dim, action_dim): @@ -133,15 +114,16 @@ def _common_forward(self, batch): return {"action_dist": tf.distributions.Categorical(logits=action_logits)} -class BCTfMultiAgentSpec(MultiAgentRLModuleSpec): - def build(self, module_id: Optional[ModuleID] = None): +class BCTfMultiAgentModuleWithSharedEncoder(MultiAgentRLModule): + def __init__(self, config: MultiAgentRLModuleConfig) -> None: + super().__init__(config) - self._check_before_build() # constructing the global encoder based on the observation_space of the first # module - module_spec = next(iter(self.module_specs.values())) + module_specs = self.config.modules + module_spec = next(iter(module_specs.values())) global_dim = module_spec.observation_space["global"].shape[0] - hidden_dim = module_spec.model_config["fcnet_hiddens"][0] + hidden_dim = module_spec.model_config_dict["fcnet_hiddens"][0] shared_encoder = tf.keras.Sequential( [ tf.keras.Input(shape=(global_dim,)), @@ -150,21 +132,18 @@ def build(self, module_id: Optional[ModuleID] = None): ] ) - if module_id: - return module_spec.module_class( + for module_id, module_spec in module_specs.items(): + self._rl_modules[module_id] = module_spec.module_class( encoder=shared_encoder, local_dim=module_spec.observation_space["local"].shape[0], hidden_dim=hidden_dim, action_dim=module_spec.action_space.n, ) - rl_modules = {} - for module_id, module_spec in self.module_specs.items(): - rl_modules[module_id] = module_spec.module_class( - 
encoder=shared_encoder, - local_dim=module_spec.observation_space["local"].shape[0], - hidden_dim=hidden_dim, - action_dim=module_spec.action_space.n, - ) + def serialize(self): + # TODO (Kourosh): Implement when needed. + raise NotImplementedError - return self.marl_module_class(rl_modules) + def deserialize(self, data): + # TODO (Kourosh): Implement when needed. + raise NotImplementedError diff --git a/rllib/core/testing/torch/bc_module.py b/rllib/core/testing/torch/bc_module.py index 06291d999e107..ff4f5abc281b0 100644 --- a/rllib/core/testing/torch/bc_module.py +++ b/rllib/core/testing/torch/bc_module.py @@ -1,8 +1,10 @@ -import gymnasium as gym -from typing import Any, Mapping, Optional +from typing import Any, Mapping -from ray.rllib.core.rl_module import RLModule -from ray.rllib.core.rl_module.marl_module import MultiAgentRLModuleSpec, ModuleID +from ray.rllib.core.rl_module.rl_module import RLModule, RLModuleConfig +from ray.rllib.core.rl_module.marl_module import ( + MultiAgentRLModuleConfig, + MultiAgentRLModule, +) from ray.rllib.core.rl_module.torch.torch_rl_module import TorchRLModule from ray.rllib.models.specs.typing import SpecType from ray.rllib.utils.annotations import override @@ -13,15 +15,13 @@ class DiscreteBCTorchModule(TorchRLModule): - def __init__( - self, - input_dim: int, - hidden_dim: int, - output_dim: int, - ) -> None: - super().__init__( - input_dim=input_dim, hidden_dim=hidden_dim, output_dim=output_dim - ) + def __init__(self, config: RLModuleConfig) -> None: + super().__init__(config) + + input_dim = self.config.observation_space.shape[0] + hidden_dim = self.config.model_config_dict["fcnet_hiddens"][0] + output_dim = self.config.action_space.n + self.policy = nn.Sequential( nn.Linear(input_dim, hidden_dim), nn.ReLU(), @@ -57,24 +57,6 @@ def _forward_train(self, batch: NestedDict) -> Mapping[str, Any]: action_logits = self.policy(batch["obs"]) return {"action_dist": torch.distributions.Categorical(logits=action_logits)} - 
@classmethod - @override(RLModule) - def from_model_config( - cls, - observation_space: "gym.Space", - action_space: "gym.Space", - *, - model_config_dict: Mapping[str, Any], - ) -> "DiscreteBCTorchModule": - - config = { - "input_dim": observation_space.shape[0], - "hidden_dim": model_config_dict["fcnet_hiddens"][0], - "output_dim": action_space.n, - } - - return cls(**config) - class BCTorchRLModuleWithSharedGlobalEncoder(TorchRLModule): """An example of an RLModule that uses an encoder shared with other things. @@ -92,7 +74,7 @@ class BCTorchRLModuleWithSharedGlobalEncoder(TorchRLModule): def __init__( self, encoder: nn.Module, local_dim: int, hidden_dim: int, action_dim: int ) -> None: - super().__init__() + super().__init__(config=None) self.encoder = encoder self.policy_head = nn.Sequential( @@ -128,34 +110,24 @@ def _common_forward(self, batch): return {"action_dist": torch.distributions.Categorical(logits=action_logits)} -class BCTorchMultiAgentSpec(MultiAgentRLModuleSpec): +class BCTorchMultiAgentModuleWithSharedEncoder(MultiAgentRLModule): + def __init__(self, config: MultiAgentRLModuleConfig) -> None: + super().__init__(config) - # TODO: make sure the default class is MultiAgentRLModule + def build(self): - def build(self, module_id: Optional[ModuleID] = None): - - self._check_before_build() - # constructing the global encoder based on the observation_space of the first - # module - module_spec = next(iter(self.module_specs.values())) + module_specs = self.config.modules + module_spec = next(iter(module_specs.values())) global_dim = module_spec.observation_space["global"].shape[0] - hidden_dim = module_spec.model_config["fcnet_hiddens"][0] + hidden_dim = module_spec.model_config_dict["fcnet_hiddens"][0] shared_encoder = nn.Sequential( nn.Linear(global_dim, hidden_dim), nn.ReLU(), nn.Linear(hidden_dim, hidden_dim), ) - if module_id: - return module_spec.module_class( - encoder=shared_encoder, - local_dim=module_spec.observation_space["local"].shape[0], - 
hidden_dim=hidden_dim, - action_dim=module_spec.action_space.n, - ) - rl_modules = {} - for module_id, module_spec in self.module_specs.items(): + for module_id, module_spec in module_specs.items(): rl_modules[module_id] = module_spec.module_class( encoder=shared_encoder, local_dim=module_spec.observation_space["local"].shape[0], @@ -163,4 +135,12 @@ def build(self, module_id: Optional[ModuleID] = None): action_dim=module_spec.action_space.n, ) - return self.marl_module_class(rl_modules) + self._rl_modules = rl_modules + + def serialize(self): + # TODO (Kourosh): Implement when needed. + raise NotImplementedError + + def deserialize(self, data): + # TODO (Kourosh): Implement when needed. + raise NotImplementedError diff --git a/rllib/core/testing/utils.py b/rllib/core/testing/utils.py index cc6b4258b0a09..c572466f79cdf 100644 --- a/rllib/core/testing/utils.py +++ b/rllib/core/testing/utils.py @@ -61,7 +61,7 @@ def get_module_spec(framework: str, env: "gym.Env", is_multi_agent: bool = False module_class=get_module_class(framework), observation_space=env.observation_space, action_space=env.action_space, - model_config={"fcnet_hiddens": [32]}, + model_config_dict={"fcnet_hiddens": [32]}, ) if is_multi_agent: diff --git a/rllib/env/multi_agent_env.py b/rllib/env/multi_agent_env.py index a65b8c3160736..081a5eae76060 100644 --- a/rllib/env/multi_agent_env.py +++ b/rllib/env/multi_agent_env.py @@ -662,11 +662,14 @@ def send_actions(self, action_dict: MultiEnvDict) -> None: assert isinstance(terminateds, dict), "Not a multi-agent terminateds dict!" assert isinstance(truncateds, dict), "Not a multi-agent truncateds dict!" assert isinstance(infos, dict), "Not a multi-agent info dict!" 
- if isinstance(obs, dict) and set(infos).difference(set(obs)): - raise ValueError( - "Key set for infos must be a subset of obs: " - "{} vs {}".format(infos.keys(), obs.keys()) - ) + if isinstance(obs, dict): + info_diff = set(infos).difference(set(obs)) + if info_diff and info_diff != {"__common__"}: + raise ValueError( + "Key set for infos must be a subset of obs (plus optionally " + "the '__common__' key for infos concerning all/no agents): " + "{} vs {}".format(infos.keys(), obs.keys()) + ) if "__all__" not in terminateds: raise ValueError( "In multi-agent environments, '__all__': True|False must " diff --git a/rllib/env/wrappers/atari_wrappers.py b/rllib/env/wrappers/atari_wrappers.py index a4bdb554bd87f..b836961669d92 100644 --- a/rllib/env/wrappers/atari_wrappers.py +++ b/rllib/env/wrappers/atari_wrappers.py @@ -212,7 +212,9 @@ def __init__(self, env, skip=4): """Return only every `skip`-th frame""" gym.Wrapper.__init__(self, env) # most recent raw observations (for max pooling across time steps) - self._obs_buffer = np.zeros((2,) + env.observation_space.shape, dtype=np.uint8) + self._obs_buffer = np.zeros( + (2,) + env.observation_space.shape, dtype=env.observation_space.dtype + ) self._skip = skip def step(self, action): diff --git a/rllib/models/tf/tf_action_dist.py b/rllib/models/tf/tf_action_dist.py index c33729618b945..a28ac4cce4225 100644 --- a/rllib/models/tf/tf_action_dist.py +++ b/rllib/models/tf/tf_action_dist.py @@ -197,7 +197,7 @@ def required_model_output_shape( high_ = np.max(action_space.high) assert np.all(action_space.low == low_) assert np.all(action_space.high == high_) - np.prod(action_space.shape, dtype=np.int32) * (high_ - low_ + 1) + return np.prod(action_space.shape, dtype=np.int32) * (high_ - low_ + 1) # MultiDiscrete space. else: # nvec is already integer, so no casting needed. 
diff --git a/rllib/models/tf/tf_distributions.py b/rllib/models/tf/tf_distributions.py index 26f8e2b55c834..91cc995722e06 100644 --- a/rllib/models/tf/tf_distributions.py +++ b/rllib/models/tf/tf_distributions.py @@ -31,7 +31,7 @@ def __init__(self, *args, **kwargs): self._dist = self._get_tf_distribution(*args, **kwargs) @abc.abstractmethod - def _get_tf_distribution(self, *args, **kwargs) -> tfp.distributions.Distribution: + def _get_tf_distribution(self, *args, **kwargs) -> "tfp.distributions.Distribution": """Returns the tfp.distributions.Distribution object to use.""" @override(Distribution) @@ -118,7 +118,7 @@ def _get_tf_distribution( probs: tf.Tensor = None, logits: tf.Tensor = None, temperature: float = 1.0, - ) -> tfp.distributions.Distribution: + ) -> "tfp.distributions.Distribution": if logits is not None: assert temperature > 0.0, "Categorical `temperature` must be > 0.0!" logits /= temperature @@ -129,7 +129,7 @@ def _get_tf_distribution( def required_model_output_shape( space: gym.Space, model_config: ModelConfigDict ) -> Tuple[int, ...]: - return (space.n,) + return (int(space.n),) @override(TfDistribution) def _rsample(self, sample_shape=()): @@ -179,7 +179,7 @@ def __init__( super().__init__(loc=loc, scale=scale) @override(TfDistribution) - def _get_tf_distribution(self, loc, scale=None) -> tfp.distributions.Distribution: + def _get_tf_distribution(self, loc, scale=None) -> "tfp.distributions.Distribution": if scale is None: loc, log_scale = tf.split(loc, num_or_size_splits=2, axis=-1) scale = tf.exp(log_scale) @@ -202,7 +202,7 @@ def kl(self, other: "TfDistribution") -> TensorType: def required_model_output_shape( space: gym.Space, model_config: ModelConfigDict ) -> Tuple[int, ...]: - return tuple(np.prod(space.shape, dtype=np.int32) * 2) + return (int(np.prod(space.shape, dtype=np.int32) * 2),) @override(TfDistribution) def _rsample(self, sample_shape=()): @@ -283,7 +283,7 @@ def required_model_output_shape( space: gym.Space, model_config: 
ModelConfigDict ) -> Tuple[int, ...]: # TODO: This was copied from previous code. Is this correct? add unit test. - return tuple(np.prod(space.shape, dtype=np.int32)) + return (int(np.prod(space.shape, dtype=np.int32)),) @classmethod @override(Distribution) diff --git a/rllib/models/torch/torch_action_dist.py b/rllib/models/torch/torch_action_dist.py index dadbec72f2f1c..12f725b9e10d6 100644 --- a/rllib/models/torch/torch_action_dist.py +++ b/rllib/models/torch/torch_action_dist.py @@ -182,7 +182,7 @@ def required_model_output_shape( high_ = np.max(action_space.high) assert np.all(action_space.low == low_) assert np.all(action_space.high == high_) - np.prod(action_space.shape, dtype=np.int32) * (high_ - low_ + 1) + return np.prod(action_space.shape, dtype=np.int32) * (high_ - low_ + 1) # MultiDiscrete space. else: # `nvec` is already integer. No need to cast. diff --git a/rllib/models/torch/torch_distributions.py b/rllib/models/torch/torch_distributions.py index 0cf9c5258081e..f70bde3c68cc6 100644 --- a/rllib/models/torch/torch_distributions.py +++ b/rllib/models/torch/torch_distributions.py @@ -28,7 +28,7 @@ def __init__(self, *args, **kwargs): @abc.abstractmethod def _get_torch_distribution( self, *args, **kwargs - ) -> torch.distributions.Distribution: + ) -> "torch.distributions.Distribution": """Returns the torch.distributions.Distribution object to use.""" @override(Distribution) @@ -108,7 +108,7 @@ def _get_torch_distribution( probs: torch.Tensor = None, logits: torch.Tensor = None, temperature: float = 1.0, - ) -> torch.distributions.Distribution: + ) -> "torch.distributions.Distribution": if logits is not None: assert temperature > 0.0, "Categorical `temperature` must be > 0.0!" 
logits /= temperature @@ -119,7 +119,7 @@ def _get_torch_distribution( def required_model_output_shape( space: gym.Space, model_config: ModelConfigDict ) -> Tuple[int, ...]: - return (space.n,) + return (int(space.n),) @classmethod @override(Distribution) @@ -165,7 +165,7 @@ def __init__( def _get_torch_distribution( self, loc, scale=None - ) -> torch.distributions.Distribution: + ) -> "torch.distributions.Distribution": if scale is None: loc, log_std = torch.chunk(self.inputs, 2, dim=1) scale = torch.exp(log_std) @@ -188,7 +188,7 @@ def kl(self, other: "TorchDistribution") -> TensorType: def required_model_output_shape( space: gym.Space, model_config: ModelConfigDict ) -> Tuple[int, ...]: - return tuple(np.prod(space.shape, dtype=np.int32) * 2) + return (int(np.prod(space.shape, dtype=np.int32) * 2),) @classmethod @override(Distribution) @@ -268,7 +268,7 @@ def required_model_output_shape( space: gym.Space, model_config: ModelConfigDict ) -> Tuple[int, ...]: # TODO: This was copied from previous code. Is this correct? add unit test. - return tuple(np.prod(space.shape, dtype=np.int32)) + return (int(np.prod(space.shape, dtype=np.int32)),) @classmethod @override(Distribution) diff --git a/rllib/policy/policy.py b/rllib/policy/policy.py index 0c6ab850f6b8e..567b134891a48 100644 --- a/rllib/policy/policy.py +++ b/rllib/policy/policy.py @@ -392,11 +392,18 @@ def make_rl_module(self) -> "RLModule": "bug, please file a github issue." 
) - module_spec = self.config["__marl_module_spec"] - if isinstance(module_spec, SingleAgentRLModuleSpec): - module = module_spec.build() + spec = self.config["__marl_module_spec"] + if isinstance(spec, SingleAgentRLModuleSpec): + module = spec.build() else: - module = module_spec.build(module_id=self.__policy_id) + # filter the module_spec to only contain the policy_id of this policy + marl_spec = type(spec)( + marl_module_class=spec.marl_module_class, + module_specs={self.__policy_id: spec.module_specs[self.__policy_id]}, + ) + marl_module = marl_spec.build() + module = marl_module[self.__policy_id] + return module @DeveloperAPI @@ -1341,7 +1348,10 @@ def _initialize_loss_from_dummy_batch( # Save for later so that loss init does not change global timestep global_ts_before_init = int(convert_to_numpy(self.global_timestep)) - sample_batch_size = max(self.batch_divisibility_req * 4, 32) + sample_batch_size = min( + max(self.batch_divisibility_req * 4, 32), + self.config["train_batch_size"], # Don't go over the asked batch size. 
+ ) self._dummy_batch = self._get_dummy_batch_from_view_requirements( sample_batch_size ) diff --git a/rllib/utils/framework.py b/rllib/utils/framework.py index 226b2a441a3d8..d5cc2ce9ca270 100644 --- a/rllib/utils/framework.py +++ b/rllib/utils/framework.py @@ -26,7 +26,7 @@ def try_import_jax(error: bool = False): """ if "RLLIB_TEST_NO_JAX_IMPORT" in os.environ: logger.warning("Not importing JAX for test purposes.") - return None + return None, None try: import jax diff --git a/rllib/utils/serialization.py b/rllib/utils/serialization.py index 90560c1b1b225..5ac2fd5918694 100644 --- a/rllib/utils/serialization.py +++ b/rllib/utils/serialization.py @@ -1,8 +1,9 @@ import base64 import numpy as np import io +import importlib import zlib -from typing import Dict, Any, Sequence +from typing import Dict, Any, Optional, Sequence, Type, Union import ray from ray.rllib.utils.annotations import DeveloperAPI @@ -73,17 +74,17 @@ def _box(sp: gym.spaces.Box) -> Dict: def _discrete(sp: gym.spaces.Discrete) -> Dict: d = { "space": "discrete", - "n": sp.n, + "n": int(sp.n), } # Offset is a relatively new Discrete space feature. if hasattr(sp, "start"): - d["start"] = sp.start + d["start"] = int(sp.start) return d def _multi_binary(sp: gym.spaces.MultiBinary) -> Dict: return { "space": "multi-binary", - "n": sp.n, + "n": int(sp.n), "dtype": sp.dtype.str, } @@ -325,3 +326,59 @@ def check_if_args_kwargs_serializable(args: Sequence[Any], kwargs: Dict[str, Any f"Found non-serializable keyword argument: {k} = {v}.\n" f"Original serialization error: {e}" ) + + +@DeveloperAPI +def serialize_type(type_: Union[Type, str]) -> str: + """Converts a type into its full classpath ([module file] + "." + [class name]). + Args: + type_: The type to convert. + Returns: + The full classpath of the given type, e.g. "ray.rllib.algorithms.ppo.PPOConfig". + """ + # TODO (avnishn): find a way to incorporate the tune registry here. + # Already serialized. 
+ if isinstance(type_, str): + return type_ + + return type_.__module__ + "." + type_.__qualname__ + + +@DeveloperAPI +def deserialize_type( + module: Union[str, Type], error: bool = False +) -> Optional[Union[str, Type]]: + """Resolves a class path to a class. + If the given module is already a class, it is returned as is. + If the given module is a string, it is imported and the class is returned. + Args: + module: The classpath (str) or type to resolve. + error: Whether to throw a ValueError if `module` could not be resolved into + a class. If False and `module` is not resolvable, returns None. + Returns: + The resolved class or `module` (if `error` is False and no resolution possible). + Raises: + ValueError: If `error` is True and `module` cannot be resolved. + """ + if isinstance(module, type): + return module + + elif isinstance(module, str): + # Try interpreting (as classpath) and importing the given module. + try: + module_path, class_name = module.rsplit(".", 1) + module = importlib.import_module(module_path) + return getattr(module, class_name) + # Module not found. + except (ModuleNotFoundError, ImportError, AttributeError, ValueError) as e: + if error: + raise ValueError( + f"Could not deserialize the given classpath `module={module}` into " + "a valid python class! Make sure you have all necessary pip " + "packages installed and all custom modules are in your " + "`PYTHONPATH` env variable." 
+ ) from e + else: + raise ValueError(f"`module` ({module} must be type or string (classpath)!") + + return module diff --git a/src/mock/ray/core_worker/task_manager.h b/src/mock/ray/core_worker/task_manager.h index 7795744b37ee8..2b166fdf4a37e 100644 --- a/src/mock/ray/core_worker/task_manager.h +++ b/src/mock/ray/core_worker/task_manager.h @@ -54,7 +54,7 @@ class MockTaskFinisherInterface : public TaskFinisherInterface { (const, override)); MOCK_METHOD(bool, RetryTaskIfPossible, - (const TaskID &task_id, bool task_failed_due_to_oom), + (const TaskID &task_id, const rpc::RayErrorInfo &error_info), (override)); MOCK_METHOD(void, MarkDependenciesResolved, (const TaskID &task_id), (override)); MOCK_METHOD(void, diff --git a/src/ray/core_worker/task_event_buffer.cc b/src/ray/core_worker/task_event_buffer.cc index 15731ad46628a..9c2bb74b01912 100644 --- a/src/ray/core_worker/task_event_buffer.cc +++ b/src/ray/core_worker/task_event_buffer.cc @@ -31,14 +31,12 @@ TaskStatusEvent::TaskStatusEvent( const rpc::TaskStatus &task_status, int64_t timestamp, const std::shared_ptr &task_spec, - absl::optional node_id, - absl::optional worker_id) + absl::optional state_update) : TaskEvent(task_id, job_id, attempt_number), task_status_(task_status), timestamp_(timestamp), task_spec_(task_spec), - node_id_(node_id), - worker_id_(worker_id) {} + state_update_(state_update) {} TaskProfileEvent::TaskProfileEvent(TaskID task_id, JobID job_id, @@ -67,22 +65,30 @@ void TaskStatusEvent::ToRpcTaskEvents(rpc::TaskEvents *rpc_task_events) { } // Task status update. 
- auto state_updates = rpc_task_events->mutable_state_updates(); + auto dst_state_update = rpc_task_events->mutable_state_updates(); + gcs::FillTaskStatusUpdateTime(task_status_, timestamp_, dst_state_update); - if (node_id_.has_value()) { + if (!state_update_.has_value()) { + return; + } + + if (state_update_->node_id_.has_value()) { RAY_CHECK(task_status_ == rpc::TaskStatus::SUBMITTED_TO_WORKER) << "Node ID should be included when task status changes to " "SUBMITTED_TO_WORKER."; - state_updates->set_node_id(node_id_->Binary()); + dst_state_update->set_node_id(state_update_->node_id_->Binary()); } - if (worker_id_.has_value()) { + if (state_update_->worker_id_.has_value()) { RAY_CHECK(task_status_ == rpc::TaskStatus::SUBMITTED_TO_WORKER) << "Worker ID should be included when task status changes to " "SUBMITTED_TO_WORKER."; - state_updates->set_worker_id(worker_id_->Binary()); + dst_state_update->set_worker_id(state_update_->worker_id_->Binary()); + } + + if (state_update_->error_info_.has_value()) { + dst_state_update->set_error_type(state_update_->error_info_->error_type()); } - gcs::FillTaskStatusUpdateTime(task_status_, timestamp_, state_updates); } void TaskProfileEvent::ToRpcTaskEvents(rpc::TaskEvents *rpc_task_events) { diff --git a/src/ray/core_worker/task_event_buffer.h b/src/ray/core_worker/task_event_buffer.h index 2ee359846ffc3..c00aa8d145149 100644 --- a/src/ray/core_worker/task_event_buffer.h +++ b/src/ray/core_worker/task_event_buffer.h @@ -69,6 +69,25 @@ class TaskEvent { /// TaskStatusEvent is generated when a task changes its status. 
class TaskStatusEvent : public TaskEvent { public: + /// A class that contain data that will be converted to rpc::TaskStateUpdate + struct TaskStateUpdate { + TaskStateUpdate(const absl::optional &error_info) + : error_info_(error_info) {} + + TaskStateUpdate(const NodeID &node_id, const WorkerID &worker_id) + : node_id_(node_id), worker_id_(worker_id) {} + + private: + friend class TaskStatusEvent; + + /// Node id if it's a SUBMITTED_TO_WORKER status change. + const absl::optional node_id_ = absl::nullopt; + /// Worker id if it's a SUBMITTED_TO_WORKER status change. + const absl::optional worker_id_ = absl::nullopt; + /// Task error info. + const absl::optional error_info_ = absl::nullopt; + }; + explicit TaskStatusEvent( TaskID task_id, JobID job_id, @@ -76,8 +95,7 @@ class TaskStatusEvent : public TaskEvent { const rpc::TaskStatus &task_status, int64_t timestamp, const std::shared_ptr &task_spec = nullptr, - absl::optional node_id = absl::nullopt, - absl::optional worker_id = absl::nullopt); + absl::optional state_update = absl::nullopt); void ToRpcTaskEvents(rpc::TaskEvents *rpc_task_events) override; @@ -90,10 +108,8 @@ class TaskStatusEvent : public TaskEvent { const int64_t timestamp_ = -1; /// Pointer to the task spec. const std::shared_ptr task_spec_ = nullptr; - /// Node id if it's a SUBMITTED_TO_WORKER status change. - const absl::optional node_id_ = absl::nullopt; - /// Worker id if it's a SUBMITTED_TO_WORKER status change. - const absl::optional worker_id_ = absl::nullopt; + /// Pointer to the task state update + absl::optional state_update_ = absl::nullopt; }; /// TaskProfileEvent is generated when `RAY_enable_timeline` is on. 
diff --git a/src/ray/core_worker/task_manager.cc b/src/ray/core_worker/task_manager.cc index f7126fcff0fe7..794dbc3a64e90 100644 --- a/src/ray/core_worker/task_manager.cc +++ b/src/ray/core_worker/task_manager.cc @@ -381,7 +381,9 @@ void TaskManager::CompletePendingTask(const TaskID &task_id, it->second.num_successful_executions++; if (is_application_error) { - SetTaskStatus(it->second, rpc::TaskStatus::FAILED); + SetTaskStatus(it->second, + rpc::TaskStatus::FAILED, + gcs::GetRayErrorInfo(rpc::ErrorType::TASK_EXECUTION_EXCEPTION)); } else { SetTaskStatus(it->second, rpc::TaskStatus::FINISHED); } @@ -420,11 +422,12 @@ void TaskManager::CompletePendingTask(const TaskID &task_id, } bool TaskManager::RetryTaskIfPossible(const TaskID &task_id, - bool task_failed_due_to_oom) { + const rpc::RayErrorInfo &error_info) { TaskSpecification spec; bool will_retry = false; int32_t num_retries_left = 0; int32_t num_oom_retries_left = 0; + bool task_failed_due_to_oom = error_info.error_type() == rpc::ErrorType::OUT_OF_MEMORY; { absl::MutexLock lock(&mu_); auto it = submissible_tasks_.find(task_id); @@ -455,7 +458,7 @@ bool TaskManager::RetryTaskIfPossible(const TaskID &task_id, } } if (will_retry) { - MarkTaskRetryOnFailed(it->second); + MarkTaskRetryOnFailed(it->second, error_info); } } @@ -493,7 +496,9 @@ void TaskManager::FailPendingTask(const TaskID &task_id, // Note that this might be the __ray_terminate__ task, so we don't log // loudly with ERROR here. RAY_LOG(DEBUG) << "Task " << task_id << " failed with error " - << rpc::ErrorType_Name(error_type); + << rpc::ErrorType_Name(error_type) << ", ray_error_info: " + << ((ray_error_info == nullptr) ? "nullptr" + : ray_error_info->DebugString()); TaskSpecification spec; // Check whether the error should be stored in plasma or not. 
@@ -506,7 +511,10 @@ void TaskManager::FailPendingTask(const TaskID &task_id, RAY_CHECK(it->second.IsPending()) << "Tried to fail task that was not pending " << task_id; spec = it->second.spec; - SetTaskStatus(it->second, rpc::TaskStatus::FAILED); + SetTaskStatus( + it->second, + rpc::TaskStatus::FAILED, + ray_error_info == nullptr ? gcs::GetRayErrorInfo(error_type) : *ray_error_info); submissible_tasks_.erase(it); num_pending_tasks_--; @@ -556,7 +564,8 @@ bool TaskManager::FailOrRetryPendingTask(const TaskID &task_id, bool will_retry = false; if (!fail_immediately) { will_retry = RetryTaskIfPossible( - task_id, /*task_failed_due_to_oom*/ error_type == rpc::ErrorType::OUT_OF_MEMORY); + task_id, + ray_error_info == nullptr ? gcs::GetRayErrorInfo(error_type) : *ray_error_info); } if (!will_retry && mark_task_object_failed) { @@ -801,8 +810,7 @@ void TaskManager::MarkTaskWaitingForExecution(const TaskID &task_id, it->second.spec, rpc::TaskStatus::SUBMITTED_TO_WORKER, /* include_task_info */ false, - node_id, - worker_id); + worker::TaskStatusEvent::TaskStateUpdate(node_id, worker_id)); } void TaskManager::MarkTaskRetryOnResubmit(TaskEntry &task_entry) { @@ -822,10 +830,14 @@ void TaskManager::MarkTaskRetryOnResubmit(TaskEntry &task_entry) { /* include_task_info */ true); } -void TaskManager::MarkTaskRetryOnFailed(TaskEntry &task_entry) { +void TaskManager::MarkTaskRetryOnFailed(TaskEntry &task_entry, + const rpc::RayErrorInfo &error_info) { // Record the old attempt status as FAILED. - RecordTaskStatusEvent( - task_entry.spec.AttemptNumber(), task_entry.spec, rpc::TaskStatus::FAILED); + RecordTaskStatusEvent(task_entry.spec.AttemptNumber(), + task_entry.spec, + rpc::TaskStatus::FAILED, + /* include_task_info */ false, + worker::TaskStatusEvent::TaskStateUpdate(error_info)); task_entry.MarkRetryOnFailed(); // Mark the new status and also include task spec info for the new attempt. 
@@ -836,9 +848,16 @@ void TaskManager::MarkTaskRetryOnFailed(TaskEntry &task_entry) { /* include_task_info */ true); } -void TaskManager::SetTaskStatus(TaskEntry &task_entry, rpc::TaskStatus status) { +void TaskManager::SetTaskStatus( + TaskEntry &task_entry, + rpc::TaskStatus status, + const absl::optional &error_info) { task_entry.SetStatus(status); - RecordTaskStatusEvent(task_entry.spec.AttemptNumber(), task_entry.spec, status); + RecordTaskStatusEvent(task_entry.spec.AttemptNumber(), + task_entry.spec, + status, + /* include_task_info */ false, + worker::TaskStatusEvent::TaskStateUpdate(error_info)); } void TaskManager::FillTaskInfo(rpc::GetCoreWorkerStatsReply *reply, @@ -892,12 +911,12 @@ void TaskManager::RecordMetrics() { task_counter_.FlushOnChangeCallbacks(); } -void TaskManager::RecordTaskStatusEvent(int32_t attempt_number, - const TaskSpecification &spec, - rpc::TaskStatus status, - bool include_task_info, - absl::optional node_id, - absl::optional worker_id) { +void TaskManager::RecordTaskStatusEvent( + int32_t attempt_number, + const TaskSpecification &spec, + rpc::TaskStatus status, + bool include_task_info, + absl::optional state_update) { if (!task_event_buffer_.Enabled()) { return; } @@ -908,8 +927,7 @@ void TaskManager::RecordTaskStatusEvent(int32_t attempt_number, status, /* timestamp */ absl::GetCurrentTimeNanos(), include_task_info ? 
std::make_shared(spec) : nullptr, - node_id, - worker_id); + std::move(state_update)); task_event_buffer_.AddTaskEvent(std::move(task_event)); } diff --git a/src/ray/core_worker/task_manager.h b/src/ray/core_worker/task_manager.h index 6b17729700e93..dca839dc8a50a 100644 --- a/src/ray/core_worker/task_manager.h +++ b/src/ray/core_worker/task_manager.h @@ -38,7 +38,7 @@ class TaskFinisherInterface { bool is_application_error) = 0; virtual bool RetryTaskIfPossible(const TaskID &task_id, - bool task_failed_due_to_oom) = 0; + const rpc::RayErrorInfo &error_info) = 0; virtual void FailPendingTask(const TaskID &task_id, rpc::ErrorType error_type, @@ -170,10 +170,9 @@ class TaskManager : public TaskFinisherInterface, public TaskResubmissionInterfa /// Returns true if task can be retried. /// /// \param[in] task_id ID of the task to be retried. - /// \param[in] task_failed_due_to_oom last task attempt failed due to node running out - /// of memory. /// \return true if task is scheduled to be retried. - bool RetryTaskIfPossible(const TaskID &task_id, bool task_failed_due_to_oom) override; + bool RetryTaskIfPossible(const TaskID &task_id, + const rpc::RayErrorInfo &error_info) override; /// A pending task failed. This will either retry the task or mark the task /// as failed if there are no retries left. @@ -317,16 +316,13 @@ class TaskManager : public TaskFinisherInterface, public TaskResubmissionInterfa /// \param spec corresponding TaskSpecification of the task /// \param status the changed status. /// \param include_task_info True if TaskInfoEntry will be added to the Task events. - /// \param node_id Node ID of the worker for which the task's submitted. Only applicable - /// for SUBMITTED_TO_WORKER status change. - /// \param worker_id Worker ID of the worker for which the task's submitted. Only - /// applicable for SUBMITTED_TO_WORKER status change. 
- void RecordTaskStatusEvent(int32_t attempt_number, - const TaskSpecification &spec, - rpc::TaskStatus status, - bool include_task_info = false, - absl::optional node_id = absl::nullopt, - absl::optional worker_id = absl::nullopt); + void RecordTaskStatusEvent( + int32_t attempt_number, + const TaskSpecification &spec, + rpc::TaskStatus status, + bool include_task_info = false, + absl::optional state_update = + absl::nullopt); private: struct TaskEntry { @@ -487,7 +483,11 @@ class TaskManager : public TaskFinisherInterface, public TaskResubmissionInterfa /// /// \param task_entry corresponding TaskEntry of a task to record the event. /// \param status new status. - void SetTaskStatus(TaskEntry &task_entry, rpc::TaskStatus status); + /// \param error_info Optional error info for task execution. + void SetTaskStatus( + TaskEntry &task_entry, + rpc::TaskStatus status, + const absl::optional &error_info = absl::nullopt); /// Update the task entry for the task attempt to reflect retry on resubmit. /// @@ -503,7 +503,7 @@ class TaskManager : public TaskFinisherInterface, public TaskResubmissionInterfa /// the retry counter. /// /// \param task_entry Task entry for the corresponding task attempt - void MarkTaskRetryOnFailed(TaskEntry &task_entry); + void MarkTaskRetryOnFailed(TaskEntry &task_entry, const rpc::RayErrorInfo &error_info); /// Used to store task results. 
std::shared_ptr in_memory_store_; diff --git a/src/ray/core_worker/test/dependency_resolver_test.cc b/src/ray/core_worker/test/dependency_resolver_test.cc index 241cf0e5e31dc..4d2406e006ec2 100644 --- a/src/ray/core_worker/test/dependency_resolver_test.cc +++ b/src/ray/core_worker/test/dependency_resolver_test.cc @@ -69,7 +69,8 @@ class MockTaskFinisher : public TaskFinisherInterface { num_tasks_complete++; } - bool RetryTaskIfPossible(const TaskID &task_id, bool task_failed_due_to_oom) override { + bool RetryTaskIfPossible(const TaskID &task_id, + const rpc::RayErrorInfo &error_info) override { num_task_retries_attempted++; return false; } diff --git a/src/ray/core_worker/test/direct_task_transport_test.cc b/src/ray/core_worker/test/direct_task_transport_test.cc index 401481fd58397..61eb4370c3f45 100644 --- a/src/ray/core_worker/test/direct_task_transport_test.cc +++ b/src/ray/core_worker/test/direct_task_transport_test.cc @@ -125,7 +125,8 @@ class MockTaskFinisher : public TaskFinisherInterface { num_tasks_complete++; } - bool RetryTaskIfPossible(const TaskID &task_id, bool task_failed_due_to_oom) override { + bool RetryTaskIfPossible(const TaskID &task_id, + const rpc::RayErrorInfo &error_info) override { num_task_retries_attempted++; return false; } diff --git a/src/ray/core_worker/transport/direct_actor_task_submitter.cc b/src/ray/core_worker/transport/direct_actor_task_submitter.cc index b380365ee6179..cab04a6cebe51 100644 --- a/src/ray/core_worker/transport/direct_actor_task_submitter.cc +++ b/src/ray/core_worker/transport/direct_actor_task_submitter.cc @@ -140,8 +140,8 @@ Status CoreWorkerDirectActorTaskSubmitter::SubmitTask(TaskSpecification task_spe absl::MutexLock lock(&mu_); const auto queue_it = client_queues_.find(task_spec.ActorId()); const auto &death_cause = queue_it->second.death_cause; - error_type = GenErrorTypeFromDeathCause(death_cause); error_info = GetErrorInfoFromActorDeathCause(death_cause); + error_type = error_info.error_type(); } auto 
status = Status::IOError("cancelling task of dead actor"); // No need to increment the number of completed tasks since the actor is @@ -304,8 +304,8 @@ void CoreWorkerDirectActorTaskSubmitter::DisconnectActor( // Failing tasks has to be done without mu_ hold because the callback // might require holding mu_ which will lead to a deadlock. auto status = Status::IOError("cancelling all pending tasks of dead actor"); - rpc::ErrorType error_type = GenErrorTypeFromDeathCause(death_cause); const auto error_info = GetErrorInfoFromActorDeathCause(death_cause); + const auto error_type = error_info.error_type(); for (auto &task_id : task_ids_to_fail) { // No need to increment the number of completed tasks since the actor is @@ -528,7 +528,7 @@ void CoreWorkerDirectActorTaskSubmitter::HandlePushTaskReply( is_actor_dead = queue.state == rpc::ActorTableData::DEAD; const auto &death_cause = queue.death_cause; error_info = GetErrorInfoFromActorDeathCause(death_cause); - error_type = GenErrorTypeFromDeathCause(death_cause); + error_type = error_info.error_type(); fail_immediatedly = error_info.has_actor_died_error() && error_info.actor_died_error().has_oom_context() && error_info.actor_died_error().oom_context().fail_immediately(); diff --git a/src/ray/core_worker/transport/direct_task_transport.cc b/src/ray/core_worker/transport/direct_task_transport.cc index 7de9c58aa7f01..39538d5ddf458 100644 --- a/src/ray/core_worker/transport/direct_task_transport.cc +++ b/src/ray/core_worker/transport/direct_task_transport.cc @@ -524,7 +524,7 @@ void CoreWorkerDirectTaskSubmitter::RequestNewWorkerIfNeeded( } } } - + error_info.set_error_type(error_type); while (!tasks_to_fail.empty()) { auto &task_spec = tasks_to_fail.front(); if (task_spec.IsActorCreationTask() && @@ -641,7 +641,8 @@ void CoreWorkerDirectTaskSubmitter::PushNormalTask( !reply.is_retryable_error() || !task_finisher_->RetryTaskIfPossible( task_id, - /*task_failed_due_to_oom*/ false)) { + gcs::GetRayErrorInfo( + 
rpc::ErrorType::TASK_EXECUTION_EXCEPTION))) { task_finisher_->CompletePendingTask( task_id, reply, addr.ToProto(), reply.is_application_error()); } diff --git a/src/ray/gcs/pb_util.h b/src/ray/gcs/pb_util.h index 3e74ea8042d27..cef44e37979c5 100644 --- a/src/ray/gcs/pb_util.h +++ b/src/ray/gcs/pb_util.h @@ -138,22 +138,6 @@ inline const rpc::RayException *GetCreationTaskExceptionFromDeathCause( return &(death_cause->creation_task_failure_context()); } -/// Generate object error type from ActorDeathCause. -inline rpc::ErrorType GenErrorTypeFromDeathCause( - const rpc::ActorDeathCause &death_cause) { - if (death_cause.context_case() == ContextCase::kCreationTaskFailureContext) { - return rpc::ErrorType::ACTOR_DIED; - } else if (death_cause.context_case() == ContextCase::kRuntimeEnvFailedContext) { - return rpc::ErrorType::RUNTIME_ENV_SETUP_FAILED; - } else if (death_cause.context_case() == ContextCase::kActorUnschedulableContext) { - return rpc::ErrorType::ACTOR_UNSCHEDULABLE_ERROR; - } else if (death_cause.context_case() == ContextCase::kOomContext) { - return rpc::ErrorType::OUT_OF_MEMORY; - } else { - return rpc::ErrorType::ACTOR_DIED; - } -} - inline const std::string &GetActorDeathCauseString( const rpc::ActorDeathCause &death_cause) { static absl::flat_hash_map death_cause_string{ @@ -179,17 +163,22 @@ inline rpc::RayErrorInfo GetErrorInfoFromActorDeathCause( if (death_cause.context_case() == ContextCase::kActorDiedErrorContext || death_cause.context_case() == ContextCase::kCreationTaskFailureContext) { error_info.mutable_actor_died_error()->CopyFrom(death_cause); + error_info.set_error_type(rpc::ErrorType::ACTOR_DIED); } else if (death_cause.context_case() == ContextCase::kRuntimeEnvFailedContext) { error_info.mutable_runtime_env_setup_failed_error()->CopyFrom( death_cause.runtime_env_failed_context()); + error_info.set_error_type(rpc::ErrorType::RUNTIME_ENV_SETUP_FAILED); } else if (death_cause.context_case() == ContextCase::kActorUnschedulableContext) { 
*(error_info.mutable_error_message()) = death_cause.actor_unschedulable_context().error_message(); + error_info.set_error_type(rpc::ErrorType::ACTOR_UNSCHEDULABLE_ERROR); } else if (death_cause.context_case() == ContextCase::kOomContext) { error_info.mutable_actor_died_error()->CopyFrom(death_cause); *(error_info.mutable_error_message()) = death_cause.oom_context().error_message(); + error_info.set_error_type(rpc::ErrorType::OUT_OF_MEMORY); } else { RAY_CHECK(death_cause.context_case() == ContextCase::CONTEXT_NOT_SET); + error_info.set_error_type(rpc::ErrorType::ACTOR_DIED); } return error_info; } @@ -270,6 +259,15 @@ inline void FillTaskInfo(rpc::TaskInfoEntry *task_info, } } +/// Generate a RayErrorInfo from ErrorType +inline rpc::RayErrorInfo GetRayErrorInfo(const rpc::ErrorType &error_type, + const std::string &error_msg = "") { + rpc::RayErrorInfo error_info; + error_info.set_error_type(error_type); + error_info.set_error_message(error_msg); + return error_info; +} + /// Get the timestamp of the task status if available. /// /// \param task_event Task event. diff --git a/src/ray/protobuf/common.proto b/src/ray/protobuf/common.proto index 6660b9277ddeb..55d6718323554 100644 --- a/src/ray/protobuf/common.proto +++ b/src/ray/protobuf/common.proto @@ -214,7 +214,7 @@ message RayErrorInfo { string error_message = 4; } // The type of error that caused the exception. - optional ErrorType error_type = 11; + ErrorType error_type = 11; } message OutOfMemoryErrorContext { diff --git a/src/ray/protobuf/gcs.proto b/src/ray/protobuf/gcs.proto index bd470ddfb82de..3e4c4cce84f68 100644 --- a/src/ray/protobuf/gcs.proto +++ b/src/ray/protobuf/gcs.proto @@ -215,6 +215,8 @@ message TaskStateUpdate { optional int64 failed_ts = 7; // Worker that runs the task. optional bytes worker_id = 8; + // Task failure info. + optional ErrorType error_type = 9; } // Represents events and state changes from a single task run. 
diff --git a/src/ray/protobuf/usage.proto b/src/ray/protobuf/usage.proto index e30501aeb9971..e60553ea706dd 100644 --- a/src/ray/protobuf/usage.proto +++ b/src/ray/protobuf/usage.proto @@ -94,4 +94,9 @@ enum TagKey { NUM_ACTOR_TASKS = 306; NUM_NORMAL_TASKS = 307; NUM_DRIVERS = 308; + + // Data + // Logical operators, stored in JSON format with operator name and count. + // Example: {"MapBatches": 2, "Filter": 1} + DATA_LOGICAL_OPS = 400; } diff --git a/src/ray/raylet/worker_pool.cc b/src/ray/raylet/worker_pool.cc index aeadc4180cadb..093abb41841c6 100644 --- a/src/ray/raylet/worker_pool.cc +++ b/src/ray/raylet/worker_pool.cc @@ -1233,8 +1233,7 @@ void WorkerPool::PopWorker(const TaskSpecification &task_spec, const int runtime_env_hash = task_spec.GetRuntimeEnvHash(); for (auto it = idle_of_all_languages_.rbegin(); it != idle_of_all_languages_.rend(); it++) { - if (task_spec.GetLanguage() != it->first->GetLanguage() || - state.pending_disconnection_workers.count(it->first) > 0 || it->first->IsDead()) { + if (task_spec.GetLanguage() != it->first->GetLanguage() || it->first->IsDead()) { continue; } @@ -1421,8 +1420,6 @@ void WorkerPool::DisconnectWorker(const std::shared_ptr &worker return; } - RAY_UNUSED(RemoveWorker(state.pending_disconnection_workers, worker)); - for (auto it = idle_of_all_languages_.begin(); it != idle_of_all_languages_.end(); it++) { if (it->first == worker) { @@ -1432,23 +1429,6 @@ void WorkerPool::DisconnectWorker(const std::shared_ptr &worker } } RemoveWorker(state.idle, worker); - if (disconnect_type != rpc::WorkerExitType::INTENDED_USER_EXIT) { - // A Java worker process may have multiple workers. If one of them disconnects - // unintentionally (which means that the worker process has died), we remove the - // others from idle pool so that the failed actor will not be rescheduled on the same - // process. 
- auto pid = worker->GetProcess().GetId(); - for (auto worker2 : state.registered_workers) { - if (worker2->GetProcess().GetId() == pid) { - // NOTE(kfstorm): We have to use a new field to record these workers (instead of - // just removing them from idle sets) because they may haven't announced worker - // port yet. When they announce worker port, they'll be marked idle again. So - // removing them from idle sets here doesn't really prevent them from being popped - // later. - state.pending_disconnection_workers.insert(worker2); - } - } - } } void WorkerPool::DisconnectDriver(const std::shared_ptr &driver) { diff --git a/src/ray/raylet/worker_pool.h b/src/ray/raylet/worker_pool.h index d21ec5f758c41..6118a9f7d95a0 100644 --- a/src/ray/raylet/worker_pool.h +++ b/src/ray/raylet/worker_pool.h @@ -513,9 +513,6 @@ class WorkerPool : public WorkerPoolInterface, public IOWorkerPoolInterface { std::unordered_set> registered_workers; /// All drivers that have registered and are still connected. std::unordered_set> registered_drivers; - /// All workers that have registered but is about to disconnect. They shouldn't be - /// popped anymore. - std::unordered_set> pending_disconnection_workers; /// A map from the startup tokens of worker processes, assigned by the raylet, to /// the extra information of the process. Note that the shim process PID is the /// same with worker process PID, except worker process in container.