Merge branch 'master' into ISSUE-23166
* master: (35 commits)
  [data] Refactor all to all op implementations into a separate file (ray-project#26585)
  [Datasets] Explicitly define Dataset-like APIs in DatasetPipeline class (ray-project#26394)
  [Serve][Part2] Migrate the tests to use deployment graph api (ray-project#26507)
  [Serve] Default to EveryNode when starting Serve from REST API (ray-project#26588)
  Revert "[KubeRay][Autoscaler][Core] Add a flag to disable ray status version check (ray-project#26584)" (ray-project#26597)
  [air] Add initial benchmark section (ray-project#26608)
  [Workflow] Remove workflow execution module (ray-project#26504)
  [air] Add xgboost release test for silver tier(10-node case). (ray-project#26460)
  Revert "Revert "[serve] Use soft constraint for pinning controller on head node (ray-project#25091)" (ray-project#25857)" (ray-project#25858)
  [RLlib] Fixes MARWIL release tests (ray-project#26586)
  [Datasets] Improve read_xxx experience of HTTP file (ray-project#26454)
  Cleanup ActorContext due to multi actor instances got removed. (ray-project#26497)
  Print newest_ckpt_path when resuming trial. (ray-project#26561)
  Fix test_serialization_error_message for pytest 6.x (ray-project#26591)
  [RLlib] Make DQN update_target use only trainable variables. (ray-project#25226)
  [RLlib] In env check, step only expected agents. (ray-project#26425)
  [RLlib] `restart_failed_sub_environments` now works for MA cases and crashes during `reset()`; +more tests and logging; add eval worker sub-env fault tolerance test. (ray-project#26276)
  [runtime env] plugin refactor[4/n]: remove runtime env protobuf (ray-project#26522)
  Improve streaming read performance for default configuration. (ray-project#26587)
  [Dashboard] Fix test dashboard flaky by catch an expected exception (ray-project#26555)
  ...
Your Name committed Jul 16, 2022
2 parents 72617dc + cf980c3 commit 9b56323
Showing 153 changed files with 3,755 additions and 2,908 deletions.
14 changes: 11 additions & 3 deletions .buildkite/pipeline.macos.yml
@@ -61,17 +61,25 @@ steps:
- if [ "$BUILDKITE_BRANCH" = "master" ]; then python .buildkite/copy_files.py --destination jars --path ./.jar/darwin; fi


- label: ":mac: :apple: Ray C++, Java and Libraries"
- label: ":mac: :apple: Ray Core, Dashboard and Serve"
<<: *common
conditions: ["RAY_CI_SERVE_AFFECTED", "RAY_CI_CORE_CPP_AFFECTED", "RAY_CI_CPP_AFFECTED", "RAY_CI_JAVA_AFFECTED", "RAY_CI_PYTHON_AFFECTED", "RAY_CI_DASHBOARD_AFFECTED"]
conditions: ["RAY_CI_SERVE_AFFECTED", "RAY_CI_CORE_CPP_AFFECTED", "RAY_CI_PYTHON_AFFECTED", "RAY_CI_DASHBOARD_AFFECTED"]
commands:
- export RAY_INSTALL_JAVA=1
- *prelude_commands
- TORCH_VERSION=1.6 ./ci/env/install-dependencies.sh
# Use --dynamic_mode=off until MacOS CI runs on Big Sur or newer. Otherwise there are problems with running tests
# with dynamic linking.
- bazel test --config=ci --dynamic_mode=off --test_env=CI $(./ci/run/bazel_export_options) --build_tests_only --test_tag_filters=-post_wheel_build --
//:all python/ray/serve/... python/ray/dashboard/... -rllib/... -core_worker_test
- *epilogue_commands


- label: ":mac: :apple: Ray C++ and Java"
<<: *common
conditions: ["RAY_CI_CPP_AFFECTED", "RAY_CI_JAVA_AFFECTED"]
commands:
- export RAY_INSTALL_JAVA=1
- *prelude_commands
# clang-format is needed by java/test.sh
- pip install clang-format==12.0.1
- ./java/test.sh
1 change: 1 addition & 0 deletions BUILD.bazel
@@ -818,6 +818,7 @@ cc_library(
"@com_google_absl//absl/container:btree",
"@com_google_absl//absl/container:flat_hash_map",
"@com_google_absl//absl/container:flat_hash_set",
"@nlohmann_json",
],
)

1 change: 1 addition & 0 deletions ci/ci.sh
@@ -170,6 +170,7 @@ test_python() {
python/ray/tests/...
-python/ray/serve:conda_env # pip field in runtime_env not supported
-python/ray/serve:test_cross_language # Ray java not built on Windows yet.
-python/ray/serve:test_gcs_failure # Fork not supported in windows
-python/ray/tests:test_actor_advanced # crashes in shutdown
-python/ray/tests:test_autoscaler # We don't support Autoscaler on Windows
-python/ray/tests:test_autoscaler_aws
4 changes: 2 additions & 2 deletions dashboard/modules/runtime_env/runtime_env_agent.py
@@ -303,10 +303,10 @@ def setup_plugins():
raise RuntimeError(f"runtime env plugin {name} not found.")
# TODO(architkulkarni): implement uri support
plugin.validate(runtime_env)
plugin.create("uri not implemented", json.loads(config), context)
plugin.create("uri not implemented", config, context)
plugin.modify_context(
"uri not implemented",
json.loads(config),
config,
context,
per_job_logger,
)
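The change above drops the json.loads calls around config. A minimal sketch of the rationale, assuming config now reaches setup_plugins() as an already-deserialized dict rather than a JSON string (an assumption made for illustration; the runtime env plugin refactor commit listed above is authoritative):

import json

# Illustration only: once `config` is already a dict, parsing it a second time fails.
config = {"setup_timeout_seconds": 600}  # placeholder value, not from the commit
try:
    json.loads(config)  # json.loads() accepts str/bytes/bytearray, not dict
except TypeError as err:
    print(f"double deserialization fails: {err}")

# Passing the dict straight through, as the patched calls do, avoids the re-parse.
plugin_config = config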
5 changes: 4 additions & 1 deletion dashboard/optional_utils.py
@@ -275,7 +275,10 @@ async def decorator(self, *args, **kwargs):
if connect_to_serve:
from ray import serve

serve.start(detached=True, http_options={"host": "0.0.0.0"})
serve.start(
detached=True,
http_options={"host": "0.0.0.0", "location": "EveryNode"},
)

return await f(self, *args, **kwargs)
except Exception as e:
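For reference, a short usage sketch of the same call. "EveryNode" starts an HTTP proxy on each node; "HeadOnly" and "NoServer" are assumed to be the other location values Serve accepts here (stated as an assumption, not verified against this commit):

from ray import serve

# Run Serve's HTTP proxy on every node and bind it to all interfaces.
serve.start(
    detached=True,
    http_options={"host": "0.0.0.0", "location": "EveryNode"},
)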
18 changes: 14 additions & 4 deletions dashboard/tests/test_dashboard.py
@@ -839,8 +839,13 @@ def test_dashboard_does_not_depend_on_serve():
)

# Check that Serve-dependent features fail
response = requests.get(f"http:https://{agent_url}/api/serve/deployments/")
assert response.status_code == 500
try:
response = requests.get(f"http:https://{agent_url}/api/serve/deployments/")
assert response.status_code == 500
except Exception as e:
# Failing to connect to the service is fine.
print(e)
assert True


@pytest.mark.skipif(
@@ -872,8 +877,13 @@ def test_agent_does_not_depend_on_serve(shutdown_only):
agent_url = node.node_ip_address + ":" + str(node.dashboard_agent_listen_port)

# Check that Serve-dependent features fail
response = requests.get(f"http:https://{agent_url}/api/serve/deployments/")
assert response.status_code == 500
try:
response = requests.get(f"http:https://{agent_url}/api/serve/deployments/")
assert response.status_code == 500
except Exception as e:
# Failing to connect to the service is fine.
print(e)
assert True

# The agent should be dead if raylet exits.
raylet_proc.kill()
1 change: 1 addition & 0 deletions doc/source/_toc.yml
@@ -23,6 +23,7 @@ parts:
- file: ray-air/examples/serving_guide
- file: ray-air/deployment
- file: ray-air/use-pretrained-model
- file: ray-air/benchmarks
- file: ray-air/examples/index
sections:
- file: ray-air/examples/torch_image_example
2 changes: 1 addition & 1 deletion doc/source/cluster/usage-stats.rst
@@ -52,7 +52,7 @@ Ray will decide whether usage stats collection should be enabled or not by consi

#. If neither is set and the console is interactive, then the user will be prompted to enable or disable the collection. If the console is non-interactive, usage stats collection will be enabled by default. The decision will be saved to ``~/.ray/config.json``, so the prompt is only shown once.

Note: usage stats collection is not enabled when using local dev clusters started via ``ray.init()``. This means that Ray will never collect data from third-party library users not using Ray directly.
Note: usage stats collection is not enabled when using local dev clusters started via ``ray.init()`` unless it's a nightly wheel. This means that Ray will never collect data from third-party library users not using Ray directly.

If usage stats collection is enabled, a background process on the head node will collect the usage stats
and report to ``https://usage-stats.ray.io/`` every hour. The reported usage stats will also be saved to
2 changes: 1 addition & 1 deletion doc/source/data/faq.rst
@@ -300,7 +300,7 @@ particular model under different shuffling policies:
* no shuffling,
* local (per-shard) limited-memory shuffle buffer,
* local (per-shard) shuffling,
* windowed (psuedo-global) shuffling, and
* windowed (pseudo-global) shuffling, and
* fully global shuffling.

From the perspective of keeping preprocessing time in check, as long as your data
81 changes: 81 additions & 0 deletions doc/source/ray-air/benchmarks.rst
@@ -0,0 +1,81 @@
AIR Benchmarks
==============

Below we document key performance benchmarks for common AIR tasks and workflows.

XGBoost Batch Prediction
------------------------

This task uses the BatchPredictor module to process different amounts of data
using an XGBoost model.

We test out the performance across different cluster sizes and data sizes.

- `XGBoost Prediction Script`_
- `XGBoost Cluster configuration`_
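For orientation, a minimal batch-prediction sketch (illustrative only: the import paths ``ray.train.batch_predictor`` / ``ray.train.xgboost`` and the placeholder paths are assumptions; the linked `XGBoost Prediction Script`_ is authoritative):

.. code-block:: python

    import ray
    from ray.air.checkpoint import Checkpoint
    from ray.train.batch_predictor import BatchPredictor
    from ray.train.xgboost import XGBoostPredictor

    # Placeholder inputs: a stored XGBoost checkpoint and a tabular dataset to score.
    checkpoint = Checkpoint.from_uri("s3://my-bucket/xgboost-checkpoint/")  # hypothetical path
    dataset = ray.data.read_parquet("s3://my-bucket/prediction-data/")      # hypothetical path

    predictor = BatchPredictor.from_checkpoint(checkpoint, XGBoostPredictor)
    predictions = predictor.predict(dataset)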

.. TODO: Add script for generating data and running the benchmark.
.. list-table::

* - **Cluster Setup**
- **# workers**
- **Data Size**
- **# of rows**
- **Time taken**
- **Throughput**
- **Command**
* - 1 m5.4xlarge
- 1 actor
- 10 GB
- 26M rows
- 275 s
- 94.5k rows/sec
- `python xgboost_benchmark.py --size 10GB`
* - 10 m5.4xlarge nodes
- 10 actors (12 CPUs each)
- 100 GB
- 260M rows
- 331 s
- 786k rows/sec
- `python xgboost_benchmark.py --size 100GB`


XGBoost training
----------------

This task uses the XGBoostTrainer module to train on different sizes of data
with different amounts of parallelism.

XGBoost parameters were kept at the defaults for xgboost==1.6.1 for this task.


- `XGBoost Training Script`_
- `XGBoost Cluster configuration`_
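Again for orientation, a minimal training sketch (illustrative only: the import paths and the ``ScalingConfig`` usage are assumptions about the AIR API at this commit; the linked `XGBoost Training Script`_ is authoritative):

.. code-block:: python

    import ray
    from ray.air.config import ScalingConfig
    from ray.train.xgboost import XGBoostTrainer

    # Placeholder dataset; the benchmark reads its own pre-generated data.
    train_ds = ray.data.read_parquet("s3://my-bucket/train-data/")  # hypothetical path

    trainer = XGBoostTrainer(
        scaling_config=ScalingConfig(num_workers=10),   # e.g. one training worker per node
        label_column="label",                           # placeholder column name
        params={"objective": "binary:logistic"},        # otherwise xgboost defaults, per above
        datasets={"train": train_ds},
    )
    result = trainer.fit()
    print(result.metrics)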

.. list-table::

* - **Cluster Setup**
- **# workers**
- **Data Size**
- **# of rows**
- **Time taken**
- **Command**
* - 1 m5.4xlarge
- 1 actor
- 10 GB
- 26M rows
- 692 s
- `python xgboost_benchmark.py --size 10GB`
* - 10 m5.4xlarge nodes
- 10 actors (12 CPUs each)
- 100 GB
- 260M rows
- 693 s
- `python xgboost_benchmark.py --size 100GB`



.. _`XGBoost Training Script`: https://github.com/ray-project/ray/blob/a241e6a0f5a630d6ed5b84cce30c51963834d15b/release/air_tests/air_benchmarks/workloads/xgboost_benchmark.py#L40-L58
.. _`XGBoost Prediction Script`: https://github.com/ray-project/ray/blob/a241e6a0f5a630d6ed5b84cce30c51963834d15b/release/air_tests/air_benchmarks/workloads/xgboost_benchmark.py#L63-L71
.. _`XGBoost Cluster configuration`: https://github.com/ray-project/ray/blob/a241e6a0f5a630d6ed5b84cce30c51963834d15b/release/air_tests/air_benchmarks/xgboost_compute_tpl.yaml#L6-L24
1 change: 0 additions & 1 deletion doc/source/ray-air/examples/BUILD
@@ -35,7 +35,6 @@ py_test_run_all_notebooks(
"feast_example.ipynb", # REGRESSION
"rl_offline_example.ipynb", # REGRESSION
"rl_online_example.ipynb", # REGRESSION
"convert_existing_pytorch_code_to_ray_air.ipynb", # REGRESSION
"tfx_tabular_train_to_serve.ipynb", # REGRESSION
],
data = ["//doc/source/ray-air/examples:air_examples"],