Commit

[train] Simplify ray.train.xgboost/lightgbm (6/n): Add core xgb/lgbm trainer release tests (#43693)

Adds a `LightGBMTrainer` release test that is the counterpart to the existing `air_benchmark_xgboost_cpu_10` release test. Previously, `LightGBMTrainer` had no coverage in any release tests. This PR also moves these release tests out of the `air_tests` directory into `train_tests`.

---------

Signed-off-by: Justin Yu <[email protected]>
justinvyu committed Mar 7, 2024
1 parent 504ad8a commit 3adbee4
Showing 13 changed files with 304 additions and 140 deletions.
6 changes: 3 additions & 3 deletions doc/source/cluster/doc_code/xgboost_submit.py
@@ -4,10 +4,10 @@

kick_off_xgboost_benchmark = (
# Clone ray. If ray is already present, don't clone again.
"git clone https://github.com/ray-project/ray || true;"
"git clone https://github.com/ray-project/ray || true; "
# Run the benchmark.
" python ray/release/air_tests/air_benchmarks/workloads/xgboost_benchmark.py"
" --size 100G --disable-check"
"python ray/release/train_tests/xgboost_lightgbm/train_batch_inference_benchmark.py"
" xgboost --size=100G --disable-check"
)
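The trailing space added after `true;` in this hunk matters because adjacent Python string literals concatenate with no separator. A minimal sketch of the failure mode (using a shortened, hypothetical script name `benchmark.py`):

```python
# Adjacent string literals in Python concatenate with no separator, so the
# trailing space after "|| true; " is what keeps the shell command well-formed.
broken = (
    "git clone https://github.com/ray-project/ray || true;"
    "python benchmark.py"
)
fixed = (
    "git clone https://github.com/ray-project/ray || true; "
    "python benchmark.py"
)
print(broken.endswith("true;python benchmark.py"))   # commands fused into one token
print(fixed.endswith("true; python benchmark.py"))   # commands properly separated
```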


24 changes: 14 additions & 10 deletions doc/source/train/benchmarks.rst
@@ -127,9 +127,10 @@ XGBoost training
----------------

This task uses the XGBoostTrainer module to train on different sizes of data
with different amounts of parallelism.
with different amounts of parallelism to show near-linear scaling from distributed
data parallelism.

XGBoost parameters were kept as defaults for xgboost==1.6.1 this task.
XGBoost parameters were kept as defaults for ``xgboost==1.7.6`` in this task.


- `XGBoost Training Script`_
@@ -138,17 +139,20 @@ XGBoost parameters were kept as defaults for xgboost==1.6.1 this task.
.. list-table::

* - **Cluster Setup**
- **Number of distributed training workers**
- **Data Size**
- **Performance**
- **Command**
* - 1 m5.4xlarge node (1 actor)
* - 1 m5.4xlarge node with 16 CPUs
- 1 training worker using 12 CPUs, leaving 4 CPUs for Ray Data tasks
- 10 GB (26M rows)
- 692 s
- `python xgboost_benchmark.py --size 10GB`
* - 10 m5.4xlarge nodes (10 actors)
- 310.22 s
- `python train_batch_inference_benchmark.py "xgboost" --size=10GB`
* - 10 m5.4xlarge nodes
- 10 training workers (one per node), using 10x12 CPUs, leaving 10x4 CPUs for Ray Data tasks
- 100 GB (260M rows)
- 693 s
- `python xgboost_benchmark.py --size 100GB`
- 326.86 s
- `python train_batch_inference_benchmark.py "xgboost" --size=100GB`
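As a sanity check on the updated table, the weak-scaling efficiency implied by the two rows can be computed directly (a sketch: 10x the data on 10x the workers would ideally keep wall-clock time constant):

```python
# Weak-scaling check using the timings from the benchmark table above:
# 10 GB on 1 worker vs. 100 GB on 10 workers is the same data volume per
# worker, so the runtime ratio approximates the scaling efficiency.
t_1_worker_10gb = 310.22     # seconds, from the table
t_10_workers_100gb = 326.86  # seconds, from the table

efficiency = t_1_worker_10gb / t_10_workers_100gb
print(f"{efficiency:.1%}")  # ~94.9%, i.e. near-linear scaling
```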

.. _`GPU image training script`: https://github.com/ray-project/ray/blob/cec82a1ced631525a4d115e4dc0c283fa4275a7f/release/air_tests/air_benchmarks/workloads/pytorch_training_e2e.py#L95-L106
.. _`GPU training small cluster configuration`: https://github.com/ray-project/ray/blob/master/release/air_tests/air_benchmarks/compute_gpu_1_aws.yaml#L6-L24
@@ -159,5 +163,5 @@ XGBoost parameters were kept as defaults for xgboost==1.6.1 this task.
.. _`Tensorflow comparison training script`: https://github.com/ray-project/ray/blob/master/release/air_tests/air_benchmarks/workloads/tensorflow_benchmark.py
.. _`Tensorflow comparison CPU cluster configuration`: https://github.com/ray-project/ray/blob/master/release/air_tests/air_benchmarks/compute_cpu_4_aws.yaml
.. _`Tensorflow comparison GPU cluster configuration`: https://github.com/ray-project/ray/blob/master/release/air_tests/air_benchmarks/compute_gpu_4x4_aws.yaml
.. _`XGBoost Training Script`: https://github.com/ray-project/ray/blob/a241e6a0f5a630d6ed5b84cce30c51963834d15b/release/air_tests/air_benchmarks/workloads/xgboost_benchmark.py#L40-L58
.. _`XGBoost Cluster Configuration`: https://github.com/ray-project/ray/blob/a241e6a0f5a630d6ed5b84cce30c51963834d15b/release/air_tests/air_benchmarks/xgboost_compute_tpl.yaml#L6-L24
.. _`XGBoost Training Script`: https://github.com/ray-project/ray/blob/9ac58f4efc83253fe63e280106f959fe317b1104/release/train_tests/xgboost_lightgbm/train_batch_inference_benchmark.py
.. _`XGBoost Cluster Configuration`: https://github.com/ray-project/ray/tree/9ac58f4efc83253fe63e280106f959fe317b1104/release/train_tests/xgboost_lightgbm
6 changes: 3 additions & 3 deletions release/BUILD.bazel
@@ -233,11 +233,11 @@ py_test(
####

py_test(
name = "air_benchmark_xgboost_smoke_test",
name = "xgboost_train_batch_inference_benchmark_smoke_test",
size = "small",
srcs = test_srcs,
args = ["--smoke-test"],
main = "air_tests/air_benchmarks/workloads/xgboost_benchmark.py",
args = ["xgboost", "--smoke-test"],
main = "train_tests/xgboost_lightgbm/train_batch_inference_benchmark.py",
tags = [
"exclusive",
"team:ml",
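The renamed Bazel smoke test now passes the framework as a positional argument (`args = ["xgboost", "--smoke-test"]`). A hypothetical sketch, not the actual release-test script, of the CLI shape this implies for `train_batch_inference_benchmark.py`:

```python
import argparse

# Hypothetical reconstruction of the argument shape exercised by the Bazel
# test above: a positional framework choice plus the flags that appear in
# release_tests.yaml and the docs ("--size", "--disable-check", "--smoke-test").
parser = argparse.ArgumentParser()
parser.add_argument("framework", choices=["xgboost", "lightgbm"])
parser.add_argument("--size", default="100G")
parser.add_argument("--disable-check", action="store_true")
parser.add_argument("--smoke-test", action="store_true")

args = parser.parse_args(["xgboost", "--smoke-test"])
print(args.framework, args.smoke_test)  # xgboost True
```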
176 changes: 140 additions & 36 deletions release/release_tests.yaml
@@ -351,41 +351,6 @@
alert: default


# AIR benchmarks for XGBoost CUJ
- name: air_benchmark_xgboost_cpu_10
group: AIR tests
working_dir: air_tests/air_benchmarks

frequency: nightly
team: ml
cluster:
byod:
type: gpu
cluster_compute: compute_xgboost_aws.yaml

run:
timeout: 36000
script: python workloads/xgboost_benchmark.py

wait_for_nodes:
num_nodes: 11

variations:
- __suffix__: aws
- __suffix__: gce
env: gce
frequency: manual
cluster:
cluster_compute: compute_xgboost_gce.yaml

smoke_test:
frequency: manual

run:
timeout: 1800

alert: default

# Ray AIR distributed Torch benchmarks
- name: air_benchmark_torch_mnist_cpu_4x1
group: AIR tests
@@ -2673,7 +2638,7 @@

- name: train_multinode_persistence
group: Train tests
working_dir: train_tests/e2e
working_dir: train_tests/multinode_persistence

frequency: nightly
team: ml
@@ -2721,6 +2686,145 @@
alert: default


- name: xgboost_train_batch_inference_benchmark_10G
group: Train tests
working_dir: train_tests/xgboost_lightgbm

frequency: nightly
team: ml
cluster:
byod:
type: gpu
cluster_compute: compute_aws_1worker.yaml

run:
timeout: 36000
script: python train_batch_inference_benchmark.py "xgboost" --size=10G

wait_for_nodes:
num_nodes: 2

variations:
- __suffix__: aws
- __suffix__: gce
env: gce
frequency: manual
cluster:
cluster_compute: compute_gce_1worker.yaml

smoke_test:
frequency: manual

run:
timeout: 1800

alert: default

- name: xgboost_train_batch_inference_benchmark_100G
group: Train tests
working_dir: train_tests/xgboost_lightgbm

frequency: nightly-3x
team: ml
cluster:
byod:
type: gpu
cluster_compute: compute_aws_10workers.yaml

run:
timeout: 36000
script: python train_batch_inference_benchmark.py "xgboost" --size=100G

wait_for_nodes:
num_nodes: 11

variations:
- __suffix__: aws
- __suffix__: gce
env: gce
frequency: manual
cluster:
cluster_compute: compute_gce_10workers.yaml

smoke_test:
frequency: manual

run:
timeout: 1800

alert: default


- name: lightgbm_train_batch_inference_benchmark_10G
group: Train tests
working_dir: train_tests/xgboost_lightgbm

frequency: nightly
team: ml
cluster:
byod:
type: gpu
cluster_compute: compute_aws_1worker.yaml

run:
timeout: 36000
script: python train_batch_inference_benchmark.py "lightgbm" --size=10G

wait_for_nodes:
num_nodes: 2

variations:
- __suffix__: aws
- __suffix__: gce
env: gce
frequency: manual
cluster:
cluster_compute: compute_gce_1worker.yaml

smoke_test:
frequency: manual

run:
timeout: 1800

alert: default


- name: lightgbm_train_batch_inference_benchmark_100G
group: Train tests
working_dir: train_tests/xgboost_lightgbm

frequency: nightly-3x
team: ml
cluster:
byod:
type: gpu
cluster_compute: compute_aws_10workers.yaml

run:
timeout: 36000
script: python train_batch_inference_benchmark.py "lightgbm" --size=100G

wait_for_nodes:
num_nodes: 11

variations:
- __suffix__: aws
- __suffix__: gce
env: gce
frequency: manual
cluster:
cluster_compute: compute_gce_10workers.yaml

smoke_test:
frequency: manual

run:
timeout: 1800

alert: default


########################
# RLlib tests
########################
File renamed without changes.
File renamed without changes.
File renamed without changes.
@@ -24,5 +24,5 @@ aws:
DeleteOnTermination: true
Iops: 5000
Throughput: 1000
VolumeSize: 1000
VolumeSize: 200
VolumeType: gp3
28 changes: 28 additions & 0 deletions release/train_tests/xgboost_lightgbm/compute_aws_1worker.yaml
@@ -0,0 +1,28 @@
cloud_id: {{env["ANYSCALE_CLOUD_ID"]}}
region: us-west-2

max_workers: 10

head_node_type:
name: head_node
instance_type: m5.2xlarge
resources:
cpu: 0


worker_node_types:
- name: worker_node
instance_type: m5.4xlarge
max_workers: 1
min_workers: 1
use_spot: false

aws:
BlockDeviceMappings:
- DeviceName: /dev/sda1
Ebs:
DeleteOnTermination: true
Iops: 5000
Throughput: 1000
VolumeSize: 200
VolumeType: gp3
@@ -25,4 +25,4 @@ gcp_advanced_configurations_json:
- boot: true
auto_delete: true
initialize_params:
disk_size_gb: 1000
disk_size_gb: 250
28 changes: 28 additions & 0 deletions release/train_tests/xgboost_lightgbm/compute_gce_1worker.yaml
@@ -0,0 +1,28 @@
cloud_id: {{env["ANYSCALE_CLOUD_ID"]}}
region: us-west1
allowed_azs:
- us-west1-b

max_workers: 10

head_node_type:
name: head_node
instance_type: n1-standard-8
resources:
cpu: 0


worker_node_types:
- name: worker_node
instance_type: n1-standard-16
max_workers: 1
min_workers: 1
use_spot: false

gcp_advanced_configurations_json:
instance_properties:
disks:
- boot: true
auto_delete: true
initialize_params:
disk_size_gb: 200