[CI] [Cluster] Fix example GCP GPU/docker example cluster YAML file (#41134)

Fixes the broken GCP GPU Docker example YAML by updating the image and the CUDA version. (Thanks @PaulFenton for the fix!)

Adds a release test that checks that `ray up` works with this YAML file (it didn't work before).
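
A minimal sketch of the equivalent manual check, assuming gcloud credentials are configured and `project_id` is filled in; the `ray exec` smoke test is illustrative, not necessarily what the release test runs:

```shell
# Launch the example cluster non-interactively.
ray up python/ray/autoscaler/gcp/example-gpu-docker.yaml -y

# Illustrative smoke test: run a trivial Ray program on the head node.
ray exec python/ray/autoscaler/gcp/example-gpu-docker.yaml "python -c 'import ray; ray.init()'"

# Tear the cluster back down.
ray down python/ray/autoscaler/gcp/example-gpu-docker.yaml -y
```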

Switched the image used in the YAML to `ray` instead of `ray-ml`, because `ray-ml` gave out-of-disk-space errors and because the smaller image makes the test run faster. If it's important to make `ray-ml` the default, we can probably just increase `diskSizeGb` and test it again (sketched below).
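
For anyone who does need `ray-ml`, a hypothetical variant of the relevant sections (untested; the 100 GB disk size is an assumption, not a verified minimum, and only the fields shown would change):

```yaml
# Hypothetical tweak: keep the larger ray-ml image but give the boot disk
# more headroom, since the default 50 GB ran out of space.
docker:
    image: "rayproject/ray-ml:latest-gpu"  # ML dependencies included; slower to pull
    container_name: "ray_nvidia_docker"

available_node_types:
    ray_head_gpu:  # the same change would apply to the worker node type
        node_config:
            machineType: n1-standard-2
            disks:
              - boot: true
                autoDelete: true
                type: PERSISTENT
                initializeParams:
                    diskSizeGb: 100  # assumed headroom; the example's default is 50
```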

Related issue number
Closes #35266

---------

Signed-off-by: Archit Kulkarni <[email protected]>
architkulkarni committed Nov 21, 2023
1 parent 90b8b37 commit a1a9a48
Showing 2 changed files with 24 additions and 8 deletions.
16 changes: 8 additions & 8 deletions python/ray/autoscaler/gcp/example-gpu-docker.yaml
@@ -15,8 +15,8 @@ upscaling_speed: 1.0
# and opens all the necessary ports to support the Ray cluster.
# Empty string means disabled.
docker:
image: "rayproject/ray-ml:latest-gpu"
# image: rayproject/ray:latest-gpu # use this one if you don't need ML dependencies, it's faster to pull
image: "rayproject/ray:latest-gpu"
# image: rayproject/ray-ml:latest-gpu # use this one if you need ML dependencies, but it's slower to pull
container_name: "ray_nvidia_docker" # e.g. ray_docker

# # Example of running a GPU head with CPU workers
@@ -33,7 +33,7 @@ provider:
type: gcp
region: us-west1
availability_zone: us-west1-b
-project_id: null # Globally unique project id
+project_id: null # Replace this with your globally unique project id

# How Ray will authenticate with newly launched nodes.
auth:
@@ -56,18 +56,18 @@ available_node_types:
# For more documentation on available fields, see:
# https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert
node_config:
-machineType: custom-6-16384
+machineType: n1-standard-2
disks:
- boot: true
autoDelete: true
type: PERSISTENT
initializeParams:
diskSizeGb: 50
# See https://cloud.google.com/compute/docs/images for more images
-sourceImage: projects/deeplearning-platform-release/global/images/family/common-cu110
+sourceImage: projects/deeplearning-platform-release/global/images/family/common-cu113
# Make sure to set scheduling->onHostMaintenance to TERMINATE when GPUs are present
guestAccelerators:
-- acceleratorType: nvidia-tesla-k80
+- acceleratorType: nvidia-tesla-t4
acceleratorCount: 1
metadata:
items:
@@ -98,10 +98,10 @@ available_node_types:
initializeParams:
diskSizeGb: 50
# See https://cloud.google.com/compute/docs/images for more images
-sourceImage: projects/deeplearning-platform-release/global/images/family/common-cu110
+sourceImage: projects/deeplearning-platform-release/global/images/family/common-cu113
# Make sure to set scheduling->onHostMaintenance to TERMINATE when GPUs are present
guestAccelerators:
-- acceleratorType: nvidia-tesla-k80
+- acceleratorType: nvidia-tesla-t4
acceleratorCount: 1
metadata:
items:
16 changes: 16 additions & 0 deletions release/release_tests.yaml
@@ -6646,6 +6646,22 @@
timeout: 3600
script: python launch_and_verify_cluster.py gcp/example-full.yaml --num-expected-nodes 2 --retries 20 --docker-override commit

+- name: gcp_cluster_launcher_gpu_docker
+  group: cluster-launcher-test
+  working_dir: ../python/ray/autoscaler/
+
+  stable: true
+
+  env: gce
+  frequency: weekly
+  team: clusters
+  cluster:
+    byod: {}
+    cluster_compute: gcp/tests/single_node_32_cpu_gce.yaml
+
+  run:
+    timeout: 1200
+    script: python launch_and_verify_cluster.py gcp/example-gpu-docker.yaml

- name: autoscaler_aws
group: autoscaler-test
