[core][experimental] Fix GPU microbenchmark (ray-project#45426)

Fix compute config for microbenchmark_gpu_unstable.
Closes ray-project#45322.

---------

Signed-off-by: Stephanie Wang <[email protected]>
c21 · May 21, 2024 · ab2b442 · ab2b442
1 parent a9fb64f
commit ab2b442
Show file tree

Hide file tree

Showing 3 changed files with 7 additions and 11 deletions.
diff --git a/release/microbenchmark/experimental/accelerated_dag_gpu_microbenchmark.py b/release/microbenchmark/experimental/accelerated_dag_gpu_microbenchmark.py
@@ -11,14 +11,11 @@
 from ray.air._internal import torch_utils
 import ray.cluster_utils
 from ray.dag import InputNode
-from ray.tests.conftest import * # noqa
 from ray.util.collective.collective_group import nccl_util
 
 from ray.experimental.channel.torch_tensor_type import TorchTensorType
 from ray._private.ray_microbenchmark_helpers import timeit
 
-# from ray.experimental.torch_serializer import TorchTensor
-
 
 logger = logging.getLogger(__name__)
 
@@ -160,7 +157,8 @@ def exec_ray_dag_ipc(label, sender, receiver, use_nccl=False):
  dag = sender.send.bind(SHAPE, DTYPE, inp)
  dag = receiver.recv.bind(
  dag,
- SHAPE[0] * DTYPE.itemsize,
+ # torch.float16 has item size of 2 bytes.
+ SHAPE[0] * 2,
  SHAPE,
  nccl_util.TORCH_NUMPY_DTYPE_MAP[DTYPE],
  )
@@ -347,7 +345,6 @@ def main():
  results += exec_ray_dag_gpu_cpu_gpu()
  results += exec_ray_dag_gpu_nccl(dynamic_shape=True)
  results += exec_ray_dag_gpu_nccl(dynamic_shape=False)
- results += exec_ray_dag_gpu_ipc_gpu()
 
 
 if __name__ == "__main__":

diff --git a/release/microbenchmark/experimental/compute_gpu_2_gce.yaml b/release/microbenchmark/experimental/compute_gpu_2_aws.yaml
@@ -1,12 +1,10 @@
 cloud_id: {{env["ANYSCALE_CLOUD_ID"]}}
-region: us-west1
-allowed_azs:
- - us-west1-b
+region: us-west-2
 
 max_workers: 0
 
 head_node_type:
  name: head_node
- instance_type: n1-standard-32-nvidia-tesla-t4-2
+ instance_type: g3.8xlarge
 
 worker_node_types: []
diff --git a/release/release_tests.yaml b/release/release_tests.yaml
@@ -3518,8 +3518,9 @@
  stable: false
 
  cluster:
- byod: {}
- cluster_compute: experimental/compute_gpu_2_gce.yaml
+ byod:
+ type: gpu
+ cluster_compute: experimental/compute_gpu_2_aws.yaml
 
  run:
  timeout: 1800