Skip to content

Commit

Permalink
[core][experimental] Fix GPU microbenchmark (ray-project#45426)
Browse files Browse the repository at this point in the history
Fix compute config for microbenchmark_gpu_unstable.

Closes ray-project#45322.

---------

Signed-off-by: Stephanie Wang <[email protected]>
  • Loading branch information
stephanie-wang committed May 21, 2024
1 parent a9fb64f commit ab2b442
Show file tree
Hide file tree
Showing 3 changed files with 7 additions and 11 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -11,14 +11,11 @@
from ray.air._internal import torch_utils
import ray.cluster_utils
from ray.dag import InputNode
from ray.tests.conftest import * # noqa
from ray.util.collective.collective_group import nccl_util

from ray.experimental.channel.torch_tensor_type import TorchTensorType
from ray._private.ray_microbenchmark_helpers import timeit

# from ray.experimental.torch_serializer import TorchTensor


logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -160,7 +157,8 @@ def exec_ray_dag_ipc(label, sender, receiver, use_nccl=False):
dag = sender.send.bind(SHAPE, DTYPE, inp)
dag = receiver.recv.bind(
dag,
- SHAPE[0] * DTYPE.itemsize,
+ # torch.float16 has item size of 2 bytes.
+ SHAPE[0] * 2,
SHAPE,
nccl_util.TORCH_NUMPY_DTYPE_MAP[DTYPE],
)
Expand Down Expand Up @@ -347,7 +345,6 @@ def main():
results += exec_ray_dag_gpu_cpu_gpu()
results += exec_ray_dag_gpu_nccl(dynamic_shape=True)
results += exec_ray_dag_gpu_nccl(dynamic_shape=False)
results += exec_ray_dag_gpu_ipc_gpu()


if __name__ == "__main__":
Expand Down
Original file line number Diff line number Diff line change
@@ -1,12 +1,10 @@
cloud_id: {{env["ANYSCALE_CLOUD_ID"]}}
- region: us-west1
- allowed_azs:
- - us-west1-b
+ region: us-west-2

max_workers: 0

head_node_type:
name: head_node
- instance_type: n1-standard-32-nvidia-tesla-t4-2
+ instance_type: g3.8xlarge

worker_node_types: []
5 changes: 3 additions & 2 deletions release/release_tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3518,8 +3518,9 @@
stable: false

cluster:
- byod: {}
- cluster_compute: experimental/compute_gpu_2_gce.yaml
+ byod:
+   type: gpu
+ cluster_compute: experimental/compute_gpu_2_aws.yaml

run:
timeout: 1800
Expand Down

0 comments on commit ab2b442

Please sign in to comment.