[train] Fix ScalingConfig(accelerator_type) to request a small fraction of the accelerator label (#44225)

Make Ray Train's accelerator type resource request match Ray Core's by setting it to a fractional value (0.001). This fixes autoscaling behavior so that the correct number of GPUs is requested.
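A minimal sketch of the resulting behavior (assuming Ray 2.x with ray[train] installed; the property below is the private one touched by this diff):

    from ray.train import ScalingConfig

    scaling_config = ScalingConfig(num_workers=2, use_gpu=True, accelerator_type="A100")
    # After this change, each worker requests a tiny fraction of the
    # accelerator label instead of a whole unit:
    #   {"GPU": 1, "accelerator_type:A100": 0.001}
    print(scaling_config._resources_per_worker_not_none)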

Signed-off-by: Justin Yu <[email protected]>
justinvyu committed Mar 22, 2024
1 parent 801cb86 commit 5923cb9
Showing 3 changed files with 18 additions and 16 deletions.
python/ray/air/config.py (1 addition, 1 deletion)
@@ -206,7 +206,7 @@ def _resources_per_worker_not_none(self):

        if self.accelerator_type:
            accelerator = f"{RESOURCE_CONSTRAINT_PREFIX}{self.accelerator_type}"
-           resources_per_worker.setdefault(accelerator, 1)
+           resources_per_worker.setdefault(accelerator, 0.001)
        return resources_per_worker

    @property
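For context, Ray Core treats the accelerator_type label as a placement constraint rather than a consumable resource, which is why a tiny fraction suffices. A minimal sketch of the Ray Core counterpart this change aligns with (assuming a cluster that has A100 nodes):

    import ray

    # accelerator_type on @ray.remote is likewise translated into a
    # fractional request on the node's "accelerator_type:A100" label,
    # steering placement without using up one unit of the label per task.
    @ray.remote(num_gpus=1, accelerator_type="A100")
    def train_step():
        return "scheduled on an A100 node"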
python/ray/air/tests/test_api.py (12 additions, 12 deletions)
@@ -149,14 +149,14 @@ def test_scaling_config_accelerator_type():
    }
    assert scaling_config._resources_per_worker_not_none == {
        "GPU": 1,
-       "accelerator_type:A100": 1,
+       "accelerator_type:A100": 0.001,
    }
    assert scaling_config.additional_resources_per_worker == {
-       "accelerator_type:A100": 1
+       "accelerator_type:A100": 0.001
    }
    assert scaling_config.as_placement_group_factory().bundles == [
-       {"GPU": 1, "accelerator_type:A100": 1, "CPU": 1},
-       {"GPU": 1, "accelerator_type:A100": 1},
+       {"GPU": 1, "accelerator_type:A100": 0.001, "CPU": 1},
+       {"GPU": 1, "accelerator_type:A100": 0.001},
    ]

    # With resources_per_worker
@@ -172,15 +172,15 @@
    assert scaling_config._resources_per_worker_not_none == {
        "GPU": 1,
        "custom_resource": 1,
-       "accelerator_type:A100": 1,
+       "accelerator_type:A100": 0.001,
    }
    assert scaling_config.additional_resources_per_worker == {
        "custom_resource": 1,
-       "accelerator_type:A100": 1,
+       "accelerator_type:A100": 0.001,
    }
    assert scaling_config.as_placement_group_factory().bundles == [
-       {"GPU": 1, "custom_resource": 1, "accelerator_type:A100": 1, "CPU": 1},
-       {"GPU": 1, "custom_resource": 1, "accelerator_type:A100": 1},
+       {"GPU": 1, "custom_resource": 1, "accelerator_type:A100": 0.001, "CPU": 1},
+       {"GPU": 1, "custom_resource": 1, "accelerator_type:A100": 0.001},
    ]

    # With trainer_resources
@@ -195,14 +195,14 @@
    }
    assert scaling_config._resources_per_worker_not_none == {
        "GPU": 1,
-       "accelerator_type:A100": 1,
+       "accelerator_type:A100": 0.001,
    }
    assert scaling_config.additional_resources_per_worker == {
-       "accelerator_type:A100": 1
+       "accelerator_type:A100": 0.001
    }
    assert scaling_config.as_placement_group_factory().bundles == [
-       {"GPU": 1, "accelerator_type:A100": 1, "memory": 10 * 1024**3},
-       {"GPU": 1, "accelerator_type:A100": 1},
+       {"GPU": 1, "accelerator_type:A100": 0.001, "memory": 10 * 1024**3},
+       {"GPU": 1, "accelerator_type:A100": 0.001},
    ]
python/ray/train/tests/test_data_parallel_trainer.py (5 additions, 3 deletions)
@@ -51,9 +51,11 @@ def ray_start_heterogenous_cluster():
    cluster.add_node(
        num_cpus=4,
        num_gpus=4,
-       resources={f"{RESOURCE_CONSTRAINT_PREFIX}{accelerator_type}": 4}
-       if accelerator_type
-       else {},
+       resources=(
+           {f"{RESOURCE_CONSTRAINT_PREFIX}{accelerator_type}": 1.0}
+           if accelerator_type
+           else {}
+       ),
    )

    ray.init(address=cluster.address)
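One reading of the fixture change (an inference, not stated in the commit message): the node now advertises the accelerator label at a nominal 1.0, and since each worker requests only 0.001 of it, all four GPU workers still fit on the node. A rough sketch under those assumptions, with "A100" standing in for the parametrized accelerator_type:

    from ray.train import ScalingConfig

    scaling_config = ScalingConfig(num_workers=4, use_gpu=True, accelerator_type="A100")
    bundles = scaling_config.as_placement_group_factory().bundles
    # The worker bundles request 4 GPUs in total but only 4 * 0.001 = 0.004
    # of "accelerator_type:A100", well under the node's advertised 1.0.
    assert sum(b.get("accelerator_type:A100", 0) for b in bundles) <= 1.0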
