[train] Fix ScalingConfig(accelerator_type) to request a small fraction of the accelerator label (#44225)

Make Ray Train's accelerator type resource request match Ray Core's by setting it to a fractional value (0.001). This fixes autoscaling behavior so that the correct number of GPUs is requested.
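A minimal sketch of the resulting behavior (assuming Ray 2.x with ray[train] installed; the property below is the private one touched by this diff):

    from ray.train import ScalingConfig

    scaling_config = ScalingConfig(num_workers=2, use_gpu=True, accelerator_type="A100")
    # After this change, each worker requests a tiny fraction of the
    # accelerator label instead of a whole unit:
    #   {"GPU": 1, "accelerator_type:A100": 0.001}
    print(scaling_config._resources_per_worker_not_none)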

Signed-off-by: Justin Yu <[email protected]>
justinvyu committed Mar 22, 2024
1 parent 801cb86 commit 5923cb9
Showing 3 changed files with 18 additions and 16 deletions.
python/ray/air/config.py (1 addition, 1 deletion)
@@ -206,7 +206,7 @@ def _resources_per_worker_not_none(self):

        if self.accelerator_type:
            accelerator = f"{RESOURCE_CONSTRAINT_PREFIX}{self.accelerator_type}"
-           resources_per_worker.setdefault(accelerator, 1)
+           resources_per_worker.setdefault(accelerator, 0.001)
        return resources_per_worker

    @property
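For context, Ray Core treats the accelerator_type label as a placement constraint rather than a consumable resource, which is why a tiny fraction suffices. A minimal sketch of the Ray Core counterpart this change aligns with (assuming a cluster that has A100 nodes):

    import ray

    # accelerator_type on @ray.remote is likewise translated into a
    # fractional request on the node's "accelerator_type:A100" label,
    # steering placement without using up one unit of the label per task.
    @ray.remote(num_gpus=1, accelerator_type="A100")
    def train_step():
        return "scheduled on an A100 node"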
python/ray/air/tests/test_api.py (12 additions, 12 deletions)
@@ -149,14 +149,14 @@ def test_scaling_config_accelerator_type():
    }
    assert scaling_config._resources_per_worker_not_none == {
        "GPU": 1,
-       "accelerator_type:A100": 1,
+       "accelerator_type:A100": 0.001,
    }
    assert scaling_config.additional_resources_per_worker == {
-       "accelerator_type:A100": 1
+       "accelerator_type:A100": 0.001
    }
    assert scaling_config.as_placement_group_factory().bundles == [
-       {"GPU": 1, "accelerator_type:A100": 1, "CPU": 1},
-       {"GPU": 1, "accelerator_type:A100": 1},
+       {"GPU": 1, "accelerator_type:A100": 0.001, "CPU": 1},
+       {"GPU": 1, "accelerator_type:A100": 0.001},
    ]

    # With resources_per_worker
@@ -172,15 +172,15 @@
    assert scaling_config._resources_per_worker_not_none == {
        "GPU": 1,
        "custom_resource": 1,
-       "accelerator_type:A100": 1,
+       "accelerator_type:A100": 0.001,
    }
    assert scaling_config.additional_resources_per_worker == {
        "custom_resource": 1,
-       "accelerator_type:A100": 1,
+       "accelerator_type:A100": 0.001,
    }
    assert scaling_config.as_placement_group_factory().bundles == [
-       {"GPU": 1, "custom_resource": 1, "accelerator_type:A100": 1, "CPU": 1},
-       {"GPU": 1, "custom_resource": 1, "accelerator_type:A100": 1},
+       {"GPU": 1, "custom_resource": 1, "accelerator_type:A100": 0.001, "CPU": 1},
+       {"GPU": 1, "custom_resource": 1, "accelerator_type:A100": 0.001},
    ]

    # With trainer_resources
@@ -195,14 +195,14 @@
    }
    assert scaling_config._resources_per_worker_not_none == {
        "GPU": 1,
-       "accelerator_type:A100": 1,
+       "accelerator_type:A100": 0.001,
    }
    assert scaling_config.additional_resources_per_worker == {
-       "accelerator_type:A100": 1
+       "accelerator_type:A100": 0.001
    }
    assert scaling_config.as_placement_group_factory().bundles == [
-       {"GPU": 1, "accelerator_type:A100": 1, "memory": 10 * 1024**3},
-       {"GPU": 1, "accelerator_type:A100": 1},
+       {"GPU": 1, "accelerator_type:A100": 0.001, "memory": 10 * 1024**3},
+       {"GPU": 1, "accelerator_type:A100": 0.001},
    ]
python/ray/train/tests/test_data_parallel_trainer.py (5 additions, 3 deletions)
@@ -51,9 +51,11 @@ def ray_start_heterogenous_cluster():
    cluster.add_node(
        num_cpus=4,
        num_gpus=4,
-       resources={f"{RESOURCE_CONSTRAINT_PREFIX}{accelerator_type}": 4}
-       if accelerator_type
-       else {},
+       resources=(
+           {f"{RESOURCE_CONSTRAINT_PREFIX}{accelerator_type}": 1.0}
+           if accelerator_type
+           else {}
+       ),
    )

    ray.init(address=cluster.address)
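One reading of the fixture change (an inference, not stated in the commit message): the node now advertises the accelerator label at a nominal 1.0, and since each worker requests only 0.001 of it, all four GPU workers still fit on the node. A rough sketch under those assumptions, with "A100" standing in for the parametrized accelerator_type:

    from ray.train import ScalingConfig

    scaling_config = ScalingConfig(num_workers=4, use_gpu=True, accelerator_type="A100")
    bundles = scaling_config.as_placement_group_factory().bundles
    # The worker bundles request 4 GPUs in total but only 4 * 0.001 = 0.004
    # of "accelerator_type:A100", well under the node's advertised 1.0.
    assert sum(b.get("accelerator_type:A100", 0) for b in bundles) <= 1.0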
