[train] Disallow trainer_resources() in ScalingConfig for GBDT trainers (ray-project#39059)

`ScalingConfig.trainer_resources` is currently silently dropped in GBDT trainers. Instead, we should raise an error when it is set.

Signed-off-by: Kai Fricke <[email protected]>
krfricke committed Sep 6, 2023
1 parent e31e34d commit c20103f
Showing 7 changed files with 34 additions and 7 deletions.
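
For context, the behavior change in plain terms: setting `trainer_resources` on a `ScalingConfig` passed to a GBDT trainer used to be silently ignored; after this commit it raises a `ValueError`. The sketch below mirrors the tests added in this commit and is not part of the diff; it assumes `ray[train]`, `xgboost_ray`, and `pytest` are installed, and it calls the private `_validate_scaling_config` classmethod directly, just as the new tests do. The same error should also surface when constructing an XGBoostTrainer or LightGBMTrainer with such a config.

# Sketch only -- not part of the commit diff below.
import pytest

from ray.air.config import ScalingConfig
from ray.train.xgboost import XGBoostTrainer

# Previously accepted and silently ignored; now rejected with a ValueError.
with pytest.raises(ValueError):
    XGBoostTrainer._validate_scaling_config(
        ScalingConfig(trainer_resources={"CPU": 0})
    )

# Still allowed: leave `trainer_resources` unset and size the workers instead,
# matching the updated doc_code example in the diff.
XGBoostTrainer._validate_scaling_config(
    ScalingConfig(num_workers=4, resources_per_worker={"CPU": 8})
)
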
3 changes: 0 additions & 3 deletions doc/source/train/distributed-xgboost-lightgbm.rst
@@ -113,9 +113,6 @@ Here are some examples for common use-cases:
 :start-after: __scaling_cpu_start__
 :end-before: __scaling_cpu_end__

-Note that we pass 0 CPUs for the trainer resources, so that all resources can
-be allocated to the actual distributed training workers.
-

 .. tab-item:: Single-node multi-GPU

1 change: 0 additions & 1 deletion doc/source/train/doc_code/gbdt_user_guide.py
@@ -155,7 +155,6 @@
 # __scaling_cpu_start__
 scaling_config = ScalingConfig(
     num_workers=4,
-    trainer_resources={"CPU": 0},
     resources_per_worker={"CPU": 8},
 )
 # __scaling_cpu_end__

2 changes: 1 addition & 1 deletion doc/source/train/user-guides/fault-tolerance.rst
@@ -99,7 +99,7 @@ If the experiment has been interrupted due to one of the reasons listed above, u


 Auto-resume
-+++++++++++
+~~~~~~~~~~~

 Adding the branching logic below will allow you to run the same script after the interrupt,
 picking up training from where you left on the previous run. Notice that we use the

2 changes: 1 addition & 1 deletion python/ray/air/config.py
@@ -96,7 +96,7 @@ class ScalingConfig:
     Args:
         trainer_resources: Resources to allocate for the trainer. If None is provided,
-            will default to 1 CPU.
+            will default to 1 CPU for most trainers.
         num_workers: The number of workers (Ray actors) to launch.
             Each worker will reserve 1 CPU by default. The number of CPUs
             reserved by each worker can be overridden with the

16 changes: 16 additions & 0 deletions python/ray/train/gbdt_trainer.py
@@ -203,6 +203,22 @@ def _validate_config_and_datasets(self) -> None:
                     f"which is not present in `datasets`."
                 )

+    @classmethod
+    def _validate_scaling_config(cls, scaling_config: ScalingConfig) -> ScalingConfig:
+        # Todo: `trainer_resources` should be configurable. Currently it is silently
+        # ignored. We catch the error here rather than in
+        # `_scaling_config_allowed_keys` because the default of `None` is updated to
+        # `{}` from XGBoost-Ray.
+        if scaling_config.trainer_resources not in [None, {}]:
+            raise ValueError(
+                f"The `trainer_resources` attribute for {cls.__name__} "
+                f"is currently ignored and defaults to `{{}}`. Remove the "
+                f"`trainer_resources` key from your `ScalingConfig` to resolve."
+            )
+        return super(GBDTTrainer, cls)._validate_scaling_config(
+            scaling_config=scaling_config
+        )
+
     def _get_dmatrices(
         self, dmatrix_params: Dict[str, Any]
     ) -> Dict[str, "xgboost_ray.RayDMatrix"]:

8 changes: 8 additions & 0 deletions python/ray/train/tests/test_lightgbm_trainer.py
@@ -198,6 +198,14 @@ def test_default_parameters_scaling_config():
     assert trainer._ray_params.cpus_per_actor == 4


+def test_lightgbm_trainer_resources():
+    """`trainer_resources` is not allowed in the scaling config"""
+    with pytest.raises(ValueError):
+        LightGBMTrainer._validate_scaling_config(
+            ScalingConfig(trainer_resources={"something": 1})
+        )
+
+
 if __name__ == "__main__":
     import pytest
     import sys

9 changes: 8 additions & 1 deletion python/ray/train/tests/test_xgboost_trainer.py
@@ -76,7 +76,6 @@ def test_fit_with_advanced_scaling_config(ray_start_4_cpus):
     valid_dataset = ray.data.from_pandas(test_df)
     trainer = ScalingConfigAssertingXGBoostTrainer(
         scaling_config=ScalingConfig(
-            trainer_resources={"CPU": 0},
             num_workers=2,
             placement_strategy="SPREAD",
             _max_cpu_fraction_per_node=0.9,

@@ -216,6 +215,14 @@ def _train(self, params, dtrain, **kwargs):
     trainer.fit()


+def test_xgboost_trainer_resources():
+    """`trainer_resources` is not allowed in the scaling config"""
+    with pytest.raises(ValueError):
+        XGBoostTrainer._validate_scaling_config(
+            ScalingConfig(trainer_resources={"something": 1})
+        )
+
+
 if __name__ == "__main__":
     import pytest
     import sys

