[train] Disallow trainer_resources() in ScalingConfig for GBDT trainers (ray-project#39059)

`ScalingConfig.trainer_resources` is currently silently dropped in GBDT trainers. Instead, we should raise an error when it is set.

Signed-off-by: Kai Fricke <[email protected]>
krfricke committed Sep 6, 2023
1 parent e31e34d commit c20103f
Showing 7 changed files with 34 additions and 7 deletions.
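
For context, the behavior change in plain terms: setting `trainer_resources` on a `ScalingConfig` passed to a GBDT trainer used to be silently ignored; after this commit it raises a `ValueError`. The sketch below mirrors the tests added in this commit and is not part of the diff; it assumes `ray[train]`, `xgboost_ray`, and `pytest` are installed, and it calls the private `_validate_scaling_config` classmethod directly, just as the new tests do. The same error should also surface when constructing an XGBoostTrainer or LightGBMTrainer with such a config.

# Sketch only -- not part of the commit diff below.
import pytest

from ray.air.config import ScalingConfig
from ray.train.xgboost import XGBoostTrainer

# Previously accepted and silently ignored; now rejected with a ValueError.
with pytest.raises(ValueError):
    XGBoostTrainer._validate_scaling_config(
        ScalingConfig(trainer_resources={"CPU": 0})
    )

# Still allowed: leave `trainer_resources` unset and size the workers instead,
# matching the updated doc_code example in the diff.
XGBoostTrainer._validate_scaling_config(
    ScalingConfig(num_workers=4, resources_per_worker={"CPU": 8})
)
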
3 changes: 0 additions & 3 deletions doc/source/train/distributed-xgboost-lightgbm.rst
@@ -113,9 +113,6 @@ Here are some examples for common use-cases:
 :start-after: __scaling_cpu_start__
 :end-before: __scaling_cpu_end__

-Note that we pass 0 CPUs for the trainer resources, so that all resources can
-be allocated to the actual distributed training workers.
-

 .. tab-item:: Single-node multi-GPU

1 change: 0 additions & 1 deletion doc/source/train/doc_code/gbdt_user_guide.py
@@ -155,7 +155,6 @@
 # __scaling_cpu_start__
 scaling_config = ScalingConfig(
     num_workers=4,
-    trainer_resources={"CPU": 0},
     resources_per_worker={"CPU": 8},
 )
 # __scaling_cpu_end__

2 changes: 1 addition & 1 deletion doc/source/train/user-guides/fault-tolerance.rst
@@ -99,7 +99,7 @@ If the experiment has been interrupted due to one of the reasons listed above, u


 Auto-resume
-+++++++++++
+~~~~~~~~~~~

 Adding the branching logic below will allow you to run the same script after the interrupt,
 picking up training from where you left on the previous run. Notice that we use the

2 changes: 1 addition & 1 deletion python/ray/air/config.py
@@ -96,7 +96,7 @@ class ScalingConfig:
     Args:
         trainer_resources: Resources to allocate for the trainer. If None is provided,
-            will default to 1 CPU.
+            will default to 1 CPU for most trainers.
         num_workers: The number of workers (Ray actors) to launch.
             Each worker will reserve 1 CPU by default. The number of CPUs
             reserved by each worker can be overridden with the

16 changes: 16 additions & 0 deletions python/ray/train/gbdt_trainer.py
@@ -203,6 +203,22 @@ def _validate_config_and_datasets(self) -> None:
                     f"which is not present in `datasets`."
                 )

+    @classmethod
+    def _validate_scaling_config(cls, scaling_config: ScalingConfig) -> ScalingConfig:
+        # Todo: `trainer_resources` should be configurable. Currently it is silently
+        # ignored. We catch the error here rather than in
+        # `_scaling_config_allowed_keys` because the default of `None` is updated to
+        # `{}` from XGBoost-Ray.
+        if scaling_config.trainer_resources not in [None, {}]:
+            raise ValueError(
+                f"The `trainer_resources` attribute for {cls.__name__} "
+                f"is currently ignored and defaults to `{{}}`. Remove the "
+                f"`trainer_resources` key from your `ScalingConfig` to resolve."
+            )
+        return super(GBDTTrainer, cls)._validate_scaling_config(
+            scaling_config=scaling_config
+        )
+
     def _get_dmatrices(
         self, dmatrix_params: Dict[str, Any]
     ) -> Dict[str, "xgboost_ray.RayDMatrix"]:

8 changes: 8 additions & 0 deletions python/ray/train/tests/test_lightgbm_trainer.py
@@ -198,6 +198,14 @@ def test_default_parameters_scaling_config():
     assert trainer._ray_params.cpus_per_actor == 4


+def test_lightgbm_trainer_resources():
+    """`trainer_resources` is not allowed in the scaling config"""
+    with pytest.raises(ValueError):
+        LightGBMTrainer._validate_scaling_config(
+            ScalingConfig(trainer_resources={"something": 1})
+        )
+
+
 if __name__ == "__main__":
     import pytest
     import sys

9 changes: 8 additions & 1 deletion python/ray/train/tests/test_xgboost_trainer.py
@@ -76,7 +76,6 @@ def test_fit_with_advanced_scaling_config(ray_start_4_cpus):
     valid_dataset = ray.data.from_pandas(test_df)
     trainer = ScalingConfigAssertingXGBoostTrainer(
         scaling_config=ScalingConfig(
-            trainer_resources={"CPU": 0},
             num_workers=2,
             placement_strategy="SPREAD",
             _max_cpu_fraction_per_node=0.9,

@@ -216,6 +215,14 @@ def _train(self, params, dtrain, **kwargs):
     trainer.fit()


+def test_xgboost_trainer_resources():
+    """`trainer_resources` is not allowed in the scaling config"""
+    with pytest.raises(ValueError):
+        XGBoostTrainer._validate_scaling_config(
+            ScalingConfig(trainer_resources={"something": 1})
+        )
+
+
 if __name__ == "__main__":
     import pytest
     import sys

