[RLlib] Don't add a cpu to bundle for learner when using gpu #35529

Merged

Changes from 1 commit
Initial commit
Signed-off-by: Avnish <[email protected]>
avnishn committed May 18, 2023
commit bce003eb749388e8d90d3a33d632845597f152c8
77 changes: 77 additions & 0 deletions repro.py
@@ -0,0 +1,77 @@
import ray
from ray import air, tune
from ray.rllib.algorithms.ppo import PPOConfig
from ray.rllib.examples.env.dm_control_suite import cheetah_run


def run_with_tuner_1_rollout_worker(config):
    config = config.rollouts(num_rollout_workers=1)
    print("-" * 80)
    tuner = tune.Tuner(
        "PPO",
        param_space=config,
        run_config=air.RunConfig(
            stop={"timesteps_total": 128},
            failure_config=air.FailureConfig(max_failures=3),
            # storage_path="/mnt/shared_storage/avnishn/ppo_multi_gpu_benchmarking",
            # checkpoint_config=air.CheckpointConfig(checkpoint_frequency=1),
            # sync_config=tune.SyncConfig(syncer=None)
        ),
    )
    tuner.fit()


def run_with_tuner_0_rollout_worker(config):
    print("-" * 80)
    config = config.rollouts(num_rollout_workers=0)
    tuner = tune.Tuner(
        "PPO",
        param_space=config,
        run_config=air.RunConfig(
            stop={"timesteps_total": 128},
            failure_config=air.FailureConfig(max_failures=3),
            # storage_path="/mnt/shared_storage/avnishn/ppo_multi_gpu_benchmarking",
            # checkpoint_config=air.CheckpointConfig(checkpoint_frequency=1),
            # sync_config=tune.SyncConfig(syncer=None)
        ),
    )
    tuner.fit()


if __name__ == "__main__":

    # This experiment is run on a machine with an 8-CPU head node and two
    # worker nodes that each have 1 GPU and 4 CPUs. Note: I couldn't reproduce
    # this bug when my worker nodes were two 4-GPU, 32-CPU instances.

    ray.init()

    tune.registry.register_env(
        "HalfCheetahDmc", lambda c: cheetah_run(from_pixels=False)
    )

    config = (
        PPOConfig()
        .training(
            _enable_learner_api=True,
            model={
                "fcnet_hiddens": [256, 256, 256],
                "fcnet_activation": "relu",
                "vf_share_layers": True,
            },
            train_batch_size=128,
        )
        .rl_module(_enable_rl_module_api=True)
        .environment("HalfCheetahDmc")
        .resources(
            num_gpus_per_learner_worker=1,
            num_learner_workers=2,
        )
        .rollouts(num_rollout_workers=1)
        .reporting(min_time_s_per_iteration=0, min_sample_timesteps_per_iteration=10)
    )

    # run_with_tuner_0_rollout_worker(config)  # this works
    print("finished without tune")
    print("*" * 100)
    run_with_tuner_1_rollout_worker(config)  # this hangs
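For context on the hang above, here is a minimal, standalone sketch (not part of the diff) of the learner bundle shapes that default_resource_request builds for this config before and after this PR, assuming RLlib's default of 1 CPU per learner worker:

# Illustrative only: mirrors the bundle shapes, not the actual RLlib code path.
num_learner_workers = 2
num_cpus_per_learner_worker = 1  # RLlib default, left unset in the repro
num_gpus_per_learner_worker = 1

# Before this PR: each learner bundle asked for a CPU *and* a GPU together.
old_learner_bundles = [
    {"CPU": num_cpus_per_learner_worker, "GPU": num_gpus_per_learner_worker}
    for _ in range(num_learner_workers)
]

# After this PR: GPU-only bundles whenever GPUs are requested.
new_learner_bundles = [
    {"GPU": num_gpus_per_learner_worker} for _ in range(num_learner_workers)
]

print(old_learner_bundles)  # [{'CPU': 1, 'GPU': 1}, {'CPU': 1, 'GPU': 1}]
print(new_learner_bundles)  # [{'GPU': 1}, {'GPU': 1}]

On the cluster described in the script's comment (8-CPU head node, two 1-GPU/4-CPU worker nodes), the combined CPU+GPU bundles are what this PR drops to avoid the placement group fragmentation tracked in ray-project/ray#35409.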
23 changes: 16 additions & 7 deletions rllib/algorithms/algorithm.py
@@ -2272,13 +2272,22 @@ def default_resource_request(
        # resources for remote learner workers
        learner_bundles = []
        if cf._enable_learner_api and cf.num_learner_workers > 0:
            learner_bundles = [
                {
                    "CPU": cf.num_cpus_per_learner_worker,
                    "GPU": cf.num_gpus_per_learner_worker,
                }
                for _ in range(cf.num_learner_workers)
            ]
            # Can't specify CPUs for learner workers at the same time as GPUs.
            if cf.num_gpus_per_learner_worker:
                learner_bundles = [
                    {
                        "GPU": cf.num_gpus_per_learner_worker,
                    }
                    for _ in range(cf.num_learner_workers)
                ]
            elif cf.num_cpus_per_learner_worker:
                learner_bundles = [
                    {
                        "CPU": cf.num_cpus_per_learner_worker,
                    }
                    for _ in range(cf.num_learner_workers)
                ]

        bundles = [driver] + rollout_bundles + evaluation_bundles + learner_bundles

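Tune ultimately packs these bundles into a Ray placement group; a rough sketch with Ray's public placement-group API (simplified; the real request also includes driver, rollout, and evaluation bundles) shows why an unsatisfiable bundle surfaces as a hang rather than an error:

import ray
from ray.util.placement_group import placement_group

ray.init()

# Roughly what the repro now requests for the learners: two GPU-only bundles,
# plus a CPU bundle standing in for the trial driver.
pg = placement_group([{"CPU": 1}, {"GPU": 1}, {"GPU": 1}], strategy="PACK")

# ready() only resolves once every bundle can be placed. If the cluster can
# never satisfy the request, this blocks indefinitely, which is what the hang
# in repro.py looks like from the outside; the timeout here just bounds it.
ray.get(pg.ready(), timeout=60)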
21 changes: 19 additions & 2 deletions rllib/algorithms/algorithm_config.py
@@ -888,6 +888,18 @@ def validate(self) -> None:
"False)."
)

        if (
            self.num_cpus_per_learner_worker > 1
            and self.num_gpus_per_learner_worker > 0
        ):
            raise ValueError(
                "Cannot set both `num_cpus_per_learner_worker` and "
                "`num_gpus_per_learner_worker` > 0! Users must set one "
                "or the other due to issues with placement group "
                "fragmentation. See "
                "https://github.com/ray-project/ray/issues/35409 for more details."
            )

        if bool(os.environ.get("RLLIB_ENABLE_RL_MODULE", False)):
            # Enable RLModule API and connectors if env variable is set
            # (to be used in unittesting)
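As a quick usage sketch, the new check rejects configs like the following (values chosen for illustration):

from ray.rllib.algorithms.ppo import PPOConfig

config = PPOConfig().resources(
    num_learner_workers=2,
    num_cpus_per_learner_worker=2,  # > 1 ...
    num_gpus_per_learner_worker=1,  # ... combined with GPUs
)
config.validate()  # raises the ValueError added above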
@@ -1149,7 +1161,8 @@ def resources(
            num_gpus_per_learner_worker: Number of GPUs allocated per worker. If
                `num_learner_workers = 0`, any value greater than 0 will run the
                training on a single GPU on the head node, while a value of 0 will run
                the training on head node CPU cores. If `num_gpus_per_learner_worker`
                is set, then `num_cpus_per_learner_worker` cannot also be set.
            local_gpu_idx: If `num_gpus_per_worker` > 0, and `num_workers` < 2, then
                this GPU index will be used for training. This is an index into the
                available CUDA devices. For example, if
                os.environ["CUDA_VISIBLE_DEVICES"] = "1"
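Conversely, a hedged sketch of the two supported modes the docstring describes (GPU learners or CPU-only learners, but not an explicit mix):

from ray.rllib.algorithms.ppo import PPOConfig

# GPU learners: request GPUs and leave num_cpus_per_learner_worker at its default.
gpu_config = PPOConfig().resources(
    num_learner_workers=2,
    num_gpus_per_learner_worker=1,
)

# CPU-only learners: request CPUs and leave num_gpus_per_learner_worker at 0.
cpu_config = PPOConfig().resources(
    num_learner_workers=2,
    num_cpus_per_learner_worker=4,
)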
@@ -3227,7 +3240,11 @@ def get_learner_group_config(self, module_spec: ModuleSpec) -> LearnerGroupConfig
            )
            .resources(
                num_learner_workers=self.num_learner_workers,
                num_cpus_per_learner_worker=self.num_cpus_per_learner_worker,
                num_cpus_per_learner_worker=(
                    self.num_cpus_per_learner_worker
                    if not self.num_gpus_per_learner_worker
                    else 0
                ),
Comment on lines +3245 to +3249

Contributor

qq: This is not needed, right? Because self.num_cpus_per_learner_worker will be zero if self.num_gpus_per_learner_worker > 0. If this is not the case, we already raise errors.

Member Author (avnishn)

Yeah, I think I just added this in before I added the error. We don't technically need it; I'll remove it if I need to do anything to get the release tests to run.
                num_gpus_per_learner_worker=self.num_gpus_per_learner_worker,
                local_gpu_idx=self.local_gpu_idx,
            )
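For clarity, the conditional passed to num_cpus_per_learner_worker above behaves like this small helper (illustrative only): CPUs are forwarded to the learner group only when no GPUs are requested.

def learner_group_cpus(num_cpus_per_learner_worker, num_gpus_per_learner_worker):
    # Zero out the CPU request whenever GPUs are requested for the learners.
    return num_cpus_per_learner_worker if not num_gpus_per_learner_worker else 0

print(learner_group_cpus(1, 1))  # 0 -> GPU-only learner bundles
print(learner_group_cpus(4, 0))  # 4 -> CPU-only learner bundles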