Change update policy for MinCount, MaxCount, Queue and ComputeResource
Change update policy for
* MinCount and MaxCount, from COMPUTE_FLEET_STOP to RESIZE_UPDATE_STRATEGY_ON_REMOVE
* Queue and ComputeResource, from COMPUTE_FLEET_STOP_ON_REMOVE to RESIZE_UPDATE_STRATEGY_ON_REMOVE.

RESIZE_UPDATE_STRATEGY_ON_REMOVE is a new update policy that permits an update in any of the following cases:
* the compute fleet is stopped
* QueueUpdateStrategy is set to TERMINATE
* a new Queue is added
* a new ComputeResource is added
* MaxCount is increased
* MinCount is increased AND MaxCount is increased by at least the same amount (see the sketch below)
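
A minimal sketch of the MinCount/MaxCount rule from the last two bullets (illustrative only, not the ParallelCluster implementation; the helper name is hypothetical):

```python
def count_change_allowed(old_min, new_min, old_max, new_max):
    """Return True if the capacity change alone can be applied without stopping the fleet."""
    if new_min == old_min:
        # Only MaxCount changes: an increase never removes nodes, so it is always allowed
        return new_max > old_max
    if new_min > old_min:
        # MinCount grows: MaxCount must grow by at least the same amount, so the number
        # of dynamic nodes (MaxCount - MinCount) does not shrink and no node is removed
        return (new_max - old_max) >= (new_min - old_min)
    # Decreasing MinCount removes static nodes: it needs a stopped fleet or TERMINATE
    return False

count_change_allowed(old_min=5, new_min=7, old_max=10, new_max=12)  # True
count_change_allowed(old_min=5, new_min=7, old_max=10, new_max=11)  # False: MaxCount grew by less
```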

When setting QueueUpdateStrategy = TERMINATE, only the nodes at the back of the node list will be terminated.

Example:
* the cluster's initial capacity is `MinCount = 5` and `MaxCount = 10`; the nodes are `st-[1-5]; dy-[1-5]`
* when resizing the cluster to `MinCount = 3` and `MaxCount = 5`, the new cluster capacity will be composed of the nodes `st-[1-3]; dy-[1-2]`, which are not touched during the update
* the nodes `st-[4-5]; dy-[3-5]` will be terminated (see the sketch below)
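
A sketch of how this example resize plays out (illustrative only; the node naming and helper are hypothetical, not ParallelCluster code):

```python
def plan_resize(old_min, old_max, new_min, new_max):
    """Split the current node list into nodes kept and nodes terminated at the back."""
    static_nodes = [f"st-{i}" for i in range(1, old_min + 1)]
    dynamic_nodes = [f"dy-{i}" for i in range(1, old_max - old_min + 1)]
    kept = static_nodes[:new_min] + dynamic_nodes[:new_max - new_min]
    terminated = static_nodes[new_min:] + dynamic_nodes[new_max - new_min:]
    return kept, terminated

kept, terminated = plan_resize(old_min=5, old_max=10, new_min=3, new_max=5)
# kept       -> ['st-1', 'st-2', 'st-3', 'dy-1', 'dy-2']
# terminated -> ['st-4', 'st-5', 'dy-3', 'dy-4', 'dy-5']
```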

This is made possible by the adoption of Slurm 23.11, ref https://slurm.schedmd.com/news.html

Signed-off-by: Luca Carrogu <[email protected]>
lukeseawalker committed Jan 16, 2024
1 parent 7628936 commit 314ccd9
Showing 9 changed files with 693 additions and 92 deletions.
6 changes: 5 additions & 1 deletion CHANGELOG.md
@@ -5,6 +5,10 @@ CHANGELOG
------

**ENHANCEMENTS**
- Allow updating the `MinCount`, `MaxCount`, `Queue` and `ComputeResource` configuration parameters without stopping
the compute fleet. They can now be updated by setting `Scheduling/SlurmSettings/QueueUpdateStrategy`
to TERMINATE. ParallelCluster will terminate only the nodes removed during a resize of the cluster capacity
performed through a cluster update.
- Add support for installing Intel OneAPI Base Toolkit and HPC Toolkit, and Intel Python.
- Intel OneAPI Base Toolkits: 2023.2.0
- Intel OneAPI HPC Toolkits: 2023.2.0
@@ -19,7 +23,7 @@ CHANGELOG
- Add support for Python 3.11, 3.12 in pcluster CLI and aws-parallelcluster-batch-cli.
- Upgrade Python to version 3.12 and NodeJS to version 18 in ParallelCluster Lambda Layer.
- Build network interfaces using network card index from `NetworkCardIndex` list of EC2 DescribeInstances response,
instead of looping over `MaximumNetworkCards` range.
instead of looping over `MaximumNetworkCards` range.

3.8.0
------
75 changes: 74 additions & 1 deletion cli/src/pcluster/config/update_policy.py
@@ -12,7 +12,7 @@
from enum import Enum

from pcluster.config.cluster_config import QueueUpdateStrategy
from pcluster.constants import AWSBATCH, DEFAULT_MAX_COUNT, SLURM
from pcluster.constants import AWSBATCH, DEFAULT_MAX_COUNT, DEFAULT_MIN_COUNT, SLURM


class UpdatePolicy:
@@ -113,6 +113,15 @@ def actions_needed_queue_update_strategy(change, _):
return actions


def actions_needed_resize_update_strategy_on_remove(*_):
return (
"Stop the compute fleet with the pcluster update-compute-fleet command, "
"or set QueueUpdateStrategy to TERMINATE in the configuration used for the 'update-cluster' operation. "
"Be aware that this update will remove nodes from the scheduler and terminates the EC2 instances "
"associated. Jobs running on the removed nodes will terminate"
)


def actions_needed_managed_placement_group(change, patch):
if is_managed_placement_group_deletion(change, patch):
actions = "Stop the compute fleet with the pcluster update-compute-fleet command."
@@ -258,6 +267,10 @@ def fail_reason_queue_update_strategy(change, _):
return reason


def fail_reason_resize_update_strategy_on_remove(*_):
return "All compute nodes must be stopped or QueueUpdateStrategy must be set to TERMINATE"


def fail_reason_managed_placement_group(change, patch):
if is_managed_placement_group_deletion(change, patch):
reason = "All compute nodes must be stopped for a managed placement group deletion"
@@ -287,6 +300,16 @@ def is_queue_update_strategy_set(patch):
)


def is_resize_update_strategy_terminate(patch):
# Return true if the update strategy is set to TERMINATE
update_strategy = (
patch.target_config.get("Scheduling")
.get("SlurmSettings", {})
.get("QueueUpdateStrategy", QueueUpdateStrategy.COMPUTE_FLEET_STOP.value)
)
return update_strategy == QueueUpdateStrategy.TERMINATE.value


def condition_checker_queue_update_strategy(change, patch):
result = not patch.cluster.has_running_capacity()
# QueueUpdateStrategy can override UpdatePolicy of parameters under SlurmQueues
@@ -296,6 +319,46 @@ def condition_checker_queue_update_strategy(change, patch):
return result


def condition_checker_resize_update_strategy_on_remove(change, patch):
# Check if fleet is stopped
result = not patch.cluster.has_running_capacity()

# Check if the change is inside a Queue section
if not result and (is_slurm_queues_change(change) or change.key == "SlurmQueues"):
# Check if QueueUpdateStrategy is TERMINATE
result = is_resize_update_strategy_terminate(patch)

# Queue or ComputeResource can be added
if not result and change.is_list:
result = change.old_value is None and change.new_value is not None

# Check if MaxCount is increased
if not result and change.key == "MaxCount":
result = convert_value_to_int(change.new_value, DEFAULT_MAX_COUNT) >= convert_value_to_int(
change.old_value, DEFAULT_MAX_COUNT
)

# Check if MinCount is increased and MaxCount is increased of at least the same amount
if not result and change.key == "MinCount":
path = change.path
for other_change in patch.changes:
# Check the value of MaxCount next to MinCount.
# MinCount and MaxCount next to each other have the same path
if path == other_change.path and other_change.key == "MaxCount":
other_change_new_value = convert_value_to_int(other_change.new_value, DEFAULT_MAX_COUNT)
other_change_old_value = convert_value_to_int(other_change.old_value, DEFAULT_MAX_COUNT)
change_new_value = convert_value_to_int(change.new_value, DEFAULT_MIN_COUNT)
change_old_value = convert_value_to_int(change.old_value, DEFAULT_MIN_COUNT)
result = (other_change_new_value - other_change_old_value) >= (change_new_value - change_old_value)
break

return result


def convert_value_to_int(value, default):
return int(value) if value is not None else default


def condition_checker_queue_update_strategy_on_remove(change, patch):
result = not patch.cluster.has_running_capacity()
# Update of list element value is possible if one of the following is verified:
@@ -414,6 +477,7 @@ def condition_checker_login_nodes_stop_policy(_, patch):
),
"pcluster_stop": lambda change, patch: "Stop the compute fleet with the pcluster update-compute-fleet command",
"pcluster_stop_conditional": actions_needed_queue_update_strategy,
"pcluster_resize_conditional": actions_needed_resize_update_strategy_on_remove,
"managed_placement_group": actions_needed_managed_placement_group,
"shared_storage_update_conditional": actions_needed_shared_storage_update,
"managed_fsx": actions_needed_managed_fsx,
@@ -472,6 +536,15 @@ def condition_checker_login_nodes_stop_policy(_, patch):
condition_checker=condition_checker_queue_update_strategy,
)

# Update supported with fleet stopped or with replacement policy set to TERMINATE
UpdatePolicy.RESIZE_UPDATE_STRATEGY_ON_REMOVE = UpdatePolicy(
name="RESIZE_UPDATE_STRATEGY_ON_REMOVE",
level=5,
fail_reason=fail_reason_resize_update_strategy_on_remove,
action_needed=UpdatePolicy.ACTIONS_NEEDED["pcluster_resize_conditional"],
condition_checker=condition_checker_resize_update_strategy_on_remove,
)

# We must force COMPUTE_FLEET_STOP for the deletion of managed groups, otherwise fall back to QUEUE_UPDATE_STRATEGY
UpdatePolicy.MANAGED_PLACEMENT_GROUP = UpdatePolicy(
name="MANAGED_PLACEMENT_GROUP",
12 changes: 8 additions & 4 deletions cli/src/pcluster/schemas/cluster_schema.py
@@ -1501,8 +1501,12 @@ class SlurmComputeResourceSchema(_ComputeResourceSchema):
many=True,
metadata={"update_policy": UpdatePolicy.COMPUTE_FLEET_STOP_ON_REMOVE, "update_key": "InstanceType"},
)
max_count = fields.Int(validate=validate.Range(min=1), metadata={"update_policy": UpdatePolicy.MAX_COUNT})
min_count = fields.Int(validate=validate.Range(min=0), metadata={"update_policy": UpdatePolicy.COMPUTE_FLEET_STOP})
max_count = fields.Int(
validate=validate.Range(min=1), metadata={"update_policy": UpdatePolicy.RESIZE_UPDATE_STRATEGY_ON_REMOVE}
)
min_count = fields.Int(
validate=validate.Range(min=0), metadata={"update_policy": UpdatePolicy.RESIZE_UPDATE_STRATEGY_ON_REMOVE}
)
spot_price = fields.Float(
validate=validate.Range(min=0), metadata={"update_policy": UpdatePolicy.QUEUE_UPDATE_STRATEGY}
)
@@ -1649,7 +1653,7 @@ class SlurmQueueSchema(_CommonQueueSchema):
compute_resources = fields.Nested(
SlurmComputeResourceSchema,
many=True,
metadata={"update_policy": UpdatePolicy.COMPUTE_FLEET_STOP_ON_REMOVE, "update_key": "Name"},
metadata={"update_policy": UpdatePolicy.RESIZE_UPDATE_STRATEGY_ON_REMOVE, "update_key": "Name"},
)
networking = fields.Nested(
SlurmQueueNetworkingSchema, required=True, metadata={"update_policy": UpdatePolicy.QUEUE_UPDATE_STRATEGY}
@@ -1781,7 +1785,7 @@ class SchedulingSchema(BaseSchema):
slurm_queues = fields.Nested(
SlurmQueueSchema,
many=True,
metadata={"update_policy": UpdatePolicy.COMPUTE_FLEET_STOP_ON_REMOVE, "update_key": "Name"},
metadata={"update_policy": UpdatePolicy.RESIZE_UPDATE_STRATEGY_ON_REMOVE, "update_key": "Name"},
)
# Awsbatch schema:
aws_batch_queues = fields.Nested(
18 changes: 9 additions & 9 deletions cli/tests/pcluster/config/test_config_patch.py
@@ -163,7 +163,7 @@ def _sorting_func(change):
"MaxCount",
1,
2,
UpdatePolicy.MAX_COUNT,
UpdatePolicy.RESIZE_UPDATE_STRATEGY_ON_REMOVE,
False,
id="change compute resources max count",
),
@@ -332,19 +332,19 @@ def _test_compute_resources(base_conf, target_conf):
"ComputeResources",
{"Name": "compute-removed", "InstanceType": "c5.9xlarge", "MaxCount": 20},
None,
UpdatePolicy.COMPUTE_FLEET_STOP_ON_REMOVE,
UpdatePolicy.RESIZE_UPDATE_STRATEGY_ON_REMOVE,
is_list=True,
),
Change(
["Scheduling", "SlurmQueues[queue1]"],
"ComputeResources",
None,
{"Name": "new-compute", "InstanceType": "c5.large", "MinCount": 1},
UpdatePolicy.COMPUTE_FLEET_STOP_ON_REMOVE,
UpdatePolicy.RESIZE_UPDATE_STRATEGY_ON_REMOVE,
is_list=True,
),
],
UpdatePolicy.COMPUTE_FLEET_STOP_ON_REMOVE,
UpdatePolicy.RESIZE_UPDATE_STRATEGY_ON_REMOVE,
)


@@ -381,7 +381,7 @@ def _test_queues(base_conf, target_conf):
"ComputeResources": {"Name": "compute-removed", "InstanceType": "c5.9xlarge"},
},
None,
UpdatePolicy.COMPUTE_FLEET_STOP_ON_REMOVE,
UpdatePolicy.RESIZE_UPDATE_STRATEGY_ON_REMOVE,
is_list=True,
),
Change(
@@ -393,11 +393,11 @@
"Networking": {"SubnetIds": "subnet-987654321"},
"ComputeResources": {"Name": "new-compute", "InstanceType": "c5.xlarge"},
},
UpdatePolicy.COMPUTE_FLEET_STOP_ON_REMOVE,
UpdatePolicy.RESIZE_UPDATE_STRATEGY_ON_REMOVE,
is_list=True,
),
],
UpdatePolicy.COMPUTE_FLEET_STOP_ON_REMOVE,
UpdatePolicy.RESIZE_UPDATE_STRATEGY_ON_REMOVE,
)


@@ -566,7 +566,7 @@ def _test_less_target_sections(base_conf, target_conf):
"MinCount",
1,
None,
UpdatePolicy.COMPUTE_FLEET_STOP,
UpdatePolicy.RESIZE_UPDATE_STRATEGY_ON_REMOVE,
is_list=False,
),
Change(
@@ -670,7 +670,7 @@ def _test_more_target_sections(base_conf, target_conf):
"MinCount",
None,
1,
UpdatePolicy.COMPUTE_FLEET_STOP,
UpdatePolicy.RESIZE_UPDATE_STRATEGY_ON_REMOVE,
is_list=False,
),
Change(