Change update policy for MinCount, MaxCount, Queue and ComputeResource
Change update policy for
* MinCount and MaxCount, from COMPUTE_FLEET_STOP to RESIZE_UPDATE_STRATEGY_ON_REMOVE
* Queue and ComputeResource, from COMPUTE_FLEET_STOP_ON_REMOVE to RESIZE_UPDATE_STRATEGY_ON_REMOVE.

RESIZE_UPDATE_STRATEGY_ON_REMOVE is a new update policy that permits an update in any of the following cases:
* the compute fleet is stopped
* QueueUpdateStrategy is set to TERMINATE
* a new Queue is added
* a new ComputeResource is added
* MaxCount is increased
* MinCount is increased AND MaxCount is increased by at least the same amount (see the sketch below)
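
A minimal sketch of the MinCount/MaxCount rule from the last two bullets (illustrative only, not the ParallelCluster implementation; the helper name is hypothetical):

```python
def count_change_allowed(old_min, new_min, old_max, new_max):
    """Return True if the capacity change alone can be applied without stopping the fleet."""
    if new_min == old_min:
        # Only MaxCount changes: an increase never removes nodes, so it is always allowed
        return new_max > old_max
    if new_min > old_min:
        # MinCount grows: MaxCount must grow by at least the same amount, so the number
        # of dynamic nodes (MaxCount - MinCount) does not shrink and no node is removed
        return (new_max - old_max) >= (new_min - old_min)
    # Decreasing MinCount removes static nodes: it needs a stopped fleet or TERMINATE
    return False

count_change_allowed(old_min=5, new_min=7, old_max=10, new_max=12)  # True
count_change_allowed(old_min=5, new_min=7, old_max=10, new_max=11)  # False: MaxCount grew by less
```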

When setting QueueUpdateStrategy = TERMINATE, only the nodes at the back of the node list will be terminated.

Example:
* the cluster's initial capacity is `MinCount = 5` and `MaxCount = 10`; the nodes are `st-[1-5]; dy-[1-5]`
* when resizing the cluster to `MinCount = 3` and `MaxCount = 5`, the new cluster capacity will be composed of the nodes `st-[1-3]; dy-[1-2]`, which are not touched during the update
* the nodes `st-[4-5]; dy-[3-5]` will be terminated (see the sketch below)
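
A sketch of how this example resize plays out (illustrative only; the node naming and helper are hypothetical, not ParallelCluster code):

```python
def plan_resize(old_min, old_max, new_min, new_max):
    """Split the current node list into nodes kept and nodes terminated at the back."""
    static_nodes = [f"st-{i}" for i in range(1, old_min + 1)]
    dynamic_nodes = [f"dy-{i}" for i in range(1, old_max - old_min + 1)]
    kept = static_nodes[:new_min] + dynamic_nodes[:new_max - new_min]
    terminated = static_nodes[new_min:] + dynamic_nodes[new_max - new_min:]
    return kept, terminated

kept, terminated = plan_resize(old_min=5, old_max=10, new_min=3, new_max=5)
# kept       -> ['st-1', 'st-2', 'st-3', 'dy-1', 'dy-2']
# terminated -> ['st-4', 'st-5', 'dy-3', 'dy-4', 'dy-5']
```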

This is made possible by the adoption of Slurm 23.11, ref https://slurm.schedmd.com/news.html

Signed-off-by: Luca Carrogu <[email protected]>
lukeseawalker committed Jan 16, 2024
1 parent 7628936 commit 314ccd9
Showing 9 changed files with 693 additions and 92 deletions.
6 changes: 5 additions & 1 deletion CHANGELOG.md
@@ -5,6 +5,10 @@ CHANGELOG
------

**ENHANCEMENTS**
- Allow updating the `MinCount`, `MaxCount`, `Queue` and `ComputeResource` configuration parameters without stopping
the compute fleet. They can now be updated by setting `Scheduling/SlurmSettings/QueueUpdateStrategy`
to TERMINATE. ParallelCluster will terminate only the nodes removed during a resize of the cluster capacity
performed through a cluster update.
- Add support for installing Intel OneAPI Base Toolkit and HPC Toolkit, and Intel Python.
- Intel OneAPI Base Toolkits: 2023.2.0
- Intel OneAPI HPC Toolkits: 2023.2.0
@@ -19,7 +23,7 @@ CHANGELOG
- Add support for Python 3.11, 3.12 in pcluster CLI and aws-parallelcluster-batch-cli.
- Upgrade Python to version 3.12 and NodeJS to version 18 in ParallelCluster Lambda Layer.
- Build network interfaces using network card index from `NetworkCardIndex` list of EC2 DescribeInstances response,
instead of looping over `MaximumNetworkCards` range.
instead of looping over `MaximumNetworkCards` range.

3.8.0
------
75 changes: 74 additions & 1 deletion cli/src/pcluster/config/update_policy.py
@@ -12,7 +12,7 @@
from enum import Enum

from pcluster.config.cluster_config import QueueUpdateStrategy
from pcluster.constants import AWSBATCH, DEFAULT_MAX_COUNT, SLURM
from pcluster.constants import AWSBATCH, DEFAULT_MAX_COUNT, DEFAULT_MIN_COUNT, SLURM


class UpdatePolicy:
@@ -113,6 +113,15 @@ def actions_needed_queue_update_strategy(change, _):
return actions


def actions_needed_resize_update_strategy_on_remove(*_):
return (
"Stop the compute fleet with the pcluster update-compute-fleet command, "
"or set QueueUpdateStrategy to TERMINATE in the configuration used for the 'update-cluster' operation. "
"Be aware that this update will remove nodes from the scheduler and terminates the EC2 instances "
"associated. Jobs running on the removed nodes will terminate"
)


def actions_needed_managed_placement_group(change, patch):
if is_managed_placement_group_deletion(change, patch):
actions = "Stop the compute fleet with the pcluster update-compute-fleet command."
@@ -258,6 +267,10 @@ def fail_reason_queue_update_strategy(change, _):
return reason


def fail_reason_resize_update_strategy_on_remove(*_):
return "All compute nodes must be stopped or QueueUpdateStrategy must be set to TERMINATE"


def fail_reason_managed_placement_group(change, patch):
if is_managed_placement_group_deletion(change, patch):
reason = "All compute nodes must be stopped for a managed placement group deletion"
@@ -287,6 +300,16 @@ def is_queue_update_strategy_set(patch):
)


def is_resize_update_strategy_terminate(patch):
# Return true if the update strategy is set to TERMINATE
update_strategy = (
patch.target_config.get("Scheduling")
.get("SlurmSettings", {})
.get("QueueUpdateStrategy", QueueUpdateStrategy.COMPUTE_FLEET_STOP.value)
)
return update_strategy == QueueUpdateStrategy.TERMINATE.value


def condition_checker_queue_update_strategy(change, patch):
result = not patch.cluster.has_running_capacity()
# QueueUpdateStrategy can override UpdatePolicy of parameters under SlurmQueues
@@ -296,6 +319,46 @@ def condition_checker_queue_update_strategy(change, patch):
return result


def condition_checker_resize_update_strategy_on_remove(change, patch):
# Check if fleet is stopped
result = not patch.cluster.has_running_capacity()

# Check if the change is inside a Queue section
if not result and (is_slurm_queues_change(change) or change.key == "SlurmQueues"):
# Check if QueueUpdateStrategy is TERMINATE
result = is_resize_update_strategy_terminate(patch)

# Queue or ComputeResource can be added
if not result and change.is_list:
result = change.old_value is None and change.new_value is not None

# Check if MaxCount is increased
if not result and change.key == "MaxCount":
result = convert_value_to_int(change.new_value, DEFAULT_MAX_COUNT) >= convert_value_to_int(
change.old_value, DEFAULT_MAX_COUNT
)

# Check if MinCount is increased and MaxCount is increased of at least the same amount
if not result and change.key == "MinCount":
path = change.path
for other_change in patch.changes:
# Check the value of MaxCount next to MinCount.
# MinCount and MaxCount next to each other have the same path
if path == other_change.path and other_change.key == "MaxCount":
other_change_new_value = convert_value_to_int(other_change.new_value, DEFAULT_MAX_COUNT)
other_change_old_value = convert_value_to_int(other_change.old_value, DEFAULT_MAX_COUNT)
change_new_value = convert_value_to_int(change.new_value, DEFAULT_MIN_COUNT)
change_old_value = convert_value_to_int(change.old_value, DEFAULT_MIN_COUNT)
result = (other_change_new_value - other_change_old_value) >= (change_new_value - change_old_value)
break

return result


def convert_value_to_int(value, default):
return int(value) if value is not None else default


def condition_checker_queue_update_strategy_on_remove(change, patch):
result = not patch.cluster.has_running_capacity()
# Update of list element value is possible if one of the following is verified:
@@ -414,6 +477,7 @@ def condition_checker_login_nodes_stop_policy(_, patch):
),
"pcluster_stop": lambda change, patch: "Stop the compute fleet with the pcluster update-compute-fleet command",
"pcluster_stop_conditional": actions_needed_queue_update_strategy,
"pcluster_resize_conditional": actions_needed_resize_update_strategy_on_remove,
"managed_placement_group": actions_needed_managed_placement_group,
"shared_storage_update_conditional": actions_needed_shared_storage_update,
"managed_fsx": actions_needed_managed_fsx,
@@ -472,6 +536,15 @@ def condition_checker_login_nodes_stop_policy(_, patch):
condition_checker=condition_checker_queue_update_strategy,
)

# Update supported with fleet stopped or with replacement policy set to TERMINATE
UpdatePolicy.RESIZE_UPDATE_STRATEGY_ON_REMOVE = UpdatePolicy(
name="RESIZE_UPDATE_STRATEGY_ON_REMOVE",
level=5,
fail_reason=fail_reason_resize_update_strategy_on_remove,
action_needed=UpdatePolicy.ACTIONS_NEEDED["pcluster_resize_conditional"],
condition_checker=condition_checker_resize_update_strategy_on_remove,
)

# We must force COMPUTE_FLEET_STOP for the deletion of managed groups, otherwise fall back to QUEUE_UPDATE_STRATEGY
UpdatePolicy.MANAGED_PLACEMENT_GROUP = UpdatePolicy(
name="MANAGED_PLACEMENT_GROUP",
12 changes: 8 additions & 4 deletions cli/src/pcluster/schemas/cluster_schema.py
@@ -1501,8 +1501,12 @@ class SlurmComputeResourceSchema(_ComputeResourceSchema):
many=True,
metadata={"update_policy": UpdatePolicy.COMPUTE_FLEET_STOP_ON_REMOVE, "update_key": "InstanceType"},
)
max_count = fields.Int(validate=validate.Range(min=1), metadata={"update_policy": UpdatePolicy.MAX_COUNT})
min_count = fields.Int(validate=validate.Range(min=0), metadata={"update_policy": UpdatePolicy.COMPUTE_FLEET_STOP})
max_count = fields.Int(
validate=validate.Range(min=1), metadata={"update_policy": UpdatePolicy.RESIZE_UPDATE_STRATEGY_ON_REMOVE}
)
min_count = fields.Int(
validate=validate.Range(min=0), metadata={"update_policy": UpdatePolicy.RESIZE_UPDATE_STRATEGY_ON_REMOVE}
)
spot_price = fields.Float(
validate=validate.Range(min=0), metadata={"update_policy": UpdatePolicy.QUEUE_UPDATE_STRATEGY}
)
@@ -1649,7 +1653,7 @@ class SlurmQueueSchema(_CommonQueueSchema):
compute_resources = fields.Nested(
SlurmComputeResourceSchema,
many=True,
metadata={"update_policy": UpdatePolicy.COMPUTE_FLEET_STOP_ON_REMOVE, "update_key": "Name"},
metadata={"update_policy": UpdatePolicy.RESIZE_UPDATE_STRATEGY_ON_REMOVE, "update_key": "Name"},
)
networking = fields.Nested(
SlurmQueueNetworkingSchema, required=True, metadata={"update_policy": UpdatePolicy.QUEUE_UPDATE_STRATEGY}
@@ -1781,7 +1785,7 @@ class SchedulingSchema(BaseSchema):
slurm_queues = fields.Nested(
SlurmQueueSchema,
many=True,
metadata={"update_policy": UpdatePolicy.COMPUTE_FLEET_STOP_ON_REMOVE, "update_key": "Name"},
metadata={"update_policy": UpdatePolicy.RESIZE_UPDATE_STRATEGY_ON_REMOVE, "update_key": "Name"},
)
# Awsbatch schema:
aws_batch_queues = fields.Nested(
18 changes: 9 additions & 9 deletions cli/tests/pcluster/config/test_config_patch.py
@@ -163,7 +163,7 @@ def _sorting_func(change):
"MaxCount",
1,
2,
UpdatePolicy.MAX_COUNT,
UpdatePolicy.RESIZE_UPDATE_STRATEGY_ON_REMOVE,
False,
id="change compute resources max count",
),
@@ -332,19 +332,19 @@ def _test_compute_resources(base_conf, target_conf):
"ComputeResources",
{"Name": "compute-removed", "InstanceType": "c5.9xlarge", "MaxCount": 20},
None,
UpdatePolicy.COMPUTE_FLEET_STOP_ON_REMOVE,
UpdatePolicy.RESIZE_UPDATE_STRATEGY_ON_REMOVE,
is_list=True,
),
Change(
["Scheduling", "SlurmQueues[queue1]"],
"ComputeResources",
None,
{"Name": "new-compute", "InstanceType": "c5.large", "MinCount": 1},
UpdatePolicy.COMPUTE_FLEET_STOP_ON_REMOVE,
UpdatePolicy.RESIZE_UPDATE_STRATEGY_ON_REMOVE,
is_list=True,
),
],
UpdatePolicy.COMPUTE_FLEET_STOP_ON_REMOVE,
UpdatePolicy.RESIZE_UPDATE_STRATEGY_ON_REMOVE,
)


@@ -381,7 +381,7 @@ def _test_queues(base_conf, target_conf):
"ComputeResources": {"Name": "compute-removed", "InstanceType": "c5.9xlarge"},
},
None,
UpdatePolicy.COMPUTE_FLEET_STOP_ON_REMOVE,
UpdatePolicy.RESIZE_UPDATE_STRATEGY_ON_REMOVE,
is_list=True,
),
Change(
@@ -393,11 +393,11 @@
"Networking": {"SubnetIds": "subnet-987654321"},
"ComputeResources": {"Name": "new-compute", "InstanceType": "c5.xlarge"},
},
UpdatePolicy.COMPUTE_FLEET_STOP_ON_REMOVE,
UpdatePolicy.RESIZE_UPDATE_STRATEGY_ON_REMOVE,
is_list=True,
),
],
UpdatePolicy.COMPUTE_FLEET_STOP_ON_REMOVE,
UpdatePolicy.RESIZE_UPDATE_STRATEGY_ON_REMOVE,
)


@@ -566,7 +566,7 @@ def _test_less_target_sections(base_conf, target_conf):
"MinCount",
1,
None,
UpdatePolicy.COMPUTE_FLEET_STOP,
UpdatePolicy.RESIZE_UPDATE_STRATEGY_ON_REMOVE,
is_list=False,
),
Change(
@@ -670,7 +670,7 @@ def _test_more_target_sections(base_conf, target_conf):
"MinCount",
None,
1,
UpdatePolicy.COMPUTE_FLEET_STOP,
UpdatePolicy.RESIZE_UPDATE_STRATEGY_ON_REMOVE,
is_list=False,
),
Change(