Skip to content

Commit

Permalink
Modify to meet precommit requirements.
Browse files Browse the repository at this point in the history
  • Loading branch information
Filtee committed Jun 23, 2024
1 parent 34b2c4b commit defd54f
Show file tree
Hide file tree
Showing 3 changed files with 9 additions and 9 deletions.
4 changes: 2 additions & 2 deletions dlrover/python/elastic_agent/torch/ckpt_saver.py
Original file line number Diff line number Diff line change
Expand Up @@ -697,8 +697,8 @@ def _sync_node_checkpoint(
self, master_client: MasterClient, step: int, timeout: int
):
"""
Check whether all training nodes can save the checkpoint from the memory
to the storage. If some nodes fail, other nodes need not save.
Check whether all training nodes can save the checkpoint from the
memory to the storage. If some nodes fail, other nodes need not save.
"""
start = time.time()
while True:
Expand Down
8 changes: 4 additions & 4 deletions dlrover/trainer/torch/flash_checkpoint/engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,7 +149,7 @@ class CheckpointEngine(metaclass=ABCMeta):
Args:
checkpoint_dir (str): the directory to save checkpoint.
storage: a CheckpointStorage instance to write/read the storage.
comm_backend (str): the communication backend to create a process group,
comm_backend (str): communication backend to create a process group.
The default is the backend of the general main process group.
"""

Expand Down Expand Up @@ -260,7 +260,7 @@ def _notify_agent_to_create_saver(self):
"""Notify the agent in the main process to create a checkpoint saver"""
if self._local_rank != 0:
return
# the agent side will release the lock if the training process restarts.
# agent side will release the lock if the training process restarts.
queue = SharedQueue(name="factory")

local_shard_num = self.get_local_shard_num()
Expand Down Expand Up @@ -398,7 +398,7 @@ def save_to_memory(self, step, state_dict, paths: Dict[str, str]):
Args:
step (int): the global iteration step.
state_dict (dict): the state dict of the model and optimizer to save.
state_dict (dict): state dict of the model and optimizer to save.
paths (dict): the key is a category in
["model_states", "optim_states"] of the state dict, and
the value is the path of storage to save.
Expand All @@ -417,7 +417,7 @@ def save_to_storage(self, step, state_dict, paths: Dict[str, str]):
Args:
step (int): the iteration step.
state_dict (dict): the state dict of the model and optimizer to save.
state_dict (dict): state dict of the model and optimizer to save.
paths (dict): the key is a category in
["model_states", "optim_states"] of the state dict, and
the value is the path of storage to save.
Expand Down
6 changes: 3 additions & 3 deletions dlrover/trainer/torch/flash_checkpoint/full_ckpt_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@ def save_to_memory(self, step, state_dict, paths: Dict[str, str]):
Args:
step (int): the global iteration step.
state_dict (dict): the state dict of the model and optimizer to save.
state_dict (dict): state dict of the model and optimizer to save.
paths (dict): the key is a category in
["model_states", "optim_states"] of the state dict, and
the value is the path of storage to save.
Expand All @@ -126,7 +126,7 @@ def save_to_storage(self, step, state_dict, paths):
Args:
step (int): the global iteration step.
state_dict (dict): the state dict of the model and optimizer to save.
state_dict (dict): state dict of the model and optimizer to save.
paths (dict): the key is a category in
["model_states", "optim_states"] of the state dict, and
the value is the path of storage to save.
Expand All @@ -144,7 +144,7 @@ def save_to_storage(self, step, state_dict, paths):

def load(self, resume_path=""):
"""
The method will first try to load the state dict from the shared memory.
The method will first try to load state dict from the shared memory.
If there is no state dict in the shared memory, the method will
load the state dict from the storage.
Expand Down

0 comments on commit defd54f

Please sign in to comment.