Skip to content

Commit

Permalink
[Train] Adding more flexibility to the default checkpointing behavior (ray-project#37897)
Browse files Browse the repository at this point in the history

Signed-off-by: Kourosh Hakhamaneshi <[email protected]>
  • Loading branch information
kouroshHakha committed Jul 28, 2023
1 parent c254c96 commit cee8c30
Show file tree
Hide file tree
Showing 2 changed files with 14 additions and 1 deletion.
Original file line number Diff line number Diff line change
Expand Up @@ -461,6 +461,15 @@ def parse_args():
parser.add_argument(
"--num-epochs", type=int, default=1, help="Number of epochs to train for."
)
parser.add_argument(
"--num-checkpoints-to-keep",
type=int,
help=(
"Number of checkpoints to keep, if None, all checkpoints will be kept, "
"if set to n>=1, the top n checkpoint with min. evaluation perplexity "
"will be kept."
),
)
parser.add_argument("--lr", type=float, default=5e-6, help="Learning rate to use.")

parser.add_argument(
Expand Down Expand Up @@ -550,7 +559,9 @@ def main():
# sync_config=tune.SyncConfig(sync_artifacts=False),
storage_path=storage_path,
checkpoint_config=air.CheckpointConfig(
num_to_keep=1,
num_to_keep=args.num_checkpoints_to_keep,
checkpoint_score_attribute="perplexity",
checkpoint_score_order="min",
# Enable distributed checkpointing
_checkpoint_keep_all_ranks=True,
_checkpoint_upload_from_workers=True,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,8 @@ fine_tune() {
--train_path "${train_path}" \
--test_path "${test_path}" \
--special_token_path "${token_path}" \
--num-checkpoints-to-keep 1 \
--num-epochs 1 \
"${params[@]}"; then
echo "Failed to fine-tune the model. Exiting..."
exit 1
Expand Down

0 comments on commit cee8c30

Please sign in to comment.