
Commit

update(auto-resume): use getopts for deploy args and exclude heartbeat pulse on logging
jon-tow committed May 18, 2023
1 parent 6830d9b commit c94fa58
Showing 4 changed files with 45 additions and 26 deletions.
12 changes: 7 additions & 5 deletions README-STABLE-LM.md
@@ -55,21 +55,23 @@ This test runs the model on a small dataset and checks that the output is correct.
 To run a complete training run, use the provided `deploy.sh` script that uses slurm to configure everything:
 
 ```bash
-bash deploy.sh <num_nodes> <config_file_name> <run_name>
+bash deploy.sh -n <num_nodes> -c <config_path> -j <job_name>
+
+# Or using long form arg names:
+
+bash deploy.sh --nodes <num_nodes> --config <config_path> --jobname <job_name>
 ```
 
-**Note:** For <config_file_name>, make sure you do not include the .yml extension.
-
 If you want to test a single node training with sbatch, you can use the following command:
 
 ```bash
-bash deploy.sh 1 ./config/test_config single_node_test
+bash deploy.sh -c ./configs/test_config -j single_node_test -n 1
 ```
 
 To perform a multi-node training run, you can use the following command:
 
 ```bash
-bash deploy.sh 2 ./config/test_multinode_config multi_node_test
+bash deploy.sh --config ./configs/test_multi_node_config --jobname multi_node_test --nodes 2
 ```
 
 For multinode, ensure that your config file has the following settings:
54 changes: 35 additions & 19 deletions deploy.sh
@@ -1,27 +1,47 @@
 #!/bin/bash
 
-mkdir sbatches
-cat << EOF > sbatches/sbatch_runner_$1.sh
+usage()
+{
+  echo "Usage: bash deploy.sh [ -c | --config ] [ -j | --jobname ] [ -n | --nodes ]"
+  exit 2
+}
+
+PARSED_ARGUMENTS=$(getopt -o c:j:n: --long config:,jobname:,nodes: -- "$@")
+echo "PARSED_ARGUMENTS is $PARSED_ARGUMENTS"
+eval set -- "$PARSED_ARGUMENTS"
+
+while true ; do
+  case "$1" in
+    -c|--config) config=$2 ; shift 2 ;;
+    -j|--jobname) jobname=$2 ; shift 2 ;;
+    -n|--nodes) nodes=$2 ; shift 2 ;;
+    --) shift; break ;;
+    *) echo "Unexpected option: $1 - this should not happen."
+       usage ;;
+  esac
+done
+
+mkdir -p sbatches
+cat << EOF > sbatches/sbatch_runner_$jobname.sh
 #!/bin/bash
 #SBATCH --account="stablegpt"
-#SBATCH --job-name="$1"
+#SBATCH --job-name=${jobname}
 #SBATCH --partition=g40
-#SBATCH --nodes=$2 # Set > 1 for multi-node
+#SBATCH --nodes=$nodes # Set > 1 for multi-node
 #SBATCH --ntasks-per-node=8
 #SBATCH --cpus-per-task=12
 #SBATCH --mem-per-cpu=11G
 #SBATCH --gres=gpu:8
 #SBATCH --exclusive
-#SBATCH --output=logs/neox_%j.out
-#SBATCH --error=logs/neox_%j.err
 #SBATCH --exclude=ip-26-0-140-150,ip-26-0-134-43,ip-26-0-143-225,ip-26-0-135-173,ip-26-0-129-240,ip-26-0-140-63,ip-26-0-133-115,ip-26-0-131-[143,183,188,201],ip-26-0-132-214,ip-26-0-133-[81,126],ip-26-0-136-[27,42],ip-26-0-138-51,ip-26-0-140-[123-124],ip-26-0-143-[111,121,235,250],ip-26-0-129-197,ip-26-0-130-[37,127,132,150,164,193],ip-26-0-134-201,ip-26-0-137-[115,168,184,196],ip-26-0-138-208,ip-26-0-141-70,ip-26-0-142-[3,13]
+#SBATCH --output=logs/%x_%j.out
+#SBATCH --error=logs/%x_%j.err
 KILLED=137
 TERMINATED=143
 ABORTED=134
 REPEAT_COUNTER=\${1:-0}
-MAX_RUNS=1
+MAX_RUNS=2
 source /etc/profile.d/modules.sh
 module load openmpi cuda/11.7
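The new argument parser above is plain GNU getopt, so it can be exercised outside of slurm. Below is a minimal standalone sketch of the same pattern; the flag names mirror deploy.sh, while the script name `parse_demo.sh` and the final echo are illustrative only:

```bash
#!/bin/bash
# parse_demo.sh -- standalone sketch of the getopt pattern used in deploy.sh.
# Requires GNU getopt (util-linux); exits early if parsing fails.
PARSED_ARGUMENTS=$(getopt -o c:j:n: --long config:,jobname:,nodes: -- "$@") || exit 2
eval set -- "$PARSED_ARGUMENTS"

while true ; do
  case "$1" in
    -c|--config)  config=$2  ; shift 2 ;;   # consume the flag and its value
    -j|--jobname) jobname=$2 ; shift 2 ;;
    -n|--nodes)   nodes=$2   ; shift 2 ;;
    --) shift ; break ;;                    # getopt's end-of-options marker
    *)  echo "Unexpected option: $1" >&2 ; exit 2 ;;
  esac
done

echo "config=${config} jobname=${jobname} nodes=${nodes}"
```

Running `bash parse_demo.sh --config ./configs/test_config -j demo -n 1` prints all three values; short and long forms can be mixed freely because the `case` arms match both spellings.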
@@ -40,6 +60,7 @@ export LD_LIBRARY_PATH=\$CONDA_HOME/lib:\$LD_LIBRARY_PATH
 export CPATH=\$CONDA_HOME/include:\$CPATH
 ###########################################################
+###########################################################
 # CUDA/Torch Setup
 ###########################################################
@@ -103,14 +124,6 @@ export DLTS_HOSTFILE=\$CWD/hostfiles/hosts_\$SLURM_JOBID
 ###########################################################
-sig_handler()
-{
-  echo "BATCH interrupted"
-  wait # wait for all children, this is important!
-}
-trap 'sig_handler' SIGINT SIGTERM SIGCONT
 ###########################################################
 # Environment Setup
 # TODO: Replace with your own environment setup
@@ -132,19 +145,22 @@ TRAIN_PATH=\$CWD
 cd \$TRAIN_PATH
 wandb login --relogin --host https://stability.wandb.io local-edea3863613ef71e1ef7532673fdf4b46bc5ffd7
 git config --global --add safe.directory \$TRAIN_PATH
-python ./deepy.py train.py $3
+echo "$0 = \$0"
+bash -c 'python ./deepy.py train.py ${config}; exit \$?'
 RETVAL=\$?
-echo "Test process returned \${RETVAL}"
+echo "RETVAL = \${RETVAL}"
 # choose your action, we retry when process aborted,killed or signalled but not when it exited with 0 or non-zero code
+# but only up to MAX_RUNS attempts
 if [ \${RETVAL} -eq \${ABORTED} -o \${RETVAL} -eq \${TERMINATED} -o \${RETVAL} -eq \${KILLED} ]
 then
   let run=\${REPEAT_COUNTER}+1
   if [ \${run} -lt \${MAX_RUNS} ]
   then
     echo "Resubmitting job. Retry number = \${run}"
     sbatch \$0 \${run}
   fi
 fi
 EOF
 
-sbatch sbatches/sbatch_runner_$1.sh
+sbatch sbatches/sbatch_runner_${jobname}.sh
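For anyone tracing the auto-resume behavior: the generated script resubmits itself only when training dies from a signal (exit codes 134 = SIGABRT, 137 = SIGKILL, 143 = SIGTERM), and the bump to `MAX_RUNS=2` permits a single retry. Here is a runnable sketch of the same pattern, assuming plain `bash` in place of `sbatch` and a placeholder command that deliberately dies via SIGTERM:

```bash
#!/bin/bash
# retry_demo.sh -- sketch of the auto-resume loop from the generated sbatch
# script. The placeholder command below stands in for `python ./deepy.py
# train.py ...`, and `bash "$0"` stands in for `sbatch $0`.
KILLED=137
TERMINATED=143
ABORTED=134
REPEAT_COUNTER=${1:-0}
MAX_RUNS=2

bash -c 'kill -TERM $$'   # placeholder: exits with status 143 (SIGTERM)
RETVAL=$?
echo "RETVAL = ${RETVAL}"

# Retry on abort/kill/terminate only -- never on a clean or ordinary nonzero
# exit -- and stop once MAX_RUNS attempts have been made.
if [ ${RETVAL} -eq ${ABORTED} -o ${RETVAL} -eq ${TERMINATED} -o ${RETVAL} -eq ${KILLED} ]
then
  let run=${REPEAT_COUNTER}+1
  if [ ${run} -lt ${MAX_RUNS} ]
  then
    echo "Rerunning. Retry number = ${run}"
    bash "$0" "${run}"
  fi
fi
```

The retry counter travels as the script's first positional argument, which is why the generated sbatch script reads `REPEAT_COUNTER=\${1:-0}` and resubmits with `sbatch \$0 \${run}`.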
2 changes: 1 addition & 1 deletion megatron/neox_arguments/neox_args.py
@@ -798,7 +798,7 @@ class NeoXArgsTraining(NeoXArgsTemplate):
     Timeout for initializing ranks. If a rank does not initialize within this time, the program will exit.
     """
 
-    heartbeat_timeout: int = 300
+    heartbeat_timeout: int = 60
     """
     Timeout for heartbeats between ranks. If a rank does not send a heartbeat within this time, the program will exit.
     """
3 changes: 2 additions & 1 deletion megatron/training.py
@@ -822,7 +822,7 @@ def train(
     # to monitor if we've skipped many iterations in a row and trigger an early exit
     overflow_monitor = OverflowMonitor(optimizer)
     while iteration < neox_args.train_iters:
-        hb.pulse()
+        hb.start()
        loss_dict, skipped_iter = train_step(
            neox_args=neox_args,
            timers=timers,
@@ -844,6 +844,7 @@ def train(
             lr = optimizer.param_groups[0].get("lr", 0)
         else:
             lr = 0
+        hb.stop()
 
         # Logging.
         report_memory_flag = training_log(
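The change from a single `hb.pulse()` to an `hb.start()`/`hb.stop()` pair means only the training step is measured against `heartbeat_timeout`; the logging that follows runs outside the window, which is presumably why the default timeout could drop from 300 to 60 seconds. The real monitor is a Python object, but the start/stop idea can be sketched as a hypothetical shell watchdog:

```bash
#!/bin/bash
# watchdog_demo.sh -- illustrative analogue of the start/stop heartbeat; the
# actual monitor lives in megatron/training.py. Work between hb_start and
# hb_stop must finish within HEARTBEAT_TIMEOUT; work outside the window
# (e.g. logging) is never counted. Shortened from 60s so the demo exits fast.
HEARTBEAT_TIMEOUT=5

hb_start() {
  # Background timer: if it is not cancelled in time, kill the whole script.
  ( sleep "${HEARTBEAT_TIMEOUT}" && echo "heartbeat timeout" >&2 && kill -TERM $$ ) &
  HB_PID=$!
}

hb_stop() {
  kill "${HB_PID}" 2>/dev/null   # cancel the timer; the step beat the clock
}

for step in 1 2 3 ; do
  hb_start
  sleep 1                        # stand-in for train_step
  hb_stop
  echo "step ${step}: logged"    # logging happens outside the timed window
done
```

Replacing `sleep 1` with `sleep 10` trips the watchdog and terminates the loop, mirroring the "program will exit" behavior described in the `heartbeat_timeout` docstring above.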
