
Commit

update(auto-resume): use getopts for deploy args and exclude heartbeat pulse on logging
jon-tow committed May 18, 2023
1 parent 6830d9b commit 67c654e
Showing 3 changed files with 44 additions and 25 deletions.
12 changes: 7 additions & 5 deletions README-STABLE-LM.md
@@ -55,21 +55,23 @@ This test runs the model on a small dataset and checks that the output is correct.
To run a complete training run, use the provided `deploy.sh` script that uses slurm to configure everything:

```bash
-bash deploy.sh <num_nodes> <config_file_name> <run_name>
+bash deploy.sh -n <num_nodes> -c <config_path> -j <job_name>
+
+# Or using long form arg names:
+
+bash deploy.sh --nodes <num_nodes> --config <config_path> --jobname <job_name>
```
-
-**Note:** For <config_file_name>, make sure you do not include the .yml extension.

If you want to test a single node training with sbatch, you can use the following command:

```bash
-bash deploy.sh 1 ./config/test_config single_node_test
+bash deploy.sh -c ./configs/test_config -j single_node_test -n 1
```

To perform a multi-node training run, you can use the following command:

```bash
-bash deploy.sh 2 ./config/test_multinode_config multi_node_test
+bash deploy.sh --config ./configs/test_multi_node_config --jobname multi_node_test --nodes 2
```

For multinode, ensure that your config file has the following settings:
54 changes: 35 additions & 19 deletions deploy.sh
@@ -1,27 +1,47 @@
#!/bin/bash

-mkdir sbatches
-cat << EOF > sbatches/sbatch_runner_$1.sh
+usage()
+{
+  echo "Usage: bash deploy.sh [ -c | --config ] [ -j | --jobname ] [ -n | --nodes ]"
+  exit 2
+}
+
+PARSED_ARGUMENTS=$(getopt -o c:j:n: --long config:,jobname:,nodes: -- "$@")
+echo "PARSED_ARGUMENTS is $PARSED_ARGUMENTS"
+eval set -- "$PARSED_ARGUMENTS"
+
+while true ; do
+  case "$1" in
+    -c|--config) config=$2 ; shift 2 ;;
+    -j|--jobname) jobname=$2 ; shift 2 ;;
+    -n|--nodes) nodes=$2 ; shift 2 ;;
+    --) shift; break ;;
+    *) echo "Unexpected option: $1 - this should not happen."
+       usage ;;
+  esac
+done
+
+mkdir -p sbatches
+cat << EOF > sbatches/sbatch_runner_$jobname.sh
#!/bin/bash
#SBATCH --account="stablegpt"
-#SBATCH --job-name="$1"
+#SBATCH --job-name=${jobname}
#SBATCH --partition=g40
-#SBATCH --nodes=$2 # Set > 1 for multi-node
+#SBATCH --nodes=$nodes # Set > 1 for multi-node
#SBATCH --ntasks-per-node=8
#SBATCH --cpus-per-task=12
#SBATCH --mem-per-cpu=11G
#SBATCH --gres=gpu:8
#SBATCH --exclusive
-#SBATCH --output=logs/neox_%j.out
-#SBATCH --error=logs/neox_%j.err
#SBATCH --exclude=ip-26-0-140-150,ip-26-0-134-43,ip-26-0-143-225,ip-26-0-135-173,ip-26-0-129-240,ip-26-0-140-63,ip-26-0-133-115,ip-26-0-131-[143,183,188,201],ip-26-0-132-214,ip-26-0-133-[81,126],ip-26-0-136-[27,42],ip-26-0-138-51,ip-26-0-140-[123-124],ip-26-0-143-[111,121,235,250],ip-26-0-129-197,ip-26-0-130-[37,127,132,150,164,193],ip-26-0-134-201,ip-26-0-137-[115,168,184,196],ip-26-0-138-208,ip-26-0-141-70,ip-26-0-142-[3,13]
+#SBATCH --output=logs/%x_%j.out
+#SBATCH --error=logs/%x_%j.err
KILLED=137
TERMINATED=143
ABORTED=134
REPEAT_COUNTER=\${1:-0}
-MAX_RUNS=1
+MAX_RUNS=2
source /etc/profile.d/modules.sh
module load openmpi cuda/11.7
@@ -40,6 +60,7 @@ export LD_LIBRARY_PATH=\$CONDA_HOME/lib:\$LD_LIBRARY_PATH
export CPATH=\$CONDA_HOME/include:\$CPATH
###########################################################
+###########################################################
# CUDA/Torch Setup
###########################################################
@@ -103,14 +124,6 @@ export DLTS_HOSTFILE=\$CWD/hostfiles/hosts_\$SLURM_JOBID
-###########################################################
-sig_handler()
-{
-  echo "BATCH interrupted"
-  wait # wait for all children, this is important!
-}
-trap 'sig_handler' SIGINT SIGTERM SIGCONT
-###########################################################
# Environment Setup
# TODO: Replace with your own environment setup
@@ -132,19 +145,22 @@ TRAIN_PATH=\$CWD
cd \$TRAIN_PATH
wandb login --relogin --host https://stability.wandb.io local-edea3863613ef71e1ef7532673fdf4b46bc5ffd7
git config --global --add safe.directory \$TRAIN_PATH
-python ./deepy.py train.py $3
+echo "$0 = \$0"
+bash -c 'python ./deepy.py train.py ${config}; exit \$?'
RETVAL=\$?
-echo "Test process returned \${RETVAL}"
+echo "RETVAL = \${RETVAL}"
# choose your action; we retry when the process was aborted, killed, or signalled, but not when it exited with a 0 or non-zero code,
# and only up to MAX_RUNS attempts
if [ \${RETVAL} -eq \${ABORTED} -o \${RETVAL} -eq \${TERMINATED} -o \${RETVAL} -eq \${KILLED} ]
then
  let run=\${REPEAT_COUNTER}+1
  if [ \${run} -lt \${MAX_RUNS} ]
  then
    echo "Resubmitting job. Retry number = \${run}"
    sbatch \$0 \${run}
  fi
fi
EOF

-sbatch sbatches/sbatch_runner_$1.sh
+sbatch sbatches/sbatch_runner_${jobname}.sh
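For reference, the generated sbatch script resubmits itself with an incremented retry counter whenever the training process dies from a signal, up to MAX_RUNS total attempts. (Note that despite the commit title, the script parses flags with GNU `getopt`, the external command that supports long options, rather than the bash `getopts` builtin.) Below is a minimal standalone sketch of that auto-resume pattern, not part of the commit: a self-terminating command stands in for the training run, and a plain `bash "$0"` stands in for `sbatch $0`.

```bash
#!/bin/bash
# Standalone sketch of the auto-resume pattern above (illustration only):
# rerun when the child died from a signal, up to MAX_RUNS total attempts.

KILLED=137      # 128 + SIGKILL(9),  e.g. OOM killer
TERMINATED=143  # 128 + SIGTERM(15), e.g. node preemption
ABORTED=134     # 128 + SIGABRT(6)
REPEAT_COUNTER=${1:-0}
MAX_RUNS=2

bash -c 'kill -TERM $$'  # stand-in for `python ./deepy.py train.py ...`; exits with 143
RETVAL=$?
echo "RETVAL = ${RETVAL}"

if [ ${RETVAL} -eq ${ABORTED} -o ${RETVAL} -eq ${TERMINATED} -o ${RETVAL} -eq ${KILLED} ]
then
  let run=${REPEAT_COUNTER}+1
  if [ ${run} -lt ${MAX_RUNS} ]
  then
    echo "Resubmitting job. Retry number = ${run}"
    bash "$0" "${run}"  # the real script resubmits with: sbatch $0 ${run}
  fi
fi
```

Running this with no arguments produces one retry: the first invocation sees exit code 143 and relaunches itself with counter 1; the second sees counter 1, computes run=2, and stops because 2 is not less than MAX_RUNS.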
3 changes: 2 additions & 1 deletion megatron/training.py
@@ -822,7 +822,7 @@ def train(
    # to monitor if we've skipped many iterations in a row and trigger an early exit
    overflow_monitor = OverflowMonitor(optimizer)
    while iteration < neox_args.train_iters:
-        hb.pulse()
+        hb.start()
        loss_dict, skipped_iter = train_step(
            neox_args=neox_args,
            timers=timers,
@@ -844,6 +844,7 @@
            lr = optimizer.param_groups[0].get("lr", 0)
        else:
            lr = 0
+        hb.stop()

        # Logging.
        report_memory_flag = training_log(
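The training.py change replaces the single `hb.pulse()` at the top of the loop with a `start()`/`stop()` pair bracketing only the training step, so time spent in the logging block below no longer counts toward the heartbeat interval and cannot trigger a spurious stall alert. The `Heartbeat` class in this sketch is a hypothetical stand-in for illustration; the diff does not show the project's actual monitor implementation:

```python
import time

class Heartbeat:
    """Hypothetical watchdog stand-in (the real class is not shown in the
    diff): it times only the region bracketed by start()/stop(), so slow
    work outside that region, such as logging, is excluded."""

    def __init__(self, timeout_s: float = 60.0):
        self.timeout_s = timeout_s
        self._t0 = None

    def start(self):
        self._t0 = time.monotonic()

    def stop(self):
        if self._t0 is None:
            return
        elapsed = time.monotonic() - self._t0
        self._t0 = None
        if elapsed > self.timeout_s:
            print(f"heartbeat: step took {elapsed:.1f}s (> {self.timeout_s}s)")

# Shape of the loop after this commit:
hb = Heartbeat(timeout_s=60.0)
for iteration in range(3):
    hb.start()
    time.sleep(0.01)  # stand-in for train_step(...)
    hb.stop()
    time.sleep(0.02)  # stand-in for training_log(...); now outside the timed region
```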
