Merge pull request #22 from EleutherAI/add_configs

add small -> 2-7B train scripts
EleutherAI · Feb 10, 2021 · 796d8af · 796d8af
2 parents f674530 + 366ae0f
commit 796d8af
Show file tree

Hide file tree

Showing 12 changed files with 713 additions and 43 deletions.
diff --git a/examples/ds_config.json → configs/deepspeed_configs/ds_config.json b/examples/ds_config.json → configs/deepspeed_configs/ds_config.json
diff --git a/examples/ds_config_flops_profile.json → ...speed_configs/ds_zero_stage_1_config.json b/examples/ds_config_flops_profile.json → ...speed_configs/ds_zero_stage_1_config.json
@@ -2,6 +2,14 @@
  "train_batch_size": 240,
  "train_micro_batch_size_per_gpu": 4,
  "steps_per_print": 10,
+ "zero_optimization": {
+ "stage": 1,
+ "allgather_partitions": true,
+ "reduce_scatter": true,
+ "allgather_bucket_size": 50000000,
+ "reduce_bucket_size": 50000000,
+ "overlap_comm": true
+ },
  "optimizer": {
  "type": "Adam",
  "params": {
@@ -19,13 +27,6 @@
  "hysteresis": 2,
  "min_loss_scale": 1
  },
- "flops_profiler": {
- "enabled": true,
- "start_step": 5,
- "end_step": 10,
- "module_depth": -1,
- "top_modules": 3
- },
  "wall_clock_breakdown": true,
  "zero_allow_untested_optimizer": false
 }
diff --git a/configs/deepspeed_configs/ds_zero_stage_1_config_2-7B.json b/configs/deepspeed_configs/ds_zero_stage_1_config_2-7B.json
@@ -0,0 +1,32 @@
+{
+ "train_batch_size": 240,
+ "train_micro_batch_size_per_gpu": 2,
+ "steps_per_print": 10,
+ "zero_optimization": {
+ "stage": 1,
+ "allgather_partitions": true,
+ "reduce_scatter": true,
+ "allgather_bucket_size": 5000000,
+ "reduce_bucket_size": 5000000,
+ "overlap_comm": true
+ },
+ "optimizer": {
+ "type": "Adam",
+ "params": {
+ "lr": 0.00015,
+ "max_grad_norm": 1.0,
+ "betas": [0.9, 0.95]
+ }
+ },
+ "gradient_clipping": 1.0,
+ "fp16": {
+ "enabled": true,
+
+ "loss_scale": 0,
+ "loss_scale_window": 1000,
+ "hysteresis": 2,
+ "min_loss_scale": 1
+ },
+ "wall_clock_breakdown": true,
+ "zero_allow_untested_optimizer": false
+}
diff --git a/examples/ds_zero_stage_2_config.json → ...speed_configs/ds_zero_stage_2_config.json b/examples/ds_zero_stage_2_config.json → ...speed_configs/ds_zero_stage_2_config.json
diff --git a/examples/ds_pretrain_gpt2.sh b/examples/ds_pretrain_gpt2.sh
@@ -18,7 +18,7 @@ CHECKPOINT_PATH=checkpoints/gpt2_345m_ds
 
 script_path=$(realpath $0)
 script_dir=$(dirname $script_path)
-config_json="$script_dir/ds_zero_stage_2_config.json"
+config_json="configs/deepspeed_configs/ds_config.json"
 
 # Megatron Model Parallelism
 mp_size=4

diff --git a/examples/ds_pretrain_gpt2_2-7B_pipe.sh b/examples/ds_pretrain_gpt2_2-7B_pipe.sh
@@ -0,0 +1,156 @@
+#! /bin/bash
+# Builds the command-line flags for pretrain_gpt2.py and launches it through
+# the `deepspeed` command. Data, checkpoint and config paths below are
+# relative, so they resolve against the current working directory.
+
+GPUS_PER_NODE=8
+# Change for multinode config
+# NOTE(review): MASTER_ADDR, MASTER_PORT and NODE_RANK are assigned but neither
+# exported nor referenced again in this script. NODE_RANK=1 with NNODES=1 also
+# looks off by one if ranks are 0-based -- confirm.
+MASTER_ADDR=127.0.0.1
+MASTER_PORT=2000
+NNODES=1
+NODE_RANK=1
+
+# Exported copies of the node / GPU counts for child processes.
+export DLWS_NUM_WORKER=${NNODES}
+export DLWS_NUM_GPU_PER_WORKER=${GPUS_PER_NODE}
+
+# DATA OPTIONS:
+
+DATA_PATH=data/enron/enron_text_document # data/webtext/webtext_text_document
+VOCAB_PATH=data/gpt2-vocab.json
+MERGE_PATH=data/gpt2-merges.txt
+CHECKPOINT_PATH=checkpoints/gpt2_2-7B_ds
+
+# script_path / script_dir are computed but not referenced again in this
+# script; config_json is given relative to the current working directory.
+script_path=$(realpath $0)
+script_dir=$(dirname $script_path)
+# Exactly one config_json line should be active; the others are alternatives.
+#config_json="configs/deepspeed_configs/ds_zero_stage_2_config.json"
+config_json="configs/deepspeed_configs/ds_zero_stage_1_config_2-7B.json"
+#config_json="configs/deepspeed_configs/ds_config.json"
+
+# Training options:
+# Megatron Model Parallelism
+mp_size=1
+# DeepSpeed Pipeline parallelism
+pp_size=8
+# TOTAL BATCH SIZE = BATCHSIZE(pergpu) * GAS * N_GPUS
+# ensure batch size details are consistent between here and the deepspeed config
+BATCHSIZE=2
+GAS=32
+
+# ZeRO configs (forwarded as --zero-* flags further down)
+stage=1
+reduce_scatter=true
+contigious_gradients=true
+rbs=50000000
+agbs=5000000000
+
+# Activation checkpointing and contiguous memory
+# (variable and flag names keep the "contigious" spelling used by the flags)
+chkp_layers=1
+PA=true
+PA_CPU=true
+CC=true
+SYNCHRONIZE=true
+PROFILE=false
+
+# GPT options:
+NLAYERS=32
+NHIDDEN=2560
+NHEADS=32
+SEQLEN=1024
+LR="1.6e-4"
+MINLR="1.6e-5"
+WEIGHTDECAY=0
+DROPOUT=0
+SPARSITY='interspersed'
+TRAIN_ITERS=320000
+
+# TensorBoard log directory, named after the run's shape. It must be built
+# after NLAYERS and NHIDDEN are assigned; building it earlier expands them to
+# empty strings and yields a name like "tensorboard_data/l_h_1n_...".
+LOGDIR="tensorboard_data/${NLAYERS}l_${NHIDDEN}h_${NNODES}n_${GPUS_PER_NODE}g_${pp_size}pp_${mp_size}mp_${BATCHSIZE}b_ds4"
+
+# Model, data and optimisation flags for pretrain_gpt2.py. The trailing
+# backslashes join everything into a single line of space-separated flags.
+# NOTE(review): --max-position-embeddings is hard-coded to 1024 rather than
+# taken from SEQLEN, and --checkpoint-activations is added again in chkp_opt.
+gpt_options=" \
+ --model-parallel-size ${mp_size} \
+ --pipe-parallel-size ${pp_size} \
+ --num-layers $NLAYERS \
+ --hidden-size $NHIDDEN \
+ --num-attention-heads $NHEADS \
+ --seq-length $SEQLEN \
+ --max-position-embeddings 1024 \
+ --batch-size $BATCHSIZE \
+ --gas $GAS \
+ --train-iters $TRAIN_ITERS \
+ --lr-decay-iters $TRAIN_ITERS \
+ --save $CHECKPOINT_PATH \
+ --load $CHECKPOINT_PATH \
+ --data-path $DATA_PATH \
+ --vocab-file $VOCAB_PATH \
+ --merge-file $MERGE_PATH \
+ --data-impl mmap \
+ --split 949,50,1 \
+ --distributed-backend nccl \
+ --lr $LR \
+ --lr-decay-style cosine \
+ --min-lr $MINLR \
+ --weight-decay $WEIGHTDECAY \
+ --attention-dropout $DROPOUT \
+ --hidden-dropout $DROPOUT \
+ --clip-grad 1.0 \
+ --warmup 0.01 \
+ --checkpoint-activations \
+ --log-interval 1 \
+ --save-interval 500 \
+ --eval-interval 100 \
+ --eval-iters 10 \
+ --fp16 \
+ --tensorboard-dir ${LOGDIR} \
+ --sparsity $SPARSITY \
+ --sinusoidal-pos-emb
+"
+
+ # DeepSpeed flags: enable DeepSpeed, point it at the JSON config and forward
+ # the ZeRO stage and bucket sizes set earlier in this script.
+ # NOTE(review): rbs (50000000) and agbs (5000000000) differ from the bucket
+ # sizes in ds_zero_stage_1_config_2-7B.json (5000000 each) -- confirm which
+ # source takes precedence and whether the mismatch is intended.
+ deepspeed_options=" \
+ --deepspeed \
+ --deepspeed_config ${config_json} \
+ --zero-stage ${stage} \
+ --zero-reduce-bucket-size ${rbs} \
+ --zero-allgather-bucket-size ${agbs} 
+ "
+
+# Optional ZeRO flags: each one is appended only when its toggle is "true".
+if [[ "${contigious_gradients}" == "true" ]]; then
+  deepspeed_options+="  --zero-contigious-gradients"
+fi
+
+if [[ "${reduce_scatter}" == "true" ]]; then
+  deepspeed_options+="  --zero-reduce-scatter"
+fi
+
+# Activation-checkpointing flags: two are always present, the rest are
+# appended per toggle in the order PA, PA_CPU, SYNCHRONIZE, CC, PROFILE.
+chkp_opt=" --checkpoint-activations --checkpoint-num-layers ${chkp_layers}"
+
+if [[ "${PA}" == "true" ]]; then
+  chkp_opt+="  --partition-activations"
+fi
+
+if [[ "${PA_CPU}" == "true" ]]; then
+  chkp_opt+="  --checkpoint-in-cpu"
+fi
+
+if [[ "${SYNCHRONIZE}" == "true" ]]; then
+  chkp_opt+="  --synchronize-each-layer"
+fi
+
+if [[ "${CC}" == "true" ]]; then
+  chkp_opt+="  --contigious-checkpointing"
+fi
+
+if [[ "${PROFILE}" == "true" ]]; then
+  chkp_opt+="  --profile-backward"
+fi
+
+# Combine the three flag groups into one whitespace-separated string.
+full_options="${gpt_options} ${deepspeed_options} ${chkp_opt}"
+
+# Print, then launch, the training command. Caller arguments are forwarded
+# with "$@" so each one stays a single word; flattening them into a string
+# and re-parsing it with eval would split or re-interpret arguments that
+# contain spaces or shell metacharacters.
+# full_options is intentionally left unquoted so it splits into separate flags.
+# shellcheck disable=SC2086
+echo deepspeed pretrain_gpt2.py "$@" ${full_options}
+# shellcheck disable=SC2086
+deepspeed pretrain_gpt2.py "$@" ${full_options}
+
+# Turns command tracing off; this script never turns it on, so this only
+# matters if tracing was enabled by the caller (e.g. `bash -x`).
+set +x
diff --git a/examples/ds_pretrain_gpt2_XL_pipe.sh b/examples/ds_pretrain_gpt2_XL_pipe.sh
@@ -0,0 +1,155 @@
+#! /bin/bash
+# Builds the command-line flags for pretrain_gpt2.py and launches it through
+# the `deepspeed` command. Data, checkpoint and config paths below are
+# relative, so they resolve against the current working directory.
+
+GPUS_PER_NODE=8
+# Change for multinode config
+# NOTE(review): MASTER_ADDR, MASTER_PORT and NODE_RANK are assigned but neither
+# exported nor referenced again in this script. NODE_RANK=1 with NNODES=1 also
+# looks off by one if ranks are 0-based -- confirm.
+MASTER_ADDR=127.0.0.1
+MASTER_PORT=2000
+NNODES=1
+NODE_RANK=1
+
+# Exported copies of the node / GPU counts for child processes.
+export DLWS_NUM_WORKER=${NNODES}
+export DLWS_NUM_GPU_PER_WORKER=${GPUS_PER_NODE}
+
+# DATA OPTIONS:
+
+DATA_PATH=data/enron/enron_text_document # data/webtext/webtext_text_document
+VOCAB_PATH=data/gpt2-vocab.json
+MERGE_PATH=data/gpt2-merges.txt
+CHECKPOINT_PATH=checkpoints/gpt2_XL_ds
+
+# script_path / script_dir are computed but not referenced again in this
+# script; config_json is given relative to the current working directory.
+script_path=$(realpath $0)
+script_dir=$(dirname $script_path)
+# Exactly one config_json line should be active; the others are alternatives.
+#config_json="configs/deepspeed_configs/ds_zero_stage_2_config.json"
+config_json="configs/deepspeed_configs/ds_zero_stage_1_config.json"
+#config_json="configs/deepspeed_configs/ds_config.json"
+
+# Training options:
+# Megatron Model Parallelism
+mp_size=1
+# DeepSpeed Pipeline parallelism
+pp_size=2
+# TOTAL BATCH SIZE = BATCHSIZE(pergpu) * GAS * N_GPUS
+# ensure batch size details are consistent between here and the deepspeed config
+BATCHSIZE=4
+GAS=16
+
+# ZeRO configs (forwarded as --zero-* flags further down)
+stage=1
+reduce_scatter=true
+contigious_gradients=true
+rbs=50000000
+agbs=5000000000
+
+# Activation checkpointing and contiguous memory
+# (variable and flag names keep the "contigious" spelling used by the flags)
+chkp_layers=1
+PA=true
+PA_CPU=true
+CC=true
+SYNCHRONIZE=true
+PROFILE=false
+
+# GPT options:
+NLAYERS=24
+NHIDDEN=2048
+NHEADS=16
+SEQLEN=1024
+LR="2.0e-4"
+MINLR="2.0e-5"
+WEIGHTDECAY=0
+DROPOUT=0
+SPARSITY='interspersed'
+TRAIN_ITERS=320000
+
+# TensorBoard log directory, named after the run's shape. It must be built
+# after NLAYERS and NHIDDEN are assigned; building it earlier expands them to
+# empty strings and yields a name like "tensorboard_data/l_h_1n_...".
+LOGDIR="tensorboard_data/${NLAYERS}l_${NHIDDEN}h_${NNODES}n_${GPUS_PER_NODE}g_${pp_size}pp_${mp_size}mp_${BATCHSIZE}b_ds4"
+
+# Model, data and optimisation flags for pretrain_gpt2.py. The trailing
+# backslashes join everything into a single line of space-separated flags.
+# NOTE(review): --max-position-embeddings is hard-coded to 1024 rather than
+# taken from SEQLEN, and --checkpoint-activations is added again in chkp_opt.
+gpt_options=" \
+ --model-parallel-size ${mp_size} \
+ --pipe-parallel-size ${pp_size} \
+ --num-layers $NLAYERS \
+ --hidden-size $NHIDDEN \
+ --num-attention-heads $NHEADS \
+ --seq-length $SEQLEN \
+ --max-position-embeddings 1024 \
+ --batch-size $BATCHSIZE \
+ --gas $GAS \
+ --train-iters $TRAIN_ITERS \
+ --lr-decay-iters $TRAIN_ITERS \
+ --save $CHECKPOINT_PATH \
+ --load $CHECKPOINT_PATH \
+ --data-path $DATA_PATH \
+ --vocab-file $VOCAB_PATH \
+ --merge-file $MERGE_PATH \
+ --data-impl mmap \
+ --split 949,50,1 \
+ --distributed-backend nccl \
+ --lr $LR \
+ --lr-decay-style cosine \
+ --min-lr $MINLR \
+ --weight-decay $WEIGHTDECAY \
+ --attention-dropout $DROPOUT \
+ --hidden-dropout $DROPOUT \
+ --clip-grad 1.0 \
+ --warmup 0.01 \
+ --checkpoint-activations \
+ --log-interval 1 \
+ --save-interval 500 \
+ --eval-interval 100 \
+ --eval-iters 10 \
+ --fp16 \
+ --tensorboard-dir ${LOGDIR} \
+ --sparsity $SPARSITY
+"
+
+ # DeepSpeed flags: enable DeepSpeed, point it at the JSON config and forward
+ # the ZeRO stage and bucket sizes set earlier in this script.
+ # NOTE(review): agbs (5000000000) differs from allgather_bucket_size in
+ # ds_zero_stage_1_config.json (50000000) -- confirm which source takes
+ # precedence and whether the mismatch is intended.
+ deepspeed_options=" \
+ --deepspeed \
+ --deepspeed_config ${config_json} \
+ --zero-stage ${stage} \
+ --zero-reduce-bucket-size ${rbs} \
+ --zero-allgather-bucket-size ${agbs} 
+ "
+
+# Optional ZeRO flags: each one is appended only when its toggle is "true".
+if [[ "${contigious_gradients}" == "true" ]]; then
+  deepspeed_options+="  --zero-contigious-gradients"
+fi
+
+if [[ "${reduce_scatter}" == "true" ]]; then
+  deepspeed_options+="  --zero-reduce-scatter"
+fi
+
+# Activation-checkpointing flags: two are always present, the rest are
+# appended per toggle in the order PA, PA_CPU, SYNCHRONIZE, CC, PROFILE.
+chkp_opt=" --checkpoint-activations --checkpoint-num-layers ${chkp_layers}"
+
+if [[ "${PA}" == "true" ]]; then
+  chkp_opt+="  --partition-activations"
+fi
+
+if [[ "${PA_CPU}" == "true" ]]; then
+  chkp_opt+="  --checkpoint-in-cpu"
+fi
+
+if [[ "${SYNCHRONIZE}" == "true" ]]; then
+  chkp_opt+="  --synchronize-each-layer"
+fi
+
+if [[ "${CC}" == "true" ]]; then
+  chkp_opt+="  --contigious-checkpointing"
+fi
+
+if [[ "${PROFILE}" == "true" ]]; then
+  chkp_opt+="  --profile-backward"
+fi
+
+# Combine the three flag groups into one whitespace-separated string.
+full_options="${gpt_options} ${deepspeed_options} ${chkp_opt}"
+
+# Print, then launch, the training command. Caller arguments are forwarded
+# with "$@" so each one stays a single word; flattening them into a string
+# and re-parsing it with eval would split or re-interpret arguments that
+# contain spaces or shell metacharacters.
+# full_options is intentionally left unquoted so it splits into separate flags.
+# shellcheck disable=SC2086
+echo deepspeed pretrain_gpt2.py "$@" ${full_options}
+# shellcheck disable=SC2086
+deepspeed pretrain_gpt2.py "$@" ${full_options}
+
+# Turns command tracing off; this script never turns it on, so this only
+# matters if tracing was enabled by the caller (e.g. `bash -x`).
+set +x