Merge pull request #22 from EleutherAI/add_configs
add small -> 2-7B train scripts
StellaAthena committed Feb 10, 2021
2 parents 069daa1 + c1bef76 commit 4f8e185
Showing 12 changed files with 713 additions and 43 deletions.
File renamed without changes.
@@ -2,6 +2,14 @@
   "train_batch_size": 240,
   "train_micro_batch_size_per_gpu": 4,
   "steps_per_print": 10,
+  "zero_optimization": {
+    "stage": 1,
+    "allgather_partitions": true,
+    "reduce_scatter": true,
+    "allgather_bucket_size": 50000000,
+    "reduce_bucket_size": 50000000,
+    "overlap_comm": true
+  },
   "optimizer": {
     "type": "Adam",
     "params": {
@@ -19,13 +27,6 @@
     "hysteresis": 2,
     "min_loss_scale": 1
   },
-  "flops_profiler": {
-    "enabled": true,
-    "start_step": 5,
-    "end_step": 10,
-    "module_depth": -1,
-    "top_modules": 3
-  },
   "wall_clock_breakdown": true,
   "zero_allow_untested_optimizer": false
 }
32 changes: 32 additions & 0 deletions configs/deepspeed_configs/ds_zero_stage_1_config_2-7B.json
@@ -0,0 +1,32 @@
{
  "train_batch_size": 240,
  "train_micro_batch_size_per_gpu": 2,
  "steps_per_print": 10,
  "zero_optimization": {
    "stage": 1,
    "allgather_partitions": true,
    "reduce_scatter": true,
    "allgather_bucket_size": 5000000,
    "reduce_bucket_size": 5000000,
    "overlap_comm": true
  },
  "optimizer": {
    "type": "Adam",
    "params": {
      "lr": 0.00015,
      "max_grad_norm": 1.0,
      "betas": [0.9, 0.95]
    }
  },
  "gradient_clipping": 1.0,
  "fp16": {
    "enabled": true,

    "loss_scale": 0,
    "loss_scale_window": 1000,
    "hysteresis": 2,
    "min_loss_scale": 1
  },
  "wall_clock_breakdown": true,
  "zero_allow_untested_optimizer": false
}
File renamed without changes.
2 changes: 1 addition & 1 deletion examples/ds_pretrain_gpt2.sh
@@ -18,7 +18,7 @@ CHECKPOINT_PATH=checkpoints/gpt2_345m_ds

 script_path=$(realpath $0)
 script_dir=$(dirname $script_path)
-config_json="$script_dir/ds_zero_stage_2_config.json"
+config_json="configs/deepspeed_configs/ds_config.json"

 # Megatron Model Parallelism
 mp_size=4
156 changes: 156 additions & 0 deletions examples/ds_pretrain_gpt2_2-7B_pipe.sh
@@ -0,0 +1,156 @@
#! /bin/bash

GPUS_PER_NODE=8
# Change for multinode config
MASTER_ADDR=127.0.0.1
MASTER_PORT=2000
NNODES=1
NODE_RANK=1

export DLWS_NUM_WORKER=${NNODES}
export DLWS_NUM_GPU_PER_WORKER=${GPUS_PER_NODE}

# DATA OPTIONS:

DATA_PATH=data/enron/enron_text_document # data/webtext/webtext_text_document
VOCAB_PATH=data/gpt2-vocab.json
MERGE_PATH=data/gpt2-merges.txt
CHECKPOINT_PATH=checkpoints/gpt2_2-7B_ds

script_path=$(realpath $0)
script_dir=$(dirname $script_path)
#config_json="configs/deepspeed_configs/ds_zero_stage_2_config.json"
config_json="configs/deepspeed_configs/ds_zero_stage_1_config_2-7B.json"
#config_json="configs/deepspeed_configs/ds_config.json"

# Training options:
# Megatron Model Parallelism
mp_size=1
# DeepSpeed Pipeline parallelism
pp_size=8
# TOTAL BATCH SIZE = BATCHSIZE(pergpu) * GAS * N_GPUS
# ensure batch size details are consistent between here and the deepspeed config
BATCHSIZE=2
GAS=32
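# Worked out with the formula above: 2 (per GPU) * 32 (GAS) * 8 (GPUs) = 512 samples per step;
# the DeepSpeed config selected above sets train_batch_size to 240, so reconcile the two if they
# are meant to match.
# Note: LOGDIR below uses NLAYERS and NHIDDEN, which are only assigned further down, so those
# fields expand empty unless the assignments are moved above this line.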
LOGDIR="tensorboard_data/${NLAYERS}l_${NHIDDEN}h_${NNODES}n_${GPUS_PER_NODE}g_${pp_size}pp_${mp_size}mp_${BATCHSIZE}b_ds4"

# ZeRO configs
stage=1                    # passed below as --zero-stage
reduce_scatter=true        # enables --zero-reduce-scatter below
contigious_gradients=true  # enables --zero-contigious-gradients below
rbs=50000000               # --zero-reduce-bucket-size
agbs=5000000000            # --zero-allgather-bucket-size

# Activation checkpointing and contiguous memory options
chkp_layers=1       # --checkpoint-num-layers
PA=true             # --partition-activations
PA_CPU=true         # --checkpoint-in-cpu
CC=true             # --contigious-checkpointing
SYNCHRONIZE=true    # --synchronize-each-layer
PROFILE=false       # --profile-backward

# GPT options:
NLAYERS=32
NHIDDEN=2560
NHEADS=32
SEQLEN=1024
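# Rough parameter count for these dimensions: 12 * NLAYERS * NHIDDEN^2 = 12 * 32 * 2560^2
# ≈ 2.5B transformer parameters, roughly 2.7B including embeddings (hence the 2-7B name).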
LR="1.6e-4"
MINLR="1.6e-5"
WEIGHTDECAY=0
DROPOUT=0
SPARSITY='interspersed'
TRAIN_ITERS=320000

gpt_options=" \
--model-parallel-size ${mp_size} \
--pipe-parallel-size ${pp_size} \
--num-layers $NLAYERS \
--hidden-size $NHIDDEN \
--num-attention-heads $NHEADS \
--seq-length $SEQLEN \
--max-position-embeddings 1024 \
--batch-size $BATCHSIZE \
--gas $GAS \
--train-iters $TRAIN_ITERS \
--lr-decay-iters $TRAIN_ITERS \
--save $CHECKPOINT_PATH \
--load $CHECKPOINT_PATH \
--data-path $DATA_PATH \
--vocab-file $VOCAB_PATH \
--merge-file $MERGE_PATH \
--data-impl mmap \
--split 949,50,1 \
--distributed-backend nccl \
--lr $LR \
--lr-decay-style cosine \
--min-lr $MINLR \
--weight-decay $WEIGHTDECAY \
--attention-dropout $DROPOUT \
--hidden-dropout $DROPOUT \
--clip-grad 1.0 \
--warmup 0.01 \
--checkpoint-activations \
--log-interval 1 \
--save-interval 500 \
--eval-interval 100 \
--eval-iters 10 \
--fp16 \
--tensorboard-dir ${LOGDIR} \
--sparsity $SPARSITY \
--sinusoidal-pos-emb
"

deepspeed_options=" \
--deepspeed \
--deepspeed_config ${config_json} \
--zero-stage ${stage} \
--zero-reduce-bucket-size ${rbs} \
--zero-allgather-bucket-size ${agbs}
"

if [ "${contigious_gradients}" = "true" ]; then
deepspeed_options="${deepspeed_options} \
--zero-contigious-gradients"
fi

if [ "${reduce_scatter}" = "true" ]; then
deepspeed_options="${deepspeed_options} \
--zero-reduce-scatter"
fi

chkp_opt=" \
--checkpoint-activations \
--checkpoint-num-layers ${chkp_layers}"

if [ "${PA}" = "true" ]; then
chkp_opt="${chkp_opt} \
--partition-activations"
fi

if [ "${PA_CPU}" = "true" ]; then
chkp_opt="${chkp_opt} \
--checkpoint-in-cpu"
fi

if [ "${SYNCHRONIZE}" = "true" ]; then
chkp_opt="${chkp_opt} \
--synchronize-each-layer"
fi

if [ "${CC}" = "true" ]; then
chkp_opt="${chkp_opt} \
--contigious-checkpointing"
fi

if [ "${PROFILE}" = "true" ]; then
chkp_opt="${chkp_opt} \
--profile-backward"
fi

full_options="${gpt_options} ${deepspeed_options} ${chkp_opt}"

run_cmd="deepspeed pretrain_gpt2.py $@ ${full_options}"
echo ${run_cmd}
eval ${run_cmd}

set +x
155 changes: 155 additions & 0 deletions examples/ds_pretrain_gpt2_XL_pipe.sh
@@ -0,0 +1,155 @@
#! /bin/bash

GPUS_PER_NODE=8
# Change for multinode config
MASTER_ADDR=127.0.0.1
MASTER_PORT=2000
NNODES=1
NODE_RANK=1

export DLWS_NUM_WORKER=${NNODES}
export DLWS_NUM_GPU_PER_WORKER=${GPUS_PER_NODE}

# DATA OPTIONS:

DATA_PATH=data/enron/enron_text_document # data/webtext/webtext_text_document
VOCAB_PATH=data/gpt2-vocab.json
MERGE_PATH=data/gpt2-merges.txt
CHECKPOINT_PATH=checkpoints/gpt2_XL_ds

script_path=$(realpath $0)
script_dir=$(dirname $script_path)
#config_json="configs/deepspeed_configs/ds_zero_stage_2_config.json"
config_json="configs/deepspeed_configs/ds_zero_stage_1_config.json"
#config_json="configs/deepspeed_configs/ds_config.json"

# Training options:
# Megatron Model Parallelism
mp_size=1
# DeepSpeed Pipeline parallelism
pp_size=2
# TOTAL BATCH SIZE = BATCHSIZE(pergpu) * GAS * N_GPUS
# ensure batch size details are consistent between here and the deepspeed config
BATCHSIZE=4
GAS=16
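# Worked out with the formula above: 4 (per GPU) * 16 (GAS) * 8 (GPUs) = 512 samples per step;
# make sure train_batch_size in the DeepSpeed config selected above matches this product.
# Note: LOGDIR below uses NLAYERS and NHIDDEN, which are only assigned further down, so those
# fields expand empty unless the assignments are moved above this line.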
LOGDIR="tensorboard_data/${NLAYERS}l_${NHIDDEN}h_${NNODES}n_${GPUS_PER_NODE}g_${pp_size}pp_${mp_size}mp_${BATCHSIZE}b_ds4"

# ZeRO configs
stage=1                    # passed below as --zero-stage
reduce_scatter=true        # enables --zero-reduce-scatter below
contigious_gradients=true  # enables --zero-contigious-gradients below
rbs=50000000               # --zero-reduce-bucket-size
agbs=5000000000            # --zero-allgather-bucket-size

# Activation checkpointing and contiguous memory options
chkp_layers=1       # --checkpoint-num-layers
PA=true             # --partition-activations
PA_CPU=true         # --checkpoint-in-cpu
CC=true             # --contigious-checkpointing
SYNCHRONIZE=true    # --synchronize-each-layer
PROFILE=false       # --profile-backward

# GPT options:
NLAYERS=24
NHIDDEN=2048
NHEADS=16
SEQLEN=1024
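# Rough parameter count for these dimensions: 12 * NLAYERS * NHIDDEN^2 = 12 * 24 * 2048^2
# ≈ 1.2B transformer parameters, roughly 1.3B including embeddings.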
LR="2.0e-4"
MINLR="2.0e-5"
WEIGHTDECAY=0
DROPOUT=0
SPARSITY='interspersed'
TRAIN_ITERS=320000

gpt_options=" \
--model-parallel-size ${mp_size} \
--pipe-parallel-size ${pp_size} \
--num-layers $NLAYERS \
--hidden-size $NHIDDEN \
--num-attention-heads $NHEADS \
--seq-length $SEQLEN \
--max-position-embeddings 1024 \
--batch-size $BATCHSIZE \
--gas $GAS \
--train-iters $TRAIN_ITERS \
--lr-decay-iters $TRAIN_ITERS \
--save $CHECKPOINT_PATH \
--load $CHECKPOINT_PATH \
--data-path $DATA_PATH \
--vocab-file $VOCAB_PATH \
--merge-file $MERGE_PATH \
--data-impl mmap \
--split 949,50,1 \
--distributed-backend nccl \
--lr $LR \
--lr-decay-style cosine \
--min-lr $MINLR \
--weight-decay $WEIGHTDECAY \
--attention-dropout $DROPOUT \
--hidden-dropout $DROPOUT \
--clip-grad 1.0 \
--warmup 0.01 \
--checkpoint-activations \
--log-interval 1 \
--save-interval 500 \
--eval-interval 100 \
--eval-iters 10 \
--fp16 \
--tensorboard-dir ${LOGDIR} \
--sparsity $SPARSITY
"

deepspeed_options=" \
--deepspeed \
--deepspeed_config ${config_json} \
--zero-stage ${stage} \
--zero-reduce-bucket-size ${rbs} \
--zero-allgather-bucket-size ${agbs}
"

if [ "${contigious_gradients}" = "true" ]; then
deepspeed_options="${deepspeed_options} \
--zero-contigious-gradients"
fi

if [ "${reduce_scatter}" = "true" ]; then
deepspeed_options="${deepspeed_options} \
--zero-reduce-scatter"
fi

chkp_opt=" \
--checkpoint-activations \
--checkpoint-num-layers ${chkp_layers}"

if [ "${PA}" = "true" ]; then
chkp_opt="${chkp_opt} \
--partition-activations"
fi

if [ "${PA_CPU}" = "true" ]; then
chkp_opt="${chkp_opt} \
--checkpoint-in-cpu"
fi

if [ "${SYNCHRONIZE}" = "true" ]; then
chkp_opt="${chkp_opt} \
--synchronize-each-layer"
fi

if [ "${CC}" = "true" ]; then
chkp_opt="${chkp_opt} \
--contigious-checkpointing"
fi

if [ "${PROFILE}" = "true" ]; then
chkp_opt="${chkp_opt} \
--profile-backward"
fi

full_options="${gpt_options} ${deepspeed_options} ${chkp_opt}"

run_cmd="deepspeed pretrain_gpt2.py $@ ${full_options}"
echo ${run_cmd}
eval ${run_cmd}

set +x