Commit 152aab3: Readme update + change gpt2 to gpt

Mohammad Shoeybi authored and jaredcasper committed Jan 12, 2021
1 parent f5eac3d commit 152aab3
Showing 22 changed files with 244 additions and 195 deletions.
225 changes: 101 additions & 124 deletions README.md

Large diffs are not rendered by default.

File renamed without changes.
4 changes: 2 additions & 2 deletions examples/finetune_mnli_distributed.sh
@@ -28,11 +28,11 @@ python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/main.py \
        --num-layers 24 \
        --hidden-size 1024 \
        --num-attention-heads 16 \
-       --batch-size 8 \
+       --micro-batch-size 8 \
        --checkpoint-activations \
        --lr 5.0e-5 \
        --lr-decay-style linear \
-       --warmup 0.065 \
+       --lr-warmup-fraction 0.065 \
        --seq-length 512 \
        --max-position-embeddings 512 \
        --save-interval 500000 \
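A note on the renamed flag: --lr-warmup-fraction makes the unit explicit, giving warmup as a fraction of the learning-rate decay schedule rather than a bare --warmup value. A minimal sketch of the arithmetic (illustrative only, not Megatron source), assuming warmup iterations are the fraction times the decay iterations, which matches how the old flag was applied:

    # Hypothetical helper, not from the repository.
    def warmup_iters(lr_warmup_fraction: float, lr_decay_iters: int) -> int:
        """Iterations spent linearly warming the learning rate up to --lr."""
        return int(lr_warmup_fraction * lr_decay_iters)

    # The MNLI script above uses 0.065; with, say, a 20,000-iteration decay
    # window that gives 1,300 warmup iterations.
    print(warmup_iters(0.065, 20_000))  # 1300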
4 changes: 2 additions & 2 deletions examples/finetune_race_distributed.sh
@@ -28,11 +28,11 @@ python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/main.py \
        --num-layers 24 \
        --hidden-size 1024 \
        --num-attention-heads 16 \
-       --batch-size 4 \
+       --micro-batch-size 4 \
        --checkpoint-activations \
        --lr 1.0e-5 \
        --lr-decay-style linear \
-       --warmup 0.06 \
+       --lr-warmup-fraction 0.06 \
        --seq-length 512 \
        --max-position-embeddings 512 \
        --save-interval 100000 \
8 changes: 4 additions & 4 deletions examples/pretrain_bert.sh
@@ -9,24 +9,24 @@ python pretrain_bert.py \
        --num-layers 24 \
        --hidden-size 1024 \
        --num-attention-heads 16 \
-       --batch-size 4 \
+       --micro-batch-size 4 \
+       --global-batch-size 8 \
        --seq-length 512 \
        --max-position-embeddings 512 \
        --train-iters 2000000 \
-       --lr-decay-iters 990000 \
        --save $CHECKPOINT_PATH \
        --load $CHECKPOINT_PATH \
        --data-path $DATA_PATH \
        --vocab-file bert-vocab.txt \
        --data-impl mmap \
        --split 949,50,1 \
        --distributed-backend nccl \
        --lr 0.0001 \
        --min-lr 0.00001 \
        --lr-decay-style linear \
+       --lr-decay-iters 990000 \
+       --lr-warmup-fraction .01 \
        --weight-decay 1e-2 \
        --clip-grad 1.0 \
-       --warmup .01 \
        --log-interval 100 \
        --save-interval 10000 \
        --eval-interval 1000 \
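This hunk shows the commit's central change to batching: the per-GPU --batch-size is renamed --micro-batch-size, and the new --global-batch-size pins the effective optimization batch, with gradient accumulation making up the difference. A minimal sketch of the assumed relationship (illustrative, not Megatron source):

    # Hypothetical helper, not from the repository.
    def grad_accum_steps(global_batch: int, micro_batch: int,
                         data_parallel_size: int) -> int:
        """Micro-batches each data-parallel rank accumulates per optimizer step."""
        assert global_batch % (micro_batch * data_parallel_size) == 0
        return global_batch // (micro_batch * data_parallel_size)

    # pretrain_bert.sh above: micro-batch 4, global batch 8. On two
    # data-parallel GPUs each rank runs one micro-batch per step.
    print(grad_accum_steps(8, 4, 2))  # 1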
6 changes: 3 additions & 3 deletions examples/pretrain_bert_distributed.sh
@@ -15,11 +15,11 @@ DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $

 python -m torch.distributed.launch $DISTRIBUTED_ARGS \
        pretrain_bert.py \
-       --tensor-model-parallel-size 1 \
        --num-layers 24 \
        --hidden-size 1024 \
        --num-attention-heads 16 \
-       --batch-size 4 \
+       --micro-batch-size 4 \
+       --global-batch-size 32 \
        --seq-length 512 \
        --max-position-embeddings 512 \
        --train-iters 1000000 \
@@ -36,7 +36,7 @@ python -m torch.distributed.launch $DISTRIBUTED_ARGS \
        --lr-decay-iters 990000 \
        --weight-decay 1e-2 \
        --clip-grad 1.0 \
-       --warmup .01 \
+       --lr-warmup-fraction .01 \
        --log-interval 100 \
        --save-interval 10000 \
        --eval-interval 1000 \
6 changes: 3 additions & 3 deletions examples/pretrain_bert_distributed_with_mp.sh
@@ -20,8 +20,8 @@ python -m torch.distributed.launch $DISTRIBUTED_ARGS \
        --num-layers 24 \
        --hidden-size 1024 \
        --num-attention-heads 16 \
-       --batch-size 2 \
-       --num-microbatches-in-minibatch 2 \
+       --micro-batch-size 2 \
+       --global-batch-size 16 \
        --seq-length 512 \
        --max-position-embeddings 512 \
        --train-iters 1000000 \
@@ -38,7 +38,7 @@ python -m torch.distributed.launch $DISTRIBUTED_ARGS \
        --lr-decay-iters 990000 \
        --weight-decay 1e-2 \
        --clip-grad 1.0 \
-       --warmup .01 \
+       --lr-warmup-fraction .01 \
        --log-interval 100 \
        --save-interval 10000 \
        --eval-interval 1000 \
7 changes: 4 additions & 3 deletions examples/pretrain_gpt2.sh → examples/pretrain_gpt.sh
@@ -9,11 +9,12 @@ DATA_PATH=<Specify path and file prefix>_text_document
 CHECKPOINT_PATH=<Specify path>


-python pretrain_gpt2.py \
+python pretrain_gpt.py \
        --num-layers 24 \
        --hidden-size 1024 \
        --num-attention-heads 16 \
-       --batch-size 8 \
+       --micro-batch-size 4 \
+       --global-batch-size 8 \
        --seq-length 1024 \
        --max-position-embeddings 1024 \
        --train-iters 500000 \
@@ -31,7 +32,7 @@ python pretrain_gpt2.py \
        --lr-decay-style cosine \
        --weight-decay 1e-2 \
        --clip-grad 1.0 \
-       --warmup .01 \
+       --lr-warmup-fraction .01 \
        --checkpoint-activations \
        --log-interval 100 \
        --save-interval 10000 \
64 changes: 64 additions & 0 deletions examples/pretrain_gpt3_175B.sh
@@ -0,0 +1,64 @@
#!/bin/bash


#SBATCH <SLURM OPTIONS> --nodes=128 --exclusive --ntasks-per-node=8 --job-name=megatron_gpt3_175b


DIR=`pwd`
DATETIME=`date +'date_%y-%m-%d_time_%H-%M-%S'`
mkdir -p $DIR/logs


DATASET_1="<PATH TO THE FIRST DATASET>"
DATASET_2="<PATH TO THE SECOND DATASET>"
DATASET_3="<PATH TO THE THIRD DATASET>"
DATASET="0.2 ${DATASET_1} 0.3 ${DATASET_2} 0.5 ${DATASET_3}"


options=" \
--tensor-model-parallel-size 8 \
--pipeline-model-parallel-size 16 \
--num-layers 96 \
--hidden-size 12288 \
--num-attention-heads 96 \
--seq-length 2048 \
--max-position-embeddings 2048 \
--micro-batch-size 1 \
--global-batch-size 1536 \
--rampup-batch-size 16 16 5859375 \
--train-samples 146484375 \
--lr-decay-samples 126953125 \
--lr-warmup-samples 183105 \
--lr 6.0e-5 \
--min-lr 6.0e-6 \
--lr-decay-style cosine \
--log-interval 10 \
--eval-iters 40 \
--eval-interval 1000 \
--data-path ${DATASET} \
--vocab-file <PATH TO gpt-vocab.json> \
--merge-file <PATH TO gpt-merges.txt> \
--save-interval 1000 \
--save <PATH TO CHECKPOINTS DIRECTORY> \
--load <PATH TO CHECKPOINTS DIRECTORY> \
--split 98,2,0 \
--clip-grad 1.0 \
--weight-decay 0.1 \
--adam-beta1 0.9 \
--adam-beta2 0.95 \
--tensorboard-dir <TENSORBOARD DIRECTORY> \
--fp16 \
--checkpoint-activations "


run_cmd="python -u ${DIR}/pretrain_gpt.py $@ ${options}"


srun -l \
--container-image "nvcr.io/nvidia/pytorch:20.12-py3" \
--container-mounts "<DIRECTORIES TO MOUNT>" \
--output=$DIR/logs/%x_%j_$DATETIME.log sh -c "${run_cmd}"


set +x
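The new script is the repository's first GPT-3-scale example: 96 layers at hidden size 12288, split across 8-way tensor and 16-way pipeline parallelism (128 GPUs per model replica), where --rampup-batch-size 16 16 5859375 reads as a start batch of 16, increments of 16, ramped over 5,859,375 samples up to the 1536-sample global batch. A rough parameter count from those numbers as a sanity check (an assumed approximation, not code from the repository):

    # Rule-of-thumb transformer size: ~12*hidden^2 per layer (attention 4h^2
    # + MLP 8h^2), plus token and position embeddings. Hypothetical helper.
    def approx_params(layers: int, hidden: int, vocab: int, seq: int) -> int:
        per_layer = 12 * hidden * hidden
        embeddings = vocab * hidden + seq * hidden
        return layers * per_layer + embeddings

    # 96 layers, hidden 12288, ~50k GPT-2 BPE vocab, 2048 positions:
    print(f"{approx_params(96, 12288, 50257, 2048) / 1e9:.0f}B")  # ~175B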

examples/pretrain_gpt2_distributed.sh → examples/pretrain_gpt_distributed.sh
@@ -16,12 +16,12 @@ CHECKPOINT_PATH=<Specify path>
 DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT"

 python -m torch.distributed.launch $DISTRIBUTED_ARGS \
-       pretrain_gpt2.py \
-       --tensor-model-parallel-size 1 \
+       pretrain_gpt.py \
        --num-layers 24 \
        --hidden-size 1024 \
        --num-attention-heads 16 \
-       --batch-size 8 \
+       --micro-batch-size 8 \
+       --global-batch-size 64 \
        --seq-length 1024 \
        --max-position-embeddings 1024 \
        --train-iters 500000 \
@@ -39,7 +39,7 @@ python -m torch.distributed.launch $DISTRIBUTED_ARGS \
        --min-lr 1.0e-5 \
        --weight-decay 1e-2 \
        --clip-grad 1.0 \
-       --warmup .01 \
+       --lr-warmup-fraction .01 \
        --checkpoint-activations \
        --log-interval 100 \
        --save-interval 10000 \
examples/pretrain_gpt2_distributed_with_mp.sh → examples/pretrain_gpt_distributed_with_mp.sh
@@ -16,14 +16,14 @@ CHECKPOINT_PATH=<Specify path>
 DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT"

 python -m torch.distributed.launch $DISTRIBUTED_ARGS \
-       pretrain_gpt2.py \
+       pretrain_gpt.py \
        --tensor-model-parallel-size 2 \
        --pipeline-model-parallel-size 2 \
        --num-layers 24 \
        --hidden-size 1024 \
        --num-attention-heads 16 \
-       --batch-size 4 \
-       --num-microbatches-in-minibatch 2 \
+       --micro-batch-size 4 \
+       --global-batch-size 16 \
        --seq-length 1024 \
        --max-position-embeddings 1024 \
        --train-iters 500000 \
@@ -41,7 +41,7 @@ python -m torch.distributed.launch $DISTRIBUTED_ARGS \
        --min-lr 1.0e-5 \
        --weight-decay 1e-2 \
        --clip-grad 1.0 \
-       --warmup .01 \
+       --lr-warmup-fraction .01 \
        --checkpoint-activations \
        --log-interval 100 \
        --save-interval 10000 \
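For this script, 2-way tensor times 2-way pipeline parallelism leaves the remaining GPUs as data-parallel replicas, and together with micro-batch 4 and global batch 16 that fixes the gradient-accumulation depth. A short worked example, assuming the usual single-node, 8-GPU launch for this script (an assumption; the node count is set outside the hunks shown):

    # Hypothetical worked example for the with-mp script above.
    world_size = 8                       # assumed: 1 node x 8 GPUs
    tensor_mp, pipeline_mp = 2, 2        # --tensor/pipeline-model-parallel-size
    micro_batch, global_batch = 4, 16    # --micro-batch-size / --global-batch-size

    data_parallel = world_size // (tensor_mp * pipeline_mp)  # 2 replicas
    accum = global_batch // (micro_batch * data_parallel)    # 2 micro-batches/step
    print(data_parallel, accum)  # 2 2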
Binary file removed images/cases.png
Binary file added images/cases_jan2021.png
Binary file removed images/scaling-dp.png
Binary file removed images/scaling-mp.png
Binary file added images/scaling.png
megatron/data/gpt2_dataset.py → megatron/data/gpt_dataset.py
@@ -13,7 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-"""GPT2 style dataset."""
+"""GPT style dataset."""

 import os
 import time
@@ -107,7 +107,7 @@ def build_dataset(index, name):
         if splits[index + 1] > splits[index]:
             documents = np.arange(start=splits[index], stop=splits[index + 1],
                                   step=1, dtype=np.int32)
-            dataset = GPT2Dataset(name, data_prefix,
+            dataset = GPTDataset(name, data_prefix,
                                  documents, indexed_dataset,
                                  train_valid_test_num_samples[index],
                                  seq_length, seed)
@@ -136,7 +136,7 @@ def get_indexed_dataset_(data_prefix, data_impl, skip_warmup):
     return indexed_dataset


-class GPT2Dataset(torch.utils.data.Dataset):
+class GPTDataset(torch.utils.data.Dataset):

     def __init__(self, name, data_prefix, documents, indexed_dataset,
                  num_samples, seq_length, seed):
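Context for the hunk above: build_dataset slices one contiguous range of document indices per split, so weights like the scripts' --split 949,50,1 must first become index boundaries. A minimal sketch of that normalization (a simplified assumption, not Megatron's actual helper):

    # Hypothetical helper, not from the repository.
    def split_boundaries(weights, num_documents):
        """Turn split weights like [949, 50, 1] into document-index bounds."""
        total = sum(weights)
        bounds, acc = [0], 0
        for w in weights:
            acc += w
            bounds.append(round(num_documents * acc / total))
        return bounds

    # 10,000 documents with --split 949,50,1 -> train/valid/test ranges.
    print(split_boundaries([949, 50, 1], 10_000))  # [0, 9490, 9990, 10000]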
9 changes: 5 additions & 4 deletions megatron/model/__init__.py
@@ -34,10 +34,11 @@ def import_layernorm(fp32_residual_connection):
                          BertModelFirstStage,
                          BertModelIntermediateStage,
                          BertModelLastStage)
-from .gpt2_model import (GPT2Model,
-                         GPT2ModelFirstStage,
-                         GPT2ModelIntermediateStage,
-                         GPT2ModelLastStage)
-from .realm_model import ICTBertModel
+from .gpt_model import (GPTModel,
+                        GPTModelFirstStage,
+                        GPTModelIntermediateStage,
+                        GPTModelLastStage)
 from .language_model import get_language_model
 from .module import FP16Module
+from .realm_model import ICTBertModel
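The renamed per-stage classes exist because, under pipeline parallelism, each rank builds only its slice of the network: the first stage consumes the token inputs, the last stage produces the outputs, and intermediate stages hold only transformer layers. A minimal sketch of how a caller might pick a class by pipeline rank (illustrative only; the real constructors and Megatron's selection logic differ):

    # Hypothetical dispatch, not Megatron source.
    from megatron.model import (GPTModel, GPTModelFirstStage,
                                GPTModelIntermediateStage, GPTModelLastStage)

    def model_class_for_rank(pipeline_rank: int, pipeline_size: int):
        if pipeline_size == 1:
            return GPTModel                 # no pipelining: the full network
        if pipeline_rank == 0:
            return GPTModelFirstStage       # embeds tokens, sends activations on
        if pipeline_rank == pipeline_size - 1:
            return GPTModelLastStage        # receives activations, emits outputs
        return GPTModelIntermediateStage    # transformer layers only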