Multitask finetuning #676

Closed. Wants to merge 146 commits.

Commits (146)
8789450
first commit, adding non causal mlm dataset
lintangsutawika Jun 8, 2022
646578d
Added prefix_len to get_batch
lintangsutawika Jun 8, 2022
8e726cb
changes
lintangsutawika Jun 13, 2022
25dec8e
added enums.py
lintangsutawika Jun 13, 2022
b7d6783
resolved merge conflicts
lintangsutawika Jun 13, 2022
035ae8b
added prefix attention calculation to utils
lintangsutawika Jun 14, 2022
a1c14a7
removed enums
lintangsutawika Jun 14, 2022
7d95765
added config for mlm
lintangsutawika Jun 14, 2022
b528969
added new args for mlm and added mlm dataset to data_utils
lintangsutawika Jun 14, 2022
c3830f7
added tokenizer as an arg
lintangsutawika Jun 14, 2022
9e3458c
remove get_tokenizer
lintangsutawika Jun 14, 2022
d1fabea
prefix_len to prefix
lintangsutawika Jun 14, 2022
9059902
added NonCausalMLMDataset to build_train_valid_test_datasets
lintangsutawika Jun 14, 2022
cd22016
fixed typo in conditional
lintangsutawika Jun 14, 2022
6d4be58
forgot conditional
lintangsutawika Jun 14, 2022
b729c0b
change mpu functions from Megatron to GPTNeoX
lintangsutawika Jun 14, 2022
dca68d0
tokenizer
lintangsutawika Jun 14, 2022
0be06f1
removed unused args
lintangsutawika Jun 14, 2022
51bcc46
removed bos_id
lintangsutawika Jun 14, 2022
c4bb330
added prefix key in get_batch_pipe
lintangsutawika Jun 14, 2022
835a0f7
set attention_mask to have batch size same as input batch if using pr…
lintangsutawika Jun 14, 2022
b37f5d9
fixed the sampling
lintangsutawika Jun 14, 2022
6be1b67
fixed the sampling arg
lintangsutawika Jun 14, 2022
86d0bd0
minor fix
lintangsutawika Jun 14, 2022
b43f35c
fixed typo
lintangsutawika Jun 14, 2022
d23d333
edit mlm dataset to not need input-seq-length and just rely on the or…
lintangsutawika Jun 23, 2022
964fe83
[cleanup] remove args in buld_train_valid_test_datasets
haileyschoelkopf Jul 13, 2022
7676b53
store seed in gpt2dataset class
haileyschoelkopf Jul 14, 2022
46fccb7
rename: NonCausalMLMDataset -> MLMDataset
haileyschoelkopf Jul 14, 2022
fd658e3
partial first pass: add GPT2Dataset dependency
haileyschoelkopf Jul 14, 2022
47869f7
change neox args, get up to error on CE loss calc
haileyschoelkopf Jul 14, 2022
b094982
initial MTFDataset commit
haileyschoelkopf Jul 14, 2022
33fc379
implement get_batch for MLM. loss goes down!
haileyschoelkopf Jul 14, 2022
97c9bcb
add extra-sentinel-tokens argument
haileyschoelkopf Jul 14, 2022
d3e76df
cleanup todos and set correct sentinel ids in dataset
haileyschoelkopf Jul 14, 2022
5282def
modify mlm.yml
haileyschoelkopf Jul 14, 2022
1b610e0
initial commit: MTFDataset class from Meg-DS
haileyschoelkopf Jul 15, 2022
064edf0
update MLMdataset return value
haileyschoelkopf Jul 25, 2022
587be1a
add decoder packed dataset class + refactor
haileyschoelkopf Jul 26, 2022
498231b
add train_mtf flag and refactor get_batch
haileyschoelkopf Jul 26, 2022
96af4cc
update attn mask creation
haileyschoelkopf Jul 26, 2022
8e7a2c2
use training_objective flag correctly
haileyschoelkopf Jul 26, 2022
9b9e313
Merge pull request #1 from lintangsutawika/hailey_add_packing
haileyschoelkopf Jul 29, 2022
3a03c20
update sample config w/ new neox args
haileyschoelkopf Jul 29, 2022
7231777
add pad() property to tokenizer classes
haileyschoelkopf Jul 29, 2022
bb60d49
cleanup + allow for packed MTFDataset to be selected
haileyschoelkopf Jul 29, 2022
68c27b3
full packed attn mask + pos ids impl.
haileyschoelkopf Aug 3, 2022
6a862b2
update datasets + add up-to-date mtf utils
haileyschoelkopf Aug 4, 2022
c725c38
Merge pull request #2: Add packed attention calculations + Multi-task…
haileyschoelkopf Aug 4, 2022
494b870
make sure code is all synced
haileyschoelkopf Aug 4, 2022
9a1047c
Update MLM w/ most recent version of code
haileyschoelkopf Aug 4, 2022
8e53bfd
preliminary p3 dataloading
haileyschoelkopf Aug 5, 2022
f592f25
change FIM config name
haileyschoelkopf Aug 5, 2022
d85af0d
resolve some todos + cleanup attn masking a bit
haileyschoelkopf Aug 5, 2022
ee7ab4d
change the way p3 is processed
haileyschoelkopf Aug 5, 2022
00146e4
log how many sentinel vs. dummy toks are added
haileyschoelkopf Aug 5, 2022
ed4e01c
clean up MTFDataset file
haileyschoelkopf Aug 7, 2022
7596e65
move MTF data code into single file
haileyschoelkopf Aug 7, 2022
fa1273e
delete temp_utils file; change imports in MTF file
haileyschoelkopf Aug 7, 2022
6011f32
emulate seqio max mixing rate
haileyschoelkopf Aug 7, 2022
b0ed699
update requirements to EleutherAI/lm_dataformat fork
haileyschoelkopf Aug 10, 2022
24c8a6a
update p3 download + preproc; allow for separate train+valid datasets
haileyschoelkopf Aug 10, 2022
e61196d
update datapaths on sample config
haileyschoelkopf Aug 10, 2022
d4229a3
Configure P3 download + dataloading
haileyschoelkopf Aug 10, 2022
d7838a7
Merge pull request #630 from lintangsutawika/mlm_and_mtf_adaptation
haileyschoelkopf Aug 21, 2022
694d7a3
initial t5 packing attempt
haileyschoelkopf Sep 13, 2022
93df634
added SuperGLUE DataDownloader object
Sep 13, 2022
f4c54b6
download all subsets and extract only the train.jsonl file
Sep 13, 2022
9be62e7
added preprocess for all sglue datasets
Sep 14, 2022
52aac11
moved preprocessing to tools/sglue_utils.py
Sep 15, 2022
4f0c158
added zip processing
Sep 15, 2022
5963e3c
corpora.py can load and process at the same time
Sep 16, 2022
6590c19
revert to main
Sep 16, 2022
61550f6
super_glue process both text and target
Sep 16, 2022
f0fc9ee
merged MTF with SGLUE implementation
Sep 18, 2022
f8d63f0
changed dataset selection process and added new dataset object
Sep 18, 2022
afdbcb7
changed field names
Sep 19, 2022
8a3b722
merged with tt-packing
Sep 20, 2022
686b444
solve merge conflict
Sep 20, 2022
92d5adf
changed fields
Sep 20, 2022
cb08955
added superglue yml
Sep 20, 2022
a6fceb6
test T5MTF packed
Sep 20, 2022
27a13d8
forgot comma
Sep 20, 2022
a64b362
removed duplicate line
Sep 20, 2022
eb6ba00
removed duplicate line
Sep 20, 2022
989c25b
forgot to mention merge-file
Sep 20, 2022
f6f710b
fixed data loading in prepare_data
Sep 20, 2022
83cb51a
temp
Sep 20, 2022
0be327a
removed print
Sep 20, 2022
db2bb1e
changes to enable data process with fields other than text
lintangsutawika Sep 20, 2022
13db037
removed text field
lintangsutawika Sep 20, 2022
b072007
can process arbitrary key from a jsonl file
lintangsutawika Sep 20, 2022
edba60e
runs process for SGLUE
lintangsutawika Sep 20, 2022
0a4c524
minor fix on targets for wsc
lintangsutawika Sep 20, 2022
b694c4b
comment line for now
lintangsutawika Sep 20, 2022
9682a24
import t5_mtf_dataset
lintangsutawika Sep 20, 2022
ed73ec9
fix extra_sentinel_tokens issue
lintangsutawika Sep 20, 2022
75d6097
removed line
lintangsutawika Sep 20, 2022
383de82
adjustments for mtf
lintangsutawika Sep 20, 2022
21cce69
changed tokenizer
lintangsutawika Sep 20, 2022
8a7c1d1
tok_len includes input_tokens only
lintangsutawika Sep 20, 2022
42d4859
fixed typos, tokenizer pad set to 0
lintangsutawika Sep 20, 2022
231a069
removed print
lintangsutawika Sep 20, 2022
049a27b
position_ids and added neox_args to get_ltor_masks_and_position_ids
lintangsutawika Sep 20, 2022
7220768
for training
lintangsutawika Sep 20, 2022
59ca374
renamed dataset class
lintangsutawika Sep 21, 2022
bf8c384
yml split to two
lintangsutawika Sep 21, 2022
0613f68
updated with latest changes in improved-t5-2.0
lintangsutawika Sep 21, 2022
42cd8fa
enable t5 packing and non-packing versions
lintangsutawika Sep 21, 2022
26968b7
adapted to run both packed and non-packed versions
lintangsutawika Sep 21, 2022
caf0d3d
differentiate between make_segment_mask and get_full_mask
lintangsutawika Sep 21, 2022
2873d40
differentiate between make_segment_mask and get_full_mask
lintangsutawika Sep 21, 2022
58c4f10
typo, wsc should be wic in the wic prompt function
lintangsutawika Sep 23, 2022
a810f6c
commit modifications from stability cluster
haileyschoelkopf Sep 23, 2022
afa8caa
moved few configs args
lintangsutawika Sep 23, 2022
9506f7f
changed train_mtf to packing
lintangsutawika Sep 23, 2022
93ba059
changed train_mtf to packing
lintangsutawika Sep 23, 2022
5752e00
fix attention_mask
lintangsutawika Sep 23, 2022
a16bcda
minor adjustment
lintangsutawika Sep 23, 2022
007d6d0
Merge branch 'multitask-finetuning' into stability_multitask
lintangsutawika Sep 23, 2022
f29538e
Merge pull request #5 from lintangsutawika/stability_multitask
lintangsutawika Sep 23, 2022
93e13f9
added new args to use for get_ltor_masks_and_position_ids
lintangsutawika Sep 23, 2022
d6ba631
update configs
lintangsutawika Sep 23, 2022
72f0e27
changes in making loss_mask
lintangsutawika Sep 23, 2022
0d00539
changed in line 57
lintangsutawika Sep 25, 2022
216eb3f
both conditions return the same args
lintangsutawika Sep 25, 2022
870ec5b
edit _build_index_mappings
lintangsutawika Sep 25, 2022
bdaca21
closer implementation of packing to tensor2tensor library
lintangsutawika Sep 25, 2022
a408186
combined indexes converted to string to enable mmap
lintangsutawika Sep 25, 2022
b26b4c7
combined indexes converted to string to enable mmap
lintangsutawika Sep 25, 2022
351cffe
skip inputs that are longer than seq_length
lintangsutawika Sep 25, 2022
452bd3a
removed unused variables, fixed wrong logic
lintangsutawika Sep 25, 2022
faf64ce
added break in the loop
lintangsutawika Sep 25, 2022
58d9cf8
re-added commented line
lintangsutawika Sep 25, 2022
2a3d5bc
forgot to add a start token, so offsetting target tokens by 1
lintangsutawika Sep 25, 2022
6a246b9
fixed label and token_dec shifting
lintangsutawika Sep 26, 2022
bee300a
include eod in loss
lintangsutawika Sep 26, 2022
1023968
added generate_samples_from_prompt
lintangsutawika Sep 28, 2022
13d31f3
edits for encdec generation
lintangsutawika Sep 28, 2022
5778e25
change position of bs calcs
lintangsutawika Sep 28, 2022
b12e079
remove stray lines
lintangsutawika Sep 28, 2022
f8faaf3
added set batch pipe based on arch
lintangsutawika Sep 28, 2022
b9c730e
pipe_batch_fn in setup_model_and_optimizer
lintangsutawika Sep 28, 2022
718faf8
remove unused line
lintangsutawika Sep 28, 2022
a5a79a0
temp hack for seq2seq generation
lintangsutawika Sep 28, 2022
bc605e7
Update training.py
lintangsutawika Oct 4, 2022
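
Several of the commits above ("added prefix attention calculation to utils", "set attention_mask to have batch size same as input batch if using pr…") add a prefix-LM style attention mask. The helper below is a minimal sketch of that idea, not the PR's implementation: positions inside the prefix attend bidirectionally, while the remaining positions attend causally. The function name and shapes are assumptions for illustration.

```python
import torch

def prefix_lm_attention_mask(seq_len: int, prefix_len: int) -> torch.Tensor:
    """Hypothetical sketch: build a [seq_len, seq_len] boolean mask where True
    means "may attend". Prefix tokens see each other bidirectionally; tokens
    after the prefix attend causally."""
    # Start from a standard lower-triangular (causal) mask.
    mask = torch.tril(torch.ones(seq_len, seq_len)).bool()
    # Allow full (bidirectional) attention among the first `prefix_len` tokens.
    mask[:prefix_len, :prefix_len] = True
    return mask

# Example: an 8-token sequence with a 3-token prefix.
print(prefix_lm_attention_mask(8, 3).int())
```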
110 changes: 110 additions & 0 deletions configs/FIM-160M.yml
@@ -0,0 +1,110 @@
# GPT-2 pretraining setup
{
# parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages
# across the node boundaries )
"pipe-parallel-size": 1,
"model-parallel-size": 1,

# model settings
"num-layers": 12,
"hidden-size": 768,
"num-attention-heads": 12,
"seq-length": 2048,
"max-position-embeddings": 2048,
"norm": "layernorm",
"pos-emb": "rotary",
"no-weight-tying": true,

# these should provide some speedup but takes a while to build, set to true if desired
"scaled-upper-triang-masked-softmax-fusion": false,
"bias-gelu-fusion": false,


# optimizer settings
"optimizer": {
"type": "Adam",
"params": {
"lr": 0.0006,
"betas": [0.9, 0.999],
"eps": 1.0e-8,
}
},
"zero_optimization": {
"stage": 0,
"allgather_partitions": True,
"allgather_bucket_size": 500000000,
"overlap_comm": True,
"reduce_scatter": True,
"reduce_bucket_size": 500000000,
"contiguous_gradients": True,
"cpu_offload": False
},

# batch / data settings
"train_micro_batch_size_per_gpu": 4,
"gradient_accumulation_steps": 8,
"data-impl": "mmap",
"split": "995,4,1",

# activation checkpointing
"checkpoint-activations": true,
"checkpoint-num-layers": 1,
"partition-activations": true,
"synchronize-each-layer": true,

# regularization
"gradient_clipping": 1.0,
"weight-decay": 0.01,
"hidden-dropout": 0.0,
"attention-dropout": 0.0,

# precision settings
"fp16": {
"fp16": true,
"enabled": true,
"loss_scale": 0,
"loss_scale_window": 1000,
"initial_scale_power": 12,
"hysteresis": 2,
"min_loss_scale": 1
},

# misc. training settings
"train-iters": 320000,
"lr-decay-iters": 320000,
"distributed-backend": "nccl",
"lr-decay-style": "cosine",
"warmup": 0.01,
"save-interval": 10000,
"eval-interval": 1000,
"eval-iters": 10,

# logging
"log-interval": 100,
"steps_per_print": 10,
"keep-last-n-checkpoints": 4,
"wall_clock_breakdown": true,


# Tokenizer / checkpoint settings - you will need to change these to the location you have them saved in
"vocab-file": "/fsx/pile/20B_tokenizer.json",
# "merge-file": "./20b_checkpoints/merges.txt",
"save": "./mlm_125m_checkpoints",
"load": "./mlm_125m_checkpoints",

# If finetuning, edit the following to the location of your finetuning dataset:
"data-path": "/fsx/pile/pile_20B_tokenizer_text_document",

"extra-sentinel-tokens": 100,
"training-objective": "mlm",
# "train_mtf": True,
# "use_prefix_attention": True,

### NEW DATA: ####
"tokenizer_type": "HFTokenizer",
"tensorboard-dir": "./tensorboard",
"log-dir": "./logs",

"hostfile": "./hostfile",
"launcher": "OpenMPI",
}
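
The `"extra-sentinel-tokens": 100` and `"training-objective": "mlm"` settings in this config point at T5-style span corruption: masked spans in the input are replaced by sentinel tokens, and the target spells out each sentinel followed by the span it replaced. Below is a rough, self-contained sketch of that transformation, not the code in this PR; the sentinel id scheme, helper name, and span-sampling details are assumptions.

```python
import random
from typing import List, Tuple

def span_corrupt(tokens: List[int], vocab_size: int, num_sentinels: int = 100,
                 mask_prob: float = 0.15, mean_span: int = 3,
                 seed: int = 0) -> Tuple[List[int], List[int]]:
    """Sketch of T5-style span corruption. Sentinel ids are assumed to sit at
    the top of the vocabulary: vocab_size - 1, vocab_size - 2, ..."""
    rng = random.Random(seed)
    inputs, targets = [], []
    sentinel = 0
    i = 0
    while i < len(tokens):
        # Start a new masked span with probability mask_prob / mean_span, so
        # roughly mask_prob of all tokens end up corrupted on average.
        if sentinel < num_sentinels and rng.random() < mask_prob / mean_span:
            span = max(1, int(rng.expovariate(1.0 / mean_span)))
            sentinel_id = vocab_size - 1 - sentinel
            inputs.append(sentinel_id)            # sentinel replaces the span
            targets.append(sentinel_id)           # target repeats the sentinel...
            targets.extend(tokens[i:i + span])    # ...followed by the hidden span
            sentinel += 1
            i += span
        else:
            inputs.append(tokens[i])
            i += 1
    return inputs, targets

# Example: corrupt a toy sequence with a 1000-token vocabulary.
inp, tgt = span_corrupt(list(range(20)), vocab_size=1000)
print(inp)
print(tgt)
```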
39 changes: 39 additions & 0 deletions configs/finetune-sglue.yml
@@ -0,0 +1,39 @@
# Suggested data paths when finetuning T5 on Super Glue locally
{

"finetune": True,
"packing": True,
"data-path": "data/super_glue/super_glue",

"tokenizer-type": "HFTokenizer",
"vocab-file": "data/tokenizer.json",

#"save": "data/improved-t5-test",
#"load": "data/improved-t5-test",
"load": "ckpts/pretrain",
"save": "ckpts/sglue",

# batch / data settings
"train_micro_batch_size_per_gpu": 8,
"gradient-accumulation-steps": 1,
"data-impl": "mmap",
"split": "949,50,1",

# misc. training settings
"train-iters": 5000,
"lr-decay-iters": 5000,
"distributed-backend": "nccl",
"lr-decay-style": "cosine",
"warmup": 0.01,
"save-interval": 1000,
"eval-interval": 5001,
"eval-iters": 10,

"tensorboard-dir": "/tmp/improved-t5/tensorboard",
"log-dir": "/tmp/improved-t5/logs",
# "use_wandb": True,
# "wandb_group": "T5-770M-9-3-22-testppl",
# "wandb_team": "eleutherai",
# "wandb_project": "improved-t5",

}
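
`"packing": True` in this config corresponds to the commits that pack several (input, target) examples into one fixed-length sequence and skip examples longer than the sequence length ("skip inputs that are longer than seq_length"). A simplified greedy packer in that spirit might look like the following; the exact data layout in the PR differs, so treat the field names here as assumptions.

```python
from typing import Dict, Iterable, List

def pack_examples(examples: Iterable[Dict[str, List[int]]],
                  seq_length: int) -> List[List[Dict[str, List[int]]]]:
    """Greedy packing sketch: append examples to the current pack until the
    next one would overflow seq_length, then start a new pack. Examples that
    are individually longer than seq_length are skipped."""
    packs, current, used = [], [], 0
    for ex in examples:
        length = len(ex["input_tokens"]) + len(ex["target_tokens"])
        if length > seq_length:
            continue  # skip over-length examples, as in the commit notes
        if used + length > seq_length:
            packs.append(current)
            current, used = [], 0
        current.append(ex)
        used += length
    if current:
        packs.append(current)
    return packs

# Example: pack toy examples into sequences of length 8.
examples = [{"input_tokens": [1, 2, 3], "target_tokens": [4]},
            {"input_tokens": [5, 6], "target_tokens": [7, 8]},
            {"input_tokens": [9] * 12, "target_tokens": [10]}]  # skipped: too long
print(pack_examples(examples, seq_length=8))
```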
106 changes: 106 additions & 0 deletions configs/mlm.yml
@@ -0,0 +1,106 @@
# GPT-2 pretraining setup
{
# parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages
# across the node boundaries )
"pipe-parallel-size": 1,
"model-parallel-size": 1,

# model settings
"num-layers": 12,
"hidden-size": 768,
"num-attention-heads": 12,
"seq-length": 2048,
"max-position-embeddings": 2048,
"norm": "layernorm",
"pos-emb": "rotary",
"no-weight-tying": true,

# these should provide some speedup but takes a while to build, set to true if desired
"scaled-upper-triang-masked-softmax-fusion": false,
"bias-gelu-fusion": false,


# optimizer settings
"optimizer": {
"type": "Adam",
"params": {
"lr": 0.0006,
"betas": [0.9, 0.999],
"eps": 1.0e-8,
}
},
"zero_optimization": {
"stage": 0,
"allgather_partitions": True,
"allgather_bucket_size": 500000000,
"overlap_comm": True,
"reduce_scatter": True,
"reduce_bucket_size": 500000000,
"contiguous_gradients": True,
"cpu_offload": False
},

# batch / data settings
"train_micro_batch_size_per_gpu": 4,
"data-impl": "mmap",
"split": "949,50,1",

# activation checkpointing
"checkpoint-activations": true,
"checkpoint-num-layers": 1,
"partition-activations": true,
"synchronize-each-layer": true,

# regularization
"gradient_clipping": 1.0,
"weight-decay": 0.0,
"hidden-dropout": 0.0,
"attention-dropout": 0.0,

# precision settings
"fp16": {
"enabled": true,
"type": "bfloat16", # set bf16 as precision
"loss_scale": 0,
"loss_scale_window": 1000,
"hysteresis": 2,
"min_loss_scale": 1
},

"fp32_allreduce": True, # without a patch to torch, bf16 models have to do the allreduce in fp32
# misc. training settings
"train-iters": 320000,
"lr-decay-iters": 320000,
"distributed-backend": "nccl",
"lr-decay-style": "cosine",
"warmup": 0.01,
"save-interval": 10000,
"eval-interval": 1000,
"eval-iters": 10,

# logging
"log-interval": 100,
"steps_per_print": 10,
"keep-last-n-checkpoints": 4,
"wall_clock_breakdown": true,


# Tokenizer / checkpoint settings - you will need to change these to the location you have them saved in
"vocab-file": "./data/20b_checkpoints/125m-tokenizer/",
# "merge-file": "./20b_checkpoints/merges.txt",
"save": "./data/20b_checkpoints",
"load": "./data/20b_checkpoints",

# If finetuning, edit the following to the location of your finetuning dataset:
"data-path": "./data/enron/enron_text_document",

"extra-sentinel-tokens": 100,
"train-mlm": True,
# "use_prefix_attention": True,
"seq-length": 1024,

### NEW DATA: ####
"tokenizer_type": "HFGPT2Tokenizer",
"tensorboard-dir": "./tensorboard",
"log-dir": "./logs",
}
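
Both MLM configs above rely on `"extra-sentinel-tokens"`, and the commit history adds a `pad()` property to the tokenizer classes. One plausible way to wire that up with a HuggingFace tokenizer is sketched below; the token strings and the decision to place sentinels at the end of the vocabulary are assumptions for illustration, not the PR's tokenizer code.

```python
from transformers import AutoTokenizer

# Hypothetical setup: extend a pretrained tokenizer with 100 sentinel tokens
# (<extra_id_0> ... <extra_id_99>, mirroring T5's naming) plus a pad token.
tokenizer = AutoTokenizer.from_pretrained("gpt2")
sentinels = [f"<extra_id_{i}>" for i in range(100)]
tokenizer.add_special_tokens({
    "additional_special_tokens": sentinels,
    "pad_token": "<pad>",
})

# The sentinel and pad ids now occupy the top of the vocabulary.
print(len(tokenizer), tokenizer.convert_tokens_to_ids("<extra_id_0>"))
print(tokenizer.pad_token_id)
```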
30 changes: 30 additions & 0 deletions configs/small_bf16.yml
@@ -83,4 +83,34 @@
"steps_per_print": 10,
"keep-last-n-checkpoints": 4,
"wall_clock_breakdown": true,


# Tokenizer / checkpoint settings - you will need to change these to the location you have them saved in
"vocab-file": "./data/20b_checkpoints/125m-tokenizer/",
# "merge-file": "./20b_checkpoints/merges.txt",
"save": "./data/20b_checkpoints",
"load": "./data/20b_checkpoints",

# If finetuning, edit the following to the location of your finetuning dataset:
# "data-path": "./data/p3/p3",
"train-data-paths": ["./data/p3/p3"],
"valid-data-paths": ["./data/p3_valid/p3_valid"],
"test-data-paths": ["./data/p3_valid/p3_valid"],

"train-data-weights": [1],
"valid-data-weights": [1],
"test-data-weights": [1],


# "extra-sentinel-tokens": 100,
# "training-objective": "prefixlm",
"train_mtf": True,
"loss_on_targets_only": True,
# "use_prefix_attention": True,
"seq-length": 1024,

### NEW DATA: ####
"tokenizer_type": "HFGPT2Tokenizer",
"tensorboard-dir": "./tensorboard",
"log-dir": "./logs",
}
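
`"train_mtf": True` together with `"loss_on_targets_only": True` above implies a loss mask that covers only the target portion of each example. A minimal sketch of that masking, assuming the batch carries per-token target flags (the tensor names are not the PR's field names), is:

```python
import torch

def targets_only_loss_mask(is_target: torch.Tensor,
                           tokens: torch.Tensor,
                           pad_id: int) -> torch.Tensor:
    """Sketch: keep loss on target tokens, zero it on prompt/input tokens and
    on padding. `is_target` is a [batch, seq] 0/1 tensor flagging target
    positions."""
    loss_mask = is_target.float()
    loss_mask = loss_mask * (tokens != pad_id).float()  # never train on padding
    return loss_mask

# Example: one 6-token sequence, last 3 tokens are targets, last token is padding (id 0).
tokens = torch.tensor([[5, 6, 7, 8, 9, 0]])
is_target = torch.tensor([[0, 0, 0, 1, 1, 1]])
print(targets_only_loss_mask(is_target, tokens, pad_id=0))
```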
87 changes: 87 additions & 0 deletions configs/t5-base.yml
@@ -0,0 +1,87 @@
# example config for an encoder-decoder model
# (not optimized, just containing all relevant neox args)
{
"pipe-parallel-size": 1,
"model-parallel-size": 1,

# model settings
"model-arch": "t5",
"num-encoder-layers": 12,
"num-layers": 12,
"hidden-size": 768,
"num-attention-heads": 12,
"seq-length": 512,
"decoder-seq-length": 114,
"max-position-embeddings": 626,
"norm": "layernorm",
"pos-emb": "rotary",
"rotary-pct": 0.25,
"activation": "geglu",
"no-weight-tying": true,
"gpt-j-residual": false,
"output-layer-parallelism": "column",

"init_method": "small_init",
"output_layer_init_method": "wang_init",

"extra-sentinel-tokens": 100,
# "masked-lm-prob": 0.15,
# "mean_noise_span_length": 3,

# fusion ops (STILL UNTESTED WITH T5)
"scaled-upper-triang-masked-softmax-fusion": false,
# can we do upper triang fusion for certain layers only? would that be a speedup?
"bias-gelu-fusion": false,

# optimizer settings
"optimizer": {
"type": "Adam",
"params": {
"lr": 0.0001,
"betas": [0.9, 0.999],
"eps": 1.0e-8,
}
},
"min_lr": 0.0001,

"zero_optimization": {
"stage": 1,
"allgather_partitions": True,
"allgather_bucket_size": 500000000,
"overlap_comm": True,
"reduce_scatter": True,
"reduce_bucket_size": 500000000,
"contiguous_gradients": True,
"cpu_offload": False
},

# activation checkpointing
"checkpoint-activations": false,
"checkpoint-num-layers": 1,
"partition-activations": false,
"synchronize-each-layer": true,

# regularization
"gradient_clipping": 1.0,
"weight-decay": 0.01,
"hidden-dropout": 0.0,
"attention-dropout": 0.0,

# precision settings
"fp16": {
"enabled": true,
"loss_scale": 0,
"loss_scale_window": 1000,
"initial_scale_power": 12,
"hysteresis": 2,
"min_loss_scale": 1
},

# logging
"log-interval": 100,
"steps_per_print": 10,
"wall_clock_breakdown": true,

# "launcher": "openmpi",
# "deepspeed_mpi": true,
}
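
The commit "differentiate between make_segment_mask and get_full_mask" suggests a block-diagonal attention mask built from segment ids, so that packed examples cannot attend across example boundaries in this encoder-decoder setup. A hedged sketch of such a mask, with assumed tensor names and the convention that segment id 0 means padding, is:

```python
import torch

def make_segment_mask(query_segment_ids: torch.Tensor,
                      key_segment_ids: torch.Tensor) -> torch.Tensor:
    """Sketch: [batch, q_len, k_len] boolean mask that is True only where the
    query and key tokens belong to the same packed example (same segment id)
    and neither position is padding."""
    same_segment = query_segment_ids.unsqueeze(-1) == key_segment_ids.unsqueeze(-2)
    not_padding = (query_segment_ids.unsqueeze(-1) != 0) & (key_segment_ids.unsqueeze(-2) != 0)
    return same_segment & not_padding

# Example: two packed examples (segments 1 and 2) followed by padding (0).
seg = torch.tensor([[1, 1, 2, 2, 0]])
print(make_segment_mask(seg, seg).int())
```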