start

Jarvis-LLM · May 18, 2023 · b0b561f · b0b561f
commit b0b561f
Show file tree

Hide file tree

Showing 321 changed files with 37,664 additions and 0 deletions.
diff --git a/configs/1-3B.yml b/configs/1-3B.yml
@@ -0,0 +1,91 @@
+# GPT-2 pretraining setup
+{
+ # parallelism settings (you will want to change these based on your cluster setup, ideally scheduling pipeline stages
+ # across node boundaries)
+ "pipe-parallel-size": 1,
+ "model-parallel-size": 1,
+
+ # model settings
+ "num-layers": 24,
+ "hidden-size": 2048,
+ "num-attention-heads": 16,
+ "seq-length": 2048,
+ "max-position-embeddings": 2048,
+ "norm": "layernorm",
+ "pos-emb": "rotary",
+ "no-weight-tying": true,
+ "gpt_j_residual": false,
+ "output_layer_parallelism": "column",
+
+ # these should provide some speedup but take a while to build; set to true if desired
+ "scaled-upper-triang-masked-softmax-fusion": false,
+ "bias-gelu-fusion": false,
+
+ # init methods
+ "init_method": "small_init",
+ "output_layer_init_method": "wang_init",
+
+ # optimizer settings
+ "optimizer": {
+ "type": "Adam",
+ "params": {
+ "lr": 0.0002,
+ "betas": [0.9, 0.95],
+ "eps": 1.0e-8,
+ }
+ },
+ "min_lr": 0.00002,
+
+ # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training
+ "zero_optimization": {
+ "stage": 1,
+ "allgather_partitions": True,
+ "allgather_bucket_size": 500000000,
+ "overlap_comm": True,
+ "reduce_scatter": True,
+ "reduce_bucket_size": 500000000,
+ "contiguous_gradients": True,
+ },
+
+ # batch / data settings
+ "train_micro_batch_size_per_gpu": 4,
+ "data-impl": "mmap",
+
+ # activation checkpointing
+ "checkpoint-activations": true,
+ "checkpoint-num-layers": 1,
+ "partition-activations": true,
+ "synchronize-each-layer": true,
+
+ # regularization
+ "gradient_clipping": 1.0,
+ "weight-decay": 0.1,
+ "hidden-dropout": 0,
+ "attention-dropout": 0,
+
+ # precision settings
+ "fp16": {
+ "fp16": true,
+ "enabled": true,
+ "loss_scale": 0,
+ "loss_scale_window": 1000,
+ "hysteresis": 2,
+ "min_loss_scale": 1
+ },
+
+ # misc. training settings
+ "train-iters": 320000,
+ "lr-decay-iters": 320000,
+ "distributed-backend": "nccl",
+ "lr-decay-style": "cosine",
+ "warmup": 0.01,
+ "checkpoint-factor": 10000,
+ "eval-interval": 1000,
+ "eval-iters": 10,
+
+ # logging
+ "log-interval": 100,
+ "steps_per_print": 10,
+ "keep-last-n-checkpoints": 4,
+ "wall_clock_breakdown": true,
+}
diff --git a/configs/125M-json.yml b/configs/125M-json.yml
@@ -0,0 +1,78 @@
+{
+ "pipe-parallel-size": 1,
+ "model-parallel-size": 1,
+
+ "num-layers": 12,
+ "hidden-size": 768,
+ "num-attention-heads": 12,
+ "seq-length": 2048,
+ "max-position-embeddings": 2048,
+ "norm": "layernorm",
+ "pos-emb": "rotary",
+ "no-weight-tying": true,
+ "gpt_j_residual": false,
+ "output_layer_parallelism": "column",
+
+ "scaled-upper-triang-masked-softmax-fusion": false,
+ "bias-gelu-fusion": false,
+
+ "init_method": "small_init",
+ "output_layer_init_method": "wang_init",
+
+ "optimizer": {
+ "type": "Adam",
+ "params": {
+ "lr": 0.0006,
+ "betas": [0.9, 0.95],
+ "eps": 1.0e-8
+ }
+ },
+ "min_lr": 0.00006,
+
+ "zero_optimization": {
+ "stage": 1,
+ "allgather_partitions": true,
+ "allgather_bucket_size": 500000000,
+ "overlap_comm": true,
+ "reduce_scatter": true,
+ "reduce_bucket_size": 500000000,
+ "contiguous_gradients": true
+ },
+
+ "train_micro_batch_size_per_gpu": 4,
+ "data-impl": "mmap",
+
+ "checkpoint-activations": true,
+ "checkpoint-num-layers": 1,
+ "partition-activations": true,
+ "synchronize-each-layer": true,
+
+ "gradient_clipping": 1.0,
+ "weight-decay": 0.1,
+ "hidden-dropout": 0.0,
+ "attention-dropout": 0.0,
+
+ "fp16": {
+ "enabled": true,
+ "loss_scale": 0,
+ "loss_scale_window": 1000,
+ "hysteresis": 2,
+ "min_loss_scale": 1
+ },
+
+ "train-iters": 320000,
+ "lr-decay-iters": 320000,
+ "distributed-backend": "nccl",
+ "lr-decay-style": "cosine",
+ "warmup": 0.01,
+ "checkpoint-factor": 10000,
+ "eval-interval": 1000,
+ "eval-iters": 10,
+
+ "log-interval": 100,
+ "steps_per_print": 10,
+ "keep-last-n-checkpoints": 4,
+ "wall_clock_breakdown": true,
+
+ "hostfile": "/mock_path"
+}
diff --git a/configs/125M.yml b/configs/125M.yml
@@ -0,0 +1,94 @@
+# GPT-2 pretraining setup
+{
+ # parallelism settings (you will want to change these based on your cluster setup, ideally scheduling pipeline stages
+ # across node boundaries)
+ "pipe-parallel-size": 1,
+ "model-parallel-size": 1,
+
+ # model settings
+ "num-layers": 12,
+ "hidden-size": 768,
+ "num-attention-heads": 12,
+ "seq-length": 2048,
+ "max-position-embeddings": 2048,
+ "norm": "layernorm",
+ "pos-emb": "rotary",
+ "no-weight-tying": true,
+ "gpt_j_residual": false,
+ "output_layer_parallelism": "column",
+
+ # these should provide some speedup but take a while to build; set to true if desired
+ "scaled-upper-triang-masked-softmax-fusion": false,
+ "bias-gelu-fusion": false,
+
+ # init methods
+ "init_method": "small_init",
+ "output_layer_init_method": "wang_init",
+
+
+ # optimizer settings
+ "optimizer": {
+ "type": "Adam",
+ "params": {
+ "lr": 0.0006,
+ "betas": [0.9, 0.95],
+ "eps": 1.0e-8,
+ }
+ },
+ "min_lr": 0.00006,
+
+ # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training
+ "zero_optimization": {
+ "stage": 1,
+ "allgather_partitions": True,
+ "allgather_bucket_size": 500000000,
+ "overlap_comm": True,
+ "reduce_scatter": True,
+ "reduce_bucket_size": 500000000,
+ "contiguous_gradients": True,
+ },
+
+ # batch / data settings
+ "train_micro_batch_size_per_gpu": 4,
+ "data-impl": "mmap",
+
+ # activation checkpointing
+ "checkpoint-activations": true,
+ "checkpoint-num-layers": 1,
+ "partition-activations": true,
+ "synchronize-each-layer": true,
+
+ # regularization
+ "gradient_clipping": 1.0,
+ "weight-decay": 0.1,
+ "hidden-dropout": 0.0,
+ "attention-dropout": 0.0,
+
+ # precision settings
+ "fp16": {
+ "enabled": true,
+ "loss_scale": 0,
+ "loss_scale_window": 1000,
+ "hysteresis": 2,
+ "min_loss_scale": 1
+ },
+
+ # misc. training settings
+ "train-iters": 320000,
+ "lr-decay-iters": 320000,
+ "distributed-backend": "nccl",
+ "lr-decay-style": "cosine",
+ "warmup": 0.01,
+ "checkpoint-factor": 10000,
+ "eval-interval": 1000,
+ "eval-iters": 10,
+
+ # logging
+ "log-interval": 100,
+ "steps_per_print": 10,
+ "keep-last-n-checkpoints": 4,
+ "wall_clock_breakdown": true,
+
+ # networking
+ "hostfile": "/mock_path"
+}
diff --git a/configs/13B.yml b/configs/13B.yml
@@ -0,0 +1,92 @@
+# GPT-2 pretraining setup
+{
+ # parallelism settings (you will want to change these based on your cluster setup, ideally scheduling pipeline stages
+ # across node boundaries)
+ "pipe-parallel-size": 1,
+ "model-parallel-size": 1,
+
+ # model settings
+ "num-layers": 40,
+ "hidden-size": 5120,
+ "num-attention-heads": 40,
+ "seq-length": 2048,
+ "max-position-embeddings": 2048,
+ "norm": "layernorm",
+ "pos-emb": "rotary",
+ "no-weight-tying": true,
+ "gpt_j_residual": false,
+ "output_layer_parallelism": "column",
+
+ # these should provide some speedup but take a while to build; set to true if desired
+ "scaled-upper-triang-masked-softmax-fusion": false,
+ "bias-gelu-fusion": false,
+
+ # init methods
+ "init_method": "small_init",
+ "output_layer_init_method": "wang_init",
+
+
+ # optimizer settings
+ "optimizer": {
+ "type": "Adam",
+ "params": {
+ "lr": 0.0001,
+ "betas": [0.9, 0.95],
+ "eps": 1.0e-8,
+ }
+ },
+
+ # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training
+ "zero_optimization": {
+ "stage": 1,
+ "allgather_partitions": True,
+ "allgather_bucket_size": 500000000,
+ "overlap_comm": True,
+ "reduce_scatter": True,
+ "reduce_bucket_size": 500000000,
+ "contiguous_gradients": True,
+ },
+ "min_lr": 0.00001,
+
+ # batch / data settings
+ "train_micro_batch_size_per_gpu": 4,
+ "data-impl": "mmap",
+
+ # activation checkpointing
+ "checkpoint-activations": true,
+ "checkpoint-num-layers": 1,
+ "partition-activations": true,
+ "synchronize-each-layer": true,
+
+ # regularization
+ "gradient_clipping": 1.0,
+ "weight-decay": 0.1,
+ "hidden-dropout": 0,
+ "attention-dropout": 0,
+
+ # precision settings
+ "fp16": {
+ "fp16": true,
+ "enabled": true,
+ "loss_scale": 0,
+ "loss_scale_window": 1000,
+ "hysteresis": 2,
+ "min_loss_scale": 1
+ },
+
+ # misc. training settings
+ "train-iters": 320000,
+ "lr-decay-iters": 320000,
+ "distributed-backend": "nccl",
+ "lr-decay-style": "cosine",
+ "warmup": 0.01,
+ "checkpoint-factor": 10000,
+ "eval-interval": 1000,
+ "eval-iters": 10,
+
+ # logging
+ "log-interval": 100,
+ "steps_per_print": 10,
+ "keep-last-n-checkpoints": 4,
+ "wall_clock_breakdown": true,
+}