Add MoE (#1129)
* Add DeepSpeed MoE

Thanks to dayofthepenguin for extensive testing

Closes #479

* Update NeoXArgs docs automatically

* pre-commit

* Update NeoXArgs docs automatically

---------

Co-authored-by: Yang Zhang <[email protected]>
Co-authored-by: github-actions <[email protected]>
Co-authored-by: Quentin Anthony <[email protected]>
4 people committed Mar 7, 2024
1 parent df8cf24 commit 86758c3
Showing 10 changed files with 434 additions and 31 deletions.
103 changes: 103 additions & 0 deletions configs/125M-moe.yml
@@ -0,0 +1,103 @@
# GPT-2 pretraining setup
{
# Use 4 experts per MoE layer (a MoE layer is inserted every 2 layers by default)
# So with 12 layers total:
# 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11
# Experts would be in layers:
# 0, 2, 4, 6, 8, 10
"num_experts": 4,

# parallelism settings (you will want to change these based on your cluster setup, ideally scheduling pipeline stages
# across node boundaries)
"pipe_parallel_size": 1,
"model_parallel_size": 1,
"moe_expert_parallel_size": 1,

# model settings
"num_layers": 12,
"hidden_size": 768,
"num_attention_heads": 12,
"seq_length": 2048,
"max_position_embeddings": 2048,
"norm": "layernorm",
"pos_emb": "rotary",
"no_weight_tying": true,
"gpt_j_residual": false,
"output_layer_parallelism": "column",

# these should provide some speedup but take a while to build; set to true if desired
"scaled_upper_triang_masked_softmax_fusion": false,
"bias_gelu_fusion": false,
"rope_fusion": false,

# init methods
"init_method": "small_init",
"output_layer_init_method": "wang_init",


# optimizer settings
"optimizer": {
"type": "Adam",
"params": {
"lr": 0.0006,
"betas": [0.9, 0.95],
"eps": 1.0e-8,
}
},
"min_lr": 0.00006,

# for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training
"zero_optimization": {
"stage": 1,
"allgather_partitions": True,
"allgather_bucket_size": 500000000,
"overlap_comm": True,
"reduce_scatter": True,
"reduce_bucket_size": 500000000,
"contiguous_gradients": True,
},

# batch / data settings
"train_micro_batch_size_per_gpu": 4,
"data_impl": "mmap",

# activation checkpointing
"checkpoint_activations": true,
"checkpoint_num_layers": 1,
"partition_activations": true,
"synchronize_each_layer": true,

# regularization
"gradient_clipping": 1.0,
"weight_decay": 0.1,
"hidden_dropout": 0.0,
"attention_dropout": 0.0,

# precision settings
"fp16": {
"enabled": true,
"loss_scale": 0,
"loss_scale_window": 1000,
"hysteresis": 2,
"min_loss_scale": 1
},

# misc. training settings
"train_iters": 320000,
"lr_decay_iters": 320000,
"distributed_backend": "nccl",
"lr_decay_style": "cosine",
"warmup": 0.01,
"checkpoint_factor": 10000,
"eval_interval": 1000,
"eval_iters": 10,

# logging
"log_interval": 10,
"steps_per_print": 10,
"keep_last_n_checkpoints": 4,
"wall_clock_breakdown": true,

# networking
"hostfile": "/mock_path"
}
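
As configured above, the new MoE arguments are left at their single-GPU-friendly values: with "moe_expert_parallel_size": 1, all 4 experts are replicated on every rank. On a multi-GPU run you would typically shard the experts instead; a minimal sketch of the overrides, assuming 4 GPUs and the rest of the config unchanged:

# hypothetical overrides, not part of this diff
"num_experts": 4,
"moe_expert_parallel_size": 4,   # one expert per GPU

The file is launched like any other GPT-NeoX config, e.g. `python ./deepy.py train.py configs/125M-moe.yml`, with "hostfile" pointing at a real hostfile rather than the /mock_path placeholder.
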
98 changes: 97 additions & 1 deletion configs/neox_arguments.md
@@ -111,7 +111,7 @@ Logging Arguments

- **git_hash**: str

Default = 2a3c4e1
Default = ae06be5

current git hash of repository

@@ -1007,6 +1007,14 @@ Parallelism Arguments



- **expert_interval**: int

Default = 2

Have one MoE layer every expert_interval layers



## NeoXArgsTemplate

NeoXArgsTemplate()
@@ -1128,6 +1136,94 @@ Text Generation arguments



- **moe_top_k**: int

Default = 1

Activate top K experts in MoE



- **use_tutel**: bool

Default = False

Use Tutel optimizations in MoE



- **num_experts**: int

Default = 1

Number of MoE experts



- **moe_loss_coeff**: float

Default = 0.1

Coefficient for MoE loss



- **moe_train_capacity_factor**: float

Default = 1.0

The capacity factor for each expert at train time



- **moe_eval_capacity_factor**: float

Default = 1.0

The capacity factor for each expert at eval time



- **moe_min_capacity**: int

Default = 4

The minimum capacity per expert regardless of the capacity_factor
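
For orientation (the usual top-1 gating capacity arithmetic, not text from this diff; the exact DeepSpeed formula may differ in detail), per-expert capacity works out to roughly:

capacity ≈ max(moe_min_capacity, ceil(tokens_per_micro_batch / num_experts * capacity_factor))

With the 125M-moe.yml values above (seq_length 2048, train_micro_batch_size_per_gpu 4, num_experts 4, moe_train_capacity_factor 1.0), that is max(4, ceil(2048 * 4 / 4 * 1.0)) = 2048 token slots per expert; tokens routed beyond that are dropped when moe_token_dropping is enabled.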



- **moe_token_dropping**: bool

Default = True

Whether to drop tokens when exceeding capacity



- **create_moe_param_group**: bool

Default = True

Whether to create a separate parameter group for MoE parameters



- **moe_use_residual**: bool

Default = True

Whether to use residual in MoE



- **moe_expert_parallel_size**: int

Default = 1

The degree of expert parallelism for MoE, i.e. how many ranks the experts are sharded across


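Put together, a hedged sketch of how these arguments might appear in a config (values are illustrative rather than tuned recommendations; anything omitted falls back to the defaults listed above):

# illustrative snippet, not part of this commit
"num_experts": 8,
"moe_top_k": 1,
"moe_loss_coeff": 0.1,
"moe_train_capacity_factor": 1.0,
"moe_eval_capacity_factor": 1.0,
"moe_min_capacity": 4,
"moe_token_dropping": true,
"moe_use_residual": true,
"moe_expert_parallel_size": 2,

See configs/125M-moe.yml above for a complete working example.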

## NeoXArgsTokenizer

Tokenizer Arguments