-
Notifications
You must be signed in to change notification settings - Fork 977
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* change the eye links to mystic mirror * update README and neox_arguments.md * clarify some arguments in prepare / preprocess_data.py * add 20B config * add `sample_input_file` and `sample_output_file` to cmd line args * update README.md * Add paper link
- Loading branch information
Showing
8 changed files
with
515 additions
and
201 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,110 @@ | ||
# DISCLAIMER: This is the configuration file for the GPT-NeoX-20B model as it was trained on 96x 40GB A100
# GPUs. Depending on your system configuration, you may need to change some parameters in order to fit
# the model in memory.

{
  # Tokenizer / checkpoint settings - you will need to change these to the location you have them saved in
  "vocab-file": "./20B_checkpoints/20B_tokenizer.json",
  "save": "./20B_checkpoints",
  "load": "./20B_checkpoints",

  # If finetuning, edit the following to the location of your finetuning dataset:
  "data-path": "./data/pile_20B_tokenizer/pile_20B_tokenizer_text_document",

  # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages
  # across the node boundaries )
  "pipe-parallel-size": 4,
  "model-parallel-size": 2,

  # model settings
  "num-layers": 44,
  "hidden-size": 6144,
  "num-attention-heads": 64,
  "seq-length": 2048,
  "max-position-embeddings": 2048,
  "norm": "layernorm",
  "pos-emb": "rotary",
  "rotary_pct": 0.25,
  "no-weight-tying": true,
  "gpt_j_residual": true,
  "output_layer_parallelism": "column",
  "scaled-upper-triang-masked-softmax-fusion": true,
  "bias-gelu-fusion": true,

  # init methods
  "init_method": "small_init",
  "output_layer_init_method": "wang_init",

  # optimizer settings
  "optimizer": {
    "type": "Adam",
    "params": {
      "lr": 0.97e-4,
      "betas": [0.9, 0.95],
      "eps": 1.0e-8
    }
  },

  "min_lr": 0.97e-5,
  # ZeRO stage 1 (optimizer-state partitioning); booleans normalized to
  # lowercase to match the rest of the file and remain strict-JSON valid.
  "zero_optimization": {
    "stage": 1,
    "allgather_partitions": true,
    "allgather_bucket_size": 1260000000,
    "overlap_comm": true,
    "reduce_scatter": true,
    "reduce_bucket_size": 1260000000,
    "contiguous_gradients": true,
    "cpu_offload": false
  },

  # batch / data settings (assuming 96 GPUs)
  "train_micro_batch_size_per_gpu": 4,
  "gradient_accumulation_steps": 32,
  "data-impl": "mmap",
  "split": "995,4,1",

  # activation checkpointing
  "checkpoint-activations": true,
  "checkpoint-num-layers": 1,
  "partition-activations": false,
  "synchronize-each-layer": true,

  # regularization
  "gradient_clipping": 1.0,
  "weight-decay": 0.01,
  "hidden-dropout": 0,
  "attention-dropout": 0,

  # precision settings
  "fp16": {
    "fp16": true,
    "enabled": true,
    "loss_scale": 0,
    "loss_scale_window": 1000,
    "initial_scale_power": 12,
    "hysteresis": 2,
    "min_loss_scale": 1
  },

  # misc. training settings
  "train-iters": 150000,
  "lr-decay-iters": 150000,

  "distributed-backend": "nccl",
  "lr-decay-style": "cosine",
  "warmup": 0.01,
  "save-interval": 500,
  "eval-interval": 1000,
  "eval-iters": 10,

  # logging
  "log-interval": 2,
  "steps_per_print": 2,
  "wall_clock_breakdown": false,

  ### NEW DATA: ####
  "tokenizer_type": "HFTokenizer",
  "tensorboard-dir": "./tensorboard",
  "log-dir": "./logs"
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.