add 65B v1 config to train
kyriemao committed May 23, 2023
1 parent 6ac71ff commit 7cff99d
Showing 2 changed files with 167 additions and 0 deletions.
109 changes: 109 additions & 0 deletions jarvis_configs/65B/v1/params.yml
@@ -0,0 +1,109 @@
# LLaMA 65B (v1) pretraining setup
{
# parallelism settings (you will want to change these based on your cluster setup,
# ideally scheduling pipeline stages across the node boundaries)
"pipe-parallel-size": 1,
"model-parallel-size": 8,
"make_vocab_size_divisible_by": 1,

# model settings
"num-layers": 80,
"hidden-size": 8192,
"num-attention-heads": 64,
"seq-length": 2048,
"max-position-embeddings": 2048,
"pos-emb": "rotary",
"rotary-pct": 1,
"no-weight-tying": true,
"gpt-j-residual": false,
"output-layer-parallelism": "column",
"norm": "rmsnorm",
"rms_norm_epsilon": 1.0e-6,

"attention_config": [[["flash"], 80]],

"scaled-upper-triang-masked-softmax-fusion": true,
"bias-gelu-fusion": false,
"use_bias_in_norms": false,
"use_bias_in_attn_linear": false,
"mlp_type": "llama",
"activation": "silu",

# these should provide some speedup but takes a while to build, set to true if desired
"scaled-upper-triang-masked-softmax-fusion": false,
"bias-gelu-fusion": false,

# init methods
"init_method": "small_init",
"output_layer_init_method": "wang_init",

# optimizer settings
"optimizer": {
"type": "Adam",
"params": {
"lr": 3.0e-5,
"betas": [0.9, 0.95],
"eps": 1.0e-8,
}
},
"min_lr": 1.0e-6,

# for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training
"zero_optimization": {
"stage": 1,
"allgather_partitions": True,
"allgather_bucket_size": 500000000,
"overlap_comm": True,
"reduce_scatter": True,
"reduce_bucket_size": 500000000,
"contiguous_gradients": True,
},

# batch / data settings
"train_micro_batch_size_per_gpu": 32,
"data-impl": "mmap",

# activation checkpointing
"checkpoint-activations": true,
"checkpoint-num-layers": 1,
"partition-activations": true,
"synchronize-each-layer": true,

# regularization
"gradient_clipping": 1.0,
"weight-decay": 0.1,
"hidden-dropout": 0,
"attention-dropout": 0,

# precision settings
# "fp16": {
# "fp16": true,
# "enabled": true,
# "loss_scale": 0,
# "loss_scale_window": 1000,
# "hysteresis": 2,
# "min_loss_scale": 1
# },

"bf16": {
"enabled": True,
},

# misc. training settings
"train-iters": 300000,
"lr-decay-iters": 300000,
"distributed-backend": "nccl",
"lr-decay-style": "cosine",
"warmup": 0.00006,
"checkpoint-factor": 2000,
"eval-interval": 2000,
"eval-iters": 200,

# logging
"log-interval": 10,
"steps_per_print": 10,
"keep-last-n-checkpoints": 3,
"wall_clock_breakdown": true,
}
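
The "65B" in the config path can be sanity-checked against the model settings above. Below is a minimal back-of-the-envelope parameter count; note that the feed-forward width and vocabulary size are not in this file, so the published LLaMA-65B values (22016 and 32000) are assumed purely for illustration:

# Rough parameter count for the model settings in params.yml.
# ffn_dim and vocab_size are NOT in the config; 22016 and 32000 are the
# published LLaMA-65B values, assumed here for illustration only
# (the custom tokenizer in setup.yml may have a different vocab size).
hidden, layers = 8192, 80      # "hidden-size", "num-layers"
vocab, ffn = 32000, 22016      # assumptions, not values from this file

embed = vocab * hidden                # input embeddings
unembed = vocab * hidden              # separate output layer ("no-weight-tying": true)
attn_per_layer = 4 * hidden * hidden  # Wq, Wk, Wv, Wo
mlp_per_layer = 3 * hidden * ffn      # gate/up/down projections ("mlp_type": "llama")

total = embed + unembed + layers * (attn_per_layer + mlp_per_layer)
print(f"~{total / 1e9:.1f}B parameters")  # ~65.3B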


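The parallelism and batch settings also fix the effective batch size once a cluster size is known. A minimal sketch of that arithmetic, assuming a hypothetical 64-GPU world size and a single gradient-accumulation step (neither value is stated in params.yml):

# Batch-size arithmetic implied by params.yml. WORLD_SIZE and
# GRAD_ACCUM_STEPS are assumptions, not values from the config.
WORLD_SIZE = 64        # hypothetical cluster size
GRAD_ACCUM_STEPS = 1   # assumed; not set in params.yml

pipe_parallel = 1      # "pipe-parallel-size"
model_parallel = 8     # "model-parallel-size"
micro_batch = 32       # "train_micro_batch_size_per_gpu"
seq_length = 2048      # "seq-length"

# Each model replica spans pipe_parallel * model_parallel GPUs; the
# remaining factor of the world size is the data-parallel degree.
data_parallel = WORLD_SIZE // (pipe_parallel * model_parallel)   # 8
global_batch = micro_batch * data_parallel * GRAD_ACCUM_STEPS    # 256 sequences
tokens_per_step = global_batch * seq_length                      # 524,288 tokens
print(f"{data_parallel=} {global_batch=} {tokens_per_step=:,}")

Under those assumptions, the configured 300,000 train-iters correspond to roughly 157B training tokens.
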
58 changes: 58 additions & 0 deletions jarvis_configs/65B/v1/setup.yml
@@ -0,0 +1,58 @@
# Suggested data paths when using GPT-NeoX locally
{
# "data-path": "./data/pile_00_text_document",

# or for weighted datasets:
# sample nums: [13704383051, 60761429, 441673231, 79901568160, 13242918819, 25936489268, 5180327559, 14382492596, 5565088327, 1606159768, 35026345808]
"train-data-paths": [
"/fs/fast/share/jarvis/tokenized_data/jarvis_v1/zhihu/train_text_document",
"/fs/fast/share/jarvis/tokenized_data/jarvis_v1/csl/train_text_document",
"/fs/fast/share/jarvis/tokenized_data/jarvis_v1/wiki_wentong/train_text_document",
"/fs/fast/share/jarvis/tokenized_data/jarvis_v1/cicg/train_text_document",
"/fs/fast/share/jarvis/tokenized_data/jarvis_v1/cbook/train_text_document",
"/fs/fast/share/jarvis/tokenized_data/jarvis_v1/pubmed_central/train_text_document",
"/fs/fast/share/jarvis/tokenized_data/jarvis_v1/pubmed_abstract/train_text_document",
"/fs/fast/share/jarvis/tokenized_data/jarvis_v1/freelaw/train_text_document",
"/fs/fast/share/jarvis/tokenized_data/jarvis_v1/uspto/train_text_document",
"/fs/fast/share/jarvis/tokenized_data/jarvis_v1/europarl/train_text_document",
"/fs/fast/share/jarvis/tokenized_data/jarvis_v1/github/train_text_document",
],
# sample nums: [13803879, 59044, 415941, 79673535, 12346838, 379471665]
"valid-data-paths": [
"/fs/fast/share/jarvis/tokenized_data/jarvis_v1/zhihu/val_text_document",
"/fs/fast/share/jarvis/tokenized_data/jarvis_v1/csl/val_text_document",
"/fs/fast/share/jarvis/tokenized_data/jarvis_v1/wiki_wentong/val_text_document",
"/fs/fast/share/jarvis/tokenized_data/jarvis_v1/cicg/val_text_document",
"/fs/fast/share/jarvis/tokenized_data/jarvis_v1/cbook/val_text_document",
"/fs/fast/share/jarvis/tokenized_data/jarvis_v1/pile_val_test/val_text_document",
],
"test-data-paths": [
"/fs/fast/share/jarvis/tokenized_data/jarvis_v1/pile_val_test/test_text_document",
],
# "train-data-weights": [1.0, 1.0, 1.0, 1.0, 1.0, 0.831, 0.831, 0.831, 0.831, 0.831, 0.12],
# "valid-data-weights": [1.0, 1.0, 1.0, 1.0, 1.0, 0.06],
"train-data-weights": [1.82, 1.82, 1.82, 1.82, 1.82, 3.7, 3.7, 3.7, 3.7, 3.7, 5.57],
"valid-data-weights": [4.57, 4.57, 4.57, 4.57, 4.57, 1.28],
# "train-data-weights": [1.0, 1.0, 1.0, 1.0, 1.0],
# "valid-data-weights": [1.0],
"test-data-weights": [1.0],

"vocab-file": "/fs/fast/share/jarvis/tokenizer/jarvis_tokenizer_v1/tokenizer.model",
"tokenizer_type": "LlamaTokenizer",


"save": "/fs/fast/share/jarvis/checkpoints/65B/v1",
"load": "/home/share/jarvis/llama-65B-neox-mp",

"finetune": True,
"checkpoint_validation_with_forward_pass": False,

# newly added parameters for training llama-cn
"initialize_llama_cn_with_llama_word_embeddings": true,
# "only_train_new_chinese": True,

"use_wandb": True,
"wandb_host": "https://api.wandb.ai",
"wandb_project": "65B_v1",
"wandb_team": "jarvis_llm"
}
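
The train-data-weights above are relative sampling weights over the eleven train-data-paths; when the datasets are blended, the data loader normalizes them into per-dataset sampling probabilities. A minimal sketch of that normalization, with dataset short names taken from the paths (the exact helper used by the training code is not shown):

# Normalize the relative dataset weights from setup.yml into sampling
# probabilities, as a blended-dataset builder would.
datasets = [
    "zhihu", "csl", "wiki_wentong", "cicg", "cbook",
    "pubmed_central", "pubmed_abstract", "freelaw",
    "uspto", "europarl", "github",
]
train_weights = [1.82, 1.82, 1.82, 1.82, 1.82,
                 3.7, 3.7, 3.7, 3.7, 3.7, 5.57]

total = sum(train_weights)                  # 33.17
probs = [w / total for w in train_weights]
for name, p in zip(datasets, probs):
    print(f"{name:16s}{p:.3f}")
# zhihu..cbook land at ~0.055 each, the Pile-derived sets at ~0.112 each,
# and github at ~0.168 of sampled training sequences.

In GPT-NeoX, the two files would typically be passed together to the launcher, e.g. python ./deepy.py train.py jarvis_configs/65B/v1/params.yml jarvis_configs/65B/v1/setup.yml, which merges them into a single configuration.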
