Commit
65B v1 config | jarvis_v1_tokenize_scripts | fix llama tokenizer encoding for preprocessing of tokenization that enables BOS | modify custom checkpoint loading function for llama (65B), still need to refine
kyriemao committed May 22, 2023
1 parent 7dfe6b0 commit 6ac71ff
Showing 35 changed files with 911 additions and 67 deletions.
36 changes: 21 additions & 15 deletions jarvis_configs/65B/tmp/params.yml
@@ -19,6 +19,8 @@
"output-layer-parallelism": "column",
"norm": "rmsnorm",
"rms_norm_epsilon": 1.0e-6,

"attention_config": [[["flash"], 80]],

"scaled-upper-triang-masked-softmax-fusion": true,
"bias-gelu-fusion": false,
@@ -39,12 +41,12 @@
"optimizer": {
"type": "Adam",
"params": {
"lr": 1.5e-5,
"lr": 2.0e-5,
"betas": [0.9, 0.95],
"eps": 1.0e-8,
}
},
"min_lr": 5.0e-6,
"min_lr": 1.0e-6,

# for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training
"zero_optimization": {
@@ -58,7 +60,7 @@
},

# batch / data settings
"train_micro_batch_size_per_gpu": 2,
"train_micro_batch_size_per_gpu": 32,
"data-impl": "mmap",

# activation checkpointing
@@ -72,15 +74,19 @@
"weight-decay": 0.1,
"hidden-dropout": 0,
"attention-dropout": 0,

# precision settings
"fp16": {
"fp16": true,
"enabled": true,
"loss_scale": 0,
"loss_scale_window": 1000,
"hysteresis": 2,
"min_loss_scale": 1
# "fp16": {
# "fp16": true,
# "enabled": true,
# "loss_scale": 0,
# "loss_scale_window": 1000,
# "hysteresis": 2,
# "min_loss_scale": 1
# },

"bf16": {
"enabled": True,
},

# misc. training settings
@@ -89,14 +95,14 @@
"distributed-backend": "nccl",
"lr-decay-style": "cosine",
"warmup": 0.01,
"checkpoint-factor": 100,
"eval-interval": 100,
"eval-iters": 100,
"checkpoint-factor": 200,
"eval-interval": 200,
"eval-iters": 200,

# logging
"log-interval": 10,
"steps_per_print": 10,
"keep-last-n-checkpoints": 2,
"keep-last-n-checkpoints": 3,
"wall_clock_breakdown": true,
}
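For reference, a minimal sketch of how the schedule settings above fit together, assuming the usual Megatron-style cosine decay with linear warmup: 1% of training is linear warmup up to the peak lr of 2.0e-5, then cosine decay down to min_lr 1.0e-6. The function name and the train_iters value are illustrative, since the total iteration count is not shown in this hunk.

import math

def lr_at_step(step: int, train_iters: int = 100_000,
               peak_lr: float = 2.0e-5, min_lr: float = 1.0e-6,
               warmup_frac: float = 0.01) -> float:
    """Illustrative cosine schedule with linear warmup (assumed semantics of
    "warmup", "lr-decay-style: cosine" and "min_lr"; train_iters is made up)."""
    warmup_steps = int(warmup_frac * train_iters)
    if step < warmup_steps:
        return peak_lr * step / max(1, warmup_steps)          # linear warmup
    progress = (step - warmup_steps) / max(1, train_iters - warmup_steps)
    cosine = 0.5 * (1.0 + math.cos(math.pi * min(1.0, progress)))
    return min_lr + (peak_lr - min_lr) * cosine               # decays to min_lr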

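The newly added "attention_config": [[["flash"], 80]] is gpt-neox's compact per-layer notation, a list of [pattern, repeat] pairs expanded to one attention type per transformer layer. A rough sketch of that expansion (an illustrative helper, not the library's own code):

from typing import List, Sequence

def expand_attention_types(config: Sequence, num_layers: int) -> List[str]:
    """Rough sketch: [[["flash"], 80]] -> ["flash"] * 80, i.e. flash attention
    for every one of the 80 layers of the 65B model."""
    per_layer: List[str] = []
    for pattern, repeat in config:
        for i in range(repeat):
            per_layer.append(pattern[i % len(pattern)])  # cycle through the pattern
    return per_layer[:num_layers]

print(expand_attention_types([[["flash"], 80]], num_layers=80)[:3])  # ['flash', 'flash', 'flash']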
53 changes: 36 additions & 17 deletions jarvis_configs/65B/tmp/setup.yml
@@ -4,32 +4,51 @@

# or for weighted datasets:
"train-data-paths": [
"/fs/fast/u2020000280/data/comments2019_llama_tokenized/train_text_document",
"/fs/fast/u2020000280/data/csl_llama_tokenized/train_text_document",
"/fs/fast/u2020000280/data/news2016zh_llama_tokenized/train_text_document",
"/fs/fast/u2021000178/data/llama-cn_tokenized/zhihu/train_text_document",
"/fs/fast/u2021000178/data/llama-cn_tokenized/csl/train_text_document",
"/fs/fast/u2021000178/data/llama-cn_tokenized/wudao/train_text_document",
"/fs/fast/u2021000178/data/llama-cn_tokenized/news2016zh/train_text_document",
"/fs/fast/u2021000178/data/llama-cn_tokenized/wiki-wentong/train_text_document",
"/fs/fast/u2021000178/data/llama-cn_tokenized/pubmed_central/train_text_document",
"/fs/fast/u2021000178/data/llama-cn_tokenized/pubmed_abstract/train_text_document",
"/fs/fast/u2021000178/data/llama-cn_tokenized/freelaw/train_text_document",
"/fs/fast/u2021000178/data/llama-cn_tokenized/uspto/train_text_document",
"/fs/fast/u2021000178/data/llama-cn_tokenized/europarl/train_text_document",
"/fs/fast/u2021000178/data/llama-cn_tokenized/github/train_text_document",
],
# [13942805, 59112, 7738333, 1674883, 419468, 379260051]
"valid-data-paths": [
"/fs/fast/u2021000178/data/llama-cn_tokenized/zhihu/val_text_document",
"/fs/fast/u2021000178/data/llama-cn_tokenized/csl/val_text_document",
"/fs/fast/u2021000178/data/llama-cn_tokenized/wudao/val_text_document",
"/fs/fast/u2021000178/data/llama-cn_tokenized/news2016zh/val_text_document",
"/fs/fast/u2021000178/data/llama-cn_tokenized/wiki-wentong/val_text_document",
"/fs/fast/u2021000178/data/llama-cn_tokenized/pile-val-test/val_text_document",
],
"test-data-paths": [
"/fs/fast/u2020000280/data/comments2019_llama_tokenized/val_text_document",
"/fs/fast/u2020000280/data/csl_llama_tokenized/val_text_document",
"/fs/fast/u2020000280/data/news2016zh_llama_tokenized/val_text_document",
],
"valid-data-paths": [
"/fs/fast/u2020000280/data/comments2019_llama_tokenized/val_text_document",
"/fs/fast/u2020000280/data/csl_llama_tokenized/val_text_document",
"/fs/fast/u2020000280/data/news2016zh_llama_tokenized/val_text_document",
"/fs/fast/u2021000178/data/llama-cn_tokenized/pile-val-test/test_text_document",
],
"train-data-weights": [1., 1., 1.],
"test-data-weights": [1., 1., 1.],
"valid-data-weights": [1., 1., 1.],
"train-data-weights": [1.0, 1.0, 1.0, 1.0, 1.0, 0.831, 0.831, 0.831, 0.831, 0.831, 0.12],
"valid-data-weights": [1.0, 1.0, 1.0, 1.0, 1.0, 0.06],
# "train-data-weights": [1.0, 1.0, 1.0, 1.0, 1.0],
# "valid-data-weights": [1.0],
"test-data-weights": [1.0],

"vocab-file": "/home/share/jarvis/llama-7b-hf/tokenizer.model",
"vocab-file": "/home/share/jarvis/tokenizer/llama_chinese_tokenizer.model",
# "vocab-file": "/home/u2021000178/share/gsai_joint_project/llama_train/gpt-neox-main/tokenizer.model",
"tokenizer_type": "LlamaTokenizer",


"save": "checkpoints_65B_tmp",
"load": "checkpints_65B_tmp",
"finetune": True,
# "load": "/home/share/jarvis/llama-65B-neox-mp",
"load": "checkpoints_65B_tmp",
"finetune": False,
"checkpoint_validation_with_forward_pass": False,

# new added parameters for training llama-cn
"initialize_llama_cn_with_llama_word_embeddings": False,
# "only_train_new_chinese": True,

"use_wandb": True,
"wandb_host": "https://api.wandb.ai",
"wandb_project": "65B_tmp",
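The per-dataset weights above are presumably relative sampling weights that get normalized into sampling probabilities over the corresponding "train-data-paths" entries (zhihu, csl, wudao, ..., github), in the usual Megatron style. A small sketch under that assumption, with the values copied from the config:

train_weights = [1.0, 1.0, 1.0, 1.0, 1.0, 0.831, 0.831, 0.831, 0.831, 0.831, 0.12]

# Assumed behaviour: weights are normalized to per-dataset sampling probabilities.
total = sum(train_weights)
probs = [w / total for w in train_weights]
for w, p in zip(train_weights, probs):
    print(f"weight {w:>5}: sampled with probability {p:.3f}")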
9 changes: 6 additions & 3 deletions megatron/checkpointing.py
@@ -224,16 +224,19 @@ def save_checkpoint(neox_args, iteration, model, optimizer, lr_scheduler):

def custom_llama_cn_vocab_expand_load_func(src, dst):
# llama word embedding + random initialized chinese word embeddings
original_random_cn_word_embeddings = dst.sequential[0].word_embeddings.weight.data[4000:].cpu().detach()
len_llama_vocab = src['sequential.0.word_embeddings.weight'].shape[0]
print("original llama vocab size:", len_llama_vocab)

original_random_cn_word_embeddings = dst.sequential[0].word_embeddings.weight.data[len_llama_vocab:].cpu().detach()
new_word_embeddings = torch.cat([src['sequential.0.word_embeddings.weight'], original_random_cn_word_embeddings], dim=0)
src['sequential.0.word_embeddings.weight'] = new_word_embeddings

# llama word embedding + random initialized chinese word embeddings
original_random_final_linear_embeddings = dst.sequential[84].final_linear.weight.data[4000:].cpu().detach()
original_random_final_linear_embeddings = dst.sequential[84].final_linear.weight.data[len_llama_vocab:].cpu().detach()
new_final_linear_embeddings = torch.cat([src['sequential.84.final_linear.weight'], original_random_final_linear_embeddings], dim=0)
src['sequential.84.final_linear.weight'] = new_final_linear_embeddings

dst.load_state_dict(src, strict=False)
dst.load_state_dict(src, strict=True)


def zero_grad_hook(grad):
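The gist of custom_llama_cn_vocab_expand_load_func above: keep the original LLaMA rows of both the input word embedding and the final_linear output projection, append the randomly initialized rows for the newly added Chinese tokens, then load the state dict strictly. A standalone sketch of the row-expansion step (the toy shapes, the "8000 new tokens", and the init scale are illustrative, not the exact checkpoint layout):

import torch

def expand_vocab_rows(pretrained_rows: torch.Tensor,
                      randomly_initialized_new_rows: torch.Tensor) -> torch.Tensor:
    """Concatenate the pretrained LLaMA embedding rows with freshly initialized
    rows for the extra (Chinese) vocabulary, as the loader above does."""
    return torch.cat([pretrained_rows, randomly_initialized_new_rows], dim=0)

# Toy shapes: 32000 original LLaMA tokens + 8000 hypothetical new tokens, hidden=8
llama_rows = torch.randn(32000, 8)
new_rows = torch.empty(8000, 8).normal_(mean=0.0, std=0.02)  # assumed init scale
print(expand_vocab_rows(llama_rows, new_rows).shape)  # torch.Size([40000, 8])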
2 changes: 1 addition & 1 deletion megatron/tokenizer/tokenizer.py
@@ -247,7 +247,7 @@ def inv_vocab(self):
raise NotImplementedError

def tokenize(self, text: str):
return self.tokenizer.encode(text, add_special_tokens=False)
return self.tokenizer.encode(text, add_special_tokens=True)

def tokenize_batch(self, text_batch: Union[List[str], str]):
if isinstance(text_batch, list):
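This one-line change is what the commit message refers to: with add_special_tokens=True, Hugging Face's LlamaTokenizer prepends the BOS token (id 1 in the original LLaMA vocabulary) to every encoded document. A hedged illustration, assuming self.tokenizer wraps the HF tokenizer; the model path and the non-BOS ids are placeholders:

from transformers import LlamaTokenizer  # assumed to be what self.tokenizer wraps

tok = LlamaTokenizer.from_pretrained("/path/to/llama/tokenizer")  # placeholder path

print(tok.encode("你好", add_special_tokens=False))  # e.g. [29871, 30919, 31076]      (no BOS)
print(tok.encode("你好", add_special_tokens=True))   # e.g. [1, 29871, 30919, 31076]   (BOS = 1 prepended)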
30 changes: 30 additions & 0 deletions run_scripts/run_convert_raw_llama_to_neox.sh
@@ -0,0 +1,30 @@
#!/bin/bash

### Bill this job to the advisor's project group; replace tutor_project with the group name the advisor created
#SBATCH --comment=joint_project

### Give the job a name so different jobs are easy to tell apart
#SBATCH --job-name=convert_raw_llama_to_neox

### How many nodes the job needs
### Note: if you are not using multi-node parallelism (MPI/NCCL, etc.), set this to 1! Requesting more will not speed up the program!
#SBATCH --nodes=1

### How many CPU cores the job needs
### Note: usually set this to the queue's core count, e.g. the cpu queue has 64 cores, so request 64 and use multithreading in your program to make full use of them!
#SBATCH --gres=gpu:8

### Which partition (queue) the job runs on
#SBATCH --partition=gpu-a800-gsai

### The directives above request resources
### The commands below run on the compute node

### This example uses the Python from Anaconda; set up the environment variables first
source activate llm


### Run your job
python ./tools/convert_raw_llama_weights_to_neox.py --input_dir /home/share/jarvis/llama-checkpoints --model_size 65B --num_output_shards 8 --output_dir /home/share/jarvis/llama-65B-neox-mp
echo "-----> Task finished..."
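The --num_output_shards 8 above presumably corresponds to one 8-GPU node worth of tensor (model) parallelism for the 65B checkpoint: each large weight matrix is split into 8 shards along one dimension. A toy sketch of that idea (the shapes and the column-vs-row choice are illustrative, not the converter's exact logic):

import torch

hidden = 8192                         # LLaMA-65B hidden size
w = torch.randn(hidden, 4 * hidden)   # a toy MLP weight

# Column-parallel style split into 8 model-parallel shards (illustrative only)
shards = torch.chunk(w, chunks=8, dim=1)
print([tuple(s.shape) for s in shards])  # eight (8192, 4096) pieces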

38 changes: 38 additions & 0 deletions run_scripts/run_train_65B_tmp.sh
@@ -0,0 +1,38 @@
#!/bin/bash

### Bill this job to the advisor's project group; replace tutor_project with the group name the advisor created
#SBATCH --comment=joint_project

### Give the job a name so different jobs are easy to tell apart
#SBATCH --job-name=train_65B_tmp

### How many nodes the job needs
### Note: if you are not using multi-node parallelism (MPI/NCCL, etc.), set this to 1! Requesting more will not speed up the program!
#SBATCH --nodes=12
### SBATCH --nodelist=a800-[5,6,7,8]
### How many CPU cores the job needs
### Note: usually set this to the queue's core count, e.g. the cpu queue has 64 cores, so request 64 and use multithreading in your program to make full use of them!
#SBATCH --gres=gpu:8

### Which partition (queue) the job runs on
#SBATCH --partition=gpu-a800-gsai

### The directives above request resources
### The commands below run on the compute node

### This example uses the Python from Anaconda; set up the environment variables first
# source activate llm

### Build a DeepSpeed-style hostfile: one "hostname slots=8" line per allocated node
nodelist=($(scontrol show hostname $SLURM_NODELIST))
> 65B_tmp_hostfile
for i in "${nodelist[@]}"
do
    printf "%s slots=8\n" "$i" >> 65B_tmp_hostfile
done

### Run your job
module load cuda/11.8
source activate llm
python ./deepy.py train.py -d jarvis_configs/65B/tmp params.yml setup.yml -H 65B_tmp_hostfile

rm 65B_tmp_hostfile
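For reference, a sketch of what the generated hostfile should contain once the loop above has run: one "hostname slots=8" line per allocated node. The node names below are invented; only the format is the point.

# Hypothetical 65B_tmp_hostfile contents for a 12-node allocation (names invented):
expected_hostfile = "\n".join(f"a800-{i} slots=8" for i in range(1, 13))
print(expected_hostfile)   # a800-1 slots=8 ... a800-12 slots=8, one line per node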
38 changes: 38 additions & 0 deletions run_scripts/run_train_65B_v1.sh
@@ -0,0 +1,38 @@
#!/bin/bash

### Bill this job to the advisor's project group; replace tutor_project with the group name the advisor created
#SBATCH --comment=joint_project

### Give the job a name so different jobs are easy to tell apart
#SBATCH --job-name=train_65B_v1

### How many nodes the job needs
### Note: if you are not using multi-node parallelism (MPI/NCCL, etc.), set this to 1! Requesting more will not speed up the program!
#SBATCH --nodes=10
### SBATCH --nodelist=a800-[5,6,7,8]
### How many CPU cores the job needs
### Note: usually set this to the queue's core count, e.g. the cpu queue has 64 cores, so request 64 and use multithreading in your program to make full use of them!
#SBATCH --gres=gpu:8

### Which partition (queue) the job runs on
#SBATCH --partition=gpu-a800-gsai

### The directives above request resources
### The commands below run on the compute node

### This example uses the Python from Anaconda; set up the environment variables first
# source activate llm

### Build a DeepSpeed-style hostfile: one "hostname slots=8" line per allocated node
nodelist=($(scontrol show hostname $SLURM_NODELIST))
> 65B_v1_hostfile
for i in "${nodelist[@]}"
do
    printf "%s slots=8\n" "$i" >> 65B_v1_hostfile
done

### Run your job
module load cuda/11.8
source activate llm
python ./deepy.py train.py -d jarvis_configs/65B/v1 params.yml setup.yml -H 65B_v1_hostfile

rm 65B_v1_hostfile
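A quick sanity check on the scale of the two launch scripts, under the assumption of 8 GPUs per node and model-parallel size 8 (from the 8-shard conversion above); the pipeline-parallel degree is not shown in the truncated config, so it is left as a parameter:

def data_parallel_degree(nodes: int, gpus_per_node: int = 8,
                         model_parallel: int = 8, pipe_parallel: int = 1) -> float:
    """world_size / (mp * pp); pipe_parallel=1 is an assumption."""
    return nodes * gpus_per_node / (model_parallel * pipe_parallel)

print(data_parallel_degree(12))  # run_train_65B_tmp.sh: 96 GPUs -> 12.0
print(data_parallel_degree(10))  # run_train_65B_v1.sh:  80 GPUs -> 10.0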
41 changes: 41 additions & 0 deletions (new tokenize script; filename not shown in this view)
@@ -0,0 +1,41 @@
#!/bin/bash

### Bill this job to the advisor's project group; replace tutor_project with the group name the advisor created
#SBATCH --comment=joint_project

### Give the job a name so different jobs are easy to tell apart
#SBATCH --job-name="tokenize_jarvis_v1_arxiv"

### How many nodes the job needs
### Note: if you are not using multi-node parallelism (MPI/NCCL, etc.), set this to 1! Requesting more will not speed up the program!
#SBATCH --nodes=1

### How many CPU cores the job needs
### Note: usually set this to the queue's core count, e.g. the cpu queue has 64 cores, so request 64 and use multithreading in your program to make full use of them!
#SBATCH --gres=gpu:8

### Which partition (queue) the job runs on
#SBATCH --partition=gpu-a800-gsai

### The directives above request resources
### The commands below run on the compute node


### Activate an Anaconda environment (your_env)
conda activate llm

### Run your job
### This job tokenizes your (zst) data file
### --input is the path to the data file
### --output-prefix is the output directory plus the output file prefix; the generated bin and idx files automatically get a _text_document.bin / .idx suffix
### --vocab and --merge-file are the tokenizer files
### --workers is the number of worker processes; set it high enough to be fast
### --append-eod inserts the special eod token between documents when multiple docs are packed into one sample
python tools/preprocess_data.py \
--input /home/share/jarvis/arxiv.jsonl \
--output-prefix /fs/fast/share/jarvis/tokenized_data/jarvis_v1/arxiv/train \
--vocab /fs/fast/share/jarvis/tokenizer/jarvis_tokenizer_v1/tokenizer.model \
--dataset-impl mmap \
--tokenizer-type LlamaTokenizer \
--append-eod \
--workers=64
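A rough sketch of what --append-eod means during preprocessing, as described in the comments above: each document is tokenized and an end-of-document token id is appended before documents are packed together. The eod_id value and the toy encoder below are placeholders:

from typing import Callable, List

def pack_documents(docs: List[str], encode: Callable[[str], List[int]],
                   eod_id: int) -> List[int]:
    """Illustrative packing: tokenize each doc and append EOD as a separator."""
    token_stream: List[int] = []
    for doc in docs:
        token_stream.extend(encode(doc))
        token_stream.append(eod_id)  # what --append-eod adds between documents
    return token_stream

# Toy usage with a fake encoder that returns one "token" per document:
print(pack_documents(["doc one", "doc two"], encode=lambda s: [len(s)], eod_id=2))
# [7, 2, 7, 2]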
51 changes: 51 additions & 0 deletions (new tokenize script; filename not shown in this view)
@@ -0,0 +1,51 @@
#!/bin/bash

### Bill this job to the advisor's project group; replace tutor_project with the group name the advisor created
#SBATCH --comment=joint_project

### Give the job a name so different jobs are easy to tell apart
#SBATCH --job-name="tokenize_jarvis_v1_cbook"

### How many nodes the job needs
### Note: if you are not using multi-node parallelism (MPI/NCCL, etc.), set this to 1! Requesting more will not speed up the program!
#SBATCH --nodes=1

### How many CPU cores the job needs
### Note: usually set this to the queue's core count, e.g. the cpu queue has 64 cores, so request 64 and use multithreading in your program to make full use of them!
#SBATCH --gres=gpu:8

### Which partition (queue) the job runs on
#SBATCH --partition=gpu-a800-gsai

### The directives above request resources
### The commands below run on the compute node


### Activate an Anaconda environment (your_env)
conda activate llm

### Run your job
### This job tokenizes your (zst) data file
### --input is the path to the data file
### --output-prefix is the output directory plus the output file prefix; the generated bin and idx files automatically get a _text_document.bin / .idx suffix
### --vocab and --merge-file are the tokenizer files
### --workers is the number of worker processes; set it high enough to be fast
### --append-eod inserts the special eod token between documents when multiple docs are packed into one sample
python /home/share/gsai_joint_project/llama_train/gpt-neox-main/tools/preprocess_data.py \
--input /fs/fast/share/jarvis/cbook/cbook-train.jsonl \
--output-prefix /fs/fast/share/jarvis/tokenized_data/jarvis_v1/cbook/train \
--vocab /fs/fast/share/jarvis/tokenizer/jarvis_tokenizer_v1/tokenizer.model \
--dataset-impl mmap \
--tokenizer-type LlamaTokenizer \
--append-eod \
--workers=64


python /home/share/gsai_joint_project/llama_train/gpt-neox-main/tools/preprocess_data.py \
--input /fs/fast/share/jarvis/cbook/cbook-val.jsonl \
--output-prefix /fs/fast/share/jarvis/tokenized_data/jarvis_v1/cbook/val \
--vocab /fs/fast/share/jarvis/tokenizer/jarvis_tokenizer_v1/tokenizer.model \
--dataset-impl mmap \
--tokenizer-type LlamaTokenizer \
--append-eod \
--workers=64
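As the comments note, each run writes <output-prefix>_text_document.bin and .idx; the prefix without the extension is what the entries in setup.yml's "train-data-paths" / "valid-data-paths" point at. A small illustrative check, with the path taken from the script above:

from pathlib import Path

# preprocess_data.py is expected to produce these two files per run (see the
# comments above); the path without the extension is what goes into setup.yml.
prefix = Path("/fs/fast/share/jarvis/tokenized_data/jarvis_v1/cbook/train_text_document")
for ext in (".bin", ".idx"):
    f = prefix.with_suffix(ext)
    print(f, "exists:", f.exists())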