forked from EleutherAI/gpt-neox
Commit
65B v1 config | jarvis_v1_tokenize_scripts | fix llama tokenizer encoding for tokenization preprocessing so that BOS is enabled | modify custom checkpoint loading function for llama (65B), still needs refinement
Showing 35 changed files with 911 additions and 67 deletions.
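The tokenizer fix mentioned in the commit message is about making sure a BOS token is prepended when documents are encoded during preprocessing. As a rough orientation only, here is a minimal sketch of that idea, assuming a SentencePiece-based tokenizer; the class and method names are illustrative and are not taken from this diff:

import sentencepiece as spm

class LlamaTokenizerSketch:
    """Illustrative wrapper only; not the actual class touched by this commit."""

    def __init__(self, model_path):
        self.sp = spm.SentencePieceProcessor(model_file=model_path)

    def tokenize(self, text, prepend_bos=True):
        ids = self.sp.encode(text, out_type=int)
        # The behaviour the commit message describes: prepend the BOS id
        # when a document is tokenized during preprocessing.
        if prepend_bos:
            ids = [self.sp.bos_id()] + ids
        return ids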
@@ -0,0 +1,30 @@
#!/bin/bash

### Bill this job to your advisor's project group; replace tutor_project with the group name your advisor created
#SBATCH --comment=joint_project

### Give the job a name so that different jobs are easy to tell apart
#SBATCH --job-name=convert_raw_llama_to_neox

### How many nodes this job needs
### Note: if the job does not use multi-node parallelism (MPI/NCCL, etc.), set this to 1! Requesting more will not speed the program up!
#SBATCH --nodes=1

### How many CPU cores this job needs
### Note: usually set this according to the queue's core count, e.g. the cpu queue has 64 cores, so request 64 and use multithreading in your program to make full use of them!
#SBATCH --gres=gpu:8

### Which partition (queue) the job runs on
#SBATCH --partition=gpu-a800-gsai

### The directives above request resources
### The commands below run on the compute node

### This example uses Python from Anaconda; set up the environment by activating the conda env first
source activate llm

### Run your job
python ./tools/convert_raw_llama_weights_to_neox.py --input_dir /home/share/jarvis/llama-checkpoints --model_size 65B --num_output_shards 8 --output_dir /home/share/jarvis/llama-65B-neox-mp
echo "-----> Task finished..."
@@ -0,0 +1,38 @@
#!/bin/bash

### Bill this job to your advisor's project group; replace tutor_project with the group name your advisor created
#SBATCH --comment=joint_project

### Give the job a name so that different jobs are easy to tell apart
#SBATCH --job-name=train_65B_tmp

### How many nodes this job needs
### Note: if the job does not use multi-node parallelism (MPI/NCCL, etc.), set this to 1! Requesting more will not speed the program up!
#SBATCH --nodes=12
### SBATCH --nodelist=a800-[5,6,7,8]
### How many CPU cores this job needs
### Note: usually set this according to the queue's core count, e.g. the cpu queue has 64 cores, so request 64 and use multithreading in your program to make full use of them!
#SBATCH --gres=gpu:8

### Which partition (queue) the job runs on
#SBATCH --partition=gpu-a800-gsai

### The directives above request resources
### The commands below run on the compute node

### This example uses Python from Anaconda; set up the environment by activating the conda env first
# source activate llm

### Build a DeepSpeed-style hostfile: one "<hostname> slots=8" line per allocated node
nodelist=($(scontrol show hostname $SLURM_NODELIST))
> 65B_tmp_hostfile
for i in "${nodelist[@]}"
do
    printf "%s slots=8\n" "$i" >> 65B_tmp_hostfile
done

### Run your job
module load cuda/11.8
source activate llm
python ./deepy.py train.py -d jarvis_configs/65B/tmp params.yml setup.yml -H 65B_tmp_hostfile

rm 65B_tmp_hostfile
@@ -0,0 +1,38 @@
#!/bin/bash

### Bill this job to your advisor's project group; replace tutor_project with the group name your advisor created
#SBATCH --comment=joint_project

### Give the job a name so that different jobs are easy to tell apart
#SBATCH --job-name=train_65B_v1

### How many nodes this job needs
### Note: if the job does not use multi-node parallelism (MPI/NCCL, etc.), set this to 1! Requesting more will not speed the program up!
#SBATCH --nodes=10
### SBATCH --nodelist=a800-[5,6,7,8]
### How many CPU cores this job needs
### Note: usually set this according to the queue's core count, e.g. the cpu queue has 64 cores, so request 64 and use multithreading in your program to make full use of them!
#SBATCH --gres=gpu:8

### Which partition (queue) the job runs on
#SBATCH --partition=gpu-a800-gsai

### The directives above request resources
### The commands below run on the compute node

### This example uses Python from Anaconda; set up the environment by activating the conda env first
# source activate llm

### Build a DeepSpeed-style hostfile: one "<hostname> slots=8" line per allocated node
nodelist=($(scontrol show hostname $SLURM_NODELIST))
> 65B_v1_hostfile
for i in "${nodelist[@]}"
do
    printf "%s slots=8\n" "$i" >> 65B_v1_hostfile
done

### Run your job
module load cuda/11.8
source activate llm
python ./deepy.py train.py -d jarvis_configs/65B/v1 params.yml setup.yml -H 65B_v1_hostfile

rm 65B_v1_hostfile
run_scripts/tokenize/jarvis_tokenizer_v1/run_tokenize_data_arxiv.sh (41 additions, 0 deletions)
@@ -0,0 +1,41 @@
#!/bin/bash

### Bill this job to your advisor's project group; replace tutor_project with the group name your advisor created
#SBATCH --comment=joint_project

### Give the job a name so that different jobs are easy to tell apart
#SBATCH --job-name="tokenize_jarvis_v1_arxiv"

### How many nodes this job needs
### Note: if the job does not use multi-node parallelism (MPI/NCCL, etc.), set this to 1! Requesting more will not speed the program up!
#SBATCH --nodes=1

### How many CPU cores this job needs
### Note: usually set this according to the queue's core count, e.g. the cpu queue has 64 cores, so request 64 and use multithreading in your program to make full use of them!
#SBATCH --gres=gpu:8

### Which partition (queue) the job runs on
#SBATCH --partition=gpu-a800-gsai

### The directives above request resources
### The commands below run on the compute node

### Activate an Anaconda environment (your_env)
conda activate llm

### Run your job
### This task tokenizes your zst data file:
###   --input is the path to the zst data file
###   --output-prefix is the output directory plus the output file prefix; the generated bin and idx files automatically get a _text_document.bin / .idx suffix
###   --vocab and --merge-file point to the tokenizer files
###   --workers sets the number of worker processes; raise it to speed things up
###   --append-eod means that when multiple docs are merged into one sample, a special EOD token is inserted between docs as a separator
python tools/preprocess_data.py \
    --input /home/share/jarvis/arxiv.jsonl \
    --output-prefix /fs/fast/share/jarvis/tokenized_data/jarvis_v1/arxiv/train \
    --vocab /fs/fast/share/jarvis/tokenizer/jarvis_tokenizer_v1/tokenizer.model \
    --dataset-impl mmap \
    --tokenizer-type LlamaTokenizer \
    --append-eod \
    --workers=64
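The --append-eod flag explained in the comments above separates documents with an end-of-document token when they are packed into a single token stream. Below is a small Python illustration of that packing idea, using a hypothetical EOD id and a stand-in encode function rather than the real preprocess_data.py internals:

def pack_documents(docs, encode, eod_id):
    """Concatenate tokenized documents, inserting an EOD token after each one."""
    tokens = []
    for doc in docs:
        tokens.extend(encode(doc))
        tokens.append(eod_id)  # what --append-eod adds between documents
    return tokens

# Example with three tiny documents and a made-up EOD id of 2.
packed = pack_documents(
    ["doc one", "doc two", "doc three"],
    encode=lambda s: [ord(c) for c in s],  # stand-in tokenizer for the sketch
    eod_id=2,
)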
run_scripts/tokenize/jarvis_tokenizer_v1/run_tokenize_data_cbook.sh (51 additions, 0 deletions)
@@ -0,0 +1,51 @@
#!/bin/bash

### Bill this job to your advisor's project group; replace tutor_project with the group name your advisor created
#SBATCH --comment=joint_project

### Give the job a name so that different jobs are easy to tell apart
#SBATCH --job-name="tokenize_jarvis_v1_cbook"

### How many nodes this job needs
### Note: if the job does not use multi-node parallelism (MPI/NCCL, etc.), set this to 1! Requesting more will not speed the program up!
#SBATCH --nodes=1

### How many CPU cores this job needs
### Note: usually set this according to the queue's core count, e.g. the cpu queue has 64 cores, so request 64 and use multithreading in your program to make full use of them!
#SBATCH --gres=gpu:8

### Which partition (queue) the job runs on
#SBATCH --partition=gpu-a800-gsai

### The directives above request resources
### The commands below run on the compute node

### Activate an Anaconda environment (your_env)
conda activate llm

### Run your job
### This task tokenizes your zst data file:
###   --input is the path to the zst data file
###   --output-prefix is the output directory plus the output file prefix; the generated bin and idx files automatically get a _text_document.bin / .idx suffix
###   --vocab and --merge-file point to the tokenizer files
###   --workers sets the number of worker processes; raise it to speed things up
###   --append-eod means that when multiple docs are merged into one sample, a special EOD token is inserted between docs as a separator
python /home/share/gsai_joint_project/llama_train/gpt-neox-main/tools/preprocess_data.py \
    --input /fs/fast/share/jarvis/cbook/cbook-train.jsonl \
    --output-prefix /fs/fast/share/jarvis/tokenized_data/jarvis_v1/cbook/train \
    --vocab /fs/fast/share/jarvis/tokenizer/jarvis_tokenizer_v1/tokenizer.model \
    --dataset-impl mmap \
    --tokenizer-type LlamaTokenizer \
    --append-eod \
    --workers=64

python /home/share/gsai_joint_project/llama_train/gpt-neox-main/tools/preprocess_data.py \
    --input /fs/fast/share/jarvis/cbook/cbook-val.jsonl \
    --output-prefix /fs/fast/share/jarvis/tokenized_data/jarvis_v1/cbook/val \
    --vocab /fs/fast/share/jarvis/tokenizer/jarvis_tokenizer_v1/tokenizer.model \
    --dataset-impl mmap \
    --tokenizer-type LlamaTokenizer \
    --append-eod \
    --workers=64