forked from EleutherAI/gpt-neox
Commit
65B v1 config | jarvis_v1_tokenize_scripts | fix llama tokenizer encoding for tokenization preprocessing so that BOS is enabled | modify custom checkpoint loading function for llama (65B), still needs refinement
Showing 35 changed files with 911 additions and 67 deletions.
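The tokenizer fix mentioned in the commit message is about making sure a BOS token is prepended when documents are encoded during preprocessing. As a rough orientation only, here is a minimal sketch of that idea, assuming a SentencePiece-based tokenizer; the class and method names are illustrative and are not taken from this diff:

import sentencepiece as spm

class LlamaTokenizerSketch:
    """Illustrative wrapper only; not the actual class touched by this commit."""

    def __init__(self, model_path):
        self.sp = spm.SentencePieceProcessor(model_file=model_path)

    def tokenize(self, text, prepend_bos=True):
        ids = self.sp.encode(text, out_type=int)
        # The behaviour the commit message describes: prepend the BOS id
        # when a document is tokenized during preprocessing.
        if prepend_bos:
            ids = [self.sp.bos_id()] + ids
        return ids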
@@ -0,0 +1,30 @@
#!/bin/bash

### Bill this job to your advisor's project group; replace tutor_project with the group name your advisor created
#SBATCH --comment=joint_project

### Give the job a name so that different jobs are easy to tell apart
#SBATCH --job-name=convert_raw_llama_to_neox

### How many nodes this job needs
### Note: if the job does not use multi-node parallelism (MPI/NCCL, etc.), set this to 1! Requesting more will not speed the program up!
#SBATCH --nodes=1

### How many CPU cores this job needs
### Note: usually set this according to the queue's core count, e.g. the cpu queue has 64 cores, so request 64 and use multithreading in your program to make full use of them!
#SBATCH --gres=gpu:8

### Which partition (queue) the job runs on
#SBATCH --partition=gpu-a800-gsai

### The directives above request resources
### The commands below run on the compute node

### This example uses Python from Anaconda; set up the environment by activating the conda env first
source activate llm

### Run your job
python ./tools/convert_raw_llama_weights_to_neox.py --input_dir /home/share/jarvis/llama-checkpoints --model_size 65B --num_output_shards 8 --output_dir /home/share/jarvis/llama-65B-neox-mp
echo "-----> Task finished..."
@@ -0,0 +1,38 @@
#!/bin/bash

### Bill this job to your advisor's project group; replace tutor_project with the group name your advisor created
#SBATCH --comment=joint_project

### Give the job a name so that different jobs are easy to tell apart
#SBATCH --job-name=train_65B_tmp

### How many nodes this job needs
### Note: if the job does not use multi-node parallelism (MPI/NCCL, etc.), set this to 1! Requesting more will not speed the program up!
#SBATCH --nodes=12
### SBATCH --nodelist=a800-[5,6,7,8]
### How many CPU cores this job needs
### Note: usually set this according to the queue's core count, e.g. the cpu queue has 64 cores, so request 64 and use multithreading in your program to make full use of them!
#SBATCH --gres=gpu:8

### Which partition (queue) the job runs on
#SBATCH --partition=gpu-a800-gsai

### The directives above request resources
### The commands below run on the compute node

### This example uses Python from Anaconda; set up the environment by activating the conda env first
# source activate llm

### Build a DeepSpeed-style hostfile: one "<hostname> slots=8" line per allocated node
nodelist=($(scontrol show hostname $SLURM_NODELIST))
> 65B_tmp_hostfile
for i in "${nodelist[@]}"
do
    printf "%s slots=8\n" "$i" >> 65B_tmp_hostfile
done

### Run your job
module load cuda/11.8
source activate llm
python ./deepy.py train.py -d jarvis_configs/65B/tmp params.yml setup.yml -H 65B_tmp_hostfile

rm 65B_tmp_hostfile
@@ -0,0 +1,38 @@
#!/bin/bash

### Bill this job to your advisor's project group; replace tutor_project with the group name your advisor created
#SBATCH --comment=joint_project

### Give the job a name so that different jobs are easy to tell apart
#SBATCH --job-name=train_65B_v1

### How many nodes this job needs
### Note: if the job does not use multi-node parallelism (MPI/NCCL, etc.), set this to 1! Requesting more will not speed the program up!
#SBATCH --nodes=10
### SBATCH --nodelist=a800-[5,6,7,8]
### How many CPU cores this job needs
### Note: usually set this according to the queue's core count, e.g. the cpu queue has 64 cores, so request 64 and use multithreading in your program to make full use of them!
#SBATCH --gres=gpu:8

### Which partition (queue) the job runs on
#SBATCH --partition=gpu-a800-gsai

### The directives above request resources
### The commands below run on the compute node

### This example uses Python from Anaconda; set up the environment by activating the conda env first
# source activate llm

### Build a DeepSpeed-style hostfile: one "<hostname> slots=8" line per allocated node
nodelist=($(scontrol show hostname $SLURM_NODELIST))
> 65B_v1_hostfile
for i in "${nodelist[@]}"
do
    printf "%s slots=8\n" "$i" >> 65B_v1_hostfile
done

### Run your job
module load cuda/11.8
source activate llm
python ./deepy.py train.py -d jarvis_configs/65B/v1 params.yml setup.yml -H 65B_v1_hostfile

rm 65B_v1_hostfile
run_scripts/tokenize/jarvis_tokenizer_v1/run_tokenize_data_arxiv.sh (41 additions, 0 deletions)
@@ -0,0 +1,41 @@
#!/bin/bash

### Bill this job to your advisor's project group; replace tutor_project with the group name your advisor created
#SBATCH --comment=joint_project

### Give the job a name so that different jobs are easy to tell apart
#SBATCH --job-name="tokenize_jarvis_v1_arxiv"

### How many nodes this job needs
### Note: if the job does not use multi-node parallelism (MPI/NCCL, etc.), set this to 1! Requesting more will not speed the program up!
#SBATCH --nodes=1

### How many CPU cores this job needs
### Note: usually set this according to the queue's core count, e.g. the cpu queue has 64 cores, so request 64 and use multithreading in your program to make full use of them!
#SBATCH --gres=gpu:8

### Which partition (queue) the job runs on
#SBATCH --partition=gpu-a800-gsai

### The directives above request resources
### The commands below run on the compute node

### Activate an Anaconda environment (your_env)
conda activate llm

### Run your job
### This task tokenizes your zst data file:
###   --input is the path to the zst data file
###   --output-prefix is the output directory plus the output file prefix; the generated bin and idx files automatically get a _text_document.bin / .idx suffix
###   --vocab and --merge-file point to the tokenizer files
###   --workers sets the number of worker processes; raise it to speed things up
###   --append-eod means that when multiple docs are merged into one sample, a special EOD token is inserted between docs as a separator
python tools/preprocess_data.py \
    --input /home/share/jarvis/arxiv.jsonl \
    --output-prefix /fs/fast/share/jarvis/tokenized_data/jarvis_v1/arxiv/train \
    --vocab /fs/fast/share/jarvis/tokenizer/jarvis_tokenizer_v1/tokenizer.model \
    --dataset-impl mmap \
    --tokenizer-type LlamaTokenizer \
    --append-eod \
    --workers=64
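The --append-eod flag explained in the comments above separates documents with an end-of-document token when they are packed into a single token stream. Below is a small Python illustration of that packing idea, using a hypothetical EOD id and a stand-in encode function rather than the real preprocess_data.py internals:

def pack_documents(docs, encode, eod_id):
    """Concatenate tokenized documents, inserting an EOD token after each one."""
    tokens = []
    for doc in docs:
        tokens.extend(encode(doc))
        tokens.append(eod_id)  # what --append-eod adds between documents
    return tokens

# Example with three tiny documents and a made-up EOD id of 2.
packed = pack_documents(
    ["doc one", "doc two", "doc three"],
    encode=lambda s: [ord(c) for c in s],  # stand-in tokenizer for the sketch
    eod_id=2,
)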
run_scripts/tokenize/jarvis_tokenizer_v1/run_tokenize_data_cbook.sh (51 additions, 0 deletions)
@@ -0,0 +1,51 @@
#!/bin/bash

### Bill this job to your advisor's project group; replace tutor_project with the group name your advisor created
#SBATCH --comment=joint_project

### Give the job a name so that different jobs are easy to tell apart
#SBATCH --job-name="tokenize_jarvis_v1_cbook"

### How many nodes this job needs
### Note: if the job does not use multi-node parallelism (MPI/NCCL, etc.), set this to 1! Requesting more will not speed the program up!
#SBATCH --nodes=1

### How many CPU cores this job needs
### Note: usually set this according to the queue's core count, e.g. the cpu queue has 64 cores, so request 64 and use multithreading in your program to make full use of them!
#SBATCH --gres=gpu:8

### Which partition (queue) the job runs on
#SBATCH --partition=gpu-a800-gsai

### The directives above request resources
### The commands below run on the compute node

### Activate an Anaconda environment (your_env)
conda activate llm

### Run your job
### This task tokenizes your zst data file:
###   --input is the path to the zst data file
###   --output-prefix is the output directory plus the output file prefix; the generated bin and idx files automatically get a _text_document.bin / .idx suffix
###   --vocab and --merge-file point to the tokenizer files
###   --workers sets the number of worker processes; raise it to speed things up
###   --append-eod means that when multiple docs are merged into one sample, a special EOD token is inserted between docs as a separator
python /home/share/gsai_joint_project/llama_train/gpt-neox-main/tools/preprocess_data.py \
    --input /fs/fast/share/jarvis/cbook/cbook-train.jsonl \
    --output-prefix /fs/fast/share/jarvis/tokenized_data/jarvis_v1/cbook/train \
    --vocab /fs/fast/share/jarvis/tokenizer/jarvis_tokenizer_v1/tokenizer.model \
    --dataset-impl mmap \
    --tokenizer-type LlamaTokenizer \
    --append-eod \
    --workers=64

python /home/share/gsai_joint_project/llama_train/gpt-neox-main/tools/preprocess_data.py \
    --input /fs/fast/share/jarvis/cbook/cbook-val.jsonl \
    --output-prefix /fs/fast/share/jarvis/tokenized_data/jarvis_v1/cbook/val \
    --vocab /fs/fast/share/jarvis/tokenizer/jarvis_tokenizer_v1/tokenizer.model \
    --dataset-impl mmap \
    --tokenizer-type LlamaTokenizer \
    --append-eod \
    --workers=64