Support Yi-6b sft (#134)

modelscope · Nov 4, 2023 · 0b3f840 · 0b3f840
1 parent 0bfc662
commit 0b3f840
Show file tree

Hide file tree

Showing 4 changed files with 59 additions and 0 deletions.
diff --git a/examples/pytorch/llm/scripts/yi_6b/lora/infer.sh b/examples/pytorch/llm/scripts/yi_6b/lora/infer.sh
@@ -0,0 +1,15 @@
+# Experimental environment: A10
+PYTHONPATH=../../.. \
+CUDA_VISIBLE_DEVICES=0 \
+python llm_infer.py \
+ --ckpt_dir "output/yi-6b/vx_xxx/checkpoint-xxx" \
+ --load_args_from_ckpt_dir true \
+ --eval_human false \
+ --max_length 256 \
+ --max_new_tokens 256 \
+ --temperature 0.9 \
+ --top_k 20 \
+ --top_p 0.9 \
+ --repetition_penalty 1.05 \
+ --do_sample true \
+ --merge_lora_and_save false \
diff --git a/examples/pytorch/llm/scripts/yi_6b/lora/sft.sh b/examples/pytorch/llm/scripts/yi_6b/lora/sft.sh
@@ -0,0 +1,36 @@
+# Experimental environment: A10
+# 15GB GPU memory
+PYTHONPATH=../../.. \
+CUDA_VISIBLE_DEVICES=0 \
+python llm_sft.py \
+ --model_id_or_path 01ai/Yi-6B \
+ --model_revision master \
+ --sft_type lora \
+ --tuner_backend swift \
+ --template_type default-generation \
+ --dtype bf16 \
+ --output_dir output \
+ --dataset dureader-robust-zh \
+ --train_dataset_sample -1 \
+ --num_train_epochs 1 \
+ --max_length 2048 \
+ --check_dataset_strategy warning \
+ --lora_rank 8 \
+ --lora_alpha 32 \
+ --lora_dropout_p 0.05 \
+ --lora_target_modules ALL \
+ --gradient_checkpointing true \
+ --batch_size 1 \
+ --weight_decay 0.01 \
+ --learning_rate 1e-4 \
+ --gradient_accumulation_steps 16 \
+ --max_grad_norm 0.5 \
+ --warmup_ratio 0.03 \
+ --eval_steps 100 \
+ --save_steps 100 \
+ --save_total_limit 2 \
+ --logging_steps 10 \
+ --push_to_hub false \
+ --hub_model_id yi-6b-qlora \
+ --hub_private_repo true \
+ --hub_token 'your-sdk-token' \
diff --git a/swift/llm/utils/model.py b/swift/llm/utils/model.py
@@ -92,6 +92,8 @@ class ModelType:
  # other
  polylm_13b = 'polylm-13b'
  seqgpt_560m = 'seqgpt-560m'
+ yi_6b = 'yi-6b'
+ yi_34b = 'yi-34b'
 
 
 class LoRATM(NamedTuple):
@@ -106,6 +108,7 @@ class LoRATM(NamedTuple):
  xverse = ['q_proj', 'k_proj', 'v_proj']
  mistral = ['q_proj', 'k_proj', 'v_proj']
  ziya = ['q_proj', 'k_proj', 'v_proj']
+ yi = ['q_proj', 'k_proj', 'v_proj']
 
 
 GetModelTokenizerFunction = Callable[..., Tuple[Optional[PreTrainedModel],
@@ -169,6 +172,10 @@ def _register_model(
  return _register_model
 
 
+@register_model(ModelType.yi_34b, '01ai/Yi-34B', LoRATM.yi,
+ TemplateType.default_generation)
+@register_model(ModelType.yi_6b, '01ai/Yi-6B', LoRATM.yi,
+ TemplateType.default_generation)
 @register_model(ModelType.seqgpt_560m, 'damo/nlp_seqgpt-560m', LoRATM.bloom,
  TemplateType.default_generation)
 @register_model(ModelType.ziya2_13b_chat, 'Fengshenbang/Ziya2-13B-Chat',

diff --git a/swift/llm/utils/template.py b/swift/llm/utils/template.py
@@ -22,6 +22,7 @@ class TemplateType:
  xverse = 'xverse'
  ziya = 'ziya'
  skywork = 'skywork'
+ yi = 'yi'
 
 
 Prompt = List[Union[str, List[Union[str, int]]]]