modelscope · Jintao-Huang · Dec 14, 2023 · Dec 14, 2023 · Dec 14, 2023 · Dec 14, 2023
diff --git a/README.md b/README.md
@@ -118,7 +118,7 @@ Users can check the [documentation of SWIFT](docs/source/GetStarted/快速使用
  - chatglm series: [chatglm2-6b](https://modelscope.cn/models/ZhipuAI/chatglm2-6b/summary), [chatglm2-6b-32k](https://modelscope.cn/models/ZhipuAI/chatglm2-6b-32k/summary), [chatglm3-6b-base](https://modelscope.cn/models/ZhipuAI/chatglm3-6b-base/summary), [chatglm3-6b](https://modelscope.cn/models/ZhipuAI/chatglm3-6b/summary), [chatglm3-6b-32k](https://modelscope.cn/models/ZhipuAI/chatglm3-6b-32k/summary)
  - baichuan series: [baichuan-7b](https://modelscope.cn/models/baichuan-inc/baichuan-7B/summary), [baichuan-13b](https://modelscope.cn/models/baichuan-inc/Baichuan-13B-Base/summary), [baichuan-13b-chat](https://modelscope.cn/models/baichuan-inc/Baichuan-13B-Chat/summary), [baichuan2-7b](https://modelscope.cn/models/baichuan-inc/Baichuan2-7B-Base/summary), [baichuan2-7b-chat](https://modelscope.cn/models/baichuan-inc/Baichuan2-7B-Chat/summary), [baichuan2-13b](https://modelscope.cn/models/baichuan-inc/Baichuan2-13B-Base/summary), [baichuan2-13b-chat](https://modelscope.cn/models/baichuan-inc/Baichuan2-13B-Chat/summary), [baichuan2-7b-chat-int4](https://modelscope.cn/models/baichuan-inc/Baichuan2-7B-Chat-4bits/summary), [baichuan2-13b-chat-int4](https://modelscope.cn/models/baichuan-inc/Baichuan2-13B-Chat-4bits/summary)
  - llama series: [llama2-7b](https://modelscope.cn/models/modelscope/Llama-2-7b-ms/summary), [llama2-7b-chat](https://modelscope.cn/models/modelscope/Llama-2-7b-chat-ms/summary), [llama2-13b](https://modelscope.cn/models/modelscope/Llama-2-13b-ms/summary), [llama2-13b-chat](https://modelscope.cn/models/modelscope/Llama-2-13b-chat-ms/summary), [llama2-70b](https://modelscope.cn/models/modelscope/Llama-2-70b-ms/summary), [llama2-70b-chat](https://modelscope.cn/models/modelscope/Llama-2-70b-chat-ms/summary)
- - yi series: [yi-6b](https://modelscope.cn/models/01ai/Yi-6B/summary), [yi-6b-200k](https://modelscope.cn/models/01ai/Yi-6B-200K/summary), [yi-34b](https://modelscope.cn/models/01ai/Yi-34B/summary), [yi-34b-200k](https://modelscope.cn/models/01ai/Yi-34B-200K/summary), [yi-34b-chat](https://modelscope.cn/models/01ai/Yi-34B-Chat/summary)
+ - yi series: [yi-6b](https://modelscope.cn/models/01ai/Yi-6B/summary), [yi-6b-200k](https://modelscope.cn/models/01ai/Yi-6B-200K/summary), [yi-6b-chat](https://modelscope.cn/models/01ai/Yi-6B-Chat/summary), [yi-34b](https://modelscope.cn/models/01ai/Yi-34B/summary), [yi-34b-200k](https://modelscope.cn/models/01ai/Yi-34B-200K/summary), [yi-34b-chat](https://modelscope.cn/models/01ai/Yi-34B-Chat/summary)
  - openbuddy series: [openbuddy-llama2-13b-chat](https://modelscope.cn/models/OpenBuddy/openbuddy-llama2-13b-v8.1-fp16/summary), [openbuddy-llama-65b-chat](https://modelscope.cn/models/OpenBuddy/openbuddy-llama-65b-v8-bf16/summary), [openbuddy-llama2-70b-chat](https://modelscope.cn/models/OpenBuddy/openbuddy-llama2-70b-v10.1-bf16/summary), [openbuddy-mistral-7b-chat](https://modelscope.cn/models/OpenBuddy/openbuddy-mistral-7b-v13.1/summary), [openbuddy-zephyr-7b-chat](https://modelscope.cn/models/OpenBuddy/openbuddy-zephyr-7b-v14.1/summary)
  - internlm series: [internlm-7b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-7b/summary), [internlm-7b-chat](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-chat-7b-v1_1/summary), [internlm-7b-chat-8k](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-chat-7b-8k/summary), [internlm-20b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-20b/summary), [internlm-20b-chat](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-chat-20b/summary)
  - xverse series: [xverse-7b](https://modelscope.cn/models/xverse/XVERSE-7B/summary), [xverse-7b-chat](https://modelscope.cn/models/xverse/XVERSE-7B-Chat/summary), [xverse-13b](https://modelscope.cn/models/xverse/XVERSE-13B/summary), [xverse-13b-chat](https://modelscope.cn/models/xverse/XVERSE-13B-Chat/summary), [xverse-65b](https://modelscope.cn/models/xverse/XVERSE-65B/summary)

diff --git a/README_CN.md b/README_CN.md
@@ -116,7 +116,7 @@ SWIFT（Scalable lightWeight Infrastructure for Fine-Tuning）是一个可扩展
  - chatglm 系列: [chatglm2-6b](https://modelscope.cn/models/ZhipuAI/chatglm2-6b/summary), [chatglm2-6b-32k](https://modelscope.cn/models/ZhipuAI/chatglm2-6b-32k/summary), [chatglm3-6b-base](https://modelscope.cn/models/ZhipuAI/chatglm3-6b-base/summary), [chatglm3-6b](https://modelscope.cn/models/ZhipuAI/chatglm3-6b/summary), [chatglm3-6b-32k](https://modelscope.cn/models/ZhipuAI/chatglm3-6b-32k/summary)
  - baichuan 系列: [baichuan-7b](https://modelscope.cn/models/baichuan-inc/baichuan-7B/summary), [baichuan-13b](https://modelscope.cn/models/baichuan-inc/Baichuan-13B-Base/summary), [baichuan-13b-chat](https://modelscope.cn/models/baichuan-inc/Baichuan-13B-Chat/summary), [baichuan2-7b](https://modelscope.cn/models/baichuan-inc/Baichuan2-7B-Base/summary), [baichuan2-7b-chat](https://modelscope.cn/models/baichuan-inc/Baichuan2-7B-Chat/summary), [baichuan2-13b](https://modelscope.cn/models/baichuan-inc/Baichuan2-13B-Base/summary), [baichuan2-13b-chat](https://modelscope.cn/models/baichuan-inc/Baichuan2-13B-Chat/summary), [baichuan2-7b-chat-int4](https://modelscope.cn/models/baichuan-inc/Baichuan2-7B-Chat-4bits/summary), [baichuan2-13b-chat-int4](https://modelscope.cn/models/baichuan-inc/Baichuan2-13B-Chat-4bits/summary)
  - llama 系列: [llama2-7b](https://modelscope.cn/models/modelscope/Llama-2-7b-ms/summary), [llama2-7b-chat](https://modelscope.cn/models/modelscope/Llama-2-7b-chat-ms/summary), [llama2-13b](https://modelscope.cn/models/modelscope/Llama-2-13b-ms/summary), [llama2-13b-chat](https://modelscope.cn/models/modelscope/Llama-2-13b-chat-ms/summary), [llama2-70b](https://modelscope.cn/models/modelscope/Llama-2-70b-ms/summary), [llama2-70b-chat](https://modelscope.cn/models/modelscope/Llama-2-70b-chat-ms/summary)
- - yi 系列: [yi-6b](https://modelscope.cn/models/01ai/Yi-6B/summary), [yi-6b-200k](https://modelscope.cn/models/01ai/Yi-6B-200K/summary), [yi-34b](https://modelscope.cn/models/01ai/Yi-34B/summary), [yi-34b-200k](https://modelscope.cn/models/01ai/Yi-34B-200K/summary), [yi-34b-chat](https://modelscope.cn/models/01ai/Yi-34B-Chat/summary)
+ - yi 系列: [yi-6b](https://modelscope.cn/models/01ai/Yi-6B/summary), [yi-6b-200k](https://modelscope.cn/models/01ai/Yi-6B-200K/summary), [yi-6b-chat](https://modelscope.cn/models/01ai/Yi-6B-Chat/summary), [yi-34b](https://modelscope.cn/models/01ai/Yi-34B/summary), [yi-34b-200k](https://modelscope.cn/models/01ai/Yi-34B-200K/summary), [yi-34b-chat](https://modelscope.cn/models/01ai/Yi-34B-Chat/summary)
  - openbuddy 系列: [openbuddy-llama2-13b-chat](https://modelscope.cn/models/OpenBuddy/openbuddy-llama2-13b-v8.1-fp16/summary), [openbuddy-llama-65b-chat](https://modelscope.cn/models/OpenBuddy/openbuddy-llama-65b-v8-bf16/summary), [openbuddy-llama2-70b-chat](https://modelscope.cn/models/OpenBuddy/openbuddy-llama2-70b-v10.1-bf16/summary), [openbuddy-mistral-7b-chat](https://modelscope.cn/models/OpenBuddy/openbuddy-mistral-7b-v13.1/summary), [openbuddy-zephyr-7b-chat](https://modelscope.cn/models/OpenBuddy/openbuddy-zephyr-7b-v14.1/summary)
  - internlm 系列: [internlm-7b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-7b/summary), [internlm-7b-chat](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-chat-7b-v1_1/summary), [internlm-7b-chat-8k](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-chat-7b-8k/summary), [internlm-20b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-20b/summary), [internlm-20b-chat](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-chat-20b/summary)
  - xverse 系列: [xverse-7b](https://modelscope.cn/models/xverse/XVERSE-7B/summary), [xverse-7b-chat](https://modelscope.cn/models/xverse/XVERSE-7B-Chat/summary), [xverse-13b](https://modelscope.cn/models/xverse/XVERSE-13B/summary), [xverse-13b-chat](https://modelscope.cn/models/xverse/XVERSE-13B-Chat/summary), [xverse-65b](https://modelscope.cn/models/xverse/XVERSE-65B/summary)

diff --git a/docs/source/LLM/支持的模型和数据集.md b/docs/source/LLM/支持的模型和数据集.md
@@ -77,6 +77,7 @@
 |mistral-7b-chat|[AI-ModelScope/Mistral-7B-Instruct-v0.1](https://modelscope.cn/models/AI-ModelScope/Mistral-7B-Instruct-v0.1/summary)|q_proj, k_proj, v_proj|llama|&#x2714;|transformers>=4.34|
 |yi-6b|[01ai/Yi-6B](https://modelscope.cn/models/01ai/Yi-6B/summary)|q_proj, k_proj, v_proj|default-generation|&#x2714;||
 |yi-6b-200k|[01ai/Yi-6B-200K](https://modelscope.cn/models/01ai/Yi-6B-200K/summary)|q_proj, k_proj, v_proj|default-generation|&#x2714;||
+|yi-6b-chat|[01ai/Yi-6B-Chat](https://modelscope.cn/models/01ai/Yi-6B-Chat/summary)|q_proj, k_proj, v_proj|yi|&#x2714;||
 |yi-34b|[01ai/Yi-34B](https://modelscope.cn/models/01ai/Yi-34B/summary)|q_proj, k_proj, v_proj|default-generation|&#x2714;||
 |yi-34b-200k|[01ai/Yi-34B-200K](https://modelscope.cn/models/01ai/Yi-34B-200K/summary)|q_proj, k_proj, v_proj|default-generation|&#x2714;||
 |yi-34b-chat|[01ai/Yi-34B-Chat](https://modelscope.cn/models/01ai/Yi-34B-Chat/summary)|q_proj, k_proj, v_proj|yi|&#x2714;||

diff --git a/requirements/framework.txt b/requirements/framework.txt
@@ -13,4 +13,4 @@ rouge
 safetensors
 tensorboard
 tqdm
-transformers>=4.33
+transformers>=4.33,<4.37
diff --git a/swift/llm/sft.py b/swift/llm/sft.py
@@ -257,6 +257,8 @@ def llm_sft(args: SftArguments) -> str:
  acc_strategy=args.acc_strategy)
 
  if args.gradient_checkpointing:
+ model.config.use_cache = False # fix transformers==4.36: turn off the KV cache while gradient checkpointing is enabled
+ logger.info('Setting model.config.use_cache: False')
  model.enable_input_require_grads()
  if is_dist():
  # Compatible with https://github.com/huggingface/transformers/pull/25903

diff --git a/swift/llm/utils/model.py b/swift/llm/utils/model.py
@@ -113,6 +113,7 @@ class ModelType:
  # yi
  yi_6b = 'yi-6b'
  yi_6b_200k = 'yi-6b-200k'
+ yi_6b_chat = 'yi-6b-chat'
  yi_34b = 'yi-34b'
  yi_34b_200k = 'yi-34b-200k'
  yi_34b_chat = 'yi-34b-chat'
@@ -630,12 +631,22 @@ def get_model_tokenizer_with_flash_attn(model_dir: str,
  if model_config is None:
  model_config = AutoConfig.from_pretrained(
  model_dir, trust_remote_code=True)
- _flash_attn_2_enabled = kwargs.pop('use_flash_attn', False)
- model_config._flash_attn_2_enabled = _flash_attn_2_enabled
+ use_flash_attn = kwargs.pop('use_flash_attn', False)
+ if version.parse(transformers.__version__) >= version.parse('4.36'):
+ if use_flash_attn:
+ model_config._attn_implementation = 'flash_attention_2'
+ else:
+ model_config._flash_attn_2_enabled = use_flash_attn
  return get_model_tokenizer_from_repo(model_dir, torch_dtype, model_kwargs,
  load_model, model_config, **kwargs)
 
 
+@register_model(
+ ModelType.yi_6b_chat,
+ '01ai/Yi-6B-Chat',
+ LoRATM.yi,
+ TemplateType.yi,
+ support_flash_attn=True)
 @register_model(
  ModelType.yi_34b_chat,
  '01ai/Yi-34B-Chat',

diff --git a/swift/llm/utils/utils.py b/swift/llm/utils/utils.py
@@ -2,18 +2,17 @@
 # Part of the implementation is borrowed from huggingface/transformers.
 import heapq
 import logging
-import multiprocessing
 import os
 import shutil
 import time
-from functools import wraps
-from multiprocessing.pool import AsyncResult
+from functools import partial, wraps
 from queue import Empty, Queue
 from tempfile import TemporaryDirectory
 from typing import (Any, Callable, Dict, Iterator, List, Optional, Tuple,
  TypeVar, Union)
 
 import accelerate
+import multiprocess
 import numpy as np
 import requests
 import torch
@@ -218,10 +217,10 @@ def _map_mp_single(subset: HfDataset, map_func: MapFunc, queue: Queue,
 
 def _map_mp_i(dataset: HfDataset, map_func: MapFunc,
  num_proc: int) -> Iterator[Tuple[int, Dict[str, Any]]]:
- with multiprocessing.Pool(
- num_proc) as pool, multiprocessing.Manager() as manager:
+ with multiprocess.Pool(
+ num_proc) as pool, multiprocess.Manager() as manager:
  queue = manager.Queue()
- async_results: List[AsyncResult] = []
+ async_results = []
  split_idx = np.linspace(0, len(dataset), num_proc + 1, dtype=np.int32)
  for i in range(num_proc):
  subset = dataset.select(range(split_idx[i], split_idx[i + 1]))
@@ -251,14 +250,15 @@ def _map_mp(dataset: HfDataset, map_func: MapFunc,
 def dataset_map(dataset: HfDataset,
  map_func: MapFunc,
  num_proc: int = 1) -> LLMDataset:
+ single_map = partial(_single_map, map_func=map_func)
  if num_proc == 1:
  data = []
  for d in tqdm(dataset):
- d = _single_map(d, map_func)
+ d = single_map(d)
  data.append(d)
  else:
  assert num_proc > 1
- data = _map_mp(dataset, map_func, num_proc)
+ data = _map_mp(dataset, single_map, num_proc)
  data = [d for d in data if d is not None]
  if len(data) == 0:
  logger.info('len(dataset): 0')

diff --git a/swift/trainers/mixin.py b/swift/trainers/mixin.py
@@ -352,10 +352,8 @@ def _save_sft_args(self, output_dir: str) -> None:
  def _save(self, output_dir: Optional[str] = None, state_dict=None):
  """Compatible with swift and peft"""
  # If we are executing this function, we are the process zero, so we don't check for that.
- self.state.last_model_checkpoint = output_dir
  output_dir = output_dir if output_dir is not None else self.args.output_dir
  os.makedirs(output_dir, exist_ok=True)
- logger.info(f'Saving model checkpoint to {output_dir}')
  # configuration.json
  model_dir = getattr(self.model, 'model_dir', None)
  if model_dir is not None:
@@ -421,6 +419,10 @@ def _save(self, output_dir: Optional[str] = None, state_dict=None):
  shutil.copy(src_path, dst_path)
 
  def _save_checkpoint(self, model, trial, metrics=None):
+ self.state.last_model_checkpoint = os.path.join(
+ self.args.output_dir, f'checkpoint-{self.state.global_step}')
+ logger.info(
+ f'Saving model checkpoint to {self.state.last_model_checkpoint}')
  only_save_model = self.args.only_save_model
  if only_save_model:
  return self._only_save_model(model, trial, metrics)