Skip to content

Commit

Permalink
downloader; quick start.
Browse files Browse the repository at this point in the history
  • Loading branch information
Judd committed Jun 19, 2024
1 parent dc581d7 commit 8bdacf8
Show file tree
Hide file tree
Showing 5 changed files with 267 additions and 3 deletions.
6 changes: 5 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
Inference of a bunch of models from less than 1B to more than 300B, for real-time chatting with [RAG](./docs/rag.md) on your computer (CPU),
pure C++ implementation based on [@ggerganov](https://github.com/ggerganov)'s [ggml](https://github.com/ggerganov/ggml).

| [Supported Models](./docs/models.md) | [Download Quantized Models](https://modelscope.cn/models/judd2024/chatllm_quantized_models) |
| [Supported Models](./docs/models.md) | [Download Quantized Models](./docs/quick_start.md#download-quantized-models) |

**What's New:**

Expand Down Expand Up @@ -44,6 +44,10 @@ pure C++ implementation based on [@ggerganov](https://github.com/ggerganov)'s [g
* [x] [LoRA](./docs/models.md#lora-models);
* [x] Python/JavaScript/C [Bindings](./docs/binding.md), web demo, and more possibilities.

## Quick Start

As simple as `python chatllm.py -i -m :model_id`. [Check it out](./docs/quick_start.md).

## Usage

### Preparation
Expand Down
6 changes: 5 additions & 1 deletion README_zh.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

在计算机(CPU)上实时聊天,可 [检索增强生成](./docs/rag.md) 。支持从不到 1B 到超过 300B 的一系列模型的推理。基于 [@ggerganov](https://github.com/ggerganov)[ggml](https://github.com/ggerganov/ggml),纯 C++ 实现。

| [支持的模型](./docs/models.md) | [下载量化模型](https://modelscope.cn/models/judd2024/chatllm_quantized_models) |
| [支持的模型](./docs/models.md) | [下载量化模型](./docs/quick_start.md#download-quantized-models) |

## 特点

Expand All @@ -24,6 +24,10 @@
- [x] LoRA
- [x] Python/JavaScript/C [绑定](./docs/binding.md),网页演示,以及更多可能性。

## 快速开始

只需要简单一行 `python chatllm.py -i -m :model_id`. 查看 [详情](./docs/quick_start.md).

## 使用方法

#### 准备工作
Expand Down
14 changes: 13 additions & 1 deletion bindings/chatllm.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,12 @@
import threading
from typing import Any, Iterable, List, Union

# Make the downloader importable both when it sits next to this file and
# when it lives in the sibling ``scripts`` directory (release layout).
try:
    import model_downloader
except ImportError:  # narrowed from a bare `except:` — don't mask unrelated errors
    this_dir = os.path.dirname(os.path.abspath(sys.argv[0]))
    sys.path.append(os.path.join(this_dir, '..', 'scripts'))
    import model_downloader

class PrintType(IntEnum):
PRINT_CHAT_CHUNK = 0,
Expand All @@ -20,10 +26,13 @@ class LibChatLLM:
_obj2id = {}
_id2obj = {}

def __init__(self, lib: str = '') -> None:
def __init__(self, lib: str = '', model_storage: str = '') -> None:

if lib == '':
lib = os.path.dirname(os.path.abspath(sys.argv[0]))
self._lib_path = lib
self.model_storage = os.path.abspath(model_storage if model_storage != '' else os.path.join(lib, '..', 'quantized'))

lib = os.path.join(lib, 'libchatllm.')
if sys.platform == 'win32':
lib = lib + 'dll'
Expand Down Expand Up @@ -120,6 +129,9 @@ def alloc_id_for_obj(self, obj: Any) -> int:
def append_param(self, obj: c_void_p, param: Union[str, List[str]]) -> None:
    """Append command-line style parameters to a native model object.

    `param` may be a single string or a list of strings.  A model-id
    argument (``-m :model_id``) is resolved by the downloader into a
    local file path before being forwarded to the native library.
    """
    if isinstance(param, str):
        param = [param]

    # Bug fix: a stray `return` here made everything below unreachable,
    # so parameters were never forwarded to the native library.
    param = model_downloader.preprocess_args(param, self.model_storage)
    for s in param:
        self._chatllm_append_param(obj, c_char_p(s.encode()))

Expand Down
30 changes: 30 additions & 0 deletions docs/quick_start.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
## Quick Start

For Windows users, the easiest way is to download a release, extract it, and start chatting:

```
python chatllm.py -i -m :qwen2:0.5b
downloading qwen2:0.5b
|████████████████████████████████████████████████████████████| 100.0%
________ __ __ __ __ ___ (通义千问)
/ ____/ /_ ____ _/ /_/ / / / / |/ /_________ ____
/ / / __ \/ __ `/ __/ / / / / /|_/ // ___/ __ \/ __ \
/ /___/ / / / /_/ / /_/ /___/ /___/ / / // /__/ /_/ / /_/ /
\____/_/ /_/\__,_/\__/_____/_____/_/ /_(_)___/ .___/ .___/
You are served by QWen2, /_/ /_/
with 494032768 (0.5B) parameters.
You > hi
A.I. > Hello! How can I assist you today?
You >
```

For Linux/MacOS (and Windows) users, build [binding](binding.md) and start chatting.

### Download Quantized Models

A [script](../scripts/model_downloader.py) is provided, which can download some quantized models on demand.
When a model name starting with `:` is given to the `-m` option (as shown in the above example), this script will
treat it as a model ID and try to download it if the file does not exist.

Use `python model_downloader.py` to check all quantized models.
214 changes: 214 additions & 0 deletions scripts/model_downloader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,214 @@
import requests
import os

def model_on_modelscope(proj: str, fn: str) -> dict:
    """Build a download descriptor (file name + URL) for a file hosted
    in the judd2024 namespace on ModelScope."""
    base = "https://modelscope.cn/api/v1/models/judd2024"
    return {
        'fn': fn,
        'url': f"{base}/{proj}/repo?Revision=master&FilePath={fn}",
    }

# Registry of downloadable quantized models.
# Layout: model name -> {
#     'default':  name of the default variant,
#     'brief':    one-line human-readable description,
#     'variants': variant name -> {
#         'default':   name of the default quantization,
#         'quantized': quantization name -> {'fn': ..., 'url': ...},
#     },
# }
all_models = {
    'qwen2': {
        'default': '1.5b',
        'brief': 'Qwen2 is a new series of large language models from Alibaba group.',
        'variants': {
            '7b': {
                'default': 'q8',
                'quantized': {
                    'q8': model_on_modelscope('chatllm_quantized_qwen2', 'qwen2-7b.bin')
                }
            },
            '1.5b': {
                'default': 'q8',
                'quantized': {
                    'q8': model_on_modelscope('chatllm_quantized_qwen2', 'qwen2-1.5b.bin')
                }
            },
            '0.5b': {
                'default': 'q8',
                'quantized': {
                    'q8': model_on_modelscope('chatllm_quantized_qwen2', 'qwen2-0.5b.bin')
                }
            },
        }
    },
    'gemma': {
        'default': '2b',
        'brief': 'Gemma is a family of lightweight, state-of-the-art open models built by Google DeepMind. Updated to version 1.1.',
        'variants': {
            '2b': {
                'default': 'q8',
                'quantized': {
                    'q8': model_on_modelscope('chatllm_quantized_models', 'gemma-1.1-2b.bin')
                }
            },
        }
    },
    'llama3': {
        'default': '8b',
        'brief': 'Meta Llama 3: The most capable openly available LLM to date.',
        'variants': {
            '8b': {
                'default': 'q4_1',
                'quantized': {
                    'q4_1': model_on_modelscope('chatllm_quantized_models', 'llama3-8b-q4_1.bin')
                }
            },
        }
    },
    'minicpm': {
        'default': '2b-sft',
        # Bug fix: the brief was a copy-paste of the Llama-3 description.
        'brief': 'MiniCPM is an end-side LLM developed by ModelBest Inc. and TsinghuaNLP.',
        'variants': {
            '2b-sft': {
                'default': 'q8',
                'quantized': {
                    'q8': model_on_modelscope('chatllm_quantized_models', 'minicpm_sft_q8.bin')
                }
            },
            '2b-dpo': {
                'default': 'q4_1',
                'quantized': {
                    'q4_1': model_on_modelscope('chatllm_quantized_models', 'minicpm-dpo-q4_1.bin')
                }
            },
        }
    },
    'qwen1.5': {
        'default': 'moe',
        'brief': 'Qwen1.5 is the beta version of Qwen2 from Alibaba group.',
        'variants': {
            '1.8b': {
                'default': 'q8',
                'quantized': {
                    'q8': model_on_modelscope('chatllm_quantized_models', 'qwen1.5-1.8b.bin')
                }
            },
            'moe': {
                'default': 'q4_1',
                'quantized': {
                    'q4_1': model_on_modelscope('chatllm_quantized_models', 'qwen1.5-moe-q4_1.bin')
                }
            },
        }
    },
    'qanything': {
        'default': '7b',
        'brief': 'QAnything is a local knowledge base question-answering system based on QwenLM.',
        'variants': {
            '7b': {
                'default': 'q4_1',
                'quantized': {
                    'q4_1': model_on_modelscope('chatllm_quantized_models', 'qwen-qany-7b-q4_1.bin')
                }
            },
        }
    },
    'starling-lm': {
        'default': '7b',
        'brief': 'Starling is a large language model trained by reinforcement learning from AI feedback focused on improving chatbot helpfulness.',
        'variants': {
            '7b': {
                'default': 'q4_1',
                'quantized': {
                    'q4_1': model_on_modelscope('chatllm_quantized_models', 'starling-7b-q4_1.bin')
                }
            },
        }
    },
    'yi-1': {
        'default': '34b',
        'brief': 'Yi (v1) is a high-performing, bilingual language model.',
        'variants': {
            '34b': {
                'default': 'q4_1',
                'quantized': {
                    'q4_1': model_on_modelscope('chatllm_quantized_models', 'yi-34b-q4.bin')
                }
            },
        }
    },
}

def print_progress_bar(iteration, total, prefix='', suffix='', decimals=1,
                       length=60, fill='█', printEnd="\r", auto_nl=True):
    """Render a single-line textual progress bar on stdout.

    `iteration` is the work done so far and `total` the expected amount.
    The line is redrawn in place via `printEnd="\\r"`; a newline is
    emitted once `iteration` reaches `total` (unless `auto_nl` is False).
    """
    # Bug fix: a zero `total` (e.g. a server that omits content-length)
    # previously raised ZeroDivisionError.
    safe_total = total if total > 0 else 1
    percent = ("{0:." + str(decimals) + "f}").format(100 * (iteration / float(safe_total)))
    filledLength = int(length * iteration // safe_total)
    bar = fill * filledLength + '-' * (length - filledLength)
    print(f'\r{prefix} |{bar}| {percent}% {suffix}', end=printEnd)
    if (iteration == total) and auto_nl:
        print()

def download_file(url: str, fn: str, prefix: str) -> bool:
    """Stream `url` into local file `fn`, drawing a progress bar.

    Returns True on apparent success: either the byte count matches the
    server-reported content length, or the server did not report one.
    Raises `requests.HTTPError` on a non-2xx response.
    """
    print(f"downloading {prefix}")
    progress = 0
    total = 0
    with open(fn, 'wb') as f:
        with requests.get(url, stream=True) as r:
            r.raise_for_status()
            total = int(r.headers.get('content-length', 0))

            for chunk in r.iter_content(chunk_size=8192):
                progress += len(chunk)
                f.write(chunk)
                # When content-length is unknown (total == 0), feed the bar
                # a moving target so it never divides by zero.
                print_progress_bar(progress, total if total > 0 else progress)

    # Bug fix: previously an unknown content-length made the function
    # report failure even after a complete download.
    return total == 0 or progress == total

def show():
    """Print every known model: its brief, available variants, and the
    default variant when there is more than one."""
    def describe(name: str) -> None:
        entry = all_models[name]
        print(f"**{name}**: {entry['brief']}")
        tags = [f"{name}:{size}" for size in entry['variants']]
        print(f"Available: {', '.join(tags)}")
        if len(tags) > 1:
            print(f"Default : {name}:{entry['default']}")
        print()

    for name in all_models:
        describe(name)

def parse_model_id(model_id: str, models: dict = None) -> dict:
    """Resolve a model ID of the form ``name``, ``name:variant`` or
    ``name:variant:quant`` to its download descriptor (``{'fn', 'url'}``).

    Omitted parts fall back to the model's default variant and the
    variant's default quantization.  `models` defaults to the module-level
    `all_models` registry; passing a dict makes the function testable.

    Raises KeyError for unknown model/variant/quantization names.
    """
    models = all_models if models is None else models
    parts = model_id.split(':')
    model = models[parts[0]]
    variants = model['variants']
    # Bug fix: the default variant name is stored on the model entry
    # (`model['default']`); `variants['default']` looked up a variant
    # literally named "default" and raised KeyError.
    var = variants[parts[1]] if len(parts) >= 2 else variants[model['default']]
    quant = parts[2] if len(parts) >= 3 else var['default']
    return var['quantized'][quant]

def get_model(model_id, storage_dir):
    """Return the local path of the quantized model file for `model_id`,
    downloading it into `storage_dir` on demand.

    Raises RuntimeError when the storage directory cannot be used or the
    download is incomplete; KeyError for an unknown model ID.
    """
    # Bug fix: `os.mkdir` fails when parent directories are missing;
    # `makedirs(..., exist_ok=True)` also removes the exists/create race.
    os.makedirs(storage_dir, exist_ok=True)
    if not os.path.isdir(storage_dir):
        raise RuntimeError(f"{storage_dir} is invalid")

    info = parse_model_id(model_id)
    fn = os.path.join(storage_dir, info['fn'])
    if os.path.isfile(fn):
        return fn

    # Explicit raise instead of `assert`: asserts vanish under `python -O`.
    if not download_file(info['url'], fn, model_id):
        raise RuntimeError(f"failed to download {model_id}")

    return fn

def find_index(l: list, x) -> int:
    """Return the index of `x` in `l`, or -1 when `x` is absent."""
    try:
        return l.index(x)
    except ValueError:
        return -1

def preprocess_args(args: list[str], storage_dir) -> list[str]:
    """Resolve a ``:model_id`` value of the ``-m``/``--model`` option into
    a local model file path, downloading the model on demand.

    The list is modified in place and returned.  It passes through
    unchanged when no model option is present, when the option has no
    value, or when the value is already a plain file path.
    """
    # Prefer `-m`; fall back to `--model`.
    i = next((j for j, a in enumerate(args) if a == '-m'), -1)
    if i < 0:
        i = next((j for j, a in enumerate(args) if a == '--model'), -1)
    if i < 0:
        return args
    # Bug fix: guard against the option being the last argument
    # (previously raised IndexError on `args[i + 1]`).
    if i + 1 >= len(args):
        return args
    if args[i + 1].startswith(':'):
        args[i + 1] = get_model(args[i + 1][1:], storage_dir)

    return args

# Invoked directly: list every model ID that the downloader can fetch.
if __name__ == '__main__':
    show()

0 comments on commit 8bdacf8

Please sign in to comment.