Skip to content

Commit

Permalink
downloader; quick start.
Browse files Browse the repository at this point in the history
  • Loading branch information
Judd committed Jun 19, 2024
1 parent dc581d7 commit 8bdacf8
Show file tree
Hide file tree
Showing 5 changed files with 267 additions and 3 deletions.
6 changes: 5 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
Inference of a bunch of models from less than 1B to more than 300B, for real-time chatting with [RAG](./docs/rag.md) on your computer (CPU),
pure C++ implementation based on [@ggerganov](https://github.com/ggerganov)'s [ggml](https://github.com/ggerganov/ggml).

| [Supported Models](./docs/models.md) | [Download Quantized Models](https://modelscope.cn/models/judd2024/chatllm_quantized_models) |
| [Supported Models](./docs/models.md) | [Download Quantized Models](./docs/quick_start.md#download-quantized-models) |

**What's New:**

Expand Down Expand Up @@ -44,6 +44,10 @@ pure C++ implementation based on [@ggerganov](https://github.com/ggerganov)'s [g
* [x] [LoRA](./docs/models.md#lora-models);
* [x] Python/JavaScript/C [Bindings](./docs/binding.md), web demo, and more possibilities.

## Quick Start

As simple as `python chatllm.py -i -m :model_id`. [Check it out](./docs/quick_start.md).

## Usage

### Preparation
Expand Down
6 changes: 5 additions & 1 deletion README_zh.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

在计算机(CPU)上实时聊天,可 [检索增强生成](./docs/rag.md) 。支持从不到 1B 到超过 300B 的一系列模型的推理。基于 [@ggerganov](https://github.com/ggerganov)[ggml](https://github.com/ggerganov/ggml),纯 C++ 实现。

| [支持的模型](./docs/models.md) | [下载量化模型](https://modelscope.cn/models/judd2024/chatllm_quantized_models) |
| [支持的模型](./docs/models.md) | [下载量化模型](./docs/quick_start.md#download-quantized-models) |

## 特点

Expand All @@ -24,6 +24,10 @@
- [x] LoRA
- [x] Python/JavaScript/C [绑定](./docs/binding.md),网页演示,以及更多可能性。

## 快速开始

只需要简单一行 `python chatllm.py -i -m :model_id`. 查看 [详情](./docs/quick_start.md).

## 使用方法

#### 准备工作
Expand Down
14 changes: 13 additions & 1 deletion bindings/chatllm.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,12 @@
import threading
from typing import Any, Iterable, List, Union

# Make the downloader importable both when it sits next to this file and
# when it lives in the sibling ``scripts`` directory (release layout).
try:
    import model_downloader
except ImportError:  # narrowed from a bare `except:` — don't mask unrelated errors
    this_dir = os.path.dirname(os.path.abspath(sys.argv[0]))
    sys.path.append(os.path.join(this_dir, '..', 'scripts'))
    import model_downloader

class PrintType(IntEnum):
PRINT_CHAT_CHUNK = 0,
Expand All @@ -20,10 +26,13 @@ class LibChatLLM:
_obj2id = {}
_id2obj = {}

def __init__(self, lib: str = '') -> None:
def __init__(self, lib: str = '', model_storage: str = '') -> None:

if lib == '':
lib = os.path.dirname(os.path.abspath(sys.argv[0]))
self._lib_path = lib
self.model_storage = os.path.abspath(model_storage if model_storage != '' else os.path.join(lib, '..', 'quantized'))

lib = os.path.join(lib, 'libchatllm.')
if sys.platform == 'win32':
lib = lib + 'dll'
Expand Down Expand Up @@ -120,6 +129,9 @@ def alloc_id_for_obj(self, obj: Any) -> int:
def append_param(self, obj: c_void_p, param: Union[str, List[str]]) -> None:
    """Append command-line style parameters to a native model object.

    `param` may be a single string or a list of strings.  A model-id
    argument (``-m :model_id``) is resolved by the downloader into a
    local file path before being forwarded to the native library.
    """
    if isinstance(param, str):
        param = [param]

    # Bug fix: a stray `return` here made everything below unreachable,
    # so parameters were never forwarded to the native library.
    param = model_downloader.preprocess_args(param, self.model_storage)
    for s in param:
        self._chatllm_append_param(obj, c_char_p(s.encode()))

Expand Down
30 changes: 30 additions & 0 deletions docs/quick_start.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
## Quick Start

For Windows users, the easiest way is to download a release, extract it, and start chatting:

```
python chatllm.py -i -m :qwen2:0.5b
downloading qwen2:0.5b
|████████████████████████████████████████████████████████████| 100.0%
________ __ __ __ __ ___ (通义千问)
/ ____/ /_ ____ _/ /_/ / / / / |/ /_________ ____
/ / / __ \/ __ `/ __/ / / / / /|_/ // ___/ __ \/ __ \
/ /___/ / / / /_/ / /_/ /___/ /___/ / / // /__/ /_/ / /_/ /
\____/_/ /_/\__,_/\__/_____/_____/_/ /_(_)___/ .___/ .___/
You are served by QWen2, /_/ /_/
with 494032768 (0.5B) parameters.
You > hi
A.I. > Hello! How can I assist you today?
You >
```

For Linux/MacOS (and Windows) users, build [binding](binding.md) and start chatting.

### Download Quantized Models

A [script](../scripts/model_downloader.py) is provided, which can download some quantized models on demand.
When a model name starting with `:` is given to the `-m` option (as shown in the above example), this script will
treat it as a model ID and try to download it if the file does not exist.

Use `python model_downloader.py` to check all quantized models.
214 changes: 214 additions & 0 deletions scripts/model_downloader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,214 @@
import requests
import os

def model_on_modelscope(proj: str, fn: str) -> dict:
    """Build a download descriptor (file name + URL) for a file hosted
    in the judd2024 namespace on ModelScope."""
    base = "https://modelscope.cn/api/v1/models/judd2024"
    return {
        'fn': fn,
        'url': f"{base}/{proj}/repo?Revision=master&FilePath={fn}",
    }

# Registry of downloadable quantized models.
# Layout: model name -> {
#     'default':  name of the default variant,
#     'brief':    one-line human-readable description,
#     'variants': variant name -> {
#         'default':   name of the default quantization,
#         'quantized': quantization name -> {'fn': ..., 'url': ...},
#     },
# }
all_models = {
    'qwen2': {
        'default': '1.5b',
        'brief': 'Qwen2 is a new series of large language models from Alibaba group.',
        'variants': {
            '7b': {
                'default': 'q8',
                'quantized': {
                    'q8': model_on_modelscope('chatllm_quantized_qwen2', 'qwen2-7b.bin')
                }
            },
            '1.5b': {
                'default': 'q8',
                'quantized': {
                    'q8': model_on_modelscope('chatllm_quantized_qwen2', 'qwen2-1.5b.bin')
                }
            },
            '0.5b': {
                'default': 'q8',
                'quantized': {
                    'q8': model_on_modelscope('chatllm_quantized_qwen2', 'qwen2-0.5b.bin')
                }
            },
        }
    },
    'gemma': {
        'default': '2b',
        'brief': 'Gemma is a family of lightweight, state-of-the-art open models built by Google DeepMind. Updated to version 1.1.',
        'variants': {
            '2b': {
                'default': 'q8',
                'quantized': {
                    'q8': model_on_modelscope('chatllm_quantized_models', 'gemma-1.1-2b.bin')
                }
            },
        }
    },
    'llama3': {
        'default': '8b',
        'brief': 'Meta Llama 3: The most capable openly available LLM to date.',
        'variants': {
            '8b': {
                'default': 'q4_1',
                'quantized': {
                    'q4_1': model_on_modelscope('chatllm_quantized_models', 'llama3-8b-q4_1.bin')
                }
            },
        }
    },
    'minicpm': {
        'default': '2b-sft',
        # Bug fix: the brief was a copy-paste of the Llama-3 description.
        'brief': 'MiniCPM is an end-side LLM developed by ModelBest Inc. and TsinghuaNLP.',
        'variants': {
            '2b-sft': {
                'default': 'q8',
                'quantized': {
                    'q8': model_on_modelscope('chatllm_quantized_models', 'minicpm_sft_q8.bin')
                }
            },
            '2b-dpo': {
                'default': 'q4_1',
                'quantized': {
                    'q4_1': model_on_modelscope('chatllm_quantized_models', 'minicpm-dpo-q4_1.bin')
                }
            },
        }
    },
    'qwen1.5': {
        'default': 'moe',
        'brief': 'Qwen1.5 is the beta version of Qwen2 from Alibaba group.',
        'variants': {
            '1.8b': {
                'default': 'q8',
                'quantized': {
                    'q8': model_on_modelscope('chatllm_quantized_models', 'qwen1.5-1.8b.bin')
                }
            },
            'moe': {
                'default': 'q4_1',
                'quantized': {
                    'q4_1': model_on_modelscope('chatllm_quantized_models', 'qwen1.5-moe-q4_1.bin')
                }
            },
        }
    },
    'qanything': {
        'default': '7b',
        'brief': 'QAnything is a local knowledge base question-answering system based on QwenLM.',
        'variants': {
            '7b': {
                'default': 'q4_1',
                'quantized': {
                    'q4_1': model_on_modelscope('chatllm_quantized_models', 'qwen-qany-7b-q4_1.bin')
                }
            },
        }
    },
    'starling-lm': {
        'default': '7b',
        'brief': 'Starling is a large language model trained by reinforcement learning from AI feedback focused on improving chatbot helpfulness.',
        'variants': {
            '7b': {
                'default': 'q4_1',
                'quantized': {
                    'q4_1': model_on_modelscope('chatllm_quantized_models', 'starling-7b-q4_1.bin')
                }
            },
        }
    },
    'yi-1': {
        'default': '34b',
        'brief': 'Yi (v1) is a high-performing, bilingual language model.',
        'variants': {
            '34b': {
                'default': 'q4_1',
                'quantized': {
                    'q4_1': model_on_modelscope('chatllm_quantized_models', 'yi-34b-q4.bin')
                }
            },
        }
    },
}

def print_progress_bar(iteration, total, prefix='', suffix='', decimals=1,
                       length=60, fill='█', printEnd="\r", auto_nl=True):
    """Render a single-line textual progress bar on stdout.

    `iteration` is the work done so far and `total` the expected amount.
    The line is redrawn in place via `printEnd="\\r"`; a newline is
    emitted once `iteration` reaches `total` (unless `auto_nl` is False).
    """
    # Bug fix: a zero `total` (e.g. a server that omits content-length)
    # previously raised ZeroDivisionError.
    safe_total = total if total > 0 else 1
    percent = ("{0:." + str(decimals) + "f}").format(100 * (iteration / float(safe_total)))
    filledLength = int(length * iteration // safe_total)
    bar = fill * filledLength + '-' * (length - filledLength)
    print(f'\r{prefix} |{bar}| {percent}% {suffix}', end=printEnd)
    if (iteration == total) and auto_nl:
        print()

def download_file(url: str, fn: str, prefix: str) -> bool:
    """Stream `url` into local file `fn`, drawing a progress bar.

    Returns True on apparent success: either the byte count matches the
    server-reported content length, or the server did not report one.
    Raises `requests.HTTPError` on a non-2xx response.
    """
    print(f"downloading {prefix}")
    progress = 0
    total = 0
    with open(fn, 'wb') as f:
        with requests.get(url, stream=True) as r:
            r.raise_for_status()
            total = int(r.headers.get('content-length', 0))

            for chunk in r.iter_content(chunk_size=8192):
                progress += len(chunk)
                f.write(chunk)
                # When content-length is unknown (total == 0), feed the bar
                # a moving target so it never divides by zero.
                print_progress_bar(progress, total if total > 0 else progress)

    # Bug fix: previously an unknown content-length made the function
    # report failure even after a complete download.
    return total == 0 or progress == total

def show():
    """Print every known model: its brief, available variants, and the
    default variant when there is more than one."""
    def describe(name: str) -> None:
        entry = all_models[name]
        print(f"**{name}**: {entry['brief']}")
        tags = [f"{name}:{size}" for size in entry['variants']]
        print(f"Available: {', '.join(tags)}")
        if len(tags) > 1:
            print(f"Default : {name}:{entry['default']}")
        print()

    for name in all_models:
        describe(name)

def parse_model_id(model_id: str, models: dict = None) -> dict:
    """Resolve a model ID of the form ``name``, ``name:variant`` or
    ``name:variant:quant`` to its download descriptor (``{'fn', 'url'}``).

    Omitted parts fall back to the model's default variant and the
    variant's default quantization.  `models` defaults to the module-level
    `all_models` registry; passing a dict makes the function testable.

    Raises KeyError for unknown model/variant/quantization names.
    """
    models = all_models if models is None else models
    parts = model_id.split(':')
    model = models[parts[0]]
    variants = model['variants']
    # Bug fix: the default variant name is stored on the model entry
    # (`model['default']`); `variants['default']` looked up a variant
    # literally named "default" and raised KeyError.
    var = variants[parts[1]] if len(parts) >= 2 else variants[model['default']]
    quant = parts[2] if len(parts) >= 3 else var['default']
    return var['quantized'][quant]

def get_model(model_id, storage_dir):
    """Return the local path of the quantized model file for `model_id`,
    downloading it into `storage_dir` on demand.

    Raises RuntimeError when the storage directory cannot be used or the
    download is incomplete; KeyError for an unknown model ID.
    """
    # Bug fix: `os.mkdir` fails when parent directories are missing;
    # `makedirs(..., exist_ok=True)` also removes the exists/create race.
    os.makedirs(storage_dir, exist_ok=True)
    if not os.path.isdir(storage_dir):
        raise RuntimeError(f"{storage_dir} is invalid")

    info = parse_model_id(model_id)
    fn = os.path.join(storage_dir, info['fn'])
    if os.path.isfile(fn):
        return fn

    # Explicit raise instead of `assert`: asserts vanish under `python -O`.
    if not download_file(info['url'], fn, model_id):
        raise RuntimeError(f"failed to download {model_id}")

    return fn

def find_index(l: list, x) -> int:
    """Return the index of `x` in `l`, or -1 when `x` is absent."""
    try:
        return l.index(x)
    except ValueError:
        return -1

def preprocess_args(args: list[str], storage_dir) -> list[str]:
    """Resolve a ``:model_id`` value of the ``-m``/``--model`` option into
    a local model file path, downloading the model on demand.

    The list is modified in place and returned.  It passes through
    unchanged when no model option is present, when the option has no
    value, or when the value is already a plain file path.
    """
    # Prefer `-m`; fall back to `--model`.
    i = next((j for j, a in enumerate(args) if a == '-m'), -1)
    if i < 0:
        i = next((j for j, a in enumerate(args) if a == '--model'), -1)
    if i < 0:
        return args
    # Bug fix: guard against the option being the last argument
    # (previously raised IndexError on `args[i + 1]`).
    if i + 1 >= len(args):
        return args
    if args[i + 1].startswith(':'):
        args[i + 1] = get_model(args[i + 1][1:], storage_dir)

    return args

# Invoked directly: list every model ID that the downloader can fetch.
if __name__ == '__main__':
    show()

0 comments on commit 8bdacf8

Please sign in to comment.