Commit

Merge branch 'master' into concedo_experimental
# Conflicts:
#	README.md
LostRuins committed Jul 7, 2023
2 parents 8424a35 + 481f793 commit 32102c2
Showing 9 changed files with 30 additions and 15 deletions.
6 changes: 6 additions & 0 deletions convert.py
@@ -154,9 +154,15 @@ def guessed(model: 'LazyModel') -> 'Params':
    # try transformer naming first
    if "model.layers.0.self_attn.q_proj.weight" in model:
        n_layer=next(i for i in itertools.count() if f"model.layers.{i}.self_attn.q_proj.weight" not in model)
+   elif "model.layers.0.self_attn.W_pack.weight" in model: # next: try baichuan naming
+       n_layer=next(i for i in itertools.count() if f"model.layers.{i}.self_attn.W_pack.weight" not in model)
    else:
        n_layer=next(i for i in itertools.count() if f"layers.{i}.attention.wq.weight" not in model)

+   if n_layer < 1:
+       raise Exception("failed to guess 'n_layer'. This model is unknown or unsupported.\n"
+                       "Suggestion: provide 'config.json' of the model in the same directory containing model files.")
+
    n_head=n_embd // 128 # guessed

    return Params(
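
For reference, the layer-count guessing idiom extended above can be exercised on its own. A minimal illustrative sketch (the helper name and the toy dict are not part of convert.py): the layer count is simply the first index i whose expected tensor name is missing from the model.

import itertools

def guess_n_layer(model: dict) -> int:
    if "model.layers.0.self_attn.q_proj.weight" in model:        # transformers-style LLaMA naming
        pattern = "model.layers.{i}.self_attn.q_proj.weight"
    elif "model.layers.0.self_attn.W_pack.weight" in model:      # baichuan packs q/k/v into W_pack
        pattern = "model.layers.{i}.self_attn.W_pack.weight"
    else:                                                        # original LLaMA checkpoint naming
        pattern = "layers.{i}.attention.wq.weight"
    return next(i for i in itertools.count() if pattern.format(i=i) not in model)

# toy stand-in for a checkpoint with 32 baichuan-style layers
toy = {f"model.layers.{i}.self_attn.W_pack.weight": None for i in range(32)}
assert guess_n_layer(toy) == 32
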
2 changes: 1 addition & 1 deletion examples/alpaca.sh
@@ -7,7 +7,7 @@
cd `dirname $0`
cd ..

-./main -m ./models/ggml-alpaca-7b-q4.bin \
+./main -m ./models/alpaca.13b.ggmlv3.q8_0.bin \
--color \
-f ./prompts/alpaca.txt \
--ctx_size 2048 \
2 changes: 1 addition & 1 deletion examples/embedding/embedding.cpp
@@ -18,7 +18,7 @@ int main(int argc, char ** argv) {
params.embedding = true;

if (params.n_ctx > 2048) {
fprintf(stderr, "%s: warning: model does not support context sizes greater than 2048 tokens (%d specified);"
fprintf(stderr, "%s: warning: model might not support context sizes greater than 2048 tokens (%d specified);"
"expect poor results\n", __func__, params.n_ctx);
}

2 changes: 1 addition & 1 deletion examples/main/main.cpp
@@ -85,7 +85,7 @@ int main(int argc, char ** argv) {
}

if (params.n_ctx > 2048) {
fprintf(stderr, "%s: warning: model does not support context sizes greater than 2048 tokens (%d specified);"
fprintf(stderr, "%s: warning: model might not support context sizes greater than 2048 tokens (%d specified);"
"expect poor results\n", __func__, params.n_ctx);
} else if (params.n_ctx < 8) {
fprintf(stderr, "%s: warning: minimum context size is 8, using minimum size.\n", __func__);
2 changes: 1 addition & 1 deletion examples/perplexity/perplexity.cpp
@@ -130,7 +130,7 @@ int main(int argc, char ** argv) {
params.n_batch = std::min(params.n_batch, params.n_ctx);

if (params.n_ctx > 2048) {
fprintf(stderr, "%s: warning: model does not support context sizes greater than 2048 tokens (%d specified);"
fprintf(stderr, "%s: warning: model might not support context sizes greater than 2048 tokens (%d specified);"
"expect poor results\n", __func__, params.n_ctx);
}

2 changes: 1 addition & 1 deletion examples/server/README.md
@@ -7,7 +7,7 @@ Command line options:
- `--threads N`, `-t N`: Set the number of threads to use during computation.
- `-m FNAME`, `--model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.bin`).
- `-m ALIAS`, `--alias ALIAS`: Set an alias for the model. The alias will be returned in API responses.
-- `-c N`, `--ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference.
+- `-c N`, `--ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference. The size may differ in other models; for example, Baichuan models were built with a context of 4096.
- `-ngl N`, `--n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
- `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used. Requires cuBLAS.
- `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance. Requires cuBLAS.
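
The "3,2" example for `--tensor-split` above works out as a simple ratio of the listed values; a quick illustrative sketch of that arithmetic (not the server's own code), assuming each value is divided by the sum of all values:

def tensor_split_fractions(split: str) -> list[float]:
    values = [float(v) for v in split.split(",")]
    total = sum(values)
    return [v / total for v in values]

print(tensor_split_fractions("3,2"))  # [0.6, 0.4] -> 60% of the data to GPU 0, 40% to GPU 1
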
16 changes: 10 additions & 6 deletions ggml-opencl.cpp
@@ -654,13 +654,17 @@ __kernel void dequantize_mul_mat_vec_q6_K(__global const struct block_q6_K * xx,
const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128...
const int in = tid - step*im; // 0...15 or 0...7

-#if K_QUANTS_PER_ITERATION == 1
+\n#if K_QUANTS_PER_ITERATION == 1\n
const int l0 = K_QUANTS_PER_ITERATION*in; // 0...15
const int is = 0;
-#else
+
+\n#else\n
+
const int l0 = 4 * in; // 0, 4, 8, ..., 28
const int is = in / 4;
-#endif
+
+\n#endif\n
+
const int ql_offset = 64*im + l0;
const int qh_offset = 32*im + l0;
const int s_offset = 8*im + is;
@@ -677,7 +681,7 @@ __kernel void dequantize_mul_mat_vec_q6_K(__global const struct block_q6_K * xx,

const float d = vload_half(0, &x[i].d);

-#if K_QUANTS_PER_ITERATION == 1
+\n#if K_QUANTS_PER_ITERATION == 1\n
float sum = y[ 0] * s[0] * d * ((int8_t)((ql[ 0] & 0xF) | ((qh[ 0] & 0x03) << 4)) - 32)
+ y[16] * s[1] * d * ((int8_t)((ql[16] & 0xF) | ((qh[16] & 0x03) << 4)) - 32)
+ y[32] * s[2] * d * ((int8_t)((ql[32] & 0xF) | ((qh[ 0] & 0x0c) << 2)) - 32)
@@ -687,7 +691,7 @@ __kernel void dequantize_mul_mat_vec_q6_K(__global const struct block_q6_K * xx,
+ y[96] * s[6] * d * ((int8_t)((ql[32] >> 4) | ((qh[ 0] & 0xc0) >> 2)) - 32)
+y[112] * s[7] * d * ((int8_t)((ql[48] >> 4) | ((qh[16] & 0xc0) >> 2)) - 32);
tmp[16 * ix + tid] += sum;
-#else
+\n#else\n
float sum = 0;
for (int l = 0; l < 4; ++l) {
sum += y[l+ 0] * s[0] * d * ((int8_t)((ql[l+ 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32)
@@ -696,7 +700,7 @@ __kernel void dequantize_mul_mat_vec_q6_K(__global const struct block_q6_K * xx,
+ y[l+96] * s[6] * d * ((int8_t)((ql[l+32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32);
}
tmp[16 * ix + tid] += sum;
-#endif
+\n#endif\n

}

12 changes: 9 additions & 3 deletions ggml.h
@@ -1516,9 +1516,15 @@ extern "C" {
// Internal types and functions exposed for tests and benchmarks
//

-typedef void (*ggml_to_float_t)(const void * x, float * y, int k);
-typedef void (*ggml_from_float_t)(const float * x, void * y, int k);
-typedef void (*ggml_vec_dot_t)(const int n, float * s, const void * x, const void * y);
+#ifdef __cplusplus
+// restrict not standard in C++
+#define GGML_RESTRICT
+#else
+#define GGML_RESTRICT restrict
+#endif
+typedef void (*ggml_to_float_t) (const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
+typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
+typedef void (*ggml_vec_dot_t) (const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT x, const void * GGML_RESTRICT y);

typedef struct {
ggml_to_float_t to_float;
1 change: 0 additions & 1 deletion koboldcpp.py
@@ -198,7 +198,6 @@ def load_model(model_filename):
    inputs.executable_path = (getdirpath()+"/").encode("UTF-8")
    inputs.debugmode = args.debugmode
    banned_tokens = args.bantokens
-   print(banned_tokens)
    for n in range(ban_token_max):
        if not banned_tokens or n >= len(banned_tokens):
            inputs.banned_tokens[n] = "".encode("UTF-8")
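
The loop shown above fills a fixed-length array of C strings, padding unused slots with empty strings so the native side can always read ban_token_max entries (the real tokens are presumably written in the collapsed else branch). A minimal standalone sketch of that padding pattern, using an illustrative ctypes array rather than koboldcpp's actual struct:

import ctypes

ban_token_max = 10
banned_tokens = ["foo", "bar"]

arr = (ctypes.c_char_p * ban_token_max)()
for n in range(ban_token_max):
    if not banned_tokens or n >= len(banned_tokens):
        arr[n] = "".encode("UTF-8")                  # pad unused slots
    else:
        arr[n] = banned_tokens[n].encode("UTF-8")    # real banned token

print(list(arr))  # [b'foo', b'bar', b'', b'', b'', b'', b'', b'', b'', b'']
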
