Skip to content

Commit

Permalink
Merge branch 'LostRuins:concedo' into main
Browse files Browse the repository at this point in the history
  • Loading branch information
YellowRoseCx committed Jun 24, 2023
2 parents ea6d320 + 8342fe8 commit 06c3bf0
Show file tree
Hide file tree
Showing 13 changed files with 216 additions and 89 deletions.
91 changes: 69 additions & 22 deletions convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,28 +130,76 @@ def make_tensors_list() -> List[str]:
TENSORS_SET = set(TENSORS_LIST)


def find_n_mult(n_ff: int, n_embd: int) -> int:
# hardcoded magic range
for n_mult in range(256, 1, -1):
calc_ff = (((8*n_embd) // 3 + n_mult - 1) // n_mult)*n_mult
if calc_ff == n_ff:
return n_mult
return 1

@dataclass
class Params:
n_vocab: int
n_embd: int
n_mult: int
n_head: int
n_layer: int
file_type: GGMLFileType

@staticmethod
def guessed(model: 'LazyModel', file_type: GGMLFileType) -> 'Params':
n_vocab, n_embd = model["tok_embeddings.weight"].shape
def guessed(model: 'LazyModel') -> 'Params':
# try transformer naming first
n_vocab, n_embd = model["model.embed_tokens.weight"].shape if "model.embed_tokens.weight" in model else model["tok_embeddings.weight"].shape

# try transformer naming first
if "model.layers.0.self_attn.q_proj.weight" in model:
n_layer=next(i for i in itertools.count() if f"model.layers.{i}.self_attn.q_proj.weight" not in model)
else:
n_layer=next(i for i in itertools.count() if f"layers.{i}.attention.wq.weight" not in model)

n_head=n_embd // 128 # guessed

return Params(
n_vocab=n_vocab,
n_embd=n_embd,
n_mult=256,
n_head=n_embd // 128,
n_layer=next(i for i in itertools.count() if f"layers.{i}.attention.wq.weight" not in model),
file_type=file_type,
n_head=n_head,
n_layer=n_layer,
)

@staticmethod
def loadHFTransformerJson(model: 'LazyModel', config_path: 'Path') -> 'Params':
config = json.load(open(config_path))

n_vocab = config["vocab_size"];
n_embd = config["hidden_size"];
n_head = config["num_attention_heads"];
n_layer = config["num_hidden_layers"];
n_ff = config["intermediate_size"];

n_mult = find_n_mult(n_ff, n_embd);

return Params(
n_vocab=n_vocab,
n_embd=n_embd,
n_mult=n_mult,
n_head=n_head,
n_layer=n_layer,
)

@staticmethod
def load(model_plus: 'ModelPlus') -> 'Params':
orig_config_path = model_plus.paths[0].parent / "params.json"
hf_transformer_config_path = model_plus.paths[0].parent / "config.json"

if hf_transformer_config_path.exists():
params = Params.loadHFTransformerJson(model_plus.model, hf_transformer_config_path)
else:
params = Params.guessed(model_plus.model)

print(f'params: n_vocab:{params.n_vocab} n_embd:{params.n_embd} n_mult:{params.n_mult} n_head:{params.n_head} n_layer:{params.n_layer}')
return params


class SentencePieceVocab:
def __init__(self, fname_tokenizer: Path, fname_added_tokens: Optional[Path]) -> None:
Expand Down Expand Up @@ -595,18 +643,17 @@ def load() -> Tensor:
return LazyTensor(load, lazy_tensor.shape, lazy_tensor.data_type, f'permute({n_head}) ' + lazy_tensor.description)


def convert_transformers_to_orig(model: LazyModel) -> LazyModel:
def convert_transformers_to_orig(model: LazyModel, params: Params) -> LazyModel:
out: LazyModel = {}
out["tok_embeddings.weight"] = model["model.embed_tokens.weight"]
out["norm.weight"] = model["model.norm.weight"]
out["output.weight"] = model["lm_head.weight"]

n_head = model["model.layers.0.self_attn.q_proj.weight"].shape[1] // 128
for i in itertools.count():
if f"model.layers.{i}.self_attn.q_proj.weight" not in model:
break
out[f"layers.{i}.attention.wq.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.q_proj.weight"], n_head)
out[f"layers.{i}.attention.wk.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.k_proj.weight"], n_head)
out[f"layers.{i}.attention.wq.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.q_proj.weight"], params.n_head)
out[f"layers.{i}.attention.wk.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.k_proj.weight"], params.n_head)
out[f"layers.{i}.attention.wv.weight"] = model[f"model.layers.{i}.self_attn.v_proj.weight"]
out[f"layers.{i}.attention.wo.weight"] = model[f"model.layers.{i}.self_attn.o_proj.weight"]

Expand Down Expand Up @@ -920,7 +967,7 @@ class OutputFile:
def __init__(self, fname_out: Path) -> None:
self.fout = open(fname_out, "wb")

def write_file_header(self, params: Params) -> None:
def write_file_header(self, params: Params, file_type: GGMLFileType) -> None:
self.fout.write(b"ggjt"[::-1]) # magic
values = [
1, # file version
Expand All @@ -930,7 +977,7 @@ def write_file_header(self, params: Params) -> None:
params.n_head,
params.n_layer,
params.n_embd // params.n_head, # rot (obsolete)
params.file_type.value,
file_type.value,
]
self.fout.write(struct.pack("i" * len(values), *values))

Expand Down Expand Up @@ -958,10 +1005,10 @@ def write_vocab_only(fname_out: Path, vocab: Vocab) -> None:
of.fout.close()

@staticmethod
def write_all(fname_out: Path, params: Params, model: LazyModel, vocab: Vocab) -> None:
def write_all(fname_out: Path, params: Params, file_type: GGMLFileType, model: LazyModel, vocab: Vocab) -> None:
check_vocab_size(params, vocab)
of = OutputFile(fname_out)
of.write_file_header(params)
of.write_file_header(params, file_type)
print("Writing vocab...")
of.write_vocab(vocab)

Expand Down Expand Up @@ -997,11 +1044,11 @@ def pick_output_type(model: LazyModel, output_type_str: Optional[str]) -> GGMLFi
raise Exception(f"Unexpected combination of types: {name_to_type}")


def do_necessary_conversions(model: LazyModel) -> LazyModel:
def do_necessary_conversions(model: LazyModel, params: Params) -> LazyModel:
model = handle_quantization(model)

if "lm_head.weight" in model:
model = convert_transformers_to_orig(model)
model = convert_transformers_to_orig(model, params)
model = filter_and_sort_tensors(model)

return model
Expand Down Expand Up @@ -1107,14 +1154,14 @@ def load_vocab(path: Path) -> SentencePieceVocab:
return SentencePieceVocab(path, added_tokens_path if added_tokens_path.exists() else None)


def default_outfile(model_paths: List[Path], params: Params) -> Path:
def default_outfile(model_paths: List[Path], file_type: GGMLFileType) -> Path:
namestr = {
GGMLFileType.AllF32: "f32",
GGMLFileType.MostlyF16: "f16",
GGMLFileType.MostlyQ4_0: "q4_0",
GGMLFileType.MostlyQ4_1: "q4_1",
GGMLFileType.PerLayerIsQ4_1: "q4_1",
}[params.file_type]
}[file_type]
ret = model_paths[0].parent / f"ggml-model-{namestr}.bin"
if ret in model_paths:
sys.stderr.write(
Expand Down Expand Up @@ -1164,13 +1211,13 @@ def main(args_in: Optional[List[str]] = None) -> None:
else:
vocab_dir = args.vocab_dir if args.vocab_dir else model_plus.paths[0].parent
vocab = load_vocab(vocab_dir)
params = Params.load(model_plus)
model = model_plus.model
model = do_necessary_conversions(model)
model = do_necessary_conversions(model, params)
output_type = pick_output_type(model, args.outtype)
model = convert_to_output_type(model, output_type)
params = Params.guessed(model, output_type)
outfile = args.outfile or default_outfile(model_plus.paths, params)
OutputFile.write_all(outfile, params, model, vocab)
outfile = args.outfile or default_outfile(model_plus.paths, output_type)
OutputFile.write_all(outfile, params, output_type, model, vocab)
print(f"Wrote {outfile}")


Expand Down
3 changes: 3 additions & 0 deletions ggml-opencl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -184,7 +184,9 @@ void convert_f16(__global half* x, const int ib, const int iqs, float* v0, float
*v0 = vload_half(0, &x[ib + 0]);
*v1 = vload_half(0, &x[ib + 1]);
}
);

static std::string k_quants_source = MULTILINE_QUOTE(
inline void get_scale_min_k4(int j, const __global uint8_t *q, uint8_t *d, uint8_t *m)
{
if (j < 4)
Expand Down Expand Up @@ -856,6 +858,7 @@ std::string& replace(std::string& s, const std::string& from, const std::string&
std::string generate_kernels() {
std::stringstream src;
src << program_source << '\n';
src << k_quants_source << '\n';
for (size_t i = 0; i < dequant_str_values.size(); i += dequant_str_keys.size()) {
std::string dequant_kernel = dequant_template;
std::string dmmv_kernel = dequant_mul_mat_vec_template;
Expand Down
28 changes: 17 additions & 11 deletions gpttype_adapter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -308,8 +308,13 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
params.memory_f16 = inputs.f16_kv;
params.n_ctx = inputs.max_context_length;

neox_ctx_v2.hparams.n_ctx = gptj_ctx_v1.hparams.n_ctx = gptj_ctx_v2.hparams.n_ctx = gpt2_ctx_v1.hparams.n_ctx = gpt2_ctx_v2.hparams.n_ctx
= neox_ctx_v3.hparams.n_ctx = gptj_ctx_v3.hparams.n_ctx = gptj_ctx_v3.hparams.n_ctx = mpt_ctx_v3.hparams.n_ctx = params.n_ctx;
neox_ctx_v2.hparams.n_ctx = neox_ctx_v3.hparams.n_ctx
= gptj_ctx_v1.hparams.n_ctx = gptj_ctx_v2.hparams.n_ctx = gptj_ctx_v3.hparams.n_ctx
= gpt2_ctx_v1.hparams.n_ctx = gpt2_ctx_v2.hparams.n_ctx = gpt2_ctx_v3.hparams.n_ctx
= mpt_ctx_v3.hparams.n_ctx = params.n_ctx;

//this is used for the mem_per_token eval, openblas needs more RAM
bool use_scratch = ggml_cpu_has_gpublas();

printf("System Info: %s\n", llama_print_system_info());
SetQuantsUnshuffled(false);
Expand Down Expand Up @@ -546,7 +551,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
return res;
}
// determine the required inference memory per token:
gpt2_eval(gpt2_ctx_v3, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token, file_format);
gpt2_eval(gpt2_ctx_v3, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token, use_scratch);
return ModelLoadResult::SUCCESS;
}
else
Expand Down Expand Up @@ -613,14 +618,14 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
}

// determine the required inference memory per token:
gptj_eval(gptj_ctx_v3, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token);
gptj_eval(gptj_ctx_v3, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token, use_scratch);

//if the logits are NAN or duplicated, it means the model is incompatible
std::vector<float> oldlogits(logits);

//this is another hack because they change the library - we run the eval through the model
//twice and compare logits. if they give the same logits for different inputs, model is broken
gptj_eval(gptj_ctx_v3, params.n_threads, 0, {4, 5, 6, 7}, logits, mem_per_token);
gptj_eval(gptj_ctx_v3, params.n_threads, 0, {4, 5, 6, 7}, logits, mem_per_token, use_scratch);

if(logits.size()>0 && (IsNanCheck(logits[0]) || LogitsDuplicated(oldlogits,logits)))
{
Expand Down Expand Up @@ -685,7 +690,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
}

// determine the required inference memory per token:
gpt_neox_eval(neox_ctx_v3, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token);
gpt_neox_eval(neox_ctx_v3, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token, use_scratch);

return ModelLoadResult::SUCCESS;
}
Expand Down Expand Up @@ -742,7 +747,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
}

// determine the required inference memory per token:
mpt_eval(mpt_ctx_v3, params.n_threads, 0, { 0, 1, 2, 3 }, logits, false, mem_per_token);
mpt_eval(mpt_ctx_v3, params.n_threads, 0, { 0, 1, 2, 3 }, logits, false, mem_per_token, use_scratch);
return ModelLoadResult::SUCCESS;
}
else
Expand Down Expand Up @@ -901,6 +906,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
concat_output = "";

bool startedsampling = false;
bool use_scratch = true; //for normal inference always use scratch

timer_start();
double time1 = 0, time2 = 0;
Expand Down Expand Up @@ -1075,15 +1081,15 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
}
else if(file_format==FileFormat::GPT2_4)
{
evalres = gpt2_eval(gpt2_ctx_v3, params.n_threads, n_past, embd, logits, mem_per_token, file_format);
evalres = gpt2_eval(gpt2_ctx_v3, params.n_threads, n_past, embd, logits, mem_per_token, use_scratch);
}
else if(file_format==FileFormat::NEOX_1 || file_format == FileFormat::NEOX_2 || file_format == FileFormat::NEOX_3 || file_format==FileFormat::NEOX_4 || file_format==FileFormat::NEOX_5)
{
evalres = gpt_neox_v2_eval(neox_ctx_v2, params.n_threads, n_past, embd, logits, mem_per_token);
}
else if(file_format==FileFormat::NEOX_6|| file_format==FileFormat::NEOX_7)
{
evalres = gpt_neox_eval(neox_ctx_v3, params.n_threads, n_past, embd, logits, mem_per_token);
evalres = gpt_neox_eval(neox_ctx_v3, params.n_threads, n_past, embd, logits, mem_per_token, use_scratch);
}
else if(file_format==FileFormat::GPTJ_1 || file_format==FileFormat::GPTJ_2)
{
Expand All @@ -1095,11 +1101,11 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
}
else if(file_format==FileFormat::GPTJ_5)
{
evalres = gptj_eval(gptj_ctx_v3, params.n_threads, n_past, embd, logits, mem_per_token);
evalres = gptj_eval(gptj_ctx_v3, params.n_threads, n_past, embd, logits, mem_per_token, use_scratch);
}
else if(file_format==FileFormat::MPT_1)
{
evalres = mpt_eval(mpt_ctx_v3, params.n_threads, n_past, embd, logits, false, mem_per_token);
evalres = mpt_eval(mpt_ctx_v3, params.n_threads, n_past, embd, logits, false, mem_per_token, use_scratch);
}
else
{
Expand Down
2 changes: 1 addition & 1 deletion koboldcpp.py
Original file line number Diff line number Diff line change
Expand Up @@ -227,7 +227,7 @@ def utfprint(str):
maxhordelen = 256
modelbusy = False
defaultport = 5001
KcppVersion = "1.32"
KcppVersion = "1.32.3"
showdebug = True

class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
Expand Down
8 changes: 4 additions & 4 deletions llama.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
{ MODEL_3B, 256ull * MB },
{ MODEL_7B, 512ull * MB },
{ MODEL_13B, 512ull * MB },
{ MODEL_30B, 512ull * MB },
{ MODEL_30B, 640ull * MB },
{ MODEL_65B, 1024ull * MB },
};
return k_sizes;
Expand All @@ -92,7 +92,7 @@ static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1()
{ MODEL_3B, 256ull * MB },
{ MODEL_7B, 512ull * MB },
{ MODEL_13B, 512ull * MB },
{ MODEL_30B, 512ull * MB },
{ MODEL_30B, 640ull * MB },
{ MODEL_65B, 1024ull * MB },
};
return k_sizes;
Expand All @@ -105,7 +105,7 @@ static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()
{ MODEL_3B, 682ull * MB },
{ MODEL_7B, 1026ull * MB },
{ MODEL_13B, 1608ull * MB },
{ MODEL_30B, 3124ull * MB },
{ MODEL_30B, 3224ull * MB },
{ MODEL_65B, 5120ull * MB },
};
return k_sizes;
Expand All @@ -119,7 +119,7 @@ static const std::map<e_model, size_t> & MEM_REQ_EVAL()
{ MODEL_3B, 512ull * MB },
{ MODEL_7B, 800ull * MB },
{ MODEL_13B, 1024ull * MB },
{ MODEL_30B, 1280ull * MB },
{ MODEL_30B, 1380ull * MB },
{ MODEL_65B, 1536ull * MB },
};
return k_sizes;
Expand Down
2 changes: 1 addition & 1 deletion model_adapter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ void print_tok_vec(std::vector<float> &embd)
//we need to read more to determine
int32_t vocabsiz = 0;
fin.read((char *) &vocabsiz, sizeof(int32_t));
if(vocabsiz==4096) //actually the d_model for mpt
if(vocabsiz==4096 || vocabsiz==7168) //actually the d_model for mpt
{
fileformat = FileFormat::MPT_1;
}
Expand Down
Loading

0 comments on commit 06c3bf0

Please sign in to comment.