Merge branch 'LostRuins:concedo' into main

YellowRoseCx · Jun 24, 2023 · 06c3bf0
2 parents ea6d320 + 8342fe8
commit 06c3bf0
Show file tree

Hide file tree

Showing 13 changed files with 216 additions and 89 deletions.
diff --git a/convert.py b/convert.py
@@ -130,28 +130,76 @@ def make_tensors_list() -> List[str]:
 TENSORS_SET = set(TENSORS_LIST)
 
 
+def find_n_mult(n_ff: int, n_embd: int) -> int:
+ # hardcoded magic range
+ for n_mult in range(256, 1, -1):
+ calc_ff = (((8*n_embd) // 3 + n_mult - 1) // n_mult)*n_mult
+ if calc_ff == n_ff:
+ return n_mult
+ return 1
+
 @dataclass
 class Params:
  n_vocab: int
  n_embd: int
  n_mult: int
  n_head: int
  n_layer: int
- file_type: GGMLFileType
 
  @staticmethod
- def guessed(model: 'LazyModel', file_type: GGMLFileType) -> 'Params':
- n_vocab, n_embd = model["tok_embeddings.weight"].shape
+ def guessed(model: 'LazyModel') -> 'Params':
+ # try transformer naming first
+ n_vocab, n_embd = model["model.embed_tokens.weight"].shape if "model.embed_tokens.weight" in model else model["tok_embeddings.weight"].shape
+
+ # try transformer naming first
+ if "model.layers.0.self_attn.q_proj.weight" in model:
+ n_layer=next(i for i in itertools.count() if f"model.layers.{i}.self_attn.q_proj.weight" not in model)
+ else:
+ n_layer=next(i for i in itertools.count() if f"layers.{i}.attention.wq.weight" not in model)
+
+ n_head=n_embd // 128 # guessed
 
  return Params(
  n_vocab=n_vocab,
  n_embd=n_embd,
  n_mult=256,
- n_head=n_embd // 128,
- n_layer=next(i for i in itertools.count() if f"layers.{i}.attention.wq.weight" not in model),
- file_type=file_type,
+ n_head=n_head,
+ n_layer=n_layer,
  )
 
+ @staticmethod
+ def loadHFTransformerJson(model: 'LazyModel', config_path: 'Path') -> 'Params':
+ config = json.load(open(config_path))
+
+ n_vocab = config["vocab_size"];
+ n_embd = config["hidden_size"];
+ n_head = config["num_attention_heads"];
+ n_layer = config["num_hidden_layers"];
+ n_ff = config["intermediate_size"];
+
+ n_mult = find_n_mult(n_ff, n_embd);
+
+ return Params(
+ n_vocab=n_vocab,
+ n_embd=n_embd,
+ n_mult=n_mult,
+ n_head=n_head,
+ n_layer=n_layer,
+ )
+
+ @staticmethod
+ def load(model_plus: 'ModelPlus') -> 'Params':
+ orig_config_path = model_plus.paths[0].parent / "params.json"
+ hf_transformer_config_path = model_plus.paths[0].parent / "config.json"
+
+ if hf_transformer_config_path.exists():
+ params = Params.loadHFTransformerJson(model_plus.model, hf_transformer_config_path)
+ else:
+ params = Params.guessed(model_plus.model)
+
+ print(f'params: n_vocab:{params.n_vocab} n_embd:{params.n_embd} n_mult:{params.n_mult} n_head:{params.n_head} n_layer:{params.n_layer}')
+ return params
+
 
 class SentencePieceVocab:
  def __init__(self, fname_tokenizer: Path, fname_added_tokens: Optional[Path]) -> None:
@@ -595,18 +643,17 @@ def load() -> Tensor:
  return LazyTensor(load, lazy_tensor.shape, lazy_tensor.data_type, f'permute({n_head}) ' + lazy_tensor.description)
 
 
-def convert_transformers_to_orig(model: LazyModel) -> LazyModel:
+def convert_transformers_to_orig(model: LazyModel, params: Params) -> LazyModel:
  out: LazyModel = {}
  out["tok_embeddings.weight"] = model["model.embed_tokens.weight"]
  out["norm.weight"] = model["model.norm.weight"]
  out["output.weight"] = model["lm_head.weight"]
 
- n_head = model["model.layers.0.self_attn.q_proj.weight"].shape[1] // 128
  for i in itertools.count():
  if f"model.layers.{i}.self_attn.q_proj.weight" not in model:
  break
- out[f"layers.{i}.attention.wq.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.q_proj.weight"], n_head)
- out[f"layers.{i}.attention.wk.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.k_proj.weight"], n_head)
+ out[f"layers.{i}.attention.wq.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.q_proj.weight"], params.n_head)
+ out[f"layers.{i}.attention.wk.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.k_proj.weight"], params.n_head)
  out[f"layers.{i}.attention.wv.weight"] = model[f"model.layers.{i}.self_attn.v_proj.weight"]
  out[f"layers.{i}.attention.wo.weight"] = model[f"model.layers.{i}.self_attn.o_proj.weight"]
 
@@ -920,7 +967,7 @@ class OutputFile:
  def __init__(self, fname_out: Path) -> None:
  self.fout = open(fname_out, "wb")
 
- def write_file_header(self, params: Params) -> None:
+ def write_file_header(self, params: Params, file_type: GGMLFileType) -> None:
  self.fout.write(b"ggjt"[::-1]) # magic
  values = [
  1, # file version
@@ -930,7 +977,7 @@ def write_file_header(self, params: Params) -> None:
  params.n_head,
  params.n_layer,
  params.n_embd // params.n_head, # rot (obsolete)
- params.file_type.value,
+ file_type.value,
  ]
  self.fout.write(struct.pack("i" * len(values), *values))
 
@@ -958,10 +1005,10 @@ def write_vocab_only(fname_out: Path, vocab: Vocab) -> None:
  of.fout.close()
 
  @staticmethod
- def write_all(fname_out: Path, params: Params, model: LazyModel, vocab: Vocab) -> None:
+ def write_all(fname_out: Path, params: Params, file_type: GGMLFileType, model: LazyModel, vocab: Vocab) -> None:
  check_vocab_size(params, vocab)
  of = OutputFile(fname_out)
- of.write_file_header(params)
+ of.write_file_header(params, file_type)
  print("Writing vocab...")
  of.write_vocab(vocab)
 
@@ -997,11 +1044,11 @@ def pick_output_type(model: LazyModel, output_type_str: Optional[str]) -> GGMLFi
  raise Exception(f"Unexpected combination of types: {name_to_type}")
 
 
-def do_necessary_conversions(model: LazyModel) -> LazyModel:
+def do_necessary_conversions(model: LazyModel, params: Params) -> LazyModel:
  model = handle_quantization(model)
 
  if "lm_head.weight" in model:
- model = convert_transformers_to_orig(model)
+ model = convert_transformers_to_orig(model, params)
  model = filter_and_sort_tensors(model)
 
  return model
@@ -1107,14 +1154,14 @@ def load_vocab(path: Path) -> SentencePieceVocab:
  return SentencePieceVocab(path, added_tokens_path if added_tokens_path.exists() else None)
 
 
-def default_outfile(model_paths: List[Path], params: Params) -> Path:
+def default_outfile(model_paths: List[Path], file_type: GGMLFileType) -> Path:
  namestr = {
  GGMLFileType.AllF32: "f32",
  GGMLFileType.MostlyF16: "f16",
  GGMLFileType.MostlyQ4_0: "q4_0",
  GGMLFileType.MostlyQ4_1: "q4_1",
  GGMLFileType.PerLayerIsQ4_1: "q4_1",
- }[params.file_type]
+ }[file_type]
  ret = model_paths[0].parent / f"ggml-model-{namestr}.bin"
  if ret in model_paths:
  sys.stderr.write(
@@ -1164,13 +1211,13 @@ def main(args_in: Optional[List[str]] = None) -> None:
  else:
  vocab_dir = args.vocab_dir if args.vocab_dir else model_plus.paths[0].parent
  vocab = load_vocab(vocab_dir)
+ params = Params.load(model_plus)
  model = model_plus.model
- model = do_necessary_conversions(model)
+ model = do_necessary_conversions(model, params)
  output_type = pick_output_type(model, args.outtype)
  model = convert_to_output_type(model, output_type)
- params = Params.guessed(model, output_type)
- outfile = args.outfile or default_outfile(model_plus.paths, params)
- OutputFile.write_all(outfile, params, model, vocab)
+ outfile = args.outfile or default_outfile(model_plus.paths, output_type)
+ OutputFile.write_all(outfile, params, output_type, model, vocab)
  print(f"Wrote {outfile}")
 
 

diff --git a/ggml-opencl.cpp b/ggml-opencl.cpp
@@ -184,7 +184,9 @@ void convert_f16(__global half* x, const int ib, const int iqs, float* v0, float
  *v0 = vload_half(0, &x[ib + 0]);
  *v1 = vload_half(0, &x[ib + 1]);
 }
+);
 
+static std::string k_quants_source = MULTILINE_QUOTE(
 inline void get_scale_min_k4(int j, const __global uint8_t *q, uint8_t *d, uint8_t *m)
 {
  if (j < 4)
@@ -856,6 +858,7 @@ std::string& replace(std::string& s, const std::string& from, const std::string&
 std::string generate_kernels() {
  std::stringstream src;
  src << program_source << '\n';
+ src << k_quants_source << '\n';
  for (size_t i = 0; i < dequant_str_values.size(); i += dequant_str_keys.size()) {
  std::string dequant_kernel = dequant_template;
  std::string dmmv_kernel = dequant_mul_mat_vec_template;

diff --git a/gpttype_adapter.cpp b/gpttype_adapter.cpp
@@ -308,8 +308,13 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
  params.memory_f16 = inputs.f16_kv;
  params.n_ctx = inputs.max_context_length;
 
- neox_ctx_v2.hparams.n_ctx = gptj_ctx_v1.hparams.n_ctx = gptj_ctx_v2.hparams.n_ctx = gpt2_ctx_v1.hparams.n_ctx = gpt2_ctx_v2.hparams.n_ctx
- = neox_ctx_v3.hparams.n_ctx = gptj_ctx_v3.hparams.n_ctx = gptj_ctx_v3.hparams.n_ctx = mpt_ctx_v3.hparams.n_ctx = params.n_ctx;
+ neox_ctx_v2.hparams.n_ctx = neox_ctx_v3.hparams.n_ctx
+ = gptj_ctx_v1.hparams.n_ctx = gptj_ctx_v2.hparams.n_ctx = gptj_ctx_v3.hparams.n_ctx
+ = gpt2_ctx_v1.hparams.n_ctx = gpt2_ctx_v2.hparams.n_ctx = gpt2_ctx_v3.hparams.n_ctx
+ = mpt_ctx_v3.hparams.n_ctx = params.n_ctx;
+
+ //this is used for the mem_per_token eval, openblas needs more RAM
+ bool use_scratch = ggml_cpu_has_gpublas();
 
  printf("System Info: %s\n", llama_print_system_info());
  SetQuantsUnshuffled(false);
@@ -546,7 +551,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
  return res;
  }
  // determine the required inference memory per token:
- gpt2_eval(gpt2_ctx_v3, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token, file_format);
+ gpt2_eval(gpt2_ctx_v3, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token, use_scratch);
  return ModelLoadResult::SUCCESS;
  }
  else
@@ -613,14 +618,14 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
  }
 
  // determine the required inference memory per token:
- gptj_eval(gptj_ctx_v3, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token);
+ gptj_eval(gptj_ctx_v3, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token, use_scratch);
 
  //if the logits are NAN or duplicated, it means the model is incompatible
  std::vector<float> oldlogits(logits);
 
  //this is another hack because they change the library - we run the eval through the model
  //twice and compare logits. if they give the same logits for different inputs, model is broken
- gptj_eval(gptj_ctx_v3, params.n_threads, 0, {4, 5, 6, 7}, logits, mem_per_token);
+ gptj_eval(gptj_ctx_v3, params.n_threads, 0, {4, 5, 6, 7}, logits, mem_per_token, use_scratch);
 
  if(logits.size()>0 && (IsNanCheck(logits[0]) || LogitsDuplicated(oldlogits,logits)))
  {
@@ -685,7 +690,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
  }
 
  // determine the required inference memory per token:
- gpt_neox_eval(neox_ctx_v3, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token);
+ gpt_neox_eval(neox_ctx_v3, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token, use_scratch);
 
  return ModelLoadResult::SUCCESS;
  }
@@ -742,7 +747,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
  }
 
  // determine the required inference memory per token:
- mpt_eval(mpt_ctx_v3, params.n_threads, 0, { 0, 1, 2, 3 }, logits, false, mem_per_token);
+ mpt_eval(mpt_ctx_v3, params.n_threads, 0, { 0, 1, 2, 3 }, logits, false, mem_per_token, use_scratch);
  return ModelLoadResult::SUCCESS;
  }
  else
@@ -901,6 +906,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
  concat_output = "";
 
  bool startedsampling = false;
+ bool use_scratch = true; //for normal inference always use scratch
 
  timer_start();
  double time1 = 0, time2 = 0;
@@ -1075,15 +1081,15 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
  }
  else if(file_format==FileFormat::GPT2_4)
  {
- evalres = gpt2_eval(gpt2_ctx_v3, params.n_threads, n_past, embd, logits, mem_per_token, file_format);
+ evalres = gpt2_eval(gpt2_ctx_v3, params.n_threads, n_past, embd, logits, mem_per_token, use_scratch);
  }
  else if(file_format==FileFormat::NEOX_1 || file_format == FileFormat::NEOX_2 || file_format == FileFormat::NEOX_3 || file_format==FileFormat::NEOX_4 || file_format==FileFormat::NEOX_5)
  {
  evalres = gpt_neox_v2_eval(neox_ctx_v2, params.n_threads, n_past, embd, logits, mem_per_token);
  }
  else if(file_format==FileFormat::NEOX_6|| file_format==FileFormat::NEOX_7)
  {
- evalres = gpt_neox_eval(neox_ctx_v3, params.n_threads, n_past, embd, logits, mem_per_token);
+ evalres = gpt_neox_eval(neox_ctx_v3, params.n_threads, n_past, embd, logits, mem_per_token, use_scratch);
  }
  else if(file_format==FileFormat::GPTJ_1 || file_format==FileFormat::GPTJ_2)
  {
@@ -1095,11 +1101,11 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
  }
  else if(file_format==FileFormat::GPTJ_5)
  {
- evalres = gptj_eval(gptj_ctx_v3, params.n_threads, n_past, embd, logits, mem_per_token);
+ evalres = gptj_eval(gptj_ctx_v3, params.n_threads, n_past, embd, logits, mem_per_token, use_scratch);
  }
  else if(file_format==FileFormat::MPT_1)
  {
- evalres = mpt_eval(mpt_ctx_v3, params.n_threads, n_past, embd, logits, false, mem_per_token);
+ evalres = mpt_eval(mpt_ctx_v3, params.n_threads, n_past, embd, logits, false, mem_per_token, use_scratch);
  }
  else
  {

diff --git a/koboldcpp.py b/koboldcpp.py
@@ -227,7 +227,7 @@ def utfprint(str):
 maxhordelen = 256
 modelbusy = False
 defaultport = 5001
-KcppVersion = "1.32"
+KcppVersion = "1.32.3"
 showdebug = True
 
 class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):

diff --git a/llama.cpp b/llama.cpp
@@ -80,7 +80,7 @@ static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
  { MODEL_3B, 256ull * MB },
  { MODEL_7B, 512ull * MB },
  { MODEL_13B, 512ull * MB },
- { MODEL_30B, 512ull * MB },
+ { MODEL_30B, 640ull * MB },
  { MODEL_65B, 1024ull * MB },
  };
  return k_sizes;
@@ -92,7 +92,7 @@ static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1()
  { MODEL_3B, 256ull * MB },
  { MODEL_7B, 512ull * MB },
  { MODEL_13B, 512ull * MB },
- { MODEL_30B, 512ull * MB },
+ { MODEL_30B, 640ull * MB },
  { MODEL_65B, 1024ull * MB },
  };
  return k_sizes;
@@ -105,7 +105,7 @@ static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()
  { MODEL_3B, 682ull * MB },
  { MODEL_7B, 1026ull * MB },
  { MODEL_13B, 1608ull * MB },
- { MODEL_30B, 3124ull * MB },
+ { MODEL_30B, 3224ull * MB },
  { MODEL_65B, 5120ull * MB },
  };
  return k_sizes;
@@ -119,7 +119,7 @@ static const std::map<e_model, size_t> & MEM_REQ_EVAL()
  { MODEL_3B, 512ull * MB },
  { MODEL_7B, 800ull * MB },
  { MODEL_13B, 1024ull * MB },
- { MODEL_30B, 1280ull * MB },
+ { MODEL_30B, 1380ull * MB },
  { MODEL_65B, 1536ull * MB },
  };
  return k_sizes;

diff --git a/model_adapter.cpp b/model_adapter.cpp
@@ -98,7 +98,7 @@ void print_tok_vec(std::vector<float> &embd)
  //we need to read more to determine
  int32_t vocabsiz = 0;
  fin.read((char *) &vocabsiz, sizeof(int32_t));
- if(vocabsiz==4096) //actually the d_model for mpt
+ if(vocabsiz==4096 || vocabsiz==7168) //actually the d_model for mpt
  {
  fileformat = FileFormat::MPT_1;
  }