[WIP, don't merge] unity.cpp -> ggml master #719

Open · wants to merge 3 commits into master

Changes from 1 commit
fix initialization before padding
cndn committed Jan 27, 2024
commit e394ec93c741b1a19c610c7b7710d19f4ae154f0
12 changes: 10 additions & 2 deletions examples/unity/fairseq2.cpp
@@ -810,14 +810,22 @@ extern "C" ggml_tensor* StandardConformerEncoderAdaptorLayer_forward(
     ggml_tensor* residual = seqs;
     residual = LayerNorm_forward(model, prefix + ".residual_layer_norm", residual);
     residual = ggml_dup(ctx, ggml_permute(ctx, residual, 1, 0, 2, 3));
-    residual = ggml_conv_1d(ctx, model.tensors[prefix + ".residual_conv.weight"], residual, 8, 4, 1);
+    ggml_tensor* residual_conv_weight = model.tensors[prefix + ".residual_conv.weight"];
+    // ggml_tensor* from = model.tensors[prefix + ".residual_conv.weight"];
+    // FORCE_ALLOC(residual_conv_weight, ctx, ggml_new_tensor_3d(ctx, GGML_TYPE_F16, from->ne[0], from->ne[1], from->ne[2]));
+    // ggml_fp32_to_fp16_row((float*)model.tensors[prefix + ".residual_conv.weight"]->data, (ggml_fp16_t*)residual_conv_weight->data, from->ne[0] * from->ne[1] * from->ne[2]);
+    residual = ggml_conv_1d(ctx, residual_conv_weight, residual, 8, 4, 1);
     residual = ggml_dup(ctx, ggml_permute(ctx, residual, 1, 0, 2, 3));
     residual = ggml_add_inplace(ctx, ggml_repeat(ctx, model.tensors[prefix + ".residual_conv.bias"], residual), residual);
     residual = ggml_glu(ctx, residual);

     seqs = LayerNorm_forward(model, prefix + ".self_attn_layer_norm", seqs);
     seqs = ggml_dup(ctx, ggml_permute(ctx, seqs, 1, 0, 2, 3));
-    seqs = ggml_conv_1d(ctx, model.tensors[prefix + ".self_attn_conv.weight"], seqs, 8, 4, 1);
+    ggml_tensor* self_attn_conv_weight = model.tensors[prefix + ".self_attn_conv.weight"];
+    // from = model.tensors[prefix + ".self_attn_conv.weight"];
+    // FORCE_ALLOC(self_attn_conv_weight, ctx, ggml_new_tensor_3d(ctx, GGML_TYPE_F16, from->ne[0], from->ne[1], from->ne[2]));
+    // ggml_fp32_to_fp16_row((float*)model.tensors[prefix + ".self_attn_conv.weight"]->data, (ggml_fp16_t*)residual_conv_weight->data, from->ne[0] * from->ne[1] * from->ne[2]);
+    seqs = ggml_conv_1d(ctx, self_attn_conv_weight, seqs, 8, 4, 1);
     seqs = ggml_dup(ctx, ggml_permute(ctx, seqs, 1, 0, 2, 3));
     seqs = ggml_add_inplace(ctx, seqs, ggml_repeat(ctx, model.tensors[prefix + ".self_attn_conv.bias"], seqs));
     seqs = ggml_glu(ctx, seqs);
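Note on this hunk: the live change only hoists the weight lookup into a named variable; the commented-out FORCE_ALLOC / ggml_fp32_to_fp16_row lines sketch an abandoned attempt to convert the F32 checkpoint weight to F16 before the convolution (ggml's conv paths generally expect F16 kernels). The second commented-out conversion also writes into residual_conv_weight->data where self_attn_conv_weight->data was presumably intended. A minimal sketch of that conversion, assuming contiguous F32 weights (the helper is ours, not part of the PR):

// Hypothetical helper: make an F16 copy of a contiguous F32 weight tensor so
// it can be fed to ggml_conv_1d.
static ggml_tensor* weight_to_fp16(ggml_context* ctx, const ggml_tensor* src) {
    ggml_tensor* dst = ggml_new_tensor_3d(ctx, GGML_TYPE_F16,
                                          src->ne[0], src->ne[1], src->ne[2]);
    // convert all ne0*ne1*ne2 values in one pass, as the commented code does
    ggml_fp32_to_fp16_row((const float*) src->data,
                          (ggml_fp16_t*) dst->data,
                          ggml_nelements(src));
    return dst;
}

The call site would then read ggml_conv_1d(ctx, weight_to_fp16(ctx, residual_conv_weight), residual, 8, 4, 1), with 8, 4, 1 being ggml_conv_1d's stride, padding, and dilation parameters.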
7 changes: 6 additions & 1 deletion examples/unity/lib/unity_lib.cpp
@@ -68,7 +68,7 @@ extern "C" Result unity_eval_speech(fairseq2_model& model, std::vector<float>& d
     Result result;
     // The ctx_size_mb mostly depends of input length and model dim.
     int ctx_size_mb = opts.mem_mb;
-    auto encoder_buf = std::vector<uint8_t>(8 * 1024 * 1024); // this is only for tensor metadata, it can be small
+    auto encoder_buf = std::vector<uint8_t>(80 * 1024 * 1024); // this is only for tensor metadata, it can be small
     auto encoder_fwd_buf = std::vector<uint8_t>(ctx_size_mb * 1024 * 1024);
     ggml_allocr* fwd_alloc = ggml_allocr_new(encoder_fwd_buf.data(), encoder_fwd_buf.capacity(), 8);
     int tgt_lang_idx;
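Review note: the metadata context grows tenfold here, from 8 MB to 80 MB, while the trailing comment still claims "it can be small"; the comment is now stale, since the tensor structs for this graph evidently outgrew 8 MB. For reference, a hedged sketch of the two-buffer pattern in play (includes, field comments, and the no_alloc assumption are ours; sizes illustrative):

#include <cstdint>
#include <vector>
#include "ggml.h"
#include "ggml-alloc.h"

const int ctx_size_mb = 512;  // e.g. opts.mem_mb
// A no_alloc context holds only ggml_tensor structs; actual tensor data is
// handed out by a graph allocator from a separate flat buffer.
std::vector<uint8_t> meta_buf(80 * 1024 * 1024);
ggml_init_params iparams = {
    /*.mem_size   =*/ meta_buf.size(),
    /*.mem_buffer =*/ meta_buf.data(),
    /*.no_alloc   =*/ true,   // assumption: data pointers come from the allocator
};
ggml_context* ctx = ggml_init(iparams);

std::vector<uint8_t> fwd_buf((size_t) ctx_size_mb * 1024 * 1024);
ggml_allocr* fwd_alloc = ggml_allocr_new(fwd_buf.data(), fwd_buf.size(), /*alignment=*/8);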
@@ -97,6 +97,11 @@ extern "C" Result unity_eval_speech(fairseq2_model& model, std::vector<float>& d
     ggml_graph_compute_with_ctx(model.ctx, gf, n_threads);
     // encoder_output is valid until we call `ggml_allocr_reset(fwd_alloc)`
     ggml_tensor* encoder_output = gf->nodes[gf->n_nodes - 1];
+    // for(int i = 0; i < 100; i++) {
+    //     float* ptr = static_cast<float*>(encoder_output->data);
+    //     printf("%4f ", ptr[i]);
+    // }
+    // exit(0);

     // Beam search decoding
     const Hypothesis* hypo = unity_decode(model, opts, tgt_lang_idx, encoder_output, n_threads);
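The commented-out block above is a debug probe that dumps the first 100 encoder outputs and exits. A reusable version of the same probe, assuming a contiguous F32 tensor on the CPU (the helper name is ours; the original's "%4f" is almost certainly meant to be "%.4f"):

// Hypothetical debug helper: print the first n values of a contiguous F32
// tensor. Only valid while the data is alive, i.e. before ggml_allocr_reset.
static void print_tensor_head(const ggml_tensor* t, int64_t n) {
    const float* p = (const float*) t->data;
    const int64_t count = ggml_nelements(t);
    for (int64_t i = 0; i < n && i < count; ++i) {
        printf("%.4f ", p[i]);
    }
    printf("\n");
}

Called as print_tensor_head(encoder_output, 100), it reproduces the commented loop without the hard exit(0).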
16 changes: 0 additions & 16 deletions examples/unity/model_loader.cpp
@@ -137,25 +137,18 @@ void model_loader::load_vocab(llama_vocab& vocab, std::ifstream &fin)
     if (vocab_size == 0) {
         return;
     }
-    printf("load vocab 0\n");

     vocab.token_to_id.reserve(vocab_size);
     vocab.id_to_token.reserve(vocab_size);

     std::string packed_vocab = get_name(fin);
     std::int64_t ctx_size = vocab_size * sizeof(float) + vocab_size + 2 * ggml_tensor_overhead();
     ctx_size *= 2;
-    printf("load vocab 1\n");
     ggml_context* ctx = ggml_init(ggml_init_params{static_cast<size_t>(ctx_size), nullptr, false});
-    printf("load vocab 1.1\n");
     ggml_tensor* lengths_tensor = load_tensor_value(fin, ctx, true);
-    printf("load vocab 1.2\n");
     std::int8_t* lengths = (std::int8_t*)lengths_tensor->data;
-    printf("load vocab 1.3\n");
     ggml_tensor* scores_tensor = load_tensor_value(fin, ctx, true);
-    printf("load vocab 1.4\n");
     float* scores = ggml_get_data_f32(scores_tensor);
-    printf("load vocab 2\n");

     int64_t offset = 0;
     for (int i = 0; i < vocab_size; ++i) {
@@ -165,7 +158,6 @@ void model_loader::load_vocab(llama_vocab& vocab, std::ifstream &fin)
         vocab.id_to_token.push_back({word, scores[i], LLAMA_TOKEN_TYPE_NORMAL});
         offset += lengths[i] + 1;
     }
-    printf("load vocab 3\n");
     // Since we copied lengths and scores, we don't need the context anymore.
     ggml_free(ctx);

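For orientation, the surrounding code implies a simple packed layout for the vocabulary section: one int8 length per token (lengths_tensor), one float score per token (scores_tensor), and all token strings concatenated into packed_vocab with a one-byte separator after each word, which is why the loop advances by lengths[i] + 1. The ctx_size estimate matches this reading: vocab_size floats for scores, vocab_size bytes for lengths, overhead for the two tensors, doubled for slack. A standalone sketch of the unpacking under those assumptions (types and names ours):

#include <cstdint>
#include <string>
#include <vector>

struct VocabEntry { std::string word; float score; };

// Sketch: rebuild (word, score) pairs from the packed vocab blob.
static std::vector<VocabEntry> unpack_vocab(const std::string& packed,
                                            const std::int8_t* lengths,
                                            const float* scores,
                                            int vocab_size) {
    std::vector<VocabEntry> out;
    out.reserve(vocab_size);
    std::int64_t offset = 0;
    for (int i = 0; i < vocab_size; ++i) {
        out.push_back({packed.substr(offset, lengths[i]), scores[i]});
        offset += lengths[i] + 1;  // +1 skips the separator byte after each word
    }
    return out;
}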
@@ -224,22 +216,14 @@ model_loader::get_name(std::ifstream& fin)

extern "C" int load_fairseq2_ggml_file(fairseq2_model& model, const char* fname) {
model_loader loader;
printf("here 1\n");
assert_endianness();
printf("here 2\n");
auto fin = open_ggml_file(fname);
printf("here 3\n");
loader.load_hparams(model.hparams, fin);
printf("here 4\n");
loader.load_hparams(model.layer_config, fin);
printf("here 4.2\n");
loader.load_vocab(model.vocab, fin);
printf("here 4.7\n");
loader.load_model_weights(model, fin);
printf("here 5\n");

// load optional target vocabulary in cases of bilingual models
loader.load_vocab(model.tgt_vocab, fin);
printf("here 6\n");
return 0;
}
100 changes: 44 additions & 56 deletions examples/unity/scripts/ggml.py
@@ -55,14 +55,14 @@
 import ctypes
 import pathlib
 import importlib.resources
-from typing import Callable, List, Optional, Sequence, Union
+from pathlib import Path
+from typing import List, Optional, Sequence, Union, Callable
 from typing_extensions import TypeAlias


 # Load the library
-def load_shared_library(module_name: str, lib_base_name: str):
+def load_shared_library(base_path: Path, lib_base_name: str):
     # Construct the paths to the possible shared library names
-    base_path = pathlib.Path(__file__).parent.resolve()
     # Searching for the library in the current directory under the name "libggml" (default name
     # for ggml) and "ggml" (default name for this repo)
     lib_names: List[str] = [
@@ -71,41 +71,30 @@ def load_shared_library(module_name: str, lib_base_name: str):
f"{lib_base_name}.dll",
]

path: Optional[pathlib.Path] = None

for lib_name in lib_names:
try:
print(module_name)
print(lib_name)
with importlib.resources.path(module_name, lib_name) as p:
if os.path.exists(p):
path = p
break
except FileNotFoundError:
pass

if path is None:
raise FileNotFoundError(
f"Shared library with base name '{lib_base_name}' not found"
)

cdll_args = dict() # type: ignore
# Add the library directory to the DLL search path on Windows (if needed)
if sys.platform == "win32" and sys.version_info >= (3, 8):
os.add_dll_directory(str(base_path))
cdll_args["winmode"] = 0

# Try to load the shared library, handling potential errors
try:
return ctypes.CDLL(str(path), **cdll_args)
except Exception as e:
raise RuntimeError(f"Failed to load shared library '{path}': {e}")
for lib_name in lib_names:
# Try to load the shared library, handling potential errors
path = base_path / lib_name
if not path.exists():
continue
try:
return ctypes.CDLL(str(path), **cdll_args)
except Exception as e:
raise RuntimeError(f"Failed to load shared library '{path}': {e}")

raise FileNotFoundError(
f"Shared library with base name '{lib_base_name}' not found in {base_path}"
)

module_name = "ggml"
lib_base_name = "ggml"
lib = load_shared_library(module_name, lib_base_name)

base_path = pathlib.Path(__file__).parent.resolve() / "../../../build/examples/unity"
lib_base_name = "fairseq2_cpp"
lib = load_shared_library(base_path, lib_base_name)

#####################################################
# GGML Utility Types
@@ -231,7 +220,6 @@ def ggml_fp32_to_fp16_row(
 ]
 lib.ggml_fp32_to_fp16_row.restype = None

-
 # struct ggml_context;
 ggml_context_p = ctypes.c_void_p
 """Opaque pointer to a ggml_context.
@@ -7330,29 +7318,29 @@ def ggml_quantize_chunk(

 # // These are needed for IQ2_XS and IQ2_XXS quantizations
 # GGML_API void ggml_init_iq2_quantization(enum ggml_type type);
-def ggml_init_iq2_quantization(
-    type: Union[ctypes.c_int, int],
-):
-    return lib.ggml_init_iq2_quantization(type)
+# def ggml_init_iq2_quantization(
+#     type: Union[ctypes.c_int, int],
+# ):
+#     return lib.ggml_init_iq2_quantization(type)


-lib.ggml_init_iq2_quantization.argtypes = [
-    ctypes.c_int,
-]
-lib.ggml_init_iq2_quantization.restype = None
+# lib.ggml_init_iq2_quantization.argtypes = [
+#     ctypes.c_int,
+# ]
+# lib.ggml_init_iq2_quantization.restype = None


 # GGML_API void ggml_deinit_iq2_quantization(enum ggml_type type);
-def ggml_deinit_iq2_quantization(
-    type: Union[ctypes.c_int, int],
-):
-    return lib.ggml_deinit_iq2_quantization(type)
+# def ggml_deinit_iq2_quantization(
+#     type: Union[ctypes.c_int, int],
+# ):
+#     return lib.ggml_deinit_iq2_quantization(type)


-lib.ggml_deinit_iq2_quantization.argtypes = [
-    ctypes.c_int,
-]
-lib.ggml_deinit_iq2_quantization.restype = None
+# lib.ggml_deinit_iq2_quantization.argtypes = [
+#     ctypes.c_int,
+# ]
+# lib.ggml_deinit_iq2_quantization.restype = None

 # //
 # // Importance matrix
@@ -7364,18 +7352,18 @@


 # GGML_API void ggml_set_imatrix_collection(ggml_collect_imatrix_t imatrix_collect);
-def ggml_set_imatrix_collection(
-    imatrix_collect: Callable[
-        [ggml_tensor_p, ggml_tensor_p], None
-    ]  # TODO: Fix type signature here
-):
-    return lib.ggml_set_imatrix_collection(imatrix_collect)
+# def ggml_set_imatrix_collection(
+#     imatrix_collect: Callable[
+#         [ggml_tensor_p, ggml_tensor_p], None
+#     ]  # TODO: Fix type signature here
+# ):
+#     return lib.ggml_set_imatrix_collection(imatrix_collect)


-lib.ggml_set_imatrix_collection.argtypes = [
-    ggml_collect_imatrix_t,
-]
-lib.ggml_set_imatrix_collection.restype = None
+# lib.ggml_set_imatrix_collection.argtypes = [
+#     ggml_collect_imatrix_t,
+# ]
+# lib.ggml_set_imatrix_collection.restype = None

 # //
 # // gguf
21 changes: 10 additions & 11 deletions examples/unity/unity.cpp
@@ -99,22 +99,21 @@ int main(int argc, char ** argv) {

     // The ctx_size_mb mostly depends of input length and model dim.
     int ctx_size_mb = params.opts.mem_mb;
     auto encoder_buf = std::vector<uint8_t>(8 * 1024 * 1024); // Only tensor metadata goes in there
     auto encoder_fwd_buf = std::vector<uint8_t>(ctx_size_mb * 1024 * 1024 / 2);

     while (true) {
         // S2ST
         if (!params.text) {
-            std::string input;
-            std::cout << "\nEnter audio_path and tgt_lang, separated by space (or 'exit' to quit):\n";
-            std::getline(std::cin, input);
-            if (input == "exit") {
-                break;
-            }
-            std::istringstream iss(input);
-            std::string audio_path;
-            std::string tgt_lang;
-            iss >> audio_path >> tgt_lang;
+            // std::string input;
+            // std::cout << "\nEnter audio_path and tgt_lang, separated by space (or 'exit' to quit):\n";
+            // std::getline(std::cin, input);
+            // if (input == "exit") {
+            //     break;
+            // }
+            // std::istringstream iss(input);
+            std::string audio_path = "/private/home/yilinyang/tmp/LJ037-0171_sr16k.wav";
+            std::string tgt_lang = "eng";
+            // iss >> audio_path >> tgt_lang;
             if (audio_path == "-") {
                 audio_path = "/proc/self/fd/0";
             }
4 changes: 4 additions & 0 deletions include/ggml/ggml.h
@@ -450,6 +450,10 @@ extern "C" {
     GGML_OP_DEPTHWISE_CONV_STAGE_0, // internal
     GGML_OP_DEPTHWISE_CONV_STAGE_1, // internal
     GGML_OP_DEPTHWISE_CONV_STAGE_2, // internal
+
+    GGML_OP_CONV_1D_GENERIC_STAGE_0,
+    GGML_OP_CONV_1D_GENERIC_STAGE_1,
+
     GGML_OP_UPSCALE, // nearest interpolate
     GGML_OP_PAD,
     GGML_OP_ARGSORT,
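The two new op codes follow the STAGE_0/STAGE_1 naming of the depthwise ops above them, which suggests the generic 1-D convolution is likewise split into an unfold stage and a reduction stage (the kernels themselves are not part of this hunk, so this is our reading). A self-contained sketch of that decomposition for a single channel, with hypothetical helper names:

#include <vector>

// Stage 0 (im2col-style): for each output position, gather the k input taps
// that feed it, honoring stride and zero padding.
static std::vector<float> conv1d_stage0(const std::vector<float>& x,
                                        int k, int stride, int pad) {
    const int n = (int) x.size();
    const int n_out = (n + 2 * pad - k) / stride + 1;
    std::vector<float> cols((size_t) n_out * k, 0.0f);
    for (int o = 0; o < n_out; ++o) {
        for (int t = 0; t < k; ++t) {
            const int src = o * stride + t - pad;
            if (src >= 0 && src < n) cols[(size_t) o * k + t] = x[src];
        }
    }
    return cols;
}

// Stage 1: reduce each gathered column against the kernel (this becomes a
// matrix multiply in the multi-channel case).
static std::vector<float> conv1d_stage1(const std::vector<float>& cols,
                                        const std::vector<float>& w) {
    const int k = (int) w.size();
    const int n_out = (int) (cols.size() / k);
    std::vector<float> y(n_out, 0.0f);
    for (int o = 0; o < n_out; ++o)
        for (int t = 0; t < k; ++t)
            y[o] += cols[(size_t) o * k + t] * w[t];
    return y;
}

With stride 8 and padding 4, as in the ggml_conv_1d calls in fairseq2.cpp, conv1d_stage1(conv1d_stage0(x, (int) w.size(), 8, 4), w) computes the same downsampling convolution per channel.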