demo: add main example (#8)

PABannier · Oct 1, 2023 · 8f5c964 · 8f5c964
1 parent 2a23eed
commit 8f5c964
Show file tree

Hide file tree

Showing 8 changed files with 6,771 additions and 119 deletions.
diff --git a/.gitignore b/.gitignore
@@ -4,7 +4,6 @@ main.dSYM
 encodec
 *.o
 *.th
-main
 .vscode/
 
 build/
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -16,6 +16,8 @@ else()
  set(ENCODEC_STANDALONE OFF)
 endif()
 
+option(ENCODEC_BUILD_EXAMPLES "encodec: build examples" ${ENCODEC_STANDALONE})
+
 # Build libraries
 
 set(ENCODEC_LIB encodec.cpp)
@@ -29,6 +31,10 @@ add_library(
  encodec.h
 )
 
+if (ENCODEC_BUILD_EXAMPLES)
+ add_subdirectory(examples)
+endif()
+
 target_link_libraries(${ENCODEC_LIB} PUBLIC ggml)
 target_include_directories(${ENCODEC_LIB} PUBLIC .)
 target_compile_features(${ENCODEC_LIB} PUBLIC cxx_std_11)
diff --git a/encodec.cpp b/encodec.cpp
@@ -14,106 +14,6 @@
 
 static const size_t TENSOR_ALIGNMENT = 32;
 
-// res + downsample block at some ratio
-struct encodec_encoder_block {
- // conv1
- struct ggml_tensor * conv_1_w;
- struct ggml_tensor * conv_1_b;
-
- // conv2
- struct ggml_tensor * conv_2_w;
- struct ggml_tensor * conv_2_b;
-
- // shortcut
- struct ggml_tensor * conv_sc_w;
- struct ggml_tensor * conv_sc_b;
-
- // downsampling layers
- struct ggml_tensor * ds_conv_w;
- struct ggml_tensor * ds_conv_b;
-};
-
-struct encodec_lstm {
- struct ggml_tensor * l0_ih_w;
- struct ggml_tensor * l0_hh_w;
-
- struct ggml_tensor * l0_ih_b;
- struct ggml_tensor * l0_hh_b;
-
- struct ggml_tensor * l1_ih_w;
- struct ggml_tensor * l1_hh_w;
-
- struct ggml_tensor * l1_ih_b;
- struct ggml_tensor * l1_hh_b;
-};
-
-struct encodec_encoder {
- struct ggml_tensor * init_conv_w;
- struct ggml_tensor * init_conv_b;
-
- encodec_lstm lstm;
-
- struct ggml_tensor * final_conv_w;
- struct ggml_tensor * final_conv_b;
-
- std::vector<encodec_encoder_block> blocks;
-};
-
-struct encodec_quant_block {
- struct ggml_tensor * inited;
- struct ggml_tensor * cluster_size;
- struct ggml_tensor * embed;
- struct ggml_tensor * embed_avg;
-};
-
-struct encodec_quantizer {
- std::vector<encodec_quant_block> blocks;
-};
-
-struct encodec_decoder_block {
- //upsampling layers
- struct ggml_tensor * us_conv_w;
- struct ggml_tensor * us_conv_b;
-
- // conv1
- struct ggml_tensor * conv_1_w;
- struct ggml_tensor * conv_1_b;
-
- // conv2
- struct ggml_tensor * conv_2_w;
- struct ggml_tensor * conv_2_b;
-
- // shortcut
- struct ggml_tensor * conv_sc_w;
- struct ggml_tensor * conv_sc_b;
-};
-
-struct encodec_decoder {
- struct ggml_tensor * init_conv_w;
- struct ggml_tensor * init_conv_b;
-
- encodec_lstm lstm;
-
- struct ggml_tensor * final_conv_w;
- struct ggml_tensor * final_conv_b;
-
- std::vector<encodec_decoder_block> blocks;
-};
-
-struct encodec_model {
- encodec_hparams hparams;
-
- encodec_encoder encoder;
- encodec_quantizer quantizer;
- encodec_decoder decoder;
-
- // context
- struct ggml_context * ctx;
- int n_loaded;
-
- std::map<std::string, struct ggml_tensor *> tensors;
-};
-
 template<typename T>
 static void read_safe(std::ifstream& infile, T& dest) {
  infile.read((char*)& dest, sizeof(T));
@@ -137,7 +37,12 @@ static void ggml_disconnect_node_from_graph(ggml_tensor * t) {
  }
 }
 
-static void encodec_sigmoid_impl(struct ggml_tensor * dst, const struct ggml_tensor * src, int ith, int nth, void * userdata) {
+static void encodec_sigmoid_impl(
+ struct ggml_tensor * dst,
+ const struct ggml_tensor * src,
+ int ith,
+ int nth,
+ void * userdata) {
  GGML_ASSERT(userdata == NULL);
  GGML_ASSERT(ggml_are_same_shape(dst, src));
  GGML_ASSERT(ggml_is_contiguous(dst));
@@ -208,11 +113,11 @@ static struct ggml_tensor * unpad_1d(ggml_context * ctx0, ggml_tensor * inp, int
 }
 
 static struct ggml_tensor * strided_conv_1d(
- ggml_context * ctx0,
- ggml_tensor * inp,
- ggml_tensor * conv_w,
- ggml_tensor * conv_b,
- int stride) {
+  ggml_context * ctx0,
+  ggml_tensor * inp,
+  ggml_tensor * conv_w,
+  ggml_tensor * conv_b,
+  int stride) {
  int kernel_size = conv_w->ne[0];
  int padding_total = kernel_size - stride;
  int extra_padding = get_extra_padding_for_conv_1d(inp, kernel_size, stride, padding_total);
@@ -230,11 +135,11 @@ static struct ggml_tensor * strided_conv_1d(
 
 static struct ggml_tensor * forward_pass_lstm_unilayer(
  struct ggml_context * ctx0,
- struct ggml_tensor * inp,
- struct ggml_tensor * weight_ih,
- struct ggml_tensor * weight_hh,
- struct ggml_tensor * bias_ih,
- struct ggml_tensor * bias_hh) {
+  struct ggml_tensor * inp,
+  struct ggml_tensor * weight_ih,
+  struct ggml_tensor * weight_hh,
+  struct ggml_tensor * bias_ih,
+  struct ggml_tensor * bias_hh) {
 
  const int input_dim = inp->ne[1];
  const int hidden_dim = weight_ih->ne[1]/4;
@@ -624,7 +529,7 @@ bool encodec_model_load(const std::string& fname, encodec_model& model) {
 
  infile.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));
 
- printf("%48s - [%5d, %5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ne[2], ftype == 0 ? "float" : "f16", ggml_nbytes(tensor)/1024.0/1024.0);
+ // printf("%48s - [%5d, %5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ne[2], ftype == 0 ? "float" : "f16", ggml_nbytes(tensor)/1024.0/1024.0);
 
  total_size += ggml_nbytes(tensor);
  model.n_loaded++;
@@ -889,14 +794,12 @@ static struct ggml_cgraph * encodec_build_graph(
  return gf;
 }
 
-static bool encodec_model_eval(
- std::vector<float> & raw_audio,
+bool encodec_model_eval(
  encodec_context & ectx,
+ std::vector<float> & raw_audio,
  int n_threads) {
  const int64_t t_start_ms = ggml_time_ms();
 
- fprintf(stderr, "%s: raw audio (t=%zu)\n", __func__, raw_audio.size());
-
  static const size_t buf_size = 256u*1024*1024;
 
  if (ectx.ctx_audio) {
@@ -951,3 +854,23 @@ static bool encodec_model_eval(
 
  return true;
 }
+
+struct encodec_context encodec_new_context_with_model(encodec_model & model) { // Creates a context for `model`.
+ // NOTE(review): model is taken by reference; if the context stores that reference, model must outlive it -- confirm.
+ return encodec_context(model);
+}
+
+struct encodec_model encodec_load_model_from_file(std::string fname) { // Loads the model stored at fname; terminates the process if loading fails.
+ encodec_model model;
+ if (!encodec_model_load(fname, model)) {
+ fprintf(stderr, "%s: failed to load model\n", __func__);
+ exit(1); // non-zero status: exit(0) would report success to the calling shell or script
+ }
+ return model;
+}
+
+void encodec_free(encodec_context & ectx) { // Releases the ggml context held in ectx.ctx_audio, if there is one.
+ if (!ectx.ctx_audio) return;
+ ggml_free(ectx.ctx_audio);
+ ectx.ctx_audio = NULL; // clear the handle so a repeated encodec_free() does not pass a dangling pointer to ggml_free
+}
diff --git a/encodec.h b/encodec.h
@@ -13,7 +13,7 @@
 #define ENCODEC_FILE_MAGIC 'ggml'
 #define ENCODEC_FILE_VERSION 1
 
-static const size_t MB = 4*1024*1024;
+static const size_t MB = 1024*1024;
 
 struct encodec_hparams {
  int32_t in_channels = 1;
@@ -33,7 +33,105 @@ struct encodec_hparams {
  int32_t sr = 24000;
 };
 
-struct encodec_model;
+// Encoder block: residual part (conv1, conv2, shortcut) plus a downsampling conv at some ratio.
+struct encodec_encoder_block {
+ // conv1 (presumably _w = weights and _b = bias, here and below -- TODO confirm against the loader)
+ struct ggml_tensor * conv_1_w;
+ struct ggml_tensor * conv_1_b;
+
+ // conv2
+ struct ggml_tensor * conv_2_w;
+ struct ggml_tensor * conv_2_b;
+
+ // shortcut
+ struct ggml_tensor * conv_sc_w;
+ struct ggml_tensor * conv_sc_b;
+
+ // downsampling layers
+ struct ggml_tensor * ds_conv_w;
+ struct ggml_tensor * ds_conv_b;
+};
+
+struct encodec_lstm { // LSTM parameter tensors; l0_ / l1_ presumably mean layer 0 / layer 1 -- TODO confirm
+ struct ggml_tensor * l0_ih_w; // *_ih_w / *_hh_w: presumably input-hidden / hidden-hidden weights -- confirm
+ struct ggml_tensor * l0_hh_w;
+
+ struct ggml_tensor * l0_ih_b; // *_ih_b / *_hh_b: presumably the matching biases -- confirm
+ struct ggml_tensor * l0_hh_b;
+
+ struct ggml_tensor * l1_ih_w;
+ struct ggml_tensor * l1_hh_w;
+
+ struct ggml_tensor * l1_ih_b;
+ struct ggml_tensor * l1_hh_b;
+};
+
+struct encodec_encoder { // tensors of the encoder: initial conv, LSTM, final conv and the list of encoder blocks
+ struct ggml_tensor * init_conv_w;
+ struct ggml_tensor * init_conv_b;
+
+ encodec_lstm lstm;
+
+ struct ggml_tensor * final_conv_w;
+ struct ggml_tensor * final_conv_b;
+
+ std::vector<encodec_encoder_block> blocks; // res + downsample blocks, see encodec_encoder_block
+};
+
+struct encodec_quant_block { // tensors of one quantizer block; roles below are inferred from the names only
+ struct ggml_tensor * inited;
+ struct ggml_tensor * cluster_size;
+ struct ggml_tensor * embed; // presumably the codebook embeddings -- TODO confirm
+ struct ggml_tensor * embed_avg;
+};
+
+struct encodec_quantizer { // the quantizer is stored as a list of encodec_quant_block entries
+ std::vector<encodec_quant_block> blocks;
+};
+
+struct encodec_decoder_block { // decoder counterpart of encodec_encoder_block: upsampling conv plus conv1, conv2, shortcut
+ // upsampling layers (presumably _w = weights and _b = bias, here and below -- TODO confirm)
+ struct ggml_tensor * us_conv_w;
+ struct ggml_tensor * us_conv_b;
+
+ // conv1
+ struct ggml_tensor * conv_1_w;
+ struct ggml_tensor * conv_1_b;
+
+ // conv2
+ struct ggml_tensor * conv_2_w;
+ struct ggml_tensor * conv_2_b;
+
+ // shortcut
+ struct ggml_tensor * conv_sc_w;
+ struct ggml_tensor * conv_sc_b;
+};
+
+struct encodec_decoder { // tensors of the decoder: initial conv, LSTM, final conv and the list of decoder blocks
+ struct ggml_tensor * init_conv_w;
+ struct ggml_tensor * init_conv_b;
+
+ encodec_lstm lstm;
+
+ struct ggml_tensor * final_conv_w;
+ struct ggml_tensor * final_conv_b;
+
+ std::vector<encodec_decoder_block> blocks; // upsampling blocks, see encodec_decoder_block
+};
+
+struct encodec_model { // hyper-parameters, encoder/quantizer/decoder tensors, and the ggml context they are tied to
+ encodec_hparams hparams;
+
+ encodec_encoder encoder;
+ encodec_quantizer quantizer;
+ encodec_decoder decoder;
+
+ // context; null by default so a default-constructed model never carries an indeterminate pointer
+ struct ggml_context * ctx = nullptr;
+ int n_loaded = 0; // incremented by the loader per tensor read; starts at zero instead of an indeterminate value
+
+ std::map<std::string, struct ggml_tensor *> tensors; // tensors keyed by name
+};
 
 struct encodec_context {
  encodec_context(encodec_model & model) : model(model) {}
@@ -62,3 +160,15 @@ struct encodec_context {
  // statistics
  int64_t t_compute_ms = 0;
 };
+
+
+struct encodec_model encodec_load_model_from_file(std::string fname);
+
+struct encodec_context encodec_new_context_with_model(encodec_model & model);
+
+bool encodec_model_eval(
+ encodec_context & ectx,
+ std::vector<float> & raw_audio,
+ int n_threads);
+
+void encodec_free(encodec_context & ectx);
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
@@ -0,0 +1,3 @@
+include_directories(${CMAKE_CURRENT_SOURCE_DIR}) # puts examples/ on the include path of the targets added below
+
+add_subdirectory(main) # the example program lives in examples/main
diff --git a/examples/main/CMakeLists.txt b/examples/main/CMakeLists.txt
@@ -0,0 +1,11 @@
+set(TARGET main) # name of the example executable target
+
+add_executable(${TARGET} main.cpp dr_wav.h)
+
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE encodec.cpp ${CMAKE_THREAD_LIBS_INIT}) # "encodec.cpp" is the library target name (ENCODEC_LIB); NOTE(review): CMAKE_THREAD_LIBS_INIT needs find_package(Threads), not visible here -- confirm
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
+
+if(MSVC)
+ target_compile_definitions(${TARGET} PRIVATE -D_CRT_SECURE_NO_WARNINGS=1) # silences MSVC deprecation warnings for C runtime functions
+endif()