feat: decompress codes (#21)

PABannier · Oct 23, 2023 · 161f6ed · 161f6ed
1 parent f00ad69
commit 161f6ed
Show file tree

Hide file tree

Showing 6 changed files with 344 additions and 70 deletions.
diff --git a/encodec.cpp b/encodec.cpp
@@ -19,11 +19,11 @@
 
 typedef enum {
  // Run the end-to-end encoder-decoder pipeline
- full  = 0,
+ full = 0,
  // Encode an audio (encoder + quantizer encode)
- encode_only = 1,
+ encode = 1,
  // Decode an audio from a compressed representation (quantizer decode + decoder)
- decode_only = 2,
+ decode = 2,
 } encodec_run_mode;
 
 void print_tensor(struct ggml_tensor * a) {
@@ -1000,6 +1000,7 @@ struct ggml_cgraph * encodec_build_graph(
  struct encodec_context * ectx,
  std::vector<float> & inp_audio,
  const encodec_run_mode mode) {
+ assert(mode == encodec_run_mode::full || mode == encodec_run_mode::encode);
 
  const auto & model = ectx->model;
  const auto & hparams = model.hparams;
@@ -1042,10 +1043,14 @@ struct ggml_cgraph * encodec_build_graph(
  {
  ggml_build_forward_expand(gf, decoded);
  } break;
- case encodec_run_mode::encode_only:
+ case encodec_run_mode::encode:
  {
  ggml_build_forward_expand(gf, codes);
  } break;
+ case encodec_run_mode::decode:
+ {
+ return NULL;
+ } break;
  default:
  {
  fprintf(stderr, "%s: unknown run mode\n", __func__);
@@ -1062,6 +1067,77 @@ struct ggml_cgraph * encodec_build_graph(
  return gf;
 }
 
+// Build the compute graph that turns a flat list of codes back into audio
+// (quantizer decode followed by the decoder).
+//
+// `codes` is copied into an I32 tensor with ne = [N, n_q], where n_q is the number
+// of quantizers derived from the model hparams and N = codes.size() / n_q. On
+// success the input and output tensors are stored in ectx->codes and ectx->decoded.
+//
+// Only encodec_run_mode::decode is accepted by this overload. Returns NULL when the
+// mode is not supported, when `codes` is empty or its size is not a multiple of n_q,
+// or when the ggml context cannot be created.
+struct ggml_cgraph * encodec_build_graph(
+    struct encodec_context * ectx,
+    std::vector<int32_t> & codes,
+    const encodec_run_mode mode) {
+    assert(mode == encodec_run_mode::decode);
+
+    // assert() is compiled out with NDEBUG, so reject other modes explicitly; doing it
+    // before ggml_init() means the error path has no context to release
+    if (mode != encodec_run_mode::decode) {
+        fprintf(stderr, "%s: unknown run mode\n", __func__);
+        return NULL;
+    }
+
+    const auto & model   = ectx->model;
+    const auto & hparams = model.hparams;
+    const auto & allocr  = ectx->allocr;
+
+    const int n_bins     = hparams.n_bins;
+    const int sr         = hparams.sr;
+    const int bandwidth  = hparams.bandwidth;
+    const int hop_length = hparams.hop_length;
+
+    // NOTE(review): sr / hop_length is an integer division, so ceilf() receives an
+    // already truncated value. Left unchanged so that n_q stays in sync with whatever
+    // the encode path computes - confirm before changing.
+    const int frame_rate = (int) ceilf(sr / hop_length);
+    const int n_q        = get_num_quantizers_for_bandwidth(n_bins, frame_rate, bandwidth);
+
+    // guards the modulo/division below
+    if (n_q <= 0) {
+        fprintf(stderr, "%s: invalid number of quantizers (%d)\n", __func__, n_q);
+        return NULL;
+    }
+
+    if (codes.empty() || codes.size() % n_q != 0) {
+        fprintf(stderr, "%s: invalid number of codes\n", __func__);
+        return NULL;
+    }
+
+    const int N = (int) (codes.size() / n_q);
+
+    // since we are using ggml-alloc, this buffer only needs enough space to hold the
+    // ggml_tensor and ggml_cgraph structs, but not the tensor data
+    static size_t buf_size = ggml_tensor_overhead()*GGML_MAX_NODES + ggml_graph_overhead();
+    static std::vector<uint8_t> buf(buf_size);
+
+    struct ggml_init_params ggml_params = {
+        /*.mem_size   =*/ buf_size,
+        /*.mem_buffer =*/ buf.data(),
+        /*.no_alloc   =*/ true,
+    };
+
+    struct ggml_context * ctx0 = ggml_init(ggml_params);
+    if (ctx0 == NULL) {
+        fprintf(stderr, "%s: failed to initialize the ggml context\n", __func__);
+        return NULL;
+    }
+
+    struct ggml_cgraph * gf = ggml_new_graph(ctx0);
+
+    struct ggml_tensor * inp_codes = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, N, n_q);
+    ggml_allocr_alloc(allocr, inp_codes);
+
+    // avoid writing to tensors if we are only measuring the memory usage
+    if (!ggml_allocr_is_measure(allocr)) {
+        // N*n_q == codes.size() thanks to the divisibility check above; using the
+        // vector size keeps the byte count in size_t instead of multiplying two ints
+        ggml_backend_tensor_set(inp_codes, codes.data(), 0, codes.size()*ggml_element_size(inp_codes));
+    }
+
+    struct ggml_tensor * quantized = encodec_forward_quantizer_decode(ectx, ctx0, inp_codes);
+    struct ggml_tensor * decoded   = encodec_forward_decoder(ectx, ctx0, quantized);
+
+    // the mode was validated up front, so the decoded tensor is the only graph output
+    ggml_build_forward_expand(gf, decoded);
+
+    ggml_free(ctx0);
+
+    ectx->codes   = inp_codes;
+    ectx->decoded = decoded;
+
+    return gf;
+}
+
 bool encodec_eval_internal(
  struct encodec_context * ectx,
  std::vector<float> & raw_audio,
@@ -1087,6 +1163,32 @@ bool encodec_eval_internal(
  return true;
 }
 
+// Run one decode pass: reset the allocator held by `ectx`, rebuild the graph for
+// `codes`, allocate its tensors and compute it on the model backend.
+//
+// `n_threads` is only applied when the backend is the CPU backend.
+// Returns false when the graph cannot be built, true once the graph has been computed.
+bool encodec_eval_internal(
+    struct encodec_context * ectx,
+    std::vector<int32_t> & codes,
+    const int n_threads,
+    const encodec_run_mode mode) {
+    auto & model  = ectx->model;
+    auto & allocr = ectx->allocr;
+
+    // reset the allocator to free all the memory allocated during the previous inference
+    ggml_allocr_reset(allocr);
+
+    struct ggml_cgraph * gf = encodec_build_graph(ectx, codes, mode);
+    if (gf == NULL) {
+        // the graph builder reports NULL on invalid input; do not hand it to the allocator
+        fprintf(stderr, "%s: failed to build the compute graph\n", __func__);
+        return false;
+    }
+
+    // allocate tensors
+    ggml_allocr_alloc_graph(allocr, gf);
+
+    // run the computation
+    if (ggml_backend_is_cpu(model.backend)) {
+        ggml_backend_cpu_set_n_threads(model.backend, n_threads);
+    }
+    ggml_backend_graph_compute(model.backend, gf);
+
+    return true;
+}
+
+
 bool encodec_eval(
  struct encodec_context * ectx,
  std::vector<float> & raw_audio,
@@ -1125,6 +1227,44 @@ bool encodec_eval(
  return true;
 }
 
+// Decode `codes` with the model held by `ectx`.
+//
+// The compute buffer is sized first with a measure allocator, then allocated on the
+// model backend and installed in ectx->buf_compute / ectx->allocr before the graph is
+// evaluated. The elapsed wall time is stored in ectx->t_compute_ms on success.
+//
+// Returns false when the graph cannot be built or the evaluation fails.
+bool encodec_eval(
+    struct encodec_context * ectx,
+    std::vector<int32_t> & codes,
+    const int n_threads,
+    const encodec_run_mode mode) {
+    const int64_t t_start_ms = ggml_time_ms();
+
+    // allocate the compute buffer
+    {
+        // remembered so the context can be put back in its previous state if the
+        // graph cannot be built
+        const auto prev_allocr = ectx->allocr;
+
+        // alignment required by the backend
+        size_t align = ggml_backend_get_alignment(ectx->model.backend);
+        ectx->allocr = ggml_allocr_new_measure(align);
+
+        // create the graph for memory usage estimation
+        struct ggml_cgraph * gf = encodec_build_graph(ectx, codes, mode);
+        if (gf == NULL) {
+            // release the measure allocator instead of leaking it and leaving it installed
+            ggml_allocr_free(ectx->allocr);
+            ectx->allocr = prev_allocr;
+            fprintf(stderr, "%s: failed to build the compute graph\n", __func__);
+            return false;
+        }
+
+        // compute the required memory
+        size_t mem_size = ggml_allocr_alloc_graph(ectx->allocr, gf);
+
+        // recreate the allocator with the required memory
+        ggml_allocr_free(ectx->allocr);
+        ectx->buf_compute = ggml_backend_alloc_buffer(ectx->model.backend, mem_size);
+        ectx->allocr      = ggml_allocr_new_from_buffer(ectx->buf_compute);
+
+        fprintf(stderr, "%s: compute buffer size: %.2f MB\n\n", __func__, mem_size/1024.0/1024.0);
+    }
+
+    // encodec eval
+    if (!encodec_eval_internal(ectx, codes, n_threads, mode)) {
+        fprintf(stderr, "%s: failed to run encodec eval\n", __func__);
+        return false;
+    }
+
+    ectx->t_compute_ms = ggml_time_ms() - t_start_ms;
+
+    return true;
+}
+
 bool encodec_reconstruct_audio(
  struct encodec_context * ectx,
  std::vector<float> & raw_audio,
@@ -1155,7 +1295,7 @@ bool encodec_compress_audio(
  struct encodec_context * ectx,
  std::vector<float> & raw_audio,
  int n_threads) {
- if(!encodec_eval(ectx, raw_audio, n_threads, encodec_run_mode::encode_only)) {
+ if(!encodec_eval(ectx, raw_audio, n_threads, encodec_run_mode::encode)) {
  fprintf(stderr, "%s: failed to run encodec eval\n", __func__);
  return false;
  }
@@ -1177,6 +1317,32 @@ bool encodec_compress_audio(
  return true;
 }
 
+// Decode `codes` into audio samples and copy them into ectx->out_audio.
+//
+// Runs the decode-only pipeline, then reads ne[0] elements back from the decoded
+// tensor. Returns false if the evaluation fails or no decoded tensor is available.
+bool encodec_decompress_audio(
+    struct encodec_context * ectx,
+    std::vector<int32_t> & codes,
+    int n_threads) {
+    const bool eval_ok = encodec_eval(ectx, codes, n_threads, encodec_run_mode::decode);
+    if (!eval_ok) {
+        fprintf(stderr, "%s: failed to run encodec eval\n", __func__);
+        return false;
+    }
+
+    struct ggml_tensor * result = ectx->decoded;
+    if (result == NULL) {
+        fprintf(stderr, "%s: null decoded tensor\n", __func__);
+        return false;
+    }
+
+    // number of samples produced by the decoder
+    int n_samples = result->ne[0];
+
+    auto & samples = ectx->out_audio;
+    samples.resize(n_samples);
+
+    // copy the samples from the backend buffer into the host-side output vector
+    ggml_backend_tensor_get(result, samples.data(), 0, n_samples*ggml_element_size(result));
+
+    return true;
+}
+
 struct encodec_context * encodec_load_model(const std::string & model_path) {
  int64_t t_start_load_us = ggml_time_us();
 

diff --git a/encodec.h b/encodec.h
@@ -1,3 +1,13 @@
+/**
+ * @file encodec.h
+ * @brief Header file for the encodec library.
+ *
+ * This file contains the declarations of the structs and functions used in the encodec library.
+ * The library provides functionality for audio compression and decompression using an encodec model.
+ * The model consists of an encoder, a quantizer and a decoder, each with their own set of parameters.
+ * The library also provides functions for loading and freeing the model, as well as compressing and decompressing audio data.
+ *
+ */
 #pragma once
 
 #include <cmath>
@@ -173,18 +183,68 @@ struct encodec_context {
  int64_t t_compute_ms = 0;
 };
 
-struct encodec_context * encodec_load_model(const std::string & model_path);
-
-void encodec_set_target_bandwidth(struct encodec_context * ectx, int bandwidth);
-
+/**
+ * Loads an encodec model from the specified file path.
+ *
+ * @param model_path The file path to the encodec model.
+ * @return A pointer to the encodec context struct.
+ */
+struct encodec_context * encodec_load_model(
+ const std::string & model_path);
+
+/**
+ * Sets the target bandwidth for the given encodec context.
+ *
+ * @param ectx The encodec context to set the target bandwidth for.
+ * @param bandwidth The target bandwidth to set, in bits per second.
+ */
+void encodec_set_target_bandwidth(
+ struct encodec_context * ectx,
+ int bandwidth);
+
+/**
+ * Reconstructs audio from raw audio data using the specified encodec context.
+ *
+ * @param ectx The encodec context to use for reconstruction.
+ * @param raw_audio The raw audio data to reconstruct.
+ * @param n_threads The number of threads to use for reconstruction.
+ * @return True if the reconstruction was successful, false otherwise.
+ */
 bool encodec_reconstruct_audio(
  struct encodec_context * ectx,
  std::vector<float> & raw_audio,
  int n_threads);
 
+/**
+ * Compresses audio data using the specified encodec context.
+ *
+ * @param ectx The encodec context to use for compression.
+ * @param raw_audio The raw audio data to compress.
+ * @param n_threads The number of threads to use for compression.
+ * @return True if the compression was successful, false otherwise.
+ */
 bool encodec_compress_audio(
  struct encodec_context * ectx,
  std::vector<float> & raw_audio,
  int n_threads);
 
-void encodec_free(struct encodec_context * ectx);
+/**
+ * Decompresses audio data using the specified encodec context.
+ *
+ * On success the decoded samples are copied into `ectx->out_audio`, which is
+ * resized to the length of the decoded signal.
+ *
+ * @param ectx The encodec context to use for decompression.
+ * @param codes The compressed audio data to decompress, as a flat list of codes.
+ *              Its size must be a multiple of the number of quantizers derived
+ *              from the model hyperparameters (including the target bandwidth),
+ *              otherwise decompression fails.
+ * @param n_threads The number of threads to use for decompression (applied when
+ *                  the model runs on the CPU backend).
+ * @return True if the audio data was successfully decompressed, false otherwise.
+ */
+bool encodec_decompress_audio(
+ struct encodec_context * ectx,
+ std::vector<int32_t> & codes,
+ int n_threads);
+
+/**
+ * @brief Frees the memory allocated for an encodec context.
+ *
+ * @param ectx The encodec context to free.
+ */
+void encodec_free(
+ struct encodec_context * ectx);