Skip to content

Commit

Permalink
feat: decompress codes (#21)
Browse files Browse the repository at this point in the history
  • Loading branch information
PABannier committed Oct 23, 2023
1 parent f00ad69 commit 161f6ed
Show file tree
Hide file tree
Showing 6 changed files with 344 additions and 70 deletions.
176 changes: 171 additions & 5 deletions encodec.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,11 +19,11 @@

typedef enum {
// Run the end-to-end encoder-decoder pipeline
full = 0,
full = 0,
// Encode an audio (encoder + quantizer encode)
encode_only = 1,
encode = 1,
// Decode an audio from a compressed representation (quantizer decode + decoder)
decode_only = 2,
decode = 2,
} encodec_run_mode;

void print_tensor(struct ggml_tensor * a) {
Expand Down Expand Up @@ -1000,6 +1000,7 @@ struct ggml_cgraph * encodec_build_graph(
struct encodec_context * ectx,
std::vector<float> & inp_audio,
const encodec_run_mode mode) {
assert(mode == encodec_run_mode::full || mode == encodec_run_mode::encode);

const auto & model = ectx->model;
const auto & hparams = model.hparams;
Expand Down Expand Up @@ -1042,10 +1043,14 @@ struct ggml_cgraph * encodec_build_graph(
{
ggml_build_forward_expand(gf, decoded);
} break;
case encodec_run_mode::encode_only:
case encodec_run_mode::encode:
{
ggml_build_forward_expand(gf, codes);
} break;
case encodec_run_mode::decode:
{
return NULL;
} break;
default:
{
fprintf(stderr, "%s: unknown run mode\n", __func__);
Expand All @@ -1062,6 +1067,77 @@ struct ggml_cgraph * encodec_build_graph(
return gf;
}

/**
 * Builds the compute graph for the decode-only pipeline
 * (quantizer decode followed by the decoder).
 *
 * @param ectx  Encodec context. Its model hyper-parameters and allocator are read;
 *              on success its `codes` and `decoded` tensor pointers are updated.
 * @param codes Flat buffer of codes. Its size must be a non-zero multiple of the
 *              number of quantizers derived from the target bandwidth. It is copied
 *              verbatim into an I32 tensor with ne = [N, n_q].
 * @param mode  Must be encodec_run_mode::decode.
 * @return The graph on success, NULL if the hyper-parameters, the codes buffer or
 *         the run mode are invalid, or if the ggml context cannot be created.
 */
struct ggml_cgraph * encodec_build_graph(
        struct encodec_context * ectx,
        std::vector<int32_t> & codes,
        const encodec_run_mode mode) {
    assert(mode == encodec_run_mode::decode);

    // Reject unsupported modes up front (also in release builds where the assert is
    // compiled out) so that no ggml context has to be cleaned up on this path.
    if (mode != encodec_run_mode::decode) {
        fprintf(stderr, "%s: unknown run mode\n", __func__);
        return NULL;
    }

    const auto & model   = ectx->model;
    const auto & hparams = model.hparams;
    const auto & allocr  = ectx->allocr;

    const int n_bins     = hparams.n_bins;
    const int sr         = hparams.sr;
    const int bandwidth  = hparams.bandwidth;
    const int hop_length = hparams.hop_length;

    // guard the integer division below
    if (hop_length <= 0) {
        fprintf(stderr, "%s: invalid hop length\n", __func__);
        return NULL;
    }

    // NOTE(review): `sr / hop_length` is an integer division, so ceilf() is a no-op here.
    // Kept as-is on the assumption that the encode path computes the frame rate the same
    // way - confirm before switching to a floating-point division.
    const int frame_rate = (int) ceilf(sr / hop_length);
    const int n_q        = get_num_quantizers_for_bandwidth(n_bins, frame_rate, bandwidth);

    // n_q <= 0 would make the modulo/division below undefined; an empty buffer would
    // produce a zero-sized input tensor.
    if (n_q <= 0 || codes.empty() || codes.size() % static_cast<size_t>(n_q) != 0) {
        fprintf(stderr, "%s: invalid number of codes\n", __func__);
        return NULL;
    }

    const int N = static_cast<int>(codes.size() / static_cast<size_t>(n_q));

    // since we are using ggml-alloc, this buffer only needs enough space to hold the
    // ggml_tensor and ggml_cgraph structs, but not the tensor data
    static size_t buf_size = ggml_tensor_overhead()*GGML_MAX_NODES + ggml_graph_overhead();
    static std::vector<uint8_t> buf(buf_size);

    struct ggml_init_params ggml_params = {
        /*.mem_size   =*/ buf_size,
        /*.mem_buffer =*/ buf.data(),
        /*.no_alloc   =*/ true,
    };

    struct ggml_context * ctx0 = ggml_init(ggml_params);
    if (!ctx0) {
        fprintf(stderr, "%s: ggml_init() failed\n", __func__);
        return NULL;
    }

    struct ggml_cgraph * gf = ggml_new_graph(ctx0);

    struct ggml_tensor * inp_codes = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, N, n_q);
    ggml_allocr_alloc(allocr, inp_codes);

    // avoid writing to tensors if we are only measuring the memory usage
    if (!ggml_allocr_is_measure(allocr)) {
        // codes.size() == N*n_q (checked above); computing the byte count from size_t
        // avoids an int overflow in N*n_q
        ggml_backend_tensor_set(inp_codes, codes.data(), 0, codes.size()*ggml_element_size(inp_codes));
    }

    struct ggml_tensor * quantized = encodec_forward_quantizer_decode(ectx, ctx0, inp_codes);
    struct ggml_tensor * decoded   = encodec_forward_decoder(ectx, ctx0, quantized);

    ggml_build_forward_expand(gf, decoded);

    // the tensor/graph structs live in the static buffer above, so the pointers
    // stored below remain addressable after the context is released
    ggml_free(ctx0);

    ectx->codes   = inp_codes;
    ectx->decoded = decoded;

    return gf;
}

bool encodec_eval_internal(
struct encodec_context * ectx,
std::vector<float> & raw_audio,
Expand All @@ -1087,6 +1163,32 @@ bool encodec_eval_internal(
return true;
}

/**
 * Runs one decode inference: resets the allocator, builds the decode graph from
 * `codes`, allocates its tensors and computes it on the model backend.
 *
 * @param ectx      Encodec context holding the model, backend and allocator.
 * @param codes     Flat buffer of codes forwarded to encodec_build_graph().
 * @param n_threads Number of threads to use when the backend is the CPU backend.
 * @param mode      Run mode forwarded to encodec_build_graph().
 * @return True on success, false if the compute graph could not be built.
 */
bool encodec_eval_internal(
        struct encodec_context * ectx,
        std::vector<int32_t> & codes,
        const int n_threads,
        const encodec_run_mode mode) {
    auto & model  = ectx->model;
    auto & allocr = ectx->allocr;

    // reset the allocator to free all the memory allocated during the previous inference
    ggml_allocr_reset(allocr);

    struct ggml_cgraph * gf = encodec_build_graph(ectx, codes, mode);
    if (!gf) {
        // the graph builder returns NULL on invalid input; do not hand a null
        // graph to the allocator or the backend
        fprintf(stderr, "%s: failed to build the compute graph\n", __func__);
        return false;
    }

    // allocate tensors
    ggml_allocr_alloc_graph(allocr, gf);

    // run the computation
    if (ggml_backend_is_cpu(model.backend)) {
        ggml_backend_cpu_set_n_threads(model.backend, n_threads);
    }
    ggml_backend_graph_compute(model.backend, gf);

    return true;
}


bool encodec_eval(
struct encodec_context * ectx,
std::vector<float> & raw_audio,
Expand Down Expand Up @@ -1125,6 +1227,44 @@ bool encodec_eval(
return true;
}

/**
 * Decodes `codes` end to end: measures the memory needed by the decode graph,
 * allocates the compute buffer on the model backend, then runs the inference.
 *
 * On success `ectx->t_compute_ms` holds the elapsed time in milliseconds.
 *
 * @param ectx      Encodec context; its `allocr` and `buf_compute` members are (re)assigned.
 * @param codes     Flat buffer of codes to decode.
 * @param n_threads Number of threads forwarded to the evaluation.
 * @param mode      Run mode forwarded to the graph builder.
 * @return True on success, false if the graph could not be built or the evaluation failed.
 */
bool encodec_eval(
        struct encodec_context * ectx,
        std::vector<int32_t> & codes,
        const int n_threads,
        const encodec_run_mode mode) {
    const int64_t t_start_ms = ggml_time_ms();

    // allocate the compute buffer
    // NOTE(review): any allocator / compute buffer left in `ectx` by a previous call is
    // overwritten here without being released - confirm the ownership policy with the
    // context's cleanup code.
    {
        // alignment required by the backend
        size_t align = ggml_backend_get_alignment(ectx->model.backend);
        ectx->allocr = ggml_allocr_new_measure(align);

        // create the graph for memory usage estimation
        struct ggml_cgraph * gf = encodec_build_graph(ectx, codes, mode);
        if (!gf) {
            // invalid input: release the measure allocator and do not leave a
            // dangling pointer behind in the context
            ggml_allocr_free(ectx->allocr);
            ectx->allocr = NULL;

            fprintf(stderr, "%s: failed to build the compute graph\n", __func__);
            return false;
        }

        // compute the required memory
        size_t mem_size = ggml_allocr_alloc_graph(ectx->allocr, gf);

        // recreate the allocator with the required memory
        ggml_allocr_free(ectx->allocr);
        ectx->buf_compute = ggml_backend_alloc_buffer(ectx->model.backend, mem_size);
        ectx->allocr = ggml_allocr_new_from_buffer(ectx->buf_compute);

        fprintf(stderr, "%s: compute buffer size: %.2f MB\n\n", __func__, mem_size/1024.0/1024.0);
    }

    // encodec eval
    if (!encodec_eval_internal(ectx, codes, n_threads, mode)) {
        fprintf(stderr, "%s: failed to run encodec eval\n", __func__);
        return false;
    }

    ectx->t_compute_ms = ggml_time_ms() - t_start_ms;

    return true;
}

bool encodec_reconstruct_audio(
struct encodec_context * ectx,
std::vector<float> & raw_audio,
Expand Down Expand Up @@ -1155,7 +1295,7 @@ bool encodec_compress_audio(
struct encodec_context * ectx,
std::vector<float> & raw_audio,
int n_threads) {
if(!encodec_eval(ectx, raw_audio, n_threads, encodec_run_mode::encode_only)) {
if(!encodec_eval(ectx, raw_audio, n_threads, encodec_run_mode::encode)) {
fprintf(stderr, "%s: failed to run encodec eval\n", __func__);
return false;
}
Expand All @@ -1177,6 +1317,32 @@ bool encodec_compress_audio(
return true;
}

/**
 * Decodes `codes` and copies the resulting samples into `ectx->out_audio`.
 *
 * The output is resized to the first dimension of the decoded tensor and filled
 * from the backend.
 *
 * @param ectx      Encodec context used for the evaluation; receives the output audio.
 * @param codes     Codes to decompress.
 * @param n_threads Number of threads forwarded to the evaluation.
 * @return True on success, false if the evaluation failed or produced no decoded tensor.
 */
bool encodec_decompress_audio(
        struct encodec_context * ectx,
        std::vector<int32_t> & codes,
        int n_threads) {
    const bool eval_ok = encodec_eval(ectx, codes, n_threads, encodec_run_mode::decode);
    if (!eval_ok) {
        fprintf(stderr, "%s: failed to run encodec eval\n", __func__);
        return false;
    }

    struct ggml_tensor * wave = ectx->decoded;
    if (wave == NULL) {
        fprintf(stderr, "%s: null decoded tensor\n", __func__);
        return false;
    }

    // number of output samples is taken from the tensor's first dimension
    const int n_samples = wave->ne[0];

    ectx->out_audio.resize(n_samples);
    ggml_backend_tensor_get(wave, ectx->out_audio.data(), 0, n_samples*ggml_element_size(wave));

    return true;
}

struct encodec_context * encodec_load_model(const std::string & model_path) {
int64_t t_start_load_us = ggml_time_us();

Expand Down
70 changes: 65 additions & 5 deletions encodec.h
Original file line number Diff line number Diff line change
@@ -1,3 +1,13 @@
/**
* @file encodec.h
* @brief Header file for the encodec library.
*
* This file contains the declarations of the structs and functions used in the encodec library.
* The library provides functionality for audio compression and decompression using a custom model.
* The model consists of an encoder, a quantizer and a decoder, each with their own set of parameters.
* The library also provides functions for loading and freeing the model, as well as compressing and decompressing audio data.
*
*/
#pragma once

#include <cmath>
Expand Down Expand Up @@ -173,18 +183,68 @@ struct encodec_context {
int64_t t_compute_ms = 0;
};

struct encodec_context * encodec_load_model(const std::string & model_path);

void encodec_set_target_bandwidth(struct encodec_context * ectx, int bandwidth);

/**
* Loads an encodec model from the specified file path.
*
* @param model_path The file path to the encodec model.
* @return A pointer to the encodec context struct.
*/
struct encodec_context * encodec_load_model(
const std::string & model_path);

/**
* Sets the target bandwidth for the given encodec context.
*
* @param ectx The encodec context to set the target bandwidth for.
* @param bandwidth The target bandwidth to set, in bits per second.
*/
void encodec_set_target_bandwidth(
struct encodec_context * ectx,
int bandwidth);

/**
* Reconstructs audio from raw audio data using the specified encodec context.
*
* @param ectx The encodec context to use for reconstruction.
* @param raw_audio The raw audio data to reconstruct.
* @param n_threads The number of threads to use for reconstruction.
* @return True if the reconstruction was successful, false otherwise.
*/
bool encodec_reconstruct_audio(
struct encodec_context * ectx,
std::vector<float> & raw_audio,
int n_threads);

/**
* Compresses audio data using the specified encodec context.
*
* @param ectx The encodec context to use for compression.
* @param raw_audio The raw audio data to compress.
* @param n_threads The number of threads to use for compression.
* @return True if the compression was successful, false otherwise.
*/
bool encodec_compress_audio(
struct encodec_context * ectx,
std::vector<float> & raw_audio,
int n_threads);

void encodec_free(struct encodec_context * ectx);
/**
* Decompresses audio data using the specified encodec context.
*
* @param ectx The encodec context to use for decompression.
* @param codes The compressed audio data to decompress.
* @param n_threads The number of threads to use for decompression.
* @return True if the audio data was successfully decompressed, false otherwise.
*/
bool encodec_decompress_audio(
struct encodec_context * ectx,
std::vector<int32_t> & codes,
int n_threads);

/**
* @brief Frees the memory allocated for an encodec context.
*
* @param ectx The encodec context to free.
*/
void encodec_free(
struct encodec_context * ectx);
Loading

0 comments on commit 161f6ed

Please sign in to comment.