passing tokenizer test

PABannier · PABannier · Feb 13, 2024 · Oct 26, 2023 · Oct 26, 2023 · Oct 26, 2023
commit 8ae7dc5a79a3c1e31ccda3ab557f9844c5bc74ee
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -37,10 +37,10 @@ if (BARK_BUILD_EXAMPLES)
  add_subdirectory(examples)
 endif()
 
-# if (BARK_BUILD_TESTS)
-#  include(CTest)
-#  add_subdirectory(tests)
-# endif ()
+if (BARK_BUILD_TESTS)
+ include(CTest)
+ add_subdirectory(tests)
+endif ()
 
 target_link_libraries(${BARK_LIB} PUBLIC ggml)
 # target_link_libraries(${BARK_LIB} PUBLIC encodec)

diff --git a/bark.cpp b/bark.cpp
@@ -220,7 +220,7 @@ static bark_token gpt_sample(
  return res;
 }
 
-static bool bark_vocab_load(
+bool bark_vocab_load(
  const std::string & fname,
  bark_vocab * vocab,
  int32_t expected_size) {
@@ -308,7 +308,7 @@ static std::string strip_accents(const std::string & in_str) {
  return out_str;
 }
 
-static void bert_tokenize(
+void bert_tokenize(
  const bark_vocab * vocab,
  const char * text,
  int32_t * tokens,
@@ -1808,7 +1808,7 @@ static bool bark_eval_fine_encoder(struct bark_context * bctx, int n_threads) {
  return true;
 }
 
-static bool bark_forward_text_encoder(struct bark_context * bctx, int n_threads) {
+bool bark_forward_text_encoder(struct bark_context * bctx, int n_threads) {
  const int64_t t_main_start_us = ggml_time_us();
 
  auto & model = bctx->model.text_model;

diff --git a/bark.h b/bark.h
@@ -217,3 +217,44 @@ BARK_API bool bark_model_quantize(
  */
 BARK_API void bark_free(
  struct bark_context * bctx);
+
+/**
+ * Loads a vocabulary from a file.
+ *
+ * @param fname The name of the file to load the vocabulary from.
+ * @param vocab A pointer to the bark_vocab struct to store the loaded vocabulary in.
+ * @param expected_size The expected size of the vocabulary.
+ * @return true if the vocabulary was loaded successfully, false otherwise.
+ */
+bool bark_vocab_load(
+ const std::string & fname,
+ bark_vocab * vocab,
+ int32_t expected_size);
+
+/**
+ * Tokenizes the input text using the provided vocabulary.
+ *
+ * @param vocab Pointer to the vocabulary to use for tokenization.
+ * @param text The input text to tokenize.
+ * @param tokens Pointer to an array where the resulting tokens will be stored.
+ * @param n_tokens Pointer to an integer where the number of resulting tokens will be stored.
+ * @param n_max_tokens The maximum number of tokens that can be stored in the tokens array.
+ */
+void bert_tokenize(
+ const bark_vocab * vocab,
+ const char * text,
+ int32_t * tokens,
+ int32_t * n_tokens,
+ int32_t n_max_tokens);
+
+
+/**
+ * Runs the forward pass of the text encoder for the given bark context.
+ * 
+ * @param bctx A pointer to the bark context struct.
+ * @param n_threads The number of threads to use for encoding.
+ * @return Returns true if the encoding was successful, false otherwise.
+ */
+bool bark_forward_text_encoder(
+ struct bark_context * bctx, 
+ int n_threads);
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
@@ -1,13 +1,24 @@
-function(bark_add_test source)
- get_filename_component(TEST_TARGET ${source} NAME_WE)
- add_executable(${TEST_TARGET} ${source})
- install(TARGETS ${TEST_TARGET} RUNTIME)
- target_link_libraries(${TEST_TARGET} PRIVATE bark ${CMAKE_THREAD_LIBS_INIT})
- target_compile_features(${TEST_TARGET} PRIVATE cxx_std_11)
- add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}> ${ARGN})
-endfunction()
-
-bark_add_test(test-tokenizer.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../ggml_weights/ggml_vocab.bin)
-# bark_add_test(test-forward-semantic.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../ggml_weights/ggml_weights_text.bin)
-bark_add_test(test-forward-coarse.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../ggml_weights/ggml_weights_text.bin)
-# bark_add_test(test-forward-fine.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../ggml_weights/ggml_weights_fine.bin)
+add_library(test_utils STATIC common.cpp)
+target_include_directories(test_utils PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
+target_compile_features(test_utils PRIVATE cxx_std_11)
+
+#
+# test-tokenizer
+
+set(TEST_TARGET test-tokenizer)
+add_executable(${TEST_TARGET} ${TEST_TARGET}.cpp)
+target_link_libraries(${TEST_TARGET} PRIVATE bark)
+
+#
+# test-gpt-eval
+
+# set(TEST_TARGET test-gpt-eval)
+# add_executable(${TEST_TARGET} ${TEST_TARGET}.cpp)
+# target_link_libraries(${TEST_TARGET} PRIVATE bark)
+
+#
+# test-forward-semantic
+
+set(TEST_TARGET test-forward-semantic)
+add_executable(${TEST_TARGET} ${TEST_TARGET}.cpp)
+target_link_libraries(${TEST_TARGET} PRIVATE bark test_utils)
diff --git a/tests/common.cpp b/tests/common.cpp
@@ -1,12 +1,10 @@
+#include <cmath>
 #include <fstream>
-#include <vector>
 #include <tuple>
+#include <vector>
 
-#include "bark-util.h"
 #include "common.h"
 
-#define BARK_API_INTERNAL
-
 int64_t bytes_left(std::ifstream & f) {
  // utils to check all bytes are read from stream
  int64_t curr_pos = f.tellg();
@@ -16,6 +14,11 @@ int64_t bytes_left(std::ifstream & f) {
  return bytes_left_to_read;
 }
 
+template<typename T>
+static void read_safe(std::ifstream& fin, T& dest) {
+ fin.read((char*)& dest, sizeof(T));
+}
+
 template <typename T, typename U>
 inline bool all_close(
  std::vector<T> s1,

diff --git a/tests/common.h b/tests/common.h
@@ -1,5 +1,4 @@
 #pragma once
-#include "bark.h"
 
 #include <tuple>
 #include <vector>
@@ -9,6 +8,8 @@
 typedef std::vector<float> logit_sequence;
 typedef std::vector<std::vector<float>> logit_matrix;
 
+typedef std::vector<std::vector<int32_t>> bark_codes;
+
 /* Comparison utils */
 template <typename T, typename U>
 inline bool all_equal(std::vector<T> s1, std::vector<U> s2, int * n_violations);

diff --git a/tests/test-forward-semantic.cpp b/tests/test-forward-semantic.cpp
@@ -6,54 +6,54 @@
 #include "bark.h"
 #include "common.h"
 
-static const std::vector<std::string> test_data = {
- "./data/semantic/test_pass_semantic_1.bin", // prompt: Ceci est un texte en français pour tester le bon fonctionnement de bark.
- "./data/semantic/test_pass_semantic_2.bin", // prompt: Sometimes the heart sees what is invisible to the eye
- "./data/semantic/test_pass_semantic_3.bin", // prompt: El Arte de Vencer se Aprende en las Derrotas
+const std::vector<std::string> test_data = {
+ "../tests/data/semantic/test_pass_semantic_1.bin", // prompt: Ceci est un texte en français pour tester le bon fonctionnement de bark.
+ "../tests/data/semantic/test_pass_semantic_2.bin", // prompt: Sometimes the heart sees what is invisible to the eye
+ "../tests/data/semantic/test_pass_semantic_3.bin", // prompt: El Arte de Vencer se Aprende en las Derrotas
 };
 
-static const int n_threads = 4;
-static const float min_eos_p = 0.2;
-static const float temp = 0.0f; // deterministic sampling
+const int n_threads = 4;
+const float min_eos_p = 0.2;
+const float temp = 0.0f; // deterministic sampling
 
 int main() {
- const std::string fname = "../ggml_weights/ggml_weights_text.bin";
+ const std::string dirname = "../ggml_weights/";
 
- std::mt19937 rng(0);
+ bark_sequence input, gt_tokens;
 
- bark_model model;
+ std::mt19937 rng(0);
 
- if (gpt_model_load(fname, model.text_model) > 0) {
- fprintf(stderr, "%s: invalid model file '%s'\n", __func__, fname.c_str());
- return 1;
+ // initialize bark context
+ struct bark_context * bctx = bark_load_model(dirname);
+ if (!bctx) {
+ fprintf(stderr, "%s: Could not load model\n", __func__);
+ exit(1);
  }
-
- bark_context * ctx = bark_new_context_with_model(&model);
- ctx->rng = rng;
-
- bark_sequence input;
- bark_sequence gt_tokens;
+ bctx->rng = rng;
 
  for (int i = 0; i < (int) test_data.size(); i++) {
  input.clear();
  gt_tokens.clear();
 
  std::string path = test_data[i];
  load_test_data(path, input, gt_tokens);
- ctx->tokens = input;
+ bctx->tokens = input;
 
- bark_forward_text_encoder(ctx, temp, min_eos_p, n_threads);
+ if (!bark_forward_text_encoder(bctx, n_threads)) {
+ fprintf(stderr, "%s: failed to forward text encoder\n", __func__);
+ exit(1);
+ }
 
  printf("\n");
  printf("%s: %s\n", __func__, path.c_str());
- if (!run_test(gt_tokens, ctx->semantic_tokens)) {
+ if (!run_test(gt_tokens, bctx->semantic_tokens)) {
  printf("%s: test %d failed.\n", __func__, i+1);
  } else {
  printf("%s: test %d passed.\n", __func__, i+1);
  }
  }
 
- bark_free(ctx);
+ bark_free(bctx);
 
  return 0;
 }
diff --git a/tests/test-gpt-eval.cpp b/tests/test-gpt-eval.cpp
@@ -5,7 +5,7 @@
 #include "common.h"
 
 
-static const std::vector<std::tuple<std::string, bool>> test_args = {
+const std::vector<std::tuple<std::string, bool>> test_args = {
  { "./data/gpt_eval/test_gpt_eval_1_no_merge.bin", false }, // prompt: Hello, my name is Suno. And, uh - and I like pizza. [laughs] But I also have other interests such as playing tic tac toe.
  { "./data/gpt_eval/test_gpt_eval_2_no_merge.bin", false }, // prompt: Buenos días Miguel. Tu colega piensa que tu alemán es extremadamente malo. But I suppose your english isn't terrible.
  { "./data/gpt_eval/test_gpt_eval_3_no_merge.bin", false }, // prompt: ♪ In the jungle, the mighty jungle, the lion barks tonight ♪
@@ -15,30 +15,27 @@ static const std::vector<std::tuple<std::string, bool>> test_args = {
  { "./data/gpt_eval/test_gpt_eval_3_merge.bin", true }, // prompt: Ceci est un texte en français pour tester le bon fonctionnement de bark.
 };
 
-static const int n_threads = 4;
+const int n_threads = 4;
 
 int main() {
- const std::string fname = "../ggml_weights/ggml_weights_text.bin";
+ const std::string fname = "../ggml_weights/";
 
- gpt_model model;
- if (gpt_model_load(fname, model) > 0) {
- fprintf(stderr, "%s: invalid model file '%s'\n", __func__, fname.c_str());
- return 1;
+ // initialize bark context
+ struct bark_context * bctx = bark_load_model(fname);
+ if (!bctx) {
+ fprintf(stderr, "%s: Could not load model\n", __func__);
+ exit(1);
  }
 
  bark_sequence tokens;
  logit_sequence gt_logits, logits;
 
+ auto & model = bctx->model.text_model;
  auto & hparams = model.hparams;
+
  int n_vocab = hparams.n_out_vocab;
- logits.resize(n_vocab);
 
- // dry run to estimate mem_per_token
- {
- int n_past = 0;
- bark_token decoy[4] = { 0, 1, 2, 3 };
- gpt_eval(model, decoy, 4, nullptr, &n_past, false, n_threads);
- }
+ logits.resize(n_vocab);
 
  for (int i = 0; i < (int) test_args.size(); i++) {
  tokens.clear();
@@ -50,7 +47,7 @@ int main() {
  load_test_data(path, tokens, gt_logits);
 
  int n_past = 0;
- gpt_eval(model, tokens.data(), tokens.size(), logits.data(), &n_past, merge_ctx, n_threads);
+ // gpt_eval(model, tokens.data(), tokens.size(), logits.data(), &n_past, merge_ctx, n_threads);
 
  printf("\n");
  printf("%s: %s\n", __func__, path.c_str());

diff --git a/tests/test-tokenizer.cpp b/tests/test-tokenizer.cpp
@@ -1,19 +1,16 @@
+/* Usage:
+
+```bash
+ ./bin/test-tokenizer ../ggml_weights/ggml_vocab.bin
+```
+*/
 #include <cstdio>
 #include <string>
 #include <map>
 #include <vector>
 
-#define BARK_API_INTERNAL 
 #include "bark.h"
 
-struct bark_vocab {
- using id = int32_t;
- using token = std::string;
-
- std::map<token, id> token_to_id;
- std::map<id, token> id_to_token;
-};
-
 static const std::map<std::string, bark_sequence> & k_tests()
 {
  static std::map<std::string, bark_sequence> _k_tests = {
@@ -39,9 +36,9 @@ int main(int argc, char **argv) {
  bark_vocab vocab;
  int max_ctx_size = 256;
 
- if (bark_vocab_load(fname.c_str(), &vocab, 119547) > 0) {
+ if (!bark_vocab_load(fname.c_str(), &vocab, 119547)) {
  fprintf(stderr, "%s: invalid vocab file '%s'\n", __func__, fname.c_str());
- return 1;
+ exit(1);
  }
 
  for (const auto & test_kv : k_tests()) {