MNT Unit tests tokenizer #18

Merged: 3 commits merged on Jul 30, 2023
unit test tokenization + main scripts in example
PABannier committed Jul 30, 2023
commit c9ba853e910bcbff910b45be53dfb12be7ff0c38
2 changes: 2 additions & 0 deletions .gitignore
@@ -4,6 +4,8 @@ build/

bark
encodec
main
tests/test-tokenizer

*.o
*.plist
10 changes: 5 additions & 5 deletions CMakeLists.txt
@@ -75,7 +75,7 @@ option(BARK_CUDA_DMMV_F16 "bark: use 16 bit floats for dmmv CU
option(BARK_CLBLAST "bark: use CLBlast" OFF)
option(BARK_METAL "bark: use Metal" OFF)

# option(BARK_BUILD_TESTS "bark: build tests" ${BARK_STANDALONE})
option(BARK_BUILD_TESTS "bark: build tests" ${BARK_STANDALONE})

#
# Build info header
@@ -518,7 +518,7 @@ install(
# programs, examples and tests
#

# if (BARK_BUILD_TESTS AND NOT CMAKE_JS_VERSION)
# include(CTest)
# add_subdirectory(tests)
# endif ()
if (BARK_BUILD_TESTS AND NOT CMAKE_JS_VERSION)
include(CTest)
add_subdirectory(tests)
endif ()
17 changes: 14 additions & 3 deletions Makefile
@@ -2,8 +2,7 @@
BUILD_TARGETS = bark

# Binaries only useful for tests
# TEST_TARGETS = tests/test-double-float tests/test-grad0 tests/test-opt tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0
TEST_TARGETS =
TEST_TARGETS = tests/test-tokenizer

default: $(BUILD_TARGETS)

@@ -302,5 +301,17 @@ bark.o: bark.cpp bark.h
clean:
rm -vf *.o *.so *.dll encodec bark

bark: bark.cpp encodec.o ggml.o $(OBJS)
bark: bark.cpp encodec.o ggml.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

main: examples/main.cpp ggml.o bark.o encodec.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

#
# Test
#

tests: $(TEST_TARGETS)

tests/test-tokenizer: tests/test-tokenizer.cpp ggml.o bark.o encodec.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS)
53 changes: 0 additions & 53 deletions bark.cpp
@@ -1460,56 +1460,3 @@ bool bark_generate_audio(

return true;
}

int main() {
const int64_t t_main_start_us = ggml_time_us();

int64_t t_load_us = 0;
int64_t t_eval_us = 0;

bark_model model;
std::string fname = "./ggml_weights";

// load the model
{
const int64_t t_start_us = ggml_time_us();

if(!bark_model_load(fname, model)) {
fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, fname.c_str());
return 1;
}

t_load_us = ggml_time_us() - t_start_us;
}

printf("\n");

// forward pass
const std::string prompt = "This is an audio";
{
const int64_t t_eval_us_start = ggml_time_us();

// call to generate audio
bark_generate_audio(model, model.vocab, prompt.data(), 4);

t_eval_us = ggml_time_us() - t_eval_us_start;
}

// report timing
{
const int64_t t_main_end_us = ggml_time_us();

printf("\n\n");
printf("%s: load time = %8.2f ms\n", __func__, t_load_us/1000.0f);
printf("%s: eval time = %8.2f ms\n", __func__, t_eval_us/1000.0f);
printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f);
}

// TODO: write wrapper
ggml_free(model.coarse_model.ctx);
ggml_free(model.fine_model.ctx);
ggml_free(model.text_model.ctx);
ggml_free(model.codec_model.ctx);

return 0;
}
8 changes: 8 additions & 0 deletions bark.h
@@ -114,9 +114,17 @@ struct bark_model {

bool gpt_model_load(const std::string& fname, gpt_model& model, bark_vocab& vocab, bool has_vocab);

bool bark_model_load(const std::string & dirname, bark_model & model);

void bert_tokenize(
const bark_vocab& vocab,
const char * text,
int32_t * tokens,
int32_t * n_tokens,
int32_t n_max_tokens);

bool bark_generate_audio(
bark_model model,
const bark_vocab& vocab,
const char * text,
const int n_threads);
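
For reference, here is a minimal sketch (not part of this PR) of how the newly exported bert_tokenize declaration could be driven from user code, based only on the signature above; the 512-token buffer and the assumption that n_tokens receives the number of ids written are illustrative guesses.

// Illustrative sketch, not part of this PR: calling the newly exported
// bert_tokenize from user code. The buffer size (512) and the assumption that
// *n_tokens receives the count of ids written are guesses from the signature.
#include <cstdint>
#include <cstdio>
#include <vector>

#include "bark.h"

void tokenize_example(const bark_vocab & vocab) {
    const char * text = "this is an audio generated by bark";

    std::vector<int32_t> tokens(512);   // caller-owned output buffer
    int32_t n_tokens = 0;

    // n_max_tokens caps how many ids bert_tokenize may write into the buffer.
    bert_tokenize(vocab, text, tokens.data(), &n_tokens, (int32_t) tokens.size());

    for (int32_t i = 0; i < n_tokens; i++) {
        printf("%d ", tokens[i]);
    }
    printf("\n");
}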
55 changes: 55 additions & 0 deletions examples/main.cpp
@@ -0,0 +1,55 @@
#include "ggml.h"
#include "bark.h"

int main() {
const int64_t t_main_start_us = ggml_time_us();

int64_t t_load_us = 0;
int64_t t_eval_us = 0;

bark_model model;
std::string fname = "./ggml_weights";

// load the model
{
const int64_t t_start_us = ggml_time_us();

if(!bark_model_load(fname, model)) {
fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, fname.c_str());
return 1;
}

t_load_us = ggml_time_us() - t_start_us;
}

printf("\n");

// forward pass
const std::string prompt = "This is an audio";
{
const int64_t t_eval_us_start = ggml_time_us();

// call to generate audio
bark_generate_audio(model, model.vocab, prompt.data(), 4);

t_eval_us = ggml_time_us() - t_eval_us_start;
}

// report timing
{
const int64_t t_main_end_us = ggml_time_us();

printf("\n\n");
printf("%s: load time = %8.2f ms\n", __func__, t_load_us/1000.0f);
printf("%s: eval time = %8.2f ms\n", __func__, t_eval_us/1000.0f);
printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f);
}

// TODO: write wrapper
ggml_free(model.coarse_model.ctx);
ggml_free(model.fine_model.ctx);
ggml_free(model.text_model.ctx);
ggml_free(model.codec_model.ctx);

return 0;
}
12 changes: 6 additions & 6 deletions tests/test-tokenizer.cpp
@@ -8,11 +8,11 @@
static const std::map<std::string, std::vector<bark_vocab::id>> & k_tests()
{
static std::map<std::string, std::vector<bark_vocab::id>> _k_tests = {
{ "Hello World!", { 31178, 11356, 106, }, },
{ "Hello World", { 31178, 11356, }, },
{ " Hello World!", { 31178, 11356, 106, }, },
{ "this is an audio generated by bark", { 10531, 10124, 10151, 23685, 48918, 10155, 18121, 10174 }, },
{ "l'Amérique si c'est un rêve je le saurai ", { 180, 112, 28426, 10294, 171, 112, 10176, 10119, 89952, 10144, 10141, 11731, 33186 }, },
{ "Hello World!", { 31178, 11356, 106, }, },
{ "Hello World", { 31178, 11356, }, },
{ " Hello World!", { 31178, 11356, 106, }, },
{ "this is an audio generated by bark", { 10531, 10124, 10151, 23685, 48918, 10155, 18121, 10174, }, },
{ "l'Amérique si c'est un rêve je le saurai ", { 180, 112, 28426, 10294, 171, 112, 10176, 10119, 89952, 10144, 10141, 11731, 33186, }, },
};
return _k_tests;
};
@@ -33,7 +33,7 @@ int main(int argc, char **argv) {
{
if(!gpt_model_load(fname, model.text_model, model.vocab, true)) {
fprintf(stderr, "%s: invalid model file '%s' (bad text)\n", __func__, fname.c_str());
return false;
return 1;
}
model.memsize += model.text_model.memsize;
}
(remainder of tests/test-tokenizer.cpp collapsed in this diff view)
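
Since the rest of the test file is collapsed above, here is a hypothetical sketch of the kind of comparison loop such a tokenizer test typically runs over k_tests(); the helper name, buffer size, and error handling are assumptions rather than the PR's actual code, it relies on the headers already included at the top of the test file, and it assumes bark_vocab::id is a 32-bit integer.

// Hypothetical sketch only; the PR's actual comparison loop is collapsed in
// this diff view. Runs every prompt from k_tests() through bert_tokenize and
// checks the resulting ids against the expected ones. Assumes bark_vocab::id
// is a 32-bit integer and that 512 tokens is a large enough buffer.
static bool run_tokenizer_tests(const bark_vocab & vocab) {
    bool ok = true;

    for (const auto & test_kv : k_tests()) {
        std::vector<bark_vocab::id> res(512);
        int32_t n_tokens = 0;

        bert_tokenize(vocab, test_kv.first.c_str(), res.data(), &n_tokens, (int32_t) res.size());
        res.resize(n_tokens);

        // The test fails if the produced ids differ from the expected ids.
        if (res != test_kv.second) {
            fprintf(stderr, "tokenization mismatch for '%s'\n", test_kv.first.c_str());
            ok = false;
        }
    }

    return ok;
}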