sync : llama.cpp (fused soft max, gpu cpy ops, etc.) (#640)
* sync : llama.cpp (fused soft max, gpu cpy ops, etc.)

ggml-ci

* cuda : restore accidentally deleted changes

ggml-ci

* cuda : fix rope + disable device-side dequantize

ggml-ci

* test-backend-ops : enable stablelm rope test

* cuda : remove rope assert

* sync.sh : add test-backend-ops

* ggml : fix ggml_concat + ggml_get_n_tasks logic

* sync : whisper.cpp

ggml-ci

* metal : fix assert

* ci : fix Metal path to shaders

ggml-ci

* whisper : fix bug if metal init fails

---------

Co-authored-by: slaren <[email protected]>
ggerganov and slaren committed Dec 7, 2023
1 parent fc7a58d commit c57aa8e
Showing 16 changed files with 1,004 additions and 327 deletions.
1 change: 1 addition & 0 deletions .github/workflows/ci.yml
@@ -45,6 +45,7 @@ jobs:
llvm-profdata merge -sparse tests/*.profraw -o ggml.profdata
llvm-cov report ./bin/test-grad0 -instr-profile=ggml.profdata
llvm-cov report ./bin/test-opt -instr-profile=ggml.profdata
test-macos-metal:
runs-on: macos-13
env:
5 changes: 5 additions & 0 deletions ci/run.sh
@@ -375,6 +375,11 @@ ret=0

test $ret -eq 0 && gg_run ctest_debug
test $ret -eq 0 && gg_run ctest_release

if [ ! -z ${GG_BUILD_METAL} ]; then
export GGML_METAL_PATH_RESOURCES="${SRC}/build-ci-release/bin"
fi

test $ret -eq 0 && gg_run gpt_2
test $ret -eq 0 && gg_run mnist
test $ret -eq 0 && gg_run whisper
4 changes: 2 additions & 2 deletions examples/whisper/main.cpp
@@ -165,8 +165,8 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
else if (arg == "-m" || arg == "--model") { params.model = argv[++i]; }
else if (arg == "-f" || arg == "--file") { params.fname_inp.emplace_back(argv[++i]); }
else if (arg == "-oved" || arg == "--ov-e-device") { params.openvino_encode_device = argv[++i]; }
else if (arg == "-ls" || arg == "--log-score") { params.log_score = true; }
else if (arg == "-ng" || arg == "--no-gpu") { params.use_gpu = false; }
else if (arg == "-ls" || arg == "--log-score") { params.log_score = true; }
else if (arg == "-ng" || arg == "--no-gpu") { params.use_gpu = false; }
else {
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
whisper_print_usage(argc, argv, params);
149 changes: 113 additions & 36 deletions examples/whisper/whisper.cpp
@@ -1077,6 +1077,10 @@ static ggml_backend_t whisper_backend_init(const whisper_context_params & params
backend_gpu = ggml_backend_metal_init();
if (!backend_gpu) {
WHISPER_LOG_ERROR("%s: ggml_backend_metal_init() failed\n", __func__);
} else if (!ggml_backend_metal_supports_family(backend_gpu, 7)) {
WHISPER_LOG_ERROR("%s: Metal GPU does not support family 7 - falling back to CPU\n", __func__);
ggml_backend_free(backend_gpu);
backend_gpu = NULL;
}
}
#endif
@@ -1611,24 +1615,7 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
// read into a temporary buffer first, then copy to device memory
read_buf.resize(ggml_nbytes(tensor));

// we repeat the 2 bias tensors along dim 0:
// [1, 512] -> [3000, 512] (conv1.bias)
// [1, 512] -> [1500, 512] (conv2.bias)
if (false) {
loader->read(loader->context, read_buf.data(), read_buf.size() / tensor->ne[0]);

float * data_f32 = (float *) read_buf.data();
for (int64_t y = 0; y < tensor->ne[1]; ++y) {
const int64_t yy = tensor->ne[1] - y - 1;
const float val = data_f32[yy];

for (int64_t x = 0; x < tensor->ne[0]; ++x) {
data_f32[yy*tensor->ne[0] + x] = val;
}
}
} else {
loader->read(loader->context, read_buf.data(), read_buf.size());
}
loader->read(loader->context, read_buf.data(), read_buf.size());

ggml_backend_tensor_set(tensor, read_buf.data(), 0, ggml_nbytes(tensor));
}
@@ -3513,7 +3500,7 @@ int whisper_encode(struct whisper_context * ctx, int offset, int n_threads) {
int whisper_decode_with_state(struct whisper_context * ctx, struct whisper_state * state, const whisper_token * tokens, int n_tokens, int n_past, int n_threads) {
whisper_batch_prep_legacy(state->batch, tokens, n_tokens, n_past, 0);

whisper_kv_cache_seq_rm(ctx->state->kv_self, 0, n_past, -1);
whisper_kv_cache_seq_rm(state->kv_self, 0, n_past, -1);

if (!whisper_decode_internal(*ctx, *state, state->batch, n_threads, nullptr, nullptr)) {
WHISPER_LOG_ERROR("%s: failed to eval\n", __func__);
@@ -3526,19 +3513,10 @@ int whisper_decode_with_state(struct whisper_context * ctx, struct whisper_state
int whisper_decode(struct whisper_context * ctx, const whisper_token * tokens, int n_tokens, int n_past, int n_threads) {
if (ctx->state == nullptr) {
WHISPER_LOG_ERROR("%s: ERROR state was not loaded.\n", __func__);
return false;
}

whisper_kv_cache_seq_rm(ctx->state->kv_self, 0, n_past, -1);

whisper_batch_prep_legacy(ctx->state->batch, tokens, n_tokens, n_past, 0);

if (!whisper_decode_internal(*ctx, *ctx->state, ctx->state->batch, n_threads, nullptr, nullptr)) {
WHISPER_LOG_ERROR("%s: failed to eval\n", __func__);
return 1;
return -1;
}

return 0;
return whisper_decode_with_state(ctx, ctx->state, tokens, n_tokens, n_past, n_threads);
}

int whisper_tokenize(struct whisper_context * ctx, const char * text, whisper_token * tokens, int n_max_tokens) {
@@ -3590,6 +3568,17 @@ const char * whisper_lang_str(int id) {
return nullptr;
}

const char * whisper_lang_str_full(int id) {
for (const auto & kv : g_lang) {
if (kv.second.first == id) {
return kv.second.second.c_str();
}
}

WHISPER_LOG_ERROR("%s: unknown language id %d\n", __func__, id);
return nullptr;
}

int whisper_lang_auto_detect_with_state(
struct whisper_context * ctx,
struct whisper_state * state,
@@ -5174,10 +5163,10 @@ int whisper_full_with_state(
const int progress_cur = (100*(seek - seek_start))/(seek_end - seek_start);

params.progress_callback(
ctx, ctx->state, progress_cur, params.progress_callback_user_data);
ctx, state, progress_cur, params.progress_callback_user_data);
}

// of only 1 second left, then stop
// if only 1 second left, then stop
if (seek + 100 >= seek_end) {
break;
}
@@ -6061,6 +6050,43 @@ WHISPER_API const char * whisper_bench_memcpy_str(int n_threads) {
// 1GB array
const size_t size = arr*1e6;

double sum = 0.0;

// heat-up
{
char * src = (char *) malloc(size);
char * dst = (char *) malloc(size);

for (size_t i = 0; i < size; i++) src[i] = i;

memcpy(dst, src, size); // heat-up

double tsum = 0.0;

for (size_t i = 0; i < n; i++) {
const int64_t t0 = ggml_time_us();

memcpy(dst, src, size);

const int64_t t1 = ggml_time_us();

tsum += (t1 - t0)*1e-6;

src[rand() % size] = rand() % 256;
}

snprintf(strbuf, sizeof(strbuf), "memcpy: %7.2f GB/s (heat-up)\n", (double) (n*size)/(tsum*1e9));
s += strbuf;

// needed to prevent the compiler from optimizing the memcpy away
{
for (size_t i = 0; i < size; i++) sum += dst[i];
}

free(src);
free(dst);
}

// single-thread
{
char * src = (char *) malloc(size);
@@ -6071,7 +6097,6 @@ WHISPER_API const char * whisper_bench_memcpy_str(int n_threads) {
memcpy(dst, src, size); // heat-up

double tsum = 0.0;
double sum = 0.0;

for (size_t i = 0; i < n; i++) {
const int64_t t0 = ggml_time_us();
@@ -6085,21 +6110,73 @@ WHISPER_API const char * whisper_bench_memcpy_str(int n_threads) {
src[rand() % size] = rand() % 256;
}

snprintf(strbuf, sizeof(strbuf), "memcpy: %.2f GB/s (1 thread)\n", (double) (n*size)/(tsum*1e9));
snprintf(strbuf, sizeof(strbuf), "memcpy: %7.2f GB/s ( 1 thread)\n", (double) (n*size)/(tsum*1e9));
s += strbuf;

// needed to prevent the compiler from optimizing the memcpy away
{
for (size_t i = 0; i < size; i++) sum += dst[i];
}

snprintf(strbuf, sizeof(strbuf), "sum: %f\n", sum);
s += strbuf;
free(src);
free(dst);
}

// multi-thread

for (uint32_t k = 1; k <= n_threads; k++) {
char * src = (char *) malloc(size);
char * dst = (char *) malloc(size);

for (size_t i = 0; i < size; i++) src[i] = i;

memcpy(dst, src, size); // heat-up

double tsum = 0.0;

auto helper = [&](int th) {
const int64_t i0 = (th + 0)*size/k;
const int64_t i1 = (th + 1)*size/k;

for (size_t i = 0; i < n; i++) {
memcpy(dst + i0, src + i0, i1 - i0);

src[i0 + rand() % (i1 - i0)] = rand() % 256;
};
};

const int64_t t0 = ggml_time_us();

std::vector<std::thread> threads(k - 1);
for (uint32_t th = 0; th < k - 1; ++th) {
threads[th] = std::thread(helper, th);
}

helper(k - 1);

for (uint32_t th = 0; th < k - 1; ++th) {
threads[th].join();
}

const int64_t t1 = ggml_time_us();

tsum += (t1 - t0)*1e-6;

snprintf(strbuf, sizeof(strbuf), "memcpy: %7.2f GB/s (%2d thread)\n", (double) (n*size)/(tsum*1e9), k);
s += strbuf;

// needed to prevent the compiler from optimizing the memcpy away
{
for (size_t i = 0; i < size; i++) sum += dst[i];
}

free(src);
free(dst);
}

snprintf(strbuf, sizeof(strbuf), "sum: %f\n", sum);
s += strbuf;

return s.c_str();
}

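One of the whisper.cpp additions above is whisper_lang_str_full(). A minimal sketch (not part of the commit) of pairing it with language auto-detection; it assumes ctx already holds mel data (e.g. after whisper_pcm_to_mel()) and skips the optional per-language probability array by passing NULL:

// hedged usage fragment: ctx and the thread count (4) are placeholders
const int lang_id = whisper_lang_auto_detect(ctx, 0, 4, NULL);
if (lang_id >= 0) {
    printf("detected language: %s (%s)\n",
            whisper_lang_str(lang_id),        // short code, e.g. "de"
            whisper_lang_str_full(lang_id));  // full name,  e.g. "german"
}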
7 changes: 6 additions & 1 deletion examples/whisper/whisper.h
@@ -50,7 +50,9 @@ extern "C" {
//
// ...
//
// struct whisper_context * ctx = whisper_init_from_file("/path/to/ggml-base.en.bin");
// whisper_context_params cparams = whisper_context_default_params();
//
// struct whisper_context * ctx = whisper_init_from_file_with_params("/path/to/ggml-base.en.bin", cparams);
//
// if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
// fprintf(stderr, "failed to process audio\n");
@@ -313,6 +315,9 @@ extern "C" {
// Return the short string of the specified language id (e.g. 2 -> "de"), returns nullptr if not found
WHISPER_API const char * whisper_lang_str(int id);

// Return the full name of the specified language id (e.g. 2 -> "german"), returns nullptr if not found
WHISPER_API const char * whisper_lang_str_full(int id);

// Use mel data at offset_ms to try and auto-detect the spoken language
// Make sure to call whisper_pcm_to_mel() or whisper_set_mel() first
// Returns the top language id or negative on failure
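The header comment now documents initialization via whisper_context_default_params() and whisper_init_from_file_with_params(). A minimal end-to-end sketch under those assumptions (not part of the commit; the model path is a placeholder and pcmf32 is assumed to be 16 kHz mono float PCM supplied by the caller):

#include "whisper.h"

#include <stdio.h>

// hedged sketch: transcribe a buffer of 16 kHz mono float samples
static int transcribe(const float * pcmf32, int n_samples) {
    struct whisper_context_params cparams = whisper_context_default_params();

    struct whisper_context * ctx =
        whisper_init_from_file_with_params("/path/to/ggml-base.en.bin", cparams);
    if (ctx == NULL) {
        return 1;
    }

    struct whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);

    if (whisper_full(ctx, wparams, pcmf32, n_samples) != 0) {
        fprintf(stderr, "failed to process audio\n");
        whisper_free(ctx);
        return 1;
    }

    // print the transcribed segments
    for (int i = 0; i < whisper_full_n_segments(ctx); ++i) {
        printf("%s", whisper_full_get_segment_text(ctx, i));
    }

    whisper_free(ctx);
    return 0;
}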
14 changes: 11 additions & 3 deletions include/ggml/ggml.h
@@ -244,11 +244,10 @@
#define GGML_ASSERT(x) \
do { \
if (!(x)) { \
fprintf(stderr, "GGML_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
fflush(stderr); \
fflush(stdout); \
fprintf(stderr, "GGML_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
ggml_print_backtrace(); \
exit(1); \
abort(); \
} \
} while (0)

@@ -1312,6 +1311,14 @@ extern "C" {
struct ggml_context * ctx,
struct ggml_tensor * a);

// fused soft_max(a*scale + mask)
// mask is optional
GGML_API struct ggml_tensor * ggml_soft_max_ext(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * mask,
float scale);

GGML_API struct ggml_tensor * ggml_soft_max_back(
struct ggml_context * ctx,
struct ggml_tensor * a,
@@ -2090,6 +2097,7 @@ extern "C" {
GGML_API double gguf_get_val_f64 (const struct gguf_context * ctx, int key_id);
GGML_API bool gguf_get_val_bool(const struct gguf_context * ctx, int key_id);
GGML_API const char * gguf_get_val_str (const struct gguf_context * ctx, int key_id);
GGML_API const void * gguf_get_val_data(const struct gguf_context * ctx, int key_id);
GGML_API int gguf_get_arr_n (const struct gguf_context * ctx, int key_id);
GGML_API const void * gguf_get_arr_data(const struct gguf_context * ctx, int key_id);
GGML_API const char * gguf_get_arr_str (const struct gguf_context * ctx, int key_id, int i);
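The headline API addition is ggml_soft_max_ext(), which fuses the attention-score scaling and the optional additive mask into the soft_max node itself, so a backend can implement the three steps as a single kernel instead of materializing the intermediate tensors. A hedged sketch (not from this commit) of how graph-building attention code could adopt it; ctx0, kq, kq_mask and n_embd_head are assumed to be the usual attention-graph variables:

// before: scale, add mask and soft_max as three separate graph nodes
// after:  one fused node; pass NULL for the mask when no masking is needed
kq = ggml_soft_max_ext(ctx0, kq, kq_mask, 1.0f/sqrtf((float) n_embd_head));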
1 change: 1 addition & 0 deletions scripts/sync-llama.sh
@@ -24,3 +24,4 @@ cp -rpv ../llama.cpp/tests/test-opt.cpp tests/test-opt.cpp
cp -rpv ../llama.cpp/tests/test-grad0.cpp tests/test-grad0.cpp
cp -rpv ../llama.cpp/tests/test-quantize-fns.cpp tests/test-quantize-fns.cpp
cp -rpv ../llama.cpp/tests/test-quantize-perf.cpp tests/test-quantize-perf.cpp
cp -rpv ../llama.cpp/tests/test-backend-ops.cpp tests/test-backend-ops.cpp
3 changes: 1 addition & 2 deletions src/ggml-alloc.c
@@ -135,10 +135,9 @@ void ggml_tallocr_alloc(ggml_tallocr_t alloc, struct ggml_tensor * tensor) {
ggml_backend_buffer_init_tensor(alloc->buffer, tensor);
}


#ifdef GGML_ALLOCATOR_DEBUG
add_allocated_tensor(alloc, tensor);
size_t cur_max = (char*)addr - (char*)alloc->data + size;
size_t cur_max = (char*)addr - (char*)alloc->base + size;
if (cur_max > alloc->max_size) {
printf("max_size = %.2f MB: tensors: ", cur_max / 1024.0 / 1024.0);
for (int i = 0; i < 1024; i++) {