sync : llama.cpp (fused soft max, gpu cpy ops, etc.) (#640)
* sync : llama.cpp (fused soft max, gpu cpy ops, etc.)

ggml-ci

* cuda : restore accidentally deleted changes

ggml-ci

* cuda : fix rope + disable device-side dequantize

ggml-ci

* test-backend-ops : enable stablelm rope test

* cuda : remove rope assert

* sync.sh : add test-backend-ops

* ggml : fix ggml_concat + ggml_get_n_tasks logic

* sync : whisper.cpp

ggml-ci

* metal : fix assert

* ci : fix Metal path to shaders

ggml-ci

* whisper : fix bug if metal init fails

---------

Co-authored-by: slaren <[email protected]>
ggerganov and slaren committed Dec 7, 2023
1 parent fc7a58d commit c57aa8e
Showing 16 changed files with 1,004 additions and 327 deletions.
1 change: 1 addition & 0 deletions .github/workflows/ci.yml
@@ -45,6 +45,7 @@ jobs:
llvm-profdata merge -sparse tests/*.profraw -o ggml.profdata
llvm-cov report ./bin/test-grad0 -instr-profile=ggml.profdata
llvm-cov report ./bin/test-opt -instr-profile=ggml.profdata
test-macos-metal:
runs-on: macos-13
env:
5 changes: 5 additions & 0 deletions ci/run.sh
@@ -375,6 +375,11 @@ ret=0

test $ret -eq 0 && gg_run ctest_debug
test $ret -eq 0 && gg_run ctest_release

if [ ! -z ${GG_BUILD_METAL} ]; then
export GGML_METAL_PATH_RESOURCES="${SRC}/build-ci-release/bin"
fi

test $ret -eq 0 && gg_run gpt_2
test $ret -eq 0 && gg_run mnist
test $ret -eq 0 && gg_run whisper
4 changes: 2 additions & 2 deletions examples/whisper/main.cpp
@@ -165,8 +165,8 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
else if (arg == "-m" || arg == "--model") { params.model = argv[++i]; }
else if (arg == "-f" || arg == "--file") { params.fname_inp.emplace_back(argv[++i]); }
else if (arg == "-oved" || arg == "--ov-e-device") { params.openvino_encode_device = argv[++i]; }
else if (arg == "-ls" || arg == "--log-score") { params.log_score = true; }
else if (arg == "-ng" || arg == "--no-gpu") { params.use_gpu = false; }
else if (arg == "-ls" || arg == "--log-score") { params.log_score = true; }
else if (arg == "-ng" || arg == "--no-gpu") { params.use_gpu = false; }
else {
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
whisper_print_usage(argc, argv, params);
149 changes: 113 additions & 36 deletions examples/whisper/whisper.cpp
@@ -1077,6 +1077,10 @@ static ggml_backend_t whisper_backend_init(const whisper_context_params & params
backend_gpu = ggml_backend_metal_init();
if (!backend_gpu) {
WHISPER_LOG_ERROR("%s: ggml_backend_metal_init() failed\n", __func__);
} else if (!ggml_backend_metal_supports_family(backend_gpu, 7)) {
WHISPER_LOG_ERROR("%s: Metal GPU does not support family 7 - falling back to CPU\n", __func__);
ggml_backend_free(backend_gpu);
backend_gpu = NULL;
}
}
#endif
@@ -1611,24 +1615,7 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
// read into a temporary buffer first, then copy to device memory
read_buf.resize(ggml_nbytes(tensor));

// we repeat the 2 bias tensors along dim 0:
// [1, 512] -> [3000, 512] (conv1.bias)
// [1, 512] -> [1500, 512] (conv2.bias)
if (false) {
loader->read(loader->context, read_buf.data(), read_buf.size() / tensor->ne[0]);

float * data_f32 = (float *) read_buf.data();
for (int64_t y = 0; y < tensor->ne[1]; ++y) {
const int64_t yy = tensor->ne[1] - y - 1;
const float val = data_f32[yy];

for (int64_t x = 0; x < tensor->ne[0]; ++x) {
data_f32[yy*tensor->ne[0] + x] = val;
}
}
} else {
loader->read(loader->context, read_buf.data(), read_buf.size());
}
loader->read(loader->context, read_buf.data(), read_buf.size());

ggml_backend_tensor_set(tensor, read_buf.data(), 0, ggml_nbytes(tensor));
}
@@ -3513,7 +3500,7 @@ int whisper_encode(struct whisper_context * ctx, int offset, int n_threads) {
int whisper_decode_with_state(struct whisper_context * ctx, struct whisper_state * state, const whisper_token * tokens, int n_tokens, int n_past, int n_threads) {
whisper_batch_prep_legacy(state->batch, tokens, n_tokens, n_past, 0);

whisper_kv_cache_seq_rm(ctx->state->kv_self, 0, n_past, -1);
whisper_kv_cache_seq_rm(state->kv_self, 0, n_past, -1);

if (!whisper_decode_internal(*ctx, *state, state->batch, n_threads, nullptr, nullptr)) {
WHISPER_LOG_ERROR("%s: failed to eval\n", __func__);
@@ -3526,19 +3513,10 @@ int whisper_decode_with_state(struct whisper_context * ctx, struct whisper_state
int whisper_decode(struct whisper_context * ctx, const whisper_token * tokens, int n_tokens, int n_past, int n_threads) {
if (ctx->state == nullptr) {
WHISPER_LOG_ERROR("%s: ERROR state was not loaded.\n", __func__);
return false;
}

whisper_kv_cache_seq_rm(ctx->state->kv_self, 0, n_past, -1);

whisper_batch_prep_legacy(ctx->state->batch, tokens, n_tokens, n_past, 0);

if (!whisper_decode_internal(*ctx, *ctx->state, ctx->state->batch, n_threads, nullptr, nullptr)) {
WHISPER_LOG_ERROR("%s: failed to eval\n", __func__);
return 1;
return -1;
}

return 0;
return whisper_decode_with_state(ctx, ctx->state, tokens, n_tokens, n_past, n_threads);
}

int whisper_tokenize(struct whisper_context * ctx, const char * text, whisper_token * tokens, int n_max_tokens) {
@@ -3590,6 +3568,17 @@ const char * whisper_lang_str(int id) {
return nullptr;
}

const char * whisper_lang_str_full(int id) {
for (const auto & kv : g_lang) {
if (kv.second.first == id) {
return kv.second.second.c_str();
}
}

WHISPER_LOG_ERROR("%s: unknown language id %d\n", __func__, id);
return nullptr;
}

int whisper_lang_auto_detect_with_state(
struct whisper_context * ctx,
struct whisper_state * state,
@@ -5174,10 +5163,10 @@ int whisper_full_with_state(
const int progress_cur = (100*(seek - seek_start))/(seek_end - seek_start);

params.progress_callback(
ctx, ctx->state, progress_cur, params.progress_callback_user_data);
ctx, state, progress_cur, params.progress_callback_user_data);
}

// of only 1 second left, then stop
// if only 1 second left, then stop
if (seek + 100 >= seek_end) {
break;
}
@@ -6061,6 +6050,43 @@ WHISPER_API const char * whisper_bench_memcpy_str(int n_threads) {
// 1GB array
const size_t size = arr*1e6;

double sum = 0.0;

// heat-up
{
char * src = (char *) malloc(size);
char * dst = (char *) malloc(size);

for (size_t i = 0; i < size; i++) src[i] = i;

memcpy(dst, src, size); // heat-up

double tsum = 0.0;

for (size_t i = 0; i < n; i++) {
const int64_t t0 = ggml_time_us();

memcpy(dst, src, size);

const int64_t t1 = ggml_time_us();

tsum += (t1 - t0)*1e-6;

src[rand() % size] = rand() % 256;
}

snprintf(strbuf, sizeof(strbuf), "memcpy: %7.2f GB/s (heat-up)\n", (double) (n*size)/(tsum*1e9));
s += strbuf;

// needed to prevent the compiler from optimizing the memcpy away
{
for (size_t i = 0; i < size; i++) sum += dst[i];
}

free(src);
free(dst);
}

// single-thread
{
char * src = (char *) malloc(size);
@@ -6071,7 +6097,6 @@ WHISPER_API const char * whisper_bench_memcpy_str(int n_threads) {
memcpy(dst, src, size); // heat-up

double tsum = 0.0;
double sum = 0.0;

for (size_t i = 0; i < n; i++) {
const int64_t t0 = ggml_time_us();
@@ -6085,21 +6110,73 @@ WHISPER_API const char * whisper_bench_memcpy_str(int n_threads) {
src[rand() % size] = rand() % 256;
}

snprintf(strbuf, sizeof(strbuf), "memcpy: %.2f GB/s (1 thread)\n", (double) (n*size)/(tsum*1e9));
snprintf(strbuf, sizeof(strbuf), "memcpy: %7.2f GB/s ( 1 thread)\n", (double) (n*size)/(tsum*1e9));
s += strbuf;

// needed to prevent the compiler from optimizing the memcpy away
{
for (size_t i = 0; i < size; i++) sum += dst[i];
}

snprintf(strbuf, sizeof(strbuf), "sum: %f\n", sum);
s += strbuf;
free(src);
free(dst);
}

// multi-thread

for (uint32_t k = 1; k <= n_threads; k++) {
char * src = (char *) malloc(size);
char * dst = (char *) malloc(size);

for (size_t i = 0; i < size; i++) src[i] = i;

memcpy(dst, src, size); // heat-up

double tsum = 0.0;

auto helper = [&](int th) {
const int64_t i0 = (th + 0)*size/k;
const int64_t i1 = (th + 1)*size/k;

for (size_t i = 0; i < n; i++) {
memcpy(dst + i0, src + i0, i1 - i0);

src[i0 + rand() % (i1 - i0)] = rand() % 256;
};
};

const int64_t t0 = ggml_time_us();

std::vector<std::thread> threads(k - 1);
for (uint32_t th = 0; th < k - 1; ++th) {
threads[th] = std::thread(helper, th);
}

helper(k - 1);

for (uint32_t th = 0; th < k - 1; ++th) {
threads[th].join();
}

const int64_t t1 = ggml_time_us();

tsum += (t1 - t0)*1e-6;

snprintf(strbuf, sizeof(strbuf), "memcpy: %7.2f GB/s (%2d thread)\n", (double) (n*size)/(tsum*1e9), k);
s += strbuf;

// needed to prevent the compiler from optimizing the memcpy away
{
for (size_t i = 0; i < size; i++) sum += dst[i];
}

free(src);
free(dst);
}

snprintf(strbuf, sizeof(strbuf), "sum: %f\n", sum);
s += strbuf;

return s.c_str();
}

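One of the whisper.cpp additions above is whisper_lang_str_full(). A minimal sketch (not part of the commit) of pairing it with language auto-detection; it assumes ctx already holds mel data (e.g. after whisper_pcm_to_mel()) and skips the optional per-language probability array by passing NULL:

// hedged usage fragment: ctx and the thread count (4) are placeholders
const int lang_id = whisper_lang_auto_detect(ctx, 0, 4, NULL);
if (lang_id >= 0) {
    printf("detected language: %s (%s)\n",
            whisper_lang_str(lang_id),        // short code, e.g. "de"
            whisper_lang_str_full(lang_id));  // full name,  e.g. "german"
}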
7 changes: 6 additions & 1 deletion examples/whisper/whisper.h
@@ -50,7 +50,9 @@ extern "C" {
//
// ...
//
// struct whisper_context * ctx = whisper_init_from_file("/path/to/ggml-base.en.bin");
// whisper_context_params cparams = whisper_context_default_params();
//
// struct whisper_context * ctx = whisper_init_from_file_with_params("/path/to/ggml-base.en.bin", cparams);
//
// if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
// fprintf(stderr, "failed to process audio\n");
@@ -313,6 +315,9 @@ extern "C" {
// Return the short string of the specified language id (e.g. 2 -> "de"), returns nullptr if not found
WHISPER_API const char * whisper_lang_str(int id);

// Return the full name of the specified language id (e.g. 2 -> "german"), returns nullptr if not found
WHISPER_API const char * whisper_lang_str_full(int id);

// Use mel data at offset_ms to try and auto-detect the spoken language
// Make sure to call whisper_pcm_to_mel() or whisper_set_mel() first
// Returns the top language id or negative on failure
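The header comment now documents initialization via whisper_context_default_params() and whisper_init_from_file_with_params(). A minimal end-to-end sketch under those assumptions (not part of the commit; the model path is a placeholder and pcmf32 is assumed to be 16 kHz mono float PCM supplied by the caller):

#include "whisper.h"

#include <stdio.h>

// hedged sketch: transcribe a buffer of 16 kHz mono float samples
static int transcribe(const float * pcmf32, int n_samples) {
    struct whisper_context_params cparams = whisper_context_default_params();

    struct whisper_context * ctx =
        whisper_init_from_file_with_params("/path/to/ggml-base.en.bin", cparams);
    if (ctx == NULL) {
        return 1;
    }

    struct whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);

    if (whisper_full(ctx, wparams, pcmf32, n_samples) != 0) {
        fprintf(stderr, "failed to process audio\n");
        whisper_free(ctx);
        return 1;
    }

    // print the transcribed segments
    for (int i = 0; i < whisper_full_n_segments(ctx); ++i) {
        printf("%s", whisper_full_get_segment_text(ctx, i));
    }

    whisper_free(ctx);
    return 0;
}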
14 changes: 11 additions & 3 deletions include/ggml/ggml.h
@@ -244,11 +244,10 @@
#define GGML_ASSERT(x) \
do { \
if (!(x)) { \
fprintf(stderr, "GGML_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
fflush(stderr); \
fflush(stdout); \
fprintf(stderr, "GGML_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
ggml_print_backtrace(); \
exit(1); \
abort(); \
} \
} while (0)

@@ -1312,6 +1311,14 @@ extern "C" {
struct ggml_context * ctx,
struct ggml_tensor * a);

// fused soft_max(a*scale + mask)
// mask is optional
GGML_API struct ggml_tensor * ggml_soft_max_ext(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * mask,
float scale);

GGML_API struct ggml_tensor * ggml_soft_max_back(
struct ggml_context * ctx,
struct ggml_tensor * a,
@@ -2090,6 +2097,7 @@ extern "C" {
GGML_API double gguf_get_val_f64 (const struct gguf_context * ctx, int key_id);
GGML_API bool gguf_get_val_bool(const struct gguf_context * ctx, int key_id);
GGML_API const char * gguf_get_val_str (const struct gguf_context * ctx, int key_id);
GGML_API const void * gguf_get_val_data(const struct gguf_context * ctx, int key_id);
GGML_API int gguf_get_arr_n (const struct gguf_context * ctx, int key_id);
GGML_API const void * gguf_get_arr_data(const struct gguf_context * ctx, int key_id);
GGML_API const char * gguf_get_arr_str (const struct gguf_context * ctx, int key_id, int i);
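The headline API addition is ggml_soft_max_ext(), which fuses the attention-score scaling and the optional additive mask into the soft_max node itself, so a backend can implement the three steps as a single kernel instead of materializing the intermediate tensors. A hedged sketch (not from this commit) of how graph-building attention code could adopt it; ctx0, kq, kq_mask and n_embd_head are assumed to be the usual attention-graph variables:

// before: scale, add mask and soft_max as three separate graph nodes
// after:  one fused node; pass NULL for the mask when no masking is needed
kq = ggml_soft_max_ext(ctx0, kq, kq_mask, 1.0f/sqrtf((float) n_embd_head));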
1 change: 1 addition & 0 deletions scripts/sync-llama.sh
@@ -24,3 +24,4 @@ cp -rpv ../llama.cpp/tests/test-opt.cpp tests/test-opt.cpp
cp -rpv ../llama.cpp/tests/test-grad0.cpp tests/test-grad0.cpp
cp -rpv ../llama.cpp/tests/test-quantize-fns.cpp tests/test-quantize-fns.cpp
cp -rpv ../llama.cpp/tests/test-quantize-perf.cpp tests/test-quantize-perf.cpp
cp -rpv ../llama.cpp/tests/test-backend-ops.cpp tests/test-backend-ops.cpp
3 changes: 1 addition & 2 deletions src/ggml-alloc.c
@@ -135,10 +135,9 @@ void ggml_tallocr_alloc(ggml_tallocr_t alloc, struct ggml_tensor * tensor) {
ggml_backend_buffer_init_tensor(alloc->buffer, tensor);
}


#ifdef GGML_ALLOCATOR_DEBUG
add_allocated_tensor(alloc, tensor);
size_t cur_max = (char*)addr - (char*)alloc->data + size;
size_t cur_max = (char*)addr - (char*)alloc->base + size;
if (cur_max > alloc->max_size) {
printf("max_size = %.2f MB: tensors: ", cur_max / 1024.0 / 1024.0);
for (int i = 0; i < 1024; i++) {