Fixed OpenCL offloading prints

ggerganov · JohannesGaessler · Jul 5, 2023 · Jul 3, 2023 · Jul 3, 2023 · fdbf3982e24d94404e036b945b84f302815b9c6a
commit fdbf3982e24d94404e036b945b84f302815b9c6a
diff --git a/llama.cpp b/llama.cpp
@@ -1158,6 +1158,7 @@ static void llama_model_load_internal(
  }
  }
 #endif // GGML_USE_CUBLAS
+
 #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
  const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
 
@@ -1166,6 +1167,10 @@ static void llama_model_load_internal(
  fprintf(stderr, "%s: offloading non-repeating layers to GPU\n", __func__);
  }
  size_t vram_kv_cache = 0;
+
+#ifdef GGML_USE_CUBLAS
+ const int max_backend_supported_layers = hparams.n_layer + 3;
+ const int max_offloadable_layers = low_vram ? hparams.n_layer + 1 : hparams.n_layer + 3;
  if (n_gpu_layers > (int) hparams.n_layer + 1) {
  if (low_vram) {
  fprintf(stderr, "%s: cannot offload v cache to GPU due to low VRAM option\n", __func__);
@@ -1182,14 +1187,18 @@ static void llama_model_load_internal(
  vram_kv_cache += MEM_REQ_KV_SELF().at(model.type) / 2;
  }
  }
- const int max_offloadable_layers = low_vram ? hparams.n_layer + 1 : hparams.n_layer + 3;
+#elif defined(GGML_USE_CLBLAST)
+ const int max_backend_supported_layers = hparams.n_layer + 1;
+ const int max_offloadable_layers = hparams.n_layer + 1;
+#endif // GGML_USE_CUBLAS
+
  fprintf(stderr, "%s: offloaded %d/%d layers to GPU\n",
- __func__, std::min(n_gpu_layers, max_offloadable_layers), hparams.n_layer + 3);
+ __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
  fprintf(stderr, "%s: total VRAM used: %zu MB\n",
  __func__, (vram_weights + vram_scratch + vram_kv_cache + MB - 1) / MB); // round up
 #else
  (void) n_gpu_layers;
-#endif
+#endif // defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
  }
 
  // populate `tensors_by_name`