diff --git a/CMakeLists.txt b/CMakeLists.txt
index cd4bf92..1cd69f4 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -39,6 +39,10 @@ target_link_libraries(${ENCODEC_LIB} PUBLIC ggml)
 target_include_directories(${ENCODEC_LIB} PUBLIC .)
 target_compile_features(${ENCODEC_LIB} PUBLIC cxx_std_11)
 
+if (GGML_CUBLAS)
+    add_compile_definitions(GGML_USE_CUBLAS)
+endif()
+
 if (GGML_METAL)
     add_compile_definitions(GGML_USE_METAL)
 endif()
diff --git a/README.md b/README.md
index 968cb7d..cd7be33 100644
--- a/README.md
+++ b/README.md
@@ -21,7 +21,7 @@ https://github.com/PABannier/encodec.cpp/assets/12958149/d11561be-98e9-4504-bba7
 - [x] Mixed F16 / F32 precision
 - [ ] 4-bit and 8-bit quantization
 - [x] Metal support
-- [ ] cuBLAS support
+- [x] cuBLAS support
 
 ## Implementation details
 
@@ -61,3 +61,12 @@ the power consumption and CPU activity is reduced.
 cmake -DGGML_METAL=ON -DBUILD_SHARED_LIBS=Off ..
 cmake --build . --config Release
 ```
+
+### Using cuBLAS
+
+Inference can be offloaded to a CUDA backend with cuBLAS.
+
+```bash
+cmake -DGGML_CUBLAS=ON -DBUILD_SHARED_LIBS=Off ..
+cmake --build . --config Release
+```
diff --git a/encodec.cpp b/encodec.cpp
index a39d741..5a014f5 100644
--- a/encodec.cpp
+++ b/encodec.cpp
@@ -2,6 +2,10 @@
 #include "ggml-alloc.h"
 #include "ggml-backend.h"
 
+#ifdef GGML_USE_CUBLAS
+#include "ggml-cuda.h"
+#endif
+
 #ifdef GGML_USE_METAL
 #include "ggml-metal.h"
 #endif
@@ -446,6 +450,16 @@ bool encodec_load_model_weights(const std::string & fname, encodec_model & model
         }
     }
 
+#ifdef GGML_USE_CUBLAS
+    if (n_gpu_layers > 0) {
+        fprintf(stderr, "%s: using CUDA backend\n", __func__);
+        model.backend = ggml_backend_cuda_init();
+        if (!model.backend) {
+            fprintf(stderr, "%s: ggml_backend_cuda_init() failed\n", __func__);
+        }
+    }
+#endif
+
 #ifdef GGML_USE_METAL
     if (n_gpu_layers > 0) {
         fprintf(stderr, "%s: using Metal backend\n", __func__);
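
Note on the loader hunk above: when `ggml_backend_cuda_init()` fails, the patch only logs the error, so control presumably falls through to a later backend (the diff is truncated before that point). Below is a minimal sketch of the selection-with-fallback pattern the hunk implies, assuming an `n_gpu_layers` parameter and a `ggml_backend_cpu_init()` fallback as in other ggml-based projects; it is not the actual encodec.cpp code.

```cpp
// Sketch only: backend selection with a CPU fallback, mirroring the
// pattern implied by the patch. `init_backend` and the fall-through
// to CPU are assumptions, not code from this repository.
#include <cstdio>
#include "ggml-backend.h"
#ifdef GGML_USE_CUBLAS
#include "ggml-cuda.h"
#endif
#ifdef GGML_USE_METAL
#include "ggml-metal.h"
#endif

static ggml_backend_t init_backend(int n_gpu_layers) {
    ggml_backend_t backend = NULL;

#ifdef GGML_USE_CUBLAS
    if (n_gpu_layers > 0) {
        // no-argument form used by the ggml revision this patch targets
        backend = ggml_backend_cuda_init();
        if (!backend) {
            fprintf(stderr, "%s: ggml_backend_cuda_init() failed\n", __func__);
        }
    }
#endif

#ifdef GGML_USE_METAL
    if (!backend && n_gpu_layers > 0) {
        backend = ggml_backend_metal_init();
    }
#endif

    if (!backend) {
        // CPU fallback keeps CPU-only builds and failed GPU init working
        backend = ggml_backend_cpu_init();
    }
    return backend;
}
```

Because only the compile definition gates the `ggml-cuda.h` include and the CUDA branch, builds configured without `-DGGML_CUBLAS=ON` need neither the CUDA toolkit nor any change to the call sites.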