rwkv_gpu_offload.inc (forked from RWKV/rwkv.cpp)
#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
#if defined(GGML_USE_CUBLAS)
# include "ggml/src/ggml-cuda.h"
#elif defined(GGML_USE_CLBLAST)
# include "ggml/src/ggml-opencl.h"
#endif
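
// GGML_USE_CUBLAS / GGML_USE_CLBLAST are expected to be defined by the build
// system; in this repository that is presumably cmake -DRWKV_CUBLAS=ON or
// -DRWKV_CLBLAST=ON, mirroring the macros ggml itself checks.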

// API function.
bool rwkv_gpu_offload_layers(struct rwkv_context * ctx, const uint32_t n_layers) {
    // Marks a tensor as GPU-resident and uploads its data to device memory.
    const auto offload = [&](struct ggml_tensor * tensor) {
        // TODO Support multi-GPU
        tensor->backend = GGML_BACKEND_GPU;

#if defined(GGML_USE_CUBLAS)
        ggml_cuda_transform_tensor(tensor->data, tensor);
#elif defined(GGML_USE_CLBLAST)
        ggml_cl_transform_tensor(tensor->data, tensor);
#endif
    };

    // n_layer + 1: the model head counts as one extra offloadable "layer".
    const size_t n_gpu = std::min(n_layers, ctx->model->header.n_layer + 1);

    if (ctx->model->offloaded_layer_count >= n_gpu) {
        return false;
    }

    // i is a reference, so offloaded_layer_count advances with the loop and
    // a later call resumes from the first layer that is still on the CPU.
    for (size_t & i = ctx->model->offloaded_layer_count; i < n_gpu; i++) {
        if (i == ctx->model->header.n_layer) {
            // This is the index of the model head.
            offload(ctx->model->head);
            continue;
        }

        const struct rwkv_layer & layer = ctx->model->layers[i];

        // TODO Also offload other supported operations to GPU
        // For now, only the large matrix multiplication weights are offloaded.
        offload(layer.att_key);
        offload(layer.att_value);
        offload(layer.att_receptance);
        offload(layer.att_output);

        // att_gate is only present in newer model architectures.
        if (layer.att_gate != NULL) {
            offload(layer.att_gate);
        }

        offload(layer.ffn_key);
        offload(layer.ffn_value);
        offload(layer.ffn_receptance);
    }

    return true;
}

#else

// API function.
bool rwkv_gpu_offload_layers(struct rwkv_context * ctx, const uint32_t n_layers) {
    // CPU-only build: there is no GPU backend to offload to.
    return false;
}

#endif
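
For reference, a minimal sketch of a caller, assuming the public rwkv.h API
(rwkv_init_from_file, rwkv_gpu_offload_layers, rwkv_eval, rwkv_get_state_len,
rwkv_get_logits_len, rwkv_free); the model path, thread count, and layer count
below are placeholders, not values taken from this file:

#include <stdio.h>
#include <stdlib.h>

#include "rwkv.h"

int main(void) {
    // Load the model; "model.bin" and the thread count are example values.
    struct rwkv_context * ctx = rwkv_init_from_file("model.bin", 4);

    if (ctx == NULL) {
        return 1;
    }

    // Ask for up to 16 layers on the GPU; returns false on a CPU-only build
    // or when at least that many layers are already offloaded.
    if (!rwkv_gpu_offload_layers(ctx, 16)) {
        fprintf(stderr, "no additional layers were offloaded\n");
    }

    float * state = (float *) calloc(rwkv_get_state_len(ctx), sizeof(float));
    float * logits = (float *) calloc(rwkv_get_logits_len(ctx), sizeof(float));

    // Feed a single token; a NULL input state means "start from a fresh state".
    rwkv_eval(ctx, 0, NULL, state, logits);

    free(state);
    free(logits);
    rwkv_free(ctx);

    return 0;
}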