rwkv_gpu_offload.inc (forked from RWKV/rwkv.cpp)
#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
#if defined(GGML_USE_CUBLAS)
# include "ggml/src/ggml-cuda.h"
#elif defined(GGML_USE_CLBLAST)
# include "ggml/src/ggml-opencl.h"
#endif
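
// GGML_USE_CUBLAS / GGML_USE_CLBLAST are expected to be defined by the build
// system; in this repository that is presumably cmake -DRWKV_CUBLAS=ON or
// -DRWKV_CLBLAST=ON, mirroring the macros ggml itself checks.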

// API function.
bool rwkv_gpu_offload_layers(struct rwkv_context * ctx, const uint32_t n_layers) {
    // Marks a tensor as GPU-resident and uploads its data to device memory.
    const auto offload = [&](struct ggml_tensor * tensor) {
        // TODO Support multi-GPU
        tensor->backend = GGML_BACKEND_GPU;

#if defined(GGML_USE_CUBLAS)
        ggml_cuda_transform_tensor(tensor->data, tensor);
#elif defined(GGML_USE_CLBLAST)
        ggml_cl_transform_tensor(tensor->data, tensor);
#endif
    };

    // n_layer + 1: the model head counts as one extra offloadable "layer".
    const size_t n_gpu = std::min(n_layers, ctx->model->header.n_layer + 1);

    if (ctx->model->offloaded_layer_count >= n_gpu) {
        return false;
    }

    // i is a reference, so offloaded_layer_count advances with the loop and
    // a later call resumes from the first layer that is still on the CPU.
    for (size_t & i = ctx->model->offloaded_layer_count; i < n_gpu; i++) {
        if (i == ctx->model->header.n_layer) {
            // This is the index of the model head.
            offload(ctx->model->head);
            continue;
        }

        const struct rwkv_layer & layer = ctx->model->layers[i];

        // TODO Also offload other supported operations to GPU
        // For now, only the large matrix multiplication weights are offloaded.
        offload(layer.att_key);
        offload(layer.att_value);
        offload(layer.att_receptance);
        offload(layer.att_output);

        // att_gate is only present in newer model architectures.
        if (layer.att_gate != NULL) {
            offload(layer.att_gate);
        }

        offload(layer.ffn_key);
        offload(layer.ffn_value);
        offload(layer.ffn_receptance);
    }

    return true;
}

#else

// API function.
bool rwkv_gpu_offload_layers(struct rwkv_context * ctx, const uint32_t n_layers) {
    // CPU-only build: there is no GPU backend to offload to.
    return false;
}

#endif
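
For reference, a minimal sketch of a caller, assuming the public rwkv.h API
(rwkv_init_from_file, rwkv_gpu_offload_layers, rwkv_eval, rwkv_get_state_len,
rwkv_get_logits_len, rwkv_free); the model path, thread count, and layer count
below are placeholders, not values taken from this file:

#include <stdio.h>
#include <stdlib.h>

#include "rwkv.h"

int main(void) {
    // Load the model; "model.bin" and the thread count are example values.
    struct rwkv_context * ctx = rwkv_init_from_file("model.bin", 4);

    if (ctx == NULL) {
        return 1;
    }

    // Ask for up to 16 layers on the GPU; returns false on a CPU-only build
    // or when at least that many layers are already offloaded.
    if (!rwkv_gpu_offload_layers(ctx, 16)) {
        fprintf(stderr, "no additional layers were offloaded\n");
    }

    float * state = (float *) calloc(rwkv_get_state_len(ctx), sizeof(float));
    float * logits = (float *) calloc(rwkv_get_logits_len(ctx), sizeof(float));

    // Feed a single token; a NULL input state means "start from a fresh state".
    rwkv_eval(ctx, 0, NULL, state, logits);

    free(state);
    free(logits);
    rwkv_free(ctx);

    return 0;
}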