
llama : ggml-backend integration #4766

Merged 39 commits from sl/backend-sched into master on Jan 12, 2024.

Changes shown below are from a single commit (7c16cf1); the full commit list follows.

Commits (39):
33f0761 llama : ggml-backend integration (slaren, Dec 28, 2023)
6483328 ggml-backend : add names to buffers (slaren, Jan 5, 2024)
a1ab35c fix unmap after loading (slaren, Jan 5, 2024)
1fa7ee2 batched-bench : add tensor_split param (ggerganov, Jan 5, 2024)
863ef45 llama : check for null tensor_split (slaren, Jan 5, 2024)
d107459 ggml-backend : increase GGML_MAX_BACKENDS (slaren, Jan 5, 2024)
ece0b0d improve graph splitting, partial fix for --no-kv-offload (slaren, Jan 5, 2024)
2f2c367 cuda : add ggml-backend split buffer support (slaren, Jan 6, 2024)
72b74f3 cuda : do not create buffer types for devices that don't exist (fixes… (slaren, Jan 6, 2024)
f77c72f ggml : fix null backend dereference (#4807) (ggerganov, Jan 7, 2024)
7c16cf1 test-backend-ops : check buffer allocation failures (slaren, Jan 7, 2024)
87c8207 Merge remote-tracking branch 'origin/master' into sl/backend-sched (slaren, Jan 7, 2024)
5e879c9 llama : add cparam (split_mode) and command line argument (--split-mo… (slaren, Jan 7, 2024)
ac145fd ggml : fix mul_mat_id work size (slaren, Jan 8, 2024)
444b975 llama : rewrite session kv load/set without graphs (slaren, Jan 8, 2024)
d41cef9 minor (slaren, Jan 8, 2024)
5a62db3 llama : only initialize used backends, free backends on context free (slaren, Jan 8, 2024)
4813e17 llama : abort ctx if cuda backend init fails (slaren, Jan 8, 2024)
11583c1 llama : rewrite lora with ggml-backend and compute on CPU (slaren, Jan 8, 2024)
4ed5f62 llama : only map to a backend buffer the region of the file mapping c… (slaren, Jan 8, 2024)
fa76201 opencl : add ggml-backend buffer type (slaren, Jan 9, 2024)
2e7814a Merge remote-tracking branch 'origin/master' into sl/backend-sched (slaren, Jan 9, 2024)
5d2dffc cuda : only use batched_cublas with batched mat muls (fixes fp16 tg p… (slaren, Jan 10, 2024)
3cb1c1f Merge remote-tracking branch 'origin/master' into sl/backend-sched (slaren, Jan 10, 2024)
07a1b05 llama : on Metal, by default offload the full model (ggerganov, Jan 10, 2024)
3cd0cbb metal : page align the data ptr (#4854) (ggerganov, Jan 10, 2024)
74066f8 Apply suggestions from code review (slaren, Jan 10, 2024)
c522c11 cuda : fix split buffer free (slaren, Jan 10, 2024)
9d4ba6e address review comments (slaren, Jan 11, 2024)
d83c084 llama-bench : add split-mode parameter (slaren, Jan 11, 2024)
6dcc42b fix whitespace (slaren, Jan 11, 2024)
42aa835 opencl : fix double initialization (slaren, Jan 11, 2024)
c3681af Merge remote-tracking branch 'origin/master' into sl/backend-sched (slaren, Jan 11, 2024)
c486719 server : add --split-mode parameter (slaren, Jan 11, 2024)
23c14ef use async copy and compute to improve multi-gpu performance (slaren, Jan 11, 2024)
e73009e use async memcpys to copy the graph outputs to the CPU (slaren, Jan 12, 2024)
1e7694e fix opencl (slaren, Jan 12, 2024)
458674c Merge remote-tracking branch 'origin/master' into sl/backend-sched (slaren, Jan 12, 2024)
53ae0dd use a host buffer for the cpu compute buffer for faster copies to the… (slaren, Jan 12, 2024)
test-backend-ops : check buffer allocation failures
slaren committed Jan 7, 2024
commit 7c16cf106d1f6410a7d75845ed29c78b662786ee
37 changes: 36 additions & 1 deletion ggml-backend.c

@@ -1503,6 +1503,21 @@ struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, s
     struct ggml_context * ctx_allocated = ggml_init(params);
     struct ggml_context * ctx_unallocated = ggml_init(params);

+    if (ctx_allocated == NULL || ctx_unallocated == NULL) {
+        fprintf(stderr, "failed to allocate context for graph copy\n");
+        free(hash_set.keys);
+        free(node_copies);
+        free(node_init);
+        ggml_free(ctx_allocated);
+        ggml_free(ctx_unallocated);
+        return (struct ggml_backend_graph_copy) {
+            /* .buffer           = */ NULL,
+            /* .ctx_allocated    = */ NULL,
+            /* .ctx_unallocated  = */ NULL,
+            /* .graph            = */ NULL,
+        };
+    }
+
     // dup nodes
     for (int i = 0; i < graph->n_nodes; i++) {
         struct ggml_tensor * node = graph->nodes[i];
@@ -1511,6 +1526,20 @@ struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, s

     // allocate nodes
     ggml_backend_buffer_t buffer = ggml_backend_alloc_ctx_tensors(ctx_allocated, backend);
+    if (buffer == NULL) {
+        fprintf(stderr, "failed to allocate buffer for graph copy\n");
+        free(hash_set.keys);
+        free(node_copies);
+        free(node_init);
+        ggml_free(ctx_allocated);
+        ggml_free(ctx_unallocated);
+        return (struct ggml_backend_graph_copy) {
+            /* .buffer           = */ NULL,
+            /* .ctx_allocated    = */ NULL,
+            /* .ctx_unallocated  = */ NULL,
+            /* .graph            = */ NULL,
+        };
+    }

     //printf("copy buffer size: %zu MB\n", ggml_backend_buffer_get_size(buffer) / 1024 / 1024);

@@ -1547,8 +1576,12 @@ void ggml_backend_graph_copy_free(struct ggml_backend_graph_copy copy) {
     ggml_free(copy.ctx_unallocated);
 }

-void ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data) {
+bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data) {
     struct ggml_backend_graph_copy copy = ggml_backend_graph_copy(backend2, graph);
+    if (copy.buffer == NULL) {
+        return false;
+    }
+
     struct ggml_cgraph * g1 = graph;
     struct ggml_cgraph * g2 = copy.graph;

@@ -1578,4 +1611,6 @@ void ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t
     }

     ggml_backend_graph_copy_free(copy);
+
+    return true;
 }
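
For context, a minimal caller-side sketch (not part of this PR) of the failure contract these hunks establish: on any allocation failure, ggml_backend_graph_copy now returns a struct whose fields are all NULL, so a caller only needs to test copy.buffer before using the result. The backend and graph parameters below are hypothetical stand-ins for an initialized backend and a built graph.

    #include <stdio.h>
    #include "ggml.h"
    #include "ggml-backend.h"

    // Hedged sketch: assumes `backend` and `graph` were set up elsewhere.
    static bool try_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph) {
        struct ggml_backend_graph_copy copy = ggml_backend_graph_copy(backend, graph);
        if (copy.buffer == NULL) {
            // per the error paths above, every field is NULL on failure,
            // so there is nothing for the caller to free
            fprintf(stderr, "graph copy failed\n");
            return false;
        }

        // ... use copy.graph here ...

        ggml_backend_graph_copy_free(copy);
        return true;
    }

Because ggml_backend_graph_copy_free frees each field unconditionally, the early-return paths above release everything themselves rather than returning a partially initialized struct.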
2 changes: 1 addition & 1 deletion ggml-backend.h

@@ -189,7 +189,7 @@ extern "C" {
     typedef bool (*ggml_backend_eval_callback)(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data);

     // Compare the output of two backends
-    GGML_API void ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data);
+    GGML_API bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data);

     // Tensor initialization
     GGML_API void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr);
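
The header change turns ggml_backend_compare_graph_backend from void to bool so callers can tell when the comparison could not run at all (as opposed to a per-tensor mismatch, which is reported through the callback). A hedged usage sketch — the function and variable names here are hypothetical; only the callback typedef and the declaration above are from the PR:

    #include <stdio.h>
    #include "ggml.h"
    #include "ggml-backend.h"

    // Sketch of an eval callback matching the ggml_backend_eval_callback
    // typedef above; a real callback would compare the data of t1 and t2
    // (returning false presumably stops the comparison early).
    static bool eval_cb(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data) {
        (void) node_index; (void) t1; (void) t2; (void) user_data;
        return true;
    }

    // `backend1`, `backend2`, and `graph` are assumed initialized elsewhere.
    static void compare_backends(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph) {
        if (!ggml_backend_compare_graph_backend(backend1, backend2, graph, eval_cb, NULL)) {
            // false now signals that the graph copy could not be allocated,
            // i.e. the comparison never ran
            fprintf(stderr, "comparison did not run\n");
        }
    }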
18 changes: 16 additions & 2 deletions tests/test-backend-ops.cpp

@@ -376,6 +376,11 @@ struct test_case {

         // allocate
         ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(ctx, backend1);
+        if (buf == NULL) {
+            printf("failed to allocate tensors [%s] ", ggml_backend_name(backend1));
+            ggml_free(ctx);
+            return false;
+        }

         // build graph
         ggml_build_forward_expand(gf, out);
@@ -463,9 +468,13 @@ struct test_case {
             GGML_UNUSED(index);
         };

-        ggml_backend_compare_graph_backend(backend1, backend2, gf, callback, &ud);
+        bool cmp_ok = ggml_backend_compare_graph_backend(backend1, backend2, gf, callback, &ud);

+        if (!cmp_ok) {
+            printf("compare failed ");
+        }
+
-        if (ud.ok) {
+        if (ud.ok && cmp_ok) {
             printf("\033[1;32mOK\033[0m\n");
         } else {
             printf("\033[1;31mFAIL\033[0m\n");
@@ -519,6 +528,11 @@ struct test_case {

         // allocate
         ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(ctx, backend);
+        if (buf == NULL) {
+            printf("failed to allocate tensors\n");
+            ggml_free(ctx);
+            return false;
+        }

         // randomize tensors
         initialize_tensors(ctx);