ggml : change ggml_graph_compute() API to not require context #1999

Merged · 20 commits · Jul 7, 2023
Changes from 1 commit: reusable buffers
mqy committed Jul 6, 2023
commit b1331d7e604eeae9b9b0e4f7b3a50b70b49c1b44
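
The change is mechanical across all call sites: build a plan, point its work buffer at a caller-owned allocation that is kept alive between calls, and run the graph. Below is a minimal C++ sketch of the pattern, assuming the `ggml_graph_compute_make_plan()` / `ggml_graph_compute()` API introduced earlier in this PR; the helper name `compute_graph_with_buffer` is illustrative and not part of the diff:

    // Sketch of the reusable-buffer pattern this commit applies everywhere.
    #include <cstdint>
    #include <vector>

    #include "ggml.h"   // assumed: the ggml.h from this PR branch

    // Hypothetical helper; the files in the diff inline this logic instead.
    static void compute_graph_with_buffer(std::vector<uint8_t> & compute_plan_buffer,
                                          struct ggml_cgraph * gf, int n_threads) {
        struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(gf, n_threads);
        if (plan.work_size > 0) {
            compute_plan_buffer.resize(plan.work_size);   // grows when needed, reuses capacity otherwise
            plan.work_data = compute_plan_buffer.data();  // contiguous storage owned by the caller
        }
        ggml_graph_compute(&plan, gf);
        // no free(): the buffer outlives the call and is reused by the next graph computation
    }

Compared with the previous malloc()/free() pair around every call, the vector keeps its capacity across iterations, so steady-state evaluation stops hitting the allocator once the largest work_size has been seen.
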
23 changes: 8 additions & 15 deletions examples/baby-llama/baby-llama.cpp
@@ -1569,6 +1569,8 @@ int main(int argc, char ** argv) {
     int n_tokens = model.hparams.n_ctx;
     int n_vocab = model.hparams.n_vocab;
 
+    auto compute_plan_buffer = std::vector<uint8_t>();
+
     for (int ex=0; ex<n_examples; ++ex) {
         struct ggml_init_params params = {
             /*.mem_size =*/ compute_size,
@@ -1598,13 +1600,10 @@
         {
             struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, /*n_threads*/ 1);
             if (plan.work_size > 0) {
-                plan.work_data = malloc(plan.work_size);
-                GGML_ASSERT(plan.work_data);
+                compute_plan_buffer.resize(plan.work_size);
+                plan.work_data = compute_plan_buffer.data();
             }
             ggml_graph_compute(&plan, &gf);
-            if (plan.work_data) {
-                free(plan.work_data);
-            }
         }
 
         float error_before_opt = ggml_get_f32_1d(e, 0);
@@ -1625,13 +1624,10 @@
         {
             struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, /*n_threads*/ 1);
             if (plan.work_size > 0) {
-                plan.work_data = malloc(plan.work_size);
-                GGML_ASSERT(plan.work_data);
+                compute_plan_buffer.resize(plan.work_size);
+                plan.work_data = compute_plan_buffer.data();
             }
             ggml_graph_compute(&plan, &gf);
-            if (plan.work_data) {
-                free(plan.work_data);
-            }
         }
 
         float error_after_opt = ggml_get_f32_1d(e, 0);
@@ -1689,13 +1685,10 @@
         {
             struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, /*n_threads*/ 1);
             if (plan.work_size > 0) {
-                plan.work_data = malloc(plan.work_size);
-                GGML_ASSERT(plan.work_data);
+                compute_plan_buffer.resize(plan.work_size);
+                plan.work_data = compute_plan_buffer.data();
             }
             ggml_graph_compute(&plan, &gf);
-            if (plan.work_data) {
-                free(plan.work_data);
-            }
         }
 
         struct ggml_tensor * best_samples = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, sample_ctx);
29 changes: 11 additions & 18 deletions examples/benchmark/benchmark-matmult.cpp
@@ -164,16 +164,15 @@ int main(int argc, char ** argv) {
     TENSOR_DUMP(m11);
     TENSOR_DUMP(m2);
 
+    auto compute_plan_buffer = std::vector<uint8_t>();
+
     {
-        struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, benchmark_params.n_threads);
+        auto plan = ggml_graph_compute_make_plan(&gf, benchmark_params.n_threads);
         if (plan.work_size > 0) {
-            plan.work_data = malloc(plan.work_size);
-            GGML_ASSERT(plan.work_data);
+            compute_plan_buffer.resize(plan.work_size);
+            plan.work_data = compute_plan_buffer.data();
         }
         ggml_graph_compute(&plan, &gf);
-        if (plan.work_data) {
-            free(plan.work_data);
-        }
     }
 
     TENSOR_DUMP(gf.nodes[0]);
@@ -229,15 +228,12 @@
         long long int start = ggml_time_us();
         //printf("Running ggml_graph_compute\n");
         {
-            struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf31, benchmark_params.n_threads);
+            auto plan = ggml_graph_compute_make_plan(&gf31, benchmark_params.n_threads);
             if (plan.work_size > 0) {
-                plan.work_data = malloc(plan.work_size);
-                GGML_ASSERT(plan.work_data);
+                compute_plan_buffer.resize(plan.work_size);
+                plan.work_data = compute_plan_buffer.data();
             }
             ggml_graph_compute(&plan, &gf31);
-            if (plan.work_data) {
-                free(plan.work_data);
-            }
         }
 
         long long int stop = ggml_time_us();
@@ -272,15 +268,12 @@
 
         // Running a different graph computation to make sure we override the CPU cache lines
         {
-            struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf32, benchmark_params.n_threads);
+            auto plan = ggml_graph_compute_make_plan(&gf32, benchmark_params.n_threads);
             if (plan.work_size > 0) {
-                plan.work_data = malloc(plan.work_size);
-                GGML_ASSERT(plan.work_data);
+                compute_plan_buffer.resize(plan.work_size);
+                plan.work_data = compute_plan_buffer.data();
             }
             ggml_graph_compute(&plan, &gf32);
-            if (plan.work_data) {
-                free(plan.work_data);
-            }
         }
     }
     printf("\n");
29 changes: 11 additions & 18 deletions examples/train-text-from-scratch/train-text-from-scratch.cpp
@@ -3181,6 +3181,8 @@ int main(int argc, char ** argv) {
         GGML_ASSERT(train_samples[i]+n_tokens-1 < (int) train_tokens.size());
     }
 
+    auto compute_plan_buffer = std::vector<uint8_t>();
+
     printf("%s: begin training\n", __func__);
 
     for (int ex = 0; ex < params.n_examples; ++ex) {
@@ -3244,15 +3246,12 @@
         }
 
         {
-            struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(gf, params.n_threads);
+            auto plan = ggml_graph_compute_make_plan(gf, params.n_threads);
             if (plan.work_size > 0) {
-                plan.work_data = malloc(plan.work_size);
-                GGML_ASSERT(plan.work_data);
+                compute_plan_buffer.resize(plan.work_size);
+                plan.work_data = compute_plan_buffer.data();
             }
             ggml_graph_compute(&plan, gf);
-            if (plan.work_data) {
-                free(plan.work_data);
-            }
         }
 
         size_t used_mem_before_opt = ggml_used_mem(ctx0);
@@ -3278,15 +3277,12 @@
         model.train_tokens += n_batch * n_tokens;
 
         {
-            struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(gf, params.n_threads);
+            auto plan = ggml_graph_compute_make_plan(gf, params.n_threads);
             if (plan.work_size > 0) {
-                plan.work_data = malloc(plan.work_size);
-                GGML_ASSERT(plan.work_data);
+                compute_plan_buffer.resize(plan.work_size);
+                plan.work_data = compute_plan_buffer.data();
             }
             ggml_graph_compute(&plan, gf);
-            if (plan.work_data) {
-                free(plan.work_data);
-            }
         }
 
         float error_after_opt = ggml_get_f32_1d(loss, 0);
@@ -3376,15 +3372,12 @@
         ggml_build_forward_expand(&gf, logits);
 
         {
-            struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, params.n_threads);
+            auto plan = ggml_graph_compute_make_plan(&gf, params.n_threads);
             if (plan.work_size > 0) {
-                plan.work_data = malloc(plan.work_size);
-                GGML_ASSERT(plan.work_data);
+                compute_plan_buffer.resize(plan.work_size);
+                plan.work_data = compute_plan_buffer.data();
             }
             ggml_graph_compute(&plan, &gf);
-            if (plan.work_data) {
-                free(plan.work_data);
-            }
         }
 
         //struct ggml_tensor * best_samples = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, sample_ctx);
3 changes: 2 additions & 1 deletion ggml.c
@@ -15974,7 +15974,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
     const struct ggml_cgraph * cgraph = state->shared->cgraph;
 
     const struct ggml_graph_compute_plan * plan = state->shared->plan;
-    const int *n_tasks_arr = plan->n_tasks;
+    const int * n_tasks_arr = plan->n_tasks;
 
     const int n_threads = state->shared->n_threads;
     set_numa_thread_affinity(state->ith, n_threads);
@@ -16490,6 +16490,7 @@ void ggml_graph_compute(struct ggml_graph_compute_plan * plan, struct ggml_cgraph
         }
     }
 
+// TODO: avoid allocating memory frequently.
 static void ggml_graph_compute_sugar(struct ggml_cgraph * cgraph, int n_threads) {
     struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(cgraph, n_threads);
     if (plan.work_size > 0) {
2 changes: 1 addition & 1 deletion ggml.h
@@ -449,7 +449,7 @@ extern "C" {
         // Size of work buffer, calculated by `ggml_graph_compute_make_plan()`.
         size_t work_size;
         // Work buffer, to be allocated by caller before calling to `ggml_graph_compute()`.
-        void * work_data;
+        uint8_t * work_data;
 
         int n_threads;
 
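
Putting the ggml.h fragment above together with the `plan->n_tasks` access in ggml.c, the caller-facing plan type at this point in the PR looks roughly like the following; the field order and the `GGML_MAX_NODES` bound on `n_tasks` are assumptions, not shown in this hunk:

    // Approximate shape of the plan struct at this commit (reconstructed sketch, not verbatim).
    struct ggml_graph_compute_plan {
        // Size of work buffer, calculated by `ggml_graph_compute_make_plan()`.
        size_t work_size;
        // Work buffer, to be allocated by the caller before calling `ggml_graph_compute()`.
        uint8_t * work_data;

        int n_threads;

        // Per-node task counts read by the compute threads (see `plan->n_tasks` in ggml.c);
        // the array bound is assumed to be GGML_MAX_NODES.
        int n_tasks[GGML_MAX_NODES];
    };

Typing work_data as uint8_t * rather than void * makes the buffer explicitly a byte array, matching the std::vector<uint8_t> buffers that callers now keep and hand in via data().
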
64 changes: 25 additions & 39 deletions llama.cpp
@@ -321,6 +321,10 @@ struct llama_context {
     // input embedding (1-dimensional array: [n_embd])
     std::vector<float> embedding;
 
+    // reusable buffer for `struct ggml_graph_compute_plan.work_data`
+    // std::vector guarantees the elements are stored contiguously.
+    std::vector<uint8_t> compute_plan_buffer;
+
     // memory buffers used to evaluate the model
     // TODO: move in llama_state
     llama_ctx_buffer buf_compute;
@@ -1591,10 +1595,13 @@ static bool llama_eval_internal(
     // run the computation
     ggml_build_forward_expand(&gf, cur);
 
+    bool call_ggml_graph_compute = true;
+
 #ifdef GGML_USE_METAL
     if (lctx.ctx_metal && N == 1) {
         ggml_metal_graph_compute(lctx.ctx_metal, &gf);
         ggml_metal_get_tensor (lctx.ctx_metal, cur);
+        call_ggml_graph_compute = false;
     } else {
         // IMPORTANT:
         // Since we don't have efficient Matrix x Matrix Metal multiplication yet, we fallback to vanilla
@@ -1611,32 +1618,17 @@
             ggml_metal_get_tensor(lctx.ctx_metal, kv_self.k);
             ggml_metal_get_tensor(lctx.ctx_metal, kv_self.v);
         }
-
-        {
-            struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, actual_n_threads);
-            if (plan.work_size > 0) {
-                plan.work_data = malloc(plan.work_size);
-                GGML_ASSERT(plan.work_data);
-            }
-            ggml_graph_compute(&plan, &gf);
-            if (plan.work_data) {
-                free(plan.work_data);
-            }
-        }
     }
-#else
-    {
-        struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, actual_n_threads);
+#endif
+
+    if (call_ggml_graph_compute) {
+        auto plan = ggml_graph_compute_make_plan(&gf, actual_n_threads);
         if (plan.work_size > 0) {
-            plan.work_data = malloc(plan.work_size);
-            GGML_ASSERT(plan.work_data);
+            lctx.compute_plan_buffer.resize(plan.work_size);
+            plan.work_data = lctx.compute_plan_buffer.data();
         }
         ggml_graph_compute(&plan, &gf);
-        if (plan.work_data) {
-            free(plan.work_data);
-        }
     }
-#endif
 
     if (cgraph_fname) {
         ggml_graph_export(&gf, cgraph_fname);
@@ -2822,6 +2814,9 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
     // read tensors and apply
     bool warned = false;
    int n_tensors = 0;
+
+    auto compute_plan_buffer = std::vector<uint8_t>();
+
     while (true) {
         int32_t n_dims;
         int32_t length;
@@ -2988,15 +2983,12 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
         struct ggml_cgraph gf = ggml_build_forward(r);
 
         {
-            struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, n_threads);
+            auto plan = ggml_graph_compute_make_plan(&gf, n_threads);
             if (plan.work_size > 0) {
-                plan.work_data = malloc(plan.work_size);
-                GGML_ASSERT(plan.work_data);
+                compute_plan_buffer.resize(plan.work_size);
+                plan.work_data = compute_plan_buffer.data();
             }
             ggml_graph_compute(&plan, &gf);
-            if (plan.work_data) {
-                free(plan.work_data);
-            }
         }
 
         // we won't need these tensors again, reset the context to save memory
@@ -3171,15 +3163,12 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
         ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, v3d, vout3d));
 
         {
-            struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, /*n_threads*/ 1);
+            auto plan = ggml_graph_compute_make_plan(&gf, /*n_threads*/ 1);
             if (plan.work_size > 0) {
-                plan.work_data = malloc(plan.work_size);
-                GGML_ASSERT(plan.work_data);
+                ctx->compute_plan_buffer.resize(plan.work_size);
+                plan.work_data = ctx->compute_plan_buffer.data();
             }
             ggml_graph_compute(&plan, &gf);
-            if (plan.work_data) {
-                free(plan.work_data);
-            }
         }
 
         ggml_free(cpy_ctx);
@@ -3287,15 +3276,12 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
         ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, vin3d, v3d));
 
         {
-            struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, /*n_threads*/ 1);
+            auto plan = ggml_graph_compute_make_plan(&gf, /*n_threads*/ 1);
             if (plan.work_size > 0) {
-                plan.work_data = malloc(plan.work_size);
-                GGML_ASSERT(plan.work_data);
+                ctx->compute_plan_buffer.resize(plan.work_size);
+                plan.work_data = ctx->compute_plan_buffer.data();
             }
             ggml_graph_compute(&plan, &gf);
-            if (plan.work_data) {
-                free(plan.work_data);
-            }
         }
 
         ggml_free(cpy_ctx);