
Commit

ggml : change ggml_graph_compute() API to not require context (#1999)
* ggml_graph_compute: deprecate using ggml_context, try to resolve issue ggerganov#287

* rewrite: no longer consider backward compatibility; plan and make_plan

* minor: rename ctx as plan; const

* remove ggml_graph_compute from tests/test-grad0.c, but current change breaks backward

* add static ggml_graph_compute_sugar()

* minor: update comments

* reusable buffers

* ggml : more consistent naming + metal fixes

* ggml : fix docs

* tests : disable grad / opt + minor naming changes

* ggml : add ggml_graph_compute_with_ctx()

- backwards compatible API
- deduplicates a lot of copy-paste

* ci : enable test-grad0

* examples : factor out plan allocation into a helper function

* llama : factor out plan stuff into a helper function

* ci : fix env

* llama : fix duplicate symbols + refactor example benchmark

* ggml : remove obsolete assert + refactor n_tasks section

* ggml : fix indentation in switch

* llama : avoid unnecessary bool

* ggml : remove comments from source file and match order in header

---------

Co-authored-by: Georgi Gerganov <[email protected]>
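
For reference, the new calling convention in a minimal sketch (assumes a graph gf, a context ctx0 and a thread count n_threads that are already set up, as in the example programs changed below; names follow the ggml.h declarations at this revision):

    // before this commit, the work buffer came from the context:
    //   gf.n_threads = n_threads;
    //   ggml_graph_compute(ctx0, &gf);

    // after this commit, the caller builds a plan and owns the work buffer:
    struct ggml_cplan plan = ggml_graph_plan(&gf, n_threads);

    std::vector<uint8_t> work_buffer;
    if (plan.work_size > 0) {
        work_buffer.resize(plan.work_size);
        plan.work_data = work_buffer.data();
    }

    ggml_graph_compute(&gf, &plan);

    // or keep the old behavior and let a context allocate the work tensor:
    //   ggml_graph_compute_with_ctx(ctx0, &gf, n_threads);

The ggml_graph_compute_helper() added to the example programs below wraps this pattern around a reusable std::vector<uint8_t> work buffer.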
mqy and ggerganov committed Jul 7, 2023
1 parent 7242140 commit 1d656d6
Showing 13 changed files with 531 additions and 409 deletions.
13 changes: 8 additions & 5 deletions .github/workflows/build.yml
@@ -16,7 +16,9 @@ on:
paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu']

env:
BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
GGML_NLOOP: 3
GGML_NITER: 1

jobs:
ubuntu-focal-make:
@@ -64,7 +66,7 @@ jobs:
id: cmake_test
run: |
cd build
ctest --verbose
ctest --verbose --timeout 900
ubuntu-latest-cmake-sanitizer:
runs-on: ubuntu-latest
@@ -99,7 +101,7 @@ jobs:
id: cmake_test
run: |
cd build
ctest --verbose
ctest --verbose --timeout 900
macOS-latest-make:
runs-on: macos-latest
@@ -147,10 +149,11 @@ jobs:
id: cmake_test
run: |
cd build
ctest --verbose
ctest --verbose --timeout 900
windows-latest-cmake:
runs-on: windows-latest

env:
OPENBLAS_VERSION: 0.3.23
OPENCL_VERSION: 2023.04.17
@@ -249,7 +252,7 @@ jobs:
if: ${{ matrix.build != 'clblast' && (matrix.build != 'avx512' || env.HAS_AVX512F == '1') }} # Test AVX-512 only when possible
run: |
cd build
ctest -C Release --verbose
ctest -C Release --verbose --timeout 900
- name: Get commit hash
id: commit
24 changes: 18 additions & 6 deletions examples/baby-llama/baby-llama.cpp
@@ -31,6 +31,17 @@ float frand_normal(struct random_normal_distribution * rnd) {
return ((r < rnd->min) ? (rnd->min) : (r > rnd->max) ? (rnd->max) : r);
}

void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);

if (plan.work_size > 0) {
buf.resize(plan.work_size);
plan.work_data = buf.data();
}

ggml_graph_compute(graph, &plan);
}

struct ggml_tensor * randomize_tensor(
struct ggml_tensor * tensor,
int ndims,
@@ -1569,6 +1580,8 @@ int main(int argc, char ** argv) {
int n_tokens = model.hparams.n_ctx;
int n_vocab = model.hparams.n_vocab;

std::vector<uint8_t> work_buffer;

for (int ex=0; ex<n_examples; ++ex) {
struct ggml_init_params params = {
/*.mem_size =*/ compute_size,
@@ -1586,7 +1599,6 @@ int main(int argc, char ** argv) {
int n_past = 0;

ggml_cgraph gf = {};
gf.n_threads = 1;

get_example_targets_batch(ctx0, 64*ex+0, tokens_input, targets);

@@ -1595,7 +1607,7 @@ int main(int argc, char ** argv) {
struct ggml_tensor * e = square_error_loss(ctx0, targets, logits);

ggml_build_forward_expand(&gf, e);
ggml_graph_compute(ctx0, &gf);
ggml_graph_compute_helper(work_buffer, &gf, /*n_threads*/ 1);

float error_before_opt = ggml_get_f32_1d(e, 0);

@@ -1611,7 +1623,7 @@ int main(int argc, char ** argv) {
ggml_opt(ctx0, opt_params_lbfgs, e);
//
ggml_build_forward_expand(&gf, e);
ggml_graph_compute(ctx0, &gf);
ggml_graph_compute_helper(work_buffer, &gf, /*n_threads*/ 1);

float error_after_opt = ggml_get_f32_1d(e, 0);

@@ -1659,13 +1671,12 @@ int main(int argc, char ** argv) {
struct ggml_context * ctx0 = ggml_init(params);

ggml_cgraph gf = {};
gf.n_threads = 1;

int n_past = 0;
struct ggml_tensor * logits = forward(&model, &kv_self, ctx0, &gf, tokens_input, sample_ctx, n_past);

ggml_build_forward_expand(&gf, logits);
ggml_graph_compute(ctx0, &gf);
ggml_graph_compute_helper(work_buffer, &gf, /*n_threads*/ 1);

struct ggml_tensor * best_samples = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, sample_ctx);
struct ggml_tensor * probs = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_vocab, sample_ctx);
@@ -1687,10 +1698,11 @@ int main(int argc, char ** argv) {
}

print_matrix(model.tok_embeddings);

printf("done\n");

// ggml_free(kv_self.ctx);
// ggml_free(model_lora.ctx);
ggml_free(model.ctx);

return 0;
}
29 changes: 20 additions & 9 deletions examples/benchmark/benchmark-matmult.cpp
@@ -20,6 +20,17 @@
#pragma warning(disable: 4244 4267) // possible loss of data
#endif

void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);

if (plan.work_size > 0) {
buf.resize(plan.work_size);
plan.work_data = buf.data();
}

ggml_graph_compute(graph, &plan);
}

float tensor_sum_elements(const ggml_tensor * tensor) {
float sum = 0;
if (tensor->type==GGML_TYPE_F32) {
@@ -159,13 +170,14 @@ int main(int argc, char ** argv) {
// printf("Creating compute graph\n");
struct ggml_cgraph gf = ggml_build_forward(m11xm2);

gf.n_threads=benchmark_params.n_threads;
printf("cgraph->n_threads=%i\n",gf.n_threads);
printf("n_threads=%i\n", benchmark_params.n_threads);

TENSOR_DUMP(m11);
TENSOR_DUMP(m2);

ggml_graph_compute(ctx, &gf);
std::vector<uint8_t> work_buffer;

ggml_graph_compute_helper(work_buffer, &gf, benchmark_params.n_threads);

TENSOR_DUMP(gf.nodes[0]);

@@ -187,7 +199,6 @@ int main(int argc, char ** argv) {

// printf("Creating compute graph\n");
struct ggml_cgraph gf31 = ggml_build_forward(q31);
gf31.n_threads=benchmark_params.n_threads;

// Set up a second graph computation to make sure we override the CPU cache lines
// printf("Creating new tensor q12 & Running quantize\n");
@@ -199,8 +210,7 @@ int main(int argc, char ** argv) {

//printf("Creating compute graph\n");
struct ggml_cgraph gf32 = ggml_build_forward(q32);
gf32.n_threads=benchmark_params.n_threads;
printf("cgraph->n_threads=%i\n",gf31.n_threads);
printf("n_threads=%i\n", benchmark_params.n_threads);

const int dimx = sizex;
const int dimy = sizey;
@@ -221,14 +231,15 @@ int main(int argc, char ** argv) {

long long int start = ggml_time_us();
//printf("Running ggml_graph_compute\n");
ggml_graph_compute(ctx, &gf31);
ggml_graph_compute_helper(work_buffer, &gf31, benchmark_params.n_threads);

long long int stop = ggml_time_us();
long long int usec = stop-start;
double gflops = (double)(flops_per_matrix)/usec/1000.0;
gflops_sum += gflops;
printf("%9i;%8i;%6i;%6i;%6i;%15lli;%18lli;%10.2f\n",
i,
gf31.n_threads,
benchmark_params.n_threads,
sizex, sizey, sizez, flops_per_matrix,
usec,gflops);

@@ -253,7 +264,7 @@ int main(int argc, char ** argv) {
}

// Running a different graph computation to make sure we override the CPU cache lines
ggml_graph_compute(ctx, &gf32);
ggml_graph_compute_helper(work_buffer, &gf32, benchmark_params.n_threads);
}
printf("\n");
printf("Average%78.2f\n",gflops_sum/((double)benchmark_params.n_iterations));
3 changes: 1 addition & 2 deletions examples/metal/metal.cpp
@@ -35,10 +35,9 @@ int main(int argc, char ** argv) {
struct ggml_context * ctx_eval = NULL;

struct ggml_cgraph gf = ggml_graph_import(fname_cgraph, &ctx_data, &ctx_eval);
gf.n_threads = 1;

// this allocates all Metal resources and memory buffers
auto * ctx_metal = ggml_metal_init();
auto * ctx_metal = ggml_metal_init(1);

const size_t max_size_data = ggml_get_max_tensor_size(ctx_data);
const size_t max_size_eval = ggml_get_max_tensor_size(ctx_eval);
27 changes: 18 additions & 9 deletions examples/train-text-from-scratch/train-text-from-scratch.cpp
@@ -60,6 +60,17 @@ float frand_uniform(struct random_uniform_distribution * rnd) {
return rnd->rd(rnd->gen);
}

void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);

if (plan.work_size > 0) {
buf.resize(plan.work_size);
plan.work_data = buf.data();
}

ggml_graph_compute(graph, &plan);
}

struct ggml_tensor * randomize_tensor_normal(struct ggml_tensor * tensor, struct random_normal_distribution * rnd) {
float scale = 1.0f; // xavier
switch (tensor->n_dims) {
@@ -1426,11 +1437,9 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train(

gf->n_nodes = 0;
gf->n_leafs = 0;
gf->work_size = 0;
gf->perf_runs = 0;
gf->perf_cycles = 0;
gf->perf_time_us = 0;
gf->work = NULL;

const auto & hparams = model->hparams;
//const int n_ctx = hparams.n_ctx;
@@ -3162,6 +3171,7 @@ int main(int argc, char ** argv) {
printf("used_mem model+cache: %zu bytes\n", ggml_used_mem(model.ctx));
// ggml_print_tensor_objects(model.ctx);

// TODO: use std::vector<uint8_t> instead of "new"
size_t compute_size = 1024ll*1024ll*1024ll*((size_t) params.mem_compute_gb);
uint8_t * compute_addr = new uint8_t[compute_size];

@@ -3183,6 +3193,8 @@ int main(int argc, char ** argv) {
GGML_ASSERT(train_samples[i]+n_tokens-1 < (int) train_tokens.size());
}

std::vector<uint8_t> work_buffer;

printf("%s: begin training\n", __func__);

for (int ex = 0; ex < params.n_examples; ++ex) {
@@ -3217,9 +3229,6 @@ int main(int argc, char ** argv) {
struct ggml_cgraph * gf = (struct ggml_cgraph *) gfbuf->data;
struct ggml_cgraph * gb = (struct ggml_cgraph *) gbbuf->data;

// ggml_cgraph gf = {};
gf->n_threads = params.n_threads;
gb->n_threads = params.n_threads;

get_example_targets_batch(lctx, train_samples.data(), train_samples.size(), train_tokens.data(), train_tokens.size(), ex, tokens_input, target_logits, target_probs);

Expand Down Expand Up @@ -3248,7 +3257,7 @@ int main(int argc, char ** argv) {
*gb = ggml_build_backward(ctx0, gf, true);
}

ggml_graph_compute(ctx0, gf);
ggml_graph_compute_helper(work_buffer, gf, params.n_threads);

size_t used_mem_before_opt = ggml_used_mem(ctx0);

@@ -3272,7 +3281,7 @@ int main(int argc, char ** argv) {
model.train_samples += n_batch;
model.train_tokens += n_batch * n_tokens;

ggml_graph_compute(ctx0, gf);
ggml_graph_compute_helper(work_buffer, gf, params.n_threads);

float error_after_opt = ggml_get_f32_1d(loss, 0);

@@ -3354,13 +3363,12 @@ int main(int argc, char ** argv) {
struct ggml_context * ctx0 = ggml_init(cparams);

ggml_cgraph gf = {};
gf.n_threads = params.n_threads;

int n_past = 0;
struct ggml_tensor * logits = forward(&model, &kv_self, ctx0, &gf, tokens_input, sample_ctx, n_past);

ggml_build_forward_expand(&gf, logits);
ggml_graph_compute(ctx0, &gf);
ggml_graph_compute_helper(work_buffer, &gf, params.n_threads);

//struct ggml_tensor * best_samples = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, sample_ctx);
//struct ggml_tensor * probs = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_vocab, sample_ctx);
@@ -3386,6 +3394,7 @@ int main(int argc, char ** argv) {
delete[] compute_addr;
delete[] compute_buf_0;
delete[] compute_buf_1;

llama_free(lctx);
llama_free_model(lmodel);
ggml_free(model.ctx);
6 changes: 5 additions & 1 deletion ggml-metal.h
@@ -34,9 +34,13 @@ extern "C" {

struct ggml_metal_context;

struct ggml_metal_context * ggml_metal_init(void);
// number of command buffers to use
struct ggml_metal_context * ggml_metal_init(int n_cb);
void ggml_metal_free(struct ggml_metal_context * ctx);

// set the number of command buffers to use
void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb);

// creates a mapping between a host memory buffer and a device memory buffer
// - make sure to map all buffers used in the graph before calling ggml_metal_graph_compute
// - the mapping is used during computation to determine the arguments of the compute kernels
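
The Metal backend follows the same direction: the number of command buffers no longer comes from gf->n_threads but lives on the Metal context. A minimal caller-side sketch, assuming an already-built graph gf and the declarations above:

    struct ggml_metal_context * ctx_metal = ggml_metal_init(1); // start with one command buffer
    // ... ggml_metal_add_buffer() calls and graph setup elided ...
    ggml_metal_set_n_cb(ctx_metal, 4);                          // later, encode across 4 command buffers
    ggml_metal_graph_compute(ctx_metal, &gf);                   // previously read gf->n_threads
    ggml_metal_free(ctx_metal);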
11 changes: 9 additions & 2 deletions ggml-metal.m
@@ -25,6 +25,8 @@
};

struct ggml_metal_context {
int n_cb;

float * logits;

id<MTLDevice> device;
@@ -86,11 +88,12 @@ @interface GGMLMetalClass : NSObject
@implementation GGMLMetalClass
@end

struct ggml_metal_context * ggml_metal_init(void) {
struct ggml_metal_context * ggml_metal_init(int n_cb) {
fprintf(stderr, "%s: allocating\n", __func__);

struct ggml_metal_context * ctx = malloc(sizeof(struct ggml_metal_context));

ctx->n_cb = n_cb;
ctx->device = MTLCreateSystemDefaultDevice();
ctx->queue = [ctx->device newCommandQueue];
ctx->n_buffers = 0;
@@ -208,6 +211,10 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
free(ctx);
}

void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb) {
ctx->n_cb = n_cb;
}

// finds the Metal buffer that contains the tensor data on the GPU device
// the assumption is that there is 1-to-1 mapping between the host and device memory buffers, so we can find the
// Metal buffer based on the host memory pointer
@@ -354,7 +361,7 @@ void ggml_metal_graph_compute(
// create multiple command buffers and enqueue them
// then, we encode the graph into the command buffers in parallel

const int n_cb = gf->n_threads;
const int n_cb = ctx->n_cb;

NSMutableArray * command_buffers = [NSMutableArray arrayWithCapacity:n_cb];

