Skip to content

Commit

Permalink
ggml : cgraph export/import/eval example + GPU support (#108)
Browse files Browse the repository at this point in the history
* ggml : cgraph export brainstorming

* mnist : code style

* mnist : minor

* ggml : initial cgraph export

* ggml : initial graph import (wip)

* ggml : import op args correctly

* ggml : add ggml_get_tensor_by_name()

* mnist : add compute graph evaluation on CPU example

* ggml : add ggml_tensor_overhead()

* ggml : rename new functions to ggml_cgraph_...

* mnist : add Metal inference skeleton (WIP)

* mnist : working on the Metal pipeline (WIP)

* mnist : prepare the Metal encoder (WIP)

* mnist : first Metal kernel for F32 ADD

* mnist : looks like MTLHeap does not work

* mnist : initial full pass of MNIST on the GPU (not verified)

* mnist : minor cleanup

* mnist : full GPU inference works

* mnist : use custom soft_max kernel since MPSMatrixSoftMax is bugged

* mnist : use constant for soft_max instead of hardcoded 10

* mnist : check multiple predictions (Metal)

* mnist : minor

* ggml : move cgraph import / export to ggml

* mnist : remove common dependencies

* mnist : fix soft_max threadgroup size

* mnist : init no_alloc member

* ggml : improve "get tensor" API
  • Loading branch information
ggerganov committed May 29, 2023
1 parent db5eef1 commit 3b697a2
Show file tree
Hide file tree
Showing 8 changed files with 1,297 additions and 11 deletions.
26 changes: 26 additions & 0 deletions examples/mnist/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,29 @@ set(TEST_TARGET mnist)
add_executable(${TEST_TARGET} main.cpp)
target_link_libraries(${TEST_TARGET} PRIVATE ggml common)

#
# mnist-cpu
#

set(TEST_TARGET mnist-cpu)
add_executable(${TEST_TARGET} main-cpu.cpp)
target_link_libraries(${TEST_TARGET} PRIVATE ggml)

if (APPLE)
    #
    # mnist-mtl (GPU inference via Metal - Apple platforms only)
    #

    # system frameworks required by the Metal backend
    find_library(FOUNDATION_LIBRARY         Foundation              REQUIRED)
    find_library(METAL_FRAMEWORK            Metal                   REQUIRED)
    find_library(METALKIT_FRAMEWORK         MetalKit                REQUIRED)
    find_library(METALPERFORMANCE_FRAMEWORK MetalPerformanceShaders REQUIRED)

    set(TEST_TARGET mnist-mtl)
    add_executable(${TEST_TARGET} main-mtl.cpp main-mtl.h main-mtl.m)
    target_link_libraries(${TEST_TARGET} PRIVATE
        ggml
        ${FOUNDATION_LIBRARY}
        ${METAL_FRAMEWORK}
        ${METALKIT_FRAMEWORK}
        ${METALPERFORMANCE_FRAMEWORK}
    )
endif()
116 changes: 116 additions & 0 deletions examples/mnist/main-cpu.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
// Use a pre-generated MNIST compute graph for inference on the CPU
//
// You can generate a compute graph using the "mnist" tool:
//
// $ ./bin/mnist ./models/mnist/ggml-model-f32.bin ../examples/mnist/models/mnist/t10k-images.idx3-ubyte
//
// This command creates the "mnist.ggml" file, which contains the generated compute graph.
// Now, you can re-use the compute graph with the "mnist-cpu" tool:
//
// $ ./bin/mnist-cpu ./models/mnist/mnist.ggml ../examples/mnist/models/mnist/t10k-images.idx3-ubyte
//

#include "ggml/ggml.h"

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <ctime>
#include <fstream>
#include <vector>

// evaluate the MNIST compute graph on the CPU
//
// - fname_cgraph: path to the exported compute graph (see the "mnist" tool)
// - n_threads:    number of threads to use
// - digit:        784 (28x28) pixel values
//
// returns the 0 - 9 prediction (index of the highest output probability)
int mnist_eval(
        const char * fname_cgraph,
        const int n_threads,
        std::vector<float> digit
        ) {
    // load the compute graph
    struct ggml_context * ctx_data = NULL;
    struct ggml_context * ctx_eval = NULL;

    struct ggml_cgraph gfi = ggml_graph_import(fname_cgraph, &ctx_data, &ctx_eval);
    gfi.n_threads = n_threads;

    // work buffer needed during ggml_graph_compute() to allocate a work tensor
    // note: RAII vector instead of the original static malloc() that was never freed
    std::vector<uint8_t> buf(gfi.work_size); // TODO: the required size is a guess

    struct ggml_init_params params = {
        .mem_size   = buf.size(),
        .mem_buffer = buf.data(),
        .no_alloc   = false,
    };

    struct ggml_context * ctx0 = ggml_init(params);

    // copy the input pixels into the graph's "input" tensor
    struct ggml_tensor * input = ggml_graph_get_tensor(&gfi, "input");
    memcpy(input->data, digit.data(), ggml_nbytes(input));

    ggml_graph_compute(ctx0, &gfi);

    const float * probs_data = ggml_get_data_f32(ggml_graph_get_tensor(&gfi, "probs"));

    // the prediction is the class with the highest probability
    constexpr int n_classes = 10; // digits 0 - 9
    const int prediction = std::max_element(probs_data, probs_data + n_classes) - probs_data;

    ggml_free(ctx0);
    ggml_free(ctx_data);
    ggml_free(ctx_eval);

    return prediction;
}

// entry point: picks a random digit from the MNIST test set, renders it
// in ASCII, and prints the prediction from the pre-generated compute graph
int main(int argc, char ** argv) {
    srand(time(NULL));
    ggml_time_init();

    if (argc != 3) {
        fprintf(stderr, "Usage: %s models/mnist/mnist.ggml models/mnist/t10k-images.idx3-ubyte\n", argv[0]);
        exit(1); // a usage error is a failure - do not exit with 0
    }

    uint8_t buf[784];
    std::vector<float> digit;

    // read a random digit from the test set
    {
        std::ifstream fin(argv[2], std::ios::binary);
        if (!fin) {
            fprintf(stderr, "%s: failed to open '%s'\n", __func__, argv[2]);
            return 1;
        }

        // seek to a random digit: 16-byte header + 28*28 * (random 0 - 9999)
        fin.seekg(16 + 784 * (rand() % 10000));
        fin.read((char *) buf, sizeof(buf));

        // make sure all 784 bytes were actually read (truncated/corrupt file)
        if (!fin) {
            fprintf(stderr, "%s: failed to read a digit from '%s'\n", __func__, argv[2]);
            return 1;
        }
    }

    // render the digit in ASCII and convert the pixels to float input
    {
        digit.resize(sizeof(buf));

        for (int row = 0; row < 28; row++) {
            for (int col = 0; col < 28; col++) {
                fprintf(stderr, "%c ", (float)buf[row*28 + col] > 230 ? '*' : '_');
                digit[row*28 + col] = ((float)buf[row*28 + col]);
            }

            fprintf(stderr, "\n");
        }

        fprintf(stderr, "\n");
    }

    const int prediction = mnist_eval(argv[1], 1, digit);

    fprintf(stdout, "%s: predicted digit is %d\n", __func__, prediction);

    return 0;
}
129 changes: 129 additions & 0 deletions examples/mnist/main-mtl.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
// Use a pre-generated MNIST compute graph for inference on the M1 GPU via MPS
//
// You can generate a compute graph using the "mnist" tool:
//
// $ ./bin/mnist ./models/mnist/ggml-model-f32.bin ../examples/mnist/models/mnist/t10k-images.idx3-ubyte
//
// This command creates the "mnist.ggml" file, which contains the generated compute graph.
// Now, you can re-use the compute graph on the GPU with the "mnist-mtl" tool:
//
// $ ./bin/mnist-mtl ./models/mnist/mnist.ggml ../examples/mnist/models/mnist/t10k-images.idx3-ubyte
//

#include "ggml/ggml.h"

#include "main-mtl.h"

#include <cmath>
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <ctime>
#include <fstream>
#include <vector>

// evaluate the MNIST compute graph on the GPU via Metal
//
// - fname_cgraph: path to the exported compute graph (see the "mnist" tool)
// - n_threads:    number of threads (CPU-side graph setup)
// - digit:        784 (28x28) pixel values
//
// returns the 0 - 9 prediction
int mnist_eval(
        const char * fname_cgraph,
        const int n_threads,
        std::vector<float> digit
        ) {
    // load the compute graph
    struct ggml_context * ctx_data = NULL;
    struct ggml_context * ctx_eval = NULL;

    struct ggml_cgraph gf = ggml_graph_import(fname_cgraph, &ctx_data, &ctx_eval);
    gf.n_threads = n_threads;

    // work buffer needed during ggml_graph_compute() to allocate a work tensor
    // note: RAII vector instead of the original static malloc() that was never freed
    std::vector<uint8_t> buf(gf.work_size); // TODO: the required size is a guess

    struct ggml_init_params params = {
        .mem_size   = buf.size(),
        .mem_buffer = buf.data(),
        .no_alloc   = false,
    };

    struct ggml_context * ctx_work = ggml_init(params);

    // this allocates all Metal resources and memory buffers
    auto ctx_mtl = mnist_mtl_init(ctx_data, ctx_eval, ctx_work, &gf);

    int prediction = -1;

    // NOTE(review): leftover harness from testing multiple predictions on the
    // GPU - with a single iteration only the real digit (i == 0) is evaluated;
    // odd iterations would feed an all-zero input instead
    for (int i = 0; i < 1; ++i) {
        struct ggml_tensor * input = ggml_graph_get_tensor(&gf, "input");

        if (i % 2 == 0) {
            memcpy(input->data, digit.data(), ggml_nbytes(input));
        } else {
            memset(input->data, 0, ggml_nbytes(input));
        }

        // the actual inference happens here
        prediction = mnist_mtl_eval(ctx_mtl, &gf);
    }

    mnist_mtl_free(ctx_mtl);

    ggml_free(ctx_work);
    ggml_free(ctx_data);
    ggml_free(ctx_eval);

    return prediction;
}

// entry point: picks a random digit from the MNIST test set, renders it
// in ASCII, and prints the prediction computed on the GPU via Metal
int main(int argc, char ** argv) {
    srand(time(NULL));
    ggml_time_init();

    if (argc != 3) {
        fprintf(stderr, "Usage: %s models/mnist/mnist.ggml models/mnist/t10k-images.idx3-ubyte\n", argv[0]);
        exit(1); // a usage error is a failure - do not exit with 0
    }

    uint8_t buf[784];
    std::vector<float> digit;

    // read a random digit from the test set
    {
        std::ifstream fin(argv[2], std::ios::binary);
        if (!fin) {
            fprintf(stderr, "%s: failed to open '%s'\n", __func__, argv[2]);
            return 1;
        }

        // seek to a random digit: 16-byte header + 28*28 * (random 0 - 9999)
        fin.seekg(16 + 784 * (rand() % 10000));
        fin.read((char *) buf, sizeof(buf));

        // make sure all 784 bytes were actually read (truncated/corrupt file)
        if (!fin) {
            fprintf(stderr, "%s: failed to read a digit from '%s'\n", __func__, argv[2]);
            return 1;
        }
    }

    // render the digit in ASCII and convert the pixels to float input
    {
        digit.resize(sizeof(buf));

        for (int row = 0; row < 28; row++) {
            for (int col = 0; col < 28; col++) {
                fprintf(stderr, "%c ", (float)buf[row*28 + col] > 230 ? '*' : '_');
                digit[row*28 + col] = ((float)buf[row*28 + col]);
            }

            fprintf(stderr, "\n");
        }

        fprintf(stderr, "\n");
    }

    const int prediction = mnist_eval(argv[1], 1, digit);

    fprintf(stdout, "%s: predicted digit is %d\n", __func__, prediction);

    return 0;
}
26 changes: 26 additions & 0 deletions examples/mnist/main-mtl.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
#pragma once

// C interface for running the exported MNIST compute graph on the GPU via
// Metal (implementation lives in main-mtl.m; used by main-mtl.cpp)

struct ggml_context;
struct ggml_cgraph;

#ifdef __cplusplus
extern "C" {
#endif

// opaque handle to the Metal state created by mnist_mtl_init()
struct ggml_mtl_context;

// initialize the Metal backend for the given imported graph
// - ctx_data: context holding the graph's data tensors (model weights)
// - ctx_eval: context holding the graph's eval (intermediate) tensors
// - ctx_work: context providing scratch memory for graph computation
// - gf:       the imported compute graph to run
// the returned context must be released with mnist_mtl_free()
struct ggml_mtl_context * mnist_mtl_init(
            struct ggml_context * ctx_data,
            struct ggml_context * ctx_eval,
            struct ggml_context * ctx_work,
            struct ggml_cgraph * gf);

// release all resources owned by the Metal context
void mnist_mtl_free(struct ggml_mtl_context * ctx);

// run one inference pass of the graph on the GPU;
// returns the 0 - 9 digit prediction (as used by main-mtl.cpp)
int mnist_mtl_eval(
            struct ggml_mtl_context * ctx,
            struct ggml_cgraph * gf);

#ifdef __cplusplus
}
#endif
Loading

0 comments on commit 3b697a2

Please sign in to comment.