ggml : sync llama.cpp (gguf + metal + ROCm + etc.) (ggerganov#489)

* ggml : sync llama.cpp (gguf + metal + ROCm + etc.)

ggml-ci

* cuda : sync rope updates

ggml-ci
balisujohn · Aug 28, 2023 · ea9193f
1 parent 9d05810
commit ea9193f
Show file tree

Hide file tree

Showing 9 changed files with 804 additions and 281 deletions.
diff --git a/include/ggml/ggml-alloc.h b/include/ggml/ggml-alloc.h
@@ -12,7 +12,7 @@ GGML_API struct ggml_allocr * ggml_allocr_new_measure(size_t alignment);
 
 // tell the allocator to parse nodes following the order described in the list
 // you should call this if your graph are optimized to execute out-of-order
-GGML_API void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, int * list, int n);
+GGML_API void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, const int * list, int n);
 
 GGML_API void ggml_allocr_free(struct ggml_allocr * alloc);
 GGML_API bool ggml_allocr_is_measure(struct ggml_allocr * alloc);

diff --git a/include/ggml/ggml.h b/include/ggml/ggml.h
@@ -224,7 +224,7 @@
 #define GGML_EXIT_ABORTED 1
 
 #define GGUF_MAGIC 0x46554747 // "GGUF"
-#define GGUF_VERSION 1
+#define GGUF_VERSION 2
 
 #define GGUF_DEFAULT_ALIGNMENT 32
 
@@ -1835,6 +1835,9 @@ extern "C" {
  GGUF_TYPE_BOOL = 7,
  GGUF_TYPE_STRING = 8,
  GGUF_TYPE_ARRAY = 9,
+ GGUF_TYPE_UINT64 = 10,
+ GGUF_TYPE_INT64 = 11,
+ GGUF_TYPE_FLOAT64 = 12,
  GGUF_TYPE_COUNT, // marks the end of the enum
  };
 
@@ -1875,6 +1878,9 @@ extern "C" {
  GGML_API uint32_t gguf_get_val_u32 (struct gguf_context * ctx, int i);
  GGML_API int32_t gguf_get_val_i32 (struct gguf_context * ctx, int i);
  GGML_API float gguf_get_val_f32 (struct gguf_context * ctx, int i);
+ GGML_API uint64_t gguf_get_val_u64 (struct gguf_context * ctx, int i);
+ GGML_API int64_t gguf_get_val_i64 (struct gguf_context * ctx, int i);
+ GGML_API double gguf_get_val_f64 (struct gguf_context * ctx, int i);
  GGML_API bool gguf_get_val_bool(struct gguf_context * ctx, int i);
  GGML_API const char * gguf_get_val_str (struct gguf_context * ctx, int i);
  GGML_API int gguf_get_arr_n (struct gguf_context * ctx, int i);
@@ -1894,6 +1900,9 @@ extern "C" {
  GGML_API void gguf_set_val_u32 (struct gguf_context * ctx, const char * key, uint32_t val);
  GGML_API void gguf_set_val_i32 (struct gguf_context * ctx, const char * key, int32_t val);
  GGML_API void gguf_set_val_f32 (struct gguf_context * ctx, const char * key, float val);
+ GGML_API void gguf_set_val_u64 (struct gguf_context * ctx, const char * key, uint64_t val);
+ GGML_API void gguf_set_val_i64 (struct gguf_context * ctx, const char * key, int64_t val);
+ GGML_API void gguf_set_val_f64 (struct gguf_context * ctx, const char * key, double val);
  GGML_API void gguf_set_val_bool(struct gguf_context * ctx, const char * key, bool val);
  GGML_API void gguf_set_val_str (struct gguf_context * ctx, const char * key, const char * val);
  GGML_API void gguf_set_arr_data(struct gguf_context * ctx, const char * key, enum gguf_type type, const void * data, int n);
@@ -1952,6 +1961,7 @@ extern "C" {
  GGML_API int ggml_cpu_has_clblast (void);
  GGML_API int ggml_cpu_has_gpublas (void);
  GGML_API int ggml_cpu_has_sse3 (void);
+ GGML_API int ggml_cpu_has_ssse3 (void);
  GGML_API int ggml_cpu_has_vsx (void);
 
  //

diff --git a/src/ggml-alloc.c b/src/ggml-alloc.c
@@ -8,6 +8,7 @@
 
 #define UNUSED(x) (void)(x)
 #define MAX(a, b) ((a) > (b) ? (a) : (b))
+#define GGML_MAX_CONCUR (2*GGML_MAX_NODES)
 
 //#define GGML_ALLOCATOR_DEBUG
 
@@ -67,8 +68,8 @@ struct ggml_allocr {
  struct hash_node hash_table[GGML_GRAPH_HASHTABLE_SIZE];
  size_t max_size;
  bool measure;
- int parse_seq[GGML_MAX_NODES];
- bool has_parse_seq;
+ int parse_seq[GGML_MAX_CONCUR];
+ int parse_seq_len;
 
 #ifdef GGML_ALLOCATOR_DEBUG
  struct ggml_tensor * allocated_tensors[1024];
@@ -238,15 +239,11 @@ static void ggml_allocator_free_tensor(struct ggml_allocr * alloc, struct ggml_t
  alloc->n_free_blocks++;
 }
 
-void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, int * list, int n) {
- int pos = 0;
+void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, const int * list, int n) {
  for (int i = 0; i < n; i++) {
- if (list[i] != -1) {
- alloc->parse_seq[pos] = list[i];
- pos++;
- }
+ alloc->parse_seq[i] = list[i];
  }
- alloc->has_parse_seq = true;
+ alloc->parse_seq_len = n;
 }
 
 void ggml_allocr_reset(struct ggml_allocr * alloc) {
@@ -269,7 +266,7 @@ struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment)
  /*.max_size = */ 0,
  /*.measure = */ false,
  /*.parse_seq = */ {0},
- /*.has_parse_seq = */ false,
+ /*.parse_seq_len = */ 0,
 #ifdef GGML_ALLOCATOR_DEBUG
  /*.allocated_tensors = */ {0},
 #endif
@@ -298,7 +295,7 @@ struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) {
  /*.max_size = */ 0,
  /*.measure = */ true,
  /*.parse_seq = */ {0},
- /*.has_parse_seq = */ false,
+ /*.parse_seq_len = */ 0,
 #ifdef GGML_ALLOCATOR_DEBUG
  /*.allocated_tensors = */ {0},
 #endif
@@ -445,8 +442,8 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
  else {
  AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name);
  node->data = parent->data;
+ return;
  }
- return;
  }
  }
  }
@@ -497,69 +494,86 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
  allocate_node(alloc, input);
  }
  }
- for (int ind = 0; ind < gf->n_nodes; ind++) {
- int i;
- if (alloc->has_parse_seq) {
- i = alloc->parse_seq[ind];
- } else {
- i = ind;
- }
- struct ggml_tensor * node = gf->nodes[i];
-
- // allocate parents (leafs)
- for (int j = 0; j < GGML_MAX_SRC; j++) {
- struct ggml_tensor * parent = node->src[j];
- if (parent == NULL) {
- break;
+ // if we have parse_seq then we allocate nodes following the list, and we only free nodes at barriers
+ int last_barrier_pos = 0;
+ int n_nodes = alloc->parse_seq_len ? alloc->parse_seq_len : gf->n_nodes;
+
+ for (int ind = 0; ind < n_nodes; ind++) {
+ // allocate a node if there is no parse_seq or this is not a barrier
+ if ((alloc->parse_seq_len==0) || alloc->parse_seq[ind] != -1) {
+ int i = alloc->parse_seq_len ? alloc->parse_seq[ind] : ind;
+ struct ggml_tensor * node = gf->nodes[i];
+
+ // allocate parents (leafs)
+ for (int j = 0; j < GGML_MAX_SRC; j++) {
+ struct ggml_tensor * parent = node->src[j];
+ if (parent == NULL) {
+ break;
+ }
+ allocate_node(alloc, parent);
  }
- allocate_node(alloc, parent);
- }
 
- // allocate node
- allocate_node(alloc, node);
+  // allocate node
+  allocate_node(alloc, node);
 
- AT_PRINTF("exec: %s (%s) <= ", ggml_op_name(node->op), node->name);
- for (int j = 0; j < GGML_MAX_SRC; j++) {
- struct ggml_tensor * parent = node->src[j];
- if (parent == NULL) {
- break;
- }
- AT_PRINTF("%s", parent->name);
- if (j < GGML_MAX_SRC - 1 && node->src[j + 1] != NULL) {
- AT_PRINTF(", ");
+ AT_PRINTF("exec: %s (%s) <= ", ggml_op_name(node->op), node->name);
+ for (int j = 0; j < GGML_MAX_SRC; j++) {
+ struct ggml_tensor * parent = node->src[j];
+ if (parent == NULL) {
+ break;
+ }
+ AT_PRINTF("%s", parent->name);
+ if (j < GGML_MAX_SRC - 1 && node->src[j + 1] != NULL) {
+ AT_PRINTF(", ");
+ }
  }
+ AT_PRINTF("\n");
  }
- AT_PRINTF("\n");
+
 
  // update parents
- for (int j = 0; j < GGML_MAX_SRC; j++) {
- struct ggml_tensor * parent = node->src[j];
- if (parent == NULL) {
- break;
- }
- struct hash_node * p_hn = hash_get(ht, parent);
- p_hn->n_children -= 1;
-
- //AT_PRINTF("parent %s: %d children, %d views\n", parent->name, parent->n_children, parent->n_views);
-
- if (p_hn->n_children == 0 && p_hn->n_views == 0) {
- if (ggml_is_view(parent)) {
- struct ggml_tensor * view_src = get_view_source(parent);
- struct hash_node * view_src_hn = hash_get(ht, view_src);
- view_src_hn->n_views -= 1;
- AT_PRINTF("view_src %s: %d children, %d views\n", view_src->name, view_src_hn->n_children, view_src_hn->n_views);
- if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src->data != node->data) {
- ggml_allocator_free_tensor(alloc, view_src);
+ // update immediately if there is no parse_seq
+ // update only at barriers if there is parse_seq
+ if ((alloc->parse_seq_len==0) || alloc->parse_seq[ind] == -1) {
+ int update_start = alloc->parse_seq_len ? last_barrier_pos : ind;
+ int update_end = alloc->parse_seq_len ? ind : ind + 1;
+ for (int i = update_start; i < update_end; i++) {
+ int node_i = alloc->parse_seq_len ? alloc->parse_seq[i] : i;
+ struct ggml_tensor * node = gf->nodes[node_i];
+
+ for (int j = 0; j < GGML_MAX_SRC; j++) {
+ struct ggml_tensor * parent = node->src[j];
+ if (parent == NULL) {
+ break;
  }
- }
- else {
- if (parent->data != node->data) {
- ggml_allocator_free_tensor(alloc, parent);
+ struct hash_node * p_hn = hash_get(ht, parent);
+ p_hn->n_children -= 1;
+
+ //AT_PRINTF("parent %s: %d children, %d views\n", parent->name, parent->n_children, parent->n_views);
+
+ if (p_hn->n_children == 0 && p_hn->n_views == 0) {
+ if (ggml_is_view(parent)) {
+ struct ggml_tensor * view_src = get_view_source(parent);
+ struct hash_node * view_src_hn = hash_get(ht, view_src);
+ view_src_hn->n_views -= 1;
+ AT_PRINTF("view_src %s: %d children, %d views\n", view_src->name, view_src_hn->n_children, view_src_hn->n_views);
+ if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src->data != node->data) {
+ ggml_allocator_free_tensor(alloc, view_src);
+ }
+ }
+ else {
+ if (parent->data != node->data) {
+ ggml_allocator_free_tensor(alloc, parent);
+ }
+ }
  }
  }
  }
+ AT_PRINTF("\n");
+ if (alloc->parse_seq_len) {
+ last_barrier_pos = ind + 1;
+ }
  }
- AT_PRINTF("\n");
  }
  // free graph outputs here that wouldn't be freed otherwise because they have no children
  if (outputs != NULL && outputs[g] != NULL) {