Improve ADD_REL_POS perf in SAM by doing it inplace

- Add unit tests for the ADD_REL_POS operation
- I am not sure if this is a valid implementation, as we reuse the src0 memory in order to avoid copying it
- When running SAM with the "Example output" command, image, point and 16 threads, this reduces the cumulative time of the ADD_REL_POS operation from 1000-1100 ms to 180-200 ms
- There is further room for optimization in the access patterns used in the implementation of the operation
ggerganov · ggerganov · Aug 21, 2023 · Aug 20, 2023 · Aug 21, 2023 · Aug 21, 2023
commit 58d9081fa87f54506269534dd4f2d1428d5ef2db
diff --git a/examples/sam/main.cpp b/examples/sam/main.cpp
@@ -1228,7 +1228,7 @@ bool sam_encode_image(
  0, 2, 1, 3));
  struct ggml_tensor * rel_h = ggml_mul_mat(ctx0, rh, q_r);
 
- struct ggml_tensor * attn = ggml_add_rel_pos(ctx0, KQ_scaled, rel_w, rel_h);
+ struct ggml_tensor * attn = ggml_add_rel_pos_inplace(ctx0, KQ_scaled, rel_w, rel_h);
 
  struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, attn);
 

diff --git a/include/ggml/ggml.h b/include/ggml/ggml.h
@@ -1384,7 +1384,8 @@ extern "C" {
  int kh);
 
  // used in sam
- GGML_API struct ggml_tensor * ggml_add_rel_pos(
+
+ GGML_API struct ggml_tensor * ggml_add_rel_pos_inplace(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
  struct ggml_tensor * pw,

diff --git a/src/ggml.c b/src/ggml.c
@@ -3930,7 +3930,6 @@ static void ggml_setup_op_has_task_pass(void) {
  p[GGML_OP_CONV_TRANSPOSE_2D ] = true;
  p[GGML_OP_FLASH_ATTN_BACK ] = true;
  p[GGML_OP_CROSS_ENTROPY_LOSS ] = true;
- p[GGML_OP_ADD_REL_POS ] = true;
  }
 
  { // FINALIZE
@@ -7329,7 +7328,7 @@ struct ggml_tensor * ggml_get_rel_pos(
 
 // ggml_add_rel_pos
 
-struct ggml_tensor * ggml_add_rel_pos(
+struct ggml_tensor * ggml_add_rel_pos_inplace(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
  struct ggml_tensor * pw,
@@ -7347,18 +7346,9 @@ struct ggml_tensor * ggml_add_rel_pos(
  GGML_ASSERT(pw->type == GGML_TYPE_F32);
  GGML_ASSERT(ph->type == GGML_TYPE_F32);
 
- bool is_node = false;
-
- if (a->grad) {
- GGML_ASSERT(false); // TODO: implement backward
- is_node = true;
- }
-
- const int64_t ne[4] = { a->ne[0], a->ne[1], a->ne[2], 1, };
- struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 3, ne);
-
+ struct ggml_tensor * result = ggml_view_tensor(ctx, a);
  result->op = GGML_OP_ADD_REL_POS;
- result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+ result->grad = NULL;
  result->src[0] = a;
  result->src[1] = pw;
  result->src[2] = ph;
@@ -14586,7 +14576,6 @@ static void ggml_compute_forward_get_rel_pos_f16(
  for (int64_t i2 = 0; i2 < ne2; ++i2) {
  for (int64_t i1 = 0; i1 < ne1; ++i1) {
  const int64_t pos = (w - i1 - 1) + i2;
-
  for (int64_t i0 = 0; i0 < ne0; ++i0) {
  dst_data[i2*ne1*ne0 + i1*ne0 + i0] = src0_data[pos*ne00 + i0];
  }
@@ -14618,26 +14607,20 @@ static void ggml_compute_forward_add_rel_pos_f32(
  const struct ggml_tensor * src1,
  const struct ggml_tensor * src2,
  struct ggml_tensor * dst) {
- if (params->type == GGML_TASK_FINALIZE) {
+ UNUSED(src0);
+ if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
  return;
  }
 
- // ref: https://github.com/facebookresearch/segment-anything/blob/main/segment_anything/modeling/image_encoder.py#L357-L359
+ int64_t t0 = ggml_perf_time_us();
+ UNUSED(t0);
 
- const int64_t ne0 = dst->ne[0];
- const int64_t ne1 = dst->ne[1];
- const int64_t ne2 = dst->ne[2];
+ // ref: https://github.com/facebookresearch/segment-anything/blob/main/segment_anything/modeling/image_encoder.py#L357-L359
 
- float * src0_data = (float *) src0->data;
  float * src1_data = (float *) src1->data;
  float * src2_data = (float *) src2->data;
  float * dst_data = (float *) dst->data;
 
- if (params->type == GGML_TASK_INIT) {
- memcpy(dst_data, src0_data, ne0*ne1*ne2*sizeof(float));
- return;
- }
-
  const int64_t ne10 = src1->ne[0];
  const int64_t ne11 = src1->ne[1];
  const int64_t ne12 = src1->ne[2];
@@ -14656,23 +14639,22 @@ static void ggml_compute_forward_add_rel_pos_f32(
  const int ip0 = dp*ith;
  const int ip1 = MIN(ip0 + dp, np);
 
+
  for (int64_t i13 = ip0; i13 < ip1; ++i13) {
  for (int64_t i12 = 0; i12 < ne12; ++i12) {
  for (int64_t i11 = 0; i11 < ne11; ++i11) {
+ const int64_t jp1 = i13*ne12*ne11*ne10 + i12*ne11*ne10 + i11*ne10;
  for (int64_t i10 = 0; i10 < ne10; ++i10) {
- // add rel pos W (src1) to src0
- const int64_t i2 = i11;
- const int64_t i3 = i12;
- const int64_t i4 = i13;
-
- const int64_t jp = i13*ne12*ne11*ne10 + i12*ne11*ne10 + i11*ne10 + i10;
+ const int64_t jp0 = jp1 + i10;
+ const float src1_e = src1_data[jp0];
+ const float src2_e = src2_data[jp0];
 
- const int64_t jdw = i4*ne1*ne0 + i3*ne11*ne0 + i2*ne0 + i10;
- const int64_t jdh = i4*ne1*ne0 + i3*ne11*ne0 + i2*ne0 + i10*ne10;
+ const int64_t jdh = jp0 * ne10;
+ const int64_t jdw = jdh - (ne10 - 1) * i10;
 
  for (int64_t j = 0; j < ne10; ++j) {
- dst_data[jdw + j*ne10] += src1_data[jp];
- dst_data[jdh + j ] += src2_data[jp];
+ dst_data[jdh + j ] += src2_e;
+ dst_data[jdw + j*ne10] += src1_e;
  }
  }
  }

diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
@@ -291,6 +291,14 @@ add_executable(${TEST_TARGET} ${TEST_TARGET}.c)
 target_link_libraries(${TEST_TARGET} PRIVATE ggml)
 add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}>)
 
+#
+# test-rel-pos
+
+set(TEST_TARGET test-rel-pos)
+add_executable(${TEST_TARGET} ${TEST_TARGET}.c)
+target_link_libraries(${TEST_TARGET} PRIVATE ggml)
+add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}>)
+
 #
 # test-svd0 (arm/x86)
 

diff --git a/tests/test-rel-pos.c b/tests/test-rel-pos.c
@@ -0,0 +1,80 @@
+#include "ggml/ggml.h"
+
+#include <string.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+struct ggml_context* make_ctx(void) { // creates the ggml context used by the test block in main
+ struct ggml_init_params params = {
+ .mem_size = 2 * 1024 * 1024, // 2097152 (presumably bytes, i.e. 2 MiB); fields not named here are zero-initialized
+ };
+
+ return ggml_init(params); // returned as-is; this helper does not check the result
+}
+
+void check_tensor(struct ggml_tensor * t, float * expected_t_d, int ne0, int ne1, int ne2) { // asserts that t is F32, has dims ne0 x ne1 x ne2, and equals expected_t_d element for element
+ GGML_ASSERT(t->type == GGML_TYPE_F32);
+ GGML_ASSERT(t->ne[0] == ne0); // only the first three dimensions are validated
+ GGML_ASSERT(t->ne[1] == ne1);
+ GGML_ASSERT(t->ne[2] == ne2);
+ for (int i2 = 0; i2 < ne2; ++i2) {
+ for (int i1 = 0; i1 < ne1; ++i1) {
+ for (int i0 = 0; i0 < ne0; ++i0) {
+ float expected = *(expected_t_d + i2 * ne1 * ne0 + i1 * ne0 + i0); // expected_t_d is read as a dense array with i0 varying fastest
+ float actual = ggml_get_data_f32(t)[i2 * ne1 * ne0 + i1 * ne0 + i0]; // same flat index applied to the tensor's float data
+ GGML_ASSERT(expected == actual); // exact float comparison, no tolerance
+ }
+ }
+ }
+}
+
+int main(int argc, const char** argv) {
+    // Runs ggml_add_rel_pos_inplace on a small all-ones tensor and checks the exact result.
+    ggml_fp16_t buf_f16[1024]; // f16 ramp 0..1023; both inputs below are slices of it
+    for (int i = 0; i < 1024; ++i) {
+        buf_f16[i] = ggml_fp32_to_fp16((float)i);
+    }
+
+    float expected_out[4][9] = { // same 4 x 9 layout as the `out` tensor created below
+        { 8.0, 9.0, 10.0, 9.0, 10.0, 11.0, 10.0, 11.0, 12.0 },
+        { 2.0, 3.0, 4.0, 3.0, 4.0, 5.0, 4.0, 5.0, 6.0 },
+        { 14.0, 15.0, 16.0, 15.0, 16.0, 17.0, 16.0, 17.0, 18.0 },
+        { 8.0, 9.0, 10.0, 9.0, 10.0, 11.0, 10.0, 11.0, 12.0 },
+    };
+
+    {
+        struct ggml_context * ctx = make_ctx();
+        GGML_ASSERT(ctx != NULL); // fail loudly here instead of dereferencing a failed init
+
+        struct ggml_tensor * t = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, 3, 3); // filled from the start of the ramp
+        ggml_fp16_t* t_d = (ggml_fp16_t*)t->data;
+        memcpy(t_d, buf_f16, ggml_nbytes(t));
+
+        struct ggml_tensor * t_2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, 3, 3); // filled from the ramp offset by one element
+        ggml_fp16_t* t_d_2 = (ggml_fp16_t*)t_2->data;
+        memcpy(t_d_2, buf_f16 + 1, ggml_nbytes(t_2));
+
+        struct ggml_tensor * rw = ggml_get_rel_pos(ctx, t, 2, 2);
+        struct ggml_tensor * rh = ggml_get_rel_pos(ctx, t_2, 2, 2);
+
+        // ggml_add_rel_pos_inplace asserts that pw and ph are F32, so copy the rel-pos tensors into F32 ones.
+        struct ggml_tensor * rw_f32 = ggml_cpy(ctx, rw, ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 3, 2, 2));
+        struct ggml_tensor * rh_f32 = ggml_cpy(ctx, rh, ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 3, 2, 2));
+
+        struct ggml_tensor * out = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 9, 4);
+        float* out_d = (float*)out->data;
+        for (int i = 0; i < ggml_nelements(out); ++i) {
+            out_d[i] = 1.f; // the op adds into this buffer, so it starts from a known value
+        }
+
+        out = ggml_add_rel_pos_inplace(ctx, out, rw_f32, rh_f32);
+        struct ggml_cgraph gf = ggml_build_forward(out);
+        ggml_build_forward_expand(&gf, rw_f32);
+        ggml_build_forward_expand(&gf, rh_f32);
+
+        ggml_graph_compute_with_ctx(ctx, &gf, 1);
+        check_tensor(out, (float*)expected_out, 9, 4, 1);
+        ggml_free(ctx); // release the context only after the result has been checked
+    }
+
+    return 0;
+}