conv transpose 1d passing test for 1d input and kernel

ggerganov · balisujohn · Jun 10, 2024 · Jun 11, 2024 · Jun 12, 2024 · Jun 13, 2024
commit 70de8b7bbb098087a79d52ad7a0552e8e4fe598c
diff --git a/src/ggml-cuda.cu b/src/ggml-cuda.cu
@@ -29,6 +29,7 @@
 #include "ggml-cuda/tsembd.cuh"
 #include "ggml-cuda/unary.cuh"
 #include "ggml-cuda/upscale.cuh"
+#include "ggml-cuda/conv-transpose-1d.cuh"
 
 #include <algorithm>
 #include <array>
@@ -2338,6 +2339,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
  case GGML_OP_IM2COL:
  ggml_cuda_op_im2col(ctx, dst);
  break;
+ case GGML_OP_CONV_TRANSPOSE_1D:
+ ggml_cuda_op_conv_transpose_1d(ctx,dst);
+ break;
  case GGML_OP_POOL_2D:
  ggml_cuda_op_pool2d(ctx, dst);
  break;

diff --git a/src/ggml-cuda/conv-transpose-1d.cu b/src/ggml-cuda/conv-transpose-1d.cu
@@ -0,0 +1,65 @@
+#include "conv-transpose-1d.cuh"
+
+static __global__ void conv_transpose_1d_kernel(
+ const int s0, const int p0, const int d0,
+ const int kernel_size, const int input_size, const int output_size,
+ const float * src0, const float * src1, float * dst) {
+ int idx = threadIdx.x + blockIdx.x * blockDim.x;
+ if (idx >= output_size) {
+ return;
+ }
+
+ int upper_bound = idx > input_size-1 ? input_size-1 : idx; //inclusive
+ int lower_bound = idx - kernel_size + 1 >= 0 ? idx - kernel_size + 1 : 0;
+
+ int initial_weight_idx = idx > kernel_size -1 ? kernel_size-1 : idx;
+
+
+ printf("idx: %d initial_weight_idx: %d\n", idx,initial_weight_idx);
+ printf("idx: %d upper bound: %d\n", idx, upper_bound);
+ printf("idx: %d lower bound: %d\n", idx, lower_bound);
+
+
+ for (int i = lower_bound; i <= upper_bound; i++)
+ {
+ dst[idx] += src0[initial_weight_idx-(i-lower_bound)] * src1[i];
+ }
+ //dst[idx] = 7;
+}
+
+static void conv_transpose_1d_f32_f32_cuda(
+ const int s0, const int p0, const int d0,
+ const int kernel_size, const int input_size, const int output_size,
+ const float * src0, const float * src1, float * dst,
+ cudaStream_t stream) {
+
+ const int num_blocks = (output_size + CUDA_CONV_TRANPOSE_1D_BLOCK_SIZE - 1) / CUDA_CONV_TRANPOSE_1D_BLOCK_SIZE;
+ conv_transpose_1d_kernel<<<num_blocks,CUDA_CONV_TRANPOSE_1D_BLOCK_SIZE, 0, stream>>>(s0,p0,d0,kernel_size, input_size, output_size, src0,src1, dst);
+}
+
+void ggml_cuda_op_conv_transpose_1d(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+ const ggml_tensor * src0 = dst->src[0];
+ const float * src0_d = (const float *)src0->data;
+
+ const ggml_tensor * src1 = dst->src[1];
+ const float * src1_d = (const float *)src1->data;
+
+ float * dst_d = (float *)dst->data;
+ cudaStream_t stream = ctx.stream();
+
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
+ GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+ const int32_t * opts = (const int32_t *)dst->op_params;
+
+ const int s0 = 1;//opts[2];
+ const int p0 = 0;//opts[3];
+ const int d0 = 1;//opts[4];
+
+ const int64_t kernel_size = src0->ne[0];
+ const int64_t input_size = src1->ne[0];
+ const int64_t output_size = dst->ne[0];
+
+
+ conv_transpose_1d_f32_f32_cuda( s0,p0,d0,kernel_size, input_size, output_size, src0_d, src1_d, dst_d, stream);
+}
diff --git a/src/ggml-cuda/conv-transpose-1d.cuh b/src/ggml-cuda/conv-transpose-1d.cuh
@@ -0,0 +1,5 @@
+#include "common.cuh"
+
+#define CUDA_CONV_TRANPOSE_1D_BLOCK_SIZE 256
+
+void ggml_cuda_op_conv_transpose_1d(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
@@ -333,7 +333,7 @@ set_property(TEST ${TEST_TARGET} PROPERTY ENVIRONMENT "LLVM_PROFILE_FILE=${TEST_
 # test-conv-transpose
 
 set(TEST_TARGET test-conv-transpose)
-add_executable(${TEST_TARGET} ${TEST_TARGET}.c)
+add_executable(${TEST_TARGET} ${TEST_TARGET}.cpp)
 target_link_libraries(${TEST_TARGET} PRIVATE ggml)
 add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}>)
 

diff --git a/tests/test-conv-transpose.c b/tests/test-conv-transpose.c