feat: cuda implementation for ggml_conv_transpose_1d #854

Open
wants to merge 11 commits into master
working with all old and new conv1d tests
balisujohn committed Jun 13, 2024
commit 53a4fcfcee3e1bd8819ba6726ceca9a4633e217e
12 changes: 7 additions & 5 deletions src/ggml-cuda/conv-transpose-1d.cu
@@ -15,15 +15,17 @@ static __global__ void conv_transpose_1d_kernel(

int out_index = global_index / dst_ne0;

dst[global_index] = 0;

for (int c = 0; c < src0_ne2; c++)
{

int idx = global_index % dst_ne0;

-int kernel_offset = (src0_ne0 * src0_ne1 * out_index) + (c * src0_ne0);
+int kernel_offset = (src0_ne0 * src0_ne1 * c) + (out_index * src0_ne0);
int input_offset = src1_ne0 * c;

-if(global_index == 3 && s0 == 2)
+if(global_index == 0 && output_size == 12)
{
printf("idx: %d ???: %d\n", global_index,src0_ne2);

@@ -43,7 +45,7 @@ static __global__ void conv_transpose_1d_kernel(

int initial_weight_idx = idx > src0_ne0 -1 ? src0_ne0-1 : idx;

-if(global_index == 3 && s0 == 2)
+if(global_index == 0 && output_size == 12)
{
printf("idx: %d initial_weight_idx: %d\n", global_index,initial_weight_idx);
printf("idx: %d upper bound: %d\n", global_index, upper_bound);
@@ -59,7 +61,7 @@ static __global__ void conv_transpose_1d_kernel(
int weight_idx = idx - i*s0;


-if(global_index == 3 && s0 == 2)
+if(global_index == 0 && output_size == 12)
{
//printf("idx: %d partial sum: %d x %d \n", global_index,src0[kernel_offset + (initial_weight_idx-(i-lower_bound))] , src1[input_offset+i]);
//printf("idx: %d kernel_index: %d\n", global_index, kernel_offset + (initial_weight_idx-(i-lower_bound)));
@@ -70,7 +72,7 @@ static __global__ void conv_transpose_1d_kernel(
}
int test1 = src0[kernel_offset + weight_idx];
int test2 = src1[input_offset+i];
-if(global_index == 3 && s0 == 2)
+if(global_index == 0 && output_size == 12)
{
//printf("idx: %d partial sum: %d x %d \n", global_index,src0[kernel_offset + (initial_weight_idx-(i-lower_bound))] , src1[input_offset+i]);
//printf("idx: %d kernel_index: %d\n", global_index, kernel_offset + (initial_weight_idx-(i-lower_bound)));
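A note on the indexing fix above: out_index = global_index / dst_ne0 is the output channel, and c iterates over src0_ne2, the input channels, so the corrected kernel_offset = (src0_ne0 * src0_ne1 * c) + (out_index * src0_ne0) addresses a kernel stored as [K, Cout, Cin]. A minimal CPU sketch of the gather each thread performs, assuming that layout and p0 = 0, d0 = 1 as in the tests below (my own reference, not code from this PR; the name conv_transpose_1d_ref is made up):

static void conv_transpose_1d_ref(
        const float * kernel,  // [K, Cout, Cin]: (k, cout, cin) at cin*Cout*K + cout*K + k
        const float * input,   // [L, Cin]:       (l, cin)       at cin*L + l
        float * out,           // [Lout, Cout]:   (o, cout)      at cout*Lout + o
        int K, int Cout, int Cin, int L, int s0) {
    const int Lout = (L - 1)*s0 + K;               // output length for p0 = 0, d0 = 1
    for (int cout = 0; cout < Cout; ++cout) {
        for (int o = 0; o < Lout; ++o) {
            float sum = 0.0f;                      // the CUDA kernel zeroes dst[global_index] first
            for (int cin = 0; cin < Cin; ++cin) {
                for (int i = 0; i < L; ++i) {
                    const int k = o - i*s0;        // mirrors weight_idx = idx - i*s0
                    if (k < 0 || k >= K) continue; // mirrors the lower/upper bound checks
                    sum += kernel[cin*Cout*K + cout*K + k] * input[cin*L + i];
                }
            }
            out[cout*Lout + o] = sum;
        }
    }
}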
173 changes: 166 additions & 7 deletions tests/test-conv-transpose.cpp
@@ -37,6 +37,8 @@ struct test_model {
struct ggml_tensor * a_2;
struct ggml_tensor * b_2;

struct ggml_tensor * a_3;
struct ggml_tensor * b_3;


ggml_backend_t backend = NULL;
@@ -56,6 +58,11 @@ void load_model(test_model & model, bool use_gpu = false) {
float adata_2[] = {3,2,1,1,2,3,1,2,3,3,2,1};
float bdata_2[] = {2,3,1,1,3,2};

float data[1024];
for (int i = 0; i < 1024; ++i) {
data[i] = (float)i;
}




@@ -64,11 +71,16 @@ void load_model(test_model & model, bool use_gpu = false) {
buffer_size += 3* ggml_type_size(GGML_TYPE_F32); // tensor a_0
buffer_size += 2* ggml_type_size(GGML_TYPE_F32); // tensor b_0

-buffer_size += 6* ggml_type_size(GGML_TYPE_F32); // tensor a_0
-buffer_size += 6* ggml_type_size(GGML_TYPE_F32); // tensor b_0
+buffer_size += 6* ggml_type_size(GGML_TYPE_F32); // tensor a_1
+buffer_size += 6* ggml_type_size(GGML_TYPE_F32); // tensor b_1

+buffer_size += 12* ggml_type_size(GGML_TYPE_F32); // tensor a_2
+buffer_size += 6* ggml_type_size(GGML_TYPE_F32); // tensor b_2
+
+buffer_size += 2 * 3 * 2 * ggml_type_size(GGML_TYPE_F32); // tensor a_3
+buffer_size += 3 * 2 * ggml_type_size(GGML_TYPE_F32); // tensor b_3


-buffer_size += 12* ggml_type_size(GGML_TYPE_F32); // tensor a_0
-buffer_size += 6* ggml_type_size(GGML_TYPE_F32); // tensor b_0


buffer_size += 1024;
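For the new pair, a_3 holds 2*3*2 = 12 floats and b_3 holds 3*2 = 6, so the two added lines reserve 18 * ggml_type_size(GGML_TYPE_F32) = 72 additional bytes, on top of the 1024 bytes of slack reserved here.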
@@ -77,7 +89,7 @@ void load_model(test_model & model, bool use_gpu = false) {
printf("%s: ggml tensor size = %d bytes\n", __func__, (int) sizeof(ggml_tensor));
printf("%s: backend buffer size = %0.2f MB\n", __func__, (buffer_size/ 1024.f/ 1024.f));

-int num_tensors = 6;
+int num_tensors = 8;
struct ggml_init_params params {
/*.mem_size =*/ ggml_tensor_overhead() * num_tensors,
/*.mem_buffer =*/ NULL,
@@ -127,6 +139,9 @@ void load_model(test_model & model, bool use_gpu = false) {
model.a_2 = ggml_new_tensor_3d(model.ctx, GGML_TYPE_F32, 3,2,2);
model.b_2 = ggml_new_tensor_2d(model.ctx, GGML_TYPE_F32, 3,2);

model.a_3 = ggml_new_tensor_3d(model.ctx, GGML_TYPE_F32, 2,3,2);
model.b_3 = ggml_new_tensor_2d(model.ctx, GGML_TYPE_F32, 3,2);



// create an allocator
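Shape note: ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 2, 3, 2) gives a_3 ne = {2, 3, 2}, which ggml_conv_transpose_1d reads as kernel width K = 2, output channels Cout = 3, input channels Cin = 2; b_3's ne = {3, 2} is input length L = 3 with Cin = 2, and the op requires the kernel's ne[2] to match the input's ne[1] (both 2 here).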
@@ -163,6 +178,16 @@ void load_model(test_model & model, bool use_gpu = false) {
ggml_backend_tensor_set(model.a_2, adata_2, 0, ggml_nbytes(model.a_2));
}

// alloc memory
ggml_tallocr_alloc(&alloc, model.a_3);

// load data to buffer
if(ggml_backend_is_cpu(model.backend)) {
memcpy(model.a_3->data, data, ggml_nbytes(model.a_3));
} else {
ggml_backend_tensor_set(model.a_3, data, 0, ggml_nbytes(model.a_3));
}
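This branch is the usual ggml pattern: on the CPU backend the tensor's data pointer is plain host memory, so a direct memcpy works, while ggml_backend_tensor_set copies the host array into the backend buffer (device memory for the CUDA backend).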



// alloc memory
@@ -204,6 +229,19 @@ void load_model(test_model & model, bool use_gpu = false) {
ggml_backend_tensor_set(model.b_2, bdata_2, 0, ggml_nbytes(model.b_2));
}

// alloc memory
ggml_tallocr_alloc(&alloc, model.b_3);

if(ggml_backend_is_cpu(model.backend)
#ifdef GGML_USE_METAL
|| ggml_backend_is_metal(model.backend)
#endif
) {
memcpy(model.b_3->data, data, ggml_nbytes(model.b_3));
} else {
ggml_backend_tensor_set(model.b_3, data, 0, ggml_nbytes(model.b_3));
}

}

struct ggml_cgraph * build_graph(const test_model& model) {
@@ -254,6 +292,30 @@ struct ggml_cgraph * build_graph(const test_model& model) {
ggml_set_name(conv1d_transpose_res_3, "conv1d_transpose_res_3");
ggml_build_forward_expand(gf, conv1d_transpose_res_3);

s0 = 1;
p0 = 0;
d0 = 1;

struct ggml_tensor* conv1d_transpose_res_4 = ggml_conv_transpose_1d(ctx0, model.a_3, model.b_3, s0, p0, d0);
ggml_set_name(conv1d_transpose_res_4, "conv1d_transpose_res_4");
ggml_build_forward_expand(gf, conv1d_transpose_res_4);

s0 = 2;
p0 = 0;
d0 = 1;

struct ggml_tensor* conv1d_transpose_res_5 = ggml_conv_transpose_1d(ctx0, model.a_3, model.b_3, s0, p0, d0);
ggml_set_name(conv1d_transpose_res_5, "conv1d_transpose_res_5");
ggml_build_forward_expand(gf, conv1d_transpose_res_5);

s0 = 3;
p0 = 0;
d0 = 1;

struct ggml_tensor* conv1d_transpose_res_6 = ggml_conv_transpose_1d(ctx0, model.a_3, model.b_3, s0, p0, d0);
ggml_set_name(conv1d_transpose_res_6, "conv1d_transpose_res_6");
ggml_build_forward_expand(gf, conv1d_transpose_res_6);



// delete the temporary context used to build the graph
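For the three new cases: with p0 = 0 and d0 = 1 the output length of ggml_conv_transpose_1d is Lout = (L - 1)*s0 + K, so L = 3 and K = 2 give Lout = 4, 6 and 8 for s0 = 1, 2 and 3. Times Cout = 3 channels, that is 12, 18 and 24 output elements, matching n_conv_transpose_1d_test_4/5/6 in the checks below.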
@@ -388,10 +450,68 @@ int main(void)
,5.0f, 6.0f, 19.0f, 12.0f, 19.0f, 6.0f, 5.0f};


struct ggml_tensor * conv1d_transpose_res_4 = NULL;

for(int i = 0; i < gf_res->n_nodes; i++) {
if(strcmp(ggml_get_name(gf_res->nodes[i]), "conv1d_transpose_res_4") == 0) {
conv1d_transpose_res_4 = gf_res->nodes[i];
}
}

float* conv1d_transpose_data_4 = new float[ggml_nelements(conv1d_transpose_res_4)];

ggml_backend_tensor_get(conv1d_transpose_res_4, conv1d_transpose_data_4, 0, ggml_nbytes(conv1d_transpose_res_4));


const int n_conv_transpose_1d_test_4 = 12;


float expected_conv1d_4[3*4] = {
18.0, 45.0, 59.0, 37.0,
24.0, 61.0, 83.0, 51.0,
30.0, 77.0, 107.0, 65.0
};

struct ggml_tensor * conv1d_transpose_res_5 = NULL;

for(int i = 0; i < gf_res->n_nodes; i++) {
if(strcmp(ggml_get_name(gf_res->nodes[i]), "conv1d_transpose_res_5") == 0) {
conv1d_transpose_res_5 = gf_res->nodes[i];
}
}

float* conv1d_transpose_data_5 = new float[ggml_nelements(conv1d_transpose_res_5)];

ggml_backend_tensor_get(conv1d_transpose_res_5, conv1d_transpose_data_5, 0, ggml_nbytes(conv1d_transpose_res_5));


const int n_conv_transpose_1d_test_5 = 18;

float expected_conv1d_5[3*6] = {
18.0, 21.0, 24.0, 29.0, 30.0, 37.0,
24.0, 27.0, 34.0, 39.0, 44.0, 51.0,
30.0, 33.0, 44.0, 49.0, 58.0, 65.0
};

struct ggml_tensor * conv1d_transpose_res_6 = NULL;

for(int i = 0; i < gf_res->n_nodes; i++) {
if(strcmp(ggml_get_name(gf_res->nodes[i]), "conv1d_transpose_res_6") == 0) {
conv1d_transpose_res_6 = gf_res->nodes[i];
}
}

float* conv1d_transpose_data_6 = new float[ggml_nelements(conv1d_transpose_res_6)];

ggml_backend_tensor_get(conv1d_transpose_res_6, conv1d_transpose_data_6, 0, ggml_nbytes(conv1d_transpose_res_6));


const int n_conv_transpose_1d_test_6 = 24;

float expected_conv1d_6[3*8] = {
18.0, 21.0, 0.0, 24.0, 29.0, 0.0, 30.0, 37.0,
24.0, 27.0, 0.0, 34.0, 39.0, 0.0, 44.0, 51.0,
30.0, 33.0, 0.0, 44.0, 49.0, 0.0, 58.0, 65.0};
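As a cross-check on these constants (derivation mine, using conv_transpose_1d_ref from the note after the CUDA diff): a_3 is filled with 0..11 and b_3 with 0..5, so the first element of test 4 is kernel(k=0, cout=0, cin=0)*input(l=0, cin=0) + kernel(0, 0, 1)*input(0, 1) = 0*0 + 6*3 = 18. For s0 = 3, positions such as o = 2 admit no (i, k) with k = o - 3*i in [0, K), which is why zeros appear in expected_conv1d_6. A hypothetical reproduction of test 4:

float a[12], b[6], ref[12];                     // a_3 = 0..11, b_3 = 0..5
for (int i = 0; i < 12; ++i) a[i] = (float) i;
for (int i = 0; i <  6; ++i) b[i] = (float) i;
conv_transpose_1d_ref(a, b, ref, /*K=*/2, /*Cout=*/3, /*Cin=*/2, /*L=*/3, /*s0=*/1);
// ref == {18, 45, 59, 37,  24, 61, 83, 51,  30, 77, 107, 65}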



printf("\nPerforming test:\n");

Expand Down Expand Up @@ -449,6 +569,45 @@ int main(void)
printf("ggml_conv_1d_transpose (%d): %s\n", (int) ggml_nelements(conv1d_transpose_res_3), passed && (ggml_nelements(conv1d_transpose_res_3) == n_conv_transpose_1d_test_3) ? "\033[32mPASSED\033[0m" : "\033[31mFAILED\033[0m");


for(int i = 0; i < n_conv_transpose_1d_test_4; i++) {
if(
conv1d_transpose_data_4[i] != expected_conv1d_4[i]) {
std::cout << "index: " << i << std::endl;
std::cout << "expected: " << expected_conv1d_4[i] << std::endl;
std::cout << "actual: " << conv1d_transpose_data_4[i] << std::endl;
passed = false;
}
}

printf("ggml_conv_1d_transpose (%d): %s\n", (int) ggml_nelements(conv1d_transpose_res_4), passed && (ggml_nelements(conv1d_transpose_res_4) == n_conv_transpose_1d_test_4) ? "\033[32mPASSED\033[0m" : "\033[31mFAILED\033[0m");

for(int i = 0; i < n_conv_transpose_1d_test_5; i++) {
if(
conv1d_transpose_data_5[i] != expected_conv1d_5[i]) {
std::cout << "index: " << i << std::endl;
std::cout << "expected: " << expected_conv1d_5[i] << std::endl;
std::cout << "actual: " << conv1d_transpose_data_5[i] << std::endl;
passed = false;
}
}

printf("ggml_conv_1d_transpose (%d): %s\n", (int) ggml_nelements(conv1d_transpose_res_5), passed && (ggml_nelements(conv1d_transpose_res_5) == n_conv_transpose_1d_test_5) ? "\033[32mPASSED\033[0m" : "\033[31mFAILED\033[0m");


for(int i = 0; i < n_conv_transpose_1d_test_6; i++) {
if(
conv1d_transpose_data_6[i] != expected_conv1d_6[i]) {
std::cout << "index: " << i << std::endl;
std::cout << "expected: " << expected_conv1d_6[i] << std::endl;
std::cout << "actual: " << conv1d_transpose_data_6[i] << std::endl;
passed = false;
}
}


printf("ggml_conv_1d_transpose (%d): %s\n", (int) ggml_nelements(conv1d_transpose_res_6), passed && (ggml_nelements(conv1d_transpose_res_6) == n_conv_transpose_1d_test_6) ? "\033[32mPASSED\033[0m" : "\033[31mFAILED\033[0m");


ggml_free(model.ctx);

ggml_backend_buffer_free(model.buffer);