feat: adds ggml_pad_reflect_1d #850

Open · wants to merge 2 commits into master

Changes from 1 commit

initial draft of ggml_pad_reflect_1d
balisujohn committed Jun 6, 2024
commit 5dafdf01cced13c06391ef80cf706eec3b8f19cf
9 changes: 9 additions & 0 deletions include/ggml/ggml.h
@@ -474,6 +474,7 @@ extern "C" {
        GGML_OP_CONV_TRANSPOSE_2D,
        GGML_OP_POOL_1D,
        GGML_OP_POOL_2D,
        GGML_OP_PAD_REFLECT_1D,
        GGML_OP_UPSCALE, // nearest interpolate
        GGML_OP_PAD,
        GGML_OP_ARANGE,
@@ -1681,6 +1682,14 @@ extern "C" {
            struct ggml_tensor  * b,
            int                   stride);

    GGML_API struct ggml_tensor * ggml_pad_reflect_1d(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
            int                   p0,
            int                   p1);

    enum ggml_op_pool {
        GGML_OP_POOL_MAX,
        GGML_OP_POOL_AVG,
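
For reference, a minimal usage sketch of the new API (not part of this diff; the graph setup follows the usual ggml pattern, and the tensor size and padding amounts here are illustrative):

#include "ggml/ggml.h"

int main(void) {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16*1024*1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    // one row of 5 samples; the second dimension counts rows
    struct ggml_tensor * x = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 5, 1);
    ggml_set_f32(x, 1.0f);

    // reflect-pad 2 columns on the left and 3 on the right -> ne[0] == 10
    struct ggml_tensor * y = ggml_pad_reflect_1d(ctx, x, 2, 3);

    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, y);
    ggml_graph_compute_with_ctx(ctx, gf, /*n_threads =*/ 1);

    ggml_free(ctx);
    return 0;
}
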
5 changes: 5 additions & 0 deletions src/ggml-cuda.cu
@@ -29,6 +29,7 @@
#include "ggml-cuda/tsembd.cuh"
#include "ggml-cuda/unary.cuh"
#include "ggml-cuda/upscale.cuh"
#include "ggml-cuda/padreflect.cuh"

#include <algorithm>
#include <array>
@@ -2285,6 +2286,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
        case GGML_OP_UPSCALE:
            ggml_cuda_op_upscale(ctx, dst);
            break;
        case GGML_OP_PAD_REFLECT_1D:
            ggml_cuda_op_pad_reflect_1d(ctx, dst);
            break;
        case GGML_OP_PAD:
            ggml_cuda_op_pad(ctx, dst);
            break;
@@ -2894,6 +2898,7 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
        case GGML_OP_ACC:
        case GGML_OP_GROUP_NORM:
        case GGML_OP_UPSCALE:
        case GGML_OP_PAD_REFLECT_1D:
        case GGML_OP_PAD:
        case GGML_OP_ARANGE:
        case GGML_OP_TIMESTEP_EMBEDDING:
65 changes: 65 additions & 0 deletions src/ggml-cuda/padreflect.cu
@@ -0,0 +1,65 @@
#include "padreflect.cuh"

static __global__ void pad_reflect_1d_f32(const float * x, float * dst,
        const int nb00, const int nb01,
        const int ne10, const int ne11,
        const int p0, const int p1,
        const int inp_size, const int dst_size) {
    const int index = threadIdx.x + blockIdx.x * blockDim.x;
    if (index >= ne10 * ne11) {
        return;
    }

    const int row_size  = ne10;  // destination (padded) row length
    const int row_index = index / row_size;
    int column_index    = index % row_size;

    // map the destination column onto the source column it reflects
    if (column_index < p0) {
        // left padding: mirror around the first source element
        column_index = p0 - column_index;
    } else if (column_index < row_size - p1) {
        // middle: plain copy, shifted left by p0
        column_index = column_index - p0;
    } else {
        // right padding: mirror around the last source element
        column_index = (row_size - p1 - p0) - (p1 + 1 - (row_size - column_index)) - 1;
    }

    const int i00 = column_index;
    const int i01 = row_index;

    dst[index] = *(const float *)((const char *)x + i01*nb01 + i00*nb00);
}

static void pad_reflect_1d_f32_cuda(const float * x, float * dst,
        const int nb00, const int nb01,
        const int ne10, const int ne11,
        const int p0, const int p1,
        const int inp_size, const int dst_size,
        cudaStream_t stream) {
    const int num_blocks = (dst_size + CUDA_PAD_REFLECT_BLOCK_SIZE - 1) / CUDA_PAD_REFLECT_BLOCK_SIZE;

    pad_reflect_1d_f32<<<num_blocks, CUDA_PAD_REFLECT_BLOCK_SIZE, 0, stream>>>(
        x, dst, nb00, nb01, ne10, ne11, p0, p1, inp_size, dst_size);
}

void ggml_cuda_op_pad_reflect_1d(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    const ggml_tensor * src0 = dst->src[0];
    const float * src0_d = (const float *) src0->data;
    float * dst_d = (float *) dst->data;
    cudaStream_t stream = ctx.stream();

    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT( dst->type == GGML_TYPE_F32);

    const int inp_size = src0->ne[0] * src0->ne[1];
    const int dst_size =  dst->ne[0] *  dst->ne[1];

    pad_reflect_1d_f32_cuda(src0_d, dst_d,
        src0->nb[0], src0->nb[1],
        dst->ne[0], dst->ne[1],
        dst->op_params[0], dst->op_params[1],
        inp_size, dst_size, stream);
}
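
The branch arithmetic in the kernel is easiest to sanity-check on the host. A standalone sketch of the same destination-column to source-column mapping (an added illustration, not part of the diff):

#include <stdio.h>

// same mapping as the three branches in pad_reflect_1d_f32 above
static int reflect_src_col(int j, int row_size, int p0, int p1) {
    if (j < p0)            return p0 - j;   // left padding: mirror
    if (j < row_size - p1) return j - p0;   // middle: plain copy
    return (row_size - p1 - p0) - (p1 + 1 - (row_size - j)) - 1; // right padding: mirror
}

int main(void) {
    // source row [a b c d e] (width 5), p0 = 2, p1 = 3 -> destination width 10
    for (int j = 0; j < 10; j++) {
        printf("%d ", reflect_src_col(j, /*row_size=*/10, /*p0=*/2, /*p1=*/3));
    }
    printf("\n"); // prints: 2 1 0 1 2 3 4 3 2 1, i.e. [c b a b c d e d c b]
    return 0;
}
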
5 changes: 5 additions & 0 deletions src/ggml-cuda/padreflect.cuh
@@ -0,0 +1,5 @@
#include "common.cuh"

#define CUDA_PAD_REFLECT_BLOCK_SIZE 256

void ggml_cuda_op_pad_reflect_1d(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
86 changes: 84 additions & 2 deletions src/ggml.c
@@ -2696,7 +2696,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
"CROSS_ENTROPY_LOSS_BACK",
};

static_assert(GGML_OP_COUNT == 74, "GGML_OP_COUNT != 74");
static_assert(GGML_OP_COUNT == 75, "GGML_OP_COUNT != 75");

static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"none",
@@ -2758,6 +2758,8 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"argsort(x)",
"leaky_relu(x)",

"pad_reflect_1d(x)",

"flash_attn_ext(x)",
"flash_attn_back(x)",
"ssm_conv(x)",
@@ -2784,7 +2786,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"cross_entropy_loss_back(x,y)",
};

static_assert(GGML_OP_COUNT == 74, "GGML_OP_COUNT != 74");
static_assert(GGML_OP_COUNT == 75, "GGML_OP_COUNT != 75");

static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");

@@ -7303,6 +7305,34 @@ struct ggml_tensor * ggml_get_rel_pos(
    return result;
}

// ggml_pad_reflect_1d

struct ggml_tensor * ggml_pad_reflect_1d(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        int                   p0,
        int                   p1) {
    bool is_node = false;

    if (a->grad) {
        GGML_ASSERT(false); // TODO: implement backward
        is_node = true;
    }

    // the padded row is p0 + ne[0] + p1 wide; the number of rows is unchanged
    const int64_t ne[2] = { p0 + a->ne[0] + p1, a->ne[1] };
    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne);

    int32_t params[] = { p0, p1 };
    ggml_set_op_params(result, params, sizeof(params));

    result->op     = GGML_OP_PAD_REFLECT_1D;
    result->grad   = is_node ? ggml_dup_tensor(ctx, result) : NULL;
    result->src[0] = a;

    return result;
}

// ggml_add_rel_pos

static struct ggml_tensor * ggml_add_rel_pos_impl(
@@ -13888,6 +13918,46 @@ static void ggml_compute_forward_diag_mask_f32(
    }
}

// ggml_compute_forward_pad_reflect_1d

static void ggml_compute_forward_pad_reflect_1d(
        const struct ggml_compute_params * params,
        const struct ggml_tensor * src0,
        struct ggml_tensor * dst) {
    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT( dst->type == GGML_TYPE_F32);

    const int32_t * opts = (const int32_t *) dst->op_params;
    const int p0 = opts[0];
    const int p1 = opts[1];
    GGML_ASSERT(p0 >= 0);
    GGML_ASSERT(p1 >= 0);

    if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
        return;
    }

    const int ne00 = src0->ne[0];

    const int nb01 = src0->nb[1];

    const int ne0 = dst->ne[0];
    const int ne1 = dst->ne[1];

    const int nb0 = dst->nb[0];
    const int nb1 = dst->nb[1];

    for (int i1 = 0; i1 < ne1; i1++) {
        float * left  = (float *) ((char *) dst->data + i1*nb1 +         p0*nb0);
        float * right = (float *) ((char *) dst->data + i1*nb1 + (ne0-p1-1)*nb0);

        // copy the source row into the middle of the destination row,
        // then mirror it outwards around the first and last elements
        ggml_vec_cpy_f32(ne00, left, (float *) ((char *) src0->data + i1*nb01));

        for (int i0 = 1; i0 <= p0; i0++) { left[-i0] = left[i0];   }
        for (int i0 = 1; i0 <= p1; i0++) { right[i0] = right[-i0]; }
    }
}
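
// added note: for a source row [a b c d e] with p0 = 2 and p1 = 3 the
// destination row becomes [c b a b c d e d c b]; the edge element is not
// repeated, matching torch.nn.ReflectionPad1d. this assumes p0 < ne00 and
// p1 < ne00, which the asserts above do not currently enforce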

static void ggml_compute_forward_diag_mask_inf(
        const struct ggml_compute_params * params,
        struct ggml_tensor * dst) {
@@ -17441,6 +17511,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
            {
                ggml_compute_forward_conv_transpose_2d(params, tensor);
            } break;
        case GGML_OP_PAD_REFLECT_1D:
            {
                ggml_compute_forward_pad_reflect_1d(params, tensor->src[0], tensor);
            } break;
        case GGML_OP_POOL_1D:
            {
                ggml_compute_forward_pool_1d(params, tensor);
@@ -18451,6 +18525,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
            {
                GGML_ASSERT(false); // TODO: not implemented
            } break;
        case GGML_OP_PAD_REFLECT_1D:
            {
                GGML_ASSERT(false); // TODO: not implemented
            } break;
        case GGML_OP_POOL_1D:
            {
                GGML_ASSERT(false); // TODO: not implemented
@@ -19197,6 +19275,10 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads, int n_cur_
            {
                n_tasks = n_threads;
            } break;
        case GGML_OP_PAD_REFLECT_1D:
            {
                n_tasks = 1; // single-threaded for now; the row loop is not parallelized
            } break;
        case GGML_OP_POOL_1D:
        case GGML_OP_POOL_2D:
            {