From b49f1c026e868ca147ac4a685a07cfd46885a356 Mon Sep 17 00:00:00 2001 From: "Meng, Hengyu" Date: Wed, 5 Jun 2024 20:22:32 +0800 Subject: [PATCH] remove duplicate extra and global work group size --- ggml-sycl.cpp | 43 ++++++++++++++----------------------------- 1 file changed, 14 insertions(+), 29 deletions(-) diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp index 4369fe53302fd6..e7ac5ba4deb3f8 100644 --- a/ggml-sycl.cpp +++ b/ggml-sycl.cpp @@ -6261,7 +6261,8 @@ static void norm_f32_sycl(const float *x, float *dst, const int ncols, }); }); } else { - const int work_group_size = g_work_group_size; + // FIXME: 1024 from cuda + const int work_group_size = 1024; const sycl::range<3> block_dims(1, 1, work_group_size); /* DPCT1049:17: The work-group size passed to the SYCL kernel may exceed @@ -6307,7 +6308,7 @@ static void group_norm_f32_sycl(const float *x, float *dst, }); }); } else { - const int work_group_size = g_work_group_size; + const int work_group_size = 1024; const sycl::range<3> block_dims(1, 1, work_group_size); /* DPCT1049:18: The work-group size passed to the SYCL kernel may exceed @@ -6396,7 +6397,7 @@ static void rms_norm_f32_sycl(const float *x, float *dst, const int ncols, }); }); } else { - const int work_group_size = g_work_group_size; + const int work_group_size = 1024; const sycl::range<3> block_dims(1, 1, work_group_size); /* DPCT1049:19: The work-group size passed to the SYCL kernel may exceed @@ -9246,7 +9247,7 @@ static void soft_max_f32_sycl(const float * x, const float * mask, const int nrows_y, const float scale, const float max_bias, queue_ptr stream) { int nth = WARP_SIZE; - int max_block_size = g_work_group_size; + int max_block_size = 1024; while (nth < ncols_x && nth < max_block_size) nth *= 2; if (nth>max_block_size) nth = max_block_size; @@ -11452,14 +11453,9 @@ static void ggml_sycl_mul_mat_vec_p021(ggml_backend_sycl_context & ctx, const gg SYCL_CHECK(ggml_sycl_set_device(ctx.device)); queue_ptr main_stream = ctx.stream(); - ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra; - void * src0_ddq = src0_extra->data_device[ctx.device]; - - ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra; - float * src1_ddf = (float *) src1_extra->data_device[ctx.device]; - - ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra; - float * dst_ddf = (float *) dst_extra->data_device[ctx.device]; + void * src0_ddq = src0->data; + float * src1_ddf = (float *) src1->data; + float * dst_ddf = (float *) dst->data; ggml_mul_mat_p021_f16_f32_sycl(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, ne12, main_stream); } @@ -11490,15 +11486,10 @@ static void ggml_sycl_mul_mat_vec_nc(ggml_backend_sycl_context & ctx, const ggml SYCL_CHECK(ggml_sycl_set_device(ctx.device)); queue_ptr main_stream = ctx.stream(); - - ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra; - void * src0_ddq = src0_extra->data_device[ctx.device]; - - ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra; - float * src1_ddf = (float *) src1_extra->data_device[ctx.device]; - - ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra; - float * dst_ddf = (float *) dst_extra->data_device[ctx.device]; + + void * src0_ddq = src0->data; + float * src1_ddf = (float *) src1->data; + float * dst_ddf = (float *) dst->data; const int64_t row_stride_x = nb01 / sizeof(sycl::half); const int64_t channel_stride_x = nb02 / sizeof(sycl::half); @@ -12042,9 +12033,6 @@ static void ggml_sycl_cpy(ggml_backend_sycl_context & ctx, const ggml_tensor *sr const int64_t ne = ggml_nelements(src0); GGML_ASSERT(ne == ggml_nelements(src1)); - GGML_ASSERT(src0->backend == GGML_BACKEND_TYPE_GPU); - GGML_ASSERT(src1->backend == GGML_BACKEND_TYPE_GPU); - GGML_ASSERT(ggml_nbytes(src0) <= INT_MAX); GGML_ASSERT(ggml_nbytes(src1) <= INT_MAX); @@ -12053,11 +12041,8 @@ static void ggml_sycl_cpy(ggml_backend_sycl_context & ctx, const ggml_tensor *sr SYCL_CHECK(ggml_sycl_set_device(ctx.device)); queue_ptr main_stream = ctx.stream(); - const ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra; - const ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra; - - char * src0_ddc = (char *) src0_extra->data_device[ctx.device]; - char * src1_ddc = (char *) src1_extra->data_device[ctx.device]; + char * src0_ddc = (char *) src0->data; + char * src1_ddc = (char *) src1->data; if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) { ggml_cpy_f32_f32_sycl (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);