
[SYCL] remove global variables #7710

Merged · 10 commits · Jun 15, 2024
remove useless backend check
airMeng committed Jun 14, 2024
commit 996b35a0ade3684742f8fa7a05bc684b08cdcd51
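This commit applies one pattern throughout ggml-sycl.cpp: the SYCL backend no longer decides between a device copy held in ggml_tensor_extra_gpu and a staged host-to-device copy based on tensor->backend; the tensors it receives are already device-resident, so the data pointer is taken straight from tensor->data. Below is a minimal sketch of the before/after access path, assuming the upstream ggml_tensor and ggml_tensor_extra_gpu definitions are in scope; the two helper names are hypothetical and exist only to contrast the two versions, they are not functions from the tree:

    // Before: the pointer depended on a per-tensor backend flag, and tensors that
    // were not on the GPU had to be copied into a pool allocation first.
    static float * src_pointer_before(const ggml_tensor * t, int device) {
        const bool on_device = t->backend == GGML_BACKEND_TYPE_GPU ||
                               t->backend == GGML_BACKEND_TYPE_GPU_SPLIT;
        if (on_device) {
            ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) t->extra;
            return (float *) extra->data_device[device];
        }
        return nullptr; // caller then allocated from the pool and staged a copy
    }

    // After: buffers are always device-resident, so the pointer is read directly.
    static float * src_pointer_after(const ggml_tensor * t) {
        return (float *) t->data;
    }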
120 changes: 21 additions & 99 deletions ggml-sycl.cpp
@@ -10774,14 +10774,10 @@ static void ggml_sycl_op_flatten(ggml_backend_sycl_context & ctx, const ggml_ten
ggml_tensor_extra_gpu * src1_extra = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr;
ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;

const bool src0_on_device = src0->backend == GGML_BACKEND_TYPE_GPU || src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT;
const bool src1_on_device = use_src1 && src1->backend == GGML_BACKEND_TYPE_GPU;
const bool dst_on_device = dst->backend == GGML_BACKEND_TYPE_GPU;

// dd = data device
float * src0_ddf = nullptr;
float * src1_ddf = nullptr;
float * dst_ddf = nullptr;
float * src0_ddf = (float *) src0->data;
float * src1_ddf = use_src1 ? (float *) src1->data : nullptr;
float * dst_ddf = (float *) dst->data;

ggml_sycl_pool_alloc<float> src0_f(ctx.pool());
ggml_sycl_pool_alloc<float> src1_f(ctx.pool());
@@ -10792,48 +10788,8 @@ static void ggml_sycl_op_flatten(ggml_backend_sycl_context & ctx, const ggml_ten
// GGML_SYCL_DEBUG("ctx.device=%d, main_stream=%p src0_on_device=%d, src1_on_device=%d, dst_on_device=%d\n",
// ctx.device, main_stream, src0_on_device, src1_on_device, dst_on_device);

if (src0_on_device) {
src0_ddf = (float *) src0_extra->data_device[ctx.device];
} else {
src0_ddf = src0_f.alloc(ggml_nelements(src0));
// GGML_SYCL_DEBUG("before ggml_sycl_cpy_tensor_2d src0_ddf=%p, src0=%p\n", src0_ddf, src0);
SYCL_CHECK(ggml_sycl_cpy_tensor_2d(src0_ddf, src0, 0, 0, 0, nrows0, main_stream));
}

if (use_src1) {
if (src1_on_device) {
src1_ddf = (float *) src1_extra->data_device[ctx.device];
} else {
src1_ddf = src1_f.alloc(ggml_nelements(src1));
SYCL_CHECK(ggml_sycl_cpy_tensor_2d(src1_ddf, src1, 0, 0, 0, nrows1, main_stream));
}
}
if (dst_on_device) {
dst_ddf = (float *) dst_extra->data_device[ctx.device];
} else {
dst_ddf = dst_f.alloc(ggml_nelements(dst));
}

// GGML_SYCL_DEBUG("op src0=%p, src1=%p, dst=%p, src0_ddf=%p, src1_ddf=%p, dst_ddf=%p, main_stream=%p\n",
// src0, src1, dst, src0_ddf, src1_ddf, dst_ddf, main_stream);
// do the computation
op(ctx, src0, src1, dst, src0_ddf, src1_ddf, dst_ddf, main_stream);
/*
DPCT1010:89: SYCL uses exceptions to report errors and does not use the
error codes. The call was replaced with 0. You need to rewrite this code.
*/
SYCL_CHECK(0);

// copy dst to host if necessary
if (!dst_on_device) {
SYCL_CHECK(CHECK_TRY_ERROR(
main_stream->memcpy(dst->data, dst_ddf, ggml_nbytes(dst)).wait()));
}

if (dst->backend == GGML_BACKEND_TYPE_CPU) {
SYCL_CHECK(CHECK_TRY_ERROR(
dpct::get_current_device().queues_wait_and_throw()));
}
// print_ggml_tensor("tensor", dst);
}
catch (sycl::exception const &exc) {
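The next hunks apply the same idea to ggml_sycl_op_mul_mat. For ggml_sycl_op_flatten, once the backend checks and staging copies are gone, the prologue collapses to the three direct pointer reads added above, followed by the call into the per-op kernel; the trailing copy of dst back to the host and the queues_wait_and_throw() drain are deleted as well. A condensed view assembled from the added lines, not a verbatim excerpt of the resulting function:

    // dd = data device: pointers now come straight from the tensors
    float * src0_ddf = (float *) src0->data;
    float * src1_ddf = use_src1 ? (float *) src1->data : nullptr;
    float * dst_ddf  = (float *) dst->data;

    // do the computation directly on the device-resident buffers
    op(ctx, src0, src1, dst, src0_ddf, src1_ddf, dst_ddf, main_stream);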
@@ -10923,7 +10879,6 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;

const bool src0_on_device = src0->backend == GGML_BACKEND_TYPE_GPU || src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT;
const bool src0_is_contiguous = ggml_is_contiguous(src0);
const bool src1_is_contiguous = ggml_is_contiguous(src1);

@@ -10995,20 +10950,20 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten

used_devices++;

const bool src1_on_device = src1->backend == GGML_BACKEND_TYPE_GPU && i == ctx.device;
const bool dst_on_device = dst->backend == GGML_BACKEND_TYPE_GPU && i == ctx.device;
const bool src1_on_device = i == ctx.device;
const bool dst_on_device = i == ctx.device;

ggml_sycl_set_device(i);
queue_ptr stream = ctx.stream(i, 0);

if (src0_on_device && src0_is_contiguous) {
dev[i].src0_dd = (char *) src0_extra->data_device[i];
if (src0_is_contiguous) {
dev[i].src0_dd = (char *) src0->data;
} else {
dev[i].src0_dd = dev[i].src0_dd_alloc.alloc(ctx.pool(i), ggml_nbytes(src0));
}

if (src1_on_device && src1_is_contiguous) {
dev[i].src1_ddf = (float *) src1_extra->data_device[i];
dev[i].src1_ddf = (float *) src1->data;
} else {
dev[i].src1_ddf = dev[i].src1_ddf_alloc.alloc(ctx.pool(i), ggml_nelements(src1));
}
@@ -11028,7 +10983,7 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
}

if (dst_on_device) {
dev[i].dst_dd = (float *) dst_extra->data_device[i];
dev[i].dst_dd = (float *) dst->data;
} else {
const size_t size_dst_ddf = split ? (dev[i].row_high - dev[i].row_low)*ne1 : ggml_nelements(dst);
dev[i].dst_dd = dev[i].dst_dd_alloc.alloc(ctx.pool(i), size_dst_ddf);
@@ -11059,8 +11014,8 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
continue;
}

const bool src1_on_device = src1->backend == GGML_BACKEND_TYPE_GPU && i == ctx.device;
const bool dst_on_device = dst->backend == GGML_BACKEND_TYPE_GPU && i == ctx.device;
const bool src1_on_device = i == ctx.device;
const bool dst_on_device = i == ctx.device;
const int64_t row_diff = dev[i].row_high - dev[i].row_low;

ggml_sycl_set_device(i);
@@ -11091,12 +11046,12 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten

// the main device memory buffer can be on VRAM scratch, with space for all partial results
// in that case an offset on dst_ddf_i is needed
if (dst->backend == GGML_BACKEND_TYPE_GPU && i == ctx.device) {
if (i == ctx.device) {
dst_dd_i += dev[i].row_low; // offset is 0 if no tensor split
}

// copy src0, src1 to device if necessary
if (src1->backend == GGML_BACKEND_TYPE_GPU && src1_is_contiguous) {
if (src1_is_contiguous) {
if (i != ctx.device) {
if (convert_src1_to_q8_1) {
char * src1_ddq_i_source = dev[ctx.device].src1_ddq + src1_ddq_i_offset;
@@ -11114,14 +11069,14 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
src1_ncols * ne10 * sizeof(float))));
}
}
} else if (src1->backend == GGML_BACKEND_TYPE_CPU || (src1_on_device && !src1_is_contiguous)) {
} else if (src1_on_device && !src1_is_contiguous) {
SYCL_CHECK(ggml_sycl_cpy_tensor_2d(
src1_ddf_i, src1, i03, i02, src1_col_0, src1_col_0+src1_ncols, stream));
} else {
GGML_ASSERT(false);
}

if (convert_src1_to_q8_1 && (src1->backend == GGML_BACKEND_TYPE_CPU || !src1_is_contiguous)) {
if (convert_src1_to_q8_1 && !src1_is_contiguous) {
quantize_row_q8_1_sycl(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, src1_padded_col_size, stream);
/*
DPCT1010:92: SYCL uses exceptions to report errors and does
@@ -11131,7 +11086,7 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
SYCL_CHECK(0);
}

if (src1_col_0 == 0 && (!src0_on_device || !src0_is_contiguous) && i02 % i02_divisor == 0) {
if (src1_col_0 == 0 && !src0_is_contiguous && i02 % i02_divisor == 0) {
SYCL_CHECK(ggml_sycl_cpy_tensor_2d(src0_dd_i, src0, i03, i02/i02_divisor, dev[i].row_low, dev[i].row_high, stream));
}
if (src1->type == GGML_TYPE_F16) {
@@ -11149,17 +11104,7 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten

// copy dst to host or other device if necessary
if (!dst_on_device) {
void * dst_off_device;
dpct::memcpy_direction kind;
if (dst->backend == GGML_BACKEND_TYPE_CPU) {
dst_off_device = dst->data;
kind = dpct::device_to_host;
} else if (dst->backend == GGML_BACKEND_TYPE_GPU) {
dst_off_device = dst_extra->data_device[ctx.device];
kind = dpct::device_to_device;
} else {
GGML_ASSERT(false);
}
void * dst_off_device = dst->data;
if (split) {
// src0 = weight matrix is saved as a transposed matrix for better memory layout.
// dst is NOT transposed.
@@ -11170,27 +11115,10 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
GGML_ASSERT(dst->nb[1] == ne0*sizeof(float));
dhf_dst_i += src1_col_0*ne0 + dev[i].row_low;

//todo, dirty solution. Need be updated when device2device memcpy() is supported.
if (kind == dpct::device_to_device) {
size_t dst_size = ggml_nbytes_pad(dst);
float *host_buf = (float *)malloc(dst_size);
SYCL_CHECK(CHECK_TRY_ERROR(dpct::async_dpct_memcpy(
host_buf, ne0 * sizeof(float), dst_dd_i,
row_diff * sizeof(float), row_diff * sizeof(float),
src1_ncols, dpct::device_to_host, *stream)));
dpct::dev_mgr::instance().get_device(i).queues_wait_and_throw();
SYCL_CHECK(CHECK_TRY_ERROR(dpct::async_dpct_memcpy(
dhf_dst_i, ne0 * sizeof(float), host_buf,
row_diff * sizeof(float), row_diff * sizeof(float),
src1_ncols, dpct::host_to_device, *main_stream)));
dpct::dev_mgr::instance().get_device(ctx.device).queues_wait_and_throw();
free(host_buf);
} else {
SYCL_CHECK(CHECK_TRY_ERROR(dpct::async_dpct_memcpy(
dhf_dst_i, ne0 * sizeof(float), dst_dd_i,
row_diff * sizeof(float), row_diff * sizeof(float),
src1_ncols, kind, *stream)));
}
SYCL_CHECK(CHECK_TRY_ERROR(dpct::async_dpct_memcpy(
dhf_dst_i, ne0 * sizeof(float), dst_dd_i,
row_diff * sizeof(float), row_diff * sizeof(float),
src1_ncols, dpct::device_to_device, *stream)));
} else {
float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
GGML_ASSERT(dst->nb[1] == ne0*sizeof(float));
@@ -11234,12 +11162,6 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
}
}
}

if (dst->backend == GGML_BACKEND_TYPE_CPU) {
SYCL_CHECK(ggml_sycl_set_device(ctx.device));
SYCL_CHECK(CHECK_TRY_ERROR(
dpct::get_current_device().queues_wait_and_throw()));
}
}
catch (sycl::exception const &exc) {
std::cerr << exc.what() << "Exception caught at file:" << __FILE__
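One further simplification sits in the split-dst path of ggml_sycl_op_mul_mat above: the old code treated device-to-device memcpy as unsupported and bounced each partial result through a malloc'd host buffer, draining queues on both devices, while the new code issues a single asynchronous device-to-device copy. The call below is the one from the diff with its parameters annotated; the pitch/width/row reading follows the usual dpct 2D-memcpy argument order and is an interpretation, not text from the commit:

    SYCL_CHECK(CHECK_TRY_ERROR(dpct::async_dpct_memcpy(
        dhf_dst_i,                  // destination: offset into the main device's dst buffer
        ne0 * sizeof(float),        // destination pitch: one full dst row
        dst_dd_i,                   // source: this device's partial result
        row_diff * sizeof(float),   // source pitch: one partial-result row
        row_diff * sizeof(float),   // bytes copied per row
        src1_ncols,                 // number of rows to copy
        dpct::device_to_device, *stream)));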