cuda : fix tensor size calculation for non-split buffer (llama/5145)
slaren authored and ggerganov committed Jan 27, 2024
1 parent c249812 commit 397fa0e
Showing 2 changed files with 8 additions and 15 deletions.
src/ggml-backend.c: 4 changes (3 additions, 1 deletion)

@@ -30,7 +30,9 @@ size_t ggml_backend_buft_get_alignment(ggml_backend_buffer_type_t buft) {
 GGML_CALL size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor) {
     // get_alloc_size is optional, defaults to ggml_nbytes
     if (buft->iface.get_alloc_size) {
-        return buft->iface.get_alloc_size(buft, tensor);
+        size_t size = buft->iface.get_alloc_size(buft, tensor);
+        assert(size >= ggml_nbytes(tensor));
+        return size;
     }
     return ggml_nbytes(tensor);
 }
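The new assert makes the contract explicit: a buffer type's get_alloc_size hook may only ever grow a tensor's allocation (for example to add padding), never report less than ggml_nbytes. A minimal standalone sketch of that optional-override pattern, with illustrative names rather than the real ggml types:

    // Sketch of the pattern above; buffer_type_iface and alloc_size are
    // illustrative names, not part of ggml.
    #include <assert.h>
    #include <stddef.h>

    typedef struct {
        // optional hook; when NULL, the raw byte size is used as-is
        size_t (*get_alloc_size)(size_t nbytes);
    } buffer_type_iface;

    static size_t alloc_size(const buffer_type_iface * iface, size_t nbytes) {
        if (iface->get_alloc_size) {
            size_t size = iface->get_alloc_size(nbytes);
            assert(size >= nbytes); // padding may only add bytes, never drop them
            return size;
        }
        return nbytes; // default: allocation size equals the tensor's byte size
    }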
src/ggml-cuda.cu: 19 changes (5 additions, 14 deletions)

@@ -9790,8 +9790,8 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s
     // TODO: mmq/mmv support
 #endif

-    const int64_t nb11 = src1->nb[1];
-    const int64_t nb1 = dst->nb[1];
+    const size_t nb11 = src1->nb[1];
+    const size_t nb1 = dst->nb[1];

     const struct ggml_tensor * ids = src0;
     const int32_t id = ((int32_t *) dst->op_params)[0];
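The switch from int64_t to size_t matches how ggml stores these values: the nb fields of ggml_tensor are byte strides of type size_t, and keeping them unsigned avoids a signed/unsigned mix when they feed pointer arithmetic. A tiny illustrative sketch (not ggml code) of byte-stride addressing:

    // Illustrative only: nb1 is a stride in bytes, so the offset goes
    // through a char pointer before casting to the element type.
    #include <stddef.h>
    #include <stdint.h>

    static inline const float * row_ptr(const void * data, size_t nb1, int64_t i1) {
        return (const float *)((const char *)data + (size_t)i1 * nb1);
    }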
@@ -10304,15 +10304,11 @@ GGML_CALL static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t

     if (ggml_is_quantized(tensor->type)) {
         // initialize padding to 0 to avoid possible NaN values
-        int64_t row_low = 0;
-        int64_t row_high = ggml_nrows(tensor);
-        int64_t nrows_split = row_high - row_low;
-
-        size_t original_size = ggml_nbytes_split(tensor, nrows_split);
+        size_t original_size = ggml_nbytes(tensor);
         size_t padded_size = ggml_backend_buft_get_alloc_size(buffer->buft, tensor);

         if (padded_size > original_size && tensor->view_src == nullptr) {
-            CUDA_CHECK(cudaMemsetAsync((char *)tensor->data + original_size, 0, padded_size - original_size, g_cudaStreams[ctx->device][0]));
+            CUDA_CHECK(cudaMemset((char *)tensor->data + original_size, 0, padded_size - original_size));
         }
     }
 }
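The padding only needs to be cleared once, at tensor initialization, so later kernels that read whole quantized blocks never see uninitialized bytes. A self-contained sketch of the same zeroing step outside of ggml, with assumed sizes:

    // Sketch: zero the tail of a padded device allocation. The sizes are
    // assumptions for illustration, not values taken from ggml.
    #include <cuda_runtime.h>

    int main(void) {
        const size_t original_size = 1000; // bytes of actual tensor data (assumed)
        const size_t padded_size   = 1024; // size after allocator rounding (assumed)

        char * data = NULL;
        if (cudaMalloc((void **)&data, padded_size) != cudaSuccess) {
            return 1;
        }

        // ... copy original_size bytes of payload into data ...

        if (padded_size > original_size) {
            // clear only the padding region, right after the payload
            cudaMemset(data + original_size, 0, padded_size - original_size);
        }

        cudaFree(data);
        return 0;
    }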
@@ -10415,12 +10411,7 @@ GGML_CALL static size_t ggml_backend_cuda_buffer_type_get_alignment(ggml_backend
 }

 GGML_CALL static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
-    int64_t row_low = 0;
-    int64_t row_high = ggml_nrows(tensor);
-    int64_t nrows_split = row_high - row_low;
-
-    size_t size = ggml_nbytes_split(tensor, nrows_split);
-
+    size_t size = ggml_nbytes(tensor);
     int64_t ne0 = tensor->ne[0];

     if (ggml_is_quantized(tensor->type)) {
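The collapsed body below this hunk handles the quantized case; one plausible shape for such logic, shown purely as a sketch, is to round the row length up to a fixed multiple and charge for the extra elements. ROW_PAD and type_size here are assumptions, not the elided ggml-cuda constants:

    // Illustrative sketch only; not the actual elided ggml-cuda code.
    #include <stddef.h>
    #include <stdint.h>

    #define ROW_PAD 512 // assumed row-padding granularity

    static size_t padded_alloc_size(size_t size, int64_t ne0, size_t type_size) {
        if (ne0 % ROW_PAD != 0) {
            // pay for the elements needed to reach the next ROW_PAD multiple
            size += (size_t)(ROW_PAD - ne0 % ROW_PAD) * type_size;
        }
        return size;
    }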
