Move GPU memory allocation/deallocation outside tcgemm subroutine
Sam Hatfield committed Jul 30, 2019
1 parent bd325f8 commit fc4ed43
Showing 3 changed files with 45 additions and 28 deletions.
48 changes: 25 additions & 23 deletions cublas_gemm_c.cu
@@ -27,13 +27,37 @@ __global__ void double2half(half *out, const double *in, int n) {

cublasHandle_t cublasHandle;

// Device-side arrays
double *a_d, *b_d;
half *a_d_16, *b_d_16;
float *c_d_32;

// Sets up GPU and cuBLAS and allocates memory
extern "C" {
    void init_gpu_c(int m, int n, int k) {
        cudaSetDevice(0);
        cudaDeviceReset();
        cublasErrCheck(cublasCreate(&cublasHandle));
        cublasErrCheck(cublasSetMathMode(cublasHandle, CUBLAS_TENSOR_OP_MATH));

        // Allocate memory on device for all arrays
        // TODO: should the dimensions used below (m*k etc.) take into account transa, lda etc.?
        cudaErrCheck(cudaMalloc((void **)&a_d, m*k*sizeof(double)));
        cudaErrCheck(cudaMalloc((void **)&b_d, k*n*sizeof(double)));
        cudaErrCheck(cudaMalloc((void **)&a_d_16, m*k*sizeof(half)));
        cudaErrCheck(cudaMalloc((void **)&b_d_16, k*n*sizeof(half)));
        cudaErrCheck(cudaMalloc((void **)&c_d_32, m*n*sizeof(float)));
    }
}

extern "C" {
void fin_gpu_c() {
// Free memory on device
cudaErrCheck(cudaFree(a_d));
cudaErrCheck(cudaFree(b_d));
cudaErrCheck(cudaFree(a_d_16));
cudaErrCheck(cudaFree(b_d_16));
cudaErrCheck(cudaFree(c_d_32));
}
}

@@ -53,19 +77,6 @@ extern "C" {
        // Compute GEMM using Tensor Core
        // =========================================================================

        // Set up device-side arrays
        double *a_d, *b_d;
        half *a_d_16, *b_d_16;
        float *c_d_32;

        // Allocate memory on device for all arrays
        // TODO: should the dimensions used below (m*k etc.) take into account transa, lda etc.?
        cudaErrCheck(cudaMalloc((void **)&a_d, m*k*sizeof(double)));
        cudaErrCheck(cudaMalloc((void **)&b_d, k*n*sizeof(double)));
        cudaErrCheck(cudaMalloc((void **)&a_d_16, m*k*sizeof(half)));
        cudaErrCheck(cudaMalloc((void **)&b_d_16, k*n*sizeof(half)));
        cudaErrCheck(cudaMalloc((void **)&c_d_32, m*n*sizeof(float)));

        // Copy input arrays to device
        cudaErrCheck(cudaMemcpy(a_d, a_h, m*k*sizeof(double), cudaMemcpyHostToDevice));
        cudaErrCheck(cudaMemcpy(b_d, b_h, k*n*sizeof(double), cudaMemcpyHostToDevice));
@@ -74,8 +85,6 @@ extern "C" {
        double2half<<<(int)((m*k)/256) + 1, 256>>>(a_d_16, a_d, m*k);
        double2half<<<(int)((k*n)/256) + 1, 256>>>(b_d_16, b_d, k*n);

        cudaDeviceSynchronize();

        // Perform GEMM with Tensor Core
        cublasErrCheck(
            cublasGemmEx(
@@ -93,13 +102,6 @@

        // Copy results back from device to host
        cudaErrCheck(cudaMemcpy(c_h, c_d_32, m*n*sizeof(float), cudaMemcpyDeviceToHost));
        cudaDeviceSynchronize();

        // Free memory on device
        cudaErrCheck(cudaFree(a_d));
        cudaErrCheck(cudaFree(b_d));
        cudaErrCheck(cudaFree(a_d_16));
        cudaErrCheck(cudaFree(b_d_16));
        cudaErrCheck(cudaFree(c_d_32));
    }
}
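Only the signature of the double2half conversion kernel appears in the first hunk header above. For context, here is a minimal sketch of what such a kernel typically looks like; the body (the index guard and the __double2half conversion) is an assumption for illustration, not the file's actual implementation. The guard is what makes the ceil-style launch configuration, (n/256) + 1 blocks of 256 threads, safe when n is not a multiple of 256.

    #include <cuda_fp16.h>

    // Sketch only: one thread converts one element; the guard protects the
    // excess threads launched by the (n/256) + 1 grid rounding.
    __global__ void double2half(half *out, const double *in, int n) {
        int idx = blockDim.x * blockIdx.x + threadIdx.x;
        if (idx < n) {
            out[idx] = __double2half(in[idx]);
        }
    }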

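The point of the commit is visible in the shape of the API: allocation and deallocation now happen once, outside the GEMM call. A hypothetical calling sequence (not code from this commit; tcgemm_c's argument list is elided) shows how the device buffers get reused:

    extern "C" void init_gpu_c(int m, int n, int k);
    extern "C" void fin_gpu_c();

    void run_many_gemms(int m, int n, int k) {
        init_gpu_c(m, n, k);   // one-time cudaMalloc of a_d, b_d, a_d_16, b_d_16, c_d_32
        for (int i = 0; i < 100; i++) {
            /* tcgemm_c(...) here: every call reuses the same device buffers
               instead of paying cudaMalloc/cudaFree on each invocation */
        }
        fin_gpu_c();           // one-time cudaFree of all five device arrays
    }

This matters because cudaMalloc and cudaFree are relatively expensive (cudaFree in particular synchronizes with the device), so hoisting them out of the GEMM path both speeds up repeated calls and keeps them out of the timed region in the test program below.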
19 changes: 16 additions & 3 deletions cublas_gemm_f.f90
@@ -17,13 +17,26 @@ subroutine tcgemm_c(transa, transb, m, n, k, alpha, a_p, lda, b_p, ldb, beta, c_
    end interface

    interface
        subroutine init_gpu_c() bind(c)
        subroutine init_gpu_c(m, n, k) bind(c)
            use iso_c_binding, only: c_int
            integer(kind=c_int), value :: m, n, k
        end subroutine
    end interface

    interface
        subroutine fin_gpu_c() bind(c)
        end subroutine
    end interface

contains
    subroutine init_gpu
        call init_gpu_c
    subroutine init_gpu(m, n, k)
        integer :: m, n, k

        call init_gpu_c(m, n, k)
    end subroutine

    subroutine fin_gpu
        call fin_gpu_c
    end subroutine

!> Perform matrix-matrix multiplication using Tensor Core (wrapper for C
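For readers less familiar with ISO C binding: the interfaces above describe, on the Fortran side, exactly the two C functions added in cublas_gemm_c.cu. The `value` attribute is what makes the integer arguments pass by value; without it, Fortran's default pass-by-reference would require pointer arguments on the C side. The correspondence, written out as C prototypes:

    // integer(kind=c_int), value :: m, n, k  ->  ints passed by value
    extern "C" void init_gpu_c(int m, int n, int k);

    // a bind(c) subroutine with no arguments  ->  a void function
    extern "C" void fin_gpu_c(void);

    // without 'value', the prototype would instead need pointers:
    // void init_gpu_c(int *m, int *n, int *k);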
6 changes: 4 additions & 2 deletions matmul_test.f90
@@ -1,5 +1,5 @@
program matmul_test
    use cublas_gemm_f, only: init_gpu, tcgemm
    use cublas_gemm_f, only: init_gpu, fin_gpu, tcgemm

    implicit none

@@ -39,13 +39,15 @@ program matmul_test
    ! Device DGEMM (with transpose)
    ! =========================================================================

    call init_gpu
    call init_gpu(m, m, n)

    ! Call Tensor Core GEMM routine
    call cpu_time(tick)
    call tcgemm("N", "T", m, m, n, 1.0, a2, m, b2, m, 0.0, c2, m)
    call cpu_time(tock)

    call fin_gpu

    write (*,"(A35,F17.10)") "C matrix Frobenius norm (device) = ", frob_norm(real(c2,8))
    write (*,"(A11,F13.10)") "GPU time = ", tock - tick

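One side effect of the move: the cpu_time bracket now times only the tcgemm call, with allocation excluded, and since tcgemm ends in a blocking cudaMemcpy back to the host, the host timer does observe the complete GPU operation. For device-side measurement one could use CUDA events instead; a minimal sketch under that assumption (not part of this commit):

    #include <cuda_runtime.h>

    // Sketch: time a GPU workload with CUDA events rather than a host clock.
    static float time_gpu_ms(void (*work)(void)) {
        cudaEvent_t start, stop;
        float ms = 0.0f;
        cudaEventCreate(&start);
        cudaEventCreate(&stop);
        cudaEventRecord(start, 0);
        work();                        // e.g. the half conversion + cublasGemmEx
        cudaEventRecord(stop, 0);
        cudaEventSynchronize(stop);    // block until the GPU reaches 'stop'
        cudaEventElapsedTime(&ms, start, stop);
        cudaEventDestroy(start);
        cudaEventDestroy(stop);
        return ms;
    }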
