Move GPU memory allocation/deallocation outside tcgemm subroutine
Sam Hatfield committed Jul 30, 2019
1 parent bd325f8 commit fc4ed43
Showing 3 changed files with 45 additions and 28 deletions.
48 changes: 25 additions & 23 deletions cublas_gemm_c.cu
@@ -27,13 +27,37 @@ __global__ void double2half(half *out, const double *in, int n) {

cublasHandle_t cublasHandle;

// Device-side arrays
double *a_d, *b_d;
half *a_d_16, *b_d_16;
float *c_d_32;

// Sets up GPU and cuBLAS and allocates memory
extern "C" {
    void init_gpu_c(int m, int n, int k) {
        cudaSetDevice(0);
        cudaDeviceReset();
        cublasErrCheck(cublasCreate(&cublasHandle));
        cublasErrCheck(cublasSetMathMode(cublasHandle, CUBLAS_TENSOR_OP_MATH));

        // Allocate memory on device for all arrays
        // TODO: should the dimensions used below (m*k etc.) take into account transa, lda etc.?
        cudaErrCheck(cudaMalloc((void **)&a_d, m*k*sizeof(double)));
        cudaErrCheck(cudaMalloc((void **)&b_d, k*n*sizeof(double)));
        cudaErrCheck(cudaMalloc((void **)&a_d_16, m*k*sizeof(half)));
        cudaErrCheck(cudaMalloc((void **)&b_d_16, k*n*sizeof(half)));
        cudaErrCheck(cudaMalloc((void **)&c_d_32, m*n*sizeof(float)));
    }
}

extern "C" {
void fin_gpu_c() {
// Free memory on device
cudaErrCheck(cudaFree(a_d));
cudaErrCheck(cudaFree(b_d));
cudaErrCheck(cudaFree(a_d_16));
cudaErrCheck(cudaFree(b_d_16));
cudaErrCheck(cudaFree(c_d_32));
}
}

@@ -53,19 +77,6 @@ extern "C" {
        // Compute GEMM using Tensor Core
        // =========================================================================

        // Set up device-side arrays
        double *a_d, *b_d;
        half *a_d_16, *b_d_16;
        float *c_d_32;

        // Allocate memory on device for all arrays
        // TODO: should the dimensions used below (m*k etc.) take into account transa, lda etc.?
        cudaErrCheck(cudaMalloc((void **)&a_d, m*k*sizeof(double)));
        cudaErrCheck(cudaMalloc((void **)&b_d, k*n*sizeof(double)));
        cudaErrCheck(cudaMalloc((void **)&a_d_16, m*k*sizeof(half)));
        cudaErrCheck(cudaMalloc((void **)&b_d_16, k*n*sizeof(half)));
        cudaErrCheck(cudaMalloc((void **)&c_d_32, m*n*sizeof(float)));

        // Copy input arrays to device
        cudaErrCheck(cudaMemcpy(a_d, a_h, m*k*sizeof(double), cudaMemcpyHostToDevice));
        cudaErrCheck(cudaMemcpy(b_d, b_h, k*n*sizeof(double), cudaMemcpyHostToDevice));
@@ -74,8 +85,6 @@ extern "C" {
        double2half<<<(int)((m*k)/256) + 1, 256>>>(a_d_16, a_d, m*k);
        double2half<<<(int)((k*n)/256) + 1, 256>>>(b_d_16, b_d, k*n);

        cudaDeviceSynchronize();

        // Perform GEMM with Tensor Core
        cublasErrCheck(
            cublasGemmEx(
@@ -93,13 +102,6 @@

        // Copy results back from device to host
        cudaErrCheck(cudaMemcpy(c_h, c_d_32, m*n*sizeof(float), cudaMemcpyDeviceToHost));
        cudaDeviceSynchronize();

        // Free memory on device
        cudaErrCheck(cudaFree(a_d));
        cudaErrCheck(cudaFree(b_d));
        cudaErrCheck(cudaFree(a_d_16));
        cudaErrCheck(cudaFree(b_d_16));
        cudaErrCheck(cudaFree(c_d_32));
    }
}
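Only the signature of the double2half conversion kernel appears in the first hunk header above. For context, here is a minimal sketch of what such a kernel typically looks like; the body (the index guard and the __double2half conversion) is an assumption for illustration, not the file's actual implementation. The guard is what makes the ceil-style launch configuration, (n/256) + 1 blocks of 256 threads, safe when n is not a multiple of 256.

    #include <cuda_fp16.h>

    // Sketch only: one thread converts one element; the guard protects the
    // excess threads launched by the (n/256) + 1 grid rounding.
    __global__ void double2half(half *out, const double *in, int n) {
        int idx = blockDim.x * blockIdx.x + threadIdx.x;
        if (idx < n) {
            out[idx] = __double2half(in[idx]);
        }
    }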

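The point of the commit is visible in the shape of the API: allocation and deallocation now happen once, outside the GEMM call. A hypothetical calling sequence (not code from this commit; tcgemm_c's argument list is elided) shows how the device buffers get reused:

    extern "C" void init_gpu_c(int m, int n, int k);
    extern "C" void fin_gpu_c();

    void run_many_gemms(int m, int n, int k) {
        init_gpu_c(m, n, k);   // one-time cudaMalloc of a_d, b_d, a_d_16, b_d_16, c_d_32
        for (int i = 0; i < 100; i++) {
            /* tcgemm_c(...) here: every call reuses the same device buffers
               instead of paying cudaMalloc/cudaFree on each invocation */
        }
        fin_gpu_c();           // one-time cudaFree of all five device arrays
    }

This matters because cudaMalloc and cudaFree are relatively expensive (cudaFree in particular synchronizes with the device), so hoisting them out of the GEMM path both speeds up repeated calls and keeps them out of the timed region in the test program below.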
19 changes: 16 additions & 3 deletions cublas_gemm_f.f90
@@ -17,13 +17,26 @@ subroutine tcgemm_c(transa, transb, m, n, k, alpha, a_p, lda, b_p, ldb, beta, c_
    end interface

    interface
        subroutine init_gpu_c() bind(c)
        subroutine init_gpu_c(m, n, k) bind(c)
            use iso_c_binding, only: c_int
            integer(kind=c_int), value :: m, n, k
        end subroutine
    end interface

    interface
        subroutine fin_gpu_c() bind(c)
        end subroutine
    end interface

contains
    subroutine init_gpu
        call init_gpu_c
    subroutine init_gpu(m, n, k)
        integer :: m, n, k

        call init_gpu_c(m, n, k)
    end subroutine

    subroutine fin_gpu
        call fin_gpu_c
    end subroutine

!> Perform matrix-matrix multiplication using Tensor Core (wrapper for C
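For readers less familiar with ISO C binding: the interfaces above describe, on the Fortran side, exactly the two C functions added in cublas_gemm_c.cu. The `value` attribute is what makes the integer arguments pass by value; without it, Fortran's default pass-by-reference would require pointer arguments on the C side. The correspondence, written out as C prototypes:

    // integer(kind=c_int), value :: m, n, k  ->  ints passed by value
    extern "C" void init_gpu_c(int m, int n, int k);

    // a bind(c) subroutine with no arguments  ->  a void function
    extern "C" void fin_gpu_c(void);

    // without 'value', the prototype would instead need pointers:
    // void init_gpu_c(int *m, int *n, int *k);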
6 changes: 4 additions & 2 deletions matmul_test.f90
@@ -1,5 +1,5 @@
program matmul_test
    use cublas_gemm_f, only: init_gpu, tcgemm
    use cublas_gemm_f, only: init_gpu, fin_gpu, tcgemm

    implicit none

@@ -39,13 +39,15 @@ program matmul_test
    ! Device DGEMM (with transpose)
    ! =========================================================================

    call init_gpu
    call init_gpu(m, m, n)

    ! Call Tensor Core GEMM routine
    call cpu_time(tick)
    call tcgemm("N", "T", m, m, n, 1.0, a2, m, b2, m, 0.0, c2, m)
    call cpu_time(tock)

    call fin_gpu

    write (*,"(A35,F17.10)") "C matrix Frobenius norm (device) = ", frob_norm(real(c2,8))
    write (*,"(A11,F13.10)") "GPU time = ", tock - tick

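One side effect of the move: the cpu_time bracket now times only the tcgemm call, with allocation excluded, and since tcgemm ends in a blocking cudaMemcpy back to the host, the host timer does observe the complete GPU operation. For device-side measurement one could use CUDA events instead; a minimal sketch under that assumption (not part of this commit):

    #include <cuda_runtime.h>

    // Sketch: time a GPU workload with CUDA events rather than a host clock.
    static float time_gpu_ms(void (*work)(void)) {
        cudaEvent_t start, stop;
        float ms = 0.0f;
        cudaEventCreate(&start);
        cudaEventCreate(&stop);
        cudaEventRecord(start, 0);
        work();                        // e.g. the half conversion + cublasGemmEx
        cudaEventRecord(stop, 0);
        cudaEventSynchronize(stop);    // block until the GPU reaches 'stop'
        cudaEventElapsedTime(&ms, start, stop);
        cudaEventDestroy(start);
        cudaEventDestroy(stop);
        return ms;
    }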
