Split up GPU initialization into separate subroutine
Sam Hatfield committed Jul 29, 2019
1 parent badf17b commit bd325f8
Showing 3 changed files with 24 additions and 8 deletions.
cublas_gemm_c.cu: 19 changes (12 additions & 7 deletions)
@@ -25,6 +25,18 @@ __global__ void double2half(half *out, const double *in, int n) {
}
}

+cublasHandle_t cublasHandle;
+
+// Sets up GPU and cuBLAS
+extern "C" {
+void init_gpu_c() {
+cudaSetDevice(0);
+cudaDeviceReset();
+cublasErrCheck(cublasCreate(&cublasHandle));
+cublasErrCheck(cublasSetMathMode(cublasHandle, CUBLAS_TENSOR_OP_MATH));
+}
+}
+
// Performs matrix-matrix multiplication using Tensor Core.
extern "C" {
void tcgemm_c(int transa, int transb, int m, int n, int k, float alpha, void *a_p, int lda, void *b_p,
@@ -41,12 +53,6 @@ extern "C" {
// Compute GEMM using Tensor Core
// =========================================================================

-// Set up GPU and cuBLAS
-cublasHandle_t cublasHandle;
-cudaSetDevice(0);
-cudaDeviceReset();
-cublasErrCheck(cublasCreate(&cublasHandle));
-
// Set up device-side arrays
double *a_d, *b_d;
half *a_d_16, *b_d_16;
@@ -71,7 +77,6 @@ extern "C" {
cudaDeviceSynchronize();

// Perform GEMM with Tensor Core
-cublasErrCheck(cublasSetMathMode(cublasHandle, CUBLAS_TENSOR_OP_MATH));
cublasErrCheck(
cublasGemmEx(
cublasHandle, (cublasOperation_t)transa, (cublasOperation_t)transb,
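
A note on cublasErrCheck: the helper wrapped around each cuBLAS call above is defined elsewhere in cublas_gemm_c.cu (or an included header), outside the hunks shown, so its exact definition is not visible here. A typical checker of this kind, assumed for illustration only, looks like the sketch below.

    #include <cstdio>
    #include <cublas_v2.h>

    // Assumed sketch of an error-check helper like the cublasErrCheck used above;
    // the macro records the call site so a failing cuBLAS call reports file and line.
    static void cublasErrCheck_(cublasStatus_t stat, const char *file, int line) {
        if (stat != CUBLAS_STATUS_SUCCESS) {
            fprintf(stderr, "cuBLAS error %d at %s:%d\n", (int)stat, file, line);
        }
    }
    #define cublasErrCheck(stat) cublasErrCheck_((stat), __FILE__, __LINE__)

    // Minimal usage: wrap any call that returns a cublasStatus_t.
    int main() {
        cublasHandle_t handle;
        cublasErrCheck(cublasCreate(&handle));
        cublasErrCheck(cublasDestroy(handle));
        return 0;
    }
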
cublas_gemm_f.f90: 9 changes (9 additions & 0 deletions)
@@ -16,7 +16,16 @@ subroutine tcgemm_c(transa, transb, m, n, k, alpha, a_p, lda, b_p, ldb, beta, c_
end subroutine
end interface

+interface
+subroutine init_gpu_c() bind(c)
+end subroutine
+end interface
+
contains
+subroutine init_gpu
+call init_gpu_c
+end subroutine
+
!> Perform matrix-matrix multiplication using Tensor Core (wrapper for C
! function).
subroutine tcgemm(transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc)
matmul_test.f90: 4 changes (3 additions & 1 deletion)
@@ -1,5 +1,5 @@
program matmul_test
-use cublas_gemm_f, only: tcgemm
+use cublas_gemm_f, only: init_gpu, tcgemm

implicit none

@@ -39,6 +39,8 @@ program matmul_test
! Device DGEMM (with transpose)
! =========================================================================

+call init_gpu
+
! Call Tensor Core GEMM routine
call cpu_time(tick)
call tcgemm("N", "T", m, m, n, 1.0, a2, m, b2, m, 0.0, c2, m)
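
Taken together, the three files now follow an initialise-once pattern: matmul_test calls init_gpu a single time, and every subsequent tcgemm call reuses the handle and math mode set up there. The sketch below illustrates that pattern in plain CUDA C++ with hypothetical names (handle, init_gpu_once, gemm_fp16) and an illustrative cublasGemmEx call (FP16 inputs, FP32 accumulation, tensor-op algorithm, CUDA 10-era constants); it is not the repository's code.

    #include <cstdio>
    #include <cuda_runtime.h>
    #include <cuda_fp16.h>
    #include <cublas_v2.h>

    // Schematic of the init-once-then-reuse structure; names are hypothetical.
    static cublasHandle_t handle;

    static void init_gpu_once() {
        // One-time setup: pick a device, create the handle, enable tensor-op math.
        cudaSetDevice(0);
        cublasCreate(&handle);
        cublasSetMathMode(handle, CUBLAS_TENSOR_OP_MATH);
    }

    static void gemm_fp16(int n) {
        // FP16 inputs, FP32 accumulation/output, tensor-op algorithm.
        half *a, *b;
        float *c;
        cudaMalloc(&a, n * n * sizeof(half));
        cudaMalloc(&b, n * n * sizeof(half));
        cudaMalloc(&c, n * n * sizeof(float));
        cudaMemset(a, 0, n * n * sizeof(half));   // all-zero bits are a valid FP16 zero
        cudaMemset(b, 0, n * n * sizeof(half));

        float alpha = 1.0f, beta = 0.0f;
        cublasGemmEx(handle, CUBLAS_OP_N, CUBLAS_OP_N, n, n, n,
                     &alpha, a, CUDA_R_16F, n, b, CUDA_R_16F, n,
                     &beta, c, CUDA_R_32F, n,
                     CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP);
        cudaDeviceSynchronize();

        cudaFree(a);
        cudaFree(b);
        cudaFree(c);
    }

    int main() {
        init_gpu_once();                   // initialise once up front...
        for (int i = 0; i < 3; i++) {
            gemm_fp16(256);                // ...then reuse the handle for each GEMM
        }
        cublasDestroy(handle);
        printf("done\n");
        return 0;
    }

Error checking is omitted here for brevity.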
