#include <stdio.h>
#include <cublas_v2.h>
#include <cuda_fp16.h>

// Handles CUDA errors
#define cudaErrCheck(stat) { cudaErrCheck_((stat), __FILE__, __LINE__); }
void cudaErrCheck_(cudaError_t stat, const char *file, int line) {
    if (stat != cudaSuccess) {
        fprintf(stderr, "CUDA Error: %s %s %d\n", cudaGetErrorString(stat), file, line);
    }
}

// Handles cuBLAS errors
#define cublasErrCheck(stat) { cublasErrCheck_((stat), __FILE__, __LINE__); }
void cublasErrCheck_(cublasStatus_t stat, const char *file, int line) {
    if (stat != CUBLAS_STATUS_SUCCESS) {
        fprintf(stderr, "cuBLAS Error: %d %s %d\n", stat, file, line);
    }
}

// Converts from double-precision to half-precision (CUDA kernel)
__global__ void double2half(half *out, const double *in, int n) {
    int idx = blockDim.x * blockIdx.x + threadIdx.x;
    if (idx < n) {
        out[idx] = __float2half((float)(in[idx]));
    }
}

cublasHandle_t cublasHandle;

// Device-side arrays
double *a_d, *b_d;
half *a_d_16, *b_d_16;
float *c_d_32;

// Sets up GPU and cuBLAS and allocates memory
extern "C" {
    void init_gpu_c(int m, int n, int k) {
        cudaSetDevice(0);
        cudaDeviceReset();

        cublasErrCheck(cublasCreate(&cublasHandle));
        cublasErrCheck(cublasSetMathMode(cublasHandle, CUBLAS_TENSOR_OP_MATH));

        // Allocate memory on device for all arrays
        // TODO: should the dimensions used below (m*k etc.) take into account transa, lda etc.?
        cudaErrCheck(cudaMalloc((void **)&a_d, m*k*sizeof(double)));
        cudaErrCheck(cudaMalloc((void **)&b_d, k*n*sizeof(double)));
        cudaErrCheck(cudaMalloc((void **)&a_d_16, m*k*sizeof(half)));
        cudaErrCheck(cudaMalloc((void **)&b_d_16, k*n*sizeof(half)));
        cudaErrCheck(cudaMalloc((void **)&c_d_32, m*n*sizeof(float)));
    }
}

// Frees device memory
extern "C" {
    void fin_gpu_c() {
        // Free memory on device
        cudaErrCheck(cudaFree(a_d));
        cudaErrCheck(cudaFree(b_d));
        cudaErrCheck(cudaFree(a_d_16));
        cudaErrCheck(cudaFree(b_d_16));
        cudaErrCheck(cudaFree(c_d_32));
    }
}

// Performs matrix-matrix multiplication using Tensor Core.
extern "C" {
    void tcgemm_c(int transa, int transb, int m, int n, int k, float alpha, void *a_p, int lda,
                  void *b_p, int ldb, float beta, void *c_p, int ldc) {

        // Set up host-side arrays
        double *a_h, *b_h;
        float *c_h;
        a_h = (double *)a_p;
        b_h = (double *)b_p;
        c_h = (float *)c_p;

        // =========================================================================
        // Compute GEMM using Tensor Core
        // =========================================================================

        // Copy input arrays to device
        cudaErrCheck(cudaMemcpy(a_d, a_h, m*k*sizeof(double), cudaMemcpyHostToDevice));
        cudaErrCheck(cudaMemcpy(b_d, b_h, k*n*sizeof(double), cudaMemcpyHostToDevice));

        // Convert arrays to half-precision
        double2half<<<(int)((m*k)/256) + 1, 256>>>(a_d_16, a_d, m*k);
        double2half<<<(int)((k*n)/256) + 1, 256>>>(b_d_16, b_d, k*n);

        // Perform GEMM with Tensor Core
        cublasErrCheck(
            cublasGemmEx(
                cublasHandle,
                (cublasOperation_t)transa, (cublasOperation_t)transb,
                m, n, k,
                &alpha,
                a_d_16, CUDA_R_16F, lda,
                b_d_16, CUDA_R_16F, ldb,
                &beta,
                c_d_32, CUDA_R_32F, ldc,
                CUDA_R_32F,
                CUBLAS_GEMM_DEFAULT_TENSOR_OP
            )
        );

        // Copy results back from device to host
        cudaErrCheck(cudaMemcpy(c_h, c_d_32, m*n*sizeof(float), cudaMemcpyDeviceToHost));
    }
}
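// ---------------------------------------------------------------------------
// Illustrative usage sketch (not part of the original interface). A minimal
// host-side driver, assuming the extern "C" routines above are called from
// C/C++ with column-major matrices, as cuBLAS expects. The guard macro
// TCGEMM_EXAMPLE and the example file name are assumptions for illustration
// only; build with something like:
//   nvcc -DTCGEMM_EXAMPLE tcgemm.cu -lcublas -o tcgemm_example
// ---------------------------------------------------------------------------
#ifdef TCGEMM_EXAMPLE
int main() {
    const int m = 2, n = 2, k = 2;

    // Column-major input matrices in double precision and a single-precision
    // output matrix, matching the types expected by tcgemm_c.
    double a[m*k] = {1.0, 2.0, 3.0, 4.0};   // A = [1 3; 2 4]
    double b[k*n] = {5.0, 6.0, 7.0, 8.0};   // B = [5 7; 6 8]
    float  c[m*n] = {0.0f};

    init_gpu_c(m, n, k);

    // C = 1.0 * A * B + 0.0 * C, with no transposition (CUBLAS_OP_N).
    tcgemm_c((int)CUBLAS_OP_N, (int)CUBLAS_OP_N, m, n, k,
             1.0f, a, m, b, k, 0.0f, c, m);

    fin_gpu_c();

    // Print the result row by row (C is stored column-major).
    for (int i = 0; i < m; i++) {
        for (int j = 0; j < n; j++) {
            printf("%8.2f ", c[i + j*m]);
        }
        printf("\n");
    }
    return 0;
}
#endif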