This repository has been archived by the owner on Nov 17, 2023. It is now read-only.

No tensor cores for fp32 interleaved attention, remove div by 8 restriction (#17994) #18085

Merged: 1 commit merged on Apr 16, 2020
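In brief, this patch changes how `gemm_switch_fp32accum` in `src/operator/contrib/transformer.cu` picks the cuBLAS GEMM algorithm for the interleaved multi-head attention operators. Previously the tensor-op algorithm was requested whenever the leading dimensions `lda`, `ldb`, and `ldc` were all multiples of 8, and `CublasStridedBatchedGemm` defaulted to `CUBLAS_GEMM_DEFAULT_TENSOR_OP`. Now each operator derives a `using_fp16` flag from `inputs[0].type_flag_ == mshadow::kFloat16` and threads it through, so the tensor-op algorithm is requested only for fp16 data and fp32 always uses `CUBLAS_GEMM_DEFAULT`, removing the divisible-by-8 restriction for fp32. At the time of this change, cuBLAS tensor-core kernels targeted fp16 GEMMs, so the tensor-op path brought no benefit for fp32 while its alignment expectations forced that restriction. A minimal sketch of the before/after selection logic follows, assuming cuBLAS headers from CUDA 9.1 or later; `pick_algo_before` and `pick_algo_after` are illustrative names, not functions from the patch.

```cpp
// Minimal sketch of the algorithm-selection change, not the exact MXNet code.
// Requires the cuBLAS headers (CUDA >= 9.1) for cublasGemmAlgo_t.
#include <cublas_v2.h>

// Before: the tensor-op algorithm was requested whenever all leading
// dimensions were multiples of 8, regardless of the data type.
inline cublasGemmAlgo_t pick_algo_before(int lda, int ldb, int ldc) {
  const bool aligned = !(lda & 0x7) && !(ldb & 0x7) && !(ldc & 0x7);
  return aligned ? CUBLAS_GEMM_DEFAULT_TENSOR_OP : CUBLAS_GEMM_DEFAULT;
}

// After: the tensor-op algorithm is requested only for fp16 inputs; the flag
// is computed by each operator as inputs[0].type_flag_ == mshadow::kFloat16
// and passed into gemm_switch_fp32accum.
inline cublasGemmAlgo_t pick_algo_after(bool using_fp16) {
  return using_fp16 ? CUBLAS_GEMM_DEFAULT_TENSOR_OP : CUBLAS_GEMM_DEFAULT;
}
```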
53 changes: 37 additions & 16 deletions src/operator/contrib/transformer.cu
@@ -43,7 +43,7 @@ void CublasStridedBatchedGemm(mshadow::Stream<gpu>* s, bool transA, bool transB,
float alpha, const DType* a, int32_t lda, int32_t strideA,
const DType *b, int32_t ldb, int32_t strideB, float beta,
DType *c, int32_t ldc, int32_t strideC, int32_t batchCount,
- cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT_TENSOR_OP) {
+ cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT) {
#if CUDA_VERSION >= 9010
using namespace mxnet::common::cuda;
CHECK_EQ(s->blas_handle_ownership_, mshadow::Stream<gpu>::OwnHandle)
@@ -142,9 +142,9 @@ void gemm_switch_fp32accum(mshadow::Stream<gpu>* s, bool transA, bool transB,
float alpha, const DType *a, int32_t lda,
int32_t strideA, const DType *b, int32_t ldb,
int32_t strideB, float beta, DType *c, int32_t ldc,
- int32_t strideC, int32_t batchCount) {
+ int32_t strideC, int32_t batchCount, bool using_fp16) {
cudaStream_t stream = mshadow::Stream<gpu>::GetStream(s);
- if (!(lda & 0x7) && !(ldb & 0x7) && !(ldc & 0x7)) {
+ if (using_fp16) {
CublasStridedBatchedGemm(s, transA, transB, m, n, k, alpha, a, lda, strideA, b, ldb,
strideB, beta, c, ldc, strideC, batchCount, CUBLAS_GEMM_DEFAULT_TENSOR_OP);
} else {
@@ -175,6 +175,7 @@ void InterleavedMatMulSelfAttQKGPU(const nnvm::NodeAttrs& attrs,
const int32_t batch_stride = 3 * head_dim;
const float beta = req[0] == kAddTo ? 1.f : 0.f;
const float scale = 1.0 / sqrt(static_cast<float>(head_dim));
+ const bool using_fp16 = inputs[0].type_flag_ == mshadow::kFloat16;

if (req[0] == kNullOp)
return;
@@ -196,7 +197,8 @@ void InterleavedMatMulSelfAttQKGPU(const nnvm::NodeAttrs& attrs,
output,
qkv_seq_len,
qkv_seq_len * qkv_seq_len,
- attn_batches);
+ attn_batches,
+ using_fp16);
})
}

@@ -220,7 +222,8 @@ void BackwardInterleavedMatMulSelfAttQKGPU(const nnvm::NodeAttrs& attrs,
const int32_t lead_dim = attn_batches * 3 * head_dim;
const int32_t batch_stride = 3 * head_dim;
const float scale = 1.0 / sqrt(static_cast<float>(head_dim));
- const float beta = req[0] == kAddTo ? 1.f : 0.f;
+ const float beta = req[0] == kAddTo ? 1.f : 0.f;
+ const bool using_fp16 = inputs[0].type_flag_ == mshadow::kFloat16;

if (req[0] == kNullOp)
return;
@@ -247,7 +250,8 @@ void BackwardInterleavedMatMulSelfAttQKGPU(const nnvm::NodeAttrs& attrs,
queries_keys_values_grads,
lead_dim,
batch_stride,
- attn_batches);
+ attn_batches,
+ using_fp16);
gemm_switch_fp32accum(s,
false,
true,
@@ -265,7 +269,8 @@ void BackwardInterleavedMatMulSelfAttQKGPU(const nnvm::NodeAttrs& attrs,
queries_keys_values_grads + head_dim,
lead_dim,
batch_stride,
- attn_batches);
+ attn_batches,
+ using_fp16);
})
}

@@ -290,6 +295,7 @@ void InterleavedMatMulSelfAttValAttGPU(const nnvm::NodeAttrs& attrs,
const int32_t batch_stride = 3 * head_dim;
const float alpha = 1.f;
const float beta = req[0] == kAddTo ? 1.f : 0.f;
+ const bool using_fp16 = inputs[0].type_flag_ == mshadow::kFloat16;

if (req[0] == kNullOp)
return;
@@ -311,7 +317,8 @@ void InterleavedMatMulSelfAttValAttGPU(const nnvm::NodeAttrs& attrs,
output,
head_dim * attn_batches,
head_dim,
- attn_batches);
+ attn_batches,
+ using_fp16);
})
}

@@ -337,6 +344,8 @@ void BackwardInterleavedMatMulSelfAttValAttGPU(const nnvm::NodeAttrs& attrs,
const int32_t lead_dim = attn_batches * 3 * head_dim;
const int32_t batch_stride = 3 * head_dim;
const float alpha = 1.f;
+ const bool using_fp16 = inputs[0].type_flag_ == mshadow::kFloat16;
+
if (req[0] != kNullOp) {
if (req[0] == kWriteTo) {
cudaMemsetAsync(queries_keys_values_grads, 0, outputs[0].shape_.Size() * sizeof(DType),
@@ -360,7 +369,8 @@ void BackwardInterleavedMatMulSelfAttValAttGPU(const nnvm::NodeAttrs& attrs,
queries_keys_values_grads + 2 * head_dim,
lead_dim,
batch_stride,
- attn_batches);
+ attn_batches,
+ using_fp16);
}
if (req[1] != kNullOp) {
const float beta = req[1] == kAddTo ? 1.f : 0.f;
@@ -381,7 +391,8 @@ void BackwardInterleavedMatMulSelfAttValAttGPU(const nnvm::NodeAttrs& attrs,
attention_maps_grads,
qkv_seq_len,
qkv_seq_len * qkv_seq_len,
- attn_batches);
+ attn_batches,
+ using_fp16);
}
})
}
@@ -412,6 +423,7 @@ void InterleavedMatMulEncDecQKGPU(const nnvm::NodeAttrs& attrs,
const int32_t batch_stride_kv = head_dim * 2;
const float beta = req[0] == kAddTo ? 1.f : 0.f;
const float scale = 1.f / sqrt(static_cast<float>(head_dim));
+ const bool using_fp16 = inputs[0].type_flag_ == mshadow::kFloat16;

if (req[0] == kNullOp)
return;
@@ -433,7 +445,8 @@ void InterleavedMatMulEncDecQKGPU(const nnvm::NodeAttrs& attrs,
output,
kv_seq_len,
kv_seq_len * q_seq_len,
- attn_batches);
+ attn_batches,
+ using_fp16);
})
}

@@ -463,6 +476,7 @@ void BackwardInterleavedMatMulEncDecQKGPU(const nnvm::NodeAttrs& attrs,
const int32_t batch_stride_q = head_dim;
const int32_t batch_stride_kv = head_dim * 2;
const float scale = 1.f / sqrt(static_cast<float>(head_dim));
+ const bool using_fp16 = inputs[0].type_flag_ == mshadow::kFloat16;

if (req[0] != kNullOp) {
const float beta = req[0] == kAddTo ? 1.f : 0.f;
@@ -483,7 +497,8 @@ void BackwardInterleavedMatMulEncDecQKGPU(const nnvm::NodeAttrs& attrs,
queries_grads,
lead_dim_q,
batch_stride_q,
- attn_batches);
+ attn_batches,
+ using_fp16);
}
if (req[1] != kNullOp) {
if (req[1] == kWriteTo) {
@@ -508,7 +523,8 @@ void BackwardInterleavedMatMulEncDecQKGPU(const nnvm::NodeAttrs& attrs,
keys_values_grads,
lead_dim_kv,
batch_stride_kv,
- attn_batches);
+ attn_batches,
+ using_fp16);
}
})
}
@@ -535,6 +551,7 @@ void InterleavedMatMulEncDecValAttGPU(const nnvm::NodeAttrs& attrs,
const int32_t batch_stride_kv = 2 * head_dim;
const float alpha = 1.f;
const float beta = req[0] == kAddTo ? 1.f : 0.f;
+ const bool using_fp16 = inputs[0].type_flag_ == mshadow::kFloat16;

if (req[0] == kNullOp)
return;
@@ -556,7 +573,8 @@ void InterleavedMatMulEncDecValAttGPU(const nnvm::NodeAttrs& attrs,
output,
head_dim * attn_batches,
head_dim,
- attn_batches);
+ attn_batches,
+ using_fp16);
})
}

@@ -583,6 +601,7 @@ void BackwardInterleavedMatMulEncDecValAttGPU(const nnvm::NodeAttrs& attrs,
const int32_t lead_dim_kv = attn_batches * head_dim * 2;
const int32_t batch_stride_kv = 2 * head_dim;
const float alpha = 1.f;
+ const bool using_fp16 = inputs[0].type_flag_ == mshadow::kFloat16;

if (req[0] != kNullOp) {
if (req[0] == kWriteTo) {
@@ -607,7 +626,8 @@ void BackwardInterleavedMatMulEncDecValAttGPU(const nnvm::NodeAttrs& attrs,
keys_values_grads + head_dim,
lead_dim_kv,
batch_stride_kv,
- attn_batches);
+ attn_batches,
+ using_fp16);
}
if (req[1] != kNullOp) {
const float beta = req[1] == kAddTo ? 1.f : 0.f;
@@ -628,7 +648,8 @@ void BackwardInterleavedMatMulEncDecValAttGPU(const nnvm::NodeAttrs& attrs,
attention_maps_grads,
kv_seq_len,
kv_seq_len * q_seq_len,
- attn_batches);
+ attn_batches,
+ using_fp16);
}
})
}