From f971e460435aafdfcfcad3b2dd39389d1154c64f Mon Sep 17 00:00:00 2001 From: Justin Liutjens Date: Mon, 5 Nov 2018 21:14:58 -0500 Subject: [PATCH 1/2] [src] Use cuda streams in matrix library --- src/cudamatrix/cu-array-inl.h | 8 ++++---- src/cudamatrix/cu-matrix.cc | 15 +++++++++------ src/cudamatrix/cu-packed-matrix.cc | 4 ++-- src/cudamatrix/cu-value.h | 2 +- src/cudamatrix/cu-vector.cc | 12 +++++++----- 5 files changed, 23 insertions(+), 18 deletions(-) diff --git a/src/cudamatrix/cu-array-inl.h b/src/cudamatrix/cu-array-inl.h index ddae19b9a4e..5786de0b126 100644 --- a/src/cudamatrix/cu-array-inl.h +++ b/src/cudamatrix/cu-array-inl.h @@ -139,8 +139,8 @@ void CuArray::CopyFromArray(const CuArrayBase &src) { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { CuTimer tim; - CU_SAFE_CALL(cudaMemcpy(this->data_, src.data_, this->dim_ * sizeof(T), - cudaMemcpyDeviceToDevice)); + CU_SAFE_CALL(cudaMemcpyAsync(this->data_, src.data_, this->dim_ * sizeof(T), + cudaMemcpyDeviceToDevice, cudaStreamPerThread)); CuDevice::Instantiate().AccuProfile(__func__, tim); } else #endif @@ -158,8 +158,8 @@ void CuArrayBase::CopyFromArray(const CuArrayBase &src) { if (CuDevice::Instantiate().Enabled()) { CuTimer tim; CU_SAFE_CALL( - cudaMemcpy(this->data_, src.data_, dim_ * sizeof(T), - cudaMemcpyDeviceToDevice)); + cudaMemcpyAsync(this->data_, src.data_, dim_ * sizeof(T), + cudaMemcpyDeviceToDevice, cudaStreamPerThread)); CuDevice::Instantiate().AccuProfile(__func__, tim); } else #endif diff --git a/src/cudamatrix/cu-matrix.cc b/src/cudamatrix/cu-matrix.cc index beccd9dc4a5..f526d53f10f 100644 --- a/src/cudamatrix/cu-matrix.cc +++ b/src/cudamatrix/cu-matrix.cc @@ -229,8 +229,9 @@ void CuMatrixBase::CopyFromMat(const CuMatrixBase &M, MatrixIndexT dst_pitch = stride_ * sizeof(Real); MatrixIndexT src_pitch = M.Stride() * sizeof(Real); MatrixIndexT width = M.NumCols() * sizeof(Real); - CU_SAFE_CALL(cudaMemcpy2D(data_, dst_pitch, M.data_, src_pitch, - width, M.num_rows_, cudaMemcpyDeviceToDevice)); + CU_SAFE_CALL(cudaMemcpy2DAsync(data_, dst_pitch, M.data_, src_pitch, + width, M.num_rows_, cudaMemcpyDeviceToDevice, + cudaStreamPerThread)); } else { if (trans == kNoTrans) { dim3 dimGrid, dimBlock; @@ -2286,14 +2287,16 @@ void CuMatrixBase::CopyRowsFromVec(const CuVectorBase &v) { if (v.Dim() == num_rows_*num_cols_) { if (stride_ == num_cols_) { const Real* v_data = v.Data(); - CU_SAFE_CALL(cudaMemcpy(data_, v_data, + CU_SAFE_CALL(cudaMemcpyAsync(data_, v_data, sizeof(Real)*num_rows_*num_cols_, - cudaMemcpyDeviceToDevice)); + cudaMemcpyDeviceToDevice, + cudaStreamPerThread)); } else { - CU_SAFE_CALL(cudaMemcpy2D(data_, stride_ * sizeof(Real), v.Data(), + CU_SAFE_CALL(cudaMemcpy2DAsync(data_, stride_ * sizeof(Real), v.Data(), num_cols_*sizeof(Real), num_cols_*sizeof(Real), num_rows_, - cudaMemcpyDeviceToDevice)); + cudaMemcpyDeviceToDevice, + cudaStreamPerThread)); } } else if (v.Dim() == num_cols_) { dim3 dimGrid, dimBlock; diff --git a/src/cudamatrix/cu-packed-matrix.cc b/src/cudamatrix/cu-packed-matrix.cc index 64f8afe0616..3c441b118e4 100644 --- a/src/cudamatrix/cu-packed-matrix.cc +++ b/src/cudamatrix/cu-packed-matrix.cc @@ -143,8 +143,8 @@ void CuPackedMatrix::CopyFromPacked(const CuPackedMatrix &src) { size_t nr = static_cast(num_rows_), num_bytes = ((nr * (nr+1)) / 2) * sizeof(Real); - CU_SAFE_CALL(cudaMemcpy(data_, src.data_, num_bytes, - cudaMemcpyDeviceToDevice)); + CU_SAFE_CALL(cudaMemcpyAsync(data_, src.data_, num_bytes, + cudaMemcpyDeviceToDevice, cudaStreamPerThread)); CuDevice::Instantiate().AccuProfile("CuPackedMatrix::CopyFromPacked1", tim); } else diff --git a/src/cudamatrix/cu-value.h b/src/cudamatrix/cu-value.h index b9b3035ccbd..abb450ae527 100644 --- a/src/cudamatrix/cu-value.h +++ b/src/cudamatrix/cu-value.h @@ -39,7 +39,7 @@ class CuValue { inline CuValue operator = (const CuValue &other) { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { - CU_SAFE_CALL(cudaMemcpy(data_, other.data_, sizeof(Real), cudaMemcpyDeviceToDevice)); + CU_SAFE_CALL(cudaMemcpyAsync(data_, other.data_, sizeof(Real), cudaMemcpyDeviceToDevice, cudaStreamPerThread)); return *this; } else #endif diff --git a/src/cudamatrix/cu-vector.cc b/src/cudamatrix/cu-vector.cc index dcca5a76cde..5dd0b1ab7f8 100644 --- a/src/cudamatrix/cu-vector.cc +++ b/src/cudamatrix/cu-vector.cc @@ -167,14 +167,15 @@ void CuVectorBase::CopyRowsFromMat(const CuMatrixBase &mat) { if (dim_ == 0) return; CuTimer tim; if (mat.Stride() == mat.NumCols() && mat.NumRows() != 0) { - CU_SAFE_CALL(cudaMemcpy(data_, mat.Data(), sizeof(Real)*dim_, - cudaMemcpyDeviceToDevice)); + CU_SAFE_CALL(cudaMemcpyAsync(data_, mat.Data(), sizeof(Real)*dim_, + cudaMemcpyDeviceToDevice, cudaStreamPerThread)); } else { Real* vec_data = data_; for (MatrixIndexT r = 0; r < mat.NumRows(); r++) { - CU_SAFE_CALL(cudaMemcpy(vec_data, mat.RowData(r), + CU_SAFE_CALL(cudaMemcpyAsync(vec_data, mat.RowData(r), sizeof(Real) * mat.NumCols(), - cudaMemcpyDeviceToDevice)); + cudaMemcpyDeviceToDevice, + cudaStreamPerThread)); vec_data += mat.NumCols(); } } @@ -1049,7 +1050,8 @@ void CuVectorBase::CopyFromVec(const CuVectorBase &src) { if (CuDevice::Instantiate().Enabled()) { if (dim_ == 0) return; CuTimer tim; - CU_SAFE_CALL(cudaMemcpy(data_, src.data_, src.dim_ * sizeof(Real), cudaMemcpyDeviceToDevice)); + CU_SAFE_CALL(cudaMemcpyAsync(data_, src.data_, src.dim_ * sizeof(Real), cudaMemcpyDeviceToDevice, + cudaStreamPerThread)); CuDevice::Instantiate().AccuProfile(__func__, tim); } else #endif From 3d774f87c1c9f3d62a912532ec922640194ba264 Mon Sep 17 00:00:00 2001 From: Ryan Leary Date: Mon, 5 Nov 2018 23:54:47 -0500 Subject: [PATCH 2/2] Fix formatting of whitespace and line width --- src/cudamatrix/cu-array-inl.h | 7 ++++--- src/cudamatrix/cu-matrix.cc | 24 ++++++++++++------------ src/cudamatrix/cu-packed-matrix.cc | 5 +++-- src/cudamatrix/cu-value.h | 4 +++- src/cudamatrix/cu-vector.cc | 16 +++++++++------- 5 files changed, 31 insertions(+), 25 deletions(-) diff --git a/src/cudamatrix/cu-array-inl.h b/src/cudamatrix/cu-array-inl.h index 5786de0b126..23b20501d4c 100644 --- a/src/cudamatrix/cu-array-inl.h +++ b/src/cudamatrix/cu-array-inl.h @@ -140,7 +140,8 @@ void CuArray::CopyFromArray(const CuArrayBase &src) { if (CuDevice::Instantiate().Enabled()) { CuTimer tim; CU_SAFE_CALL(cudaMemcpyAsync(this->data_, src.data_, this->dim_ * sizeof(T), - cudaMemcpyDeviceToDevice, cudaStreamPerThread)); + cudaMemcpyDeviceToDevice, + cudaStreamPerThread)); CuDevice::Instantiate().AccuProfile(__func__, tim); } else #endif @@ -158,8 +159,8 @@ void CuArrayBase::CopyFromArray(const CuArrayBase &src) { if (CuDevice::Instantiate().Enabled()) { CuTimer tim; CU_SAFE_CALL( - cudaMemcpyAsync(this->data_, src.data_, dim_ * sizeof(T), - cudaMemcpyDeviceToDevice, cudaStreamPerThread)); + cudaMemcpyAsync(this->data_, src.data_, dim_ * sizeof(T), + cudaMemcpyDeviceToDevice, cudaStreamPerThread)); CuDevice::Instantiate().AccuProfile(__func__, tim); } else #endif diff --git a/src/cudamatrix/cu-matrix.cc b/src/cudamatrix/cu-matrix.cc index f526d53f10f..247c2236565 100644 --- a/src/cudamatrix/cu-matrix.cc +++ b/src/cudamatrix/cu-matrix.cc @@ -229,9 +229,10 @@ void CuMatrixBase::CopyFromMat(const CuMatrixBase &M, MatrixIndexT dst_pitch = stride_ * sizeof(Real); MatrixIndexT src_pitch = M.Stride() * sizeof(Real); MatrixIndexT width = M.NumCols() * sizeof(Real); - CU_SAFE_CALL(cudaMemcpy2DAsync(data_, dst_pitch, M.data_, src_pitch, - width, M.num_rows_, cudaMemcpyDeviceToDevice, - cudaStreamPerThread)); + CU_SAFE_CALL( + cudaMemcpy2DAsync(data_, dst_pitch, M.data_, src_pitch, + width, M.num_rows_, cudaMemcpyDeviceToDevice, + cudaStreamPerThread)); } else { if (trans == kNoTrans) { dim3 dimGrid, dimBlock; @@ -2287,16 +2288,15 @@ void CuMatrixBase::CopyRowsFromVec(const CuVectorBase &v) { if (v.Dim() == num_rows_*num_cols_) { if (stride_ == num_cols_) { const Real* v_data = v.Data(); - CU_SAFE_CALL(cudaMemcpyAsync(data_, v_data, - sizeof(Real)*num_rows_*num_cols_, - cudaMemcpyDeviceToDevice, - cudaStreamPerThread)); + CU_SAFE_CALL( + cudaMemcpyAsync(data_, v_data, sizeof(Real)*num_rows_*num_cols_, + cudaMemcpyDeviceToDevice, cudaStreamPerThread)); } else { - CU_SAFE_CALL(cudaMemcpy2DAsync(data_, stride_ * sizeof(Real), v.Data(), - num_cols_*sizeof(Real), num_cols_*sizeof(Real), - num_rows_, - cudaMemcpyDeviceToDevice, - cudaStreamPerThread)); + CU_SAFE_CALL( + cudaMemcpy2DAsync(data_, stride_ * sizeof(Real), v.Data(), + num_cols_*sizeof(Real), num_cols_*sizeof(Real), + num_rows_, cudaMemcpyDeviceToDevice, + cudaStreamPerThread)); } } else if (v.Dim() == num_cols_) { dim3 dimGrid, dimBlock; diff --git a/src/cudamatrix/cu-packed-matrix.cc b/src/cudamatrix/cu-packed-matrix.cc index 3c441b118e4..d4dbdf12143 100644 --- a/src/cudamatrix/cu-packed-matrix.cc +++ b/src/cudamatrix/cu-packed-matrix.cc @@ -143,8 +143,9 @@ void CuPackedMatrix::CopyFromPacked(const CuPackedMatrix &src) { size_t nr = static_cast(num_rows_), num_bytes = ((nr * (nr+1)) / 2) * sizeof(Real); - CU_SAFE_CALL(cudaMemcpyAsync(data_, src.data_, num_bytes, - cudaMemcpyDeviceToDevice, cudaStreamPerThread)); + CU_SAFE_CALL( + cudaMemcpyAsync(data_, src.data_, num_bytes, cudaMemcpyDeviceToDevice, + cudaStreamPerThread)); CuDevice::Instantiate().AccuProfile("CuPackedMatrix::CopyFromPacked1", tim); } else diff --git a/src/cudamatrix/cu-value.h b/src/cudamatrix/cu-value.h index abb450ae527..cab0a3235d7 100644 --- a/src/cudamatrix/cu-value.h +++ b/src/cudamatrix/cu-value.h @@ -39,7 +39,9 @@ class CuValue { inline CuValue operator = (const CuValue &other) { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { - CU_SAFE_CALL(cudaMemcpyAsync(data_, other.data_, sizeof(Real), cudaMemcpyDeviceToDevice, cudaStreamPerThread)); + CU_SAFE_CALL( + cudaMemcpyAsync(data_, other.data_, sizeof(Real), + cudaMemcpyDeviceToDevice, cudaStreamPerThread)); return *this; } else #endif diff --git a/src/cudamatrix/cu-vector.cc b/src/cudamatrix/cu-vector.cc index 5dd0b1ab7f8..536e55d8a3b 100644 --- a/src/cudamatrix/cu-vector.cc +++ b/src/cudamatrix/cu-vector.cc @@ -167,15 +167,16 @@ void CuVectorBase::CopyRowsFromMat(const CuMatrixBase &mat) { if (dim_ == 0) return; CuTimer tim; if (mat.Stride() == mat.NumCols() && mat.NumRows() != 0) { - CU_SAFE_CALL(cudaMemcpyAsync(data_, mat.Data(), sizeof(Real)*dim_, - cudaMemcpyDeviceToDevice, cudaStreamPerThread)); + CU_SAFE_CALL( + cudaMemcpyAsync(data_, mat.Data(), sizeof(Real)*dim_, + cudaMemcpyDeviceToDevice, cudaStreamPerThread)); } else { Real* vec_data = data_; for (MatrixIndexT r = 0; r < mat.NumRows(); r++) { CU_SAFE_CALL(cudaMemcpyAsync(vec_data, mat.RowData(r), - sizeof(Real) * mat.NumCols(), - cudaMemcpyDeviceToDevice, - cudaStreamPerThread)); + sizeof(Real) * mat.NumCols(), + cudaMemcpyDeviceToDevice, + cudaStreamPerThread)); vec_data += mat.NumCols(); } } @@ -1050,8 +1051,9 @@ void CuVectorBase::CopyFromVec(const CuVectorBase &src) { if (CuDevice::Instantiate().Enabled()) { if (dim_ == 0) return; CuTimer tim; - CU_SAFE_CALL(cudaMemcpyAsync(data_, src.data_, src.dim_ * sizeof(Real), cudaMemcpyDeviceToDevice, - cudaStreamPerThread)); + CU_SAFE_CALL( + cudaMemcpyAsync(data_, src.data_, src.dim_ * sizeof(Real), + cudaMemcpyDeviceToDevice, cudaStreamPerThread)); CuDevice::Instantiate().AccuProfile(__func__, tim); } else #endif