Merge fix for norm calculations

Previously, all norm calculations resulted in a norm vector of `ValueType` instead of `remove_complex<ValueType>`. Additionally, some norms were calculated incorrectly (`(x * x)` instead of `(x * conj(x))`). Both issues are resolved by this PR. Related PR: #564
ginkgo-project · Jun 17, 2020 · 54e479a · 54e479a
2 parents ba6f99a + 335967b
commit 54e479a
Show file tree

Hide file tree

Showing 38 changed files with 433 additions and 281 deletions.
diff --git a/common/matrix/dense_kernels.hpp.inc b/common/matrix/dense_kernels.hpp.inc
@@ -73,11 +73,12 @@ __global__ __launch_bounds__(block_size) void add_scaled(
 }
 
 
-template <size_type block_size, typename ValueType>
-__global__ __launch_bounds__(block_size) void compute_partial_dot(
- size_type num_rows, const ValueType *__restrict__ x, size_type stride_x,
- const ValueType *__restrict__ y, size_type stride_y,
- ValueType *__restrict__ work)
+template <size_type block_size, typename OutType, typename CallableGetValue,
+ typename CallableReduce>
+__device__ void compute_partial_reduce(size_type num_rows,
+ OutType *__restrict__ work,
+ CallableGetValue get_value,
+ CallableReduce reduce_op)
 {
  constexpr auto warps_per_block = block_size / config::warp_size;
 
@@ -86,52 +87,95 @@ __global__ __launch_bounds__(block_size) void compute_partial_dot(
  const auto global_id =
  thread::get_thread_id<config::warp_size, warps_per_block>();
 
- auto tmp = zero<ValueType>();
+ auto tmp = zero<OutType>();
  for (auto i = global_id; i < num_rows; i += block_size * num_blocks) {
- tmp += x[i * stride_x] * y[i * stride_y];
+ tmp = reduce_op(tmp, get_value(i));
  }
- __shared__ UninitializedArray<ValueType, block_size> tmp_work;
+ __shared__ UninitializedArray<OutType, block_size> tmp_work;
  tmp_work[local_id] = tmp;
 
- reduce(group::this_thread_block(), static_cast<ValueType *>(tmp_work),
- [](const ValueType &x, const ValueType &y) { return x + y; });
+ reduce(group::this_thread_block(), static_cast<OutType *>(tmp_work),
+ reduce_op);
 
  if (local_id == 0) {
  work[thread::get_block_id()] = tmp_work[0];
  }
 }
 
 
-template <size_type block_size, typename ValueType>
-__global__ __launch_bounds__(block_size) void finalize_dot_computation(
- size_type size, const ValueType *work, ValueType *result)
+template <size_type block_size, typename ValueType, typename CallableReduce,
+ typename CallableFinalize>
+__device__ void finalize_reduce_computation(size_type size,
+ const ValueType *work,
+ ValueType *result,
+ CallableReduce reduce_op,
+ CallableFinalize finalize_op)
 {
  const auto local_id = thread::get_local_thread_id<config::warp_size>();
 
  ValueType tmp = zero<ValueType>();
  for (auto i = local_id; i < size; i += block_size) {
- tmp += work[i];
+ tmp = reduce_op(tmp, work[i]);
  }
  __shared__ UninitializedArray<ValueType, block_size> tmp_work;
  tmp_work[local_id] = tmp;
 
  reduce(group::this_thread_block(), static_cast<ValueType *>(tmp_work),
- [](const ValueType &x, const ValueType &y) { return x + y; });
+ reduce_op);
 
  if (local_id == 0) {
- *result = tmp_work[0];
+ *result = finalize_op(tmp_work[0]);
  }
 }
 
 
-template <typename ValueType>
-__global__ __launch_bounds__(default_block_size) void compute_sqrt(
- size_type num_cols, ValueType *__restrict__ work)
+template <size_type block_size, typename ValueType>
+__global__ __launch_bounds__(block_size) void compute_partial_dot(
+ size_type num_rows, const ValueType *__restrict__ x, size_type stride_x,
+ const ValueType *__restrict__ y, size_type stride_y,
+ ValueType *__restrict__ work)
 {
- const auto tidx = thread::get_thread_id_flat();
- if (tidx < num_cols) {
- work[tidx] = sqrt(abs(work[tidx]));
- }
+ compute_partial_reduce<block_size>(
+ num_rows, work,
+ [x, stride_x, y, stride_y](size_type i) {
+ return x[i * stride_x] * conj(y[i * stride_y]);
+ },
+ [](const ValueType &x, const ValueType &y) { return x + y; });
+}
+
+
+template <size_type block_size, typename ValueType>
+__global__ __launch_bounds__(block_size) void finalize_dot_computation(
+ size_type size, const ValueType *work, ValueType *result)
+{
+ finalize_reduce_computation<block_size>(
+ size, work, result,
+ [](const ValueType &x, const ValueType &y) { return x + y; },
+ [](const ValueType &x) { return x; });
+}
+
+
+template <size_type block_size, typename ValueType>
+__global__ __launch_bounds__(block_size) void compute_partial_norm2(
+ size_type num_rows, const ValueType *__restrict__ x, size_type stride_x,
+ remove_complex<ValueType> *__restrict__ work)
+{
+ using norm_type = remove_complex<ValueType>;
+ compute_partial_reduce<block_size>(
+ num_rows, work,
+ [x, stride_x](size_type i) { return squared_norm(x[i * stride_x]); },
+ [](const norm_type &x, const norm_type &y) { return x + y; });
+}
+
+
+template <size_type block_size, typename ValueType>
+__global__ __launch_bounds__(block_size) void finalize_norm2_computation(
+ size_type size, const ValueType *work, ValueType *result)
+{
+ finalize_reduce_computation<block_size>(
+ size, work, result,
+ [](const ValueType &x, const ValueType &y) { return x + y; },
+ [](const ValueType &x) { return sqrt(x); });
 }
 
 

diff --git a/common/solver/gmres_kernels.hpp.inc b/common/solver/gmres_kernels.hpp.inc
@@ -65,35 +65,12 @@ __global__ __launch_bounds__(block_size) void initialize_1_kernel(
 }
 
 
-// Must be called with at least `num_rows * stride_krylov` threads in total.
-template <size_type block_size, typename ValueType>
-__global__ __launch_bounds__(block_size) void initialize_2_1_kernel(
- size_type num_rows, size_type num_rhs, size_type krylov_dim,
- ValueType *__restrict__ krylov_bases, size_type stride_krylov,
- ValueType *__restrict__ residual_norm_collection,
- size_type stride_residual_nc)
-{
- const auto global_id = thread::get_thread_id_flat();
- const auto row_idx = global_id / stride_krylov;
- const auto col_idx = global_id % stride_krylov;
- const auto krylov_bases_nrows = (krylov_dim + 1) * num_rows;
- if (row_idx < krylov_bases_nrows && col_idx < num_rhs) {
- krylov_bases[row_idx * stride_krylov + col_idx] = zero<ValueType>();
- }
-
- if (row_idx < krylov_dim + 1 && col_idx < num_rhs) {
- residual_norm_collection[row_idx * stride_residual_nc + col_idx] =
- zero<ValueType>();
- }
-}
-
-
 // Must be called with at least `num_rows * num_rhs` threads in total.
 template <size_type block_size, typename ValueType>
 __global__ __launch_bounds__(block_size) void initialize_2_2_kernel(
  size_type num_rows, size_type num_rhs,
  const ValueType *__restrict__ residual, size_type stride_residual,
- const ValueType *__restrict__ residual_norm,
+ const remove_complex<ValueType> *__restrict__ residual_norm,
  ValueType *__restrict__ residual_norm_collection,
  ValueType *__restrict__ krylov_bases, size_type stride_krylov,
  size_type *__restrict__ final_iter_nums)
@@ -293,8 +270,10 @@ template <typename ValueType>
 __device__ void calculate_residual_norm_kernel(
  size_type col_idx, size_type num_cols, size_type iter,
  const ValueType &register_sin, const ValueType &register_cos,
- ValueType *residual_norm, ValueType *residual_norm_collection,
- size_type stride_residual_norm_collection, const ValueType *b_norm)
+ remove_complex<ValueType> *residual_norm,
+ ValueType *residual_norm_collection,
+ size_type stride_residual_norm_collection,
+ const remove_complex<ValueType> *b_norm)
 {
  const auto this_rnc =
  residual_norm_collection[iter * stride_residual_norm_collection +
@@ -315,10 +294,10 @@ __global__ __launch_bounds__(block_size) void givens_rotation_kernel(
  ValueType *__restrict__ hessenberg_iter, size_type stride_hessenberg,
  ValueType *__restrict__ givens_sin, size_type stride_sin,
  ValueType *__restrict__ givens_cos, size_type stride_cos,
- ValueType *__restrict__ residual_norm,
+ remove_complex<ValueType> *__restrict__ residual_norm,
  ValueType *__restrict__ residual_norm_collection,
  size_type stride_residual_norm_collection,
- const ValueType *__restrict__ b_norm,
+ const remove_complex<ValueType> *__restrict__ b_norm,
  const stopping_status *__restrict__ stop_status)
 {
  const auto col_idx = thread::get_thread_id_flat();

diff --git a/core/device_hooks/common_kernels.inc.cpp b/core/device_hooks/common_kernels.inc.cpp
@@ -967,7 +967,8 @@ namespace residual_norm_reduction {
 template <typename ValueType>
 GKO_DECLARE_RESIDUAL_NORM_REDUCTION_KERNEL(ValueType)
 GKO_NOT_COMPILED(GKO_HOOK_MODULE);
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_RESIDUAL_NORM_REDUCTION_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(
+ GKO_DECLARE_RESIDUAL_NORM_REDUCTION_KERNEL);
 
 
 } // namespace residual_norm_reduction

diff --git a/core/log/convergence.cpp b/core/log/convergence.cpp
@@ -34,6 +34,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
 #include <ginkgo/core/base/array.hpp>
+#include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/stop/criterion.hpp>
 #include <ginkgo/core/stop/stopping_status.hpp>
 
@@ -59,7 +60,8 @@ void Convergence<ValueType>::on_criterion_check_completed(
  this->residual_norm_.reset(residual_norm->clone().release());
  } else if (residual != nullptr) {
  using Vector = matrix::Dense<ValueType>;
- this->residual_norm_ = Vector::create(
+ using NormVector = matrix::Dense<remove_complex<ValueType>>;
+ this->residual_norm_ = NormVector::create(
  residual->get_executor(), dim<2>{1, residual->get_size()[1]});
  auto dense_r = as<Vector>(residual);
  dense_r->compute_norm2(this->residual_norm_.get());

diff --git a/core/matrix/dense.cpp b/core/matrix/dense.cpp
@@ -267,10 +267,11 @@ void Dense<ValueType>::compute_dot_impl(const LinOp *b, LinOp *result) const
 template <typename ValueType>
 void Dense<ValueType>::compute_norm2_impl(LinOp *result) const
 {
+ using NormVector = Dense<remove_complex<ValueType>>;
  GKO_ASSERT_EQUAL_DIMENSIONS(result, dim<2>(1, this->get_size()[1]));
  auto exec = this->get_executor();
  exec->run(dense::make_compute_norm2(as<Dense<ValueType>>(this),
- as<Dense<ValueType>>(result)));
+ as<NormVector>(result)));
 }
 
 

diff --git a/core/matrix/dense_kernels.hpp b/core/matrix/dense_kernels.hpp
@@ -37,6 +37,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/matrix/dense.hpp>
 
 
+#include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/base/types.hpp>
 
 
@@ -73,7 +74,7 @@ namespace kernels {
 #define GKO_DECLARE_DENSE_COMPUTE_NORM2_KERNEL(_type) \
  void compute_norm2(std::shared_ptr<const DefaultExecutor> exec, \
  const matrix::Dense<_type> *x, \
- matrix::Dense<_type> *result)
+ matrix::Dense<remove_complex<_type>> *result)
 
 #define GKO_DECLARE_DENSE_CONVERT_TO_COO_KERNEL(_type, _prec) \
  void convert_to_coo(std::shared_ptr<const DefaultExecutor> exec, \

diff --git a/core/solver/fcg.cpp b/core/solver/fcg.cpp
@@ -93,7 +93,6 @@ void Fcg<ValueType>::apply_impl(const LinOp *b, LinOp *x) const
  constexpr uint8 RelativeStoppingId{1};
 
  auto exec = this->get_executor();
- size_type num_vectors = dense_b->get_size()[1];
 
  auto one_op = initialize<Vector>({one<ValueType>()}, exec);
  auto neg_one_op = initialize<Vector>({-one<ValueType>()}, exec);

diff --git a/core/solver/gmres.cpp b/core/solver/gmres.cpp
@@ -97,6 +97,7 @@ void Gmres<ValueType>::apply_impl(const LinOp *b, LinOp *x) const
  GKO_ASSERT_IS_SQUARE_MATRIX(system_matrix_);
 
  using Vector = matrix::Dense<ValueType>;
+ using NormVector = matrix::Dense<remove_complex<ValueType>>;
 
  constexpr uint8 RelativeStoppingId{1};
 
@@ -122,8 +123,8 @@ void Gmres<ValueType>::apply_impl(const LinOp *b, LinOp *x) const
  auto residual_norm_collection =
  Vector::create(exec, dim<2>{krylov_dim_ + 1, dense_b->get_size()[1]});
  auto residual_norm =
- Vector::create(exec, dim<2>{1, dense_b->get_size()[1]});
- auto b_norm = Vector::create(exec, dim<2>{1, dense_b->get_size()[1]});
+ NormVector::create(exec, dim<2>{1, dense_b->get_size()[1]});
+ auto b_norm = NormVector::create(exec, dim<2>{1, dense_b->get_size()[1]});
  Array<size_type> final_iter_nums(this->get_executor(),
  dense_b->get_size()[1]);
  auto y = Vector::create(exec, dim<2>{krylov_dim_, dense_b->get_size()[1]});

diff --git a/core/solver/gmres_kernels.hpp b/core/solver/gmres_kernels.hpp
@@ -40,6 +40,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/matrix/dense.hpp>
 #include <ginkgo/core/stop/stopping_status.hpp>
 
+
 namespace gko {
 namespace kernels {
 namespace gmres {
@@ -48,31 +49,32 @@ namespace gmres {
 #define GKO_DECLARE_GMRES_INITIALIZE_1_KERNEL(_type) \
  void initialize_1( \
  std::shared_ptr<const DefaultExecutor> exec, \
- const matrix::Dense<_type> *b, matrix::Dense<_type> *b_norm, \
+ const matrix::Dense<_type> *b, \
+ matrix::Dense<remove_complex<_type>> *b_norm, \
  matrix::Dense<_type> *residual, matrix::Dense<_type> *givens_sin, \
  matrix::Dense<_type> *givens_cos, Array<stopping_status> *stop_status, \
  size_type krylov_dim)
 
 
-#define GKO_DECLARE_GMRES_INITIALIZE_2_KERNEL(_type) \
- void initialize_2(std::shared_ptr<const DefaultExecutor> exec, \
- const matrix::Dense<_type> *residual, \
- matrix::Dense<_type> *residual_norm,  \
- matrix::Dense<_type> *residual_norm_collection, \
- matrix::Dense<_type> *krylov_bases, \
+#define GKO_DECLARE_GMRES_INITIALIZE_2_KERNEL(_type)  \
+ void initialize_2(std::shared_ptr<const DefaultExecutor> exec,  \
+ const matrix::Dense<_type> *residual,  \
+ matrix::Dense<remove_complex<_type>> *residual_norm, \
+ matrix::Dense<_type> *residual_norm_collection,  \
+ matrix::Dense<_type> *krylov_bases,  \
  Array<size_type> *final_iter_nums, size_type krylov_dim)
 
 
 #define GKO_DECLARE_GMRES_STEP_1_KERNEL(_type) \
  void step_1(std::shared_ptr<const DefaultExecutor> exec, \
  size_type num_rows, matrix::Dense<_type> *givens_sin, \
  matrix::Dense<_type> *givens_cos, \
- matrix::Dense<_type> *residual_norm,  \
+ matrix::Dense<remove_complex<_type>> *residual_norm, \
  matrix::Dense<_type> *residual_norm_collection, \
  matrix::Dense<_type> *krylov_bases, \
  matrix::Dense<_type> *hessenberg_iter, \
- const matrix::Dense<_type> *b_norm, size_type iter, \
- Array<size_type> *final_iter_nums,  \
+ const matrix::Dense<remove_complex<_type>> *b_norm, \
+ size_type iter, Array<size_type> *final_iter_nums, \
  const Array<stopping_status> *stop_status)
 
 

diff --git a/core/stop/residual_norm_reduction.cpp b/core/stop/residual_norm_reduction.cpp
@@ -53,10 +53,9 @@ bool ResidualNormReduction<ValueType>::check_impl(
  uint8 stoppingId, bool setFinalized, Array<stopping_status> *stop_status,
  bool *one_changed, const Criterion::Updater &updater)
 {
- std::unique_ptr<Vector> u_dense_tau;
- const Vector *dense_tau;
+ const NormVector *dense_tau;
  if (updater.residual_norm_ != nullptr) {
- dense_tau = as<Vector>(updater.residual_norm_);
+ dense_tau = as<NormVector>(updater.residual_norm_);
  } else if (updater.residual_ != nullptr) {
  auto *dense_r = as<Vector>(updater.residual_);
  dense_r->compute_norm2(u_dense_tau_.get());

diff --git a/core/stop/residual_norm_reduction_kernels.hpp b/core/stop/residual_norm_reduction_kernels.hpp
@@ -50,9 +50,9 @@ namespace residual_norm_reduction {
  void residual_norm_reduction( \
  std::shared_ptr<const DefaultExecutor> exec, \
  const matrix::Dense<_type> *tau, const matrix::Dense<_type> *orig_tau, \
- remove_complex<_type> rel_residual_goal, uint8 stoppingId,   \
- bool setFinalized, Array<stopping_status> *stop_status,   \
- Array<bool> *device_storage, bool *all_converged, bool *one_changed)
+ _type rel_residual_goal, uint8 stoppingId, bool setFinalized, \
+ Array<stopping_status> *stop_status, Array<bool> *device_storage, \
+ bool *all_converged, bool *one_changed)
 
 
 #define GKO_DECLARE_ALL_AS_TEMPLATES \

diff --git a/cuda/base/cublas_bindings.hpp b/cuda/base/cublas_bindings.hpp
@@ -214,24 +214,9 @@ GKO_BIND_CUBLAS_DOT(ValueType, detail::not_implemented);
 #undef GKO_BIND_CUBLAS_DOT
 
 
-#define GKO_BIND_CUBLAS_COMPLEX_NORM2(ValueType, CublasName) \
- inline void norm2(cublasHandle_t handle, int n, const ValueType *x, \
- int incx, ValueType *result) \
- { \
- cudaMemset(result, 0, sizeof(ValueType)); \
- GKO_ASSERT_NO_CUBLAS_ERRORS( \
- CublasName(handle, n, as_culibs_type(x), incx, \
- reinterpret_cast<remove_complex<ValueType> *>( \
- as_culibs_type(result)))); \
- } \
- static_assert(true, \
- "This assert is used to counter the false positive extra " \
- "semi-colon warnings")
-
-
 #define GKO_BIND_CUBLAS_NORM2(ValueType, CublasName) \
  inline void norm2(cublasHandle_t handle, int n, const ValueType *x, \
- int incx, ValueType *result)  \
+ int incx, remove_complex<ValueType> *result) \
  { \
  GKO_ASSERT_NO_CUBLAS_ERRORS(CublasName(handle, n, as_culibs_type(x), \
  incx, as_culibs_type(result))); \
@@ -243,12 +228,11 @@ GKO_BIND_CUBLAS_DOT(ValueType, detail::not_implemented);
 
 GKO_BIND_CUBLAS_NORM2(float, cublasSnrm2);
 GKO_BIND_CUBLAS_NORM2(double, cublasDnrm2);
-GKO_BIND_CUBLAS_COMPLEX_NORM2(std::complex<float>, cublasScnrm2);
-GKO_BIND_CUBLAS_COMPLEX_NORM2(std::complex<double>, cublasDznrm2);
+GKO_BIND_CUBLAS_NORM2(std::complex<float>, cublasScnrm2);
+GKO_BIND_CUBLAS_NORM2(std::complex<double>, cublasDznrm2);
 template <typename ValueType>
 GKO_BIND_CUBLAS_NORM2(ValueType, detail::not_implemented);
 
-#undef GKO_BIND_CUBLAS_COMPLEX_NORM2
 #undef GKO_BIND_CUBLAS_NORM2