Skip to content

Commit

Permalink
Merge fix for norm calculations
Browse files Browse the repository at this point in the history
Previously, all norm calculations resulted in a norm vector of
`ValueType` instead of `remove_complex<ValueType>`.
Additionally, some norms were calculated incorrectly (`(x * x)`
instead of `(x * conj(x))`).
Both issues are resolved by this PR.

Related PR: #564
  • Loading branch information
Thomas Grützmacher committed Jun 17, 2020
2 parents ba6f99a + 335967b commit 54e479a
Show file tree
Hide file tree
Showing 38 changed files with 433 additions and 281 deletions.
90 changes: 67 additions & 23 deletions common/matrix/dense_kernels.hpp.inc
Original file line number Diff line number Diff line change
Expand Up @@ -73,11 +73,12 @@ __global__ __launch_bounds__(block_size) void add_scaled(
}


template <size_type block_size, typename ValueType>
__global__ __launch_bounds__(block_size) void compute_partial_dot(
size_type num_rows, const ValueType *__restrict__ x, size_type stride_x,
const ValueType *__restrict__ y, size_type stride_y,
ValueType *__restrict__ work)
template <size_type block_size, typename OutType, typename CallableGetValue,
typename CallableReduce>
__device__ void compute_partial_reduce(size_type num_rows,
OutType *__restrict__ work,
CallableGetValue get_value,
CallableReduce reduce_op)
{
constexpr auto warps_per_block = block_size / config::warp_size;

Expand All @@ -86,52 +87,95 @@ __global__ __launch_bounds__(block_size) void compute_partial_dot(
const auto global_id =
thread::get_thread_id<config::warp_size, warps_per_block>();

auto tmp = zero<ValueType>();
auto tmp = zero<OutType>();
for (auto i = global_id; i < num_rows; i += block_size * num_blocks) {
tmp += x[i * stride_x] * y[i * stride_y];
tmp = reduce_op(tmp, get_value(i));
}
__shared__ UninitializedArray<ValueType, block_size> tmp_work;
__shared__ UninitializedArray<OutType, block_size> tmp_work;
tmp_work[local_id] = tmp;

reduce(group::this_thread_block(), static_cast<ValueType *>(tmp_work),
[](const ValueType &x, const ValueType &y) { return x + y; });
reduce(group::this_thread_block(), static_cast<OutType *>(tmp_work),
reduce_op);

if (local_id == 0) {
work[thread::get_block_id()] = tmp_work[0];
}
}


template <size_type block_size, typename ValueType>
__global__ __launch_bounds__(block_size) void finalize_dot_computation(
size_type size, const ValueType *work, ValueType *result)
template <size_type block_size, typename ValueType, typename CallableReduce,
typename CallableFinalize>
__device__ void finalize_reduce_computation(size_type size,
const ValueType *work,
ValueType *result,
CallableReduce reduce_op,
CallableFinalize finalize_op)
{
const auto local_id = thread::get_local_thread_id<config::warp_size>();

ValueType tmp = zero<ValueType>();
for (auto i = local_id; i < size; i += block_size) {
tmp += work[i];
tmp = reduce_op(tmp, work[i]);
}
__shared__ UninitializedArray<ValueType, block_size> tmp_work;
tmp_work[local_id] = tmp;

reduce(group::this_thread_block(), static_cast<ValueType *>(tmp_work),
[](const ValueType &x, const ValueType &y) { return x + y; });
reduce_op);

if (local_id == 0) {
*result = tmp_work[0];
*result = finalize_op(tmp_work[0]);
}
}


template <typename ValueType>
__global__ __launch_bounds__(default_block_size) void compute_sqrt(
size_type num_cols, ValueType *__restrict__ work)
template <size_type block_size, typename ValueType>
__global__ __launch_bounds__(block_size) void compute_partial_dot(
size_type num_rows, const ValueType *__restrict__ x, size_type stride_x,
const ValueType *__restrict__ y, size_type stride_y,
ValueType *__restrict__ work)
{
const auto tidx = thread::get_thread_id_flat();
if (tidx < num_cols) {
work[tidx] = sqrt(abs(work[tidx]));
}
compute_partial_reduce<block_size>(
num_rows, work,
[x, stride_x, y, stride_y](size_type i) {
return x[i * stride_x] * conj(y[i * stride_y]);
},
[](const ValueType &x, const ValueType &y) { return x + y; });
}


template <size_type block_size, typename ValueType>
__global__ __launch_bounds__(block_size) void finalize_dot_computation(
size_type size, const ValueType *work, ValueType *result)
{
finalize_reduce_computation<block_size>(
size, work, result,
[](const ValueType &x, const ValueType &y) { return x + y; },
[](const ValueType &x) { return x; });
}


template <size_type block_size, typename ValueType>
__global__ __launch_bounds__(block_size) void compute_partial_norm2(
size_type num_rows, const ValueType *__restrict__ x, size_type stride_x,
remove_complex<ValueType> *__restrict__ work)
{
using norm_type = remove_complex<ValueType>;
compute_partial_reduce<block_size>(
num_rows, work,
[x, stride_x](size_type i) { return squared_norm(x[i * stride_x]); },
[](const norm_type &x, const norm_type &y) { return x + y; });
}


template <size_type block_size, typename ValueType>
__global__ __launch_bounds__(block_size) void finalize_norm2_computation(
size_type size, const ValueType *work, ValueType *result)
{
finalize_reduce_computation<block_size>(
size, work, result,
[](const ValueType &x, const ValueType &y) { return x + y; },
[](const ValueType &x) { return sqrt(x); });
}


Expand Down
35 changes: 7 additions & 28 deletions common/solver/gmres_kernels.hpp.inc
Original file line number Diff line number Diff line change
Expand Up @@ -65,35 +65,12 @@ __global__ __launch_bounds__(block_size) void initialize_1_kernel(
}


// Must be called with at least `num_rows * stride_krylov` threads in total.
template <size_type block_size, typename ValueType>
__global__ __launch_bounds__(block_size) void initialize_2_1_kernel(
size_type num_rows, size_type num_rhs, size_type krylov_dim,
ValueType *__restrict__ krylov_bases, size_type stride_krylov,
ValueType *__restrict__ residual_norm_collection,
size_type stride_residual_nc)
{
const auto global_id = thread::get_thread_id_flat();
const auto row_idx = global_id / stride_krylov;
const auto col_idx = global_id % stride_krylov;
const auto krylov_bases_nrows = (krylov_dim + 1) * num_rows;
if (row_idx < krylov_bases_nrows && col_idx < num_rhs) {
krylov_bases[row_idx * stride_krylov + col_idx] = zero<ValueType>();
}

if (row_idx < krylov_dim + 1 && col_idx < num_rhs) {
residual_norm_collection[row_idx * stride_residual_nc + col_idx] =
zero<ValueType>();
}
}


// Must be called with at least `num_rows * num_rhs` threads in total.
template <size_type block_size, typename ValueType>
__global__ __launch_bounds__(block_size) void initialize_2_2_kernel(
size_type num_rows, size_type num_rhs,
const ValueType *__restrict__ residual, size_type stride_residual,
const ValueType *__restrict__ residual_norm,
const remove_complex<ValueType> *__restrict__ residual_norm,
ValueType *__restrict__ residual_norm_collection,
ValueType *__restrict__ krylov_bases, size_type stride_krylov,
size_type *__restrict__ final_iter_nums)
Expand Down Expand Up @@ -293,8 +270,10 @@ template <typename ValueType>
__device__ void calculate_residual_norm_kernel(
size_type col_idx, size_type num_cols, size_type iter,
const ValueType &register_sin, const ValueType &register_cos,
ValueType *residual_norm, ValueType *residual_norm_collection,
size_type stride_residual_norm_collection, const ValueType *b_norm)
remove_complex<ValueType> *residual_norm,
ValueType *residual_norm_collection,
size_type stride_residual_norm_collection,
const remove_complex<ValueType> *b_norm)
{
const auto this_rnc =
residual_norm_collection[iter * stride_residual_norm_collection +
Expand All @@ -315,10 +294,10 @@ __global__ __launch_bounds__(block_size) void givens_rotation_kernel(
ValueType *__restrict__ hessenberg_iter, size_type stride_hessenberg,
ValueType *__restrict__ givens_sin, size_type stride_sin,
ValueType *__restrict__ givens_cos, size_type stride_cos,
ValueType *__restrict__ residual_norm,
remove_complex<ValueType> *__restrict__ residual_norm,
ValueType *__restrict__ residual_norm_collection,
size_type stride_residual_norm_collection,
const ValueType *__restrict__ b_norm,
const remove_complex<ValueType> *__restrict__ b_norm,
const stopping_status *__restrict__ stop_status)
{
const auto col_idx = thread::get_thread_id_flat();
Expand Down
3 changes: 2 additions & 1 deletion core/device_hooks/common_kernels.inc.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -967,7 +967,8 @@ namespace residual_norm_reduction {
template <typename ValueType>
GKO_DECLARE_RESIDUAL_NORM_REDUCTION_KERNEL(ValueType)
GKO_NOT_COMPILED(GKO_HOOK_MODULE);
GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_RESIDUAL_NORM_REDUCTION_KERNEL);
GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(
GKO_DECLARE_RESIDUAL_NORM_REDUCTION_KERNEL);


} // namespace residual_norm_reduction
Expand Down
4 changes: 3 additions & 1 deletion core/log/convergence.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


#include <ginkgo/core/base/array.hpp>
#include <ginkgo/core/base/math.hpp>
#include <ginkgo/core/stop/criterion.hpp>
#include <ginkgo/core/stop/stopping_status.hpp>

Expand All @@ -59,7 +60,8 @@ void Convergence<ValueType>::on_criterion_check_completed(
this->residual_norm_.reset(residual_norm->clone().release());
} else if (residual != nullptr) {
using Vector = matrix::Dense<ValueType>;
this->residual_norm_ = Vector::create(
using NormVector = matrix::Dense<remove_complex<ValueType>>;
this->residual_norm_ = NormVector::create(
residual->get_executor(), dim<2>{1, residual->get_size()[1]});
auto dense_r = as<Vector>(residual);
dense_r->compute_norm2(this->residual_norm_.get());
Expand Down
3 changes: 2 additions & 1 deletion core/matrix/dense.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -267,10 +267,11 @@ void Dense<ValueType>::compute_dot_impl(const LinOp *b, LinOp *result) const
template <typename ValueType>
void Dense<ValueType>::compute_norm2_impl(LinOp *result) const
{
using NormVector = Dense<remove_complex<ValueType>>;
GKO_ASSERT_EQUAL_DIMENSIONS(result, dim<2>(1, this->get_size()[1]));
auto exec = this->get_executor();
exec->run(dense::make_compute_norm2(as<Dense<ValueType>>(this),
as<Dense<ValueType>>(result)));
as<NormVector>(result)));
}


Expand Down
3 changes: 2 additions & 1 deletion core/matrix/dense_kernels.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include <ginkgo/core/matrix/dense.hpp>


#include <ginkgo/core/base/math.hpp>
#include <ginkgo/core/base/types.hpp>


Expand Down Expand Up @@ -73,7 +74,7 @@ namespace kernels {
#define GKO_DECLARE_DENSE_COMPUTE_NORM2_KERNEL(_type) \
void compute_norm2(std::shared_ptr<const DefaultExecutor> exec, \
const matrix::Dense<_type> *x, \
matrix::Dense<_type> *result)
matrix::Dense<remove_complex<_type>> *result)

#define GKO_DECLARE_DENSE_CONVERT_TO_COO_KERNEL(_type, _prec) \
void convert_to_coo(std::shared_ptr<const DefaultExecutor> exec, \
Expand Down
1 change: 0 additions & 1 deletion core/solver/fcg.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,6 @@ void Fcg<ValueType>::apply_impl(const LinOp *b, LinOp *x) const
constexpr uint8 RelativeStoppingId{1};

auto exec = this->get_executor();
size_type num_vectors = dense_b->get_size()[1];

auto one_op = initialize<Vector>({one<ValueType>()}, exec);
auto neg_one_op = initialize<Vector>({-one<ValueType>()}, exec);
Expand Down
5 changes: 3 additions & 2 deletions core/solver/gmres.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,7 @@ void Gmres<ValueType>::apply_impl(const LinOp *b, LinOp *x) const
GKO_ASSERT_IS_SQUARE_MATRIX(system_matrix_);

using Vector = matrix::Dense<ValueType>;
using NormVector = matrix::Dense<remove_complex<ValueType>>;

constexpr uint8 RelativeStoppingId{1};

Expand All @@ -122,8 +123,8 @@ void Gmres<ValueType>::apply_impl(const LinOp *b, LinOp *x) const
auto residual_norm_collection =
Vector::create(exec, dim<2>{krylov_dim_ + 1, dense_b->get_size()[1]});
auto residual_norm =
Vector::create(exec, dim<2>{1, dense_b->get_size()[1]});
auto b_norm = Vector::create(exec, dim<2>{1, dense_b->get_size()[1]});
NormVector::create(exec, dim<2>{1, dense_b->get_size()[1]});
auto b_norm = NormVector::create(exec, dim<2>{1, dense_b->get_size()[1]});
Array<size_type> final_iter_nums(this->get_executor(),
dense_b->get_size()[1]);
auto y = Vector::create(exec, dim<2>{krylov_dim_, dense_b->get_size()[1]});
Expand Down
22 changes: 12 additions & 10 deletions core/solver/gmres_kernels.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include <ginkgo/core/matrix/dense.hpp>
#include <ginkgo/core/stop/stopping_status.hpp>


namespace gko {
namespace kernels {
namespace gmres {
Expand All @@ -48,31 +49,32 @@ namespace gmres {
#define GKO_DECLARE_GMRES_INITIALIZE_1_KERNEL(_type) \
void initialize_1( \
std::shared_ptr<const DefaultExecutor> exec, \
const matrix::Dense<_type> *b, matrix::Dense<_type> *b_norm, \
const matrix::Dense<_type> *b, \
matrix::Dense<remove_complex<_type>> *b_norm, \
matrix::Dense<_type> *residual, matrix::Dense<_type> *givens_sin, \
matrix::Dense<_type> *givens_cos, Array<stopping_status> *stop_status, \
size_type krylov_dim)


#define GKO_DECLARE_GMRES_INITIALIZE_2_KERNEL(_type) \
void initialize_2(std::shared_ptr<const DefaultExecutor> exec, \
const matrix::Dense<_type> *residual, \
matrix::Dense<_type> *residual_norm, \
matrix::Dense<_type> *residual_norm_collection, \
matrix::Dense<_type> *krylov_bases, \
#define GKO_DECLARE_GMRES_INITIALIZE_2_KERNEL(_type) \
void initialize_2(std::shared_ptr<const DefaultExecutor> exec, \
const matrix::Dense<_type> *residual, \
matrix::Dense<remove_complex<_type>> *residual_norm, \
matrix::Dense<_type> *residual_norm_collection, \
matrix::Dense<_type> *krylov_bases, \
Array<size_type> *final_iter_nums, size_type krylov_dim)


#define GKO_DECLARE_GMRES_STEP_1_KERNEL(_type) \
void step_1(std::shared_ptr<const DefaultExecutor> exec, \
size_type num_rows, matrix::Dense<_type> *givens_sin, \
matrix::Dense<_type> *givens_cos, \
matrix::Dense<_type> *residual_norm, \
matrix::Dense<remove_complex<_type>> *residual_norm, \
matrix::Dense<_type> *residual_norm_collection, \
matrix::Dense<_type> *krylov_bases, \
matrix::Dense<_type> *hessenberg_iter, \
const matrix::Dense<_type> *b_norm, size_type iter, \
Array<size_type> *final_iter_nums, \
const matrix::Dense<remove_complex<_type>> *b_norm, \
size_type iter, Array<size_type> *final_iter_nums, \
const Array<stopping_status> *stop_status)


Expand Down
5 changes: 2 additions & 3 deletions core/stop/residual_norm_reduction.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -53,10 +53,9 @@ bool ResidualNormReduction<ValueType>::check_impl(
uint8 stoppingId, bool setFinalized, Array<stopping_status> *stop_status,
bool *one_changed, const Criterion::Updater &updater)
{
std::unique_ptr<Vector> u_dense_tau;
const Vector *dense_tau;
const NormVector *dense_tau;
if (updater.residual_norm_ != nullptr) {
dense_tau = as<Vector>(updater.residual_norm_);
dense_tau = as<NormVector>(updater.residual_norm_);
} else if (updater.residual_ != nullptr) {
auto *dense_r = as<Vector>(updater.residual_);
dense_r->compute_norm2(u_dense_tau_.get());
Expand Down
6 changes: 3 additions & 3 deletions core/stop/residual_norm_reduction_kernels.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -50,9 +50,9 @@ namespace residual_norm_reduction {
void residual_norm_reduction( \
std::shared_ptr<const DefaultExecutor> exec, \
const matrix::Dense<_type> *tau, const matrix::Dense<_type> *orig_tau, \
remove_complex<_type> rel_residual_goal, uint8 stoppingId, \
bool setFinalized, Array<stopping_status> *stop_status, \
Array<bool> *device_storage, bool *all_converged, bool *one_changed)
_type rel_residual_goal, uint8 stoppingId, bool setFinalized, \
Array<stopping_status> *stop_status, Array<bool> *device_storage, \
bool *all_converged, bool *one_changed)


#define GKO_DECLARE_ALL_AS_TEMPLATES \
Expand Down
22 changes: 3 additions & 19 deletions cuda/base/cublas_bindings.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -214,24 +214,9 @@ GKO_BIND_CUBLAS_DOT(ValueType, detail::not_implemented);
#undef GKO_BIND_CUBLAS_DOT


#define GKO_BIND_CUBLAS_COMPLEX_NORM2(ValueType, CublasName) \
inline void norm2(cublasHandle_t handle, int n, const ValueType *x, \
int incx, ValueType *result) \
{ \
cudaMemset(result, 0, sizeof(ValueType)); \
GKO_ASSERT_NO_CUBLAS_ERRORS( \
CublasName(handle, n, as_culibs_type(x), incx, \
reinterpret_cast<remove_complex<ValueType> *>( \
as_culibs_type(result)))); \
} \
static_assert(true, \
"This assert is used to counter the false positive extra " \
"semi-colon warnings")


#define GKO_BIND_CUBLAS_NORM2(ValueType, CublasName) \
inline void norm2(cublasHandle_t handle, int n, const ValueType *x, \
int incx, ValueType *result) \
int incx, remove_complex<ValueType> *result) \
{ \
GKO_ASSERT_NO_CUBLAS_ERRORS(CublasName(handle, n, as_culibs_type(x), \
incx, as_culibs_type(result))); \
Expand All @@ -243,12 +228,11 @@ GKO_BIND_CUBLAS_DOT(ValueType, detail::not_implemented);

GKO_BIND_CUBLAS_NORM2(float, cublasSnrm2);
GKO_BIND_CUBLAS_NORM2(double, cublasDnrm2);
GKO_BIND_CUBLAS_COMPLEX_NORM2(std::complex<float>, cublasScnrm2);
GKO_BIND_CUBLAS_COMPLEX_NORM2(std::complex<double>, cublasDznrm2);
GKO_BIND_CUBLAS_NORM2(std::complex<float>, cublasScnrm2);
GKO_BIND_CUBLAS_NORM2(std::complex<double>, cublasDznrm2);
template <typename ValueType>
GKO_BIND_CUBLAS_NORM2(ValueType, detail::not_implemented);

#undef GKO_BIND_CUBLAS_COMPLEX_NORM2
#undef GKO_BIND_CUBLAS_NORM2


Expand Down
Loading

0 comments on commit 54e479a

Please sign in to comment.