Skip to content

Commit

Permalink
Merge branch 'branch-0.17' into fix-agg-clone
Browse files Browse the repository at this point in the history
  • Loading branch information
jlowe authored Dec 4, 2020
2 parents 40db418 + 1af9bc0 commit 55d1e14
Show file tree
Hide file tree
Showing 38 changed files with 1,370 additions and 233 deletions.
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@
- PR #6652 Add support for struct columns in concatenate
- PR #6675 Add DecimalDtype to cuDF
- PR #6739 Add Java bindings for is_timestamp
- PR #6808 Add support for reading decimal32 and decimal64 from parquet
- PR #6781 Add serial murmur3 hashing
- PR #6811 First class support for unbounded window function bounds
- PR #6768 Add support for scatter() on list columns
- PR #6796 Add create_metadata_file in dask_cudf
Expand All @@ -37,6 +39,7 @@
- PR #6805 Implement `cudf::detail::copy_if` for `decimal32` and `decimal64`
- PR #6726 Support selecting different hash functions in hash_partition
- PR #6619 Improve Dockerfile
- PR #6831 Added parquet chunked writing ability for list columns

## Improvements

Expand Down Expand Up @@ -112,12 +115,14 @@
- PR #6809 size_type overflow checking when concatenating columns
- PR #6789 Rename `unary_op` to `unary_operator`
- PR #6770 Support building decimal columns with Table.TestBuilder
- PR #6815 Add wildcard path support to `read_parquet`
- PR #6800 Push DeviceScalar to cython-only
- PR #6822 Split out `cudf::distinct_count` from `drop_duplicates.cu`
- PR #6813 Enable `expand=False` in `.str.split` and `.str.rsplit`
- PR #6829 Enable workaround to write categorical columns in csv
- PR #6819 Use CMake 3.19 for RMM when building cuDF jar
- PR #6833 Use settings.xml if existing for internal build
- PR #6839 Handle index when dispatching __array_function__ and __array_ufunc__ to cupy for cudf.Series
- PR #6835 Move template param to member var to improve compile of hash/groupby.cu
- PR #6837 Avoid gather when copying strings view from start of strings column
- PR #6859 Move align_ptr_for_type() from cuda.cuh to alignment.hpp
Expand Down Expand Up @@ -185,6 +190,7 @@
- PR #6806 Force install of local conda artifacts
- PR #6887 Fix typo and `0-d` numpy array handling in binary operation
- PR #6898 Fix missing clone overrides on derived aggregations
- PR #6899 Update JNI to new gather boundary check API


# cuDF 0.16.0 (21 Oct 2020)
Expand Down
7 changes: 7 additions & 0 deletions cpp/include/cudf/detail/hashing.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ std::unique_ptr<column> hash(
table_view const& input,
hash_id hash_function = hash_id::HASH_MURMUR3,
std::vector<uint32_t> const& initial_hash = {},
uint32_t seed = 0,
rmm::cuda_stream_view stream = rmm::cuda_stream_default,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

Expand All @@ -45,5 +46,11 @@ std::unique_ptr<column> md5_hash(
rmm::cuda_stream_view stream = rmm::cuda_stream_default,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @brief Computes a 32-bit MurmurHash3 value for each row of a table by hashing its columns
* one after another, feeding the hash produced so far in as the seed for the next column.
*
* @param input The table whose rows are hashed
* @param seed Starting value of the per-row hash chain (defaults to 0)
* @param stream CUDA stream on which the work is issued
* @param mr Device memory resource used to allocate the returned column
* @return A column holding one hash value per input row
*/
std::unique_ptr<column> serial_murmur_hash3_32(
table_view const& input,
uint32_t seed = 0,
rmm::cuda_stream_view stream = rmm::cuda_stream_default,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

} // namespace detail
} // namespace cudf
3 changes: 2 additions & 1 deletion cpp/include/cudf/detail/utilities/hash_functions.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -371,6 +371,7 @@ struct MurmurHash3_32 {
using result_type = hash_value_type;

// Default construction initializes the stored seed (m_seed) to 0.
CUDA_HOST_DEVICE_CALLABLE MurmurHash3_32() : m_seed(0) {}
// Constructs a hasher whose stored seed (m_seed) is the caller-provided `seed`.
CUDA_HOST_DEVICE_CALLABLE MurmurHash3_32(uint32_t seed) : m_seed(seed) {}

CUDA_HOST_DEVICE_CALLABLE uint32_t rotl32(uint32_t x, int8_t r) const
{
Expand Down Expand Up @@ -416,7 +417,7 @@ struct MurmurHash3_32 {
hash_value_type CUDA_HOST_DEVICE_CALLABLE compute_floating_point(T const& key) const
{
if (key == T{0.0}) {
return 0;
return compute(T{0.0});
} else if (isnan(key)) {
T nan = std::numeric_limits<T>::quiet_NaN();
return compute(nan);
Expand Down
13 changes: 13 additions & 0 deletions cpp/include/cudf/fixed_point/fixed_point.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -515,6 +515,19 @@ class fixed_point {
Rep const value = detail::shift<Rep, Rad>(_value, scale_type{scale - _scale});
return fixed_point<Rep, Rad>{scaled_integer<Rep>{value, scale}};
}

/**
 * @brief Returns a decimal string representation of the fixed_point value.
 *
 * The stored integer `_value` is rendered in base 10 and the decimal point is placed
 * according to `_scale`:
 *  - `_scale <= 0`: exactly `-_scale` fractional digits are printed (a single `0` when
 *    `_scale == 0`), e.g. value 1205 with scale -2 yields "12.05" and value -5 with
 *    scale -1 yields "-0.5".
 *  - `_scale > 0`: `_scale` zeros are appended to a non-zero value and no decimal point
 *    is printed, e.g. value 12 with scale 2 yields "1200".
 *
 * NOTE(review): the formatting assumes a base-10 radix; `Rad` is not consulted here —
 * confirm whether other radices need dedicated handling.
 *
 * @return The formatted string, prefixed with '-' for negative values.
 */
explicit operator std::string() const
{
  auto const scale = static_cast<int32_t>(_scale);

  // Operate on the digit string instead of dividing by 10^-scale: no intermediate power
  // can overflow, and the most negative Rep never has to be negated.
  auto digits         = std::to_string(_value);
  bool const negative = _value < 0;
  if (negative) { digits.erase(0, 1); }  // drop the '-' so only digits are counted
  auto const sign = negative ? std::string("-") : std::string("");

  if (scale > 0) {
    // Positive scale multiplies by a power of ten; zero stays a plain "0".
    if (digits == "0") { return digits; }
    return sign + digits + std::string(static_cast<std::string::size_type>(scale), '0');
  }

  auto const frac_len = static_cast<std::string::size_type>(-scale);

  // Left-pad with zeros so at least one digit remains in front of the decimal point.
  if (digits.size() <= frac_len) { digits.insert(0, frac_len - digits.size() + 1, '0'); }

  auto const split    = digits.size() - frac_len;
  auto const fraction = frac_len == 0 ? std::string("0") : digits.substr(split);
  return sign + digits.substr(0, split) + std::string(".") + fraction;
}
}; // namespace numeric

/** @brief Function that converts Rep to `std::string`
Expand Down
1 change: 1 addition & 0 deletions cpp/include/cudf/hashing.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ std::unique_ptr<column> hash(
table_view const& input,
hash_id hash_function = hash_id::HASH_MURMUR3,
std::vector<uint32_t> const& initial_hash = {},
uint32_t seed = 0,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/** @} */ // end of group
Expand Down
30 changes: 30 additions & 0 deletions cpp/include/cudf/io/parquet.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,10 @@ class parquet_reader_options {
// Cast timestamp columns to a specific type
data_type _timestamp_type{type_id::EMPTY};

// When true, reading a decimal type that cudf does not support is an error.
// When false (the default), such columns fall back to being stored as doubles.
bool _strict_decimal_types = false;

/**
* @brief Constructor from source info.
*
Expand Down Expand Up @@ -130,6 +134,12 @@ class parquet_reader_options {
*/
data_type get_timestamp_type() const { return _timestamp_type; }

/**
* @brief Returns whether strict decimal type checking is enabled.
*
* @return `true` if reading an unsupported decimal type is treated as an error,
* `false` otherwise.
*/
bool is_enabled_strict_decimal_types() const { return _strict_decimal_types; }

/**
* @brief Sets names of the columns to be read.
*
Expand Down Expand Up @@ -199,6 +209,14 @@ class parquet_reader_options {
* @param type The timestamp data_type to which all timestamp columns need to be cast.
*/
void set_timestamp_type(data_type type) { _timestamp_type = type; }

/**
* @brief Enables/disables strict decimal type checking.
*
* @param val If true, cudf will error when reading a decimal type that is unsupported. If
* false, cudf will convert unsupported decimal types to double.
*/
void set_strict_decimal_types(bool val) { _strict_decimal_types = val; }
};

class parquet_reader_options_builder {
Expand Down Expand Up @@ -303,6 +321,18 @@ class parquet_reader_options_builder {
return *this;
}

/**
* @brief Enables/disables raising an error when an unsupported decimal type is read.
*
* @param val Whether reading an unsupported decimal type should be an error.
* @return this for chaining.
*/
parquet_reader_options_builder& use_strict_decimal_types(bool val)
{
// Writes the flag directly into the options object being built.
options._strict_decimal_types = val;
return *this;
}

/**
* @brief move parquet_reader_options member once it's built.
*/
Expand Down
16 changes: 13 additions & 3 deletions cpp/include/cudf/io/types.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -126,15 +126,25 @@ struct table_metadata {
};

/**
* @brief Derived class of table_metadata which includes flattened nullability information of input.
*
* This information is used as an optimization for chunked writes. If the caller leaves
* column_nullable uninitialized, the writer code will assume the worst case : that all columns are
* nullable.
*
* If the column_nullable field is not empty, it is expected that it has a length equal to the
* number of columns in the flattened table being written.
*
* Flattening refers to the flattening of nested columns. For list columns, the number of values
* expected in the nullability vector is equal to the depth of the nesting. e.g. for a table of
* three columns of types: {int, list<double>, float}, the nullability vector contains the values:
*
* |Index| Nullability of |
* |-----|----------------------------------------|
* | 0 | int column |
* | 1 | Level 0 of list column (list itself) |
* | 2 | Level 1 of list column (double values) |
* | 3 | float column |
*
* In the case where column nullability is known, pass `true` if the corresponding column could
* contain nulls in one or more subtables to be written, otherwise `false`.
Expand Down
26 changes: 26 additions & 0 deletions cpp/include/cudf/table/row_operators.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -401,6 +401,32 @@ class element_hasher {
}
};

/**
 * @brief Hashes a single column element using a caller-supplied seed.
 *
 * @tparam hash_function Hash functor template; it is instantiated per element type and
 * constructed from the seed.
 * @tparam has_nulls When false, the null check on the element is skipped.
 */
template <template <typename> class hash_function, bool has_nulls = true>
class element_hasher_with_seed {
 public:
  /**
   * @brief Constructs the hasher.
   *
   * All arguments are defaulted, so this also serves as the default constructor. A separate
   * zero-argument constructor would make default construction ambiguous.
   *
   * @param seed Seed handed to `hash_function` for every element hashed
   * @param null_hash Value returned in place of a hash when the element is null
   */
  __device__ element_hasher_with_seed(
    uint32_t seed = 0, hash_value_type null_hash = std::numeric_limits<hash_value_type>::max())
    : _seed{seed}, _null_hash(null_hash)
  {
  }

  /**
   * @brief Hashes the element of type `T` at `row_index` in `col`.
   *
   * @return `null_hash` if `has_nulls` is set and the element is null; otherwise the result of
   * `hash_function<T>` constructed with the stored seed and applied to the element.
   */
  template <typename T>
  __device__ inline hash_value_type operator()(column_device_view col, size_type row_index)
  {
    if (has_nulls && col.is_null(row_index)) { return _null_hash; }

    return hash_function<T>{_seed}(col.element<T>(row_index));
  }

 private:
  uint32_t _seed;              ///< Seed forwarded to the hash functor
  hash_value_type _null_hash;  ///< Result reported for null elements
};

/**
* @brief Computes the hash value of a row in the given table.
*
Expand Down
7 changes: 4 additions & 3 deletions cpp/include/cudf/types.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -325,9 +325,10 @@ std::size_t size_of(data_type t);
* @brief Identifies the hash function to be used
*/
enum class hash_id {
  HASH_IDENTITY = 0,   ///< Identity hash function that simply returns the key to be hashed
  HASH_MURMUR3,        ///< Murmur3 hash function
  HASH_MD5,            ///< MD5 hash function
  HASH_SERIAL_MURMUR3  ///< Serial Murmur3 hash function
};

/** @} */
Expand Down
49 changes: 49 additions & 0 deletions cpp/include/cudf_test/column_wrapper.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -527,6 +527,55 @@ class fixed_point_column_wrapper : public detail::column_wrapper {
: fixed_point_column_wrapper(std::cbegin(values), std::cend(values), scale)
{
}

/**
 * @brief Builds a nullable fixed-point column from a range of representation values and a
 * parallel range of validity flags.
 *
 * The elements are taken from `[begin, end)`. The validity of element `i` is read from
 * `v[i]`, interpreted as a Boolean: `true` marks the element valid, `false` marks it null.
 *
 * Example:
 * @code{.cpp}
 * // Creates a nullable column of DECIMAL32 elements with 5 elements:
 * // {null, 100, null, 300, null}
 * auto elements = make_counting_transform_iterator(0, [](auto i){ return i; });
 * auto validity = make_counting_transform_iterator(0, [](auto i){ return i%2; });
 * fixed_point_column_wrapper<int32_t> w(elements, elements + 5, validity, 2);
 * @endcode
 *
 * Note: as with `std::vector`, call this "range" constructor with parentheses `()`;
 * braces `{}` are reserved for the `initializer_list` constructors.
 *
 * @param begin The beginning of the sequence of elements
 * @param end The end of the sequence of elements
 * @param v The beginning of the sequence of validity indicators
 * @param scale The scale of the elements in the column
 */
template <typename FixedPointRepIterator, typename ValidityIterator>
fixed_point_column_wrapper(FixedPointRepIterator begin,
                           FixedPointRepIterator end,
                           ValidityIterator v,
                           numeric::scale_type scale)
  : column_wrapper{}
{
  CUDF_EXPECTS(numeric::is_supported_representation_type<Rep>(), "not valid representation type");

  auto const num_elements = cudf::distance(begin, end);
  // Stage the values on the host; the device buffer below copies from this storage.
  auto const host_values = thrust::host_vector<Rep>(begin, end);
  // A 32-bit representation maps to DECIMAL32; any other representation maps to DECIMAL64.
  auto const decimal_id =
    std::is_same<Rep, int32_t>::value ? type_id::DECIMAL32 : type_id::DECIMAL64;

  wrapped.reset(
    new cudf::column{cudf::data_type{decimal_id, static_cast<int32_t>(scale)},
                     num_elements,
                     rmm::device_buffer{host_values.data(), num_elements * sizeof(Rep)},
                     detail::make_null_mask(v, v + num_elements),
                     cudf::UNKNOWN_NULL_COUNT});
}
};

/**
Expand Down
60 changes: 59 additions & 1 deletion cpp/src/hash/hashing.cu
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
#include <cudf/types.hpp>

#include <rmm/cuda_stream_view.hpp>
#include "thrust/detail/seq.h"

namespace cudf {
namespace {
Expand All @@ -45,12 +46,14 @@ namespace detail {
// Routes a hashing request to the implementation selected by `hash_function`.
// `initial_hash` is forwarded only to the Murmur3 implementation and `seed` only to the
// serial Murmur3 implementation. Any other hash_id yields a null pointer.
std::unique_ptr<column> hash(table_view const& input,
                             hash_id hash_function,
                             std::vector<uint32_t> const& initial_hash,
                             uint32_t seed,
                             rmm::cuda_stream_view stream,
                             rmm::mr::device_memory_resource* mr)
{
  if (hash_function == hash_id::HASH_MURMUR3) {
    return murmur_hash3_32(input, initial_hash, stream, mr);
  }
  if (hash_function == hash_id::HASH_MD5) { return md5_hash(input, stream, mr); }
  if (hash_function == hash_id::HASH_SERIAL_MURMUR3) {
    return serial_murmur_hash3_32(input, seed, stream, mr);
  }
  // No implementation is wired up for the remaining identifiers.
  return nullptr;
}
Expand Down Expand Up @@ -119,6 +122,60 @@ std::unique_ptr<column> md5_hash(table_view const& input,
mr);
}

// Computes one 32-bit hash per row by chaining MurmurHash3_32 across the columns of `input`.
//
// For every row the columns are visited sequentially (thrust::seq). The running value starts
// at `seed`, and the value produced so far is used as the seed when hashing the next column's
// element. The result is an INT32 column with `input.num_rows()` rows and no null mask.
std::unique_ptr<column> serial_murmur_hash3_32(table_view const& input,
uint32_t seed,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
auto output = make_numeric_column(
data_type(type_id::INT32), input.num_rows(), mask_state::UNALLOCATED, stream, mr);

// Nothing to hash: hand back the freshly allocated column without writing to it.
if (input.num_columns() == 0 || input.num_rows() == 0) { return output; }

auto const device_input = table_device_view::create(input, stream);
auto output_view = output->mutable_view();

if (has_nulls(input)) {
// Nullable path: the running hash is passed both as the seed and as the null_hash, so a
// null element leaves the running hash unchanged.
thrust::tabulate(rmm::exec_policy(stream)->on(stream.value()),
output_view.begin<int32_t>(),
output_view.end<int32_t>(),
[device_input = *device_input, seed] __device__(auto row_index) {
return thrust::reduce(
thrust::seq,
device_input.begin(),
device_input.end(),
seed,
[rindex = row_index] __device__(auto hash, auto column) {
return cudf::type_dispatcher(
column.type(),
element_hasher_with_seed<MurmurHash3_32, true>{hash, hash},
column,
rindex);
});
});
} else {
// Same chaining as above, but with has_nulls = false so the hasher skips the null check.
thrust::tabulate(rmm::exec_policy(stream)->on(stream.value()),
output_view.begin<int32_t>(),
output_view.end<int32_t>(),
[device_input = *device_input, seed] __device__(auto row_index) {
return thrust::reduce(
thrust::seq,
device_input.begin(),
device_input.end(),
seed,
[rindex = row_index] __device__(auto hash, auto column) {
return cudf::type_dispatcher(
column.type(),
element_hasher_with_seed<MurmurHash3_32, false>{hash, hash},
column,
rindex);
});
});
}

return output;
}

std::unique_ptr<column> murmur_hash3_32(table_view const& input,
std::vector<uint32_t> const& initial_hash,
rmm::cuda_stream_view stream,
Expand Down Expand Up @@ -176,10 +233,11 @@ std::unique_ptr<column> murmur_hash3_32(table_view const& input,
// Public entry point for row hashing: forwards every argument, including `seed`, to
// detail::hash on the default CUDA stream.
//
// input         Table whose rows are hashed
// hash_function Identifier of the hash implementation to use
// initial_hash  Forwarded unchanged to detail::hash
// seed          Forwarded unchanged to detail::hash
// mr            Device memory resource used to allocate the returned column
std::unique_ptr<column> hash(table_view const& input,
                             hash_id hash_function,
                             std::vector<uint32_t> const& initial_hash,
                             uint32_t seed,
                             rmm::mr::device_memory_resource* mr)
{
  CUDF_FUNC_RANGE();
  // `seed` must precede the stream argument to match detail::hash's parameter order.
  return detail::hash(input, hash_function, initial_hash, seed, rmm::cuda_stream_default, mr);
}

std::unique_ptr<column> murmur_hash3_32(table_view const& input,
Expand Down
Loading

0 comments on commit 55d1e14

Please sign in to comment.