Merge branch 'branch-0.17' into fix-agg-clone

rapidsai · Dec 4, 2020 · 55d1e14 · 55d1e14
2 parents 40db418 + 1af9bc0
commit 55d1e14
Show file tree

Hide file tree

Showing 38 changed files with 1,370 additions and 233 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -29,6 +29,8 @@
 - PR #6652 Add support for struct columns in concatenate
 - PR #6675 Add DecimalDtype to cuDF
 - PR #6739 Add Java bindings for is_timestamp
+- PR #6808 Add support for reading decimal32 and decimal64 from parquet
+- PR #6781 Add serial murmur3 hashing
 - PR #6811 First class support for unbounded window function bounds
 - PR #6768 Add support for scatter() on list columns
 - PR #6796 Add create_metadata_file in dask_cudf
@@ -37,6 +39,7 @@
 - PR #6805 Implement `cudf::detail::copy_if` for `decimal32` and `decimal64`
 - PR #6726 Support selecting different hash functions in hash_partition
 - PR #6619 Improve Dockerfile
+- PR #6831 Added parquet chunked writing ability for list columns
 
 ## Improvements
 
@@ -112,12 +115,14 @@
 - PR #6809 size_type overflow checking when concatenating columns
 - PR #6789 Rename `unary_op` to `unary_operator`
 - PR #6770 Support building decimal columns with Table.TestBuilder
+- PR #6815 Add wildcard path support to `read_parquet`
 - PR #6800 Push DeviceScalar to cython-only
 - PR #6822 Split out `cudf::distinct_count` from `drop_duplicates.cu`
 - PR #6813 Enable `expand=False` in `.str.split` and `.str.rsplit`
 - PR #6829 Enable workaround to write categorical columns in csv
 - PR #6819 Use CMake 3.19 for RMM when building cuDF jar
 - PR #6833 Use settings.xml if existing for internal build
+- PR #6839 Handle index when dispatching __array_function__ and __array_ufunc__ to cupy for cudf.Series
 - PR #6835 Move template param to member var to improve compile of hash/groupby.cu
 - PR #6837 Avoid gather when copying strings view from start of strings column
 - PR #6859 Move align_ptr_for_type() from cuda.cuh to alignment.hpp
@@ -185,6 +190,7 @@
 - PR #6806 Force install of local conda artifacts
 - PR #6887 Fix typo and `0-d` numpy array handling in binary operation
 - PR #6898 Fix missing clone overrides on derived aggregations
+- PR #6899 Update JNI to new gather boundary check API
 
 
 # cuDF 0.16.0 (21 Oct 2020)

diff --git a/cpp/include/cudf/detail/hashing.hpp b/cpp/include/cudf/detail/hashing.hpp
@@ -31,6 +31,7 @@ std::unique_ptr<column> hash(
  table_view const& input,
  hash_id hash_function = hash_id::HASH_MURMUR3,
  std::vector<uint32_t> const& initial_hash = {},
+ uint32_t seed = 0,
  rmm::cuda_stream_view stream = rmm::cuda_stream_default,
  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
@@ -45,5 +46,11 @@ std::unique_ptr<column> md5_hash(
  rmm::cuda_stream_view stream = rmm::cuda_stream_default,
  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
+/**
+ * @brief Computes a 32-bit Murmur3 hash per row, chaining the hash serially across columns.
+ *
+ * For each row the running hash starts at `seed`; every column's element is then hashed with a
+ * Murmur3 hasher seeded by the running hash, and the result becomes the new running hash.
+ * Null elements leave the running hash unchanged.
+ *
+ * @param input The table whose rows are hashed
+ * @param seed Initial value of the running hash for every row
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ * @return An INT32 column with one hash value per row of `input`
+ */
+std::unique_ptr<column> serial_murmur_hash3_32(
+ table_view const& input,
+ uint32_t seed = 0,
+ rmm::cuda_stream_view stream = rmm::cuda_stream_default,
+ rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
 } // namespace detail
 } // namespace cudf
diff --git a/cpp/include/cudf/detail/utilities/hash_functions.cuh b/cpp/include/cudf/detail/utilities/hash_functions.cuh
@@ -371,6 +371,7 @@ struct MurmurHash3_32 {
  using result_type = hash_value_type;
 
  CUDA_HOST_DEVICE_CALLABLE MurmurHash3_32() : m_seed(0) {}
+ CUDA_HOST_DEVICE_CALLABLE MurmurHash3_32(uint32_t seed) : m_seed(seed) {}  // caller-supplied seed; the default constructor uses 0
 
  CUDA_HOST_DEVICE_CALLABLE uint32_t rotl32(uint32_t x, int8_t r) const
  {
@@ -416,7 +417,7 @@ struct MurmurHash3_32 {
  hash_value_type CUDA_HOST_DEVICE_CALLABLE compute_floating_point(T const& key) const
  {
  if (key == T{0.0}) {
- return 0;
+ return compute(T{0.0});
  } else if (isnan(key)) {
  T nan = std::numeric_limits<T>::quiet_NaN();
  return compute(nan);

diff --git a/cpp/include/cudf/fixed_point/fixed_point.hpp b/cpp/include/cudf/fixed_point/fixed_point.hpp
@@ -515,6 +515,19 @@ class fixed_point {
  Rep const value = detail::shift<Rep, Rad>(_value, scale_type{scale - _scale});
  return fixed_point<Rep, Rad>{scaled_integer<Rep>{value, scale}};
  }
+
+ /**
+ * @brief Returns a base-10 string representation of the fixed_point value.
+ *
+ * For a negative scale the result has the form `[-]<integral>.<fraction>`, with the fraction
+ * zero-padded to exactly `-scale` digits: a stored value of 105 with scale -2 yields "1.05" and
+ * a stored value of -5 with scale -1 yields "-0.5". For a non-negative scale there is no
+ * fractional part and the stored value is followed by `scale` zeros: a stored value of 12 with
+ * scale 2 yields "1200".
+ *
+ * NOTE(review): like the previous implementation this assumes a base-10 radix — confirm whether
+ * base-2 fixed_point values can reach this conversion.
+ *
+ * @return The decimal string for this value
+ */
+ explicit operator std::string() const
+ {
+   auto const scale = static_cast<int32_t>(_scale);
+
+   // Non-negative scale: no fractional digits, only trailing zeros. Handling this case up front
+   // also avoids the division by zero the old `int n = std::pow(10, -scale)` produced here.
+   if (scale >= 0) { return std::to_string(_value) + std::string(scale, '0'); }
+
+   // 10^(-scale), accumulated in Rep so 64-bit representations are not truncated to int and no
+   // floating-point rounding is involved. Overflows if -scale exceeds the digits Rep can hold.
+   Rep divisor = 1;
+   for (int32_t i = 0; i < -scale; ++i) { divisor *= 10; }
+
+   // Format the magnitude and emit the sign separately. Otherwise values in (-1, 0) lose their
+   // sign (integral part is 0) and the '-' of a negative remainder is miscounted as a digit
+   // when computing the zero padding.
+   Rep const magnitude  = _value < 0 ? -_value : _value;
+   auto const fraction  = std::to_string(magnitude % divisor);
+   auto const num_zeros = -scale - static_cast<int32_t>(fraction.size());
+   auto const zeros     = num_zeros > 0 ? std::string(num_zeros, '0') : std::string{};
+   auto const sign      = _value < 0 ? std::string("-") : std::string{};
+   return sign + std::to_string(magnitude / divisor) + std::string(".") + zeros + fraction;
+ }
 }; // namespace numeric
 
 /** @brief Function that converts Rep to `std::string`

diff --git a/cpp/include/cudf/hashing.hpp b/cpp/include/cudf/hashing.hpp
@@ -39,6 +39,7 @@ std::unique_ptr<column> hash(
  table_view const& input,
  hash_id hash_function = hash_id::HASH_MURMUR3,
  std::vector<uint32_t> const& initial_hash = {},
+ uint32_t seed = 0,
  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 /** @} */ // end of group

diff --git a/cpp/include/cudf/io/parquet.hpp b/cpp/include/cudf/io/parquet.hpp
@@ -64,6 +64,10 @@ class parquet_reader_options {
  // Cast timestamp columns to a specific type
  data_type _timestamp_type{type_id::EMPTY};
 
+ // When true, reading a decimal type that cudf does not support raises an error
+ // instead of falling back to storing the values as doubles
+ bool _strict_decimal_types = false;
+
  /**
  * @brief Constructor from source info.
  *
@@ -130,6 +134,12 @@ class parquet_reader_options {
  */
  data_type get_timestamp_type() const { return _timestamp_type; }
 
+ /**
+ * @brief Returns whether strict decimal type checking is enabled.
+ *
+ * @return `true` if reading an unsupported decimal type is an error, `false` if such types
+ * are converted to double.
+ */
+ bool is_enabled_strict_decimal_types() const { return _strict_decimal_types; }
+
  /**
  * @brief Sets names of the columns to be read.
  *
@@ -199,6 +209,14 @@ class parquet_reader_options {
  * @param type The timestamp data_type to which all timestamp columns need to be cast.
  */
  void set_timestamp_type(data_type type) { _timestamp_type = type; }
+
+ /**
+ * @brief Enables/disables strict decimal type checking (disabled unless set).
+ *
+ * @param val If true, cudf will error if reading a decimal type that is unsupported. If false,
+ * cudf will convert unsupported types to double.
+ */
+ void set_strict_decimal_types(bool val) { _strict_decimal_types = val; }
 };
 
 class parquet_reader_options_builder {
@@ -303,6 +321,18 @@ class parquet_reader_options_builder {
  return *this;
  }
 
+ /**
+ * @brief Enables or disables strict decimal type checking on the options being built.
+ *
+ * @param val If true, reading an unsupported decimal type is an error; if false, no error is
+ * raised for unsupported decimal types.
+ * @return this for chaining.
+ */
+ parquet_reader_options_builder& use_strict_decimal_types(bool val)
+ {
+ options._strict_decimal_types = val;
+ return *this;
+ }
+
  /**
  * @brief move parquet_reader_options member once it's built.
  */

diff --git a/cpp/include/cudf/io/types.hpp b/cpp/include/cudf/io/types.hpp
@@ -126,15 +126,25 @@ struct table_metadata {
 };
 
 /**
- * @brief Derived class of table_metadata which includes nullability information per column of
- * input.
+ * @brief Derived class of table_metadata which includes flattened nullability information of input.
  *
  * This information is used as an optimization for chunked writes. If the caller leaves
  * column_nullable uninitialized, the writer code will assume the worst case : that all columns are
  * nullable.
  *
  * If the column_nullable field is not empty, it is expected that it has a length equal to the
- * number of columns in the table being written.
+ * number of columns in the flattened table being written.
+ *
+ * Flattening refers to the flattening of nested columns. A list column contributes one value to
+ * the nullability vector per nesting level, counting the leaf values as a level. For example, for
+ * a table of three columns of types {int, list<double>, float}, the nullability vector contains:
+ *
+ * |Index| Nullability of |
+ * |-----|----------------------------------------|
+ * | 0 | int column |
+ * | 1 | Level 0 of list column (list itself) |
+ * | 2 | Level 1 of list column (double values) |
+ * | 3 | float column |
  *
  * In the case where column nullability is known, pass `true` if the corresponding column could
  * contain nulls in one or more subtables to be written, otherwise `false`.

diff --git a/cpp/include/cudf/table/row_operators.cuh b/cpp/include/cudf/table/row_operators.cuh
@@ -401,6 +401,32 @@ class element_hasher {
  }
 };
 
+/**
+ * @brief Computes the hash value of a single column element using a seeded hash function.
+ *
+ * @tparam hash_function Hash functor template; it is instantiated as `hash_function<T>` for the
+ * dispatched element type and constructed from the seed.
+ * @tparam has_nulls Indicates the potential for null elements. When false, the null check is
+ * short-circuited and every element is hashed.
+ */
+template <template <typename> class hash_function, bool has_nulls = true>
+class element_hasher_with_seed {
+ public:
+  /**
+   * @brief Constructs a hasher from a seed and the value to return for null elements.
+   *
+   * This single constructor also serves as the default constructor. A separate zero-argument
+   * overload previously coexisted with these all-defaulted parameters, which made default
+   * construction ambiguous; the defaults below reproduce the values that overload used.
+   *
+   * @param seed Seed passed to `hash_function<T>` for every element
+   * @param null_hash Value returned in place of a hash for null elements
+   */
+  __device__ element_hasher_with_seed(
+    uint32_t seed = 0, hash_value_type null_hash = std::numeric_limits<hash_value_type>::max())
+    : _seed{seed}, _null_hash(null_hash)
+  {
+  }
+
+  /**
+   * @brief Hashes the element at `row_index` of `col`, interpreted as type `T`.
+   *
+   * @param col The column containing the element
+   * @param row_index Index of the element within `col`
+   * @return `null_hash` if nulls are possible and the element is null, otherwise the seeded
+   * hash of the element
+   */
+  template <typename T>
+  __device__ inline hash_value_type operator()(column_device_view col, size_type row_index)
+  {
+    if (has_nulls && col.is_null(row_index)) { return _null_hash; }
+
+    return hash_function<T>{_seed}(col.element<T>(row_index));
+  }
+
+ private:
+  uint32_t _seed;              ///< Seed handed to the hash function
+  hash_value_type _null_hash;  ///< Result reported for null elements
+};
+
 /**
  * @brief Computes the hash value of a row in the given table.
  *

diff --git a/cpp/include/cudf/types.hpp b/cpp/include/cudf/types.hpp
@@ -325,9 +325,10 @@ std::size_t size_of(data_type t);
  * @brief Identifies the hash function to be used
  */
 enum class hash_id {
- HASH_IDENTITY = 0, ///< Identity hash function that simply returns the key to be hashed
- HASH_MURMUR3, ///< Murmur3 hash function
- HASH_MD5 ///< MD5 hash function
+ HASH_IDENTITY = 0, ///< Identity hash function that simply returns the key to be hashed
+ HASH_MURMUR3, ///< Murmur3 hash function
+ HASH_MD5, ///< MD5 hash function
+ HASH_SERIAL_MURMUR3 ///< Serial Murmur3 hash function (each column's hash seeds the next column's hash)
 };
 
 /** @} */

diff --git a/cpp/include/cudf_test/column_wrapper.hpp b/cpp/include/cudf_test/column_wrapper.hpp
@@ -527,6 +527,55 @@ class fixed_point_column_wrapper : public detail::column_wrapper {
  : fixed_point_column_wrapper(std::cbegin(values), std::cend(values), scale)
  {
  }
+
+ /**
+ * @brief Builds a nullable fixed-point column from a range of representation values and a
+ * parallel sequence of validity flags.
+ *
+ * Element `i` of the column stores `begin[i]`. The flags are read from
+ * `[v, v + distance(begin,end))` and interpreted as Booleans: when `v[i] == true` element `i`
+ * is valid, otherwise it is null.
+ *
+ * Example:
+ * @code{.cpp}
+ * // Creates a nullable column of DECIMAL32 elements with 5 elements:
+ * // {null, 100, null, 300, null}
+ * auto elements = make_counting_transform_iterator(0, [](auto i){ return i; });
+ * auto validity = make_counting_transform_iterator(0, [](auto i){ return i%2; });
+ * fixed_point_column_wrapper<int32_t> w(elements, elements + 5, validity, 2);
+ * @endcode
+ *
+ * Note: as with `std::vector`, call this "range" constructor with parentheses `()` rather than
+ * braces `{}`; braces should be reserved for the `initializer_list` constructors.
+ *
+ * @param begin The beginning of the sequence of elements
+ * @param end The end of the sequence of elements
+ * @param v The beginning of the sequence of validity indicators
+ * @param scale The scale of the elements in the column
+ */
+ template <typename FixedPointRepIterator, typename ValidityIterator>
+ fixed_point_column_wrapper(FixedPointRepIterator begin,
+                            FixedPointRepIterator end,
+                            ValidityIterator v,
+                            numeric::scale_type scale)
+   : column_wrapper{}
+ {
+   CUDF_EXPECTS(numeric::is_supported_representation_type<Rep>(), "not valid representation type");
+
+   auto const num_elements = cudf::distance(begin, end);
+   // Stage the values on the host so they can be copied to the device in one transfer.
+   auto const host_values = thrust::host_vector<Rep>(begin, end);
+
+   // An int32_t representation maps to DECIMAL32; any other representation maps to DECIMAL64.
+   auto const decimal_id =
+     std::is_same<Rep, int32_t>::value ? type_id::DECIMAL32 : type_id::DECIMAL64;
+   auto const column_type = cudf::data_type{decimal_id, static_cast<int32_t>(scale)};
+
+   // The null count is left unknown rather than being counted here.
+   wrapped.reset(
+     new cudf::column{column_type,
+                      num_elements,
+                      rmm::device_buffer{host_values.data(), num_elements * sizeof(Rep)},
+                      detail::make_null_mask(v, v + num_elements),
+                      cudf::UNKNOWN_NULL_COUNT});
+ }
 };
 
 /**

diff --git a/cpp/src/hash/hashing.cu b/cpp/src/hash/hashing.cu
@@ -28,6 +28,7 @@
 #include <cudf/types.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include "thrust/detail/seq.h"
 
 namespace cudf {
 namespace {
@@ -45,12 +46,14 @@ namespace detail {
 // Dispatches to the hashing implementation selected by `hash_function`. `initial_hash` is
 // forwarded only to murmur_hash3_32 and `seed` only to serial_murmur_hash3_32; md5_hash
 // receives neither.
 std::unique_ptr<column> hash(table_view const& input,
  hash_id hash_function,
  std::vector<uint32_t> const& initial_hash,
+ uint32_t seed,
  rmm::cuda_stream_view stream,
  rmm::mr::device_memory_resource* mr)
 {
  switch (hash_function) {
  case (hash_id::HASH_MURMUR3): return murmur_hash3_32(input, initial_hash, stream, mr);
  case (hash_id::HASH_MD5): return md5_hash(input, stream, mr);
+ case (hash_id::HASH_SERIAL_MURMUR3): return serial_murmur_hash3_32(input, seed, stream, mr);
  default: return nullptr;  // NOTE(review): any other hash_id silently yields a null pointer
  }
 }
@@ -119,6 +122,60 @@ std::unique_ptr<column> md5_hash(table_view const& input,
  mr);
 }
 
+/**
+ * @brief Hashes every row of `input` with Murmur3, feeding each column's hash into the next.
+ *
+ * Per row, the running hash starts at `seed`. Columns are visited sequentially; each element is
+ * hashed by a hasher seeded with the running hash, and the result replaces the running hash.
+ * The running hash is also passed as the hasher's null value, so a null element leaves it
+ * unchanged.
+ *
+ * @param input Table whose rows are hashed
+ * @param seed Starting value of the running hash for every row
+ * @param stream CUDA stream on which allocations and kernels are issued
+ * @param mr Device memory resource used for the returned column
+ * @return INT32 column (no null mask) holding one hash per input row; when the table has no
+ * rows or no columns the column is returned without any hashes being computed
+ */
+std::unique_ptr<column> serial_murmur_hash3_32(table_view const& input,
+                                               uint32_t seed,
+                                               rmm::cuda_stream_view stream,
+                                               rmm::mr::device_memory_resource* mr)
+{
+  auto result = make_numeric_column(
+    data_type(type_id::INT32), input.num_rows(), mask_state::UNALLOCATED, stream, mr);
+
+  // Nothing to hash: hand back the freshly allocated column as is.
+  if (input.num_rows() == 0 || input.num_columns() == 0) { return result; }
+
+  auto const d_table = table_device_view::create(input, stream);
+  auto result_view   = result->mutable_view();
+  auto const first   = result_view.begin<int32_t>();
+  auto const last    = result_view.end<int32_t>();
+
+  // The two branches differ only in the hasher's compile-time has_nulls flag, which lets the
+  // null check be skipped when the table contains no nulls.
+  if (has_nulls(input)) {
+    thrust::tabulate(
+      rmm::exec_policy(stream)->on(stream.value()),
+      first,
+      last,
+      [table = *d_table, seed] __device__(auto row_index) {
+        // Sequential fold over the row's columns, starting from the seed.
+        return thrust::reduce(
+          thrust::seq,
+          table.begin(),
+          table.end(),
+          seed,
+          [row = row_index] __device__(auto running_hash, auto col) {
+            return cudf::type_dispatcher(
+              col.type(),
+              element_hasher_with_seed<MurmurHash3_32, true>{running_hash, running_hash},
+              col,
+              row);
+          });
+      });
+  } else {
+    thrust::tabulate(
+      rmm::exec_policy(stream)->on(stream.value()),
+      first,
+      last,
+      [table = *d_table, seed] __device__(auto row_index) {
+        // Sequential fold over the row's columns, starting from the seed.
+        return thrust::reduce(
+          thrust::seq,
+          table.begin(),
+          table.end(),
+          seed,
+          [row = row_index] __device__(auto running_hash, auto col) {
+            return cudf::type_dispatcher(
+              col.type(),
+              element_hasher_with_seed<MurmurHash3_32, false>{running_hash, running_hash},
+              col,
+              row);
+          });
+      });
+  }
+
+  return result;
+}
+
 std::unique_ptr<column> murmur_hash3_32(table_view const& input,
  std::vector<uint32_t> const& initial_hash,
  rmm::cuda_stream_view stream,
@@ -176,10 +233,11 @@ std::unique_ptr<column> murmur_hash3_32(table_view const& input,
 // Public entry point: forwards every argument to detail::hash, running on
 // rmm::cuda_stream_default.
 std::unique_ptr<column> hash(table_view const& input,
  hash_id hash_function,
  std::vector<uint32_t> const& initial_hash,
+ uint32_t seed,
  rmm::mr::device_memory_resource* mr)
 {
  CUDF_FUNC_RANGE();
- return detail::hash(input, hash_function, initial_hash, rmm::cuda_stream_default, mr);
+ return detail::hash(input, hash_function, initial_hash, seed, rmm::cuda_stream_default, mr);
 }
 
 std::unique_ptr<column> murmur_hash3_32(table_view const& input,