Skip to content

Commit

Permalink
Verify that concatenating columns does not overflow size_type(#6809)
Browse files Browse the repository at this point in the history
When concatenating large columns, it is possible to overflow the max size of a column (size_type).  We need to verify this won't happen ahead of time.   Previously, we were doing it for non-nested types, and only in the fused kernel concatenation case.   This addresses both issues.

Authors:
  - Dave Baranec <[email protected]>

Approvers:
  - Paul Taylor
  - Ram (Ramakrishna Prabhu)
  - Christopher Harris

URL: #6809
  • Loading branch information
nvdbaranec committed Dec 2, 2020
1 parent bd537b6 commit 0ddba3d
Show file tree
Hide file tree
Showing 3 changed files with 164 additions and 8 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,7 @@
- PR #6776 Use `void` return type for kernel wrapper functions instead of returning `cudaError_t`
- PR #6786 Add nested type support to ColumnVector#getDeviceMemorySize
- PR #6780 Move `cudf::cast` tests to separate test file
- PR #6809 size_type overflow checking when concatenating columns
- PR #6789 Rename `unary_op` to `unary_operator`
- PR #6770 Support building decimal columns with Table.TestBuilder
- PR #6800 Push DeviceScalar to cython-only
Expand Down
75 changes: 67 additions & 8 deletions cpp/src/copying/concatenate.cu
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@

namespace cudf {
namespace detail {

// From benchmark data, the fused kernel optimization appears to perform better
// when there are more than a trivial number of columns, or when the null mask
// can also be computed at the same time
Expand Down Expand Up @@ -335,26 +336,82 @@ std::unique_ptr<column> concatenate_dispatch::operator()<cudf::struct_view>()
return cudf::structs::detail::concatenate(views, stream, mr);
}

namespace {

/**
* @brief Verifies that the sum of the sizes of all the columns to be concatenated
* will not exceed the max value of size_type, and verifies all column types match
*
* @param begin Beginning of range of columns to check
* @param end End of range of columns to check
*
* @throws cudf::logic_error if the total length of the concatenated columns would
* exceed the max value of size_type
*
* @throws cudf::logic_error if all of the input column types don't match
*/
template <typename ColIter>
void bounds_and_type_check(ColIter begin, ColIter end)
{
CUDF_EXPECTS(std::all_of(begin,
end,
[expected_type = (*begin).type()](auto const& c) {
return c.type() == expected_type;
}),
"Type mismatch in columns to concatenate.");

// total size of all concatenated rows
size_t const total_row_count =
std::accumulate(begin, end, std::size_t{}, [](size_t a, auto const& b) {
return a + static_cast<size_t>(b.size());
});
CUDF_EXPECTS(total_row_count <= std::numeric_limits<size_type>::max(),
"Total number of concatenated rows exceeds size_type range");

// march each child
auto child_iter = thrust::make_counting_iterator(0);
auto const num_children = (*begin).num_children();
std::for_each(child_iter, child_iter + num_children, [&](auto child_index) {
std::vector<column_view> nth_children;
nth_children.reserve(std::distance(begin, end));

// we cannot do this via a transform iterator + std::copy_if because some columns
// can have no children if they are empty. so if we had 3 input string columns
// and 1 of them was empty, 2 of them would have 2 children, and 1 of them would have
// 0 children. so it is not safe to index col.child() directly.
std::for_each(begin, end, [child_index, &nth_children](column_view const& col) {
if (col.num_children() <= child_index) {
CUDF_EXPECTS(col.num_children() == 0,
"Encountered a child column with an unexpected # of children");
} else {
nth_children.push_back(col.child(child_index));
}
});

bounds_and_type_check(nth_children.begin(), nth_children.end());
});
}

} // anonymous namespace

// Concatenates the elements from a vector of column_views
std::unique_ptr<column> concatenate(std::vector<column_view> const& columns_to_concat,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
CUDF_EXPECTS(not columns_to_concat.empty(), "Unexpected empty list of columns to concatenate.");

data_type const type = columns_to_concat.front().type();
CUDF_EXPECTS(std::all_of(columns_to_concat.begin(),
columns_to_concat.end(),
[&type](auto const& c) { return c.type() == type; }),
"Type mismatch in columns to concatenate.");
// verify all types match and that we won't overflow size_type in output size
bounds_and_type_check(columns_to_concat.begin(), columns_to_concat.end());

if (std::all_of(columns_to_concat.begin(), columns_to_concat.end(), [](column_view const& c) {
return c.is_empty();
})) {
return empty_like(columns_to_concat.front());
}

return type_dispatcher(type, concatenate_dispatch{columns_to_concat, stream, mr});
return type_dispatcher(columns_to_concat.front().type(),
concatenate_dispatch{columns_to_concat, stream, mr});
}

std::unique_ptr<table> concatenate(std::vector<table_view> const& tables_to_concat,
Expand All @@ -367,8 +424,7 @@ std::unique_ptr<table> concatenate(std::vector<table_view> const& tables_to_conc
CUDF_EXPECTS(std::all_of(tables_to_concat.cbegin(),
tables_to_concat.cend(),
[&first_table](auto const& t) {
return t.num_columns() == first_table.num_columns() &&
have_same_types(first_table, t);
return t.num_columns() == first_table.num_columns();
}),
"Mismatch in table columns to concatenate.");

Expand All @@ -379,6 +435,9 @@ std::unique_ptr<table> concatenate(std::vector<table_view> const& tables_to_conc
tables_to_concat.cend(),
std::back_inserter(cols),
[i](auto const& t) { return t.column(i); });

// verify all types match and that we won't overflow size_type in output size
bounds_and_type_check(cols.begin(), cols.end());
concat_columns.emplace_back(detail::concatenate(cols, stream, mr));
}
return std::make_unique<table>(std::move(concat_columns));
Expand Down
96 changes: 96 additions & 0 deletions cpp/tests/copying/concatenate_tests.cu
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@
#include <cudf/table/table.hpp>
#include <string>

#include <rmm/device_uvector.hpp>

#include <thrust/iterator/counting_iterator.h>
#include <thrust/sequence.h>

Expand Down Expand Up @@ -326,6 +328,100 @@ TEST_F(TableTest, ConcatenateTablesWithOffsetsAndNulls)
}
}

TEST_F(TableTest, SizeOverflowTest)
{
// primitive column
{
constexpr cudf::size_type size =
static_cast<cudf::size_type>(static_cast<uint32_t>(1024) * 1024 * 1024);

// try and concatenate 6 char columns of size 1 billion each
auto many_chars = cudf::make_fixed_width_column(cudf::data_type{cudf::type_id::INT8}, size);

cudf::table_view tbl({*many_chars});
EXPECT_THROW(cudf::concatenate({tbl, tbl, tbl, tbl, tbl, tbl}), cudf::logic_error);
}

// string column, overflow on chars
{
constexpr cudf::size_type size =
static_cast<cudf::size_type>(static_cast<uint32_t>(1024) * 1024 * 1024);

// try and concatenate 6 string columns of with 1 billion chars in each
auto offsets = cudf::test::fixed_width_column_wrapper<int>{0, size};
auto many_chars = cudf::make_fixed_width_column(cudf::data_type{cudf::type_id::INT8}, size);
auto col = cudf::make_strings_column(
1, offsets.release(), std::move(many_chars), 0, rmm::device_buffer{0});

cudf::table_view tbl({*col});
EXPECT_THROW(cudf::concatenate({tbl, tbl, tbl, tbl, tbl, tbl}), cudf::logic_error);
}

// string column, overflow on offsets (rows)
{
constexpr cudf::size_type size =
static_cast<cudf::size_type>(static_cast<uint32_t>(1024) * 1024 * 1024);

// try and concatenate 6 string columns 1 billion rows each
auto many_offsets =
cudf::make_fixed_width_column(cudf::data_type{cudf::type_id::INT32}, size + 1);
auto chars = cudf::test::fixed_width_column_wrapper<int8_t>{0, 1, 2};
auto col = cudf::make_strings_column(
size, std::move(many_offsets), chars.release(), 0, rmm::device_buffer{0});

cudf::table_view tbl({*col});
EXPECT_THROW(cudf::concatenate({tbl, tbl, tbl, tbl, tbl, tbl}), cudf::logic_error);
}

// list<struct>, structs too long
{
constexpr cudf::size_type inner_size =
static_cast<cudf::size_type>(static_cast<uint32_t>(512) * 1024 * 1024);

// struct
std::vector<std::unique_ptr<column>> children;
children.push_back(
cudf::make_fixed_width_column(cudf::data_type{cudf::type_id::INT8}, inner_size));
children.push_back(
cudf::make_fixed_width_column(cudf::data_type{cudf::type_id::INT8}, inner_size));
auto struct_col =
cudf::make_structs_column(inner_size, std::move(children), 0, rmm::device_buffer{0});

// list
auto offsets = cudf::test::fixed_width_column_wrapper<int>{0, inner_size};
auto col = cudf::make_lists_column(
1, offsets.release(), std::move(struct_col), 0, rmm::device_buffer{0});

cudf::table_view tbl({*col});
EXPECT_THROW(cudf::concatenate({tbl, tbl, tbl, tbl, tbl, tbl, tbl, tbl, tbl, tbl, tbl, tbl}),
cudf::logic_error);
}

// struct<int, list>, list child too long
{
constexpr cudf::size_type inner_size =
static_cast<cudf::size_type>(static_cast<uint32_t>(512) * 1024 * 1024);
constexpr cudf::size_type size = 3;

// list
auto offsets = cudf::test::fixed_width_column_wrapper<int>{0, 0, 0, inner_size};
auto many_chars =
cudf::make_fixed_width_column(cudf::data_type{cudf::type_id::INT8}, inner_size);
auto list_col = cudf::make_lists_column(
3, offsets.release(), std::move(many_chars), 0, rmm::device_buffer{0});

// struct
std::vector<std::unique_ptr<column>> children;
children.push_back(cudf::make_fixed_width_column(cudf::data_type{cudf::type_id::INT32}, size));
children.push_back(std::move(list_col));
auto col = cudf::make_structs_column(size, std::move(children), 0, rmm::device_buffer{0});

cudf::table_view tbl({*col});
EXPECT_THROW(cudf::concatenate({tbl, tbl, tbl, tbl, tbl, tbl, tbl, tbl, tbl, tbl, tbl, tbl}),
cudf::logic_error);
}
}

struct StructsColumnTest : public cudf::test::BaseFixture {
};

Expand Down

0 comments on commit 0ddba3d

Please sign in to comment.