Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[REVIEW] Verify that concatenating columns does not overflow size_type #6809

Merged
merged 6 commits into from
Dec 2, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,7 @@
- PR #6776 Use `void` return type for kernel wrapper functions instead of returning `cudaError_t`
- PR #6786 Add nested type support to ColumnVector#getDeviceMemorySize
- PR #6780 Move `cudf::cast` tests to separate test file
- PR #6809 size_type overflow checking when concatenating columns
- PR #6789 Rename `unary_op` to `unary_operator`
- PR #6770 Support building decimal columns with Table.TestBuilder
- PR #6822 Split out `cudf::distinct_count` from `drop_duplicates.cu`
Expand Down
75 changes: 67 additions & 8 deletions cpp/src/copying/concatenate.cu
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@

namespace cudf {
namespace detail {

// From benchmark data, the fused kernel optimization appears to perform better
// when there are more than a trivial number of columns, or when the null mask
// can also be computed at the same time
Expand Down Expand Up @@ -335,26 +336,82 @@ std::unique_ptr<column> concatenate_dispatch::operator()<cudf::struct_view>()
return cudf::structs::detail::concatenate(views, stream, mr);
}

namespace {

/**
* @brief Verifies that the sum of the sizes of all the columns to be concatenated
* will not exceed the max value of size_type, and verifies all column types match
*
* @param begin Beginning of range of columns to check
* @param end End of range of columns to check
*
* @throws cudf::logic_error if the total length of the concatenated columns would
* exceed the max value of size_type
*
* @throws cudf::logic_error if all of the input column types don't match
*/
template <typename ColIter>
void bounds_and_type_check(ColIter begin, ColIter end)
{
CUDF_EXPECTS(std::all_of(begin,
end,
[expected_type = (*begin).type()](auto const& c) {
return c.type() == expected_type;
}),
"Type mismatch in columns to concatenate.");

// total size of all concatenated rows
size_t const total_row_count =
std::accumulate(begin, end, std::size_t{}, [](size_t a, auto const& b) {
return a + static_cast<size_t>(b.size());
});
CUDF_EXPECTS(total_row_count <= std::numeric_limits<size_type>::max(),
"Total number of concatenated rows exceeds size_type range");

// march each child
auto child_iter = thrust::make_counting_iterator(0);
auto const num_children = (*begin).num_children();
std::for_each(child_iter, child_iter + num_children, [&](auto child_index) {
std::vector<column_view> nth_children;
nth_children.reserve(std::distance(begin, end));

// we cannot do this via a transform iterator + std::copy_if because some columns
// can have no children if they are empty. so if we had 3 input string columns
// and 1 of them was empty, 2 of them would have 2 children, and 1 of them would have
// 0 children. so it is not safe to index col.child() directly.
std::for_each(begin, end, [child_index, &nth_children](column_view const& col) {
if (col.num_children() <= child_index) {
CUDF_EXPECTS(col.num_children() == 0,
"Encountered a child column with an unexpected # of children");
} else {
nth_children.push_back(col.child(child_index));
}
});

bounds_and_type_check(nth_children.begin(), nth_children.end());
});
}

} // anonymous namespace

// Concatenates the elements from a vector of column_views
std::unique_ptr<column> concatenate(std::vector<column_view> const& columns_to_concat,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
CUDF_EXPECTS(not columns_to_concat.empty(), "Unexpected empty list of columns to concatenate.");

data_type const type = columns_to_concat.front().type();
CUDF_EXPECTS(std::all_of(columns_to_concat.begin(),
columns_to_concat.end(),
[&type](auto const& c) { return c.type() == type; }),
"Type mismatch in columns to concatenate.");
// verify all types match and that we won't overflow size_type in output size
bounds_and_type_check(columns_to_concat.begin(), columns_to_concat.end());

if (std::all_of(columns_to_concat.begin(), columns_to_concat.end(), [](column_view const& c) {
return c.is_empty();
})) {
return empty_like(columns_to_concat.front());
}

return type_dispatcher(type, concatenate_dispatch{columns_to_concat, stream, mr});
return type_dispatcher(columns_to_concat.front().type(),
concatenate_dispatch{columns_to_concat, stream, mr});
}

std::unique_ptr<table> concatenate(std::vector<table_view> const& tables_to_concat,
Expand All @@ -367,8 +424,7 @@ std::unique_ptr<table> concatenate(std::vector<table_view> const& tables_to_conc
CUDF_EXPECTS(std::all_of(tables_to_concat.cbegin(),
tables_to_concat.cend(),
[&first_table](auto const& t) {
return t.num_columns() == first_table.num_columns() &&
have_same_types(first_table, t);
return t.num_columns() == first_table.num_columns();
}),
"Mismatch in table columns to concatenate.");

Expand All @@ -379,6 +435,9 @@ std::unique_ptr<table> concatenate(std::vector<table_view> const& tables_to_conc
tables_to_concat.cend(),
std::back_inserter(cols),
[i](auto const& t) { return t.column(i); });

// verify all types match and that we won't overflow size_type in output size
bounds_and_type_check(cols.begin(), cols.end());
concat_columns.emplace_back(detail::concatenate(cols, stream, mr));
}
return std::make_unique<table>(std::move(concat_columns));
Expand Down
96 changes: 96 additions & 0 deletions cpp/tests/copying/concatenate_tests.cu
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@
#include <cudf/table/table.hpp>
#include <string>

#include <rmm/device_uvector.hpp>

#include <thrust/iterator/counting_iterator.h>
#include <thrust/sequence.h>

Expand Down Expand Up @@ -326,6 +328,100 @@ TEST_F(TableTest, ConcatenateTablesWithOffsetsAndNulls)
}
}

TEST_F(TableTest, SizeOverflowTest)
{
// primitive column
{
constexpr cudf::size_type size =
static_cast<cudf::size_type>(static_cast<uint32_t>(1024) * 1024 * 1024);

// try and concatenate 6 char columns of size 1 billion each
auto many_chars = cudf::make_fixed_width_column(cudf::data_type{cudf::type_id::INT8}, size);

cudf::table_view tbl({*many_chars});
EXPECT_THROW(cudf::concatenate({tbl, tbl, tbl, tbl, tbl, tbl}), cudf::logic_error);
}

// string column, overflow on chars
{
constexpr cudf::size_type size =
static_cast<cudf::size_type>(static_cast<uint32_t>(1024) * 1024 * 1024);

// try and concatenate 6 string columns of with 1 billion chars in each
auto offsets = cudf::test::fixed_width_column_wrapper<int>{0, size};
auto many_chars = cudf::make_fixed_width_column(cudf::data_type{cudf::type_id::INT8}, size);
auto col = cudf::make_strings_column(
1, offsets.release(), std::move(many_chars), 0, rmm::device_buffer{0});

cudf::table_view tbl({*col});
EXPECT_THROW(cudf::concatenate({tbl, tbl, tbl, tbl, tbl, tbl}), cudf::logic_error);
}

// string column, overflow on offsets (rows)
{
constexpr cudf::size_type size =
static_cast<cudf::size_type>(static_cast<uint32_t>(1024) * 1024 * 1024);

// try and concatenate 6 string columns 1 billion rows each
auto many_offsets =
cudf::make_fixed_width_column(cudf::data_type{cudf::type_id::INT32}, size + 1);
auto chars = cudf::test::fixed_width_column_wrapper<int8_t>{0, 1, 2};
auto col = cudf::make_strings_column(
size, std::move(many_offsets), chars.release(), 0, rmm::device_buffer{0});

cudf::table_view tbl({*col});
EXPECT_THROW(cudf::concatenate({tbl, tbl, tbl, tbl, tbl, tbl}), cudf::logic_error);
}

// list<struct>, structs too long
{
constexpr cudf::size_type inner_size =
static_cast<cudf::size_type>(static_cast<uint32_t>(512) * 1024 * 1024);

// struct
std::vector<std::unique_ptr<column>> children;
children.push_back(
cudf::make_fixed_width_column(cudf::data_type{cudf::type_id::INT8}, inner_size));
children.push_back(
cudf::make_fixed_width_column(cudf::data_type{cudf::type_id::INT8}, inner_size));
auto struct_col =
cudf::make_structs_column(inner_size, std::move(children), 0, rmm::device_buffer{0});

// list
auto offsets = cudf::test::fixed_width_column_wrapper<int>{0, inner_size};
auto col = cudf::make_lists_column(
1, offsets.release(), std::move(struct_col), 0, rmm::device_buffer{0});

cudf::table_view tbl({*col});
EXPECT_THROW(cudf::concatenate({tbl, tbl, tbl, tbl, tbl, tbl, tbl, tbl, tbl, tbl, tbl, tbl}),
cudf::logic_error);
}

// struct<int, list>, list child too long
{
constexpr cudf::size_type inner_size =
static_cast<cudf::size_type>(static_cast<uint32_t>(512) * 1024 * 1024);
constexpr cudf::size_type size = 3;

// list
auto offsets = cudf::test::fixed_width_column_wrapper<int>{0, 0, 0, inner_size};
auto many_chars =
cudf::make_fixed_width_column(cudf::data_type{cudf::type_id::INT8}, inner_size);
auto list_col = cudf::make_lists_column(
3, offsets.release(), std::move(many_chars), 0, rmm::device_buffer{0});

// struct
std::vector<std::unique_ptr<column>> children;
children.push_back(cudf::make_fixed_width_column(cudf::data_type{cudf::type_id::INT32}, size));
children.push_back(std::move(list_col));
auto col = cudf::make_structs_column(size, std::move(children), 0, rmm::device_buffer{0});

cudf::table_view tbl({*col});
EXPECT_THROW(cudf::concatenate({tbl, tbl, tbl, tbl, tbl, tbl, tbl, tbl, tbl, tbl, tbl, tbl}),
cudf::logic_error);
}
}

struct StructsColumnTest : public cudf::test::BaseFixture {
};

Expand Down