Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve performance of casting StringView/BinaryView to DictionaryArray #5872

Merged
merged 10 commits into from
Jun 13, 2024
Next Next commit
zero-copy dict to view
  • Loading branch information
XiangpengHao committed Jun 11, 2024
commit 63db2753df4026947349573d01710be8de0d8175
87 changes: 87 additions & 0 deletions arrow-cast/src/cast/dictionary.rs
Original file line number Diff line number Diff line change
Expand Up @@ -85,10 +85,97 @@ pub(crate) fn dictionary_cast<K: ArrowDictionaryKeyType>(

Ok(new_array)
}
Utf8View => {
// `unpack_dictionary` can handle Utf8View/BinaryView types, but incurs unnecessary data copy of the value buffer.
// we handle it here to avoid the copy.
let dict_array = array
.as_dictionary::<K>()
.downcast_dict::<StringArray>()
.unwrap();

let string_values = dict_array.values();
let value_offsets = string_values.value_offsets();
let value_buffer = string_values.values().clone();

let view_buffer =
view_from_dict_values(value_offsets, &value_buffer, dict_array.keys());

// Safety:
// the buffer is from StringArray which is utf8.
let string_view = unsafe {
StringViewArray::new_unchecked(
view_buffer,
vec![value_buffer],
dict_array.nulls().cloned(),
)
};
Ok(Arc::new(string_view))
}
BinaryView => {
// `unpack_dictionary` can handle Utf8View/BinaryView types, but incurs unnecessary data copy of the value buffer.
// we handle it here to avoid the copy.
let dict_array = array
.as_dictionary::<K>()
.downcast_dict::<BinaryArray>()
.unwrap();

let binary_values = dict_array.values();
let value_offsets = binary_values.value_offsets();
let value_buffer = binary_values.values().clone();

let view_buffer =
view_from_dict_values(value_offsets, &value_buffer, dict_array.keys());
let binary_view = unsafe {
BinaryViewArray::new_unchecked(
view_buffer,
vec![value_buffer],
dict_array.nulls().cloned(),
)
};
Ok(Arc::new(binary_view))
}
_ => unpack_dictionary::<K>(array, to_type, cast_options),
}
}

fn view_from_dict_values<K: ArrowDictionaryKeyType>(
value_offsets: &[i32],
value_buffer: &arrow_buffer::Buffer,
keys: &PrimitiveArray<K>,
) -> ScalarBuffer<u128> {
let mut view_builder = BufferBuilder::<u128>::new(keys.len());
for i in keys.iter() {
match i {
Some(v) => {
let idx = v.to_usize().unwrap();
let offset = value_offsets[idx];
let end = value_offsets[idx + 1];
let length = end - offset;
let value_buf = &value_buffer[offset as usize..end as usize];

if length <= 12 {
let mut view_buffer = [0; 16];
view_buffer[0..4].copy_from_slice(&length.to_le_bytes());
view_buffer[4..4 + value_buf.len()].copy_from_slice(value_buf);
view_builder.append(u128::from_le_bytes(view_buffer));
} else {
let view = ByteView {
length: length as u32,
prefix: u32::from_le_bytes(value_buf[0..4].try_into().unwrap()),
buffer_index: 0,
offset: offset as u32,
};
view_builder.append(view.into());
}
}
None => {
view_builder.append_n_zeroed(1);
}
}
}
ScalarBuffer::new(view_builder.finish(), 0, keys.len())
}

// Unpack a dictionary where the keys are of type <K> into a flattened array of type to_type
pub(crate) fn unpack_dictionary<K>(
array: &dyn Array,
Expand Down
66 changes: 34 additions & 32 deletions arrow-cast/src/cast/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5203,19 +5203,19 @@ mod tests {
_test_string_to_view::<i64>();
}

const VIEW_TEST_DATA: [Option<&str>; 5] = [
Some("hello"),
Some("world"),
None,
Some("large payload over 12 bytes"),
Some("lulu"),
];

fn _test_string_to_view<O>()
where
O: OffsetSizeTrait,
{
let data = vec![
Some("hello"),
Some("world"),
None,
Some("large payload over 12 bytes"),
Some("lulu"),
];

let string_array = GenericStringArray::<O>::from(data.clone());
let string_array = GenericStringArray::<O>::from_iter(VIEW_TEST_DATA);

assert!(can_cast_types(
string_array.data_type(),
Expand All @@ -5225,7 +5225,7 @@ mod tests {
let string_view_array = cast(&string_array, &DataType::Utf8View).unwrap();
assert_eq!(string_view_array.data_type(), &DataType::Utf8View);

let expect_string_view_array = StringViewArray::from(data);
let expect_string_view_array = StringViewArray::from_iter(VIEW_TEST_DATA);
assert_eq!(string_view_array.as_ref(), &expect_string_view_array);
}

Expand All @@ -5239,15 +5239,7 @@ mod tests {
where
O: OffsetSizeTrait,
{
let data: Vec<Option<&[u8]>> = vec![
Some(b"hello"),
Some(b"world"),
None,
Some(b"large payload over 12 bytes"),
Some(b"lulu"),
];

let binary_array = GenericBinaryArray::<O>::from(data.clone());
let binary_array = GenericBinaryArray::<O>::from_iter(VIEW_TEST_DATA);

assert!(can_cast_types(
binary_array.data_type(),
Expand All @@ -5257,10 +5249,30 @@ mod tests {
let binary_view_array = cast(&binary_array, &DataType::BinaryView).unwrap();
assert_eq!(binary_view_array.data_type(), &DataType::BinaryView);

let expect_binary_view_array = BinaryViewArray::from(data);
let expect_binary_view_array = BinaryViewArray::from_iter(VIEW_TEST_DATA);
assert_eq!(binary_view_array.as_ref(), &expect_binary_view_array);
}

#[test]
fn test_dict_to_view() {
let string_view_array = StringViewArray::from_iter(VIEW_TEST_DATA);
let string_dict_array: DictionaryArray<Int8Type> = VIEW_TEST_DATA.into_iter().collect();
let expected_string_array_type = string_view_array.data_type();
let casted_string_array = cast(&string_dict_array, expected_string_array_type).unwrap();
assert_eq!(casted_string_array.data_type(), expected_string_array_type);
assert_eq!(casted_string_array.as_ref(), &string_view_array);

let binary_view_array = BinaryViewArray::from_iter(VIEW_TEST_DATA);
let binary_dict_array = string_dict_array.downcast_dict::<StringArray>().unwrap();
let binary_buffer = cast(&binary_dict_array.values(), &DataType::Binary).unwrap();
let binary_dict_array =
DictionaryArray::<Int8Type>::new(binary_dict_array.keys().clone(), binary_buffer);
let expected_binary_array_type = binary_view_array.data_type();
let casted_binary_array = cast(&binary_dict_array, expected_binary_array_type).unwrap();
assert_eq!(casted_binary_array.data_type(), expected_binary_array_type);
assert_eq!(casted_binary_array.as_ref(), &binary_view_array);
}

#[test]
fn test_view_to_string() {
_test_view_to_string::<i32>();
Expand All @@ -5271,24 +5283,15 @@ mod tests {
where
O: OffsetSizeTrait,
{
let data: Vec<Option<&str>> = vec![
Some("hello"),
Some("world"),
None,
Some("large payload over 12 bytes"),
Some("lulu"),
];

let view_array = {
// ["hello", "world", null, "large payload over 12 bytes", "lulu"]
let mut builder = StringViewBuilder::new().with_block_size(8); // multiple buffers.
for s in data.iter() {
for s in VIEW_TEST_DATA.iter() {
builder.append_option(*s);
}
builder.finish()
};

let expected_string_array = GenericStringArray::<O>::from(data);
let expected_string_array = GenericStringArray::<O>::from_iter(VIEW_TEST_DATA);
let expected_type = expected_string_array.data_type();

assert!(can_cast_types(view_array.data_type(), expected_type));
Expand Down Expand Up @@ -5318,7 +5321,6 @@ mod tests {
];

let view_array = {
// ["hello", "world", null, "large payload over 12 bytes", "lulu"]
let mut builder = BinaryViewBuilder::new().with_block_size(8); // multiple buffers.
for s in data.iter() {
builder.append_option(*s);
Expand Down
Loading