Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve performance of casting StringView/BinaryView to DictionaryArray #5872

Merged
merged 10 commits into from
Jun 13, 2024
47 changes: 32 additions & 15 deletions arrow-array/src/builder/generic_bytes_view_builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,36 @@ impl<T: ByteViewType + ?Sized> GenericByteViewBuilder<T> {
offset as u32
}

/// Appends a non-null view referencing `len` bytes at `offset` inside the
/// completed buffer identified by `block`, without validating any argument.
///
/// Values of at most 12 bytes are stored inline in the view (little-endian
/// length followed by the bytes, zero padded to 16 bytes). Longer values are
/// stored as a [`ByteView`] holding the length, the first four bytes as a
/// prefix, the block index and the offset.
///
/// # Safety
/// The caller must guarantee all of the following:
/// (1) `block` was returned by [`Self::append_block`]
/// (2) The range `offset..offset + len` lies within the bounds of that block
/// (3) The bytes in that range are valid data for type `T`
pub unsafe fn append_view_unchecked(&mut self, block: u32, offset: u32, len: u32) {
    let begin = offset as usize;
    let finish = begin.saturating_add(len as usize);
    // No bounds checks here: both lookups rely on the caller's safety contract.
    let bytes = self
        .completed
        .get_unchecked(block as usize)
        .get_unchecked(begin..finish);

    let raw_view: u128 = if len > 12 {
        // Too long to inline: keep a 4-byte prefix and point at the block.
        ByteView {
            length: len,
            prefix: u32::from_le_bytes(bytes[0..4].try_into().unwrap()),
            buffer_index: block,
            offset,
        }
        .into()
    } else {
        // Short value: length in the first 4 bytes, data right after it.
        let mut inline = [0; 16];
        inline[0..4].copy_from_slice(&len.to_le_bytes());
        inline[4..4 + bytes.len()].copy_from_slice(bytes);
        u128::from_le_bytes(inline)
    };

    self.views_builder.append(raw_view);
    self.null_buffer_builder.append_non_null();
}

/// Try to append a view of the given `block`, `offset` and `length`
///
/// See [`Self::append_block`]
Expand All @@ -139,22 +169,9 @@ impl<T: ByteViewType + ?Sized> GenericByteViewBuilder<T> {
));
}

if len <= 12 {
let mut view_buffer = [0; 16];
view_buffer[0..4].copy_from_slice(&len.to_le_bytes());
view_buffer[4..4 + b.len()].copy_from_slice(b);
self.views_builder.append(u128::from_le_bytes(view_buffer));
} else {
let view = ByteView {
length: len,
prefix: u32::from_le_bytes(b[0..4].try_into().unwrap()),
buffer_index: block,
offset,
};
self.views_builder.append(view.into());
unsafe {
self.append_view_unchecked(block, offset, len);
}

self.null_buffer_builder.append_non_null();
Ok(())
}

Expand Down
78 changes: 25 additions & 53 deletions arrow-cast/src/cast/dictionary.rs
Original file line number Diff line number Diff line change
Expand Up @@ -93,22 +93,10 @@ pub(crate) fn dictionary_cast<K: ArrowDictionaryKeyType>(
.downcast_dict::<StringArray>()
.unwrap();

let string_values = dict_array.values();
let value_offsets = string_values.value_offsets();
let value_buffer = string_values.values().clone();

let view_buffer =
view_from_dict_values(value_offsets, &value_buffer, dict_array.keys());

// Safety:
// the buffer is from StringArray which is utf8.
let string_view = unsafe {
StringViewArray::new_unchecked(
view_buffer,
vec![value_buffer],
dict_array.nulls().cloned(),
)
};
let string_view = view_from_dict_values::<K, StringViewType, GenericStringType<i32>>(
dict_array.values(),
dict_array.keys(),
);
Ok(Arc::new(string_view))
}
BinaryView => {
Expand All @@ -119,61 +107,45 @@ pub(crate) fn dictionary_cast<K: ArrowDictionaryKeyType>(
.downcast_dict::<BinaryArray>()
.unwrap();

let binary_values = dict_array.values();
let value_offsets = binary_values.value_offsets();
let value_buffer = binary_values.values().clone();

let view_buffer =
view_from_dict_values(value_offsets, &value_buffer, dict_array.keys());
let binary_view = unsafe {
BinaryViewArray::new_unchecked(
view_buffer,
vec![value_buffer],
dict_array.nulls().cloned(),
)
};
let binary_view = view_from_dict_values::<K, BinaryViewType, BinaryType>(
dict_array.values(),
dict_array.keys(),
);
Ok(Arc::new(binary_view))
}
_ => unpack_dictionary::<K>(array, to_type, cast_options),
}
}

fn view_from_dict_values<K: ArrowDictionaryKeyType>(
value_offsets: &[i32],
value_buffer: &arrow_buffer::Buffer,
fn view_from_dict_values<K: ArrowDictionaryKeyType, T: ByteViewType, V: ByteArrayType>(
array: &GenericByteArray<V>,
keys: &PrimitiveArray<K>,
) -> ScalarBuffer<u128> {
let mut view_builder = BufferBuilder::<u128>::new(keys.len());
) -> GenericByteViewArray<T> {
let value_buffer = array.values();
let value_offsets = array.value_offsets();
let mut builder = GenericByteViewBuilder::<T>::with_capacity(keys.len());
builder.append_block(value_buffer.clone());
for i in keys.iter() {
match i {
Some(v) => {
let idx = v.to_usize().unwrap();
let offset = value_offsets[idx];
let end = value_offsets[idx + 1];
let length = end - offset;
let value_buf = &value_buffer[offset as usize..end as usize];

if length <= 12 {
let mut view_buffer = [0; 16];
view_buffer[0..4].copy_from_slice(&length.to_le_bytes());
view_buffer[4..4 + value_buf.len()].copy_from_slice(value_buf);
view_builder.append(u128::from_le_bytes(view_buffer));
} else {
let view = ByteView {
length: length as u32,
prefix: u32::from_le_bytes(value_buf[0..4].try_into().unwrap()),
buffer_index: 0,
offset: offset as u32,
};
view_builder.append(view.into());
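// SAFETY:
// (1) `idx` is a dictionary key, so `idx` and `idx + 1` are expected to be valid
//     positions in `value_offsets` (assumes the keys were validated against the
//     dictionary values when the array was built; confirm).
// (2) Block 0 is the values buffer added with `append_block` above, and
//     `offset..offset + length` is read from that buffer's own offsets, which is
//     what `append_view_unchecked` requires.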
unsafe {
let offset = value_offsets.get_unchecked(idx).as_usize();
let end = value_offsets.get_unchecked(idx + 1).as_usize();
let length = end - offset;
builder.append_view_unchecked(0, offset as u32, length as u32)
}
}
None => {
view_builder.append_n_zeroed(1);
builder.append_null();
}
}
}
ScalarBuffer::new(view_builder.finish(), 0, keys.len())
builder.finish()
}

// Unpack a dictionary where the keys are of type <K> into a flattened array of type to_type
Expand Down
30 changes: 24 additions & 6 deletions arrow-cast/src/cast/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5255,18 +5255,36 @@ mod tests {

#[test]
fn test_dict_to_view() {
let string_view_array = StringViewArray::from_iter(VIEW_TEST_DATA);
let string_dict_array: DictionaryArray<Int8Type> = VIEW_TEST_DATA.into_iter().collect();
let values = StringArray::from_iter(VIEW_TEST_DATA);
let keys = Int8Array::from_iter([Some(1), Some(0), None, Some(3), None, Some(1), Some(4)]);
let string_dict_array =
DictionaryArray::<Int8Type>::try_new(keys, Arc::new(values)).unwrap();
let typed_dict = string_dict_array.downcast_dict::<StringArray>().unwrap();

let string_view_array = {
let mut builder = StringViewBuilder::new().with_block_size(8); // multiple buffers.
for v in typed_dict.into_iter() {
builder.append_option(v);
}
builder.finish()
};
let expected_string_array_type = string_view_array.data_type();
let casted_string_array = cast(&string_dict_array, expected_string_array_type).unwrap();
assert_eq!(casted_string_array.data_type(), expected_string_array_type);
assert_eq!(casted_string_array.as_ref(), &string_view_array);

let binary_view_array = BinaryViewArray::from_iter(VIEW_TEST_DATA);
let binary_dict_array = string_dict_array.downcast_dict::<StringArray>().unwrap();
let binary_buffer = cast(&binary_dict_array.values(), &DataType::Binary).unwrap();
let binary_buffer = cast(&typed_dict.values(), &DataType::Binary).unwrap();
let binary_dict_array =
DictionaryArray::<Int8Type>::new(binary_dict_array.keys().clone(), binary_buffer);
DictionaryArray::<Int8Type>::new(typed_dict.keys().clone(), binary_buffer);
let typed_binary_dict = binary_dict_array.downcast_dict::<BinaryArray>().unwrap();

let binary_view_array = {
let mut builder = BinaryViewBuilder::new().with_block_size(8); // multiple buffers.
for v in typed_binary_dict.into_iter() {
builder.append_option(v);
}
builder.finish()
};
let expected_binary_array_type = binary_view_array.data_type();
let casted_binary_array = cast(&binary_dict_array, expected_binary_array_type).unwrap();
assert_eq!(casted_binary_array.data_type(), expected_binary_array_type);
Expand Down