Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve performance of casting StringView/BinaryView to DictionaryArray #5872

Merged
merged 10 commits into from
Jun 13, 2024
Prev Previous commit
Next Next commit
unchecked view
  • Loading branch information
XiangpengHao committed Jun 11, 2024
commit c27d548fff9aa6e17782d3bbea4a450693778449
47 changes: 32 additions & 15 deletions arrow-array/src/builder/generic_bytes_view_builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,36 @@ impl<T: ByteViewType + ?Sized> GenericByteViewBuilder<T> {
offset as u32
}

/// Appends a non-null view of `len` bytes starting at `offset` within the
/// completed buffer `block`, without performing any bounds validation
///
/// Values of at most 12 bytes are copied inline into the view; longer values
/// store their first four bytes as a prefix together with `block` and `offset`.
///
/// # Safety
/// (1) `block` must have been added using [`Self::append_block`]
/// (2) The range `offset..offset+len` must be within the bounds of the block
/// (3) The bytes in that range must be valid data of type `T`
pub unsafe fn append_view_unchecked(&mut self, block: u32, offset: u32, len: u32) {
    let begin = offset as usize;
    let finish = begin.saturating_add(len as usize);
    // Caller contract (1) and (2): both lookups skip bounds checks.
    let bytes = self
        .completed
        .get_unchecked(block as usize)
        .get_unchecked(begin..finish);

    let raw_view: u128 = if len > 12 {
        // Long value: the view only references the data held in `block`.
        ByteView {
            length: len,
            prefix: u32::from_le_bytes(bytes[0..4].try_into().unwrap()),
            buffer_index: block,
            offset,
        }
        .into()
    } else {
        // Short value: length in the first 4 bytes, payload right after,
        // remaining bytes left zeroed.
        let mut inline = [0; 16];
        inline[0..4].copy_from_slice(&len.to_le_bytes());
        inline[4..4 + bytes.len()].copy_from_slice(bytes);
        u128::from_le_bytes(inline)
    };

    self.views_builder.append(raw_view);
    self.null_buffer_builder.append_non_null();
}

/// Try to append a view of the given `block`, `offset` and `length`
///
/// See [`Self::append_block`]
Expand All @@ -139,22 +169,9 @@ impl<T: ByteViewType + ?Sized> GenericByteViewBuilder<T> {
));
}

if len <= 12 {
let mut view_buffer = [0; 16];
view_buffer[0..4].copy_from_slice(&len.to_le_bytes());
view_buffer[4..4 + b.len()].copy_from_slice(b);
self.views_builder.append(u128::from_le_bytes(view_buffer));
} else {
let view = ByteView {
length: len,
prefix: u32::from_le_bytes(b[0..4].try_into().unwrap()),
buffer_index: block,
offset,
};
self.views_builder.append(view.into());
unsafe {
self.append_view_unchecked(block, offset, len);
}

self.null_buffer_builder.append_non_null();
Ok(())
}

Expand Down
16 changes: 10 additions & 6 deletions arrow-cast/src/cast/dictionary.rs
Original file line number Diff line number Diff line change
Expand Up @@ -129,12 +129,16 @@ fn view_from_dict_values<K: ArrowDictionaryKeyType, T: ByteViewType, V: ByteArra
match i {
Some(v) => {
let idx = v.to_usize().unwrap();
let offset = value_offsets[idx];
let end = value_offsets[idx + 1];
let length = end - offset;
builder
.try_append_view(0, offset.as_usize() as u32, length.as_usize() as u32)
.unwrap();

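// SAFETY:
// (1) `idx` and `idx + 1` must be valid positions in `value_offsets`;
//     presumably this holds because `idx` is a dictionary key and the offsets
//     hold one more entry than there are values (verify against the caller)
// (2) `append_view_unchecked` requires that block 0 was added to the builder and
//     that `offset..offset + length` lies within it (verify against the builder setup)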
unsafe {
let offset = value_offsets.get_unchecked(idx).as_usize();
let end = value_offsets.get_unchecked(idx + 1).as_usize();
let length = end - offset;
builder.append_view_unchecked(0, offset as u32, length as u32)
}
}
None => {
builder.append_null();
Expand Down
Loading