Skip to content

Commit f7d6008

Browse files
Implement String View (Utf8View/BinaryView) Optimizations
Introduces a two-stage filter for ByteView types. Stage 1 uses a fast DirectProbeFilter on masked views (len + prefix) for quick rejection; Stage 2 performs full verification only for potential long-string matches. Triggers for Utf8View and BinaryView.
1 parent 2f3c5dc commit f7d6008

4 files changed

Lines changed: 344 additions & 7 deletions

File tree

datafusion/physical-expr/src/expressions/in_list/primitive_filter.rs

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -370,6 +370,16 @@ where
370370
))
371371
}
372372

373+
/// Creates a DirectProbeFilter from an iterator of values.
374+
///
375+
/// This is useful when building the filter from pre-processed values
376+
/// (e.g., masked views for Utf8View).
377+
pub(crate) fn from_values(values: impl Iterator<Item = T::Native>) -> Self {
378+
// Collect into HashSet for deduplication
379+
let unique_values: HashSet<_> = values.collect();
380+
Self::from_values_inner(unique_values.into_iter(), 0)
381+
}
382+
373383
/// Internal constructor from deduplicated values
374384
fn from_values_inner(
375385
unique_values: impl Iterator<Item = T::Native>,

datafusion/physical-expr/src/expressions/in_list/result.rs

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,8 +21,6 @@
2121
//! from IN list membership tests, handling null propagation correctly
2222
//! according to SQL three-valued logic.
2323
24-
#![expect(dead_code)]
25-
2624
use arrow::array::BooleanArray;
2725
use arrow::buffer::{BooleanBuffer, NullBuffer};
2826

datafusion/physical-expr/src/expressions/in_list/strategy.rs

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,9 @@ use super::primitive_filter::*;
2626
use super::result::handle_dictionary;
2727
use super::static_filter::StaticFilter;
2828
use super::transform::{
29-
make_bitmap_filter, make_branchless_filter, reinterpret_any_primitive_to,
29+
make_bitmap_filter, make_branchless_filter, make_byte_view_masked_filter,
30+
make_utf8view_branchless_filter, make_utf8view_hash_filter,
31+
reinterpret_any_primitive_to, utf8view_all_short_strings,
3032
};
3133

3234
// =============================================================================
@@ -102,6 +104,16 @@ pub(super) fn instantiate_static_filter(
102104

103105
let len = in_array.len();
104106
let dt = in_array.data_type();
107+
108+
// Special case: Utf8View with short strings can be reinterpreted as i128
109+
if matches!(dt, DataType::Utf8View) && utf8view_all_short_strings(in_array.as_ref()) {
110+
return if len <= BRANCHLESS_MAX_16B {
111+
make_utf8view_branchless_filter(&in_array)
112+
} else {
113+
make_utf8view_hash_filter(&in_array)
114+
};
115+
}
116+
105117
let strategy = select_strategy(dt, len);
106118

107119
match (dt, strategy) {
@@ -122,6 +134,14 @@ pub(super) fn instantiate_static_filter(
122134
exec_datafusion_err!("Hashed strategy selected but no filter for {:?}", dt)
123135
})?,
124136

137+
// Byte view filters (Utf8View, BinaryView)
138+
(DataType::Utf8View, Generic) => {
139+
make_byte_view_masked_filter::<StringViewType>(in_array)
140+
}
141+
(DataType::BinaryView, Generic) => {
142+
make_byte_view_masked_filter::<BinaryViewType>(in_array)
143+
}
144+
125145
// Fallback for nested/complex types and strings.
126146
(_, Generic) => Ok(Arc::new(ArrayStaticFilter::try_new(in_array)?)),
127147
}

0 commit comments

Comments
 (0)