Skip to content

Commit 85f2d9a

Browse files
Implement Legacy String Optimization (Utf8TwoStageFilter)
Port of the two-stage View optimization to the standard Utf8 and LargeUtf8 types. Encodes each string as an i128 (length + prefix) for fast O(1) pre-filtering before falling back to full string comparison. Selected for Utf8 and LargeUtf8 IN-lists when every list string is short (≤12 bytes).
1 parent 86836ee commit 85f2d9a

3 files changed

Lines changed: 315 additions & 11 deletions

File tree

datafusion/physical-expr/src/expressions/in_list.rs

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,13 @@
1717

1818
//! Implementation of `InList` expressions: [`InListExpr`]
1919
20+
mod nested_filter;
21+
mod primitive_filter;
22+
mod result;
23+
mod static_filter;
24+
mod strategy;
25+
mod transform;
26+
2027
use std::fmt::Debug;
2128
use std::hash::{Hash, Hasher};
2229
use std::sync::Arc;
@@ -30,19 +37,11 @@ use arrow::compute::SortOptions;
3037
use arrow::compute::kernels::boolean::{not, or_kleene};
3138
use arrow::compute::kernels::cmp::eq as arrow_eq;
3239
use arrow::datatypes::*;
33-
3440
use datafusion_common::{
3541
DFSchema, Result, ScalarValue, assert_or_internal_err, exec_err,
3642
};
3743
use datafusion_expr::{ColumnarValue, expr_vec_fmt};
3844

39-
mod nested_filter;
40-
mod primitive_filter;
41-
mod result;
42-
mod static_filter;
43-
mod strategy;
44-
mod transform;
45-
4645
use static_filter::StaticFilter;
4746
use strategy::instantiate_static_filter;
4847

datafusion/physical-expr/src/expressions/in_list/strategy.rs

Lines changed: 54 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,17 @@
1616
// under the License.
1717

1818
//! Filter selection strategy for InList expressions
19+
//!
20+
//! Selects the optimal lookup strategy based on data type and list size:
21+
//!
22+
//! - 1-byte types (Int8/UInt8): bitmap (32 bytes, O(1) bit test)
23+
//! - 2-byte types (Int16/UInt16): bitmap (8 KB, O(1) bit test)
24+
//! - 4-byte types (Int32/Float32): branchless (≤32) or hash (>32)
25+
//! - 8-byte types (Int64/Float64): branchless (≤16) or hash (>16)
26+
//! - 16-byte types (Decimal128): branchless (≤4) or hash (>4)
27+
//! - Utf8View (short strings): branchless (≤4) or hash (>4)
28+
//! - Byte arrays: two-stage (short Utf8/LargeUtf8), masked view (Utf8View/BinaryView), else NestedTypeFilter
29+
//! - Other types: NestedTypeFilter (fallback for List, Struct, Map, etc.)
1930
2031
use std::sync::Arc;
2132

@@ -29,13 +40,25 @@ use super::result::handle_dictionary;
2940
use super::static_filter::StaticFilter;
3041
use super::transform::{
3142
make_bitmap_filter, make_branchless_filter, make_byte_view_masked_filter,
32-
make_utf8view_branchless_filter, make_utf8view_hash_filter,
33-
reinterpret_any_primitive_to, utf8view_all_short_strings,
43+
make_utf8_two_stage_filter, make_utf8view_branchless_filter,
44+
make_utf8view_hash_filter, utf8_all_short_strings, utf8view_all_short_strings,
3445
};
3546

3647
// =============================================================================
3748
// LOOKUP STRATEGY THRESHOLDS (tuned via microbenchmarks)
3849
// =============================================================================
50+
//
51+
// Based on minimum batch time (8192 lookups per batch):
52+
// - Int8 (1 byte): BITMAP (32 bytes, always fastest)
53+
// - Int16 (2 bytes): BITMAP (8 KB, always fastest)
54+
// - Int32 (4 bytes): branchless up to 32, then hashset
55+
// - Int64 (8 bytes): branchless up to 16, then hashset
56+
// - Decimal128 (16 bytes): branchless up to 4, then hashset
57+
// - Byte arrays: two-stage (short Utf8/LargeUtf8), masked view (Utf8View/BinaryView), else NestedTypeFilter
58+
// - Other types: NestedTypeFilter (fallback for List, Struct, Map, etc.)
59+
//
60+
// NOTE: Binary search and linear scan were benchmarked but consistently
61+
// lost to the strategies above at all tested list sizes.
3962

4063
/// Maximum list size for branchless lookup on 4-byte primitives (Int32, UInt32, Float32).
4164
const BRANCHLESS_MAX_4B: usize = 32;
@@ -65,6 +88,10 @@ enum FilterStrategy {
6588
}
6689

6790
/// Determines the optimal lookup strategy based on data type and list size.
91+
///
92+
/// For 1-byte and 2-byte types, bitmap is always used (benchmarks show it's
93+
/// faster than both branchless and hashed at all list sizes).
94+
/// For larger types, cutoffs are tuned per byte-width.
6895
fn select_strategy(dt: &DataType, len: usize) -> FilterStrategy {
6996
match dt.primitive_width() {
7097
Some(1) => FilterStrategy::Bitmap1B,
@@ -99,6 +126,9 @@ fn select_strategy(dt: &DataType, len: usize) -> FilterStrategy {
99126
// =============================================================================
100127

101128
/// Creates the optimal static filter for the given array.
129+
///
130+
/// This is the main entry point for filter creation. It analyzes the array's
131+
/// data type and size to select the best lookup strategy.
102132
pub(crate) fn instantiate_static_filter(
103133
in_array: ArrayRef,
104134
) -> Result<Arc<dyn StaticFilter + Send + Sync>> {
@@ -136,15 +166,31 @@ pub(crate) fn instantiate_static_filter(
136166
exec_datafusion_err!("Hashed strategy selected but no filter for {:?}", dt)
137167
})?,
138168

169+
// Utf8/LargeUtf8: Two-stage filter when all IN-list strings are short (≤12 bytes).
170+
// Stage 1 encodes as i128 (length + first 12 bytes) for O(1) rejection.
171+
// When strings are long, the encoding can't definitively match and the
172+
// overhead regresses vs the generic fallback, so we skip it.
173+
(DataType::Utf8 | DataType::LargeUtf8, Generic)
174+
if utf8_all_short_strings(in_array.as_ref()) =>
175+
{
176+
make_utf8_two_stage_filter(in_array)
177+
}
178+
179+
// Binary variants: Use NestedTypeFilter (make_comparator)
180+
(DataType::Binary | DataType::LargeBinary, Generic) => {
181+
Ok(Arc::new(NestedTypeFilter::try_new(in_array)?))
182+
}
183+
139184
// Byte view filters (Utf8View, BinaryView)
185+
// Both use two-stage filter: masked view pre-check + full verification
140186
(DataType::Utf8View, Generic) => {
141187
make_byte_view_masked_filter::<StringViewType>(in_array)
142188
}
143189
(DataType::BinaryView, Generic) => {
144190
make_byte_view_masked_filter::<BinaryViewType>(in_array)
145191
}
146192

147-
// Fallback for nested/complex types and strings (Phase 4: Strings use fallback)
193+
// Fallback for nested/complex types (List, Struct, Map, Union, etc.)
148194
(_, Generic) => Ok(Arc::new(NestedTypeFilter::try_new(in_array)?)),
149195
}
150196
}
@@ -157,6 +203,7 @@ fn dispatch_branchless(
157203
arr: &ArrayRef,
158204
) -> Option<Result<Arc<dyn StaticFilter + Send + Sync>>> {
159205
// Dispatch to width-specific branchless filter.
206+
// Each width has its own max size: 4B→32, 8B→16, 16B→4
160207
match arr.data_type().primitive_width() {
161208
Some(4) => Some(make_branchless_filter::<UInt32Type>(arr, 4)),
162209
Some(8) => Some(make_branchless_filter::<UInt64Type>(arr, 8)),
@@ -192,6 +239,8 @@ fn dispatch_hashed(
192239
Some(16) => Some(make_direct_probe_filter_reinterpreted::<Decimal128Type>(
193240
arr,
194241
)),
242+
// Other widths (1, 2) use Bitmap strategy and never reach here.
243+
// Unknown widths fall through to Generic strategy.
195244
_ => None,
196245
}
197246
}
@@ -204,6 +253,8 @@ where
204253
D: ArrowPrimitiveType + 'static,
205254
D::Native: Send + Sync + DirectProbeHashable + 'static,
206255
{
256+
use super::transform::reinterpret_any_primitive_to;
257+
207258
// Fast path: already the right type
208259
if in_array.data_type() == &D::DATA_TYPE {
209260
return Ok(Arc::new(DirectProbeFilter::<D>::try_new(in_array)?));

0 commit comments

Comments
 (0)