1616// under the License.
1717
1818//! Filter selection strategy for InList expressions
19+ //!
20+ //! Selects the optimal lookup strategy based on data type and list size:
21+ //!
22+ //! - 1-byte types (Int8/UInt8): bitmap (32 bytes, O(1) bit test)
23+ //! - 2-byte types (Int16/UInt16): bitmap (8 KB, O(1) bit test)
24+ //! - 4-byte types (Int32/Float32): branchless (≤32) or hash (>32)
25+ //! - 8-byte types (Int64/Float64): branchless (≤16) or hash (>16)
26+ //! - 16-byte types (Decimal128): branchless (≤4) or hash (>4)
27+ //! - Utf8View (short strings): branchless (≤4) or hash (>4)
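28+ //! - Byte arrays: Utf8/LargeUtf8 use a two-stage filter when all list strings are ≤12 bytes (else NestedTypeFilter); Utf8View/BinaryView use a masked byte-view filter; Binary/LargeBinary use NestedTypeFilter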
29+ //! - Other types: NestedTypeFilter (fallback for List, Struct, Map, etc.)
1930
2031use std:: sync:: Arc ;
2132
@@ -29,13 +40,25 @@ use super::result::handle_dictionary;
2940use super :: static_filter:: StaticFilter ;
3041use super :: transform:: {
3142 make_bitmap_filter, make_branchless_filter, make_byte_view_masked_filter,
32- make_utf8view_branchless_filter , make_utf8view_hash_filter ,
33- reinterpret_any_primitive_to , utf8view_all_short_strings,
43+ make_utf8_two_stage_filter , make_utf8view_branchless_filter ,
44+ make_utf8view_hash_filter , utf8_all_short_strings , utf8view_all_short_strings,
3445} ;
3546
3647// =============================================================================
3748// LOOKUP STRATEGY THRESHOLDS (tuned via microbenchmarks)
3849// =============================================================================
50+ //
51+ // Based on minimum batch time (8192 lookups per batch):
52+ // - Int8 (1 byte): BITMAP (32 bytes, always fastest)
53+ // - Int16 (2 bytes): BITMAP (8 KB, always fastest)
54+ // - Int32 (4 bytes): branchless up to 32, then hashset
55+ // - Int64 (8 bytes): branchless up to 16, then hashset
56+ // - Decimal128 (16 bytes): branchless up to 4, then hashset
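57+ // - Byte arrays: two-stage filter (short Utf8/LargeUtf8), masked byte-view filter (Utf8View/BinaryView), NestedTypeFilter (Binary/LargeBinary, long Utf8/LargeUtf8)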
58+ // - Other types: NestedTypeFilter (fallback for List, Struct, Map, etc.)
59+ //
60+ // NOTE: Binary search and linear scan were benchmarked but consistently
61+ // lost to the strategies above at all tested list sizes.
3962
4063/// Maximum list size for branchless lookup on 4-byte primitives (Int32, UInt32, Float32).
4164const BRANCHLESS_MAX_4B : usize = 32 ;
@@ -65,6 +88,10 @@ enum FilterStrategy {
6588}
6689
6790/// Determines the optimal lookup strategy based on data type and list size.
91+ ///
92+ /// For 1-byte and 2-byte types, bitmap is always used (benchmarks show it's
93+ /// faster than both branchless and hashed at all list sizes).
94+ /// For larger types, cutoffs are tuned per byte-width.
6895fn select_strategy ( dt : & DataType , len : usize ) -> FilterStrategy {
6996 match dt. primitive_width ( ) {
7097 Some ( 1 ) => FilterStrategy :: Bitmap1B ,
@@ -99,6 +126,9 @@ fn select_strategy(dt: &DataType, len: usize) -> FilterStrategy {
99126// =============================================================================
100127
101128/// Creates the optimal static filter for the given array.
129+ ///
130+ /// This is the main entry point for filter creation. It analyzes the array's
131+ /// data type and size to select the best lookup strategy.
102132pub ( crate ) fn instantiate_static_filter (
103133 in_array : ArrayRef ,
104134) -> Result < Arc < dyn StaticFilter + Send + Sync > > {
@@ -136,15 +166,31 @@ pub(crate) fn instantiate_static_filter(
136166 exec_datafusion_err ! ( "Hashed strategy selected but no filter for {:?}" , dt)
137167 } ) ?,
138168
169+ // Utf8/LargeUtf8: Two-stage filter when all IN-list strings are short (≤12 bytes).
170+ // Stage 1 encodes as i128 (length + first 12 bytes) for O(1) rejection.
171+ // When strings are long, the encoding can't definitively match and the
172+ // overhead regresses vs the generic fallback, so we skip it.
173+ ( DataType :: Utf8 | DataType :: LargeUtf8 , Generic )
174+ if utf8_all_short_strings ( in_array. as_ref ( ) ) =>
175+ {
176+ make_utf8_two_stage_filter ( in_array)
177+ }
178+
179+ // Binary variants: Use NestedTypeFilter (make_comparator)
180+ ( DataType :: Binary | DataType :: LargeBinary , Generic ) => {
181+ Ok ( Arc :: new ( NestedTypeFilter :: try_new ( in_array) ?) )
182+ }
183+
139184 // Byte view filters (Utf8View, BinaryView)
185+ // Both use two-stage filter: masked view pre-check + full verification
140186 ( DataType :: Utf8View , Generic ) => {
141187 make_byte_view_masked_filter :: < StringViewType > ( in_array)
142188 }
143189 ( DataType :: BinaryView , Generic ) => {
144190 make_byte_view_masked_filter :: < BinaryViewType > ( in_array)
145191 }
146192
147- // Fallback for nested/complex types and strings (Phase 4: Strings use fallback )
193+ // Fallback for nested/complex types (List, Struct, Map, Union, etc.) and for Utf8/LargeUtf8 lists that contain long strings
148194 ( _, Generic ) => Ok ( Arc :: new ( NestedTypeFilter :: try_new ( in_array) ?) ) ,
149195 }
150196}
@@ -157,6 +203,7 @@ fn dispatch_branchless(
157203 arr : & ArrayRef ,
158204) -> Option < Result < Arc < dyn StaticFilter + Send + Sync > > > {
159205 // Dispatch to width-specific branchless filter.
206+ // Each width has its own max size: 4B→32, 8B→16, 16B→4
160207 match arr. data_type ( ) . primitive_width ( ) {
161208 Some ( 4 ) => Some ( make_branchless_filter :: < UInt32Type > ( arr, 4 ) ) ,
162209 Some ( 8 ) => Some ( make_branchless_filter :: < UInt64Type > ( arr, 8 ) ) ,
@@ -192,6 +239,8 @@ fn dispatch_hashed(
192239 Some ( 16 ) => Some ( make_direct_probe_filter_reinterpreted :: < Decimal128Type > (
193240 arr,
194241 ) ) ,
242+ // Other widths (1, 2) use Bitmap strategy and never reach here.
243+ // Unknown widths fall through to Generic strategy.
195244 _ => None ,
196245 }
197246}
@@ -204,6 +253,8 @@ where
204253 D : ArrowPrimitiveType + ' static ,
205254 D :: Native : Send + Sync + DirectProbeHashable + ' static ,
206255{
256+ use super :: transform:: reinterpret_any_primitive_to;
257+
207258 // Fast path: already the right type
208259 if in_array. data_type ( ) == & D :: DATA_TYPE {
209260 return Ok ( Arc :: new ( DirectProbeFilter :: < D > :: try_new ( in_array) ?) ) ;
0 commit comments