2121//! from IN list membership tests, handling null propagation correctly
2222//! according to SQL three-valued logic.
2323
24+ #![ expect( dead_code) ]
25+
2426use arrow:: array:: BooleanArray ;
2527use arrow:: buffer:: { BooleanBuffer , NullBuffer } ;
2628
@@ -48,6 +50,9 @@ use arrow::buffer::{BooleanBuffer, NullBuffer};
4850/// This version computes contains for ALL positions (including nulls), then applies
4951/// null masking via bitmap operations. This is optimal for cheap contains checks
5052/// (like DirectProbeFilter) where the branch overhead exceeds the check cost.
53+ ///
54+ /// For expensive contains checks (like ByteViewMaskedFilter with string comparison),
55+ /// use `build_in_list_result_with_null_shortcircuit` instead.
5156#[ inline]
5257pub ( crate ) fn build_in_list_result < C > (
5358 len : usize ,
@@ -66,6 +71,106 @@ where
6671 build_result_from_contains ( needle_nulls, haystack_has_nulls, negated, contains_buf)
6772}
6873
74+ /// Builds a BooleanArray result with null short-circuit (optimized for expensive contains).
75+ ///
76+ /// Unlike `build_in_list_result`, this version checks nulls INSIDE the loop and
77+ /// skips the contains check for null positions. This is optimal for expensive
78+ /// contains checks (like ByteViewMaskedFilter with hash lookup + string comparison) where
79+ /// skipping lookups outweighs the branch overhead.
80+ ///
81+ /// The shortcircuit is only applied when `needle_null_count > 0` - if there are
82+ /// no actual nulls, we avoid the branch overhead entirely.
83+ ///
84+ /// Use this for: ByteViewMaskedFilter, Utf8TwoStageFilter (string/binary types)
85+ /// Use `build_in_list_result` for: DirectProbeFilter, BranchlessFilter (primitive types)
86+ #[ inline]
87+ pub ( crate ) fn build_in_list_result_with_null_shortcircuit < C > (
88+ len : usize ,
89+ needle_nulls : Option < & NullBuffer > ,
90+ needle_null_count : usize ,
91+ haystack_has_nulls : bool ,
92+ negated : bool ,
93+ mut contains : C ,
94+ ) -> BooleanArray
95+ where
96+ C : FnMut ( usize ) -> bool ,
97+ {
98+ // When null_count=0, treat as no validity buffer to avoid extra work.
99+ // The validity buffer might exist but have all bits set to true.
100+ let effective_nulls = needle_nulls. filter ( |_| needle_null_count > 0 ) ;
101+
102+ match effective_nulls {
103+ Some ( nulls) => {
104+ // Has nulls: check validity inside loop to skip expensive contains()
105+ let contains_buf =
106+ BooleanBuffer :: collect_bool ( len, |i| nulls. is_valid ( i) && contains ( i) ) ;
107+ build_result_from_contains_premasked (
108+ Some ( nulls) ,
109+ haystack_has_nulls,
110+ negated,
111+ contains_buf,
112+ )
113+ }
114+ None => {
115+ // No nulls: compute contains for all positions without branch overhead
116+ let contains_buf = BooleanBuffer :: collect_bool ( len, contains) ;
117+ // Use premasked path since contains_buf is "trivially premasked" (no nulls to mask)
118+ build_result_from_contains_premasked (
119+ None ,
120+ haystack_has_nulls,
121+ negated,
122+ contains_buf,
123+ )
124+ }
125+ }
126+ }
127+
128+ /// Builds result from a contains buffer that was pre-masked at null positions.
129+ ///
130+ /// This is used by `build_in_list_result_with_null_shortcircuit` where the
131+ /// contains buffer already has `false` at null positions due to the short-circuit.
132+ ///
133+ /// Since contains_buf is pre-masked (false at null positions), we can simplify:
134+ /// - `valid & contains_buf` = `contains_buf` (already 0 where valid is 0)
135+ /// - XOR can replace AND+NOT for the negated case: `valid ^ contains = valid & !contains`
136+ #[ inline]
137+ fn build_result_from_contains_premasked (
138+ needle_nulls : Option < & NullBuffer > ,
139+ haystack_has_nulls : bool ,
140+ negated : bool ,
141+ contains_buf : BooleanBuffer ,
142+ ) -> BooleanArray {
143+ match ( needle_nulls, haystack_has_nulls, negated) {
144+ // Haystack has nulls: result is null unless value is found
145+ ( _, true , false ) => {
146+ // contains_buf is already masked (false at null positions)
147+ BooleanArray :: new ( contains_buf. clone ( ) , Some ( NullBuffer :: new ( contains_buf) ) )
148+ }
149+ ( Some ( v) , true , true ) => {
150+ // NOT IN with nulls: true if valid and not found, null if found or needle null
151+ // XOR: valid ^ contains = 1 iff valid=1 and contains=0 (not found)
152+ BooleanArray :: new (
153+ v. inner ( ) ^ & contains_buf,
154+ Some ( NullBuffer :: new ( contains_buf) ) ,
155+ )
156+ }
157+ ( None , true , true ) => {
158+ BooleanArray :: new ( !& contains_buf, Some ( NullBuffer :: new ( contains_buf) ) )
159+ }
160+ // Haystack has no nulls: result validity follows needle validity
161+ ( Some ( v) , false , false ) => {
162+ // contains_buf is already masked, just use needle validity for nulls
163+ BooleanArray :: new ( contains_buf, Some ( v. clone ( ) ) )
164+ }
165+ ( Some ( v) , false , true ) => {
166+ // Need AND because !contains_buf is 1 at null positions
167+ BooleanArray :: new ( v. inner ( ) & & ( !& contains_buf) , Some ( v. clone ( ) ) )
168+ }
169+ ( None , false , false ) => BooleanArray :: new ( contains_buf, None ) ,
170+ ( None , false , true ) => BooleanArray :: new ( !& contains_buf, None ) ,
171+ }
172+ }
173+
69174/// Builds a BooleanArray result from a pre-computed contains buffer.
70175///
71176/// This version does NOT assume contains_buf is pre-masked at null positions.
0 commit comments