1515// specific language governing permissions and limitations
1616// under the License.
1717
18- //! Fallback filter for types without a specialized static filter.
18+ //! Fallback filter for nested/complex types (List, Struct, Map, Union, etc.)
1919
2020use arrow:: array:: {
2121 Array , ArrayRef , BooleanArray , downcast_array, downcast_dictionary_array,
@@ -25,26 +25,119 @@ use arrow::buffer::{BooleanBuffer, NullBuffer};
2525use arrow:: compute:: { SortOptions , take} ;
2626use arrow:: datatypes:: DataType ;
2727use arrow:: util:: bit_iterator:: BitIndexIterator ;
28- use datafusion_common:: HashMap ;
2928use datafusion_common:: Result ;
30- use datafusion_common:: hash_utils:: { RandomState , with_hashes} ;
31- use hashbrown:: hash_map:: RawEntryMut ;
29+ use datafusion_common:: hash_utils:: with_hashes;
3230
31+ use datafusion_common:: hash_utils:: RandomState ;
32+ use hashbrown:: HashTable ;
33+
34+ use super :: result:: build_in_list_result;
3335use super :: static_filter:: StaticFilter ;
3436
35- /// Static filter for InList that stores the array and hash set for O(1) lookups.
37+ /// Fallback filter for nested/complex types (List, Struct, Map, Union, etc.)
38+ ///
39+ /// Uses dynamic comparator via `make_comparator` since these types don't have
40+ /// a simple typed comparison. For primitive and byte array types, use the
41+ /// specialized filters instead (PrimitiveFilter, ByteArrayFilter, etc.)
3642#[ derive( Debug , Clone ) ]
37- pub ( crate ) struct ArrayStaticFilter {
43+ pub ( crate ) struct NestedTypeFilter {
3844 in_array : ArrayRef ,
3945 state : RandomState ,
40- /// Used to provide a lookup from value to in list index
46+ /// Stores indices into `in_array` for O(1) lookups.
47+ table : HashTable < usize > ,
48+ }
49+
50+ impl NestedTypeFilter {
51+ /// Creates a filter for nested/complex array types.
52+ ///
53+ /// This filter uses dynamic comparison and should only be used for types
54+ /// that don't have specialized filters (List, Struct, Map, Union).
55+ pub ( crate ) fn try_new ( in_array : ArrayRef ) -> Result < Self > {
56+ // Null type has no natural order - return empty hash set
57+ if in_array. data_type ( ) == & DataType :: Null {
58+ return Ok ( Self {
59+ in_array,
60+ state : RandomState :: default ( ) ,
61+ table : HashTable :: new ( ) ,
62+ } ) ;
63+ }
64+
65+ let state = RandomState :: default ( ) ;
66+ let table = Self :: build_haystack_table ( & in_array, & state) ?;
67+
68+ Ok ( Self {
69+ in_array,
70+ state,
71+ table,
72+ } )
73+ }
74+
75+ /// Build a hash table from haystack values for O(1) lookups.
76+ ///
77+ /// Each unique non-null value's index is stored, keyed by its hash.
78+ /// Uses dynamic comparison via `make_comparator` for complex types.
79+ fn build_haystack_table (
80+ haystack : & ArrayRef ,
81+ state : & RandomState ,
82+ ) -> Result < HashTable < usize > > {
83+ let mut table = HashTable :: new ( ) ;
84+
85+ with_hashes ( [ haystack. as_ref ( ) ] , state, |hashes| -> Result < ( ) > {
86+ let cmp = make_comparator ( haystack, haystack, SortOptions :: default ( ) ) ?;
87+
88+ let insert_value = |idx| {
89+ let hash = hashes[ idx] ;
90+ // Only insert if not already present (deduplication)
91+ if table. find ( hash, |& x| cmp ( x, idx) . is_eq ( ) ) . is_none ( ) {
92+ table. insert_unique ( hash, idx, |& x| hashes[ x] ) ;
93+ }
94+ } ;
95+
96+ match haystack. nulls ( ) {
97+ Some ( nulls) => {
98+ BitIndexIterator :: new ( nulls. validity ( ) , nulls. offset ( ) , nulls. len ( ) )
99+ . for_each ( insert_value)
100+ }
101+ None => ( 0 ..haystack. len ( ) ) . for_each ( insert_value) ,
102+ }
103+
104+ Ok ( ( ) )
105+ } ) ?;
106+
107+ Ok ( table)
108+ }
109+
110+ /// Check which needle values exist in the haystack.
41111 ///
42- /// Note: usize::hash is not used, instead the raw entry
43- /// API is used to store entries w.r.t their value
44- map : HashMap < usize , ( ) , ( ) > ,
112+ /// Hashes each needle value and looks it up in the pre-built haystack table.
113+ /// Uses dynamic comparison via `make_comparator` for complex types.
114+ fn find_needles_in_haystack (
115+ & self ,
116+ needles : & dyn Array ,
117+ negated : bool ,
118+ ) -> Result < BooleanArray > {
119+ let needle_nulls = needles. logical_nulls ( ) ;
120+ let haystack_has_nulls = self . in_array . null_count ( ) != 0 ;
121+
122+ with_hashes ( [ needles] , & self . state , |needle_hashes| {
123+ let cmp = make_comparator ( needles, & self . in_array , SortOptions :: default ( ) ) ?;
124+
125+ Ok ( build_in_list_result (
126+ needles. len ( ) ,
127+ needle_nulls. as_ref ( ) ,
128+ haystack_has_nulls,
129+ negated,
130+ #[ inline( always) ]
131+ |i| {
132+ let hash = needle_hashes[ i] ;
133+ self . table . find ( hash, |& idx| cmp ( i, idx) . is_eq ( ) ) . is_some ( )
134+ } ,
135+ ) )
136+ } )
137+ }
45138}
46139
47- impl StaticFilter for ArrayStaticFilter {
140+ impl StaticFilter for NestedTypeFilter {
48141 fn null_count ( & self ) -> usize {
49142 self . in_array . null_count ( )
50143 }
@@ -77,77 +170,6 @@ impl StaticFilter for ArrayStaticFilter {
77170 _ => { }
78171 }
79172
80- let needle_nulls = v. logical_nulls ( ) ;
81- let needle_nulls = needle_nulls. as_ref ( ) ;
82- let haystack_has_nulls = self . in_array . null_count ( ) != 0 ;
83-
84- with_hashes ( [ v] , & self . state , |hashes| {
85- let cmp = make_comparator ( v, & self . in_array , SortOptions :: default ( ) ) ?;
86- Ok ( ( 0 ..v. len ( ) )
87- . map ( |i| {
88- if needle_nulls. is_some_and ( |nulls| nulls. is_null ( i) ) {
89- return None ;
90- }
91-
92- let hash = hashes[ i] ;
93- let contains = self
94- . map
95- . raw_entry ( )
96- . from_hash ( hash, |idx| cmp ( i, * idx) . is_eq ( ) )
97- . is_some ( ) ;
98-
99- match contains {
100- true => Some ( !negated) ,
101- false if haystack_has_nulls => None ,
102- false => Some ( negated) ,
103- }
104- } )
105- . collect ( ) )
106- } )
107- }
108- }
109-
110- impl ArrayStaticFilter {
111- pub ( crate ) fn try_new ( in_array : ArrayRef ) -> Result < Self > {
112- if in_array. data_type ( ) == & DataType :: Null {
113- return Ok ( Self {
114- in_array,
115- state : RandomState :: default ( ) ,
116- map : HashMap :: with_hasher ( ( ) ) ,
117- } ) ;
118- }
119-
120- let state = RandomState :: default ( ) ;
121- let mut map: HashMap < usize , ( ) , ( ) > = HashMap :: with_hasher ( ( ) ) ;
122-
123- with_hashes ( [ & in_array] , & state, |hashes| -> Result < ( ) > {
124- let cmp = make_comparator ( & in_array, & in_array, SortOptions :: default ( ) ) ?;
125-
126- let insert_value = |idx| {
127- let hash = hashes[ idx] ;
128- if let RawEntryMut :: Vacant ( v) = map
129- . raw_entry_mut ( )
130- . from_hash ( hash, |x| cmp ( * x, idx) . is_eq ( ) )
131- {
132- v. insert_with_hasher ( hash, idx, ( ) , |x| hashes[ * x] ) ;
133- }
134- } ;
135-
136- match in_array. nulls ( ) {
137- Some ( nulls) => {
138- BitIndexIterator :: new ( nulls. validity ( ) , nulls. offset ( ) , nulls. len ( ) )
139- . for_each ( insert_value)
140- }
141- None => ( 0 ..in_array. len ( ) ) . for_each ( insert_value) ,
142- }
143-
144- Ok ( ( ) )
145- } ) ?;
146-
147- Ok ( Self {
148- in_array,
149- state,
150- map,
151- } )
173+ self . find_needles_in_haystack ( v, negated)
152174 }
153175}
0 commit comments