Skip to content

Commit 4fb2195

Browse files
Optimize generic InList static filtering
Introduces NestedTypeFilter for non-primitive constant IN lists. Replaces the legacy ArrayStaticFilter fallback with a HashTable-backed lookup and shared bitmap result construction.
1 parent ca697dd commit 4fb2195

5 files changed

Lines changed: 225 additions & 86 deletions

File tree

datafusion/physical-expr/src/expressions/in_list.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ use datafusion_expr::{ColumnarValue, expr_vec_fmt};
3838

3939
mod nested_filter;
4040
mod primitive_filter;
41+
mod result;
4142
mod static_filter;
4243
mod strategy;
4344

datafusion/physical-expr/src/expressions/in_list/nested_filter.rs

Lines changed: 105 additions & 83 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
// specific language governing permissions and limitations
1616
// under the License.
1717

18-
//! Fallback filter for types without a specialized static filter.
18+
//! Fallback filter for nested/complex types (List, Struct, Map, Union, etc.)
1919
2020
use arrow::array::{
2121
Array, ArrayRef, BooleanArray, downcast_array, downcast_dictionary_array,
@@ -25,26 +25,119 @@ use arrow::buffer::{BooleanBuffer, NullBuffer};
2525
use arrow::compute::{SortOptions, take};
2626
use arrow::datatypes::DataType;
2727
use arrow::util::bit_iterator::BitIndexIterator;
28-
use datafusion_common::HashMap;
2928
use datafusion_common::Result;
30-
use datafusion_common::hash_utils::{RandomState, with_hashes};
31-
use hashbrown::hash_map::RawEntryMut;
29+
use datafusion_common::hash_utils::with_hashes;
3230

31+
use datafusion_common::hash_utils::RandomState;
32+
use hashbrown::HashTable;
33+
34+
use super::result::build_in_list_result;
3335
use super::static_filter::StaticFilter;
3436

35-
/// Static filter for InList that stores the array and hash set for O(1) lookups.
37+
/// Fallback filter for nested/complex types (List, Struct, Map, Union, etc.)
38+
///
39+
/// Uses dynamic comparator via `make_comparator` since these types don't have
40+
/// a simple typed comparison. For primitive and byte array types, use the
41+
/// specialized filters instead (PrimitiveFilter, ByteArrayFilter, etc.)
3642
#[derive(Debug, Clone)]
37-
pub(crate) struct ArrayStaticFilter {
43+
pub(crate) struct NestedTypeFilter {
3844
in_array: ArrayRef,
3945
state: RandomState,
40-
/// Used to provide a lookup from value to in list index
46+
/// Stores indices into `in_array` for O(1) lookups.
47+
table: HashTable<usize>,
48+
}
49+
50+
impl NestedTypeFilter {
51+
/// Creates a filter for nested/complex array types.
52+
///
53+
/// This filter uses dynamic comparison and should only be used for types
54+
/// that don't have specialized filters (List, Struct, Map, Union).
55+
pub(crate) fn try_new(in_array: ArrayRef) -> Result<Self> {
56+
// Null type has no natural order - return empty hash set
57+
if in_array.data_type() == &DataType::Null {
58+
return Ok(Self {
59+
in_array,
60+
state: RandomState::default(),
61+
table: HashTable::new(),
62+
});
63+
}
64+
65+
let state = RandomState::default();
66+
let table = Self::build_haystack_table(&in_array, &state)?;
67+
68+
Ok(Self {
69+
in_array,
70+
state,
71+
table,
72+
})
73+
}
74+
75+
/// Build a hash table from haystack values for O(1) lookups.
76+
///
77+
/// Each unique non-null value's index is stored, keyed by its hash.
78+
/// Uses dynamic comparison via `make_comparator` for complex types.
79+
fn build_haystack_table(
80+
haystack: &ArrayRef,
81+
state: &RandomState,
82+
) -> Result<HashTable<usize>> {
83+
let mut table = HashTable::new();
84+
85+
with_hashes([haystack.as_ref()], state, |hashes| -> Result<()> {
86+
let cmp = make_comparator(haystack, haystack, SortOptions::default())?;
87+
88+
let insert_value = |idx| {
89+
let hash = hashes[idx];
90+
// Only insert if not already present (deduplication)
91+
if table.find(hash, |&x| cmp(x, idx).is_eq()).is_none() {
92+
table.insert_unique(hash, idx, |&x| hashes[x]);
93+
}
94+
};
95+
96+
match haystack.nulls() {
97+
Some(nulls) => {
98+
BitIndexIterator::new(nulls.validity(), nulls.offset(), nulls.len())
99+
.for_each(insert_value)
100+
}
101+
None => (0..haystack.len()).for_each(insert_value),
102+
}
103+
104+
Ok(())
105+
})?;
106+
107+
Ok(table)
108+
}
109+
110+
/// Check which needle values exist in the haystack.
41111
///
42-
/// Note: usize::hash is not used, instead the raw entry
43-
/// API is used to store entries w.r.t their value
44-
map: HashMap<usize, (), ()>,
112+
/// Hashes each needle value and looks it up in the pre-built haystack table.
113+
/// Uses dynamic comparison via `make_comparator` for complex types.
114+
fn find_needles_in_haystack(
115+
&self,
116+
needles: &dyn Array,
117+
negated: bool,
118+
) -> Result<BooleanArray> {
119+
let needle_nulls = needles.logical_nulls();
120+
let haystack_has_nulls = self.in_array.null_count() != 0;
121+
122+
with_hashes([needles], &self.state, |needle_hashes| {
123+
let cmp = make_comparator(needles, &self.in_array, SortOptions::default())?;
124+
125+
Ok(build_in_list_result(
126+
needles.len(),
127+
needle_nulls.as_ref(),
128+
haystack_has_nulls,
129+
negated,
130+
#[inline(always)]
131+
|i| {
132+
let hash = needle_hashes[i];
133+
self.table.find(hash, |&idx| cmp(i, idx).is_eq()).is_some()
134+
},
135+
))
136+
})
137+
}
45138
}
46139

47-
impl StaticFilter for ArrayStaticFilter {
140+
impl StaticFilter for NestedTypeFilter {
48141
fn null_count(&self) -> usize {
49142
self.in_array.null_count()
50143
}
@@ -77,77 +170,6 @@ impl StaticFilter for ArrayStaticFilter {
77170
_ => {}
78171
}
79172

80-
let needle_nulls = v.logical_nulls();
81-
let needle_nulls = needle_nulls.as_ref();
82-
let haystack_has_nulls = self.in_array.null_count() != 0;
83-
84-
with_hashes([v], &self.state, |hashes| {
85-
let cmp = make_comparator(v, &self.in_array, SortOptions::default())?;
86-
Ok((0..v.len())
87-
.map(|i| {
88-
if needle_nulls.is_some_and(|nulls| nulls.is_null(i)) {
89-
return None;
90-
}
91-
92-
let hash = hashes[i];
93-
let contains = self
94-
.map
95-
.raw_entry()
96-
.from_hash(hash, |idx| cmp(i, *idx).is_eq())
97-
.is_some();
98-
99-
match contains {
100-
true => Some(!negated),
101-
false if haystack_has_nulls => None,
102-
false => Some(negated),
103-
}
104-
})
105-
.collect())
106-
})
107-
}
108-
}
109-
110-
impl ArrayStaticFilter {
111-
pub(crate) fn try_new(in_array: ArrayRef) -> Result<Self> {
112-
if in_array.data_type() == &DataType::Null {
113-
return Ok(Self {
114-
in_array,
115-
state: RandomState::default(),
116-
map: HashMap::with_hasher(()),
117-
});
118-
}
119-
120-
let state = RandomState::default();
121-
let mut map: HashMap<usize, (), ()> = HashMap::with_hasher(());
122-
123-
with_hashes([&in_array], &state, |hashes| -> Result<()> {
124-
let cmp = make_comparator(&in_array, &in_array, SortOptions::default())?;
125-
126-
let insert_value = |idx| {
127-
let hash = hashes[idx];
128-
if let RawEntryMut::Vacant(v) = map
129-
.raw_entry_mut()
130-
.from_hash(hash, |x| cmp(*x, idx).is_eq())
131-
{
132-
v.insert_with_hasher(hash, idx, (), |x| hashes[*x]);
133-
}
134-
};
135-
136-
match in_array.nulls() {
137-
Some(nulls) => {
138-
BitIndexIterator::new(nulls.validity(), nulls.offset(), nulls.len())
139-
.for_each(insert_value)
140-
}
141-
None => (0..in_array.len()).for_each(insert_value),
142-
}
143-
144-
Ok(())
145-
})?;
146-
147-
Ok(Self {
148-
in_array,
149-
state,
150-
map,
151-
})
173+
self.find_needles_in_haystack(v, negated)
152174
}
153175
}
Lines changed: 116 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,116 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
//! Result building helpers for InList operations
19+
//!
20+
//! This module provides unified logic for building BooleanArray results
21+
//! from IN list membership tests, handling null propagation correctly
22+
//! according to SQL three-valued logic.
23+
24+
use arrow::array::BooleanArray;
25+
use arrow::buffer::{BooleanBuffer, NullBuffer};
26+
27+
// =============================================================================
28+
// RESULT BUILDER FOR IN LIST OPERATIONS
29+
// =============================================================================
30+
//
31+
// Truth table for (needle_nulls, haystack_has_nulls, negated):
32+
// (Some, true, false) → values: valid & contains, nulls: valid & contains
33+
// (None, true, false) → values: contains, nulls: contains
34+
// (Some, true, true) → values: valid ^ (valid & contains), nulls: valid & contains
35+
// (None, true, true) → values: !contains, nulls: contains
36+
// (Some, false, false) → values: valid & contains, nulls: valid
37+
// (Some, false, true) → values: valid & !contains, nulls: valid
38+
// (None, false, false) → values: contains, nulls: none
39+
// (None, false, true) → values: !contains, nulls: none
40+
41+
/// Builds a BooleanArray result for IN list operations (optimized for cheap contains).
42+
///
43+
/// This function handles the complex null propagation logic for SQL IN lists:
44+
/// - If the needle value is null, the result is null
45+
/// - If the needle is not in the set AND the haystack has nulls, the result is null
46+
/// - Otherwise, the result is true/false based on membership and negation
47+
///
48+
/// This version computes contains for ALL positions (including nulls), then applies
49+
/// null masking via bitmap operations. This is optimal for cheap contains checks
50+
/// (like DirectProbeFilter) where the branch overhead exceeds the check cost.
51+
#[inline]
52+
pub(crate) fn build_in_list_result<C>(
53+
len: usize,
54+
needle_nulls: Option<&NullBuffer>,
55+
haystack_has_nulls: bool,
56+
negated: bool,
57+
contains: C,
58+
) -> BooleanArray
59+
where
60+
C: FnMut(usize) -> bool,
61+
{
62+
// Always compute the contains buffer without checking nulls in the loop.
63+
// The null check inside the loop hurts vectorization and branch prediction.
64+
// Nulls are handled by build_result_from_contains using bitmap operations.
65+
let contains_buf = BooleanBuffer::collect_bool(len, contains);
66+
build_result_from_contains(needle_nulls, haystack_has_nulls, negated, contains_buf)
67+
}
68+
69+
/// Builds a BooleanArray result from a pre-computed contains buffer.
70+
///
71+
/// This version does NOT assume contains_buf is pre-masked at null positions.
72+
/// It handles nulls using bitmap operations which are more vectorization-friendly.
73+
#[inline]
74+
pub(crate) fn build_result_from_contains(
75+
needle_nulls: Option<&NullBuffer>,
76+
haystack_has_nulls: bool,
77+
negated: bool,
78+
contains_buf: BooleanBuffer,
79+
) -> BooleanArray {
80+
match (needle_nulls, haystack_has_nulls, negated) {
81+
// Haystack has nulls: result is null unless value is found
82+
(Some(v), true, false) => {
83+
// values: valid & contains, nulls: valid & contains
84+
// Result is valid (not null) only when needle is valid AND found in haystack
85+
let values = v.inner() & &contains_buf;
86+
BooleanArray::new(values.clone(), Some(NullBuffer::new(values)))
87+
}
88+
(None, true, false) => {
89+
BooleanArray::new(contains_buf.clone(), Some(NullBuffer::new(contains_buf)))
90+
}
91+
(Some(v), true, true) => {
92+
// NOT IN with nulls: true if valid and not found, null if found or needle null
93+
// values: valid & !contains, nulls: valid & contains
94+
// Result is valid only when needle is valid AND found (because NOT IN with
95+
// haystack nulls returns NULL when value isn't definitively excluded)
96+
let valid = v.inner();
97+
let values = valid & &(!&contains_buf);
98+
let nulls = valid & &contains_buf;
99+
BooleanArray::new(values, Some(NullBuffer::new(nulls)))
100+
}
101+
(None, true, true) => {
102+
BooleanArray::new(!&contains_buf, Some(NullBuffer::new(contains_buf)))
103+
}
104+
// Haystack has no nulls: result validity follows needle validity
105+
(Some(v), false, false) => {
106+
// values: valid & contains (mask out nulls), nulls: valid
107+
BooleanArray::new(v.inner() & &contains_buf, Some(v.clone()))
108+
}
109+
(Some(v), false, true) => {
110+
// values: valid & !contains, nulls: valid
111+
BooleanArray::new(v.inner() & &(!&contains_buf), Some(v.clone()))
112+
}
113+
(None, false, false) => BooleanArray::new(contains_buf, None),
114+
(None, false, true) => BooleanArray::new(!&contains_buf, None),
115+
}
116+
}

datafusion/physical-expr/src/expressions/in_list/static_filter.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ use datafusion_common::Result;
3030
/// - [`super::primitive_filter::BranchlessFilter`]: Unrolled OR-chain for small lists
3131
/// - [`super::primitive_filter::DirectProbeFilter`]: O(1) hash lookups for larger primitive types
3232
/// - [`super::transform::Utf8TwoStageFilter`]: Two-stage filter for Utf8/LargeUtf8
33-
/// - [`super::nested_filter::ArrayStaticFilter`]: Dynamic comparator for unsupported types
33+
/// - [`super::nested_filter::NestedTypeFilter`]: Dynamic comparator for complex types
3434
pub(crate) trait StaticFilter {
3535
/// Returns the number of null values in the filter's haystack.
3636
fn null_count(&self) -> usize;

datafusion/physical-expr/src/expressions/in_list/strategy.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ use arrow::array::ArrayRef;
2323
use arrow::datatypes::DataType;
2424
use datafusion_common::Result;
2525

26-
use super::nested_filter::ArrayStaticFilter;
26+
use super::nested_filter::NestedTypeFilter;
2727
use super::primitive_filter::*;
2828
use super::static_filter::StaticFilter;
2929

@@ -45,6 +45,6 @@ pub(crate) fn instantiate_static_filter(
4545
DataType::UInt64 => Ok(Arc::new(UInt64StaticFilter::try_new(&in_array)?)),
4646
DataType::Float32 => Ok(Arc::new(Float32StaticFilter::try_new(&in_array)?)),
4747
DataType::Float64 => Ok(Arc::new(Float64StaticFilter::try_new(&in_array)?)),
48-
_ => Ok(Arc::new(ArrayStaticFilter::try_new(in_array)?)),
48+
_ => Ok(Arc::new(NestedTypeFilter::try_new(in_array)?)),
4949
}
5050
}

0 commit comments

Comments
 (0)