Skip to content

Commit 985f0a4

Browse files
Deduplicate InList primitive static filters (#21932)
## Which issue does this PR close? - Part of #19241. - Addresses the duplication called out in #21649 (comment). ## Rationale for this change The existing primitive `IN LIST` static filters have separate integer and float macros. The float version exists only to wrap values in `OrderedFloat*` for hash/equality semantics, but the rest of the implementation is copied from the integer path, including the SQL three-valued-logic result construction. This PR removes that duplication independently from the larger and more controversial direct-probe optimization in #19390. ## What changes are included in this PR? - Generalizes the existing `primitive_static_filter!` macro so it can optionally use a different stored `HashSet` value type and conversion function. - Keeps the original two-argument `primitive_static_filter!(..., ...)` form for integer filters. - Keeps the original `float_static_filter!(..., ..., OrderedFloat*)` call sites, but makes `float_static_filter!` delegate to `primitive_static_filter!` instead of duplicating the full implementation. - Preserves the existing `HashSet` lookup strategy and `OrderedFloat*` bit-pattern handling for `Float32` / `Float64`. ## Are these changes tested? Yes. ## Are there any user-facing changes? No. This is an internal refactor only.
1 parent 2b95cde commit 985f0a4

1 file changed

Lines changed: 13 additions & 124 deletions

File tree

datafusion/physical-expr/src/expressions/in_list/primitive_filter.rs

Lines changed: 13 additions & 124 deletions
Original file line numberDiff line numberDiff line change
@@ -79,9 +79,17 @@ impl From<f64> for OrderedFloat64 {
7979
// Macro to generate specialized StaticFilter implementations for primitive types
8080
macro_rules! primitive_static_filter {
8181
($Name:ident, $ArrowType:ty) => {
82+
primitive_static_filter!(
83+
$Name,
84+
$ArrowType,
85+
<$ArrowType as ArrowPrimitiveType>::Native,
86+
|v| v
87+
);
88+
};
89+
($Name:ident, $ArrowType:ty, $SetValueType:ty, $to_set_value:expr) => {
8290
pub(super) struct $Name {
8391
null_count: usize,
84-
values: HashSet<<$ArrowType as ArrowPrimitiveType>::Native>,
92+
values: HashSet<$SetValueType>,
8593
}
8694

8795
impl $Name {
@@ -94,7 +102,7 @@ macro_rules! primitive_static_filter {
94102
let null_count = in_array.null_count();
95103

96104
for v in in_array.iter().flatten() {
97-
values.insert(v);
105+
values.insert(($to_set_value)(v));
98106
}
99107

100108
Ok(Self { null_count, values })
@@ -146,11 +154,11 @@ macro_rules! primitive_static_filter {
146154
// This ignores nulls - we handle them separately
147155
let contains_buffer = if negated {
148156
BooleanBuffer::collect_bool(needle_values.len(), |i| {
149-
!self.values.contains(&needle_values[i])
157+
!self.values.contains(&($to_set_value)(needle_values[i]))
150158
})
151159
} else {
152160
BooleanBuffer::collect_bool(needle_values.len(), |i| {
153-
self.values.contains(&needle_values[i])
161+
self.values.contains(&($to_set_value)(needle_values[i]))
154162
})
155163
};
156164

@@ -216,126 +224,7 @@ primitive_static_filter!(UInt64StaticFilter, UInt64Type);
216224
// Floats require a wrapper type (OrderedFloat*) to implement Hash/Eq due to NaN semantics
217225
macro_rules! float_static_filter {
218226
($Name:ident, $ArrowType:ty, $OrderedType:ty) => {
219-
pub(super) struct $Name {
220-
null_count: usize,
221-
values: HashSet<$OrderedType>,
222-
}
223-
224-
impl $Name {
225-
pub(super) fn try_new(in_array: &ArrayRef) -> Result<Self> {
226-
let in_array = in_array
227-
.as_primitive_opt::<$ArrowType>()
228-
.ok_or_else(|| exec_datafusion_err!("Failed to downcast an array to a '{}' array", stringify!($ArrowType)))?;
229-
230-
let mut values = HashSet::with_capacity(in_array.len());
231-
let null_count = in_array.null_count();
232-
233-
for v in in_array.iter().flatten() {
234-
values.insert(<$OrderedType>::from(v));
235-
}
236-
237-
Ok(Self { null_count, values })
238-
}
239-
}
240-
241-
impl StaticFilter for $Name {
242-
fn null_count(&self) -> usize {
243-
self.null_count
244-
}
245-
246-
fn contains(&self, v: &dyn Array, negated: bool) -> Result<BooleanArray> {
247-
// Handle dictionary arrays by recursing on the values
248-
downcast_dictionary_array! {
249-
v => {
250-
let values_contains = self.contains(v.values().as_ref(), negated)?;
251-
let result = take(&values_contains, v.keys(), None)?;
252-
return Ok(downcast_array(result.as_ref()))
253-
}
254-
_ => {}
255-
}
256-
257-
let v = v
258-
.as_primitive_opt::<$ArrowType>()
259-
.ok_or_else(|| exec_datafusion_err!("Failed to downcast an array to a '{}' array", stringify!($ArrowType)))?;
260-
261-
let haystack_has_nulls = self.null_count > 0;
262-
let needle_values = v.values();
263-
let needle_nulls = v.nulls();
264-
let needle_has_nulls = v.null_count() > 0;
265-
266-
// Truth table for `value [NOT] IN (set)` with SQL three-valued logic:
267-
// ("-" means the value doesn't affect the result)
268-
//
269-
// | needle_null | haystack_null | negated | in set? | result |
270-
// |-------------|---------------|---------|---------|--------|
271-
// | true | - | false | - | null |
272-
// | true | - | true | - | null |
273-
// | false | true | false | yes | true |
274-
// | false | true | false | no | null |
275-
// | false | true | true | yes | false |
276-
// | false | true | true | no | null |
277-
// | false | false | false | yes | true |
278-
// | false | false | false | no | false |
279-
// | false | false | true | yes | false |
280-
// | false | false | true | no | true |
281-
282-
// Compute the "contains" result using collect_bool (fast batched approach)
283-
// This ignores nulls - we handle them separately
284-
let contains_buffer = if negated {
285-
BooleanBuffer::collect_bool(needle_values.len(), |i| {
286-
!self.values.contains(&<$OrderedType>::from(needle_values[i]))
287-
})
288-
} else {
289-
BooleanBuffer::collect_bool(needle_values.len(), |i| {
290-
self.values.contains(&<$OrderedType>::from(needle_values[i]))
291-
})
292-
};
293-
294-
// Compute the null mask
295-
// Output is null when:
296-
// 1. needle value is null, OR
297-
// 2. needle value is not in set AND haystack has nulls
298-
let result_nulls = match (needle_has_nulls, haystack_has_nulls) {
299-
(false, false) => {
300-
// No nulls anywhere
301-
None
302-
}
303-
(true, false) => {
304-
// Only needle has nulls - just use needle's null mask
305-
needle_nulls.cloned()
306-
}
307-
(false, true) => {
308-
// Only haystack has nulls - result is null when value not in set
309-
// Valid (not null) when original "in set" is true
310-
// For NOT IN: contains_buffer = !original, so validity = !contains_buffer
311-
let validity = if negated {
312-
!&contains_buffer
313-
} else {
314-
contains_buffer.clone()
315-
};
316-
Some(NullBuffer::new(validity))
317-
}
318-
(true, true) => {
319-
// Both have nulls - combine needle nulls with haystack-induced nulls
320-
let needle_validity = needle_nulls.map(|n| n.inner().clone())
321-
.unwrap_or_else(|| BooleanBuffer::new_set(needle_values.len()));
322-
323-
// Valid when original "in set" is true (see above)
324-
let haystack_validity = if negated {
325-
!&contains_buffer
326-
} else {
327-
contains_buffer.clone()
328-
};
329-
330-
// Combined validity: valid only where both are valid
331-
let combined_validity = &needle_validity & &haystack_validity;
332-
Some(NullBuffer::new(combined_validity))
333-
}
334-
};
335-
336-
Ok(BooleanArray::new(contains_buffer, result_nulls))
337-
}
338-
}
227+
primitive_static_filter!($Name, $ArrowType, $OrderedType, <$OrderedType>::from);
339228
};
340229
}
341230

0 commit comments

Comments
 (0)