Skip to content

Commit 909608a

Browse files
authored
fix: Fix bug in array_has scalar path with sliced arrays (#20677)
## Which issue does this PR close? N/A ## Rationale for this change In #20374, `array_has` with a scalar needle was optimized to reconstruct matches more efficiently. Unfortunately, that code was incorrect for sliced arrays: `values()` returns the entire value buffer (including elements outside the visible slice), so we need to skip the corresponding indexes in the result bitmap. We could fix this by just skipping indexes, but it seems more robust and efficient to arrange to not compare the needle against elements outside the visible range in the first place. `array_position` has a similar behavior (introduced in #20532): it didn't have the buggy behavior, but it still did extra work for sliced arrays by comparing against elements outside the visible range. Benchmarking the revised code, there is no performance regression for unsliced arrays. ## What changes are included in this PR? * Fix `array_has` bug for sliced arrays with scalar needle * Improve `array_has` and `array_position` to not compare against elements outside the visible range of a sliced array * Add unit test for `array_has` bug * Add unit test to increase confidence in `array_position` behavior for sliced arrays ## Are these changes tested? Yes. ## Are there any user-facing changes? No.
1 parent 10b5f22 commit 909608a

2 files changed

Lines changed: 140 additions & 15 deletions

File tree

datafusion/functions-nested/src/array_has.rs

Lines changed: 66 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -352,8 +352,6 @@ fn array_has_dispatch_for_scalar(
352352
haystack: ArrayWrapper<'_>,
353353
needle: &dyn Datum,
354354
) -> Result<ArrayRef> {
355-
let values = haystack.values();
356-
let is_nested = values.data_type().is_nested();
357355
// If first argument is empty list (second argument is non-null), return false
358356
// i.e. array_has([], non-null element) -> false
359357
if haystack.len() == 0 {
@@ -362,7 +360,17 @@ fn array_has_dispatch_for_scalar(
362360
None,
363361
)));
364362
}
365-
let eq_array = compare_with_eq(values, needle, is_nested)?;
363+
364+
// For sliced ListArrays, values() returns the full underlying array but
365+
// only elements between the first and last offset are visible.
366+
let offsets: Vec<usize> = haystack.offsets().collect();
367+
let first_offset = offsets[0];
368+
let visible_values = haystack
369+
.values()
370+
.slice(first_offset, offsets[offsets.len() - 1] - first_offset);
371+
372+
let is_nested = visible_values.data_type().is_nested();
373+
let eq_array = compare_with_eq(&visible_values, needle, is_nested)?;
366374

367375
// When a haystack element is null, `eq()` returns null (not false).
368376
// In Arrow, a null BooleanArray entry has validity=0 but an
@@ -382,10 +390,14 @@ fn array_has_dispatch_for_scalar(
382390
ArrayWrapper::LargeList(arr) => arr.nulls(),
383391
};
384392
let mut matches = eq_bits.set_indices().peekable();
385-
let mut values = BooleanBufferBuilder::new(haystack.len());
386-
values.append_n(haystack.len(), false);
393+
let mut result = BooleanBufferBuilder::new(haystack.len());
394+
result.append_n(haystack.len(), false);
395+
396+
// Match positions are relative to visible_values (0-based), so
397+
// subtract first_offset from each offset when comparing.
398+
for (i, window) in offsets.windows(2).enumerate() {
399+
let end = window[1] - first_offset;
387400

388-
for (i, (_start, end)) in haystack.offsets().tuple_windows().enumerate() {
389401
let has_match = matches.peek().is_some_and(|&p| p < end);
390402

391403
// Advance past all match positions in this row's range.
@@ -394,14 +406,14 @@ fn array_has_dispatch_for_scalar(
394406
}
395407

396408
if has_match && validity.is_none_or(|v| v.is_valid(i)) {
397-
values.set_bit(i, true);
409+
result.set_bit(i, true);
398410
}
399411
}
400412

401413
// A null haystack row always produces a null output, so we can
402414
// reuse the haystack's null buffer directly.
403415
Ok(Arc::new(BooleanArray::new(
404-
values.finish(),
416+
result.finish(),
405417
validity.cloned(),
406418
)))
407419
}
@@ -1066,6 +1078,52 @@ mod tests {
10661078
Ok(())
10671079
}
10681080

1081+
#[test]
1082+
fn test_array_has_sliced_list() -> Result<(), DataFusionError> {
1083+
// [[10, 20], [30, 40], [50, 60], [70, 80]] → slice(1,2) → [[30, 40], [50, 60]]
1084+
let list = ListArray::from_iter_primitive::<Int32Type, _, _>(vec![
1085+
Some(vec![Some(10), Some(20)]),
1086+
Some(vec![Some(30), Some(40)]),
1087+
Some(vec![Some(50), Some(60)]),
1088+
Some(vec![Some(70), Some(80)]),
1089+
]);
1090+
let sliced = list.slice(1, 2);
1091+
let haystack_field =
1092+
Arc::new(Field::new("haystack", sliced.data_type().clone(), true));
1093+
let needle_field = Arc::new(Field::new("needle", DataType::Int32, true));
1094+
let return_field = Arc::new(Field::new("return", DataType::Boolean, true));
1095+
1096+
// Search for elements that exist only in sliced-away rows:
1097+
// 10 is in the prefix row, 70 is in the suffix row.
1098+
let invoke = |needle: i32| -> Result<ArrayRef, DataFusionError> {
1099+
ArrayHas::new()
1100+
.invoke_with_args(ScalarFunctionArgs {
1101+
args: vec![
1102+
ColumnarValue::Array(Arc::new(sliced.clone())),
1103+
ColumnarValue::Scalar(ScalarValue::Int32(Some(needle))),
1104+
],
1105+
arg_fields: vec![
1106+
Arc::clone(&haystack_field),
1107+
Arc::clone(&needle_field),
1108+
],
1109+
number_rows: 2,
1110+
return_field: Arc::clone(&return_field),
1111+
config_options: Arc::new(ConfigOptions::default()),
1112+
})?
1113+
.into_array(2)
1114+
};
1115+
1116+
let output = invoke(10)?.as_boolean().clone();
1117+
assert!(!output.value(0));
1118+
assert!(!output.value(1));
1119+
1120+
let output = invoke(70)?.as_boolean().clone();
1121+
assert!(!output.value(0));
1122+
assert!(!output.value(1));
1123+
1124+
Ok(())
1125+
}
1126+
10691127
#[test]
10701128
fn test_array_has_list_null_haystack() -> Result<(), DataFusionError> {
10711129
let haystack_field = Arc::new(Field::new("haystack", DataType::Null, true));

datafusion/functions-nested/src/position.rs

Lines changed: 74 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -230,26 +230,36 @@ fn array_position_scalar<O: OffsetSizeTrait>(
230230
"array_position",
231231
&[list_array.values(), element_array],
232232
)?;
233-
let element_datum = Scalar::new(Arc::clone(element_array));
234-
235-
let offsets = list_array.offsets();
236-
let validity = list_array.nulls();
237233

238234
if list_array.len() == 0 {
239235
return Ok(Arc::new(UInt64Array::new_null(0)));
240236
}
241237

238+
let element_datum = Scalar::new(Arc::clone(element_array));
239+
let validity = list_array.nulls();
240+
241+
// Only compare the visible portion of the values buffer, which avoids
242+
// wasted work for sliced ListArrays.
243+
let offsets = list_array.offsets();
244+
let first_offset = offsets[0].as_usize();
245+
let last_offset = offsets[list_array.len()].as_usize();
246+
let visible_values = list_array
247+
.values()
248+
.slice(first_offset, last_offset - first_offset);
249+
242250
// `not_distinct` treats NULL=NULL as true, matching the semantics of
243251
// `array_position`
244-
let eq_array = arrow_ord::cmp::not_distinct(list_array.values(), &element_datum)?;
252+
let eq_array = arrow_ord::cmp::not_distinct(&visible_values, &element_datum)?;
245253
let eq_bits = eq_array.values();
246254

247255
let mut result: Vec<Option<u64>> = Vec::with_capacity(list_array.len());
248256
let mut matches = eq_bits.set_indices().peekable();
249257

258+
// Match positions are relative to visible_values (0-based), so
259+
// subtract first_offset from each offset when comparing.
250260
for i in 0..list_array.len() {
251-
let start = offsets[i].as_usize();
252-
let end = offsets[i + 1].as_usize();
261+
let start = offsets[i].as_usize() - first_offset;
262+
let end = offsets[i + 1].as_usize() - first_offset;
253263

254264
if validity.is_some_and(|v| v.is_null(i)) {
255265
// Null row -> null output; advance past matches in range
@@ -474,3 +484,60 @@ fn general_positions<OffsetSize: OffsetSizeTrait>(
474484
ListArray::from_iter_primitive::<UInt64Type, _, _>(data),
475485
))
476486
}
487+
488+
#[cfg(test)]
489+
mod tests {
490+
use super::*;
491+
use arrow::array::AsArray;
492+
use arrow::datatypes::Int32Type;
493+
use datafusion_common::config::ConfigOptions;
494+
use datafusion_expr::ScalarFunctionArgs;
495+
496+
#[test]
497+
fn test_array_position_sliced_list() -> Result<()> {
498+
// [[10, 20], [30, 40], [50, 60], [70, 80]] → slice(1,2) → [[30, 40], [50, 60]]
499+
let list = ListArray::from_iter_primitive::<Int32Type, _, _>(vec![
500+
Some(vec![Some(10), Some(20)]),
501+
Some(vec![Some(30), Some(40)]),
502+
Some(vec![Some(50), Some(60)]),
503+
Some(vec![Some(70), Some(80)]),
504+
]);
505+
let sliced = list.slice(1, 2);
506+
let haystack_field =
507+
Arc::new(Field::new("haystack", sliced.data_type().clone(), true));
508+
let needle_field = Arc::new(Field::new("needle", DataType::Int32, true));
509+
let return_field = Arc::new(Field::new("return", UInt64, true));
510+
511+
// Search for elements that exist only in sliced-away rows:
512+
// 10 is in the prefix row, 70 is in the suffix row.
513+
let invoke = |needle: i32| -> Result<ArrayRef> {
514+
ArrayPosition::new()
515+
.invoke_with_args(ScalarFunctionArgs {
516+
args: vec![
517+
ColumnarValue::Array(Arc::new(sliced.clone())),
518+
ColumnarValue::Scalar(ScalarValue::Int32(Some(needle))),
519+
],
520+
arg_fields: vec![
521+
Arc::clone(&haystack_field),
522+
Arc::clone(&needle_field),
523+
],
524+
number_rows: 2,
525+
return_field: Arc::clone(&return_field),
526+
config_options: Arc::new(ConfigOptions::default()),
527+
})?
528+
.into_array(2)
529+
};
530+
531+
let output = invoke(10)?;
532+
let output = output.as_primitive::<UInt64Type>();
533+
assert!(output.is_null(0));
534+
assert!(output.is_null(1));
535+
536+
let output = invoke(70)?;
537+
let output = output.as_primitive::<UInt64Type>();
538+
assert!(output.is_null(0));
539+
assert!(output.is_null(1));
540+
541+
Ok(())
542+
}
543+
}

0 commit comments

Comments
 (0)