Skip to content

Commit 5185602

Browse files
authored
[53] fix: Fix bug in array_has scalar path with sliced arrays (#20677) (#20700)
Backport #20677 to the 53 release branch.
1 parent c466f82 commit 5185602

2 files changed

Lines changed: 140 additions & 15 deletions

File tree

datafusion/functions-nested/src/array_has.rs

Lines changed: 66 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -352,8 +352,6 @@ fn array_has_dispatch_for_scalar(
352352
haystack: ArrayWrapper<'_>,
353353
needle: &dyn Datum,
354354
) -> Result<ArrayRef> {
355-
let values = haystack.values();
356-
let is_nested = values.data_type().is_nested();
357355
// If first argument is empty list (second argument is non-null), return false
358356
// i.e. array_has([], non-null element) -> false
359357
if haystack.len() == 0 {
@@ -362,7 +360,17 @@ fn array_has_dispatch_for_scalar(
362360
None,
363361
)));
364362
}
365-
let eq_array = compare_with_eq(values, needle, is_nested)?;
363+
364+
// For sliced ListArrays, values() returns the full underlying array but
365+
// only elements between the first and last offset are visible.
366+
let offsets: Vec<usize> = haystack.offsets().collect();
367+
let first_offset = offsets[0];
368+
let visible_values = haystack
369+
.values()
370+
.slice(first_offset, offsets[offsets.len() - 1] - first_offset);
371+
372+
let is_nested = visible_values.data_type().is_nested();
373+
let eq_array = compare_with_eq(&visible_values, needle, is_nested)?;
366374

367375
// When a haystack element is null, `eq()` returns null (not false).
368376
// In Arrow, a null BooleanArray entry has validity=0 but an
@@ -382,10 +390,14 @@ fn array_has_dispatch_for_scalar(
382390
ArrayWrapper::LargeList(arr) => arr.nulls(),
383391
};
384392
let mut matches = eq_bits.set_indices().peekable();
385-
let mut values = BooleanBufferBuilder::new(haystack.len());
386-
values.append_n(haystack.len(), false);
393+
let mut result = BooleanBufferBuilder::new(haystack.len());
394+
result.append_n(haystack.len(), false);
395+
396+
// Match positions are relative to visible_values (0-based), so
397+
// subtract first_offset from each offset when comparing.
398+
for (i, window) in offsets.windows(2).enumerate() {
399+
let end = window[1] - first_offset;
387400

388-
for (i, (_start, end)) in haystack.offsets().tuple_windows().enumerate() {
389401
let has_match = matches.peek().is_some_and(|&p| p < end);
390402

391403
// Advance past all match positions in this row's range.
@@ -394,14 +406,14 @@ fn array_has_dispatch_for_scalar(
394406
}
395407

396408
if has_match && validity.is_none_or(|v| v.is_valid(i)) {
397-
values.set_bit(i, true);
409+
result.set_bit(i, true);
398410
}
399411
}
400412

401413
// A null haystack row always produces a null output, so we can
402414
// reuse the haystack's null buffer directly.
403415
Ok(Arc::new(BooleanArray::new(
404-
values.finish(),
416+
result.finish(),
405417
validity.cloned(),
406418
)))
407419
}
@@ -1066,6 +1078,52 @@ mod tests {
10661078
Ok(())
10671079
}
10681080

1081+
#[test]
1082+
fn test_array_has_sliced_list() -> Result<(), DataFusionError> {
1083+
// [[10, 20], [30, 40], [50, 60], [70, 80]] → slice(1,2) → [[30, 40], [50, 60]]
1084+
let list = ListArray::from_iter_primitive::<Int32Type, _, _>(vec![
1085+
Some(vec![Some(10), Some(20)]),
1086+
Some(vec![Some(30), Some(40)]),
1087+
Some(vec![Some(50), Some(60)]),
1088+
Some(vec![Some(70), Some(80)]),
1089+
]);
1090+
let sliced = list.slice(1, 2);
1091+
let haystack_field =
1092+
Arc::new(Field::new("haystack", sliced.data_type().clone(), true));
1093+
let needle_field = Arc::new(Field::new("needle", DataType::Int32, true));
1094+
let return_field = Arc::new(Field::new("return", DataType::Boolean, true));
1095+
1096+
// Search for elements that exist only in sliced-away rows:
1097+
// 10 is in the prefix row, 70 is in the suffix row.
1098+
let invoke = |needle: i32| -> Result<ArrayRef, DataFusionError> {
1099+
ArrayHas::new()
1100+
.invoke_with_args(ScalarFunctionArgs {
1101+
args: vec![
1102+
ColumnarValue::Array(Arc::new(sliced.clone())),
1103+
ColumnarValue::Scalar(ScalarValue::Int32(Some(needle))),
1104+
],
1105+
arg_fields: vec![
1106+
Arc::clone(&haystack_field),
1107+
Arc::clone(&needle_field),
1108+
],
1109+
number_rows: 2,
1110+
return_field: Arc::clone(&return_field),
1111+
config_options: Arc::new(ConfigOptions::default()),
1112+
})?
1113+
.into_array(2)
1114+
};
1115+
1116+
let output = invoke(10)?.as_boolean().clone();
1117+
assert!(!output.value(0));
1118+
assert!(!output.value(1));
1119+
1120+
let output = invoke(70)?.as_boolean().clone();
1121+
assert!(!output.value(0));
1122+
assert!(!output.value(1));
1123+
1124+
Ok(())
1125+
}
1126+
10691127
#[test]
10701128
fn test_array_has_list_null_haystack() -> Result<(), DataFusionError> {
10711129
let haystack_field = Arc::new(Field::new("haystack", DataType::Null, true));

datafusion/functions-nested/src/position.rs

Lines changed: 74 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -230,26 +230,36 @@ fn array_position_scalar<O: OffsetSizeTrait>(
230230
"array_position",
231231
&[list_array.values(), element_array],
232232
)?;
233-
let element_datum = Scalar::new(Arc::clone(element_array));
234-
235-
let offsets = list_array.offsets();
236-
let validity = list_array.nulls();
237233

238234
if list_array.len() == 0 {
239235
return Ok(Arc::new(UInt64Array::new_null(0)));
240236
}
241237

238+
let element_datum = Scalar::new(Arc::clone(element_array));
239+
let validity = list_array.nulls();
240+
241+
// Only compare the visible portion of the values buffer, which avoids
242+
// wasted work for sliced ListArrays.
243+
let offsets = list_array.offsets();
244+
let first_offset = offsets[0].as_usize();
245+
let last_offset = offsets[list_array.len()].as_usize();
246+
let visible_values = list_array
247+
.values()
248+
.slice(first_offset, last_offset - first_offset);
249+
242250
// `not_distinct` treats NULL=NULL as true, matching the semantics of
243251
// `array_position`
244-
let eq_array = arrow_ord::cmp::not_distinct(list_array.values(), &element_datum)?;
252+
let eq_array = arrow_ord::cmp::not_distinct(&visible_values, &element_datum)?;
245253
let eq_bits = eq_array.values();
246254

247255
let mut result: Vec<Option<u64>> = Vec::with_capacity(list_array.len());
248256
let mut matches = eq_bits.set_indices().peekable();
249257

258+
// Match positions are relative to visible_values (0-based), so
259+
// subtract first_offset from each offset when comparing.
250260
for i in 0..list_array.len() {
251-
let start = offsets[i].as_usize();
252-
let end = offsets[i + 1].as_usize();
261+
let start = offsets[i].as_usize() - first_offset;
262+
let end = offsets[i + 1].as_usize() - first_offset;
253263

254264
if validity.is_some_and(|v| v.is_null(i)) {
255265
// Null row -> null output; advance past matches in range
@@ -474,3 +484,60 @@ fn general_positions<OffsetSize: OffsetSizeTrait>(
474484
ListArray::from_iter_primitive::<UInt64Type, _, _>(data),
475485
))
476486
}
487+
488+
#[cfg(test)]
489+
mod tests {
490+
use super::*;
491+
use arrow::array::AsArray;
492+
use arrow::datatypes::Int32Type;
493+
use datafusion_common::config::ConfigOptions;
494+
use datafusion_expr::ScalarFunctionArgs;
495+
496+
#[test]
497+
fn test_array_position_sliced_list() -> Result<()> {
498+
// [[10, 20], [30, 40], [50, 60], [70, 80]] → slice(1,2) → [[30, 40], [50, 60]]
499+
let list = ListArray::from_iter_primitive::<Int32Type, _, _>(vec![
500+
Some(vec![Some(10), Some(20)]),
501+
Some(vec![Some(30), Some(40)]),
502+
Some(vec![Some(50), Some(60)]),
503+
Some(vec![Some(70), Some(80)]),
504+
]);
505+
let sliced = list.slice(1, 2);
506+
let haystack_field =
507+
Arc::new(Field::new("haystack", sliced.data_type().clone(), true));
508+
let needle_field = Arc::new(Field::new("needle", DataType::Int32, true));
509+
let return_field = Arc::new(Field::new("return", UInt64, true));
510+
511+
// Search for elements that exist only in sliced-away rows:
512+
// 10 is in the prefix row, 70 is in the suffix row.
513+
let invoke = |needle: i32| -> Result<ArrayRef> {
514+
ArrayPosition::new()
515+
.invoke_with_args(ScalarFunctionArgs {
516+
args: vec![
517+
ColumnarValue::Array(Arc::new(sliced.clone())),
518+
ColumnarValue::Scalar(ScalarValue::Int32(Some(needle))),
519+
],
520+
arg_fields: vec![
521+
Arc::clone(&haystack_field),
522+
Arc::clone(&needle_field),
523+
],
524+
number_rows: 2,
525+
return_field: Arc::clone(&return_field),
526+
config_options: Arc::new(ConfigOptions::default()),
527+
})?
528+
.into_array(2)
529+
};
530+
531+
let output = invoke(10)?;
532+
let output = output.as_primitive::<UInt64Type>();
533+
assert!(output.is_null(0));
534+
assert!(output.is_null(1));
535+
536+
let output = invoke(70)?;
537+
let output = output.as_primitive::<UInt64Type>();
538+
assert!(output.is_null(0));
539+
assert!(output.is_null(1));
540+
541+
Ok(())
542+
}
543+
}

0 commit comments

Comments
 (0)