Skip to content

Commit a0dbbab

Browse files
fix: Fix Spark slice function Null type to GenericListArray casting issue (#20469)
## Which issue does this PR close? - Closes #20466. ## Rationale for this change Currently, the Spark `slice` function accepts Null Arrays and returns `NULL` for such queries. The DataFusion-Spark `slice` function also needs to return `NULL` when a Null Array is provided. **Spark Behavior** (tested with latest Spark master): ``` > SELECT slice(NULL, 1, 2); +-----------------+ |slice(NULL, 1, 2)| +-----------------+ | null| +-----------------+ ``` **DataFusion Behavior:** Current: ``` query error SELECT slice(NULL, 1, 2); ---- DataFusion error: Internal error: could not cast array of type Null to arrow_array::array::list_array::GenericListArray<i32>. This issue was likely caused by a bug in DataFusion's code. Please help us to resolve this by filing a bug report in our issue tracker: https://github.com/apache/datafusion/issues ``` New: ``` query ? SELECT slice(NULL, 1, 2); ---- NULL ``` ## What changes are included in this PR? Explained in the first section above. ## Are these changes tested? Yes, new unit test cases were added to both `slice.rs` and `slice.slt`. ## Are there any user-facing changes? Yes. Currently, the `slice` function returns an error message for `Null` Array inputs; the expected behavior is to return `NULL`, so the end user will get the expected result instead of an error message. --------- Co-authored-by: Martin Grigorov <martin-g@users.noreply.github.com>
1 parent bd2af68 commit a0dbbab

2 files changed

Lines changed: 102 additions & 6 deletions

File tree

  • datafusion

datafusion/spark/src/function/array/slice.rs

Lines changed: 79 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,9 @@ use arrow::array::{Array, ArrayRef, Int64Builder};
1919
use arrow::datatypes::{DataType, Field, FieldRef};
2020
use datafusion_common::cast::{as_int64_array, as_list_array};
2121
use datafusion_common::utils::ListCoercion;
22-
use datafusion_common::{Result, exec_err, internal_err, utils::take_function_args};
22+
use datafusion_common::{
23+
Result, ScalarValue, exec_err, internal_err, utils::take_function_args,
24+
};
2325
use datafusion_expr::{
2426
ArrayFunctionArgument, ArrayFunctionSignature, ColumnarValue, ReturnFieldArgs,
2527
ScalarFunctionArgs, ScalarUDFImpl, Signature, TypeSignature, Volatility,
@@ -78,17 +80,28 @@ impl ScalarUDFImpl for SparkSlice {
7880
fn return_field_from_args(&self, args: ReturnFieldArgs) -> Result<FieldRef> {
7981
let nullable = args.arg_fields.iter().any(|f| f.is_nullable());
8082

81-
Ok(Arc::new(Field::new(
82-
"slice",
83-
args.arg_fields[0].data_type().clone(),
84-
nullable,
85-
)))
83+
let data_type = match args.arg_fields[0].data_type() {
84+
DataType::Null => {
85+
DataType::List(Arc::new(Field::new_list_field(DataType::Null, true)))
86+
}
87+
dt => dt.clone(),
88+
};
89+
90+
Ok(Arc::new(Field::new("slice", data_type, nullable)))
8691
}
8792

8893
fn invoke_with_args(
8994
&self,
9095
mut func_args: ScalarFunctionArgs,
9196
) -> Result<ColumnarValue> {
97+
if func_args.args[0].data_type() == DataType::Null {
98+
return Ok(ColumnarValue::Scalar(ScalarValue::new_null_list(
99+
DataType::Null,
100+
true,
101+
1,
102+
)));
103+
}
104+
92105
let array_len = func_args
93106
.args
94107
.iter()
@@ -165,3 +178,63 @@ fn calculate_start_end(args: &[ArrayRef]) -> Result<(ArrayRef, ArrayRef)> {
165178

166179
Ok((Arc::new(adjusted_start.finish()), Arc::new(end.finish())))
167180
}
181+
182+
#[cfg(test)]
183+
mod tests {
184+
use super::*;
185+
use arrow::array::NullArray;
186+
use arrow::datatypes::Field;
187+
use datafusion_common::ScalarValue;
188+
use datafusion_common::cast::as_list_array;
189+
use datafusion_expr::ReturnFieldArgs;
190+
191+
#[test]
192+
fn test_spark_slice_function_when_input_is_null() {
193+
let slice = SparkSlice::new();
194+
let arg_fields: Vec<Arc<Field>> = vec![
195+
Arc::new(Field::new("a", DataType::Null, true)),
196+
Arc::new(Field::new("s", DataType::Int64, true)),
197+
Arc::new(Field::new("l", DataType::Int64, true)),
198+
];
199+
let out = slice
200+
.return_field_from_args(ReturnFieldArgs {
201+
arg_fields: &arg_fields,
202+
scalar_arguments: &[],
203+
})
204+
.unwrap();
205+
assert_eq!(
206+
out.data_type(),
207+
&DataType::List(Arc::new(Field::new_list_field(DataType::Null, true)))
208+
);
209+
}
210+
211+
#[test]
212+
fn test_spark_slice_function_when_input_array_is_null() {
213+
let input_args = vec![
214+
ColumnarValue::Array(Arc::new(NullArray::new(1))),
215+
ColumnarValue::Scalar(ScalarValue::Int64(Some(1))),
216+
ColumnarValue::Scalar(ScalarValue::Int64(Some(3))),
217+
];
218+
219+
let args = ScalarFunctionArgs {
220+
args: input_args,
221+
arg_fields: vec![Arc::new(Field::new("item", DataType::Null, true))],
222+
number_rows: 1,
223+
return_field: Arc::new(Field::new(
224+
"slice",
225+
DataType::List(Arc::new(Field::new_list_field(DataType::Null, true))),
226+
true,
227+
)),
228+
config_options: Arc::new(Default::default()),
229+
};
230+
let slice = SparkSlice::new();
231+
let result = slice.invoke_with_args(args).unwrap();
232+
let arr = result.to_array(1).unwrap();
233+
let list = as_list_array(&arr).unwrap();
234+
assert_eq!(
235+
arr.data_type(),
236+
&DataType::List(Arc::new(Field::new_list_field(DataType::Null, true)))
237+
);
238+
assert!(list.is_null(0));
239+
}
240+
}

datafusion/sqllogictest/test_files/spark/array/slice.slt

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -114,3 +114,26 @@ query ?
114114
SELECT slice([1, 2, 3, 4], CAST('2' AS INT), 4);
115115
----
116116
[2, 3, 4]
117+
118+
query ?
119+
SELECT slice(column1, column2, column3)
120+
FROM VALUES
121+
(NULL, 1, 2),
122+
(NULL, 1, -2),
123+
(NULL, -1, 2),
124+
(NULL, 0, 2);
125+
----
126+
NULL
127+
NULL
128+
NULL
129+
NULL
130+
131+
query ?
132+
SELECT slice(slice(NULL, 1, 2), 1, 2)
133+
----
134+
NULL
135+
136+
query ?
137+
SELECT slice(slice(make_array(NULL), 1, 2), 1, 2)
138+
----
139+
[NULL]

0 commit comments

Comments
 (0)