Skip to content

Commit 53c8541

Browse files
dd-david-levingabotechs
authored andcommitted
fix: string_to_array('', delim) returns empty array for PostgreSQL compatibility (apache#21104)
## Problem `string_to_array` was returning incorrect results for empty string input — both when the delimiter is non-empty and when the delimiter is itself an empty string. This diverges from PostgreSQL behavior. | Query | DataFusion (before) | PostgreSQL (expected) | |---|---|---| | `string_to_array('', ',')` | `['']` | `{}` | | `string_to_array('', '')` | `['']` | `{}` | | `string_to_array('', ',', 'x')` | `['']` | `{}` | | `string_to_array('', '', 'x')` | `['']` | `{}` | Results from datafusion-cli <img width="1435" height="104" alt="Screenshot 2026-03-23 at 9 14 08 AM" src="https://github.com/user-attachments/assets/2eaae366-7f8a-4220-87d2-f0b311c26712" /> **Root cause:** Rust's `str::split()` on an empty string always yields one empty-string element, so `"".split(",")` produces `[""]`. Additionally, the empty-delimiter branch unconditionally appended the (empty) string value. This is subtle because Arrow's text display format appears not to quote strings, so `[""]` renders as `[]` — indistinguishable from an actual empty array. Using `cardinality()` reveals the current length is 1, not 0. **PostgreSQL reference:** [db-fiddle](https://www.db-fiddle.com/f/oCF8EPaZFkDNKSg28rVVWy/3) ## Fix In `datafusion/functions-nested/src/string.rs`: - **Non-empty delimiter** `(Some(string), Some(delimiter))`: added `if !string.is_empty()` guard to skip splitting when input is empty. - **Empty delimiter** `(Some(string), Some(""))`: added `if !string.is_empty()` guard so the string value is only appended when non-empty. Both the plain variant and the `null_value` variant are fixed (4 checks total). ## Tests Added sqllogictest cases in `datafusion/sqllogictest/test_files/array.slt` using `cardinality()` to unambiguously verify the arrays are truly empty (not just displaying as empty): ```sql SELECT cardinality(string_to_array('', ',')) -- 0 SELECT cardinality(string_to_array('', '')) -- 0 SELECT cardinality(string_to_array('', ',', 'x')) -- 0 SELECT cardinality(string_to_array('', '', 'x')) -- 0 ``` Each test covers one of the four `is_empty` guard checks. All four fail without the fix (returning 1 instead of 0). (cherry picked from commit cdaecf0)
1 parent 0423028 commit 53c8541

File tree

2 files changed

+59
-35
lines changed

2 files changed

+59
-35
lines changed

datafusion/functions-nested/src/string.rs

Lines changed: 39 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -735,53 +735,57 @@ where
735735
let mut list_builder = ListBuilder::new(string_builder);
736736

737737
match null_value_array {
738-
None => {
739-
string_array.iter().zip(delimiter_array.iter()).for_each(
740-
|(string, delimiter)| {
741-
match (string, delimiter) {
742-
(Some(string), Some("")) => {
743-
list_builder.values().append_value(string);
744-
list_builder.append(true);
745-
}
746-
(Some(string), Some(delimiter)) => {
747-
string.split(delimiter).for_each(|s| {
748-
list_builder.values().append_value(s);
749-
});
750-
list_builder.append(true);
751-
}
752-
(Some(string), None) => {
753-
string.chars().map(|c| c.to_string()).for_each(|c| {
754-
list_builder.values().append_value(c.as_str());
755-
});
756-
list_builder.append(true);
757-
}
758-
_ => list_builder.append(false), // null value
738+
None => string_array.iter().zip(delimiter_array.iter()).for_each(
739+
|(string, delimiter)| match (string, delimiter) {
740+
(Some(string), Some("")) => {
741+
if !string.is_empty() {
742+
list_builder.values().append_value(string);
759743
}
760-
},
761-
)
762-
}
744+
list_builder.append(true);
745+
}
746+
(Some(string), Some(delimiter)) => {
747+
if !string.is_empty() {
748+
string.split(delimiter).for_each(|s| {
749+
list_builder.values().append_value(s);
750+
});
751+
}
752+
list_builder.append(true);
753+
}
754+
(Some(string), None) => {
755+
string.chars().map(|c| c.to_string()).for_each(|c| {
756+
list_builder.values().append_value(c.as_str());
757+
});
758+
list_builder.append(true);
759+
}
760+
_ => list_builder.append(false),
761+
},
762+
),
763763
Some(null_value_array) => string_array
764764
.iter()
765765
.zip(delimiter_array.iter())
766766
.zip(null_value_array.iter())
767767
.for_each(|((string, delimiter), null_value)| {
768768
match (string, delimiter) {
769769
(Some(string), Some("")) => {
770-
if Some(string) == null_value {
771-
list_builder.values().append_null();
772-
} else {
773-
list_builder.values().append_value(string);
770+
if !string.is_empty() {
771+
if Some(string) == null_value {
772+
list_builder.values().append_null();
773+
} else {
774+
list_builder.values().append_value(string);
775+
}
774776
}
775777
list_builder.append(true);
776778
}
777779
(Some(string), Some(delimiter)) => {
778-
string.split(delimiter).for_each(|s| {
779-
if Some(s) == null_value {
780-
list_builder.values().append_null();
781-
} else {
782-
list_builder.values().append_value(s);
783-
}
784-
});
780+
if !string.is_empty() {
781+
string.split(delimiter).for_each(|s| {
782+
if Some(s) == null_value {
783+
list_builder.values().append_null();
784+
} else {
785+
list_builder.values().append_value(s);
786+
}
787+
});
788+
}
785789
list_builder.append(true);
786790
}
787791
(Some(string), None) => {

datafusion/sqllogictest/test_files/array.slt

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8699,6 +8699,26 @@ SELECT string_to_array('abcxxxdef', 'xxx')
86998699
----
87008700
[abc, def]
87018701

8702+
query I
8703+
SELECT cardinality(string_to_array('', ','))
8704+
----
8705+
0
8706+
8707+
query I
8708+
SELECT cardinality(string_to_array('', ''))
8709+
----
8710+
0
8711+
8712+
query I
8713+
SELECT cardinality(string_to_array('', ',', 'x'))
8714+
----
8715+
0
8716+
8717+
query I
8718+
SELECT cardinality(string_to_array('', '', 'x'))
8719+
----
8720+
0
8721+
87028722
query ?
87038723
SELECT string_to_array('abc', '')
87048724
----

0 commit comments

Comments
 (0)