Skip to content

Commit ea0928c

Browse files
theirixJefffrey
andauthored
feat: support binary arguments for StringConcat operator (#21883)
## Which issue does this PR close? - Refers #12709. ## Rationale for this change Binary arguments are supported for concat UDFs, but not for the pipe operator (`||`), which supports only text. ## What changes are included in this PR? - Support binary concat by providing specialised kernels for pure binary operations. Avoid support of mixed string/binary arguments as it doesn't match the behaviour of major DBs, except for Postgres (see the table in the linked ticket). - Add `concat_elements_binary_view_array` kernel - Refactor private `binary_coercion` to support symmetric BinaryLike + BinaryLike - required for the new codeflow Concat UDFs are out of scope and supported separately. ## Are these changes tested? - Existing SLTs - Moved a few tests to a more appropriate `binary.slt` - Added new unit tests ## Are there any user-facing changes? Concatenation `||` operator now allows binary+binary concatenation (`SELECT x'636166c3a9' || x'68656c6c6f'`), but denies mixed string+binary concatenation `SELECT x'636166c3a9' || 'hello'` --------- Co-authored-by: Jeffrey Vo <jeffrey.vo.australia@gmail.com>
1 parent 37dbdaf commit ea0928c

6 files changed

Lines changed: 244 additions & 35 deletions

File tree

datafusion/expr-common/src/type_coercion/binary.rs

Lines changed: 34 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1611,9 +1611,37 @@ fn ree_coercion(
16111611
/// This is a union of string coercion rules and specified rules:
16121612
/// 1. At least one side of lhs and rhs should be string type (Utf8 / LargeUtf8)
16131613
/// 2. Data type of the other side should be able to cast to string type
1614+
/// 3. Binary and string types cannot be mixed
16141615
fn string_concat_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option<DataType> {
16151616
use arrow::datatypes::DataType::*;
16161617
string_coercion(lhs_type, rhs_type).or_else(|| match (lhs_type, rhs_type) {
1618+
// Allow pure binary + binary
1619+
(
1620+
Binary | LargeBinary | BinaryView | FixedSizeBinary(_),
1621+
Binary | LargeBinary | BinaryView | FixedSizeBinary(_),
1622+
) => {
1623+
// Coerce fixed-sized binary to variable-sized `Binary` to make uniform signature
1624+
// with the `Binary` result
1625+
let lhs_type = match lhs_type {
1626+
FixedSizeBinary(_) => &Binary,
1627+
val => val,
1628+
};
1629+
let rhs_type = match rhs_type {
1630+
FixedSizeBinary(_) => &Binary,
1631+
val => val,
1632+
};
1633+
binary_coercion(lhs_type, rhs_type)
1634+
}
1635+
// Deny other mixed binary + string combinations
1636+
(
1637+
Binary | LargeBinary | BinaryView | FixedSizeBinary(_),
1638+
Utf8 | LargeUtf8 | Utf8View,
1639+
) => None,
1640+
(
1641+
Utf8 | LargeUtf8 | Utf8View,
1642+
Binary | LargeBinary | BinaryView | FixedSizeBinary(_),
1643+
) => None,
1644+
// Predicate-based coercion rules are following
16171645
(Utf8View, from_type) | (from_type, Utf8View) => {
16181646
string_concat_internal_coercion(from_type, &Utf8View)
16191647
}
@@ -1626,7 +1654,6 @@ fn string_concat_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option<Da
16261654
(Dictionary(_, lhs_value_type), Dictionary(_, rhs_value_type)) => {
16271655
string_coercion(lhs_value_type, rhs_value_type).or(None)
16281656
}
1629-
(Binary, Binary) => Some(Utf8),
16301657
_ => None,
16311658
})
16321659
}
@@ -1756,13 +1783,17 @@ pub fn binary_to_string_coercion(
17561783
fn binary_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option<DataType> {
17571784
use arrow::datatypes::DataType::*;
17581785
match (lhs_type, rhs_type) {
1786+
// Prefer symmetric coercion (in case the function is called directly)
1787+
(Binary, Binary) => Some(Binary),
1788+
(LargeBinary, LargeBinary) => Some(LargeBinary),
1789+
(BinaryView, BinaryView) => Some(BinaryView),
17591790
// If BinaryView is in any side, we coerce to BinaryView.
1760-
(BinaryView, BinaryView | Binary | LargeBinary | Utf8 | LargeUtf8 | Utf8View)
1791+
(BinaryView, Binary | LargeBinary | Utf8 | LargeUtf8 | Utf8View)
17611792
| (LargeBinary | Binary | Utf8 | LargeUtf8 | Utf8View, BinaryView) => {
17621793
Some(BinaryView)
17631794
}
17641795
// Prefer LargeBinary over Binary
1765-
(LargeBinary | Binary | Utf8 | LargeUtf8 | Utf8View, LargeBinary)
1796+
(Binary | Utf8 | LargeUtf8 | Utf8View, LargeBinary)
17661797
| (LargeBinary, Binary | Utf8 | LargeUtf8 | Utf8View) => Some(LargeBinary),
17671798

17681799
// If Utf8View/LargeUtf8 presents need to be large Binary

datafusion/expr-common/src/type_coercion/binary/tests/comparison.rs

Lines changed: 106 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -897,3 +897,109 @@ fn test_binary_comparison_string_numeric_coercion() -> Result<()> {
897897
}
898898
Ok(())
899899
}
900+
901+
#[test]
902+
fn test_string_concat_coercion() -> Result<()> {
903+
// Binary
904+
test_coercion_binary_rule!(
905+
DataType::Binary,
906+
DataType::Binary,
907+
Operator::StringConcat,
908+
DataType::Binary
909+
);
910+
test_coercion_binary_rule!(
911+
DataType::LargeBinary,
912+
DataType::LargeBinary,
913+
Operator::StringConcat,
914+
DataType::LargeBinary
915+
);
916+
test_coercion_binary_rule!(
917+
DataType::BinaryView,
918+
DataType::BinaryView,
919+
Operator::StringConcat,
920+
DataType::BinaryView
921+
);
922+
test_coercion_binary_rule!(
923+
DataType::Binary,
924+
DataType::LargeBinary,
925+
Operator::StringConcat,
926+
DataType::LargeBinary
927+
);
928+
test_coercion_binary_rule!(
929+
DataType::BinaryView,
930+
DataType::Binary,
931+
Operator::StringConcat,
932+
DataType::BinaryView
933+
);
934+
test_coercion_binary_rule!(
935+
DataType::FixedSizeBinary(4),
936+
DataType::FixedSizeBinary(16),
937+
Operator::StringConcat,
938+
DataType::Binary
939+
);
940+
test_coercion_binary_rule!(
941+
DataType::FixedSizeBinary(4),
942+
DataType::LargeBinary,
943+
Operator::StringConcat,
944+
DataType::LargeBinary
945+
);
946+
test_coercion_binary_rule!(
947+
DataType::FixedSizeBinary(4),
948+
DataType::BinaryView,
949+
Operator::StringConcat,
950+
DataType::BinaryView
951+
);
952+
953+
// String
954+
test_coercion_binary_rule!(
955+
DataType::Utf8,
956+
DataType::Utf8,
957+
Operator::StringConcat,
958+
DataType::Utf8
959+
);
960+
test_coercion_binary_rule!(
961+
DataType::LargeUtf8,
962+
DataType::LargeUtf8,
963+
Operator::StringConcat,
964+
DataType::LargeUtf8
965+
);
966+
test_coercion_binary_rule!(
967+
DataType::Utf8View,
968+
DataType::Utf8View,
969+
Operator::StringConcat,
970+
DataType::Utf8View
971+
);
972+
973+
// Mixed string-binary
974+
for string_dt in [DataType::Utf8, DataType::LargeUtf8, DataType::Utf8View] {
975+
for binary_dt in [
976+
DataType::Binary,
977+
DataType::LargeBinary,
978+
DataType::BinaryView,
979+
DataType::FixedSizeBinary(8),
980+
] {
981+
assert!(
982+
BinaryTypeCoercer::new(&binary_dt, &Operator::StringConcat, &string_dt,)
983+
.get_input_types()
984+
.is_err(),
985+
"{binary_dt} || {string_dt}"
986+
);
987+
assert!(
988+
BinaryTypeCoercer::new(&string_dt, &Operator::StringConcat, &binary_dt,)
989+
.get_input_types()
990+
.is_err(),
991+
"{string_dt} || {binary_dt}"
992+
);
993+
}
994+
}
995+
996+
// Mixed string-other
997+
test_coercion_binary_rule!(
998+
DataType::Utf8,
999+
DataType::Timestamp(Second, None),
1000+
Operator::StringConcat,
1001+
DataType::Utf8
1002+
);
1003+
1004+
Ok(())
1005+
}

datafusion/physical-expr/src/expressions/binary.rs

Lines changed: 17 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,9 @@ use std::sync::Arc;
2424

2525
use arrow::array::*;
2626
use arrow::compute::kernels::boolean::{and_kleene, or_kleene};
27-
use arrow::compute::kernels::concat_elements::concat_elements_utf8;
27+
use arrow::compute::kernels::concat_elements::{
28+
concat_element_binary, concat_elements_utf8,
29+
};
2830
use arrow::compute::{SlicesIterator, cast, filter_record_batch};
2931
use arrow::datatypes::*;
3032
use arrow::error::ArrowError;
@@ -46,7 +48,8 @@ use kernels::{
4648
bitwise_and_dyn, bitwise_and_dyn_scalar, bitwise_or_dyn, bitwise_or_dyn_scalar,
4749
bitwise_shift_left_dyn, bitwise_shift_left_dyn_scalar, bitwise_shift_right_dyn,
4850
bitwise_shift_right_dyn_scalar, bitwise_xor_dyn, bitwise_xor_dyn_scalar,
49-
concat_elements_utf8view, regex_match_dyn, regex_match_dyn_scalar,
51+
concat_elements_binary_view_array, concat_elements_utf8view, regex_match_dyn,
52+
regex_match_dyn_scalar,
5053
};
5154

5255
/// Binary expression
@@ -928,18 +931,6 @@ fn pre_selection_scatter(
928931
}
929932

930933
fn concat_elements(left: &ArrayRef, right: &ArrayRef) -> Result<ArrayRef> {
931-
if *left.data_type() == DataType::Binary && *right.data_type() == DataType::Binary {
932-
// Cast Binary to Utf8 to validate UTF-8 encoding before concatenation
933-
// Follow widespread approach of PostgreSQL, sqlite, DuckDB, Snowflake
934-
// Spark does it in a different way by making a binary-to-binary concatenation
935-
let left = cast(left.as_ref(), &DataType::Utf8)?;
936-
let right = cast(right.as_ref(), &DataType::Utf8)?;
937-
return Ok(Arc::new(concat_elements_utf8(
938-
left.as_string::<i32>(),
939-
right.as_string::<i32>(),
940-
)?));
941-
}
942-
943934
Ok(match left.data_type() {
944935
DataType::Utf8 => Arc::new(concat_elements_utf8(
945936
left.as_string::<i32>(),
@@ -953,6 +944,18 @@ fn concat_elements(left: &ArrayRef, right: &ArrayRef) -> Result<ArrayRef> {
953944
left.as_string_view(),
954945
right.as_string_view(),
955946
)?),
947+
DataType::Binary => Arc::new(concat_element_binary::<i32>(
948+
left.as_binary(),
949+
right.as_binary(),
950+
)?),
951+
DataType::LargeBinary => Arc::new(concat_element_binary::<i64>(
952+
left.as_binary(),
953+
right.as_binary(),
954+
)?),
955+
DataType::BinaryView => Arc::new(concat_elements_binary_view_array(
956+
left.as_binary_view(),
957+
right.as_binary_view(),
958+
)?),
956959
other => {
957960
return internal_err!(
958961
"Data type {other:?} not supported for binary operation 'concat_elements' on string arrays"

datafusion/physical-expr/src/expressions/binary/kernels.rs

Lines changed: 47 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818
//! This module contains computation kernels that are specific to
1919
//! datafusion and not (yet) targeted to port upstream to arrow
2020
use arrow::array::*;
21-
use arrow::buffer::NullBuffer;
21+
use arrow::buffer::{MutableBuffer, NullBuffer};
2222
use arrow::compute::kernels::bitwise::{
2323
bitwise_and, bitwise_and_scalar, bitwise_or, bitwise_or_scalar, bitwise_shift_left,
2424
bitwise_shift_left_scalar, bitwise_shift_right, bitwise_shift_right_scalar,
@@ -161,11 +161,11 @@ create_left_integral_dyn_scalar_kernel!(
161161
bitwise_shift_left_scalar
162162
);
163163

164-
/// Concatenates two `StringViewArray`s element-wise.
164+
/// Concatenates two `StringViewArray`s element-wise.
165165
/// If either element is `Null`, the result element is also `Null`.
166166
///
167167
/// # Errors
168-
/// - Returns an error if the input arrays have different lengths.
168+
/// - Returns an error if the input arrays have different lengths.
169169
/// - Returns an error if any concatenated string exceeds `u32::MAX` (≈4 GB) in length.
170170
pub fn concat_elements_utf8view(
171171
left: &StringViewArray,
@@ -204,6 +204,50 @@ pub fn concat_elements_utf8view(
204204
Ok(result.finish())
205205
}
206206

207+
/// Concatenates two `BinaryViewArray`s element-wise.
208+
/// If either element is `Null`, the result element is also `Null`.
209+
///
210+
/// # Errors
211+
/// - Returns an error if the input arrays have different lengths.
212+
/// - Returns an error if any concatenated string exceeds `u32::MAX` in length.
213+
pub fn concat_elements_binary_view_array(
214+
left: &BinaryViewArray,
215+
right: &BinaryViewArray,
216+
) -> std::result::Result<BinaryViewArray, ArrowError> {
217+
if left.len() != right.len() {
218+
return Err(ArrowError::ComputeError(format!(
219+
"Arrays must have the same length: {} != {}",
220+
left.len(),
221+
right.len()
222+
)));
223+
}
224+
let mut result = BinaryViewBuilder::with_capacity(left.len());
225+
226+
// Avoid reallocations by writing to a reused buffer (note we could be even
227+
// more efficient by creating the view directly here and avoid the buffer
228+
// but that would be more complex)
229+
let mut buffer = MutableBuffer::new(0);
230+
231+
// Pre-compute combined null bitmap, so the per-row NULL check is more
232+
// efficient
233+
let nulls = NullBuffer::union(left.nulls(), right.nulls());
234+
235+
for i in 0..left.len() {
236+
if nulls.as_ref().is_some_and(|n| n.is_null(i)) {
237+
result.append_null();
238+
} else {
239+
let l = left.value(i);
240+
let r = right.value(i);
241+
buffer.clear();
242+
buffer.extend_from_slice(l);
243+
buffer.extend_from_slice(r);
244+
// No try-version of append_value
245+
result.try_append_value(&buffer)?;
246+
}
247+
}
248+
Ok(result.finish())
249+
}
250+
207251
/// Invoke a compute kernel on a pair of binary data arrays with flags
208252
macro_rules! regexp_is_match_flag {
209253
($LEFT:expr, $RIGHT:expr, $ARRAYTYPE:ident, $NOT:expr, $FLAG:expr) => {{

datafusion/sqllogictest/test_files/binary.slt

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -321,3 +321,40 @@ query T
321321
SELECT split_part(CAST(binary AS VARCHAR), 'o', 2) FROM t WHERE binary = X'466f6f';
322322
----
323323
(empty)
324+
325+
# Pipe concatenation of binaries always provides a binary
326+
query ?
327+
SELECT x'636166c3a9' || x'68656c6c6f';
328+
----
329+
636166c3a968656c6c6f
330+
331+
# Pipe concatenation of binary and other kind of binary also provides a binary
332+
query ?
333+
SELECT x'636166c3a9' || arrow_cast(x'68656c6c6f', 'LargeBinary');
334+
----
335+
636166c3a968656c6c6f
336+
337+
query ?
338+
SELECT x'636166c3a9' || arrow_cast(arrow_cast(x'68656c6c6f', 'LargeBinary'), 'BinaryView');
339+
----
340+
636166c3a968656c6c6f
341+
342+
query ?T
343+
SELECT x'636166c3a9' || arrow_cast(x'68656c6c6f', 'FixedSizeBinary(5)'), arrow_typeof(x'636166c3a9' || arrow_cast(x'68656c6c6f', 'FixedSizeBinary(5)'));
344+
----
345+
636166c3a968656c6c6f Binary
346+
347+
query ?T
348+
SELECT arrow_cast(x'6361', 'FixedSizeBinary(2)') || arrow_cast(x'68656c6c6f', 'FixedSizeBinary(5)'), arrow_typeof(arrow_cast(x'6361', 'FixedSizeBinary(2)') || arrow_cast(x'68656c6c6f', 'FixedSizeBinary(5)'));
349+
----
350+
636168656c6c6f Binary
351+
352+
# Byte pipe operator is forbidden for mixed binary and text
353+
query error DataFusion error: Error during planning: Cannot infer common string type for string concat operation Binary || Utf8
354+
SELECT x'c3a9' || 'hello';
355+
356+
query error DataFusion error: Error during planning: Cannot infer common string type for string concat operation Utf8 || LargeBinary
357+
SELECT 'hello' || arrow_cast(arrow_cast('hello', 'Binary'), 'LargeBinary');
358+
359+
query error DataFusion error: Error during planning: Cannot infer common string type for string concat operation Utf8 || BinaryView
360+
SELECT 'hello' || arrow_cast(arrow_cast('hello', 'Binary'), 'BinaryView');

datafusion/sqllogictest/test_files/scalar.slt

Lines changed: 3 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1739,23 +1739,11 @@ SELECT 'a' || 42 || 23.3
17391739
----
17401740
a4223.3
17411741

1742-
# concat of binary and text provides a text output
1743-
query T
1744-
select arrow_cast('Café', 'Utf8') || arrow_cast('Foobar', 'Binary');
1745-
----
1746-
CaféFoobar
1747-
1748-
query T
1749-
select arrow_cast('Café', 'Binary') || arrow_cast('Foobar', 'Utf8');
1750-
----
1751-
CaféFoobar
1752-
1753-
# Concat of two binaries should cast arguments to text and produce a text output,
1754-
# following common behaviour of PostreSQL. However, Spark is providing binary
1755-
query T
1742+
# Concat operator of two binaries uses their binary representation without text at all
1743+
query ?
17561744
select arrow_cast('Café', 'Binary') || arrow_cast('Foobar', 'Binary');
17571745
----
1758-
CaféFoobar
1746+
436166c3a9466f6f626172
17591747

17601748

17611749
# test_not_expressions()

0 commit comments

Comments
 (0)