Skip to content

Commit f3cebc5

Browse files
authored
feat[expr-common]: support regex and LIKE coercion on REE and Dict value types that require an extra coercion step (#21924)
The coerce_fn applied to REE and Dictionary values needs to be the higher-level coerce function (`like_coercion` / `regex_coercion`) rather than `string_coercion`, so that any REE or Dictionary value type can be coerced (not just primitive types).

## Which issue does this PR close?

<!-- We generally require a GitHub issue to be filed for all bug fixes and enhancements and this helps us generate change logs for our releases. You can link an issue to this PR using the GitHub syntax. For example `Closes #123` indicates that this PR will close issue #123. -->

- Closes #21923

## Rationale for this change

<!-- Why are you proposing this change? If this is already explained clearly in the issue then this section is not needed. Explaining clearly why changes are proposed helps reviewers understand your changes and offer better suggestions for fixes. -->

Regex and LIKE queries were failing on REE and Dictionary columns whose value type itself requires an extra coercion step (for example, Binary values or Dictionary-encoded values).

## What changes are included in this PR?

<!-- There is no need to duplicate the description in the issue here but it is sometimes worth providing a summary of the individual changes in this PR. -->

Correct unwrapping of REE and Dictionary values for regex/LIKE coercion.

## Are these changes tested?

<!-- We typically require tests for all PRs in order to: 1. Prevent the code from being accidentally broken by subsequent changes 2. Serve as another way to document the expected behavior of the code If tests are not included in your PR, please explain why (for example, are they covered by existing tests)? -->

Yes, with slt tests.

## Are there any user-facing changes?

<!-- If there are user-facing changes then we may require documentation to be updated before approving the PR. -->

Queries that would previously error now pass.

<!-- If there are any breaking changes to public APIs, please add the `api change` label. -->

---------

Signed-off-by: Alfonso Subiotto Marques <alfonso.subiotto@polarsignals.com>
1 parent 9bbc28b commit f3cebc5

3 files changed

Lines changed: 39 additions & 4 deletions

File tree

datafusion/expr-common/src/type_coercion/binary.rs

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -1766,8 +1766,8 @@ fn binary_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option<DataType>
17661766
pub fn like_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option<DataType> {
17671767
string_coercion(lhs_type, rhs_type)
17681768
.or_else(|| binary_to_string_coercion(lhs_type, rhs_type))
1769-
.or_else(|| dictionary_coercion(lhs_type, rhs_type, false, string_coercion))
1770-
.or_else(|| ree_coercion(lhs_type, rhs_type, false, string_coercion))
1769+
.or_else(|| dictionary_coercion(lhs_type, rhs_type, false, like_coercion))
1770+
.or_else(|| ree_coercion(lhs_type, rhs_type, false, like_coercion))
17711771
.or_else(|| regex_null_coercion(lhs_type, rhs_type))
17721772
.or_else(|| null_coercion(lhs_type, rhs_type))
17731773
}
@@ -1787,8 +1787,8 @@ fn regex_null_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option<DataT
17871787
/// This is a union of string coercion rules, dictionary coercion rules, and REE coercion rules.
17881788
pub fn regex_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option<DataType> {
17891789
string_coercion(lhs_type, rhs_type)
1790-
.or_else(|| dictionary_coercion(lhs_type, rhs_type, false, string_coercion))
1791-
.or_else(|| ree_coercion(lhs_type, rhs_type, false, string_coercion))
1790+
.or_else(|| dictionary_coercion(lhs_type, rhs_type, false, regex_coercion))
1791+
.or_else(|| ree_coercion(lhs_type, rhs_type, false, regex_coercion))
17921792
.or_else(|| regex_null_coercion(lhs_type, rhs_type))
17931793
}
17941794

datafusion/sqllogictest/test_files/regexp/regexp_like.slt

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -277,6 +277,23 @@ drop table strings
277277
statement ok
278278
drop table dict_table
279279

280+
# Dict value types that themselves need further coercion against the literal
281+
statement ok
282+
create table dict_inner as
283+
select arrow_cast(arrow_cast(c, 'Binary'), 'Dictionary(UInt32, Binary)') as bin_col,
284+
arrow_cast(arrow_cast(c, 'Dictionary(UInt32, Utf8)'),
285+
'Dictionary(Int32, Dictionary(UInt32, Utf8))') as nested_col
286+
from (values ('foo'), ('bar')) as t(c);
287+
288+
query BB
289+
select bin_col LIKE 'foo', nested_col ~ 'foo' from dict_inner;
290+
----
291+
true true
292+
false false
293+
294+
statement ok
295+
drop table dict_inner
296+
280297
# Ensure that regexp_like is rewritten to use the (more optimized) regex operators
281298
statement ok
282299
create table regexp_test as values

datafusion/sqllogictest/test_files/run_end_encoded.slt

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,3 +67,21 @@ FROM sensor_readings;
6767
23 23
6868
24 24
6969
25 25
70+
71+
# Regex / LIKE match against an REE column whose values are themselves Dictionary-encoded
72+
query BB rowsort
73+
SELECT
74+
arrow_cast(
75+
arrow_cast(sensor_id, 'Dictionary(UInt32, Utf8)'),
76+
'RunEndEncoded("run_ends": non-null Int32, "values": Dictionary(UInt32, Utf8))'
77+
) ~ 'sensor_A',
78+
arrow_cast(
79+
arrow_cast(sensor_id, 'Dictionary(UInt32, Utf8)'),
80+
'RunEndEncoded("run_ends": non-null Int32, "values": Dictionary(UInt32, Utf8))'
81+
) LIKE 'sensor_A'
82+
FROM sensor_readings;
83+
----
84+
false false
85+
true true
86+
true true
87+
true true

0 commit comments

Comments
 (0)