Skip to content

Commit ba74cd2

Browse files
committed
Revised approach to fixing numeric/string type coercion
1 parent 00e36e8 commit ba74cd2

20 files changed

Lines changed: 855 additions & 337 deletions

File tree

datafusion/core/src/physical_planner.rs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3436,16 +3436,16 @@ mod tests {
34363436

34373437
#[tokio::test]
34383438
async fn in_list_types() -> Result<()> {
3439-
// expression: "a in ('a', 1)"
3440-
let list = vec![lit("a"), lit(1i64)];
3439+
// expression: "c1 in ('a', 'b')" where c1 is Utf8
3440+
// Tests that IN list coercion works with compatible string types.
3441+
let list = vec![lit("a"), lit("b")];
34413442
let logical_plan = test_csv_scan()
34423443
.await?
34433444
// filter clause needs the type coercion rule applied
34443445
.filter(col("c12").lt(lit(0.05)))?
34453446
.project(vec![col("c1").in_list(list, false)])?
34463447
.build()?;
34473448
let execution_plan = plan(&logical_plan).await?;
3448-
// verify that the plan correctly adds cast from Int64(1) to Utf8, and the const will be evaluated.
34493449

34503450
let expected = r#"expr: BinaryExpr { left: BinaryExpr { left: Column { name: "c1", index: 0 }, op: Eq, right: Literal { value: Utf8("a"), field: Field { name: "lit", data_type: Utf8 } }, fail_on_overflow: false }"#;
34513451

datafusion/core/tests/expr_api/mod.rs

Lines changed: 18 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -342,20 +342,25 @@ fn test_create_physical_expr_nvl2() {
342342

343343
#[tokio::test]
344344
async fn test_create_physical_expr_coercion() {
345-
// create_physical_expr does apply type coercion and unwrapping in cast
345+
// create_physical_expr applies type coercion (and can unwrap/fold
346+
// literal casts). Comparison coercion prefers numeric types, so
347+
// string/int comparisons cast the string side to the numeric type.
346348
//
347-
// expect the cast on the literals
348-
// compare string function to int `id = 1`
349-
create_expr_test(col("id").eq(lit(1i32)), "id@0 = CAST(1 AS Utf8)");
350-
create_expr_test(lit(1i32).eq(col("id")), "CAST(1 AS Utf8) = id@0");
351-
// compare int col to string literal `i = '202410'`
352-
// Note this casts the column (not the field)
353-
create_expr_test(col("i").eq(lit("202410")), "CAST(i@1 AS Utf8) = 202410");
354-
create_expr_test(lit("202410").eq(col("i")), "202410 = CAST(i@1 AS Utf8)");
355-
// however, when simplified the casts on i should removed
356-
// https://github.com/apache/datafusion/issues/14944
357-
create_simplified_expr_test(col("i").eq(lit("202410")), "CAST(i@1 AS Utf8) = 202410");
358-
create_simplified_expr_test(lit("202410").eq(col("i")), "CAST(i@1 AS Utf8) = 202410");
349+
// string column vs int literal: id (Utf8) is cast to Int32
350+
create_expr_test(col("id").eq(lit(1i32)), "CAST(id@0 AS Int32) = 1");
351+
create_expr_test(lit(1i32).eq(col("id")), "1 = CAST(id@0 AS Int32)");
352+
// int column vs string literal: the string literal is cast to Int64
353+
create_expr_test(col("i").eq(lit("202410")), "i@1 = CAST(202410 AS Int64)");
354+
create_expr_test(lit("202410").eq(col("i")), "CAST(202410 AS Int64) = i@1");
355+
// when simplified, the literal cast is constant-folded
356+
create_simplified_expr_test(
357+
col("i").eq(lit("202410")),
358+
"i@1 = CAST(202410 AS Int64)",
359+
);
360+
create_simplified_expr_test(
361+
lit("202410").eq(col("i")),
362+
"i@1 = CAST(202410 AS Int64)",
363+
);
359364
}
360365

361366
/// Evaluates the specified expr as an aggregate and compares the result to the

datafusion/core/tests/sql/unparser.rs

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -143,16 +143,26 @@ fn tpch_queries() -> Vec<TestQuery> {
143143
}
144144

145145
/// Create a new SessionContext for testing that has all Clickbench tables registered.
146+
///
147+
/// Registers the raw Parquet as `hits_raw`, then creates a `hits` view that
148+
/// casts `EventDate` from UInt16 (day-offset) to DATE. This mirrors the
149+
/// approach used by the benchmark runner in `benchmarks/src/clickbench.rs`.
146150
async fn clickbench_test_context() -> Result<SessionContext> {
147151
let ctx = SessionContext::new();
148152
ctx.register_parquet(
149-
"hits",
153+
"hits_raw",
150154
"tests/data/clickbench_hits_10.parquet",
151155
ParquetReadOptions::default(),
152156
)
153157
.await?;
154-
// Sanity check we found the table by querying it's schema, it should not be empty
155-
// Otherwise if the path is wrong the tests will all fail in confusing ways
158+
ctx.sql(
159+
r#"CREATE VIEW hits AS
160+
SELECT * EXCEPT ("EventDate"),
161+
CAST(CAST("EventDate" AS INTEGER) AS DATE) AS "EventDate"
162+
FROM hits_raw"#,
163+
)
164+
.await?;
165+
// Sanity check we found the table by querying its schema
156166
let df = ctx.sql("SELECT * FROM hits LIMIT 1").await?;
157167
assert!(
158168
!df.schema().fields().is_empty(),

datafusion/expr-common/src/interval_arithmetic.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ use std::fmt::{self, Display, Formatter};
2222
use std::ops::{AddAssign, SubAssign};
2323

2424
use crate::operator::Operator;
25-
use crate::type_coercion::binary::{BinaryTypeCoercer, comparison_coercion_numeric};
25+
use crate::type_coercion::binary::{BinaryTypeCoercer, comparison_coercion};
2626

2727
use arrow::compute::{CastOptions, cast_with_options};
2828
use arrow::datatypes::{
@@ -734,7 +734,7 @@ impl Interval {
734734
(self.lower.clone(), self.upper.clone(), rhs.clone())
735735
} else {
736736
let maybe_common_type =
737-
comparison_coercion_numeric(&self.data_type(), &rhs.data_type());
737+
comparison_coercion(&self.data_type(), &rhs.data_type());
738738
assert_or_internal_err!(
739739
maybe_common_type.is_some(),
740740
"Data types must be compatible for containment checks, lhs:{}, rhs:{}",

datafusion/expr-common/src/signature.rs

Lines changed: 11 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -158,7 +158,7 @@ pub enum Arity {
158158
pub enum TypeSignature {
159159
/// One or more arguments of a common type out of a list of valid types.
160160
///
161-
/// For functions that take no arguments (e.g. `random()` see [`TypeSignature::Nullary`]).
161+
/// For functions that take no arguments (e.g. `random()`), see [`TypeSignature::Nullary`]).
162162
///
163163
/// # Examples
164164
///
@@ -184,38 +184,39 @@ pub enum TypeSignature {
184184
Uniform(usize, Vec<DataType>),
185185
/// One or more arguments with exactly the specified types in order.
186186
///
187-
/// For functions that take no arguments (e.g. `random()`) use [`TypeSignature::Nullary`].
187+
/// For functions that take no arguments (e.g. `random()`), use [`TypeSignature::Nullary`].
188188
Exact(Vec<DataType>),
189189
/// One or more arguments belonging to the [`TypeSignatureClass`], in order.
190190
///
191191
/// [`Coercion`] contains not only the desired type but also the allowed
192192
/// casts. For example, if you expect a function has string type, but you
193193
/// also allow it to be casted from binary type.
194194
///
195-
/// For functions that take no arguments (e.g. `random()`) see [`TypeSignature::Nullary`].
195+
/// For functions that take no arguments (e.g. `random()`), see [`TypeSignature::Nullary`].
196196
Coercible(Vec<Coercion>),
197197
/// One or more arguments coercible to a single, comparable type.
198198
///
199199
/// Each argument will be coerced to a single type using the
200-
/// coercion rules described in [`comparison_coercion_numeric`].
200+
/// coercion rules described in [`comparison_coercion`].
201201
///
202202
/// # Examples
203203
///
204204
/// If the `nullif(1, 2)` function is called with `i32` and `i64` arguments
205205
/// the types will both be coerced to `i64` before the function is invoked.
206206
///
207207
/// If the `nullif('1', 2)` function is called with `Utf8` and `i64` arguments
208-
/// the types will both be coerced to `Utf8` before the function is invoked.
208+
/// the types will both be coerced to `Int64` before the function is invoked
209+
/// (numeric is preferred over string).
209210
///
210211
/// Note:
211-
/// - For functions that take no arguments (e.g. `random()` see [`TypeSignature::Nullary`]).
212+
/// - For functions that take no arguments (e.g. `random()`), see [`TypeSignature::Nullary`]).
212213
/// - If all arguments have type [`DataType::Null`], they are coerced to `Utf8`
213214
///
214-
/// [`comparison_coercion_numeric`]: crate::type_coercion::binary::comparison_coercion_numeric
215+
/// [`comparison_coercion`]: crate::type_coercion::binary::comparison_coercion
215216
Comparable(usize),
216217
/// One or more arguments of arbitrary types.
217218
///
218-
/// For functions that take no arguments (e.g. `random()`) use [`TypeSignature::Nullary`].
219+
/// For functions that take no arguments (e.g. `random()`), use [`TypeSignature::Nullary`].
219220
Any(usize),
220221
/// Matches exactly one of a list of [`TypeSignature`]s.
221222
///
@@ -233,7 +234,7 @@ pub enum TypeSignature {
233234
///
234235
/// See [`NativeType::is_numeric`] to know which type is considered numeric
235236
///
236-
/// For functions that take no arguments (e.g. `random()`) use [`TypeSignature::Nullary`].
237+
/// For functions that take no arguments (e.g. `random()`), use [`TypeSignature::Nullary`].
237238
///
238239
/// [`NativeType::is_numeric`]: datafusion_common::types::NativeType::is_numeric
239240
Numeric(usize),
@@ -246,7 +247,7 @@ pub enum TypeSignature {
246247
/// For example, if a function is called with (utf8, large_utf8), all
247248
/// arguments will be coerced to `LargeUtf8`
248249
///
249-
/// For functions that take no arguments (e.g. `random()` use [`TypeSignature::Nullary`]).
250+
/// For functions that take no arguments (e.g. `random()`), use [`TypeSignature::Nullary`]).
250251
String(usize),
251252
/// No arguments
252253
Nullary,

0 commit comments

Comments
 (0)