Skip to content

Commit 8c478e9

Browse files
kosiewalamb
andauthored
Disallow positional struct casting when field names don’t overlap (#19955)
## Which issue does this PR close? * Closes #19841. ## Rationale for this change Struct-to-struct casting previously fell back to **positional mapping** when there was **no field-name overlap** and the number of fields matched. That behavior is ambiguous and can silently produce incorrect results when source/target schemas have different field naming conventions or ordering. This PR makes struct casting **strictly name-based**: when there is no overlap in field names between the source and target structs, the cast is rejected with a clear planning error. This prevents accidental, hard-to-detect data corruption and forces callers to provide explicit field names (or align schemas) when casting. ## What changes are included in this PR? * Removed the positional fallback logic for struct casting in `cast_struct_column`; child fields are now resolved **only by name**. * Updated `validate_struct_compatibility` to **error out** when there is **no field name overlap**, instead of allowing positional compatibility checks. * Updated unit tests to reflect the new behavior (no-overlap casts now fail with an appropriate error). * Updated SQLLogicTest files to construct structs using **explicit field names** (e.g. `{id: 1}` / `{a: 1, b: 'x'}` or `struct(… AS field)`), avoiding reliance on positional behavior. * Improved error messaging to explicitly mention the lack of field name overlap. ## Are these changes tested? Yes. * Updated existing Rust unit tests in `nested_struct.rs` to assert the new failure mode and error message. * Updated SQLLogicTest coverage (`struct.slt`, `joins.slt`) to use named struct literals so tests continue to validate struct behavior without positional casting. ## Are there any user-facing changes? Yes — behavioral change / potential breaking change. * Casting between two `STRUCT`s with **no overlapping field names** now fails (previously it could succeed via positional mapping if field counts matched). * Users must provide explicit field names (e.g. `{a: 1, b: 'x'}` or `struct(expr AS a, expr AS b)`) or ensure schemas share field names. * Error messages are more explicit: casts are rejected when there is “no field name overlap”. ## LLM-generated code disclosure This PR includes LLM-generated code and comments. All LLM-generated content has been manually reviewed and tested. --------- Co-authored-by: Andrew Lamb <andrew@nerdnetworks.org>
1 parent 96a6bd7 commit 8c478e9

6 files changed

Lines changed: 170 additions & 94 deletions

File tree

datafusion/common/src/nested_struct.rs

Lines changed: 32 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ use std::{collections::HashSet, sync::Arc};
3131
///
3232
/// ## Field Matching Strategy
3333
/// - **By Name**: Source struct fields are matched to target fields by name (case-sensitive)
34-
/// - **By Position**: When there is no name overlap and the field counts match, fields are cast by index
34+
/// - **No Positional Mapping**: Structs with no overlapping field names are rejected
3535
/// - **Type Adaptation**: When a matching field is found, it is recursively cast to the target field's type
3636
/// - **Missing Fields**: Target fields not present in the source are filled with null values
3737
/// - **Extra Fields**: Source fields not present in the target are ignored
@@ -67,24 +67,16 @@ fn cast_struct_column(
6767
if let Some(source_struct) = source_col.as_any().downcast_ref::<StructArray>() {
6868
let source_fields = source_struct.fields();
6969
validate_struct_compatibility(source_fields, target_fields)?;
70-
let has_overlap = has_one_of_more_common_fields(source_fields, target_fields);
71-
7270
let mut fields: Vec<Arc<Field>> = Vec::with_capacity(target_fields.len());
7371
let mut arrays: Vec<ArrayRef> = Vec::with_capacity(target_fields.len());
7472
let num_rows = source_col.len();
7573

76-
// Iterate target fields and pick source child either by name (when fields overlap)
77-
// or by position (when there is no name overlap).
78-
for (index, target_child_field) in target_fields.iter().enumerate() {
74+
// Iterate target fields and pick source child by name when present.
75+
for target_child_field in target_fields.iter() {
7976
fields.push(Arc::clone(target_child_field));
8077

81-
// Determine the source child column: by name when overlapping names exist,
82-
// otherwise by position.
83-
let source_child_opt: Option<&ArrayRef> = if has_overlap {
84-
source_struct.column_by_name(target_child_field.name())
85-
} else {
86-
Some(source_struct.column(index))
87-
};
78+
let source_child_opt =
79+
source_struct.column_by_name(target_child_field.name());
8880

8981
match source_child_opt {
9082
Some(source_child_col) => {
@@ -230,20 +222,11 @@ pub fn validate_struct_compatibility(
230222
) -> Result<()> {
231223
let has_overlap = has_one_of_more_common_fields(source_fields, target_fields);
232224
if !has_overlap {
233-
if source_fields.len() != target_fields.len() {
234-
return _plan_err!(
235-
"Cannot cast struct with {} fields to {} fields without name overlap; positional mapping is ambiguous",
236-
source_fields.len(),
237-
target_fields.len()
238-
);
239-
}
240-
241-
for (source_field, target_field) in source_fields.iter().zip(target_fields.iter())
242-
{
243-
validate_field_compatibility(source_field, target_field)?;
244-
}
245-
246-
return Ok(());
225+
return _plan_err!(
226+
"Cannot cast struct with {} fields to {} fields because there is no field name overlap",
227+
source_fields.len(),
228+
target_fields.len()
229+
);
247230
}
248231

249232
// Check compatibility for each target field
@@ -323,7 +306,11 @@ fn validate_field_compatibility(
323306
Ok(())
324307
}
325308

326-
fn has_one_of_more_common_fields(
309+
/// Check if two field lists have at least one common field by name.
310+
///
311+
/// This is useful for validating struct compatibility when casting between structs,
312+
/// ensuring that source and target fields have overlapping names.
313+
pub fn has_one_of_more_common_fields(
327314
source_fields: &[FieldRef],
328315
target_fields: &[FieldRef],
329316
) -> bool {
@@ -546,7 +533,7 @@ mod tests {
546533
}
547534

548535
#[test]
549-
fn test_validate_struct_compatibility_positional_no_overlap_mismatch_len() {
536+
fn test_validate_struct_compatibility_no_overlap_mismatch_len() {
550537
let source_fields = vec![
551538
arc_field("left", DataType::Int32),
552539
arc_field("right", DataType::Int32),
@@ -556,7 +543,7 @@ mod tests {
556543
let result = validate_struct_compatibility(&source_fields, &target_fields);
557544
assert!(result.is_err());
558545
let error_msg = result.unwrap_err().to_string();
559-
assert!(error_msg.contains("positional mapping is ambiguous"));
546+
assert_contains!(error_msg, "no field name overlap");
560547
}
561548

562549
#[test]
@@ -665,21 +652,21 @@ mod tests {
665652
}
666653

667654
#[test]
668-
fn test_validate_struct_compatibility_positional_with_type_mismatch() {
669-
// Source struct: {left: Struct} - nested struct
670-
let source_fields =
671-
vec![arc_struct_field("left", vec![field("x", DataType::Int32)])];
655+
fn test_validate_struct_compatibility_no_overlap_equal_len() {
656+
let source_fields = vec![
657+
arc_field("left", DataType::Int32),
658+
arc_field("right", DataType::Utf8),
659+
];
672660

673-
// Target struct: {alpha: Int32} (no name overlap, incompatible type at position 0)
674-
let target_fields = vec![arc_field("alpha", DataType::Int32)];
661+
let target_fields = vec![
662+
arc_field("alpha", DataType::Int32),
663+
arc_field("beta", DataType::Utf8),
664+
];
675665

676666
let result = validate_struct_compatibility(&source_fields, &target_fields);
677667
assert!(result.is_err());
678668
let error_msg = result.unwrap_err().to_string();
679-
assert_contains!(
680-
error_msg,
681-
"Cannot cast struct field 'alpha' from type Struct(\"x\": Int32) to type Int32"
682-
);
669+
assert_contains!(error_msg, "no field name overlap");
683670
}
684671

685672
#[test]
@@ -948,7 +935,7 @@ mod tests {
948935
}
949936

950937
#[test]
951-
fn test_cast_struct_positional_when_no_overlap() {
938+
fn test_cast_struct_no_overlap_rejected() {
952939
let first = Arc::new(Int32Array::from(vec![Some(10), Some(20)])) as ArrayRef;
953940
let second =
954941
Arc::new(StringArray::from(vec![Some("alpha"), Some("beta")])) as ArrayRef;
@@ -964,17 +951,10 @@ mod tests {
964951
vec![field("a", DataType::Int64), field("b", DataType::Utf8)],
965952
);
966953

967-
let result =
968-
cast_column(&source_col, &target_field, &DEFAULT_CAST_OPTIONS).unwrap();
969-
let struct_array = result.as_any().downcast_ref::<StructArray>().unwrap();
970-
971-
let a_col = get_column_as!(&struct_array, "a", Int64Array);
972-
assert_eq!(a_col.value(0), 10);
973-
assert_eq!(a_col.value(1), 20);
974-
975-
let b_col = get_column_as!(&struct_array, "b", StringArray);
976-
assert_eq!(b_col.value(0), "alpha");
977-
assert_eq!(b_col.value(1), "beta");
954+
let result = cast_column(&source_col, &target_field, &DEFAULT_CAST_OPTIONS);
955+
assert!(result.is_err());
956+
let error_msg = result.unwrap_err().to_string();
957+
assert_contains!(error_msg, "no field name overlap");
978958
}
979959

980960
#[test]

datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ use std::ops::Not;
2828
use std::sync::Arc;
2929

3030
use datafusion_common::config::ConfigOptions;
31+
use datafusion_common::nested_struct::has_one_of_more_common_fields;
3132
use datafusion_common::{
3233
DFSchema, DataFusionError, Result, ScalarValue, exec_datafusion_err, internal_err,
3334
};
@@ -657,6 +658,11 @@ impl ConstEvaluator {
657658
return false;
658659
}
659660

661+
// Skip const-folding when there is no field name overlap
662+
if !has_one_of_more_common_fields(&source_fields, target_fields) {
663+
return false;
664+
}
665+
660666
// Don't const-fold struct casts with empty (0-row) literals
661667
// The simplifier uses a 1-row input batch, which causes dimension mismatches
662668
// when evaluating 0-row struct literals
@@ -5220,7 +5226,7 @@ mod tests {
52205226
#[test]
52215227
fn test_struct_cast_different_names_same_count() {
52225228
// Test struct cast with same field count but different names
5223-
// Field count matches; simplification should succeed
5229+
// Field count matches; simplification should be skipped because names do not overlap
52245230

52255231
let source_fields = Fields::from(vec![
52265232
Arc::new(Field::new("a", DataType::Int32, true)),
@@ -5237,14 +5243,11 @@ mod tests {
52375243
let simplifier =
52385244
ExprSimplifier::new(SimplifyContext::default().with_schema(test_schema()));
52395245

5240-
// The cast should be simplified since field counts match
5246+
// The cast should remain unchanged because there is no name overlap
52415247
let result = simplifier.simplify(expr.clone()).unwrap();
5242-
// Struct casts with same field count are const-folded to literals
5243-
assert!(matches!(result, Expr::Literal(_, _)));
5244-
// Ensure the simplifier made a change (not identical to original)
5245-
assert_ne!(
5248+
assert_eq!(
52465249
result, expr,
5247-
"Struct cast with different names but same field count should be simplified"
5250+
"Struct cast with different names but same field count should not be simplified"
52485251
);
52495252
}
52505253

datafusion/sqllogictest/test_files/joins.slt

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -57,15 +57,15 @@ statement ok
5757
CREATE TABLE join_t3(s3 struct<id INT>)
5858
AS VALUES
5959
(NULL),
60-
(struct(1)),
61-
(struct(2));
60+
({id: 1}),
61+
({id: 2});
6262

6363
statement ok
6464
CREATE TABLE join_t4(s4 struct<id INT>)
6565
AS VALUES
6666
(NULL),
67-
(struct(2)),
68-
(struct(3));
67+
({id: 2}),
68+
({id: 3});
6969

7070
# Left semi anti join
7171

datafusion/sqllogictest/test_files/struct.slt

Lines changed: 29 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -38,9 +38,9 @@ CREATE TABLE struct_values (
3838
s1 struct<INT>,
3939
s2 struct<a INT,b VARCHAR>
4040
) AS VALUES
41-
(struct(1), struct(1, 'string1')),
42-
(struct(2), struct(2, 'string2')),
43-
(struct(3), struct(3, 'string3'))
41+
(struct(1), struct(1 AS a, 'string1' AS b)),
42+
(struct(2), struct(2 AS a, 'string2' AS b)),
43+
(struct(3), struct(3 AS a, 'string3' AS b))
4444
;
4545

4646
query ??
@@ -397,7 +397,8 @@ drop view complex_view;
397397

398398
# struct with different keys r1 and r2 is not valid
399399
statement ok
400-
create table t(a struct<r1 varchar, c int>, b struct<r2 varchar, c float>) as values (struct('red', 1), struct('blue', 2.3));
400+
create table t(a struct<r1 varchar, c int>, b struct<r2 varchar, c float>) as values
401+
(struct('red' AS r1, 1 AS c), struct('blue' AS r2, 2.3 AS c));
401402

402403
# Expect same keys for struct type but got mismatched pair r1,c and r2,c
403404
query error
@@ -408,7 +409,8 @@ drop table t;
408409

409410
# struct with the same key
410411
statement ok
411-
create table t(a struct<r varchar, c int>, b struct<r varchar, c float>) as values (struct('red', 1), struct('blue', 2.3));
412+
create table t(a struct<r varchar, c int>, b struct<r varchar, c float>) as values
413+
(struct('red' AS r, 1 AS c), struct('blue' AS r, 2.3 AS c));
412414

413415
query T
414416
select arrow_typeof([a, b]) from t;
@@ -442,18 +444,18 @@ CREATE TABLE struct_values (
442444
s1 struct(a int, b varchar),
443445
s2 struct(a int, b varchar)
444446
) AS VALUES
445-
(row(1, 'red'), row(1, 'string1')),
446-
(row(2, 'blue'), row(2, 'string2')),
447-
(row(3, 'green'), row(3, 'string3'))
447+
({a: 1, b: 'red'}, {a: 1, b: 'string1'}),
448+
({a: 2, b: 'blue'}, {a: 2, b: 'string2'}),
449+
({a: 3, b: 'green'}, {a: 3, b: 'string3'})
448450
;
449451

450452
statement ok
451453
drop table struct_values;
452454

453455
statement ok
454456
create table t (c1 struct(r varchar, b int), c2 struct(r varchar, b float)) as values (
455-
row('red', 2),
456-
row('blue', 2.3)
457+
{r: 'red', b: 2},
458+
{r: 'blue', b: 2.3}
457459
);
458460

459461
query ??
@@ -501,9 +503,9 @@ CREATE TABLE t (
501503
s1 struct(a int, b varchar),
502504
s2 struct(a float, b varchar)
503505
) AS VALUES
504-
(row(1, 'red'), row(1.1, 'string1')),
505-
(row(2, 'blue'), row(2.2, 'string2')),
506-
(row(3, 'green'), row(33.2, 'string3'))
506+
({a: 1, b: 'red'}, {a: 1.1, b: 'string1'}),
507+
({a: 2, b: 'blue'}, {a: 2.2, b: 'string2'}),
508+
({a: 3, b: 'green'}, {a: 33.2, b: 'string3'})
507509
;
508510

509511
query ?
@@ -528,9 +530,9 @@ CREATE TABLE t (
528530
s1 struct(a int, b varchar),
529531
s2 struct(a float, b varchar)
530532
) AS VALUES
531-
(row(1, 'red'), row(1.1, 'string1')),
532-
(null, row(2.2, 'string2')),
533-
(row(3, 'green'), row(33.2, 'string3'))
533+
({a: 1, b: 'red'}, {a: 1.1, b: 'string1'}),
534+
(null, {a: 2.2, b: 'string2'}),
535+
({a: 3, b: 'green'}, {a: 33.2, b: 'string3'})
534536
;
535537

536538
query ?
@@ -553,8 +555,8 @@ drop table t;
553555
# row() with incorrect order - row() is positional, not name-based
554556
statement error DataFusion error: Optimizer rule 'simplify_expressions' failed[\s\S]*Arrow error: Cast error: Cannot cast string 'blue' to value of Float32 type
555557
create table t(a struct(r varchar, c int), b struct(r varchar, c float)) as values
556-
(row('red', 1), row(2.3, 'blue')),
557-
(row('purple', 1), row('green', 2.3));
558+
({r: 'red', c: 1}, {r: 2.3, c: 'blue'}),
559+
({r: 'purple', c: 1}, {r: 'green', c: 2.3});
558560

559561

560562
##################################
@@ -568,7 +570,7 @@ select [{r: 'a', c: 1}, {r: 'b', c: 2}];
568570

569571

570572
statement ok
571-
create table t(a struct(r varchar, c int), b struct(r varchar, c float)) as values (row('a', 1), row('b', 2.3));
573+
create table t(a struct(r varchar, c int), b struct(r varchar, c float)) as values ({r: 'a', c: 1}, {r: 'b', c: 2.3});
572574

573575
query T
574576
select arrow_typeof([a, b]) from t;
@@ -580,7 +582,7 @@ drop table t;
580582

581583

582584
statement ok
583-
create table t(a struct(r varchar, c int, g float), b struct(r varchar, c float, g int)) as values (row('a', 1, 2.3), row('b', 2.3, 2));
585+
create table t(a struct(r varchar, c int, g float), b struct(r varchar, c float, g int)) as values ({r: 'a', c: 1, g: 2.3}, {r: 'b', c: 2.3, g: 2});
584586

585587
# type of each column should not coerced but preserve as it is
586588
query T
@@ -602,7 +604,7 @@ drop table t;
602604
# This tests accessing struct fields using the subscript notation with string literals
603605

604606
statement ok
605-
create table test (struct_field struct(substruct int)) as values (struct(1));
607+
create table test (struct_field struct(substruct int)) as values ({substruct: 1});
606608

607609
query ??
608610
select *
@@ -615,7 +617,7 @@ statement ok
615617
DROP TABLE test;
616618

617619
statement ok
618-
create table test (struct_field struct(substruct struct(subsubstruct int))) as values (struct(struct(1)));
620+
create table test (struct_field struct(substruct struct(subsubstruct int))) as values ({substruct: {subsubstruct: 1}});
619621

620622
query ??
621623
select *
@@ -823,9 +825,9 @@ SELECT CAST({b: 3, a: 4} AS STRUCT(a BIGINT, b INT));
823825
----
824826
{a: 4, b: 3}
825827

826-
# Test positional casting when there is no name overlap
828+
# Test casting with explicit field names
827829
query ?
828-
SELECT CAST(struct(1, 'x') AS STRUCT(a INT, b VARCHAR));
830+
SELECT CAST({a: 1, b: 'x'} AS STRUCT(a INT, b VARCHAR));
829831
----
830832
{a: 1, b: x}
831833

@@ -859,9 +861,9 @@ statement ok
859861
CREATE TABLE struct_reorder_test (
860862
data STRUCT(b INT, a VARCHAR)
861863
) AS VALUES
862-
(struct(100, 'first')),
863-
(struct(200, 'second')),
864-
(struct(300, 'third'))
864+
({b: 100, a: 'first'}),
865+
({b: 200, a: 'second'}),
866+
({b: 300, a: 'third'})
865867
;
866868

867869
query ?

0 commit comments

Comments
 (0)