Skip to content

Commit e692e41

Browse files
committed
slt: Add slt for get_field with dict encoded structs
1 parent 2023f57 commit e692e41

2 files changed

Lines changed: 260 additions & 3 deletions

File tree

datafusion/sqllogictest/src/test_context.rs

Lines changed: 106 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -24,12 +24,14 @@ use std::sync::Arc;
2424
use std::vec;
2525

2626
use arrow::array::{
27-
Array, ArrayRef, BinaryArray, Float64Array, Int32Array, LargeBinaryArray,
28-
LargeStringArray, StringArray, TimestampNanosecondArray, UnionArray,
27+
Array, ArrayRef, BinaryArray, DictionaryArray, Float64Array, Int32Array,
28+
LargeBinaryArray, LargeStringArray, StringArray, StructArray,
29+
TimestampNanosecondArray, UInt32Array, UnionArray,
2930
};
3031
use arrow::buffer::ScalarBuffer;
3132
use arrow::datatypes::{
32-
DataType, Field, FieldRef, Schema, SchemaRef, TimeUnit, UnionFields,
33+
DataType, Field, FieldRef, Fields, Schema, SchemaRef, TimeUnit, UInt32Type,
34+
UnionFields,
3335
};
3436
use arrow::record_batch::RecordBatch;
3537
use datafusion::catalog::{
@@ -174,6 +176,10 @@ impl TestContext {
174176
info!("Registering table with union column");
175177
register_union_table(test_ctx.session_ctx())
176178
}
179+
"dictionary_struct.slt" => {
180+
info!("Registering table with dictionary-encoded struct column");
181+
register_dictionary_struct_table(test_ctx.session_ctx());
182+
}
177183
"async_udf.slt" => {
178184
info!("Registering dummy async udf");
179185
register_async_abs_udf(test_ctx.session_ctx())
@@ -584,6 +590,103 @@ fn register_union_table(ctx: &SessionContext) {
584590
ctx.register_batch("union_table", batch).unwrap();
585591
}
586592

593+
fn register_dictionary_struct_table(ctx: &SessionContext) {
594+
// Build deduplicated struct values: 3 unique structs
595+
let names =
596+
Arc::new(StringArray::from(vec!["Alice", "Bob", "Carol"])) as ArrayRef;
597+
let ids = Arc::new(Int32Array::from(vec![1, 2, 3])) as ArrayRef;
598+
599+
let struct_fields: Fields = vec![
600+
Field::new("name", DataType::Utf8, false),
601+
Field::new("id", DataType::Int32, false),
602+
]
603+
.into();
604+
605+
let values_struct = Arc::new(
606+
StructArray::try_new(struct_fields.clone(), vec![names, ids], None).unwrap(),
607+
) as ArrayRef;
608+
609+
// Dictionary keys index into the 3-element struct array.
610+
// 5 rows with repeated references to test dictionary deduplication.
611+
let keys = UInt32Array::from(vec![0u32, 1, 2, 0, 1]);
612+
let dict =
613+
DictionaryArray::<UInt32Type>::try_new(keys, values_struct.clone()).unwrap();
614+
615+
// Also build a non-dictionary plain struct column for comparison.
616+
let plain_names = Arc::new(StringArray::from(vec![
617+
"Alice", "Bob", "Carol", "Alice", "Bob",
618+
])) as ArrayRef;
619+
let plain_ids = Arc::new(Int32Array::from(vec![1, 2, 3, 1, 2])) as ArrayRef;
620+
let plain_struct = StructArray::try_new(
621+
struct_fields.clone(),
622+
vec![plain_names, plain_ids],
623+
None,
624+
)
625+
.unwrap();
626+
627+
let dict_type = DataType::Dictionary(
628+
Box::new(DataType::UInt32),
629+
Box::new(DataType::Struct(struct_fields.clone())),
630+
);
631+
632+
let schema = Schema::new(vec![
633+
Field::new("dict_struct", dict_type, false),
634+
Field::new(
635+
"plain_struct",
636+
DataType::Struct(struct_fields.clone()),
637+
false,
638+
),
639+
]);
640+
641+
let batch = RecordBatch::try_new(
642+
Arc::new(schema),
643+
vec![Arc::new(dict) as ArrayRef, Arc::new(plain_struct) as ArrayRef],
644+
)
645+
.unwrap();
646+
647+
ctx.register_batch("dict_struct_table", batch).unwrap();
648+
649+
// Second table: dictionary-encoded struct with nullable entries
650+
let names_nullable =
651+
Arc::new(StringArray::from(vec!["X", "Y"])) as ArrayRef;
652+
let ids_nullable = Arc::new(Int32Array::from(vec![10, 20])) as ArrayRef;
653+
let struct_fields_nullable: Fields = vec![
654+
Field::new("name", DataType::Utf8, false),
655+
Field::new("id", DataType::Int32, false),
656+
]
657+
.into();
658+
let values_struct_nullable = Arc::new(
659+
StructArray::try_new(
660+
struct_fields_nullable.clone(),
661+
vec![names_nullable, ids_nullable],
662+
None,
663+
)
664+
.unwrap(),
665+
) as ArrayRef;
666+
let keys_nullable =
667+
UInt32Array::from(vec![Some(0), None, Some(1), None]);
668+
let dict_nullable = DictionaryArray::<UInt32Type>::try_new(
669+
keys_nullable,
670+
values_struct_nullable,
671+
)
672+
.unwrap();
673+
674+
let dict_type_nullable = DataType::Dictionary(
675+
Box::new(DataType::UInt32),
676+
Box::new(DataType::Struct(struct_fields_nullable)),
677+
);
678+
679+
let schema_nullable =
680+
Schema::new(vec![Field::new("ds", dict_type_nullable, true)]);
681+
let batch_nullable = RecordBatch::try_new(
682+
Arc::new(schema_nullable),
683+
vec![Arc::new(dict_nullable) as ArrayRef],
684+
)
685+
.unwrap();
686+
ctx.register_batch("dict_struct_nullable", batch_nullable)
687+
.unwrap();
688+
}
689+
587690
fn register_async_abs_udf(ctx: &SessionContext) {
588691
#[derive(Debug, PartialEq, Eq, Hash)]
589692
struct AsyncAbs {
Lines changed: 154 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,154 @@
1+
# Licensed to the Apache Software Foundation (ASF) under one
2+
# or more contributor license agreements. See the NOTICE file
3+
# distributed with this work for additional information
4+
# regarding copyright ownership. The ASF licenses this file
5+
# to you under the Apache License, Version 2.0 (the
6+
# "License"); you may not use this file except in compliance
7+
# with the License. You may obtain a copy of the License at
8+
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
11+
# Unless required by applicable law or agreed to in writing,
12+
# software distributed under the License is distributed on an
13+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
# KIND, either express or implied. See the License for the
15+
# specific language governing permissions and limitations
16+
# under the License.
17+
18+
#############
19+
## Tests for get_field on dictionary-encoded struct columns
20+
#############
21+
22+
# The test fixture registers two tables:
23+
#
24+
# dict_struct_table:
25+
# dict_struct Dictionary(UInt32, Struct(name: Utf8, id: Int32)) — 5 rows, no nulls
26+
# plain_struct Struct(name: Utf8, id: Int32) — same data, not dict-encoded
27+
#
28+
# Rows (logical values):
29+
# {name: Alice, id: 1}
30+
# {name: Bob, id: 2}
31+
# {name: Carol, id: 3}
32+
# {name: Alice, id: 1}
33+
# {name: Bob, id: 2}
34+
#
35+
# dict_struct_nullable:
36+
# ds Dictionary(UInt32, Struct(name: Utf8, id: Int32)) — 4 rows, keys [0, NULL, 1, NULL]
37+
#
38+
# Rows (logical values):
39+
# {name: X, id: 10}
40+
# NULL
41+
# {name: Y, id: 20}
42+
# NULL
43+
44+
# Verify schema of dict_struct_table
45+
query TT
46+
SELECT arrow_typeof(dict_struct), arrow_typeof(plain_struct) FROM dict_struct_table LIMIT 1;
47+
----
48+
Dictionary(UInt32, Struct("name": non-null Utf8, "id": non-null Int32)) Struct("name": non-null Utf8, "id": non-null Int32)
49+
50+
# Extract a string field from dict-encoded struct using bracket notation
51+
query T
52+
SELECT dict_struct['name'] FROM dict_struct_table;
53+
----
54+
Alice
55+
Bob
56+
Carol
57+
Alice
58+
Bob
59+
60+
# Extract an integer field from dict-encoded struct
61+
query ?
62+
SELECT dict_struct['id'] FROM dict_struct_table;
63+
----
64+
1
65+
2
66+
3
67+
1
68+
2
69+
70+
# Verify the extracted field preserves dictionary encoding
71+
query T
72+
SELECT arrow_typeof(dict_struct['name']) FROM dict_struct_table LIMIT 1;
73+
----
74+
Dictionary(UInt32, Utf8)
75+
76+
query T
77+
SELECT arrow_typeof(dict_struct['id']) FROM dict_struct_table LIMIT 1;
78+
----
79+
Dictionary(UInt32, Int32)
80+
81+
# Extracted values from dict-encoded struct match the plain struct
82+
query TT
83+
SELECT dict_struct['name'], plain_struct['name'] FROM dict_struct_table;
84+
----
85+
Alice Alice
86+
Bob Bob
87+
Carol Carol
88+
Alice Alice
89+
Bob Bob
90+
91+
query ?I
92+
SELECT dict_struct['id'], plain_struct['id'] FROM dict_struct_table;
93+
----
94+
1 1
95+
2 2
96+
3 3
97+
1 1
98+
2 2
99+
100+
# Explicit get_field invocation on dict-encoded struct
101+
query T
102+
SELECT get_field(dict_struct, 'name') FROM dict_struct_table;
103+
----
104+
Alice
105+
Bob
106+
Carol
107+
Alice
108+
Bob
109+
110+
# Field extraction from dict-encoded struct with NULLs
111+
query T
112+
SELECT ds['name'] FROM dict_struct_nullable;
113+
----
114+
X
115+
NULL
116+
Y
117+
NULL
118+
119+
query ?
120+
SELECT ds['id'] FROM dict_struct_nullable;
121+
----
122+
10
123+
NULL
124+
20
125+
NULL
126+
127+
# Filtering on extracted dict-encoded struct field
128+
query T
129+
SELECT dict_struct['name'] FROM dict_struct_table WHERE dict_struct['id'] = 2;
130+
----
131+
Bob
132+
Bob
133+
134+
# Aggregation on extracted dict-encoded struct field
135+
query TI
136+
SELECT dict_struct['name'], count(*) FROM dict_struct_table GROUP BY dict_struct['name'] ORDER BY dict_struct['name'];
137+
----
138+
Alice 2
139+
Bob 2
140+
Carol 1
141+
142+
# Ordering by extracted dict-encoded struct field
143+
query T?
144+
SELECT dict_struct['name'], dict_struct['id'] FROM dict_struct_table ORDER BY dict_struct['id'] DESC;
145+
----
146+
Carol 3
147+
Bob 2
148+
Bob 2
149+
Alice 1
150+
Alice 1
151+
152+
# Error: non-existent field
153+
statement error
154+
SELECT dict_struct['nonexistent'] FROM dict_struct_table;

0 commit comments

Comments
 (0)