Skip to content

Commit 3b09c95

Browse files
committed
slt: Add slt for get_field with dict encoded structs
1 parent dbf1430 commit 3b09c95

2 files changed

Lines changed: 261 additions & 3 deletions

File tree

datafusion/sqllogictest/src/test_context.rs

Lines changed: 107 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -24,11 +24,14 @@ use std::sync::Arc;
2424
use std::vec;
2525

2626
use arrow::array::{
27-
Array, ArrayRef, BinaryArray, Float64Array, Int32Array, LargeBinaryArray,
28-
LargeStringArray, StringArray, TimestampNanosecondArray, UnionArray,
27+
Array, ArrayRef, BinaryArray, DictionaryArray, Float64Array, Int32Array,
28+
LargeBinaryArray, LargeStringArray, StringArray, StructArray,
29+
TimestampNanosecondArray, UInt32Array, UnionArray,
2930
};
3031
use arrow::buffer::ScalarBuffer;
31-
use arrow::datatypes::{DataType, Field, Schema, SchemaRef, TimeUnit, UnionFields};
32+
use arrow::datatypes::{
33+
DataType, Field, Fields, Schema, SchemaRef, TimeUnit, UInt32Type, UnionFields,
34+
};
3235
use arrow::record_batch::RecordBatch;
3336
use datafusion::catalog::{
3437
CatalogProvider, MemoryCatalogProvider, MemorySchemaProvider, SchemaProvider, Session,
@@ -145,6 +148,10 @@ impl TestContext {
145148
info!("Registering table with union column");
146149
register_union_table(test_ctx.session_ctx())
147150
}
151+
"dictionary_struct.slt" => {
152+
info!("Registering table with dictionary-encoded struct column");
153+
register_dictionary_struct_table(test_ctx.session_ctx());
154+
}
148155
"async_udf.slt" => {
149156
info!("Registering dummy async udf");
150157
register_async_abs_udf(test_ctx.session_ctx())
@@ -555,6 +562,103 @@ fn register_union_table(ctx: &SessionContext) {
555562
ctx.register_batch("union_table", batch).unwrap();
556563
}
557564

565+
fn register_dictionary_struct_table(ctx: &SessionContext) {
566+
// Build deduplicated struct values: 3 unique structs
567+
let names =
568+
Arc::new(StringArray::from(vec!["Alice", "Bob", "Carol"])) as ArrayRef;
569+
let ids = Arc::new(Int32Array::from(vec![1, 2, 3])) as ArrayRef;
570+
571+
let struct_fields: Fields = vec![
572+
Field::new("name", DataType::Utf8, false),
573+
Field::new("id", DataType::Int32, false),
574+
]
575+
.into();
576+
577+
let values_struct = Arc::new(
578+
StructArray::try_new(struct_fields.clone(), vec![names, ids], None).unwrap(),
579+
) as ArrayRef;
580+
581+
// Dictionary keys index into the 3-element struct array.
582+
// 5 rows with repeated references to test dictionary deduplication.
583+
let keys = UInt32Array::from(vec![0u32, 1, 2, 0, 1]);
584+
let dict =
585+
DictionaryArray::<UInt32Type>::try_new(keys, values_struct.clone()).unwrap();
586+
587+
// Also build a non-dictionary plain struct column for comparison.
588+
let plain_names = Arc::new(StringArray::from(vec![
589+
"Alice", "Bob", "Carol", "Alice", "Bob",
590+
])) as ArrayRef;
591+
let plain_ids = Arc::new(Int32Array::from(vec![1, 2, 3, 1, 2])) as ArrayRef;
592+
let plain_struct = StructArray::try_new(
593+
struct_fields.clone(),
594+
vec![plain_names, plain_ids],
595+
None,
596+
)
597+
.unwrap();
598+
599+
let dict_type = DataType::Dictionary(
600+
Box::new(DataType::UInt32),
601+
Box::new(DataType::Struct(struct_fields.clone())),
602+
);
603+
604+
let schema = Schema::new(vec![
605+
Field::new("dict_struct", dict_type, false),
606+
Field::new(
607+
"plain_struct",
608+
DataType::Struct(struct_fields.clone()),
609+
false,
610+
),
611+
]);
612+
613+
let batch = RecordBatch::try_new(
614+
Arc::new(schema),
615+
vec![Arc::new(dict) as ArrayRef, Arc::new(plain_struct) as ArrayRef],
616+
)
617+
.unwrap();
618+
619+
ctx.register_batch("dict_struct_table", batch).unwrap();
620+
621+
// Second table: dictionary-encoded struct with nullable entries
622+
let names_nullable =
623+
Arc::new(StringArray::from(vec!["X", "Y"])) as ArrayRef;
624+
let ids_nullable = Arc::new(Int32Array::from(vec![10, 20])) as ArrayRef;
625+
let struct_fields_nullable: Fields = vec![
626+
Field::new("name", DataType::Utf8, false),
627+
Field::new("id", DataType::Int32, false),
628+
]
629+
.into();
630+
let values_struct_nullable = Arc::new(
631+
StructArray::try_new(
632+
struct_fields_nullable.clone(),
633+
vec![names_nullable, ids_nullable],
634+
None,
635+
)
636+
.unwrap(),
637+
) as ArrayRef;
638+
let keys_nullable =
639+
UInt32Array::from(vec![Some(0), None, Some(1), None]);
640+
let dict_nullable = DictionaryArray::<UInt32Type>::try_new(
641+
keys_nullable,
642+
values_struct_nullable,
643+
)
644+
.unwrap();
645+
646+
let dict_type_nullable = DataType::Dictionary(
647+
Box::new(DataType::UInt32),
648+
Box::new(DataType::Struct(struct_fields_nullable)),
649+
);
650+
651+
let schema_nullable =
652+
Schema::new(vec![Field::new("ds", dict_type_nullable, true)]);
653+
let batch_nullable = RecordBatch::try_new(
654+
Arc::new(schema_nullable),
655+
vec![Arc::new(dict_nullable) as ArrayRef],
656+
)
657+
.unwrap();
658+
ctx.register_batch("dict_struct_nullable", batch_nullable)
659+
.unwrap();
660+
}
661+
558662
fn register_async_abs_udf(ctx: &SessionContext) {
559663
#[derive(Debug, PartialEq, Eq, Hash)]
560664
struct AsyncAbs {
Lines changed: 154 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,154 @@
1+
# Licensed to the Apache Software Foundation (ASF) under one
2+
# or more contributor license agreements. See the NOTICE file
3+
# distributed with this work for additional information
4+
# regarding copyright ownership. The ASF licenses this file
5+
# to you under the Apache License, Version 2.0 (the
6+
# "License"); you may not use this file except in compliance
7+
# with the License. You may obtain a copy of the License at
8+
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
11+
# Unless required by applicable law or agreed to in writing,
12+
# software distributed under the License is distributed on an
13+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
# KIND, either express or implied. See the License for the
15+
# specific language governing permissions and limitations
16+
# under the License.
17+
18+
#############
19+
## Tests for get_field on dictionary-encoded struct columns
20+
#############
21+
22+
# The test fixture registers two tables:
23+
#
24+
# dict_struct_table:
25+
# dict_struct Dictionary(UInt32, Struct(name: Utf8, id: Int32)) — 5 rows, no nulls
26+
# plain_struct Struct(name: Utf8, id: Int32) — same data, not dict-encoded
27+
#
28+
# Rows (logical values):
29+
# {name: Alice, id: 1}
30+
# {name: Bob, id: 2}
31+
# {name: Carol, id: 3}
32+
# {name: Alice, id: 1}
33+
# {name: Bob, id: 2}
34+
#
35+
# dict_struct_nullable:
36+
# ds Dictionary(UInt32, Struct(name: Utf8, id: Int32)) — 4 rows, keys [0, NULL, 1, NULL]
37+
#
38+
# Rows (logical values):
39+
# {name: X, id: 10}
40+
# NULL
41+
# {name: Y, id: 20}
42+
# NULL
43+
44+
# Verify schema of dict_struct_table
45+
query TT
46+
SELECT arrow_typeof(dict_struct), arrow_typeof(plain_struct) FROM dict_struct_table LIMIT 1;
47+
----
48+
Dictionary(UInt32, Struct("name": non-null Utf8, "id": non-null Int32)) Struct("name": non-null Utf8, "id": non-null Int32)
49+
50+
# Extract a string field from dict-encoded struct using bracket notation
51+
query T
52+
SELECT dict_struct['name'] FROM dict_struct_table;
53+
----
54+
Alice
55+
Bob
56+
Carol
57+
Alice
58+
Bob
59+
60+
# Extract an integer field from dict-encoded struct
61+
query ?
62+
SELECT dict_struct['id'] FROM dict_struct_table;
63+
----
64+
1
65+
2
66+
3
67+
1
68+
2
69+
70+
# Verify the extracted field preserves dictionary encoding
71+
query T
72+
SELECT arrow_typeof(dict_struct['name']) FROM dict_struct_table LIMIT 1;
73+
----
74+
Dictionary(UInt32, Utf8)
75+
76+
query T
77+
SELECT arrow_typeof(dict_struct['id']) FROM dict_struct_table LIMIT 1;
78+
----
79+
Dictionary(UInt32, Int32)
80+
81+
# Extracted values from dict-encoded struct match the plain struct
82+
query TT
83+
SELECT dict_struct['name'], plain_struct['name'] FROM dict_struct_table;
84+
----
85+
Alice Alice
86+
Bob Bob
87+
Carol Carol
88+
Alice Alice
89+
Bob Bob
90+
91+
query ?I
92+
SELECT dict_struct['id'], plain_struct['id'] FROM dict_struct_table;
93+
----
94+
1 1
95+
2 2
96+
3 3
97+
1 1
98+
2 2
99+
100+
# Explicit get_field invocation on dict-encoded struct
101+
query T
102+
SELECT get_field(dict_struct, 'name') FROM dict_struct_table;
103+
----
104+
Alice
105+
Bob
106+
Carol
107+
Alice
108+
Bob
109+
110+
# Field extraction from dict-encoded struct with NULLs
111+
query T
112+
SELECT ds['name'] FROM dict_struct_nullable;
113+
----
114+
X
115+
NULL
116+
Y
117+
NULL
118+
119+
query ?
120+
SELECT ds['id'] FROM dict_struct_nullable;
121+
----
122+
10
123+
NULL
124+
20
125+
NULL
126+
127+
# Filtering on extracted dict-encoded struct field
128+
query T
129+
SELECT dict_struct['name'] FROM dict_struct_table WHERE dict_struct['id'] = 2;
130+
----
131+
Bob
132+
Bob
133+
134+
# Aggregation on extracted dict-encoded struct field
135+
query TI
136+
SELECT dict_struct['name'], count(*) FROM dict_struct_table GROUP BY dict_struct['name'] ORDER BY dict_struct['name'];
137+
----
138+
Alice 2
139+
Bob 2
140+
Carol 1
141+
142+
# Ordering by extracted dict-encoded struct field
143+
query T?
144+
SELECT dict_struct['name'], dict_struct['id'] FROM dict_struct_table ORDER BY dict_struct['id'] DESC;
145+
----
146+
Carol 3
147+
Bob 2
148+
Bob 2
149+
Alice 1
150+
Alice 1
151+
152+
# Error: non-existent field
153+
statement error
154+
SELECT dict_struct['nonexistent'] FROM dict_struct_table;

0 commit comments

Comments
 (0)