Skip to content

Commit 041d03b

Browse files
adriangb and claude committed
Replace generate_series examples with parquet-based TopK tests
Use a parquet table with multiple row groups and a TopK ORDER BY LIMIT query that triggers DynamicFilter pushdown. This makes the slt examples much more realistic — they show pruning metrics, row group statistics, and the resolved DynamicFilter predicate.

Add a 'timing' category example that shows only elapsed_compute and metadata_load_time (with <slt:ignore> since they are non-deterministic).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 285e03e commit 041d03b

1 file changed

Lines changed: 75 additions & 21 deletions

File tree

datafusion/sqllogictest/test_files/explain_analyze.slt

Lines changed: 75 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -70,61 +70,115 @@ reset datafusion.explain.analyze_level;
7070
# ------------------------------------------------
7171
# Test analyze_categories: filter metrics by kind
7272
# ------------------------------------------------
73+
# Categories classify metrics by determinism:
74+
# rows, bytes — depend on plan + data, deterministic across runs
75+
# timing — varies run-to-run even on same hardware
76+
77+
# --- Setup: create a small parquet table with multiple row groups ---
7378

74-
# "rows" — only row-count metrics (deterministic), no timing or bytes
7579
statement ok
76-
set datafusion.explain.analyze_categories = 'rows';
80+
set datafusion.execution.parquet.pushdown_filters = true;
81+
82+
statement ok
83+
CREATE TABLE _cat_data AS VALUES
84+
('Anow Vole', 7),
85+
('Brown Bear', 133),
86+
('Gray Wolf', 82),
87+
('Lynx', 71),
88+
('Red Fox', 40),
89+
('Alpine Bat', 6),
90+
('Nlpine Ibex', 101),
91+
('Nlpine Goat', 76),
92+
('Nlpine Sheep', 83),
93+
('Europ. Mole', 4),
94+
('Polecat', 16),
95+
('Alpine Ibex', 97);
96+
97+
statement ok
98+
COPY (SELECT column1 as species, column2 as s FROM _cat_data)
99+
TO 'test_files/scratch/explain_analyze/data.parquet'
100+
STORED AS PARQUET
101+
OPTIONS ('format.max_row_group_size' '3');
102+
103+
statement ok
104+
drop table _cat_data;
105+
106+
statement ok
107+
CREATE EXTERNAL TABLE cat_tracking
108+
STORED AS PARQUET
109+
LOCATION 'test_files/scratch/explain_analyze/data.parquet';
110+
111+
# ---- categories = 'none': plan only, no metrics at all ----
112+
113+
statement ok
114+
set datafusion.explain.analyze_level = summary;
115+
116+
statement ok
117+
set datafusion.explain.analyze_categories = 'none';
77118

78119
query TT
79-
EXPLAIN ANALYZE SELECT * FROM generate_series(100);
120+
explain analyze select * from cat_tracking where species > 'M' AND s >= 50 order by species limit 3;
80121
----
81-
Plan with Metrics LazyMemoryExec: partitions=1, batch_generators=[generate_series: start=0, end=100, batch_size=8192], metrics=[output_rows=101, output_batches=1]
122+
Plan with Metrics
123+
01)SortExec: TopK(fetch=3), expr=[species@0 ASC NULLS LAST], preserve_partitioning=[false], filter=[species@0 < Nlpine Sheep], metrics=[]
124+
02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/explain_analyze/data.parquet]]}, projection=[species, s], file_type=parquet, predicate=species@0 > M AND s@1 >= 50 AND DynamicFilter [ species@0 < Nlpine Sheep ], pruning_predicate=species_null_count@1 != row_count@2 AND species_max@0 > M AND s_null_count@4 != row_count@2 AND s_max@3 >= 50 AND species_null_count@1 != row_count@2 AND species_min@5 < Nlpine Sheep, required_guarantees=[], metrics=[]
82125

83126
statement ok
84127
reset datafusion.explain.analyze_categories;
85128

86-
# "none" — plan only, all metrics suppressed (empty brackets)
129+
# ---- categories = 'rows': deterministic row-count metrics only ----
130+
# Note: no elapsed_compute, no output_bytes, no metadata_load_time
# (bytes_scanned and scan_efficiency_ratio still appear in the expected output below)
131+
87132
statement ok
88-
set datafusion.explain.analyze_categories = 'none';
133+
set datafusion.explain.analyze_categories = 'rows';
89134

90135
query TT
91-
EXPLAIN ANALYZE SELECT * FROM generate_series(100);
136+
explain analyze select * from cat_tracking where species > 'M' AND s >= 50 order by species limit 3;
92137
----
93-
Plan with Metrics LazyMemoryExec: partitions=1, batch_generators=[generate_series: start=0, end=100, batch_size=8192], metrics=[]
138+
Plan with Metrics
139+
01)SortExec: TopK(fetch=3), expr=[species@0 ASC NULLS LAST], preserve_partitioning=[false], filter=[species@0 < Nlpine Sheep], metrics=[output_rows=3]
140+
02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/explain_analyze/data.parquet]]}, projection=[species, s], file_type=parquet, predicate=species@0 > M AND s@1 >= 50 AND DynamicFilter [ species@0 < Nlpine Sheep ], pruning_predicate=species_null_count@1 != row_count@2 AND species_max@0 > M AND s_null_count@4 != row_count@2 AND s_max@3 >= 50 AND species_null_count@1 != row_count@2 AND species_min@5 < Nlpine Sheep, required_guarantees=[], metrics=[output_rows=3, files_ranges_pruned_statistics=1 total → 1 matched, row_groups_pruned_statistics=4 total → 3 matched -> 1 fully matched, row_groups_pruned_bloom_filter=3 total → 3 matched, page_index_pages_pruned=6 total → 6 matched, limit_pruned_row_groups=0 total → 0 matched, bytes_scanned=521, scan_efficiency_ratio=22% (521/2.35 K)]
94141

95142
statement ok
96143
reset datafusion.explain.analyze_categories;
97144

98-
# "rows,bytes" — row + byte metrics, no timing
145+
# ---- categories = 'rows,bytes': add byte metrics, still no timing ----
146+
99147
statement ok
100148
set datafusion.explain.analyze_categories = 'rows,bytes';
101149

102150
query TT
103-
EXPLAIN ANALYZE SELECT * FROM generate_series(100);
151+
explain analyze select * from cat_tracking where species > 'M' AND s >= 50 order by species limit 3;
104152
----
105-
Plan with Metrics LazyMemoryExec: partitions=1, batch_generators=[generate_series: start=0, end=100, batch_size=8192], metrics=[output_rows=101, output_bytes=<slt:ignore>]
153+
Plan with Metrics
154+
01)SortExec: TopK(fetch=3), expr=[species@0 ASC NULLS LAST], preserve_partitioning=[false], filter=[species@0 < Nlpine Sheep], metrics=[output_rows=3, output_bytes=<slt:ignore>]
155+
02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/explain_analyze/data.parquet]]}, projection=[species, s], file_type=parquet, predicate=species@0 > M AND s@1 >= 50 AND DynamicFilter [ species@0 < Nlpine Sheep ], pruning_predicate=species_null_count@1 != row_count@2 AND species_max@0 > M AND s_null_count@4 != row_count@2 AND s_max@3 >= 50 AND species_null_count@1 != row_count@2 AND species_min@5 < Nlpine Sheep, required_guarantees=[], metrics=[output_rows=3, output_bytes=<slt:ignore>, files_ranges_pruned_statistics=1 total → 1 matched, row_groups_pruned_statistics=4 total → 3 matched -> 1 fully matched, row_groups_pruned_bloom_filter=3 total → 3 matched, page_index_pages_pruned=6 total → 6 matched, limit_pruned_row_groups=0 total → 0 matched, bytes_scanned=<slt:ignore>, scan_efficiency_ratio=<slt:ignore>]
106156

107157
statement ok
108158
reset datafusion.explain.analyze_categories;
109159

110-
# "rows" with dev level — per-expression timing excluded, batches included
111-
statement ok
112-
set datafusion.explain.analyze_level = dev;
160+
# ---- categories = 'timing': only timing metrics (non-deterministic) ----
113161

114162
statement ok
115-
set datafusion.explain.analyze_categories = 'rows';
163+
set datafusion.explain.analyze_categories = 'timing';
116164

117165
query TT
118-
EXPLAIN ANALYZE
119-
SELECT a
120-
FROM generate_series(1, 100) as t1(a);
166+
explain analyze select * from cat_tracking where species > 'M' AND s >= 50 order by species limit 3;
121167
----
122168
Plan with Metrics
123-
01)ProjectionExec: expr=[value@0 as a], metrics=[output_rows=100, output_batches=1]
124-
02)--LazyMemoryExec: partitions=1, batch_generators=[generate_series: start=1, end=100, batch_size=8192], metrics=[output_rows=100, output_batches=1]
169+
01)SortExec: TopK(fetch=3), expr=[species@0 ASC NULLS LAST], preserve_partitioning=[false], filter=[species@0 < Nlpine Sheep], metrics=[elapsed_compute=<slt:ignore>]
170+
02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/explain_analyze/data.parquet]]}, projection=[species, s], file_type=parquet, predicate=species@0 > M AND s@1 >= 50 AND DynamicFilter [ species@0 < Nlpine Sheep ], pruning_predicate=species_null_count@1 != row_count@2 AND species_max@0 > M AND s_null_count@4 != row_count@2 AND s_max@3 >= 50 AND species_null_count@1 != row_count@2 AND species_min@5 < Nlpine Sheep, required_guarantees=[], metrics=[elapsed_compute=<slt:ignore>, metadata_load_time=<slt:ignore>]
171+
172+
statement ok
173+
reset datafusion.explain.analyze_categories;
125174

126175
statement ok
127176
reset datafusion.explain.analyze_level;
128177

178+
# --- Teardown ---
179+
129180
statement ok
130-
reset datafusion.explain.analyze_categories;
181+
drop table cat_tracking;
182+
183+
statement ok
184+
reset datafusion.execution.parquet.pushdown_filters;

0 commit comments

Comments
 (0)