From f96ee00f2d743bad38def425035192dc4ac8ce9f Mon Sep 17 00:00:00 2001 From: Matthew Kim <38759997+friendlymatthew@users.noreply.github.com> Date: Mon, 30 Mar 2026 11:17:40 -0400 Subject: [PATCH] add more benches --- .../core/benches/parquet_struct_projection.rs | 85 ++++++++++++++++++- 1 file changed, 84 insertions(+), 1 deletion(-) diff --git a/datafusion/core/benches/parquet_struct_projection.rs b/datafusion/core/benches/parquet_struct_projection.rs index 65b3905da89a0..7d5b220d397f8 100644 --- a/datafusion/core/benches/parquet_struct_projection.rs +++ b/datafusion/core/benches/parquet_struct_projection.rs @@ -404,10 +404,93 @@ fn nested_benchmarks(c: &mut Criterion) { drop(temp_file); } +fn flat_schema() -> SchemaRef { + Arc::new(Schema::new(vec![ + Field::new("id", DataType::Int32, false), + Field::new("large_string", DataType::Utf8, false), + Field::new("small_int", DataType::Int32, false), + ])) +} + +fn flat_batch(batch_id: usize) -> RecordBatch { + let schema = flat_schema(); + let len = WRITE_RECORD_BATCH_SIZE; + + let base_id = (batch_id * len) as i32; + let id_values: Vec = (0..len).map(|i| base_id + i as i32).collect(); + let id_array = Arc::new(Int32Array::from(id_values.clone())); + let small_int_array = Arc::new(Int32Array::from(id_values)); + + let large_string: String = "x".repeat(LARGE_STRING_LEN); + let mut string_builder = StringBuilder::new(); + for _ in 0..len { + string_builder.append_value(&large_string); + } + let large_string_array = Arc::new(string_builder.finish()); + + RecordBatch::try_new( + schema, + vec![id_array, large_string_array as ArrayRef, small_int_array], + ) + .unwrap() +} + +/// Compare selecting a small field from a flat (top-level) schema vs from +/// inside a struct. Both files contain the same logical data — the only +/// difference is whether `small_int` lives at the top level or nested inside +/// a struct column. +fn flat_vs_struct_benchmarks(c: &mut Criterion) { + let flat_file = generate_file(flat_schema(), flat_batch, "flat"); + let flat_path = flat_file.path().display().to_string(); + assert!(Path::new(&flat_path).exists(), "path not found"); + + let struct_file = generate_file(narrow_schema(), narrow_batch, "narrow_struct_cmp"); + let struct_path = struct_file.path().display().to_string(); + assert!(Path::new(&struct_path).exists(), "path not found"); + + let rt = Runtime::new().unwrap(); + let flat_ctx = create_context(&rt, &flat_path, "t"); + let struct_ctx = create_context(&rt, &struct_path, "t"); + + let mut group = c.benchmark_group("flat_vs_struct"); + group.sample_size(10); + group.warm_up_time(Duration::from_secs(1)); + group.measurement_time(Duration::from_secs(2)); + + // small int: top-level vs struct field + group.bench_function("flat_select_small_int", |b| { + b.iter(|| query(&flat_ctx, &rt, "SELECT small_int FROM t")) + }); + group.bench_function("struct_select_small_int", |b| { + b.iter(|| query(&struct_ctx, &rt, "SELECT s['small_int'] FROM t")) + }); + + // large string: top-level vs struct field + group.bench_function("flat_select_large_string", |b| { + b.iter(|| query(&flat_ctx, &rt, "SELECT large_string FROM t")) + }); + group.bench_function("struct_select_large_string", |b| { + b.iter(|| query(&struct_ctx, &rt, "SELECT s['large_string'] FROM t")) + }); + + // aggregation: SUM of small int + group.bench_function("flat_sum_small_int", |b| { + b.iter(|| query(&flat_ctx, &rt, "SELECT SUM(small_int) FROM t")) + }); + group.bench_function("struct_sum_small_int", |b| { + b.iter(|| query(&struct_ctx, &rt, "SELECT SUM(s['small_int']) FROM t")) + }); + + group.finish(); + drop(flat_file); + drop(struct_file); +} + criterion_group!( benches, narrow_benchmarks, wide_benchmarks, - nested_benchmarks + nested_benchmarks, + flat_vs_struct_benchmarks, ); criterion_main!(benches);