Skip to content

Commit 2b95cde

Browse files
Omega359 and martin-g authored
Add SQL based benchmarking harness, port tpch to use framework (#21707)
## Which issue does this PR close?

- Part of #21706

## Rationale for this change

Add a SQL-based benchmark framework, with tpch as the initial benchmark to use it. The README.md includes notes about other benchmarks, which will have individual PRs after the initial work is accepted.

## What changes are included in this PR?

Benchmarking code only.

## Are these changes tested?

Yes

## Are there any user-facing changes?

`benchmarks/bench.sh` now uses the new framework for benchmarking tpch.

## Additional info

AI assisted with refactoring and writing tests. I have reviewed all AI-produced code.

---------

Co-authored-by: Martin Grigorov <martin-g@users.noreply.github.com>
1 parent 5fda216 commit 2b95cde

36 files changed

Lines changed: 5424 additions & 23 deletions

Cargo.lock

Lines changed: 3 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

benchmarks/.gitignore

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,3 +1,5 @@
11
data
2-
results
2+
data_csv
3+
./results/
34
venv
5+
!sql_benchmarks/**/results/

benchmarks/Cargo.toml

Lines changed: 7 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -43,7 +43,8 @@ mimalloc_extended = ["libmimalloc-sys/extended"]
4343
arrow = { workspace = true }
4444
async-trait = "0.1"
4545
bytes = { workspace = true }
46-
clap = { version = "4.5.60", features = ["derive"] }
46+
clap = { version = "4.6.0", features = ["derive", "env"] }
47+
criterion = { workspace = true, features = ["html_reports"] }
4748
datafusion = { workspace = true, default-features = true }
4849
datafusion-common = { workspace = true, default-features = true }
4950
env_logger = { workspace = true }
@@ -63,3 +64,8 @@ tokio-util = { version = "0.7.17" }
6364

6465
[dev-dependencies]
6566
datafusion-proto = { workspace = true }
67+
tempfile = { workspace = true }
68+
69+
[[bench]]
70+
harness = false
71+
name = "sql"

benchmarks/bench.sh

Lines changed: 17 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -41,6 +41,7 @@ BENCHMARK=all
4141
DATAFUSION_DIR=${DATAFUSION_DIR:-$SCRIPT_DIR/..}
4242
DATA_DIR=${DATA_DIR:-$SCRIPT_DIR/data}
4343
CARGO_COMMAND=${CARGO_COMMAND:-"cargo run --release"}
44+
SQL_CARGO_COMMAND=${SQL_CARGO_COMMAND:-"cargo bench --bench sql"}
4445
PREFER_HASH_JOIN=${PREFER_HASH_JOIN:-true}
4546
SIMULATE_LATENCY=${SIMULATE_LATENCY:-false}
4647

@@ -685,14 +686,16 @@ run_tpch() {
685686
echo "Internal error: Scale factor not specified"
686687
exit 1
687688
fi
688-
TPCH_DIR="${DATA_DIR}/tpch_sf${SCALE_FACTOR}"
689-
690-
RESULTS_FILE="${RESULTS_DIR}/tpch_sf${SCALE_FACTOR}.json"
691-
echo "RESULTS_FILE: ${RESULTS_FILE}"
689+
FORMAT=$2
692690
echo "Running tpch benchmark..."
693691

694-
FORMAT=$2
695-
debug_run $CARGO_COMMAND --bin dfbench -- tpch --iterations 5 --path "${TPCH_DIR}" --scale-factor "${SCALE_FACTOR}" --prefer_hash_join "${PREFER_HASH_JOIN}" --format ${FORMAT} -o "${RESULTS_FILE}" ${QUERY_ARG} ${LATENCY_ARG}
692+
debug_run env BENCH_NAME=tpch \
693+
BENCH_SIZE="${SCALE_FACTOR}" \
694+
PREFER_HASH_JOIN="${PREFER_HASH_JOIN}" \
695+
TPCH_FILE_TYPE="${FORMAT}" \
696+
SIMULATE_LATENCY="${SIMULATE_LATENCY}" \
697+
${QUERY:+BENCH_QUERY="${QUERY}"} \
698+
bash -c "$SQL_CARGO_COMMAND"
696699
}
697700

698701
# Runs the tpch in memory (needs tpch parquet data)
@@ -702,13 +705,15 @@ run_tpch_mem() {
702705
echo "Internal error: Scale factor not specified"
703706
exit 1
704707
fi
705-
TPCH_DIR="${DATA_DIR}/tpch_sf${SCALE_FACTOR}"
706-
707-
RESULTS_FILE="${RESULTS_DIR}/tpch_mem_sf${SCALE_FACTOR}.json"
708-
echo "RESULTS_FILE: ${RESULTS_FILE}"
709708
echo "Running tpch_mem benchmark..."
710-
# -m means in memory
711-
debug_run $CARGO_COMMAND --bin dfbench -- tpch --iterations 5 --path "${TPCH_DIR}" --scale-factor "${SCALE_FACTOR}" --prefer_hash_join "${PREFER_HASH_JOIN}" -m --format parquet -o "${RESULTS_FILE}" ${QUERY_ARG} ${LATENCY_ARG}
709+
710+
debug_run env BENCH_NAME=tpch \
711+
BENCH_SIZE="${SCALE_FACTOR}" \
712+
TPCH_FILE_TYPE="mem" \
713+
PREFER_HASH_JOIN="${PREFER_HASH_JOIN}" \
714+
SIMULATE_LATENCY="${SIMULATE_LATENCY}" \
715+
${QUERY:+BENCH_QUERY="${QUERY}"} \
716+
bash -c "$SQL_CARGO_COMMAND"
712717
}
713718

714719
# Runs the tpcds benchmark

0 commit comments

Comments
 (0)