Skip to content

Commit a862f7a

Browse files
authored
Split clickbench query set into one file per query (apache#16476)
* Split clickbench query set into one file per query * Fix queries-path value in bench.sh
1 parent d6e8b07 commit a862f7a

58 files changed

Lines changed: 200 additions & 113 deletions

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

benchmarks/bench.sh

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -586,23 +586,23 @@ run_clickbench_1() {
586586
RESULTS_FILE="${RESULTS_DIR}/clickbench_1.json"
587587
echo "RESULTS_FILE: ${RESULTS_FILE}"
588588
echo "Running clickbench (1 file) benchmark..."
589-
debug_run $CARGO_COMMAND --bin dfbench -- clickbench --iterations 5 --path "${DATA_DIR}/hits.parquet" --queries-path "${SCRIPT_DIR}/queries/clickbench/queries.sql" -o "${RESULTS_FILE}"
589+
debug_run $CARGO_COMMAND --bin dfbench -- clickbench --iterations 5 --path "${DATA_DIR}/hits.parquet" --queries-path "${SCRIPT_DIR}/queries/clickbench/queries" -o "${RESULTS_FILE}"
590590
}
591591

592592
# Runs the clickbench benchmark with the partitioned parquet files
593593
run_clickbench_partitioned() {
594594
RESULTS_FILE="${RESULTS_DIR}/clickbench_partitioned.json"
595595
echo "RESULTS_FILE: ${RESULTS_FILE}"
596596
echo "Running clickbench (partitioned, 100 files) benchmark..."
597-
debug_run $CARGO_COMMAND --bin dfbench -- clickbench --iterations 5 --path "${DATA_DIR}/hits_partitioned" --queries-path "${SCRIPT_DIR}/queries/clickbench/queries.sql" -o "${RESULTS_FILE}"
597+
debug_run $CARGO_COMMAND --bin dfbench -- clickbench --iterations 5 --path "${DATA_DIR}/hits_partitioned" --queries-path "${SCRIPT_DIR}/queries/clickbench/queries" -o "${RESULTS_FILE}"
598598
}
599599

600600
# Runs the clickbench "extended" benchmark with a single large parquet file
601601
run_clickbench_extended() {
602602
RESULTS_FILE="${RESULTS_DIR}/clickbench_extended.json"
603603
echo "RESULTS_FILE: ${RESULTS_FILE}"
604604
echo "Running clickbench (1 file) extended benchmark..."
605-
debug_run $CARGO_COMMAND --bin dfbench -- clickbench --iterations 5 --path "${DATA_DIR}/hits.parquet" --queries-path "${SCRIPT_DIR}/queries/clickbench/extended.sql" -o "${RESULTS_FILE}"
605+
debug_run $CARGO_COMMAND --bin dfbench -- clickbench --iterations 5 --path "${DATA_DIR}/hits.parquet" --queries-path "${SCRIPT_DIR}/queries/clickbench/extended" -o "${RESULTS_FILE}"
606606
}
607607

608608
# Downloads the csv.gz files of the IMDB dataset from the homepage of Peter Boncz (one of the JOB paper authors)

benchmarks/queries/clickbench/README.md

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6,17 +6,17 @@ ClickBench is focused on aggregation and filtering performance (though it has no
66

77
## Files:
88

9-
- `queries.sql` - Actual ClickBench queries, downloaded from the [ClickBench repository]
10-
- `extended.sql` - "Extended" DataFusion specific queries.
9+
- `queries/*.sql` - Actual ClickBench queries, downloaded from the [ClickBench repository](https://raw.githubusercontent.com/ClickHouse/ClickBench/main/datafusion/queries.sql) and split by the `update_queries.sh` script.
10+
- `extended/*.sql` - "Extended" DataFusion-specific queries.
1111

1212
[clickbench repository]: https://github.com/ClickHouse/ClickBench/blob/main/datafusion/queries.sql
1313

1414
## "Extended" Queries
1515

1616
The "extended" queries are not part of the official ClickBench benchmark.
1717
Instead they are used to test other DataFusion features that are not covered by
18-
the standard benchmark. Each description below is for the corresponding line in
19-
`extended.sql` (line 1 is `Q0`, line 2 is `Q1`, etc.)
18+
the standard benchmark. Each description below is for the corresponding file in
19+
the `extended` directory.
2020

2121
### Q0: Data Exploration
2222

benchmarks/queries/clickbench/extended.sql

Lines changed: 0 additions & 9 deletions
This file was deleted.
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
SELECT COUNT(DISTINCT "SearchPhrase"), COUNT(DISTINCT "MobilePhone"), COUNT(DISTINCT "MobilePhoneModel") FROM hits;
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
SELECT COUNT(DISTINCT "HitColor"), COUNT(DISTINCT "BrowserCountry"), COUNT(DISTINCT "BrowserLanguage") FROM hits;
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
SELECT "BrowserCountry", COUNT(DISTINCT "SocialNetwork"), COUNT(DISTINCT "HitColor"), COUNT(DISTINCT "BrowserLanguage"), COUNT(DISTINCT "SocialAction") FROM hits GROUP BY 1 ORDER BY 2 DESC LIMIT 10;
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
SELECT "SocialSourceNetworkID", "RegionID", COUNT(*), AVG("Age"), AVG("ParamPrice"), STDDEV("ParamPrice") as s, VAR("ParamPrice") FROM hits GROUP BY "SocialSourceNetworkID", "RegionID" HAVING s IS NOT NULL ORDER BY s DESC LIMIT 10;
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
SELECT "ClientIP", "WatchID", COUNT(*) c, MIN("ResponseStartTiming") tmin, MEDIAN("ResponseStartTiming") tmed, MAX("ResponseStartTiming") tmax FROM hits WHERE "JavaEnable" = 0 GROUP BY "ClientIP", "WatchID" HAVING c > 1 ORDER BY tmed DESC LIMIT 10;
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
SELECT "ClientIP", "WatchID", COUNT(*) c, MIN("ResponseStartTiming") tmin, APPROX_PERCENTILE_CONT(0.95) WITHIN GROUP (ORDER BY "ResponseStartTiming") tp95, MAX("ResponseStartTiming") tmax FROM 'hits' WHERE "JavaEnable" = 0 GROUP BY "ClientIP", "WatchID" HAVING c > 1 ORDER BY tp95 DESC LIMIT 10;
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
SELECT COUNT(*) AS ShareCount FROM hits WHERE "IsMobile" = 1 AND "MobilePhoneModel" LIKE 'iPhone%' AND "SocialAction" = 'share' AND "SocialSourceNetworkID" IN (5, 12) AND "ClientTimeZone" BETWEEN -5 AND 5 AND regexp_match("Referer", '\/campaign\/(spring|summer)_promo') IS NOT NULL AND CASE WHEN split_part(split_part("URL", 'resolution=', 2), '&', 1) ~ '^\d+$' THEN split_part(split_part("URL", 'resolution=', 2), '&', 1)::INT ELSE 0 END > 1920 AND levenshtein(CAST("UTMSource" AS STRING), CAST("UTMCampaign" AS STRING)) < 3;

0 commit comments

Comments
 (0)