Skip to content

Commit dcd364a

Browse files
authored
Respect DATA_DIR location for sql benchmarks (#21961)
## Which issue does this PR close?

<!-- We generally require a GitHub issue to be filed for all bug fixes and enhancements and this helps us generate change logs for our releases. You can link an issue to this PR using the GitHub syntax. For example `Closes #123` indicates that this PR will close issue #123. -->

- Closes #.

## Rationale for this change

Currently, if `DATA_DIR` is set for `bench.sh`, it is not respected by the SQL benchmarks, which always fall back to `benchmarks/data`.

## What changes are included in this PR?

Updated the TPC-H SQL benchmarks to respect `DATA_DIR`, defaulting to `data` (relative to `benchmarks/`) when it is unset.

## Are these changes tested?

Yes: a unit test plus manual testing.

## Are there any user-facing changes?

No.
1 parent 2f2fe8f commit dcd364a

6 files changed

Lines changed: 96 additions & 25 deletions

File tree

benchmarks/bench.sh

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -691,6 +691,7 @@ run_tpch() {
691691

692692
debug_run env BENCH_NAME=tpch \
693693
BENCH_SIZE="${SCALE_FACTOR}" \
694+
DATA_DIR="${DATA_DIR}" \
694695
PREFER_HASH_JOIN="${PREFER_HASH_JOIN}" \
695696
TPCH_FILE_TYPE="${FORMAT}" \
696697
SIMULATE_LATENCY="${SIMULATE_LATENCY}" \
@@ -709,6 +710,7 @@ run_tpch_mem() {
709710

710711
debug_run env BENCH_NAME=tpch \
711712
BENCH_SIZE="${SCALE_FACTOR}" \
713+
DATA_DIR="${DATA_DIR}" \
712714
TPCH_FILE_TYPE="mem" \
713715
PREFER_HASH_JOIN="${PREFER_HASH_JOIN}" \
714716
SIMULATE_LATENCY="${SIMULATE_LATENCY}" \

benchmarks/sql_benchmarks/README.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -68,9 +68,10 @@ The SQL benchmarking tool uses the following environment variables:
6868
| BENCH_QUERY | A query number to run. |
6969
| BENCH_PERSIST_RESULTS | true/false to persist benchmark results. Results will be persisted in csv format so be cognizant of the size of the results. |
7070
| BENCH_VALIDATE | true/false to validate benchmark results against persisted results or result_query's. If both `BENCH_PERSIST_RESULTS` and `BENCH_VALIDATE` are true, persist mode runs and validation is skipped. |
71+
| DATA_DIR | Root directory for benchmark data loaded by SQL benchmark files. When unset, uses `data` (relative to the benchmarks/ directory). |
7172
| SIMULATE_LATENCY | Simulate object store latency to mimic remote storage (e.g. S3). Adds random latency in the range 20-200ms to each object store operation. |
7273
| MEM_POOL_TYPE | The memory pool type to use, should be one of "fair" or "greedy". |
73-
| MEMORY_LIMIT | Memory limit (e.g. '100M', '1.5G'). If not specified, run all pre-defined memory limits for given query if there's any, otherwise run with no memory limit. | |
74+
| MEMORY_LIMIT | Memory limit (e.g. '100M', '1.5G'). If not specified, run all pre-defined memory limits for given query if there's any, otherwise run with no memory limit. |
7475

7576
Example – Run the H2O window benchmarks on the 'small' sized CSV data files:
7677

benchmarks/sql_benchmarks/tpch/init/load_csv.sql

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -5,15 +5,15 @@ CREATE EXTERNAL TABLE nation
55
n_regionkey INT,
66
n_comment VARCHAR(152),
77
PRIMARY KEY (n_nationkey)
8-
) STORED AS CSV LOCATION 'data/tpch_sf${BENCH_SIZE:-1}/csv/nation/nation.1.csv';
8+
) STORED AS CSV LOCATION '${DATA_DIR:-data}/tpch_sf${BENCH_SIZE:-1}/csv/nation/nation.1.csv';
99

1010
CREATE EXTERNAL TABLE region
1111
(
1212
r_regionkey INT,
1313
r_name CHAR(25),
1414
r_comment VARCHAR(152),
1515
PRIMARY KEY (r_regionkey)
16-
) STORED AS CSV LOCATION 'data/tpch_sf${BENCH_SIZE:-1}/csv/region/region.1.csv';
16+
) STORED AS CSV LOCATION '${DATA_DIR:-data}/tpch_sf${BENCH_SIZE:-1}/csv/region/region.1.csv';
1717

1818
CREATE EXTERNAL TABLE supplier
1919
(
@@ -25,7 +25,7 @@ CREATE EXTERNAL TABLE supplier
2525
s_acctbal DECIMAL(15, 2),
2626
s_comment VARCHAR(101),
2727
PRIMARY KEY (s_suppkey)
28-
) STORED AS CSV LOCATION 'data/tpch_sf${BENCH_SIZE:-1}/csv/supplier/supplier.1.csv';
28+
) STORED AS CSV LOCATION '${DATA_DIR:-data}/tpch_sf${BENCH_SIZE:-1}/csv/supplier/supplier.1.csv';
2929

3030
CREATE EXTERNAL TABLE customer
3131
(
@@ -38,7 +38,7 @@ CREATE EXTERNAL TABLE customer
3838
c_mktsegment CHAR(10),
3939
c_comment VARCHAR(117),
4040
PRIMARY KEY (c_custkey)
41-
) STORED AS CSV LOCATION 'data/tpch_sf${BENCH_SIZE:-1}/csv/customer/customer.1.csv';
41+
) STORED AS CSV LOCATION '${DATA_DIR:-data}/tpch_sf${BENCH_SIZE:-1}/csv/customer/customer.1.csv';
4242

4343
CREATE EXTERNAL TABLE part
4444
(
@@ -52,7 +52,7 @@ CREATE EXTERNAL TABLE part
5252
p_retailprice DECIMAL(15, 2),
5353
p_comment VARCHAR(23),
5454
PRIMARY KEY (p_partkey)
55-
) STORED AS CSV LOCATION 'data/tpch_sf${BENCH_SIZE:-1}/csv/part/part.1.csv';
55+
) STORED AS CSV LOCATION '${DATA_DIR:-data}/tpch_sf${BENCH_SIZE:-1}/csv/part/part.1.csv';
5656

5757
CREATE EXTERNAL TABLE partsupp
5858
(
@@ -62,7 +62,7 @@ CREATE EXTERNAL TABLE partsupp
6262
ps_supplycost DECIMAL(15, 2),
6363
ps_comment VARCHAR(199),
6464
PRIMARY KEY (ps_partkey)
65-
) STORED AS CSV LOCATION 'data/tpch_sf${BENCH_SIZE:-1}/csv/partsupp/partsupp.1.csv';
65+
) STORED AS CSV LOCATION '${DATA_DIR:-data}/tpch_sf${BENCH_SIZE:-1}/csv/partsupp/partsupp.1.csv';
6666

6767
CREATE EXTERNAL TABLE orders
6868
(
@@ -76,7 +76,7 @@ CREATE EXTERNAL TABLE orders
7676
o_shippriority INT,
7777
o_comment VARCHAR(79),
7878
PRIMARY KEY (o_orderkey)
79-
) STORED AS CSV LOCATION 'data/tpch_sf${BENCH_SIZE:-1}/csv/orders/orders.1.csv';
79+
) STORED AS CSV LOCATION '${DATA_DIR:-data}/tpch_sf${BENCH_SIZE:-1}/csv/orders/orders.1.csv';
8080

8181
CREATE EXTERNAL TABLE lineitem
8282
(
@@ -96,4 +96,4 @@ CREATE EXTERNAL TABLE lineitem
9696
l_shipinstruct CHAR(25),
9797
l_shipmode CHAR(10),
9898
l_comment VARCHAR(44)
99-
) STORED AS CSV LOCATION 'data/tpch_sf${BENCH_SIZE:-1}/csv/lineitem/lineitem.1.csv';
99+
) STORED AS CSV LOCATION '${DATA_DIR:-data}/tpch_sf${BENCH_SIZE:-1}/csv/lineitem/lineitem.1.csv';

benchmarks/sql_benchmarks/tpch/init/load_mem.sql

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,18 @@
1-
CREATE EXTERNAL TABLE nation_raw STORED AS PARQUET LOCATION 'data/tpch_sf${BENCH_SIZE:-1}/nation/nation.1.parquet';
1+
CREATE EXTERNAL TABLE nation_raw STORED AS PARQUET LOCATION '${DATA_DIR:-data}/tpch_sf${BENCH_SIZE:-1}/nation/nation.1.parquet';
22

3-
CREATE EXTERNAL TABLE region_raw STORED AS PARQUET LOCATION 'data/tpch_sf${BENCH_SIZE:-1}/region/region.1.parquet';
3+
CREATE EXTERNAL TABLE region_raw STORED AS PARQUET LOCATION '${DATA_DIR:-data}/tpch_sf${BENCH_SIZE:-1}/region/region.1.parquet';
44

5-
CREATE EXTERNAL TABLE supplier_raw STORED AS PARQUET LOCATION 'data/tpch_sf${BENCH_SIZE:-1}/supplier/supplier.1.parquet';
5+
CREATE EXTERNAL TABLE supplier_raw STORED AS PARQUET LOCATION '${DATA_DIR:-data}/tpch_sf${BENCH_SIZE:-1}/supplier/supplier.1.parquet';
66

7-
CREATE EXTERNAL TABLE customer_raw STORED AS PARQUET LOCATION 'data/tpch_sf${BENCH_SIZE:-1}/customer/customer.1.parquet';
7+
CREATE EXTERNAL TABLE customer_raw STORED AS PARQUET LOCATION '${DATA_DIR:-data}/tpch_sf${BENCH_SIZE:-1}/customer/customer.1.parquet';
88

9-
CREATE EXTERNAL TABLE part_raw STORED AS PARQUET LOCATION 'data/tpch_sf${BENCH_SIZE:-1}/part/part.1.parquet';
9+
CREATE EXTERNAL TABLE part_raw STORED AS PARQUET LOCATION '${DATA_DIR:-data}/tpch_sf${BENCH_SIZE:-1}/part/part.1.parquet';
1010

11-
CREATE EXTERNAL TABLE partsupp_raw STORED AS PARQUET LOCATION 'data/tpch_sf${BENCH_SIZE:-1}/partsupp/partsupp.1.parquet';
11+
CREATE EXTERNAL TABLE partsupp_raw STORED AS PARQUET LOCATION '${DATA_DIR:-data}/tpch_sf${BENCH_SIZE:-1}/partsupp/partsupp.1.parquet';
1212

13-
CREATE EXTERNAL TABLE orders_raw STORED AS PARQUET LOCATION 'data/tpch_sf${BENCH_SIZE:-1}/orders/orders.1.parquet';
13+
CREATE EXTERNAL TABLE orders_raw STORED AS PARQUET LOCATION '${DATA_DIR:-data}/tpch_sf${BENCH_SIZE:-1}/orders/orders.1.parquet';
1414

15-
CREATE EXTERNAL TABLE lineitem_raw STORED AS PARQUET LOCATION 'data/tpch_sf${BENCH_SIZE:-1}/lineitem/lineitem.1.parquet';
15+
CREATE EXTERNAL TABLE lineitem_raw STORED AS PARQUET LOCATION '${DATA_DIR:-data}/tpch_sf${BENCH_SIZE:-1}/lineitem/lineitem.1.parquet';
1616

1717
CREATE TABLE nation as SELECT * FROM nation_raw;
1818

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,15 @@
1-
CREATE EXTERNAL TABLE nation STORED AS PARQUET LOCATION 'data/tpch_sf${BENCH_SIZE:-1}/nation/nation.1.parquet';
1+
CREATE EXTERNAL TABLE nation STORED AS PARQUET LOCATION '${DATA_DIR:-data}/tpch_sf${BENCH_SIZE:-1}/nation/nation.1.parquet';
22

3-
CREATE EXTERNAL TABLE region STORED AS PARQUET LOCATION 'data/tpch_sf${BENCH_SIZE:-1}/region/region.1.parquet';
3+
CREATE EXTERNAL TABLE region STORED AS PARQUET LOCATION '${DATA_DIR:-data}/tpch_sf${BENCH_SIZE:-1}/region/region.1.parquet';
44

5-
CREATE EXTERNAL TABLE supplier STORED AS PARQUET LOCATION 'data/tpch_sf${BENCH_SIZE:-1}/supplier/supplier.1.parquet';
5+
CREATE EXTERNAL TABLE supplier STORED AS PARQUET LOCATION '${DATA_DIR:-data}/tpch_sf${BENCH_SIZE:-1}/supplier/supplier.1.parquet';
66

7-
CREATE EXTERNAL TABLE customer STORED AS PARQUET LOCATION 'data/tpch_sf${BENCH_SIZE:-1}/customer/customer.1.parquet';
7+
CREATE EXTERNAL TABLE customer STORED AS PARQUET LOCATION '${DATA_DIR:-data}/tpch_sf${BENCH_SIZE:-1}/customer/customer.1.parquet';
88

9-
CREATE EXTERNAL TABLE part STORED AS PARQUET LOCATION 'data/tpch_sf${BENCH_SIZE:-1}/part/part.1.parquet';
9+
CREATE EXTERNAL TABLE part STORED AS PARQUET LOCATION '${DATA_DIR:-data}/tpch_sf${BENCH_SIZE:-1}/part/part.1.parquet';
1010

11-
CREATE EXTERNAL TABLE partsupp STORED AS PARQUET LOCATION 'data/tpch_sf${BENCH_SIZE:-1}/partsupp/partsupp.1.parquet';
11+
CREATE EXTERNAL TABLE partsupp STORED AS PARQUET LOCATION '${DATA_DIR:-data}/tpch_sf${BENCH_SIZE:-1}/partsupp/partsupp.1.parquet';
1212

13-
CREATE EXTERNAL TABLE orders STORED AS PARQUET LOCATION 'data/tpch_sf${BENCH_SIZE:-1}/orders/orders.1.parquet';
13+
CREATE EXTERNAL TABLE orders STORED AS PARQUET LOCATION '${DATA_DIR:-data}/tpch_sf${BENCH_SIZE:-1}/orders/orders.1.parquet';
1414

15-
CREATE EXTERNAL TABLE lineitem STORED AS PARQUET LOCATION 'data/tpch_sf${BENCH_SIZE:-1}/lineitem/lineitem.1.parquet';
15+
CREATE EXTERNAL TABLE lineitem STORED AS PARQUET LOCATION '${DATA_DIR:-data}/tpch_sf${BENCH_SIZE:-1}/lineitem/lineitem.1.parquet';

benchmarks/src/sql_benchmark.rs

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2230,6 +2230,74 @@ NULL|(empty)
22302230
);
22312231
}
22322232

2233+
#[tokio::test]
2234+
async fn parser_applies_data_dir_replacement_in_load_query_file() {
2235+
let temp_dir = tempdir().expect("failed to create benchmark test directory");
2236+
let data_dir = temp_dir.path().join("non_default_data");
2237+
let csv_dir = data_dir.join("tpch_sf1/csv/generated");
2238+
fs::create_dir_all(&csv_dir).expect("failed to create generated data directory");
2239+
fs::write(csv_dir.join("generated.1.csv"), "value\n42\n")
2240+
.expect("failed to write generated csv file");
2241+
2242+
let load_path = write_test_file(
2243+
&temp_dir,
2244+
"load_generated_csv.sql",
2245+
"CREATE EXTERNAL TABLE generated(value INT) STORED AS CSV LOCATION '${DATA_DIR:-data}/tpch_sf${BENCH_SIZE:-1}/csv/generated/generated.1.csv' OPTIONS ('format.has_header' 'true');\n",
2246+
);
2247+
let template_path = write_test_file(
2248+
&temp_dir,
2249+
"load_file_template.benchmark",
2250+
&format!(
2251+
"load {}\n\nrun\nSELECT value FROM generated;\n",
2252+
load_path.display()
2253+
),
2254+
);
2255+
let benchmark_path = write_test_file(
2256+
&temp_dir,
2257+
"load_file_driver.benchmark",
2258+
&format!(
2259+
"template {}\nDATA_DIR={}\n",
2260+
template_path.display(),
2261+
data_dir.display()
2262+
),
2263+
);
2264+
2265+
let ctx = SessionContext::new();
2266+
let path_string = benchmark_path.to_string_lossy().into_owned();
2267+
let mut benchmark = SqlBenchmark::new(&ctx, &path_string, "/tmp")
2268+
.await
2269+
.expect("benchmark should parse");
2270+
2271+
let load_queries = benchmark
2272+
.queries()
2273+
.get(&QueryDirective::Load)
2274+
.expect("load queries");
2275+
assert_eq!(load_queries.len(), 1);
2276+
assert!(
2277+
load_queries.iter().all(|query| !query.contains("${")),
2278+
"all placeholders should be replaced: {load_queries:?}"
2279+
);
2280+
let expected_location = format!(
2281+
"LOCATION '{}/tpch_sf1/csv/generated/generated.1.csv'",
2282+
data_dir.display()
2283+
);
2284+
assert!(
2285+
load_queries[0].contains(&expected_location),
2286+
"all load locations should use the non-default DATA_DIR: {load_queries:?}"
2287+
);
2288+
2289+
benchmark
2290+
.initialize(&ctx)
2291+
.await
2292+
.expect("benchmark should load generated csv file");
2293+
benchmark
2294+
.run(&ctx, true)
2295+
.await
2296+
.expect("benchmark should read generated csv file");
2297+
2298+
assert_eq!(formatted_last_results(&benchmark), vec![vec!["42"]]);
2299+
}
2300+
22332301
#[tokio::test]
22342302
async fn parser_rejects_inline_sql_when_query_file_is_provided() {
22352303
let temp_dir = tempdir().expect("failed to create benchmark test directory");

0 commit comments

Comments
 (0)