Skip to content

Commit be7d7b1

Browse files
committed
fix: generate scrambled+overlapping RGs for overlap benchmark
The previous jitter formula only added overlap between adjacent row groups (RGs) but kept the overall RG order ascending by min values. This meant reorder_by_statistics was a no-op — there was nothing to reorder. Fix by bucketing rows into 60 chunks, sorting within each chunk (with jitter to create overlap), then scrambling the chunk order using a deterministic permutation. This produces RGs that are individually sorted but appear in scrambled order in the file, so reorder_by_statistics has real work to do.
1 parent 09ae8b0 commit be7d7b1

1 file changed

Lines changed: 25 additions & 16 deletions

File tree

benchmarks/bench.sh

Lines changed: 25 additions & 16 deletions
Original file line number · Diff line number · Diff line change
@@ -1196,34 +1196,43 @@ data_sort_pushdown_inexact() {
11961196
echo "Sort pushdown Inexact shuffled data generated at ${INEXACT_DIR}"
11971197
ls -la "${INEXACT_DIR}"
11981198

1199-
# Also generate a file with partially overlapping row groups.
1200-
# Simulates streaming data with network delays: each chunk is mostly
1201-
# in order but has a small overlap with the next chunk (±5% of the
1202-
# chunk range). This is the pattern described by @adriangb — data
1203-
# arriving with timestamps that are generally increasing but with
1204-
# network-induced jitter causing small overlaps between row groups.
1199+
# Also generate a file with partially overlapping AND out-of-order
1200+
# row groups. Simulates streaming data with network delays: chunks
1201+
# arrive out of sequence, and each chunk has small jitter causing
1202+
# overlap with neighbors. This is the pattern described by
1203+
# @adriangb — data arriving with timestamps that are generally
1204+
# increasing but network-induced delays cause chunks to arrive
1205+
# out of order with small overlaps between row groups.
1206+
#
1207+
# Strategy: bucket rows into 60 chunks (~100K rows each), sort
1208+
# within each chunk (with jitter for overlap), then scramble the
1209+
# chunk order using a deterministic permutation. This produces
1210+
# RGs that are individually sorted but appear in scrambled order
1211+
# in the file — so reorder_by_statistics has real work to do.
12051212
OVERLAP_DIR="${DATA_DIR}/sort_pushdown_inexact_overlap/lineitem"
12061213
if [ -d "${OVERLAP_DIR}" ] && [ "$(ls -A ${OVERLAP_DIR}/*.parquet 2>/dev/null)" ]; then
12071214
echo "Sort pushdown Inexact overlap data already exists at ${OVERLAP_DIR}"
12081215
return
12091216
fi
12101217

1211-
echo "Generating sort pushdown Inexact overlap data (partially overlapping RGs)..."
1218+
echo "Generating sort pushdown Inexact overlap data (scrambled + overlapping RGs)..."
12121219
mkdir -p "${OVERLAP_DIR}"
12131220

12141221
(cd "${SCRIPT_DIR}/.." && cargo run --release -p datafusion-cli -- -c "
12151222
CREATE EXTERNAL TABLE src
12161223
STORED AS PARQUET
12171224
LOCATION '${SRC_DIR}';
12181225
1219-
-- Add jitter to l_orderkey: shift each row by a random-ish offset
1220-
-- proportional to its position. This creates overlap between adjacent
1221-
-- row groups while preserving the general ascending trend.
1222-
-- Formula: l_orderkey + (l_orderkey * 7 % 5000) - 2500
1223-
-- This adds ±2500 jitter, creating ~5K overlap between adjacent 100K-row RGs.
1226+
-- Bucket into 60 chunks (each ~100K rows), sort within each chunk,
1227+
-- then scramble chunk order. This produces overlapping RGs that are
1228+
-- individually sorted but appear in scrambled order in the file.
12241229
COPY (
12251230
SELECT * FROM src
1226-
ORDER BY l_orderkey + (l_orderkey * 7 % 5000) - 2500
1231+
ORDER BY
1232+
-- Scramble chunk order: chunk_id -> permuted_chunk_id
1233+
(CAST(l_orderkey / 100000 AS INT) * 37 + 13) % 60,
1234+
-- Within each chunk, add small jitter for overlap
1235+
l_orderkey + (l_orderkey * 7 % 5000) - 2500
12271236
)
12281237
TO '${OVERLAP_DIR}/overlapping.parquet'
12291238
STORED AS PARQUET
@@ -1256,9 +1265,9 @@ run_sort_pushdown_inexact_unsorted() {
12561265
debug_run $CARGO_COMMAND --bin dfbench -- sort-pushdown --iterations 5 --path "${INEXACT_DIR}" --queries-path "${SCRIPT_DIR}/queries/sort_pushdown_inexact_unsorted" -o "${RESULTS_FILE}" ${QUERY_ARG} ${LATENCY_ARG}
12571266
}
12581267

1259-
# Runs the sort pushdown benchmark with partially overlapping RGs.
1260-
# Simulates streaming data with network jitter — RGs are mostly in order
1261-
# but have small overlaps (±2500 orderkey jitter between adjacent RGs).
1268+
# Runs the sort pushdown benchmark with scrambled + overlapping RGs.
1269+
# Simulates streaming data with network delays — RGs are out of order
1270+
# AND have small overlaps (jitter). Tests reorder_by_statistics effectiveness.
12621271
run_sort_pushdown_inexact_overlap() {
12631272
OVERLAP_DIR="${DATA_DIR}/sort_pushdown_inexact_overlap"
12641273
RESULTS_FILE="${RESULTS_DIR}/sort_pushdown_inexact_overlap.json"

0 commit comments

Comments (0)