@@ -1196,34 +1196,43 @@ data_sort_pushdown_inexact() {
11961196 echo " Sort pushdown Inexact shuffled data generated at ${INEXACT_DIR} "
11971197 ls -la " ${INEXACT_DIR} "
11981198
1199- # Also generate a file with partially overlapping row groups.
1200- # Simulates streaming data with network delays: each chunk is mostly
1201- # in order but has a small overlap with the next chunk (±5% of the
1202- # chunk range). This is the pattern described by @adriangb — data
1203- # arriving with timestamps that are generally increasing but with
1204- # network-induced jitter causing small overlaps between row groups.
1199+ # Also generate a file with partially overlapping AND out-of-order
1200+ # row groups. Simulates streaming data with network delays: chunks
1201+ # arrive out of sequence, and each chunk has small jitter causing
1202+ # overlap with neighbors. This is the pattern described by
1203+ # @adriangb — data arrives with timestamps that are generally
1204+ # increasing, but network-induced delays cause chunks to arrive
1205+ # out of order, with small overlaps between row groups.
1206+ #
1207+ # Strategy: bucket rows into 60 chunks by l_orderkey range (~100K
1208+ # rows each), sort within each chunk (with jitter for overlap), and
1209+ # scramble the chunk order using a deterministic permutation. This
1210+ # produces RGs that are individually sorted but appear in scrambled
1211+ # order in the file — so reorder_by_statistics has real work to do.
12051212 OVERLAP_DIR=" ${DATA_DIR} /sort_pushdown_inexact_overlap/lineitem"
12061213 if [ -d " ${OVERLAP_DIR} " ] && [ " $( ls -A ${OVERLAP_DIR} /* .parquet 2> /dev/null) " ]; then
12071214 echo " Sort pushdown Inexact overlap data already exists at ${OVERLAP_DIR} "
12081215 return
12091216 fi
12101217
1211- echo " Generating sort pushdown Inexact overlap data (partially overlapping RGs)..."
1218+ echo " Generating sort pushdown Inexact overlap data (scrambled + overlapping RGs)..."
12121219 mkdir -p " ${OVERLAP_DIR} "
12131220
12141221 (cd " ${SCRIPT_DIR} /.." && cargo run --release -p datafusion-cli -- -c "
12151222 CREATE EXTERNAL TABLE src
12161223 STORED AS PARQUET
12171224 LOCATION '${SRC_DIR} ';
12181225
1219- -- Add jitter to l_orderkey: shift each row by a random-ish offset
1220- -- proportional to its position. This creates overlap between adjacent
1221- -- row groups while preserving the general ascending trend.
1222- -- Formula: l_orderkey + (l_orderkey * 7 % 5000) - 2500
1223- -- This adds ±2500 jitter, creating ~5K overlap between adjacent 100K-row RGs.
1226+ -- Bucket into 60 chunks (each ~100K rows), sort within each chunk,
1227+ -- then scramble chunk order. This produces overlapping RGs that are
1228+ -- individually sorted but appear in scrambled order in the file.
12241229 COPY (
12251230 SELECT * FROM src
1226- ORDER BY l_orderkey + (l_orderkey * 7 % 5000) - 2500
1231+ ORDER BY
1232+ -- Scramble chunk order: chunk_id -> permuted_chunk_id
1233+ (CAST(l_orderkey / 100000 AS INT) * 37 + 13) % 60,
1234+ -- Within each chunk, add small jitter for overlap
1235+ l_orderkey + (l_orderkey * 7 % 5000) - 2500
12271236 )
12281237 TO '${OVERLAP_DIR} /overlapping.parquet'
12291238 STORED AS PARQUET
@@ -1256,9 +1265,9 @@ run_sort_pushdown_inexact_unsorted() {
12561265 debug_run $CARGO_COMMAND --bin dfbench -- sort-pushdown --iterations 5 --path " ${INEXACT_DIR} " --queries-path " ${SCRIPT_DIR} /queries/sort_pushdown_inexact_unsorted" -o " ${RESULTS_FILE} " ${QUERY_ARG} ${LATENCY_ARG}
12571266}
12581267
1259- # Runs the sort pushdown benchmark with partially overlapping RGs.
1260- # Simulates streaming data with network jitter — RGs are mostly in order
1261- # but have small overlaps (±2500 orderkey jitter between adjacent RGs) .
1268+ # Runs the sort pushdown benchmark with scrambled + overlapping RGs.
1269+ # Simulates streaming data with network delays — RGs are out of order
1270+ # AND have small overlaps (jitter). Tests reorder_by_statistics effectiveness.
12621271run_sort_pushdown_inexact_overlap () {
12631272 OVERLAP_DIR=" ${DATA_DIR} /sort_pushdown_inexact_overlap"
12641273 RESULTS_FILE=" ${RESULTS_DIR} /sort_pushdown_inexact_overlap.json"
0 commit comments