apache
diff --git a/‎.github/workflows/extended.yml‎
Lines changed: 2 additions & 4 deletions b/‎.github/workflows/extended.yml‎
Lines changed: 2 additions & 4 deletions
diff --git a/‎Cargo.toml‎
Lines changed: 23 additions & 33 deletions b/‎Cargo.toml‎
Lines changed: 23 additions & 33 deletions
diff --git a/‎benchmarks/README.md‎
Lines changed: 1 addition & 1 deletion b/‎benchmarks/README.md‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎benchmarks/bench.sh‎
Lines changed: 30 additions & 0 deletions b/‎benchmarks/bench.sh‎
Lines changed: 30 additions & 0 deletions
diff --git a/‎benchmarks/compile_profile.py‎
Lines changed: 33 additions & 10 deletions b/‎benchmarks/compile_profile.py‎
Lines changed: 33 additions & 10 deletions
diff --git a/‎benchmarks/queries/sort_pushdown/q1.sql‎
Lines changed: 6 additions & 0 deletions b/‎benchmarks/queries/sort_pushdown/q1.sql‎
Lines changed: 6 additions & 0 deletions
diff --git a/‎benchmarks/queries/sort_pushdown/q2.sql‎
Lines changed: 7 additions & 0 deletions b/‎benchmarks/queries/sort_pushdown/q2.sql‎
Lines changed: 7 additions & 0 deletions
diff --git a/‎benchmarks/queries/sort_pushdown/q3.sql‎
Lines changed: 5 additions & 0 deletions b/‎benchmarks/queries/sort_pushdown/q3.sql‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎benchmarks/queries/sort_pushdown/q4.sql‎
Lines changed: 5 additions & 0 deletions b/‎benchmarks/queries/sort_pushdown/q4.sql‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎benchmarks/src/bin/dfbench.rs‎
Lines changed: 4 additions & 1 deletion b/‎benchmarks/src/bin/dfbench.rs‎
Lines changed: 4 additions & 1 deletion
@@ -93,8 +93,7 @@ jobs:
   linux-test-extended:
     name: cargo test 'extended_tests' (amd64)
     needs: [linux-build-lib]
-    runs-on: ${{ github.repository_owner == 'apache' && format('runs-on={0},family=m8a+m7a+c8a,cpu=32,image=ubuntu24-full-x64,extras=s3-cache,disk=large,tag=datafusion,spot=false', github.run_id) || 'ubuntu-latest' }}
-    # spot=false because the tests are long, https://runs-on.com/configuration/spot-instances/#disable-spot-pricing
+    runs-on: ${{ github.repository_owner == 'apache' && format('runs-on={0},family=m8a+m7a+c8a,cpu=32,image=ubuntu24-full-x64,extras=s3-cache,disk=large,tag=datafusion', github.run_id) || 'ubuntu-latest' }}
     # note: do not use amd/rust container to preserve disk space
     steps:
       - uses: runs-on/action@cd2b598b0515d39d78c38a02d529db87d2196d1e  # v2.0.3
@@ -162,8 +161,7 @@ jobs:
 
   sqllogictest-sqlite:
     name: "Run sqllogictests with the sqlite test suite"
-    runs-on: ${{ github.repository_owner == 'apache' && format('runs-on={0},family=m8a+m7a+c8a,cpu=48,image=ubuntu24-full-x64,extras=s3-cache,disk=large,tag=datafusion,spot=false', github.run_id) || 'ubuntu-latest' }}
-    # spot=false because the tests are long, https://runs-on.com/configuration/spot-instances/#disable-spot-pricing
+    runs-on: ${{ github.repository_owner == 'apache' && format('runs-on={0},family=m8a+m7a+c8a,cpu=32,image=ubuntu24-full-x64,extras=s3-cache,disk=large,tag=datafusion', github.run_id) || 'ubuntu-latest' }}
     container:
       image: amd64/rust
     steps:
 
@@ -224,62 +224,57 @@ unused_qualifications = "deny"
 # --------------------
 # Compilation Profiles
 # --------------------
-#  A Cargo profile is a preset for the compiler/linker knobs that trade off:
+# A Cargo profile is a preset for the compiler/linker knobs that trades off:
 # - Build time: how quickly code compiles and links
 # - Runtime performance: how fast the resulting binaries execute
 # - Binary size: how large the executables end up
 # - Debuggability: how much debug information is preserved for debugging and profiling
 #
+# To use a profile: `cargo [ build | run | ... ] --profile <profile-name>`
+#
 # Profiles available:
-# - dev: default debug build; fastest to compile, slowest to run, full debug info
-#     for everyday development.
-#     Run: cargo run
-# - release: optimized build; slowest to compile, fastest to run, smallest
-#     binaries for public releases.
-#     Run: cargo run --release
-# - release-nonlto: skips LTO, so it builds quicker while staying close to
-#     release performance. It is useful when developing performance optimizations.
-#     Run: cargo run --profile release-nonlto
+# - dev: default debug build; fastest to compile, slowest to run, full debug info.
+#     For everyday development; default for "cargo [ build | test | run ]".
+# - release: fully optimized build; slowest to compile, fastest to run, smallest
+#     binaries. For public releases; default for "cargo [ bench | install ]".
+# - release-nonlto: skips LTO, so it builds much faster while staying close to
+#     release performance. Useful when developing performance optimizations.
 # - profiling: inherits release optimizations but retains debug info to support
 #     profiling tools and flamegraphs.
-#     Run: cargo run --profile profiling
-# - ci: derived from `dev` but disables incremental builds and strips dependency
-#     symbols to keep CI artifacts small and reproducible.
-#     Run: cargo run --profile ci
-# - ci-optimized: derived from `release` but enables debug assertions, and uses
-#     lighter optimizations. Used for long-running CI tasks.
-#     Run: cargo run --profile ci-release
+# - ci: derived from `dev` but disables debug info and incremental builds to keep
+#     CI artifacts small and reproducible.
+# - ci-optimized: derived from `release` but enables debug assertions and uses
+#     less aggressive optimizations for faster builds. Used for long-running CI
+#     tasks.
 #
 # If you want to optimize compilation, the `compile_profile` benchmark can be useful.
 # See `benchmarks/README.md` for more details.
 [profile.release]
 codegen-units = 1
 lto = true
-strip = true      # Eliminate debug information to minimize binary size
+strip = true      # Eliminate debug info to minimize binary size
 
 [profile.release-nonlto]
-codegen-units = 16
-debug-assertions = false
-incremental = false
 inherits = "release"
+codegen-units = 16
 lto = false
-opt-level = 3
-overflow-checks = false
-rpath = false
-strip = false            # Retain debug info for flamegraphs
+strip = false        # Retain debug info for flamegraphs
+
+[profile.profiling]
+inherits = "release"
+debug = true
+strip = false
 
 [profile.ci-optimized]
 inherits = "release"
 debug-assertions = true
 codegen-units = 16
 lto = "thin"
-strip = true
 
 [profile.ci]
-debug = false
 inherits = "dev"
+debug = false
 incremental = false
-debug-assertions = true
 
 # This rule applies to every package except workspace members (dependencies
 # such as `arrow` and `tokio`). It disables debug info and related features on
@@ -289,8 +284,3 @@ debug = false
 debug-assertions = false
 strip = "debuginfo"
 incremental = false
-
-[profile.profiling]
-inherits = "release"
-debug = true
-strip = false
@@ -95,7 +95,7 @@ Generate the data required for the compile profile helper (TPC-H SF=1):
 ./bench.sh data compile_profile
 ```
 
-Run the benchmark across all default Cargo profiles (`dev`, `release`, `ci`, `release-nonlto`):
+Run the benchmark across all default Cargo profiles (`dev`, `release`, `ci`, `ci-optimized`, `release-nonlto`, `profiling`):
 
 ```shell
 ./bench.sh run compile_profile
 
@@ -106,6 +106,10 @@ clickbench_partitioned: ClickBench queries against partitioned (100 files) parqu
 clickbench_pushdown:    ClickBench queries against partitioned (100 files) parquet w/ filter_pushdown enabled
 clickbench_extended:    ClickBench \"inspired\" queries against a single parquet (DataFusion specific)
 
+# Sort Pushdown Benchmarks
+sort_pushdown:          Sort pushdown baseline (no WITH ORDER) on TPC-H data (SF=1)
+sort_pushdown_sorted:   Sort pushdown with WITH ORDER — tests sort elimination on non-overlapping files
+
 # Sorted Data Benchmarks (ORDER BY Optimization)
 clickbench_sorted:     ClickBench queries on pre-sorted data using prefer_existing_sort (tests sort elimination optimization)
 
@@ -309,6 +313,10 @@ main() {
                     # same data as for tpch
                     data_tpch "1" "parquet"
                     ;;
+                sort_pushdown|sort_pushdown_sorted)
+                    # same data as for tpch
+                    data_tpch "1" "parquet"
+                    ;;
                 sort_tpch)
                     # same data as for tpch
                     data_tpch "1" "parquet"
@@ -509,6 +517,12 @@ main() {
                 external_aggr)
                     run_external_aggr
                     ;;
+                sort_pushdown)
+                    run_sort_pushdown
+                    ;;
+                sort_pushdown_sorted)
+                    run_sort_pushdown_sorted
+                    ;;
                 sort_tpch)
                     run_sort_tpch "1"
                     ;;
@@ -1070,6 +1084,22 @@ run_external_aggr() {
     debug_run $CARGO_COMMAND --bin external_aggr -- benchmark --partitions 4 --iterations 5 --path "${TPCH_DIR}" -o "${RESULTS_FILE}" ${QUERY_ARG}
 }
 
+# Runs the sort pushdown baseline on TPC-H SF1 data via `dfbench sort-pushdown` (no --sorted, i.e. without WITH ORDER)
+run_sort_pushdown() {
+    TPCH_DIR="${DATA_DIR}/tpch_sf1"
+    RESULTS_FILE="${RESULTS_DIR}/sort_pushdown.json"
+    echo "Running sort pushdown benchmark (no WITH ORDER)..."
+    debug_run $CARGO_COMMAND --bin dfbench -- sort-pushdown --iterations 5 --path "${TPCH_DIR}" -o "${RESULTS_FILE}" ${QUERY_ARG} ${LATENCY_ARG}
+}
+
+# Runs the sort pushdown benchmark on TPC-H SF1 data via `dfbench sort-pushdown --sorted` (WITH ORDER; enables sort elimination)
+run_sort_pushdown_sorted() {
+    TPCH_DIR="${DATA_DIR}/tpch_sf1"
+    RESULTS_FILE="${RESULTS_DIR}/sort_pushdown_sorted.json"
+    echo "Running sort pushdown benchmark (with WITH ORDER)..."
+    debug_run $CARGO_COMMAND --bin dfbench -- sort-pushdown --sorted --iterations 5 --path "${TPCH_DIR}" -o "${RESULTS_FILE}" ${QUERY_ARG} ${LATENCY_ARG}
+}
+
 # Runs the sort integration benchmark
 run_sort_tpch() {
     SCALE_FACTOR=$1
 
@@ -19,8 +19,10 @@
 
 """Compile profile benchmark runner for DataFusion.
 
-Builds the `tpch` benchmark binary with several Cargo profiles (e.g. `--release` or `--profile ci`), runs the full TPC-H suite against the Parquet data under `benchmarks/data/tpch_sf1`, and reports compile time, execution time, and resulting 
-binary size.
+Builds the `dfbench` benchmark binary with several Cargo profiles
+(e.g. `--release` or `--profile ci`), runs the full TPC-H suite against
+the Parquet data under `benchmarks/data/tpch_sf1`, and reports compile
+time, execution time, and resulting binary size.
 
 See `benchmarks/README.md` for usage.
 """
@@ -40,12 +42,15 @@
 DEFAULT_ITERATIONS = 1
 DEFAULT_FORMAT = "parquet"
 DEFAULT_PARTITIONS: int | None = None
-TPCH_BINARY = "tpch.exe" if os.name == "nt" else "tpch"
+BENCHMARK_PACKAGE = "datafusion-benchmarks"
+BENCHMARK_BINARY = "dfbench.exe" if os.name == "nt" else "dfbench"
 PROFILE_TARGET_DIR = {
     "dev": "debug",
     "release": "release",
     "ci": "ci",
+    "ci-optimized": "ci-optimized",
     "release-nonlto": "release-nonlto",
+    "profiling": "profiling",
 }
 
 
@@ -62,7 +67,10 @@ def parse_args() -> argparse.Namespace:
         "--profiles",
         nargs="+",
         default=list(PROFILE_TARGET_DIR.keys()),
-        help="Cargo profiles to test (default: dev release ci release-nonlto)",
+        help=(
+            "Cargo profiles to test "
+            "(default: dev release ci ci-optimized release-nonlto profiling)"
+        ),
     )
     parser.add_argument(
         "--data",
@@ -84,9 +92,25 @@ def timed_run(command: Iterable[str]) -> float:
 
 def cargo_build(profile: str) -> float:
     if profile == "dev":
-        command = ["cargo", "build", "--bin", "tpch"]
+        command = [
+            "cargo",
+            "build",
+            "--package",
+            BENCHMARK_PACKAGE,
+            "--bin",
+            "dfbench",
+        ]
     else:
-        command = ["cargo", "build", "--profile", profile, "--bin", "tpch"]
+        command = [
+            "cargo",
+            "build",
+            "--profile",
+            profile,
+            "--package",
+            BENCHMARK_PACKAGE,
+            "--bin",
+            "dfbench",
+        ]
     return timed_run(command)
 
 
@@ -102,14 +126,13 @@ def run_benchmark(profile: str, data_path: Path) -> float:
     binary_dir = PROFILE_TARGET_DIR.get(profile)
     if not binary_dir:
         raise ValueError(f"unknown profile '{profile}'")
-    binary_path = REPO_ROOT / "target" / binary_dir / TPCH_BINARY
+    binary_path = REPO_ROOT / "target" / binary_dir / BENCHMARK_BINARY
     if not binary_path.exists():
         raise FileNotFoundError(f"compiled binary not found at {binary_path}")
 
     command = [
         str(binary_path),
-        "benchmark",
-        "datafusion",
+        "tpch",
         "--iterations",
         str(DEFAULT_ITERATIONS),
         "--path",
@@ -132,7 +155,7 @@ def run_benchmark(profile: str, data_path: Path) -> float:
 
 def binary_size(profile: str) -> int:
     binary_dir = PROFILE_TARGET_DIR[profile]
-    binary_path = REPO_ROOT / "target" / binary_dir / TPCH_BINARY
+    binary_path = REPO_ROOT / "target" / binary_dir / BENCHMARK_BINARY
     return binary_path.stat().st_size
 
 
 
@@ -0,0 +1,6 @@
+-- Sort elimination: ORDER BY sort key ASC (full scan)
+-- With --sorted: SortExec removed, sequential scan in file order
+-- Without --sorted: full SortExec required
+SELECT l_orderkey, l_partkey, l_suppkey
+FROM lineitem
+ORDER BY l_orderkey
@@ -0,0 +1,7 @@
+-- Sort elimination + limit pushdown
+-- With --sorted: SortExec removed + limit pushed to DataSourceExec
+-- Without --sorted: TopK sort over all data
+SELECT l_orderkey, l_partkey, l_suppkey
+FROM lineitem
+ORDER BY l_orderkey
+LIMIT 100
@@ -0,0 +1,5 @@
+-- Sort elimination: wide projection (all columns)
+-- Tests sort elimination benefit with larger row payload
+SELECT *
+FROM lineitem
+ORDER BY l_orderkey
@@ -0,0 +1,5 @@
+-- Sort elimination + limit: wide projection
+SELECT *
+FROM lineitem
+ORDER BY l_orderkey
+LIMIT 100
@@ -34,7 +34,8 @@ static ALLOC: snmalloc_rs::SnMalloc = snmalloc_rs::SnMalloc;
 static ALLOC: mimalloc::MiMalloc = mimalloc::MiMalloc;
 
 use datafusion_benchmarks::{
-    cancellation, clickbench, h2o, hj, imdb, nlj, smj, sort_tpch, tpcds, tpch,
+    cancellation, clickbench, h2o, hj, imdb, nlj, smj, sort_pushdown, sort_tpch, tpcds,
+    tpch,
 };
 
 #[derive(Debug, Parser)]
@@ -53,6 +54,7 @@ enum Options {
     Imdb(imdb::RunOpt),
     Nlj(nlj::RunOpt),
     Smj(smj::RunOpt),
+    SortPushdown(sort_pushdown::RunOpt),
     SortTpch(sort_tpch::RunOpt),
     Tpch(tpch::RunOpt),
     Tpcds(tpcds::RunOpt),
@@ -72,6 +74,7 @@ pub async fn main() -> Result<()> {
         Options::Imdb(opt) => Box::pin(opt.run()).await,
         Options::Nlj(opt) => opt.run().await,
         Options::Smj(opt) => opt.run().await,
+        Options::SortPushdown(opt) => opt.run().await,
         Options::SortTpch(opt) => opt.run().await,
         Options::Tpch(opt) => Box::pin(opt.run()).await,
         Options::Tpcds(opt) => Box::pin(opt.run()).await,