Skip to content

Commit 6ebaaa2

Browse files
adriangb and claude authored and committed
fix: mid-stream skip for ineffective optional row/post-scan filters
After moving optional filters to RowFilter via byte_ratio, queries with 1-row-group-per-file inputs (e.g. TPC-DS) had no chance to demote when the chosen filter turned out to be CPU-dominated and ineffective: partition_filters runs once per file open, all 12 split openers fire in parallel and see no stats, and the existing Demote/Drop branches never re-trigger for the lifetime of the scan. Add a per-FilterId Arc<AtomicBool> "skip flag" owned by SelectivityTracker. Once a filter has accumulated enough samples and its CI upper bound on bytes-per-second falls below min_bytes_per_sec, the hot per-batch update() path flips the flag — but only for filters recorded as optional at first encounter (mandatory filters must always execute or the result set changes). Both consumers honour it: * DatafusionArrowPredicate::evaluate returns an all-true mask without invoking physical_expr (filter columns are still decoded; CPU is reclaimed but I/O is not, pending arrow-rs API). * apply_post_scan_filters_with_stats `continue`s past the filter, skipping evaluation and the per-batch tracker.update. Local TPC-DS sf1 (M-series, pushdown_filters=true), worst regressors from main pushdown=off baseline: | Query | Main(off) | Branch(byte-ratio) | +skip-flag | |-------|-----------|--------------------|------------| | Q72 | 619 | 554 | 261 | | Q50 | 221 | 521 | 135 | | Q23 | 892 | 1217 | 680 | | Q67 | 310 | 510 | 306 | | Q18 | 128 | 312 | 178 | | Q13 | 399 | 558 | 363 | | Q53 | 103 | 167 | 93 | | Q63 | 106 | 173 | 93 | | Q76 | 132 | 268 | 105 | Q24-class wins are unaffected (Q24 holds at 70 ms vs 379 ms on main). Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 4b730df commit 6ebaaa2

3 files changed

Lines changed: 175 additions & 11 deletions

File tree

datafusion/datasource-parquet/src/opener.rs

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1433,6 +1433,15 @@ fn apply_post_scan_filters_with_stats(
14331433
let mut combined_mask: Option<BooleanArray> = None;
14341434

14351435
for (i, (id, expr)) in filters.iter().enumerate() {
1436+
// Mid-stream drop, mirror of `DatafusionArrowPredicate::evaluate`.
1437+
// Set by the tracker on `OptionalFilterPhysicalExpr` whose CI
1438+
// upper bound has fallen below `min_bytes_per_sec`; correctness is
1439+
// preserved because the originating join independently enforces
1440+
// the predicate. We do not update the tracker for a skipped batch.
1441+
if tracker.is_filter_skipped(*id) {
1442+
continue;
1443+
}
1444+
14361445
let start = datafusion_common::instant::Instant::now();
14371446
let result = expr.evaluate(&batch)?.into_array(batch.num_rows())?;
14381447
let bool_arr = as_boolean_array(result.as_ref())?;

datafusion/datasource-parquet/src/row_filter.rs

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -138,6 +138,15 @@ pub(crate) struct DatafusionArrowPredicate {
138138
/// paths disagreed, the tracker would rank row-filter and post-scan
139139
/// candidates on incomparable axes and mis-promote or mis-demote.
140140
other_projected_bytes_per_row: f64,
141+
/// Mid-stream "drop" flag, shared with the
142+
/// [`crate::selectivity::SelectivityTracker`]. The tracker flips this
143+
/// when an `OptionalFilterPhysicalExpr` proves CPU-dominated and
144+
/// ineffective; once set, [`Self::evaluate`] returns an all-true mask
145+
/// without invoking `physical_expr`. Filter columns are still decoded
146+
/// (the parquet decoder cannot be reconfigured mid-scan), so this only
147+
/// reclaims CPU, not I/O. Flagged only for filters known to be
148+
/// optional, so correctness is preserved by the join itself.
149+
skip_flag: Arc<std::sync::atomic::AtomicBool>,
141150
}
142151

143152
impl DatafusionArrowPredicate {
@@ -153,6 +162,7 @@ impl DatafusionArrowPredicate {
153162
) -> Result<Self> {
154163
let physical_expr =
155164
reassign_expr_columns(candidate.expr, &candidate.read_plan.projected_schema)?;
165+
let skip_flag = tracker.skip_flag(filter_id);
156166

157167
Ok(Self {
158168
physical_expr,
@@ -163,6 +173,7 @@ impl DatafusionArrowPredicate {
163173
filter_id,
164174
tracker,
165175
other_projected_bytes_per_row,
176+
skip_flag,
166177
})
167178
}
168179
}
@@ -173,6 +184,20 @@ impl ArrowPredicate for DatafusionArrowPredicate {
173184
}
174185

175186
fn evaluate(&mut self, batch: RecordBatch) -> ArrowResult<BooleanArray> {
187+
// Mid-stream drop: the tracker has decided this optional filter is
188+
// pulling its weight no longer. Return an all-true mask to bypass
189+
// expression evaluation entirely. We still bump `rows_matched` so
190+
// the per-predicate count stays consistent with input rows; the
191+
// tracker is intentionally NOT updated for skipped batches because
192+
// (a) we have nothing meaningful to report and (b) flooding it
193+
// with zero-cost samples would mask the underlying effectiveness
194+
// signal if the flag is ever cleared.
195+
if self.skip_flag.load(std::sync::atomic::Ordering::Acquire) {
196+
let rows_in_batch = batch.num_rows();
197+
self.rows_matched.add(rows_in_batch);
198+
return Ok(BooleanArray::from(vec![true; rows_in_batch]));
199+
}
200+
176201
// scoped timer updates on drop
177202
let mut timer = self.time.timer();
178203
let start_nanos = datafusion_common::instant::Instant::now();

datafusion/datasource-parquet/src/selectivity.rs

Lines changed: 141 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ use parking_lot::{Mutex, RwLock};
2626
use parquet::file::metadata::ParquetMetaData;
2727
use std::collections::HashMap;
2828
use std::sync::Arc;
29+
use std::sync::atomic::{AtomicBool, Ordering};
2930

3031
use datafusion_physical_expr::utils::collect_columns;
3132
use datafusion_physical_expr_common::physical_expr::{
@@ -35,6 +36,11 @@ use datafusion_physical_expr_common::physical_expr::{
3536
/// Stable identifier for a filter conjunct, assigned by `ParquetSource::with_predicate`.
3637
pub type FilterId = usize;
3738

39+
/// Re-evaluate the per-filter skip flag every Nth batch update.
///
/// The CI upper-bound computation itself is only a couple of arithmetic
/// ops; the real saving from this gate is that on most batches `update()`
/// returns before ever touching the `is_optional` / `skip_flags` maps, so
/// their lock words and cache lines stay out of the hot per-batch path.
const SKIP_FLAG_CHECK_INTERVAL: u64 = 4;
43+
3844
/// Per-filter lifecycle state in the adaptive filter system.
3945
///
4046
/// State transitions:
@@ -211,6 +217,8 @@ impl TrackerConfig {
211217
SelectivityTracker {
212218
config: self,
213219
filter_stats: RwLock::new(HashMap::new()),
220+
skip_flags: RwLock::new(HashMap::new()),
221+
is_optional: RwLock::new(HashMap::new()),
214222
inner: Mutex::new(SelectivityTrackerInner::new()),
215223
}
216224
}
@@ -334,6 +342,25 @@ pub struct SelectivityTracker {
334342
/// counters, so concurrent `update()` calls on *different* filters
335343
/// proceed in parallel with zero contention.
336344
filter_stats: RwLock<HashMap<FilterId, Mutex<SelectivityStats>>>,
345+
/// Per-filter "skip" flags — when set, the corresponding filter is
346+
/// treated as a no-op by both the row-filter
347+
/// (`DatafusionArrowPredicate::evaluate`) and the post-scan path
348+
/// (`apply_post_scan_filters_with_stats`). This is the mid-stream
349+
/// equivalent of dropping an optional filter: once the per-batch
350+
/// `update()` path proves an `OptionalFilterPhysicalExpr` is
351+
/// CPU-dominated and ineffective, it flips the flag and subsequent
352+
/// batches stop paying the evaluation cost. The decoder still decodes
353+
/// the filter columns (we cannot rebuild it mid-scan), so I/O is not
354+
/// reclaimed; only the predicate evaluation is skipped.
355+
///
356+
/// Only ever set for filters whose `is_optional` entry is `true` —
357+
/// mandatory filters must always execute or queries return wrong rows.
358+
skip_flags: RwLock<HashMap<FilterId, Arc<AtomicBool>>>,
359+
/// Whether each filter is wrapped in an `OptionalFilterPhysicalExpr`,
360+
/// captured at first-encounter in `partition_filters` so the per-batch
361+
/// `update()` path can decide whether the filter is safe to no-op
362+
/// without re-inspecting the expression tree on every batch.
363+
is_optional: RwLock<HashMap<FilterId, bool>>,
337364
/// Filter lifecycle state machine and dynamic-filter generation tracking.
338365
///
339366
/// Only `partition_filters()` acquires this lock (once per file open).
@@ -372,6 +399,16 @@ impl SelectivityTracker {
372399
/// before `partition_filters()` has registered the filter — in practice
373400
/// this cannot happen because `partition_filters()` runs during file open
374401
/// before any batches are processed).
402+
///
403+
/// **Mid-stream drop:** after every `SKIP_FLAG_CHECK_INTERVAL`'th batch
404+
/// we evaluate the CI upper bound; if it falls below
405+
/// `min_bytes_per_sec` and the filter is wrapped in
406+
/// `OptionalFilterPhysicalExpr`, we set the per-filter skip flag.
407+
/// Subsequent calls to `DatafusionArrowPredicate::evaluate` (row-level)
408+
/// and `apply_post_scan_filters_with_stats` (post-scan) observe the
409+
/// flag and short-circuit their work for that filter. Mandatory
410+
/// filters are never flagged because doing so would change the result
411+
/// set.
375412
pub(crate) fn update(
376413
&self,
377414
id: FilterId,
@@ -380,12 +417,77 @@ impl SelectivityTracker {
380417
eval_nanos: u64,
381418
batch_bytes: u64,
382419
) {
383-
let map = self.filter_stats.read();
384-
if let Some(entry) = map.get(&id) {
385-
entry.lock().update(matched, total, eval_nanos, batch_bytes);
420+
let stats_map = self.filter_stats.read();
421+
let Some(entry) = stats_map.get(&id) else {
422+
return;
423+
};
424+
let mut stats = entry.lock();
425+
stats.update(matched, total, eval_nanos, batch_bytes);
426+
427+
// Mid-stream drop check. Only consult the skip mechanism for
428+
// filters we already know to be optional, and only after enough
429+
// samples for `confidence_upper_bound` to be defined. The modulo
430+
// gate keeps the per-batch overhead tiny on the hot path.
431+
if !self.config.min_bytes_per_sec.is_finite()
432+
|| !stats.sample_count.is_multiple_of(SKIP_FLAG_CHECK_INTERVAL)
433+
{
434+
return;
435+
}
436+
let Some(ub) = stats.confidence_upper_bound(self.config.confidence_z) else {
437+
return;
438+
};
439+
if ub >= self.config.min_bytes_per_sec {
440+
return;
441+
}
442+
drop(stats);
443+
drop(stats_map);
444+
445+
// Optionality is captured at first sight in `partition_filters` so
446+
// we can answer this without re-walking the expression tree.
447+
let is_optional = self.is_optional.read().get(&id).copied().unwrap_or(false);
448+
if !is_optional {
449+
return;
450+
}
451+
if let Some(flag) = self.skip_flags.read().get(&id)
452+
&& !flag.swap(true, Ordering::Release)
453+
{
454+
debug!(
455+
"FilterId {id}: mid-stream skip — CI upper bound {ub} < {} bytes/sec",
456+
self.config.min_bytes_per_sec
457+
);
386458
}
387459
}
388460

461+
/// Returns the shared skip flag for `id`, creating one if absent.
462+
///
463+
/// Cloned into [`crate::row_filter::DatafusionArrowPredicate`] so the
464+
/// row-filter path can short-circuit when the per-batch update path
465+
/// decides the filter has stopped pulling its weight. The post-scan
466+
/// path uses [`Self::is_filter_skipped`] instead — it does not need a
467+
/// long-lived handle.
468+
pub(crate) fn skip_flag(&self, id: FilterId) -> Arc<AtomicBool> {
469+
if let Some(existing) = self.skip_flags.read().get(&id) {
470+
return Arc::clone(existing);
471+
}
472+
let mut write = self.skip_flags.write();
473+
Arc::clone(
474+
write
475+
.entry(id)
476+
.or_insert_with(|| Arc::new(AtomicBool::new(false))),
477+
)
478+
}
479+
480+
/// Returns `true` when `id` has been mid-stream-dropped by the tracker.
481+
///
482+
/// Cheap: a single `RwLock::read` plus an atomic load. Called from the
483+
/// post-scan filter loop in `apply_post_scan_filters_with_stats`.
484+
pub(crate) fn is_filter_skipped(&self, id: FilterId) -> bool {
485+
self.skip_flags
486+
.read()
487+
.get(&id)
488+
.is_some_and(|f| f.load(Ordering::Acquire))
489+
}
490+
389491
/// Partition filters into row-level predicates vs post-scan filters.
390492
///
391493
/// Called once per file open (cold path).
@@ -420,12 +522,22 @@ impl SelectivityTracker {
420522
// Phase 2: if new filters were seen, briefly acquire write lock to insert entries
421523
if !result.new_filter_ids.is_empty() {
422524
let mut stats_write = self.filter_stats.write();
423-
for id in result.new_filter_ids {
525+
for id in &result.new_filter_ids {
424526
stats_write
425-
.entry(id)
527+
.entry(*id)
426528
.or_insert_with(|| Mutex::new(SelectivityStats::default()));
427529
}
428530
}
531+
if !result.new_optional_flags.is_empty() {
532+
let mut optional_write = self.is_optional.write();
533+
let mut skip_write = self.skip_flags.write();
534+
for (id, is_optional) in result.new_optional_flags {
535+
optional_write.entry(id).or_insert(is_optional);
536+
skip_write
537+
.entry(id)
538+
.or_insert_with(|| Arc::new(AtomicBool::new(false)));
539+
}
540+
}
429541

430542
result.partitioned
431543
}
@@ -455,6 +567,12 @@ impl SelectivityTracker {
455567
struct PartitionResult {
456568
partitioned: PartitionedFilters,
457569
new_filter_ids: Vec<FilterId>,
570+
/// `(FilterId, is_optional)` entries observed for the first time in this
571+
/// `partition_filters` call. The outer `SelectivityTracker` records
572+
/// optionality alongside `filter_stats` so that the hot `update()` path
573+
/// can decide whether the per-filter skip flag is safe to flip without
574+
/// inspecting the expression tree.
575+
new_optional_flags: Vec<(FilterId, bool)>,
458576
}
459577

460578
/// Filter state-machine and generation tracking, guarded by the `Mutex`
@@ -578,6 +696,7 @@ impl SelectivityTrackerInner {
578696
stats_map: &HashMap<FilterId, Mutex<SelectivityStats>>,
579697
) -> PartitionResult {
580698
let mut new_filter_ids = Vec::new();
699+
let mut new_optional_flags: Vec<(FilterId, bool)> = Vec::new();
581700

582701
// If min_bytes_per_sec is INFINITY -> all filters are post-scan.
583702
if config.min_bytes_per_sec.is_infinite() {
@@ -586,9 +705,10 @@ impl SelectivityTrackerInner {
586705
filters.len()
587706
);
588707
// Register all filter IDs so update() can find them
589-
for &(id, _) in &filters {
590-
if !stats_map.contains_key(&id) {
591-
new_filter_ids.push(id);
708+
for (id, expr) in &filters {
709+
if !stats_map.contains_key(id) {
710+
new_filter_ids.push(*id);
711+
new_optional_flags.push((*id, is_optional_filter(expr)));
592712
}
593713
}
594714
return PartitionResult {
@@ -597,6 +717,7 @@ impl SelectivityTrackerInner {
597717
post_scan: filters,
598718
},
599719
new_filter_ids,
720+
new_optional_flags,
600721
};
601722
}
602723
// If min_bytes_per_sec is 0 -> all filters are promoted.
@@ -606,9 +727,10 @@ impl SelectivityTrackerInner {
606727
filters.len()
607728
);
608729
// Register all filter IDs so update() can find them
609-
for &(id, _) in &filters {
610-
if !stats_map.contains_key(&id) {
611-
new_filter_ids.push(id);
730+
for (id, expr) in &filters {
731+
if !stats_map.contains_key(id) {
732+
new_filter_ids.push(*id);
733+
new_optional_flags.push((*id, is_optional_filter(expr)));
612734
}
613735
}
614736
return PartitionResult {
@@ -617,6 +739,7 @@ impl SelectivityTrackerInner {
617739
post_scan: Vec::new(),
618740
},
619741
new_filter_ids,
742+
new_optional_flags,
620743
};
621744
}
622745

@@ -667,6 +790,7 @@ impl SelectivityTrackerInner {
667790

668791
if !stats_map.contains_key(&id) {
669792
new_filter_ids.push(id);
793+
new_optional_flags.push((id, is_optional_filter(&expr)));
670794
}
671795

672796
if byte_ratio <= config.byte_ratio_threshold {
@@ -787,10 +911,16 @@ impl SelectivityTrackerInner {
787911
post_scan: post_scan_filters,
788912
},
789913
new_filter_ids,
914+
new_optional_flags,
790915
}
791916
}
792917
}
793918

919+
/// Returns `true` if `expr` is wrapped in [`OptionalFilterPhysicalExpr`].
///
/// Called when registering newly-seen filters so the tracker can record
/// optionality up front; only optional filters may later be no-op'd by the
/// mid-stream skip flag.
// NOTE(review): `downcast_ref` is invoked directly on the trait object
// here; the usual DataFusion pattern is `expr.as_any().downcast_ref::<T>()`.
// Confirm that an inherent `downcast_ref` helper on `dyn PhysicalExpr` is
// in scope (possibly via the truncated physical_expr_common import).
fn is_optional_filter(expr: &Arc<dyn PhysicalExpr>) -> bool {
    expr.downcast_ref::<OptionalFilterPhysicalExpr>().is_some()
}
923+
794924
/// Calculate the estimated number of bytes needed to evaluate a filter based on the columns
795925
/// it references as if it were applied to the entire file.
796926
/// This is used for initial placement of new filters before any stats are available, and as a fallback for filters without stats.

0 commit comments

Comments
 (0)