Skip to content

Commit 70c522f

Browse files
committed
test(filter): verify ExpressionAnalyzer inclusion-exclusion selectivity for OR predicates
OR predicates cannot be expressed in interval arithmetic (a union of two disjoint intervals cannot be represented as a single interval). This test confirms that, on a 1000-row input, ExpressionAnalyzerRegistry computes the inclusion-exclusion selectivity 0.28 = 0.1 + 0.2 - 0.02 (280 rows), versus the default 20% selectivity (200 rows) without a registry.
1 parent fd549f9 commit 70c522f

1 file changed

Lines changed: 61 additions & 0 deletions

File tree

datafusion/physical-plan/src/filter.rs

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2626,6 +2626,67 @@ mod tests {
26262626
/// an explicit projection must not panic when `try_swapping_with_projection`
26272627
/// attempts to swap the two nodes.
26282628
///
2629+
// NOTE(review): this test appears to have been inserted in the middle of another
// test's doc comment: the `///` lines directly above (about
// `try_swapping_with_projection`) and the `/// Before the fix, ...` lines directly
// below this function's closing brace read as one continuous comment. As placed,
// the lines above attach to this function instead. Consider moving this test so
// that comment stays in one piece.
/// Verifies that `ExpressionAnalyzerRegistry` computes selectivity for OR predicates
2630+
/// using inclusion-exclusion, which interval arithmetic cannot represent (a union of
2631+
/// two disjoint intervals is not a single interval).
2632+
///
2633+
/// For `(a = 42 OR b = 5)` with NDV_a=10, NDV_b=5 on 1000 rows (each equality estimated as 1/NDV):
2634+
/// - Without ExpressionAnalyzer: default 20% selectivity -> 0.20 * 1000 = 200 rows
2635+
/// - With ExpressionAnalyzer: P(a=42) + P(b=5) - P(a=42)*P(b=5) = 0.1 + 0.2 - 0.02 = 0.28 -> 280 rows (the product term treats the two predicates as independent)
2636+
#[tokio::test]
2637+
async fn test_filter_statistics_expression_analyzer_selectivity_or_predicate() -> Result<()> {
2638+
let schema = Schema::new(vec![
2639+
Field::new("a", DataType::Int64, false),
2640+
Field::new("b", DataType::Int64, false),
2641+
]);
2642+
// Input statistics: only the row count and the per-column distinct counts are set;
// byte size is absent and every other column statistic is left at its default.
let input = Arc::new(StatisticsExec::new(
2643+
Statistics {
2644+
num_rows: Precision::Inexact(1000),
2645+
total_byte_size: Precision::Absent,
2646+
column_statistics: vec![
2647+
ColumnStatistics {
2648+
distinct_count: Precision::Inexact(10),
2649+
..Default::default()
2650+
},
2651+
ColumnStatistics {
2652+
distinct_count: Precision::Inexact(5),
2653+
..Default::default()
2654+
},
2655+
],
2656+
},
2657+
schema.clone(),
2658+
));
2659+
// (a = 42 OR b = 5): OR is not expressible as a single interval. Column indices 0 and 1 follow the schema order above.
2660+
let predicate = Arc::new(BinaryExpr::new(
2661+
Arc::new(BinaryExpr::new(
2662+
Arc::new(Column::new("a", 0)),
2663+
Operator::Eq,
2664+
Arc::new(Literal::new(ScalarValue::Int64(Some(42)))),
2665+
)),
2666+
Operator::Or,
2667+
Arc::new(BinaryExpr::new(
2668+
Arc::new(Column::new("b", 1)),
2669+
Operator::Eq,
2670+
Arc::new(Literal::new(ScalarValue::Int64(Some(5)))),
2671+
)),
2672+
));
2673+
2674+
// Baseline without ExpressionAnalyzer (no registry attached): default 20% selectivity -> 200 rows
2675+
let filter = Arc::new(FilterExec::try_new(predicate.clone(), input as _)?);
2676+
let stats = filter.partition_statistics(None)?;
2677+
assert_eq!(stats.num_rows, Precision::Inexact(200));
2678+
2679+
// With ExpressionAnalyzer: inclusion-exclusion -> 0.1 + 0.2 - 0.02 = 0.28 -> 0.28 * 1000 = 280 rows
2680+
let registry = Arc::new(ExpressionAnalyzerRegistry::new());
2681+
let filter_with_registry = filter
2682+
.with_expression_analyzer_registry(&registry)
2683+
.expect("registry should be injectable when not already set");
2684+
let stats_with_registry = filter_with_registry.partition_statistics(None)?;
2685+
assert_eq!(stats_with_registry.num_rows, Precision::Inexact(280));
2686+
2687+
Ok(())
2688+
}
2689+
26292690
/// Before the fix, `FilterExecBuilder::from(self)` copied the old projection
26302691
/// (e.g. `[0, 1, 2]`) from the FilterExec. After `.with_input` replaced the
26312692
/// input with the narrower ProjectionExec (2 columns), `.build()` tried to

0 commit comments

Comments
 (0)