Skip to content

Commit 2c26e96

Browse files
committed
refactor(expression_analyzer): delegate when no NDV statistics are available
Return Delegate for all leaf predicates when NDV is unavailable, and propagate Delegate upward through AND/OR/NOT when any child has no estimate. DefaultExpressionAnalyzer now only produces a result when it has a genuine information advantage (NDV from column statistics).
1 parent a0d1846 commit 2c26e96

4 files changed

Lines changed: 123 additions & 82 deletions

File tree

datafusion/physical-expr/src/expression_analyzer/default.rs

Lines changed: 34 additions & 66 deletions
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,7 @@ impl DefaultExpressionAnalyzer {
5656
///
5757
/// Using max is symmetric (order-independent) and handles column-vs-column,
5858
/// column-vs-expression, and expression-vs-expression uniformly through the
59-
/// registry chain
59+
/// registry chain. Returns `None` when either side has unknown NDV.
6060
fn resolve_ndv(
6161
left: &Arc<dyn PhysicalExpr>,
6262
right: &Arc<dyn PhysicalExpr>,
@@ -65,18 +65,12 @@ impl DefaultExpressionAnalyzer {
6565
) -> Option<usize> {
6666
let l = registry.get_distinct_count(left, input_stats);
6767
let r = registry.get_distinct_count(right, input_stats);
68-
l.max(r).filter(|&n| n > 0)
68+
match (l, r) {
69+
(Some(a), Some(b)) => Some(a.max(b)).filter(|&n| n > 0),
70+
_ => None,
71+
}
6972
}
7073

71-
/// Recursive selectivity estimation through the registry chain
72-
fn estimate_selectivity_recursive(
73-
&self,
74-
expr: &Arc<dyn PhysicalExpr>,
75-
input_stats: &Statistics,
76-
registry: &ExpressionAnalyzerRegistry,
77-
) -> f64 {
78-
registry.get_selectivity(expr, input_stats).unwrap_or(0.5)
79-
}
8074
}
8175

8276
impl ExpressionAnalyzer for DefaultExpressionAnalyzer {
@@ -88,36 +82,25 @@ impl ExpressionAnalyzer for DefaultExpressionAnalyzer {
8882
) -> AnalysisResult<f64> {
8983
// Binary expressions: AND, OR, comparisons
9084
if let Some(binary) = expr.as_any().downcast_ref::<BinaryExpr>() {
91-
let sel = match binary.op() {
92-
// Logical operators: need child selectivities
93-
Operator::And => {
94-
let left_sel = self.estimate_selectivity_recursive(
95-
binary.left(),
96-
input_stats,
97-
registry,
98-
);
99-
let right_sel = self.estimate_selectivity_recursive(
100-
binary.right(),
101-
input_stats,
102-
registry,
103-
);
104-
left_sel * right_sel
105-
}
106-
Operator::Or => {
107-
let left_sel = self.estimate_selectivity_recursive(
108-
binary.left(),
109-
input_stats,
110-
registry,
111-
);
112-
let right_sel = self.estimate_selectivity_recursive(
113-
binary.right(),
114-
input_stats,
115-
registry,
116-
);
117-
left_sel + right_sel - (left_sel * right_sel)
85+
match binary.op() {
86+
// AND/OR: only provide a value when both children have estimates.
87+
// Delegating when either child has no information prevents arbitrary
88+
// constants from contaminating the product/inclusion-exclusion formula.
89+
Operator::And | Operator::Or => {
90+
if let (Some(left), Some(right)) = (
91+
registry.get_selectivity(binary.left(), input_stats),
92+
registry.get_selectivity(binary.right(), input_stats),
93+
) {
94+
let sel = match binary.op() {
95+
Operator::And => left * right,
96+
Operator::Or => left + right - left * right,
97+
_ => unreachable!(),
98+
};
99+
return AnalysisResult::Computed(sel);
100+
}
118101
}
119102

120-
// Equality: selectivity = 1/NDV
103+
// Equality: 1/NDV. Delegate when NDV is unknown so default_selectivity applies.
121104
Operator::Eq => {
122105
if let Some(ndv) = Self::resolve_ndv(
123106
binary.left(),
@@ -127,10 +110,9 @@ impl ExpressionAnalyzer for DefaultExpressionAnalyzer {
127110
) {
128111
return AnalysisResult::Computed(1.0 / (ndv as f64));
129112
}
130-
0.1 // Default equality selectivity
131113
}
132114

133-
// Inequality: selectivity = 1 - 1/NDV
115+
// Inequality: 1 - 1/NDV. Delegate when NDV is unknown.
134116
Operator::NotEq => {
135117
if let Some(ndv) = Self::resolve_ndv(
136118
binary.left(),
@@ -140,34 +122,25 @@ impl ExpressionAnalyzer for DefaultExpressionAnalyzer {
140122
) {
141123
return AnalysisResult::Computed(1.0 - (1.0 / (ndv as f64)));
142124
}
143-
0.9
144125
}
145126

146-
// Range predicates: classic 1/3 estimate
147-
Operator::Lt | Operator::LtEq | Operator::Gt | Operator::GtEq => 0.33,
148-
149-
// LIKE: depends on pattern, use conservative estimate
150-
Operator::LikeMatch | Operator::ILikeMatch => 0.25,
151-
Operator::NotLikeMatch | Operator::NotILikeMatch => 0.75,
152-
153-
// Other operators: default
154-
_ => 0.5,
155-
};
127+
// All other operators: no statistics available, let caller decide.
128+
_ => {}
129+
}
156130

157-
return AnalysisResult::Computed(sel);
131+
return AnalysisResult::Delegate;
158132
}
159133

160-
// NOT expression: 1 - child selectivity
134+
// NOT expression: 1 - child selectivity. Delegate if child has no estimate.
161135
if let Some(not_expr) = expr.as_any().downcast_ref::<NotExpr>() {
162-
let child_sel = self.estimate_selectivity_recursive(
163-
not_expr.arg(),
164-
input_stats,
165-
registry,
166-
);
167-
return AnalysisResult::Computed(1.0 - child_sel);
136+
if let Some(child_sel) = registry.get_selectivity(not_expr.arg(), input_stats)
137+
{
138+
return AnalysisResult::Computed(1.0 - child_sel);
139+
}
140+
return AnalysisResult::Delegate;
168141
}
169142

170-
// Literal boolean: 0.0 or 1.0
143+
// Literal boolean: exact selectivity, no statistics needed.
171144
if let Some(b) = expr
172145
.as_any()
173146
.downcast_ref::<Literal>()
@@ -179,11 +152,6 @@ impl ExpressionAnalyzer for DefaultExpressionAnalyzer {
179152
return AnalysisResult::Computed(if b { 1.0 } else { 0.0 });
180153
}
181154

182-
// Column reference as predicate (boolean column)
183-
if expr.as_any().downcast_ref::<Column>().is_some() {
184-
return AnalysisResult::Computed(0.5);
185-
}
186-
187155
AnalysisResult::Delegate
188156
}
189157

datafusion/physical-expr/src/expression_analyzer/mod.rs

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -188,7 +188,6 @@ impl ExpressionAnalyzerRegistry {
188188
}
189189
}
190190

191-
/// Create a registry with only the given analyzers (no builtins).
192191
/// Create a registry with only the given analyzers and no built-in fallback.
193192
///
194193
/// If none of the provided analyzers can handle a request, the registry

datafusion/physical-expr/src/expression_analyzer/tests.rs

Lines changed: 87 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -119,43 +119,116 @@ fn test_equality_selectivity_column_on_right() {
119119
}
120120

121121
#[test]
122-
fn test_and_selectivity() {
123-
let stats = make_stats_with_ndvs(1000, &[100]);
122+
fn test_equality_selectivity_no_ndv_delegates() {
123+
// When the column has no distinct_count, resolve_ndv must return None
124+
// so the predicate delegates rather than using the literal's NDV (1) as
125+
// the denominator, which would produce selectivity = 1.0.
126+
let stats = Statistics {
127+
num_rows: Precision::Exact(1000),
128+
total_byte_size: Precision::Absent,
129+
column_statistics: vec![ColumnStatistics::default()],
130+
};
124131
let col = Arc::new(Column::new("a", 0)) as Arc<dyn PhysicalExpr>;
132+
let lit =
133+
Arc::new(Literal::new(ScalarValue::Int32(Some(42)))) as Arc<dyn PhysicalExpr>;
134+
let eq = Arc::new(BinaryExpr::new(col, Operator::Eq, lit)) as Arc<dyn PhysicalExpr>;
135+
136+
let registry = ExpressionAnalyzerRegistry::new();
137+
assert_eq!(registry.get_selectivity(&eq, &stats), None);
138+
}
139+
140+
#[test]
141+
fn test_and_selectivity() {
142+
// Both children have NDV, so AND can be computed.
143+
let stats = make_stats_with_ndvs(1000, &[100, 50]);
144+
let col_a = Arc::new(Column::new("a", 0)) as Arc<dyn PhysicalExpr>;
145+
let col_b = Arc::new(Column::new("b", 1)) as Arc<dyn PhysicalExpr>;
125146
let lit1 =
126147
Arc::new(Literal::new(ScalarValue::Int32(Some(42)))) as Arc<dyn PhysicalExpr>;
127148
let lit2 =
128149
Arc::new(Literal::new(ScalarValue::Int32(Some(10)))) as Arc<dyn PhysicalExpr>;
129150

130-
let eq = Arc::new(BinaryExpr::new(Arc::clone(&col), Operator::Eq, lit1))
131-
as Arc<dyn PhysicalExpr>;
132-
let gt = Arc::new(BinaryExpr::new(col, Operator::Gt, lit2)) as Arc<dyn PhysicalExpr>;
151+
// a = 42 AND b = 10: 1/100 * 1/50 = 0.0002
152+
let eq_a =
153+
Arc::new(BinaryExpr::new(col_a, Operator::Eq, lit1)) as Arc<dyn PhysicalExpr>;
154+
let eq_b =
155+
Arc::new(BinaryExpr::new(col_b, Operator::Eq, lit2)) as Arc<dyn PhysicalExpr>;
133156
let and_expr =
134-
Arc::new(BinaryExpr::new(eq, Operator::And, gt)) as Arc<dyn PhysicalExpr>;
157+
Arc::new(BinaryExpr::new(eq_a, Operator::And, eq_b)) as Arc<dyn PhysicalExpr>;
135158

136159
let registry = ExpressionAnalyzerRegistry::new();
137160
let sel = registry.get_selectivity(&and_expr, &stats).unwrap();
138-
assert!((sel - 0.0033).abs() < 0.001); // 0.01 * 0.33
161+
assert!((sel - 0.0002).abs() < 1e-6); // 0.01 * 0.02
162+
163+
// When a child has no NDV (column stats absent), its selectivity is unknown,
164+
// so AND cannot produce an estimate and must delegate.
165+
let stats_no_ndv = Statistics {
166+
num_rows: Precision::Exact(1000),
167+
total_byte_size: Precision::Absent,
168+
column_statistics: vec![ColumnStatistics::default()],
169+
};
170+
// c = 1: column c has no distinct_count, so resolve_ndv returns None -> Delegate
171+
let eq_no_ndv = Arc::new(BinaryExpr::new(
172+
Arc::new(Column::new("c", 0)) as Arc<dyn PhysicalExpr>,
173+
Operator::Eq,
174+
Arc::new(Literal::new(ScalarValue::Int32(Some(1)))) as Arc<dyn PhysicalExpr>,
175+
)) as Arc<dyn PhysicalExpr>;
176+
// c > 5: no range selectivity without statistics -> Delegate
177+
let gt_no_ndv = Arc::new(BinaryExpr::new(
178+
Arc::new(Column::new("c", 0)) as Arc<dyn PhysicalExpr>,
179+
Operator::Gt,
180+
Arc::new(Literal::new(ScalarValue::Int32(Some(5)))) as Arc<dyn PhysicalExpr>,
181+
)) as Arc<dyn PhysicalExpr>;
182+
let and_no_info = Arc::new(BinaryExpr::new(eq_no_ndv, Operator::And, gt_no_ndv))
183+
as Arc<dyn PhysicalExpr>;
184+
assert_eq!(registry.get_selectivity(&and_no_info, &stats_no_ndv), None);
139185
}
140186

141187
#[test]
142188
fn test_or_selectivity() {
143-
let stats = make_stats_with_ndvs(1000, &[100]);
144-
let col = Arc::new(Column::new("a", 0)) as Arc<dyn PhysicalExpr>;
189+
// Both children have NDV, so OR can use inclusion-exclusion.
190+
let stats = make_stats_with_ndvs(1000, &[100, 50]);
191+
let col_a = Arc::new(Column::new("a", 0)) as Arc<dyn PhysicalExpr>;
192+
let col_b = Arc::new(Column::new("b", 1)) as Arc<dyn PhysicalExpr>;
145193
let lit1 =
146194
Arc::new(Literal::new(ScalarValue::Int32(Some(42)))) as Arc<dyn PhysicalExpr>;
147195
let lit2 =
148196
Arc::new(Literal::new(ScalarValue::Int32(Some(10)))) as Arc<dyn PhysicalExpr>;
149197

150-
let eq = Arc::new(BinaryExpr::new(Arc::clone(&col), Operator::Eq, lit1))
151-
as Arc<dyn PhysicalExpr>;
152-
let gt = Arc::new(BinaryExpr::new(col, Operator::Gt, lit2)) as Arc<dyn PhysicalExpr>;
198+
// a = 42 OR b = 10: 0.01 + 0.02 - 0.0002 = 0.0298
199+
let eq_a =
200+
Arc::new(BinaryExpr::new(col_a, Operator::Eq, lit1)) as Arc<dyn PhysicalExpr>;
201+
let eq_b =
202+
Arc::new(BinaryExpr::new(col_b, Operator::Eq, lit2)) as Arc<dyn PhysicalExpr>;
153203
let or_expr =
154-
Arc::new(BinaryExpr::new(eq, Operator::Or, gt)) as Arc<dyn PhysicalExpr>;
204+
Arc::new(BinaryExpr::new(eq_a, Operator::Or, eq_b)) as Arc<dyn PhysicalExpr>;
155205

156206
let registry = ExpressionAnalyzerRegistry::new();
157207
let sel = registry.get_selectivity(&or_expr, &stats).unwrap();
158-
assert!((sel - 0.3367).abs() < 0.001); // 0.01 + 0.33 - 0.01*0.33
208+
assert!((sel - 0.0298).abs() < 1e-6); // 0.01 + 0.02 - 0.01*0.02
209+
210+
// When a child has no NDV (column stats absent), its selectivity is unknown,
211+
// so OR cannot produce an estimate and must delegate.
212+
let stats_no_ndv = Statistics {
213+
num_rows: Precision::Exact(1000),
214+
total_byte_size: Precision::Absent,
215+
column_statistics: vec![ColumnStatistics::default()],
216+
};
217+
// c = 1: column c has no distinct_count, so resolve_ndv returns None -> Delegate
218+
let eq_no_ndv = Arc::new(BinaryExpr::new(
219+
Arc::new(Column::new("c", 0)) as Arc<dyn PhysicalExpr>,
220+
Operator::Eq,
221+
Arc::new(Literal::new(ScalarValue::Int32(Some(1)))) as Arc<dyn PhysicalExpr>,
222+
)) as Arc<dyn PhysicalExpr>;
223+
// c > 5: no range selectivity without statistics -> Delegate
224+
let gt_no_ndv = Arc::new(BinaryExpr::new(
225+
Arc::new(Column::new("c", 0)) as Arc<dyn PhysicalExpr>,
226+
Operator::Gt,
227+
Arc::new(Literal::new(ScalarValue::Int32(Some(5)))) as Arc<dyn PhysicalExpr>,
228+
)) as Arc<dyn PhysicalExpr>;
229+
let or_no_info = Arc::new(BinaryExpr::new(eq_no_ndv, Operator::Or, gt_no_ndv))
230+
as Arc<dyn PhysicalExpr>;
231+
assert_eq!(registry.get_selectivity(&or_no_info, &stats_no_ndv), None);
159232
}
160233

161234
#[test]

datafusion/physical-plan/src/filter.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2634,7 +2634,8 @@ mod tests {
26342634
/// - Without ExpressionAnalyzer: default 20% selectivity -> 200 rows
26352635
/// - With ExpressionAnalyzer: P(a=42) + P(b=5) - P(a=42)*P(b=5) = 0.1 + 0.2 - 0.02 = 0.28 -> 280 rows
26362636
#[tokio::test]
2637-
async fn test_filter_statistics_expression_analyzer_selectivity_or_predicate() -> Result<()> {
2637+
async fn test_filter_statistics_expression_analyzer_selectivity_or_predicate()
2638+
-> Result<()> {
26382639
let schema = Schema::new(vec![
26392640
Field::new("a", DataType::Int64, false),
26402641
Field::new("b", DataType::Int64, false),

0 commit comments

Comments
 (0)