Skip to content

Commit eeb3b50

Browse files
committed
Use registry NDV for equality/inequality selectivity on expressions
1 parent c62a5cb commit eeb3b50

2 files changed

Lines changed: 73 additions & 12 deletions

File tree

datafusion/physical-expr/src/expression_analyzer/default.rs

Lines changed: 35 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,27 @@ impl DefaultExpressionAnalyzer {
5252
.and_then(|idx| input_stats.column_statistics.get(idx))
5353
}
5454

55+
/// Resolve NDV for a binary expression: try direct column stats first,
56+
/// then fall back to the registry for arbitrary expressions
57+
fn resolve_ndv(
58+
left: &Arc<dyn PhysicalExpr>,
59+
right: &Arc<dyn PhysicalExpr>,
60+
input_stats: &Statistics,
61+
registry: &ExpressionAnalyzerRegistry,
62+
) -> Option<usize> {
63+
Self::get_column_stats(left, input_stats)
64+
.or_else(|| Self::get_column_stats(right, input_stats))
65+
.and_then(|s| s.distinct_count.get_value())
66+
.filter(|&&ndv| ndv > 0)
67+
.copied()
68+
.or_else(|| {
69+
let l = registry.get_distinct_count(left, input_stats);
70+
let r = registry.get_distinct_count(right, input_stats);
71+
l.max(r)
72+
})
73+
.filter(|&n| n > 0)
74+
}
75+
5576
/// Recursive selectivity estimation through the registry chain
5677
fn estimate_selectivity_recursive(
5778
&self,
@@ -103,24 +124,26 @@ impl ExpressionAnalyzer for DefaultExpressionAnalyzer {
103124

104125
// Equality: selectivity = 1/NDV
105126
Operator::Eq => {
106-
let ndv = Self::get_column_stats(binary.left(), input_stats)
107-
.or_else(|| Self::get_column_stats(binary.right(), input_stats))
108-
.and_then(|s| s.distinct_count.get_value())
109-
.filter(|&&ndv| ndv > 0);
110-
if let Some(ndv) = ndv {
111-
return AnalysisResult::Computed(1.0 / (*ndv as f64));
127+
if let Some(ndv) = Self::resolve_ndv(
128+
binary.left(),
129+
binary.right(),
130+
input_stats,
131+
registry,
132+
) {
133+
return AnalysisResult::Computed(1.0 / (ndv as f64));
112134
}
113135
0.1 // Default equality selectivity
114136
}
115137

116138
// Inequality: selectivity = 1 - 1/NDV
117139
Operator::NotEq => {
118-
let ndv = Self::get_column_stats(binary.left(), input_stats)
119-
.or_else(|| Self::get_column_stats(binary.right(), input_stats))
120-
.and_then(|s| s.distinct_count.get_value())
121-
.filter(|&&ndv| ndv > 0);
122-
if let Some(ndv) = ndv {
123-
return AnalysisResult::Computed(1.0 - (1.0 / (*ndv as f64)));
140+
if let Some(ndv) = Self::resolve_ndv(
141+
binary.left(),
142+
binary.right(),
143+
input_stats,
144+
registry,
145+
) {
146+
return AnalysisResult::Computed(1.0 - (1.0 / (ndv as f64)));
124147
}
125148
0.9
126149
}

datafusion/physical-expr/src/expression_analyzer/tests.rs

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -162,6 +162,44 @@ fn test_not_selectivity() {
162162
assert!((sel - 0.99).abs() < 0.001); // 1 - 0.01
163163
}
164164

165+
/// Equality between an arithmetic expression and a literal should use the
/// NDV obtained through the registry rather than the default selectivity.
#[test]
fn test_equality_selectivity_expression_eq_literal() {
    let stats = make_stats_with_ndv(1000, 100);

    // Build the predicate `(a + 1) = 42`; its left operand is an
    // expression, not a bare column reference.
    let column_a: Arc<dyn PhysicalExpr> = Arc::new(Column::new("a", 0));
    let lit_one: Arc<dyn PhysicalExpr> =
        Arc::new(Literal::new(ScalarValue::Int32(Some(1))));
    let lit_forty_two: Arc<dyn PhysicalExpr> =
        Arc::new(Literal::new(ScalarValue::Int32(Some(42))));
    let sum: Arc<dyn PhysicalExpr> =
        Arc::new(BinaryExpr::new(column_a, Operator::Plus, lit_one));
    let predicate: Arc<dyn PhysicalExpr> =
        Arc::new(BinaryExpr::new(sum, Operator::Eq, lit_forty_two));

    let registry = ExpressionAnalyzerRegistry::new();
    let selectivity = registry.get_selectivity(&predicate, &stats).unwrap();

    // NDV(a + 1) = NDV(a) = 100, so the expected selectivity is 1/100.
    let expected = 0.01;
    assert!((selectivity - expected).abs() < 0.001);
}
183+
184+
/// Inequality between an arithmetic expression and a literal should use the
/// NDV obtained through the registry rather than the default selectivity.
#[test]
fn test_inequality_selectivity_expression_neq_literal() {
    let stats = make_stats_with_ndv(1000, 100);

    // Build the predicate `(a + 1) != 42`; its left operand is an
    // expression, not a bare column reference.
    let column_a: Arc<dyn PhysicalExpr> = Arc::new(Column::new("a", 0));
    let lit_one: Arc<dyn PhysicalExpr> =
        Arc::new(Literal::new(ScalarValue::Int32(Some(1))));
    let lit_forty_two: Arc<dyn PhysicalExpr> =
        Arc::new(Literal::new(ScalarValue::Int32(Some(42))));
    let sum: Arc<dyn PhysicalExpr> =
        Arc::new(BinaryExpr::new(column_a, Operator::Plus, lit_one));
    let predicate: Arc<dyn PhysicalExpr> =
        Arc::new(BinaryExpr::new(sum, Operator::NotEq, lit_forty_two));

    let registry = ExpressionAnalyzerRegistry::new();
    let selectivity = registry.get_selectivity(&predicate, &stats).unwrap();

    // NDV(a + 1) = 100, so the expected selectivity is 1 - 1/100.
    let expected = 0.99;
    assert!((selectivity - expected).abs() < 0.001);
}
202+
165203
// Min/max tests
166204

167205
#[test]

0 commit comments

Comments
 (0)