Skip to content

Commit 3f10a1d

Browse files
committed
Make resolve_ndv symmetric: use max NDV across both operands
1 parent 5c85fe8 commit 3f10a1d

2 files changed

Lines changed: 84 additions & 37 deletions

File tree

datafusion/physical-expr/src/expression_analyzer/default.rs

Lines changed: 8 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -52,25 +52,20 @@ impl DefaultExpressionAnalyzer {
5252
.and_then(|idx| input_stats.column_statistics.get(idx))
5353
}
5454

55-
/// Resolve NDV for a binary expression: try direct column stats first,
56-
/// then fall back to the registry for arbitrary expressions
55+
/// Resolve NDV for a binary predicate by taking the max of both sides.
56+
///
57+
/// Using max is symmetric (order-independent) and handles column-vs-column,
58+
/// column-vs-expression, and expression-vs-expression uniformly through the
59+
/// registry chain
5760
fn resolve_ndv(
5861
left: &Arc<dyn PhysicalExpr>,
5962
right: &Arc<dyn PhysicalExpr>,
6063
input_stats: &Statistics,
6164
registry: &ExpressionAnalyzerRegistry,
6265
) -> Option<usize> {
63-
Self::get_column_stats(left, input_stats)
64-
.or_else(|| Self::get_column_stats(right, input_stats))
65-
.and_then(|s| s.distinct_count.get_value())
66-
.filter(|&&ndv| ndv > 0)
67-
.copied()
68-
.or_else(|| {
69-
let l = registry.get_distinct_count(left, input_stats);
70-
let r = registry.get_distinct_count(right, input_stats);
71-
l.max(r)
72-
})
73-
.filter(|&n| n > 0)
66+
let l = registry.get_distinct_count(left, input_stats);
67+
let r = registry.get_distinct_count(right, input_stats);
68+
l.max(r).filter(|&n| n > 0)
7469
}
7570

7671
/// Recursive selectivity estimation through the registry chain

datafusion/physical-expr/src/expression_analyzer/tests.rs

Lines changed: 76 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -23,34 +23,37 @@ use datafusion_common::{ColumnStatistics, ScalarValue, Statistics};
2323
use datafusion_expr::Operator;
2424
use std::sync::Arc;
2525

26-
fn make_stats_with_ndv(num_rows: usize, ndv: usize) -> Statistics {
26+
fn make_stats_with_ndvs(num_rows: usize, ndvs: &[usize]) -> Statistics {
2727
Statistics {
2828
num_rows: Precision::Exact(num_rows),
2929
total_byte_size: Precision::Absent,
30-
column_statistics: vec![ColumnStatistics {
31-
null_count: Precision::Exact(0),
32-
max_value: Precision::Absent,
33-
min_value: Precision::Absent,
34-
sum_value: Precision::Absent,
35-
distinct_count: Precision::Exact(ndv),
36-
byte_size: Precision::Absent,
37-
}],
30+
column_statistics: ndvs
31+
.iter()
32+
.map(|&ndv| ColumnStatistics {
33+
null_count: Precision::Exact(0),
34+
max_value: Precision::Absent,
35+
min_value: Precision::Absent,
36+
sum_value: Precision::Absent,
37+
distinct_count: Precision::Exact(ndv),
38+
byte_size: Precision::Absent,
39+
})
40+
.collect(),
3841
}
3942
}
4043

4144
// NDV tests
4245

4346
#[test]
4447
fn test_column_ndv() {
45-
let stats = make_stats_with_ndv(1000, 100);
48+
let stats = make_stats_with_ndvs(1000, &[100]);
4649
let col = Arc::new(Column::new("a", 0)) as Arc<dyn PhysicalExpr>;
4750
let registry = ExpressionAnalyzerRegistry::new();
4851
assert_eq!(registry.get_distinct_count(&col, &stats), Some(100));
4952
}
5053

5154
#[test]
5255
fn test_literal_ndv() {
53-
let stats = make_stats_with_ndv(1000, 100);
56+
let stats = make_stats_with_ndvs(1000, &[100]);
5457
let lit =
5558
Arc::new(Literal::new(ScalarValue::Int32(Some(42)))) as Arc<dyn PhysicalExpr>;
5659
let registry = ExpressionAnalyzerRegistry::new();
@@ -59,7 +62,7 @@ fn test_literal_ndv() {
5962

6063
#[test]
6164
fn test_arithmetic_ndv() {
62-
let stats = make_stats_with_ndv(1000, 100);
65+
let stats = make_stats_with_ndvs(1000, &[100]);
6366
let col = Arc::new(Column::new("a", 0)) as Arc<dyn PhysicalExpr>;
6467
let lit =
6568
Arc::new(Literal::new(ScalarValue::Int64(Some(1)))) as Arc<dyn PhysicalExpr>;
@@ -91,7 +94,7 @@ fn test_arithmetic_ndv() {
9194

9295
#[test]
9396
fn test_equality_selectivity() {
94-
let stats = make_stats_with_ndv(1000, 100);
97+
let stats = make_stats_with_ndvs(1000, &[100]);
9598
let col = Arc::new(Column::new("a", 0)) as Arc<dyn PhysicalExpr>;
9699
let lit =
97100
Arc::new(Literal::new(ScalarValue::Int32(Some(42)))) as Arc<dyn PhysicalExpr>;
@@ -104,7 +107,7 @@ fn test_equality_selectivity() {
104107

105108
#[test]
106109
fn test_equality_selectivity_column_on_right() {
107-
let stats = make_stats_with_ndv(1000, 100);
110+
let stats = make_stats_with_ndvs(1000, &[100]);
108111
let col = Arc::new(Column::new("a", 0)) as Arc<dyn PhysicalExpr>;
109112
let lit =
110113
Arc::new(Literal::new(ScalarValue::Int32(Some(42)))) as Arc<dyn PhysicalExpr>;
@@ -117,7 +120,7 @@ fn test_equality_selectivity_column_on_right() {
117120

118121
#[test]
119122
fn test_and_selectivity() {
120-
let stats = make_stats_with_ndv(1000, 100);
123+
let stats = make_stats_with_ndvs(1000, &[100]);
121124
let col = Arc::new(Column::new("a", 0)) as Arc<dyn PhysicalExpr>;
122125
let lit1 =
123126
Arc::new(Literal::new(ScalarValue::Int32(Some(42)))) as Arc<dyn PhysicalExpr>;
@@ -137,7 +140,7 @@ fn test_and_selectivity() {
137140

138141
#[test]
139142
fn test_or_selectivity() {
140-
let stats = make_stats_with_ndv(1000, 100);
143+
let stats = make_stats_with_ndvs(1000, &[100]);
141144
let col = Arc::new(Column::new("a", 0)) as Arc<dyn PhysicalExpr>;
142145
let lit1 =
143146
Arc::new(Literal::new(ScalarValue::Int32(Some(42)))) as Arc<dyn PhysicalExpr>;
@@ -157,7 +160,7 @@ fn test_or_selectivity() {
157160

158161
#[test]
159162
fn test_not_selectivity() {
160-
let stats = make_stats_with_ndv(1000, 100);
163+
let stats = make_stats_with_ndvs(1000, &[100]);
161164
let col = Arc::new(Column::new("a", 0)) as Arc<dyn PhysicalExpr>;
162165
let lit =
163166
Arc::new(Literal::new(ScalarValue::Int32(Some(42)))) as Arc<dyn PhysicalExpr>;
@@ -172,7 +175,7 @@ fn test_not_selectivity() {
172175

173176
#[test]
174177
fn test_equality_selectivity_expression_eq_literal() {
175-
let stats = make_stats_with_ndv(1000, 100);
178+
let stats = make_stats_with_ndvs(1000, &[100]);
176179
let col = Arc::new(Column::new("a", 0)) as Arc<dyn PhysicalExpr>;
177180
let one =
178181
Arc::new(Literal::new(ScalarValue::Int32(Some(1)))) as Arc<dyn PhysicalExpr>;
@@ -191,7 +194,7 @@ fn test_equality_selectivity_expression_eq_literal() {
191194

192195
#[test]
193196
fn test_inequality_selectivity_expression_neq_literal() {
194-
let stats = make_stats_with_ndv(1000, 100);
197+
let stats = make_stats_with_ndvs(1000, &[100]);
195198
let col = Arc::new(Column::new("a", 0)) as Arc<dyn PhysicalExpr>;
196199
let one =
197200
Arc::new(Literal::new(ScalarValue::Int32(Some(1)))) as Arc<dyn PhysicalExpr>;
@@ -208,6 +211,55 @@ fn test_inequality_selectivity_expression_neq_literal() {
208211
assert!((sel - 0.99).abs() < 0.001);
209212
}
210213

214+
// Tests for resolve_ndv symmetry (column-vs-column and column-vs-expression)
215+
216+
#[test]
217+
fn test_equality_selectivity_column_eq_column_symmetric() {
218+
// a = b and b = a must give the same selectivity regardless of operand order
219+
let stats = make_stats_with_ndvs(1000, &[50, 200]);
220+
let col_a = Arc::new(Column::new("a", 0)) as Arc<dyn PhysicalExpr>;
221+
let col_b = Arc::new(Column::new("b", 1)) as Arc<dyn PhysicalExpr>;
222+
223+
let registry = ExpressionAnalyzerRegistry::new();
224+
225+
let eq_ab = Arc::new(BinaryExpr::new(
226+
Arc::clone(&col_a),
227+
Operator::Eq,
228+
Arc::clone(&col_b),
229+
)) as Arc<dyn PhysicalExpr>;
230+
let eq_ba = Arc::new(BinaryExpr::new(
231+
Arc::clone(&col_b),
232+
Operator::Eq,
233+
Arc::clone(&col_a),
234+
)) as Arc<dyn PhysicalExpr>;
235+
236+
let sel_ab = registry.get_selectivity(&eq_ab, &stats).unwrap();
237+
let sel_ba = registry.get_selectivity(&eq_ba, &stats).unwrap();
238+
239+
assert_eq!(sel_ab, sel_ba);
240+
}
241+
242+
#[test]
243+
fn test_equality_selectivity_column_eq_expression_uses_max_ndv() {
244+
// col = (expr) should use max(ndv(col), ndv(expr)), not just ndv(col)
245+
// Here col has ndv=10 and expr=(b+1) has ndv=200 from column b
246+
let stats = make_stats_with_ndvs(1000, &[10, 200]);
247+
let col_a = Arc::new(Column::new("a", 0)) as Arc<dyn PhysicalExpr>;
248+
let col_b = Arc::new(Column::new("b", 1)) as Arc<dyn PhysicalExpr>;
249+
let one =
250+
Arc::new(Literal::new(ScalarValue::Int64(Some(1)))) as Arc<dyn PhysicalExpr>;
251+
let b_plus_1 =
252+
Arc::new(BinaryExpr::new(col_b, Operator::Plus, one)) as Arc<dyn PhysicalExpr>;
253+
254+
let eq =
255+
Arc::new(BinaryExpr::new(col_a, Operator::Eq, b_plus_1)) as Arc<dyn PhysicalExpr>;
256+
257+
let registry = ExpressionAnalyzerRegistry::new();
258+
let sel = registry.get_selectivity(&eq, &stats).unwrap();
259+
// max(ndv(a)=10, ndv(b+1)=200) = 200, so selectivity = 1/200
260+
assert_eq!(sel, 1.0 / 200.0);
261+
}
262+
211263
// Min/max tests
212264

213265
#[test]
@@ -235,7 +287,7 @@ fn test_column_min_max() {
235287

236288
#[test]
237289
fn test_literal_min_max() {
238-
let stats = make_stats_with_ndv(100, 10);
290+
let stats = make_stats_with_ndvs(100, &[10]);
239291
let lit =
240292
Arc::new(Literal::new(ScalarValue::Int32(Some(42)))) as Arc<dyn PhysicalExpr>;
241293
let registry = ExpressionAnalyzerRegistry::new();
@@ -271,7 +323,7 @@ fn test_column_null_fraction() {
271323

272324
#[test]
273325
fn test_literal_null_fraction() {
274-
let stats = make_stats_with_ndv(100, 10);
326+
let stats = make_stats_with_ndvs(100, &[10]);
275327
let registry = ExpressionAnalyzerRegistry::new();
276328

277329
let lit =
@@ -301,7 +353,7 @@ impl ExpressionAnalyzer for FixedSelectivityAnalyzer {
301353

302354
#[test]
303355
fn test_custom_analyzer_overrides_default() {
304-
let stats = make_stats_with_ndv(1000, 100);
356+
let stats = make_stats_with_ndvs(1000, &[100]);
305357
let col = Arc::new(Column::new("a", 0)) as Arc<dyn PhysicalExpr>;
306358
let lit =
307359
Arc::new(Literal::new(ScalarValue::Int32(Some(42)))) as Arc<dyn PhysicalExpr>;
@@ -336,7 +388,7 @@ impl ExpressionAnalyzer for ColumnAOnlyAnalyzer {
336388

337389
#[test]
338390
fn test_custom_analyzer_delegates_to_default() {
339-
let stats = make_stats_with_ndv(1000, 100);
391+
let stats = make_stats_with_ndvs(1000, &[100]);
340392
let col_a = Arc::new(Column::new("a", 0)) as Arc<dyn PhysicalExpr>;
341393
let col_b = Arc::new(Column::new("b", 0)) as Arc<dyn PhysicalExpr>;
342394
let lit =
@@ -359,7 +411,7 @@ fn test_custom_analyzer_delegates_to_default() {
359411

360412
#[test]
361413
fn test_with_analyzers_and_default() {
362-
let stats = make_stats_with_ndv(1000, 100);
414+
let stats = make_stats_with_ndvs(1000, &[100]);
363415
let col = Arc::new(Column::new("a", 0)) as Arc<dyn PhysicalExpr>;
364416

365417
let registry =

0 commit comments

Comments
 (0)