Skip to content

Commit aa1db03

Browse files
Dandandan and claude committed
Use generic evaluate_bounds for statistics propagation instead of CAST-specific code
Replace the CAST-specific statistics propagation with a generic approach using PhysicalExpr::evaluate_bounds(). This works for any expression that implements evaluate_bounds — including CAST, negation, and arithmetic with literals.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 38736c2 commit aa1db03

File tree

1 file changed

+92
-26
lines changed

1 file changed

+92
-26
lines changed

datafusion/physical-expr/src/projection.rs

Lines changed: 92 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ use std::ops::Deref;
2121
use std::sync::Arc;
2222

2323
use crate::PhysicalExpr;
24-
use crate::expressions::{CastExpr, Column, Literal};
24+
use crate::expressions::{Column, Literal};
2525
use crate::scalar_function::ScalarFunctionExpr;
2626
use crate::utils::collect_columns;
2727

@@ -33,6 +33,7 @@ use datafusion_common::{
3333
Result, ScalarValue, Statistics, assert_or_internal_err, internal_datafusion_err,
3434
plan_err,
3535
};
36+
use datafusion_expr_common::interval_arithmetic::Interval;
3637

3738
use datafusion_physical_expr_common::metrics::ExecutionPlanMetricsSet;
3839
use datafusion_physical_expr_common::metrics::ExpressionEvaluatorMetrics;
@@ -729,35 +730,41 @@ impl ProjectionExprs {
729730
}
730731
}
731732

732-
/// Propagate column statistics through expressions that preserve order.
733+
/// Propagate min/max statistics through an expression using the
734+
/// [`PhysicalExpr::evaluate_bounds`] interval arithmetic system.
733735
///
734-
/// Currently handles:
735-
/// - `Column` references (direct passthrough)
736-
/// - `CAST` expressions (casts min/max values to the target type)
737-
///
738-
/// For other expressions, returns unknown statistics.
736+
/// This works for any expression that implements `evaluate_bounds`,
737+
/// including `CAST`, negation, and arithmetic with literals.
738+
/// For expressions that don't implement it, returns unknown statistics.
739739
fn project_column_statistics_through_expr(
740740
expr: &dyn PhysicalExpr,
741741
column_stats: &mut [ColumnStatistics],
742742
) -> ColumnStatistics {
743743
if let Some(col) = expr.as_any().downcast_ref::<Column>() {
744-
std::mem::take(&mut column_stats[col.index()])
745-
} else if let Some(cast_expr) = expr.as_any().downcast_ref::<CastExpr>() {
746-
let inner_stats =
747-
project_column_statistics_through_expr(cast_expr.expr.as_ref(), column_stats);
748-
let target_type = cast_expr.cast_type();
744+
return std::mem::take(&mut column_stats[col.index()]);
745+
}
746+
747+
if let Some(interval) = compute_bounds_from_stats(expr, column_stats) {
748+
let all_exact = leaf_stats_all_exact(expr, column_stats);
749+
let lower = interval.lower().clone();
750+
let upper = interval.upper().clone();
749751
ColumnStatistics {
750-
min_value: inner_stats
751-
.min_value
752-
.cast_to(target_type)
753-
.unwrap_or(Precision::Absent),
754-
max_value: inner_stats
755-
.max_value
756-
.cast_to(target_type)
757-
.unwrap_or(Precision::Absent),
758-
null_count: inner_stats.null_count,
759-
distinct_count: inner_stats.distinct_count,
760-
// Sum and byte size change under CAST, don't propagate
752+
min_value: if lower.is_null() {
753+
Precision::Absent
754+
} else if all_exact {
755+
Precision::Exact(lower)
756+
} else {
757+
Precision::Inexact(lower)
758+
},
759+
max_value: if upper.is_null() {
760+
Precision::Absent
761+
} else if all_exact {
762+
Precision::Exact(upper)
763+
} else {
764+
Precision::Inexact(upper)
765+
},
766+
null_count: Precision::Absent,
767+
distinct_count: Precision::Absent,
761768
sum_value: Precision::Absent,
762769
byte_size: Precision::Absent,
763770
}
@@ -766,6 +773,61 @@ fn project_column_statistics_through_expr(
766773
}
767774
}
768775

776+
/// Recursively compute the output [`Interval`] for an expression from column
777+
/// statistics, using [`PhysicalExpr::evaluate_bounds`].
778+
fn compute_bounds_from_stats(
779+
expr: &dyn PhysicalExpr,
780+
column_stats: &[ColumnStatistics],
781+
) -> Option<Interval> {
782+
if let Some(col) = expr.as_any().downcast_ref::<Column>() {
783+
let stats = &column_stats[col.index()];
784+
let min = stats.min_value.get_value()?;
785+
let max = stats.max_value.get_value()?;
786+
return Interval::try_new(min.clone(), max.clone()).ok();
787+
}
788+
789+
if let Some(lit) = expr.as_any().downcast_ref::<Literal>() {
790+
let val = lit.value();
791+
return Interval::try_new(val.clone(), val.clone()).ok();
792+
}
793+
794+
let children = expr.children();
795+
let child_intervals: Option<Vec<Interval>> = children
796+
.iter()
797+
.map(|c| compute_bounds_from_stats(c.as_ref(), column_stats))
798+
.collect();
799+
let child_intervals = child_intervals?;
800+
let child_refs: Vec<&Interval> = child_intervals.iter().collect();
801+
802+
expr.evaluate_bounds(&child_refs).ok()
803+
}
804+
805+
/// Returns true if all leaf column statistics referenced by the expression
806+
/// have exact min/max values.
807+
fn leaf_stats_all_exact(
808+
expr: &dyn PhysicalExpr,
809+
column_stats: &[ColumnStatistics],
810+
) -> bool {
811+
if let Some(col) = expr.as_any().downcast_ref::<Column>() {
812+
return column_stats[col.index()]
813+
.min_value
814+
.is_exact()
815+
.unwrap_or(false)
816+
&& column_stats[col.index()]
817+
.max_value
818+
.is_exact()
819+
.unwrap_or(false);
820+
}
821+
822+
if expr.as_any().downcast_ref::<Literal>().is_some() {
823+
return true;
824+
}
825+
826+
expr.children()
827+
.iter()
828+
.all(|c| leaf_stats_all_exact(c.as_ref(), column_stats))
829+
}
830+
769831
impl<'a> IntoIterator for &'a ProjectionExprs {
770832
type Item = &'a ProjectionExpr;
771833
type IntoIter = std::slice::Iter<'a, ProjectionExpr>;
@@ -2824,13 +2886,17 @@ pub(crate) mod tests {
28242886
// Should have 2 column statistics
28252887
assert_eq!(output_stats.column_statistics.len(), 2);
28262888

2827-
// First column (expression) should have unknown statistics
2889+
// First column (col0 + 1) should have propagated min/max via evaluate_bounds
28282890
assert_eq!(
2829-
output_stats.column_statistics[0].distinct_count,
2830-
Precision::Absent
2891+
output_stats.column_statistics[0].min_value,
2892+
Precision::Exact(ScalarValue::Int64(Some(-3)))
28312893
);
28322894
assert_eq!(
28332895
output_stats.column_statistics[0].max_value,
2896+
Precision::Exact(ScalarValue::Int64(Some(22)))
2897+
);
2898+
assert_eq!(
2899+
output_stats.column_statistics[0].distinct_count,
28342900
Precision::Absent
28352901
);
28362902

0 commit comments

Comments
 (0)