@@ -21,7 +21,7 @@ use std::ops::Deref;
2121use std:: sync:: Arc ;
2222
2323use crate :: PhysicalExpr ;
24- use crate :: expressions:: { CastExpr , Column , Literal } ;
24+ use crate :: expressions:: { Column , Literal } ;
2525use crate :: scalar_function:: ScalarFunctionExpr ;
2626use crate :: utils:: collect_columns;
2727
@@ -33,6 +33,7 @@ use datafusion_common::{
3333 Result , ScalarValue , Statistics , assert_or_internal_err, internal_datafusion_err,
3434 plan_err,
3535} ;
36+ use datafusion_expr_common:: interval_arithmetic:: Interval ;
3637
3738use datafusion_physical_expr_common:: metrics:: ExecutionPlanMetricsSet ;
3839use datafusion_physical_expr_common:: metrics:: ExpressionEvaluatorMetrics ;
@@ -729,35 +730,41 @@ impl ProjectionExprs {
729730 }
730731}
731732
732- /// Propagate column statistics through expressions that preserve order.
733+ /// Propagate min/max statistics through an expression using the
734+ /// [`PhysicalExpr::evaluate_bounds`] interval arithmetic system.
733735///
734- /// Currently handles:
735- /// - `Column` references (direct passthrough)
736- /// - `CAST` expressions (casts min/max values to the target type)
737- ///
738- /// For other expressions, returns unknown statistics.
736+ /// This works for any expression that implements `evaluate_bounds`,
737+ /// including `CAST`, negation, and arithmetic with literals.
738+ /// For expressions that don't implement it, returns unknown statistics.
739739fn project_column_statistics_through_expr (
740740 expr : & dyn PhysicalExpr ,
741741 column_stats : & mut [ ColumnStatistics ] ,
742742) -> ColumnStatistics {
743743 if let Some ( col) = expr. as_any ( ) . downcast_ref :: < Column > ( ) {
744- std:: mem:: take ( & mut column_stats[ col. index ( ) ] )
745- } else if let Some ( cast_expr) = expr. as_any ( ) . downcast_ref :: < CastExpr > ( ) {
746- let inner_stats =
747- project_column_statistics_through_expr ( cast_expr. expr . as_ref ( ) , column_stats) ;
748- let target_type = cast_expr. cast_type ( ) ;
744+ return std:: mem:: take ( & mut column_stats[ col. index ( ) ] ) ;
745+ }
746+
747+ if let Some ( interval) = compute_bounds_from_stats ( expr, column_stats) {
748+ let all_exact = leaf_stats_all_exact ( expr, column_stats) ;
749+ let lower = interval. lower ( ) . clone ( ) ;
750+ let upper = interval. upper ( ) . clone ( ) ;
749751 ColumnStatistics {
750- min_value : inner_stats
751- . min_value
752- . cast_to ( target_type)
753- . unwrap_or ( Precision :: Absent ) ,
754- max_value : inner_stats
755- . max_value
756- . cast_to ( target_type)
757- . unwrap_or ( Precision :: Absent ) ,
758- null_count : inner_stats. null_count ,
759- distinct_count : inner_stats. distinct_count ,
760- // Sum and byte size change under CAST, don't propagate
752+ min_value : if lower. is_null ( ) {
753+ Precision :: Absent
754+ } else if all_exact {
755+ Precision :: Exact ( lower)
756+ } else {
757+ Precision :: Inexact ( lower)
758+ } ,
759+ max_value : if upper. is_null ( ) {
760+ Precision :: Absent
761+ } else if all_exact {
762+ Precision :: Exact ( upper)
763+ } else {
764+ Precision :: Inexact ( upper)
765+ } ,
766+ null_count : Precision :: Absent ,
767+ distinct_count : Precision :: Absent ,
761768 sum_value : Precision :: Absent ,
762769 byte_size : Precision :: Absent ,
763770 }
@@ -766,6 +773,61 @@ fn project_column_statistics_through_expr(
766773 }
767774}
768775
776+ /// Recursively compute the output [`Interval`] for an expression from column
777+ /// statistics, using [`PhysicalExpr::evaluate_bounds`].
778+ fn compute_bounds_from_stats (
779+ expr : & dyn PhysicalExpr ,
780+ column_stats : & [ ColumnStatistics ] ,
781+ ) -> Option < Interval > {
782+ if let Some ( col) = expr. as_any ( ) . downcast_ref :: < Column > ( ) {
783+ let stats = & column_stats[ col. index ( ) ] ;
784+ let min = stats. min_value . get_value ( ) ?;
785+ let max = stats. max_value . get_value ( ) ?;
786+ return Interval :: try_new ( min. clone ( ) , max. clone ( ) ) . ok ( ) ;
787+ }
788+
789+ if let Some ( lit) = expr. as_any ( ) . downcast_ref :: < Literal > ( ) {
790+ let val = lit. value ( ) ;
791+ return Interval :: try_new ( val. clone ( ) , val. clone ( ) ) . ok ( ) ;
792+ }
793+
794+ let children = expr. children ( ) ;
795+ let child_intervals: Option < Vec < Interval > > = children
796+ . iter ( )
797+ . map ( |c| compute_bounds_from_stats ( c. as_ref ( ) , column_stats) )
798+ . collect ( ) ;
799+ let child_intervals = child_intervals?;
800+ let child_refs: Vec < & Interval > = child_intervals. iter ( ) . collect ( ) ;
801+
802+ expr. evaluate_bounds ( & child_refs) . ok ( )
803+ }
804+
805+ /// Returns true if all leaf column statistics referenced by the expression
806+ /// have exact min/max values.
807+ fn leaf_stats_all_exact (
808+ expr : & dyn PhysicalExpr ,
809+ column_stats : & [ ColumnStatistics ] ,
810+ ) -> bool {
811+ if let Some ( col) = expr. as_any ( ) . downcast_ref :: < Column > ( ) {
812+ return column_stats[ col. index ( ) ]
813+ . min_value
814+ . is_exact ( )
815+ . unwrap_or ( false )
816+ && column_stats[ col. index ( ) ]
817+ . max_value
818+ . is_exact ( )
819+ . unwrap_or ( false ) ;
820+ }
821+
822+ if expr. as_any ( ) . downcast_ref :: < Literal > ( ) . is_some ( ) {
823+ return true ;
824+ }
825+
826+ expr. children ( )
827+ . iter ( )
828+ . all ( |c| leaf_stats_all_exact ( c. as_ref ( ) , column_stats) )
829+ }
830+
769831impl < ' a > IntoIterator for & ' a ProjectionExprs {
770832 type Item = & ' a ProjectionExpr ;
771833 type IntoIter = std:: slice:: Iter < ' a , ProjectionExpr > ;
@@ -2824,13 +2886,17 @@ pub(crate) mod tests {
28242886 // Should have 2 column statistics
28252887 assert_eq ! ( output_stats. column_statistics. len( ) , 2 ) ;
28262888
2827- // First column (expression ) should have unknown statistics
2889+ // First column (col0 + 1 ) should have propagated min/max via evaluate_bounds
28282890 assert_eq ! (
2829- output_stats. column_statistics[ 0 ] . distinct_count ,
2830- Precision :: Absent
2891+ output_stats. column_statistics[ 0 ] . min_value ,
2892+ Precision :: Exact ( ScalarValue :: Int64 ( Some ( - 3 ) ) )
28312893 ) ;
28322894 assert_eq ! (
28332895 output_stats. column_statistics[ 0 ] . max_value,
2896+ Precision :: Exact ( ScalarValue :: Int64 ( Some ( 22 ) ) )
2897+ ) ;
2898+ assert_eq ! (
2899+ output_stats. column_statistics[ 0 ] . distinct_count,
28342900 Precision :: Absent
28352901 ) ;
28362902
0 commit comments