Rename enable_expression_analyzer to use_expression_analyzer and clarify its scope in config and doc comments

asolimando · asolimando · commit 9b91988779ad · 2026-04-15T18:52:05.000+02:00
diff --git a/datafusion/common/src/config.rs b/datafusion/common/src/config.rs
@@ -1113,11 +1113,12 @@ config_namespace! {
         /// So if you disable `enable_topk_dynamic_filter_pushdown`, then enable `enable_dynamic_filter_pushdown`, the `enable_topk_dynamic_filter_pushdown` will be overridden.
         pub enable_dynamic_filter_pushdown: bool, default = true
 
-        /// When set to true, the physical planner will use the ExpressionAnalyzer
+        /// When set to true, the physical planner uses the ExpressionAnalyzer
         /// framework for expression-level statistics estimation (NDV, selectivity,
-        /// min/max, null fraction). When false, existing behavior without
-        /// expression-level statistics support is used.
-        pub enable_expression_analyzer: bool, default = false
+        /// min/max, null fraction). When `use_statistics_registry` is also enabled,
+        /// the registry providers (filters, projections) also use it.
+        /// When false, existing behavior is unchanged.
+        pub use_expression_analyzer: bool, default = false
 
         /// When set to true, the optimizer will insert filters before a join between
         /// a nullable and non-nullable column to filter out nulls on the nullable side. This
diff --git a/datafusion/core/src/physical_planner.rs b/datafusion/core/src/physical_planner.rs
@@ -2865,7 +2865,7 @@ impl DefaultPhysicalPlanner {
         session_state
             .config_options()
             .optimizer
-            .enable_expression_analyzer
+            .use_expression_analyzer
             .then(|| Arc::clone(session_state.expression_analyzer_registry()))
     }
 
@@ -2949,7 +2949,7 @@ impl DefaultPhysicalPlanner {
                 if session_state
                     .config_options()
                     .optimizer
-                    .enable_expression_analyzer
+                    .use_expression_analyzer
                 {
                     new_proj_exec = new_proj_exec.with_expression_analyzer_registry(
                         Arc::clone(session_state.expression_analyzer_registry()),
diff --git a/datafusion/physical-expr/src/projection.rs b/datafusion/physical-expr/src/projection.rs
@@ -201,12 +201,9 @@ impl ProjectionExprs {
 
     /// Set the expression analyzer registry for statistics estimation.
     ///
-    /// The physical planner injects the registry from `SessionState` when
-    /// creating projections. Projections created later by optimizer rules
-    /// do not receive the registry and fall back to
-    /// `DefaultExpressionAnalyzer`. Propagating the registry to all
-    /// operator construction sites requires an operator-level statistics
-    /// registry, which is orthogonal to this work.
+    /// The physical planner injects the registry at plan creation time and
+    /// re-injects it after each physical optimizer rule, so projections
+    /// created by optimizer rules also receive the registry.
     pub fn with_expression_analyzer_registry(
         mut self,
         registry: Arc<ExpressionAnalyzerRegistry>,
@@ -883,6 +880,11 @@ impl Projector {
     ) {
         self.projection.expression_analyzer_registry = Some(registry);
     }
+
+    /// Returns the expression analyzer registry stored on the inner projection, or `None` if unset.
+    pub fn expression_analyzer_registry(&self) -> Option<&ExpressionAnalyzerRegistry> {
+        self.projection.expression_analyzer_registry.as_deref()
+    }
 }
 
 /// Describes an immutable reference counted projection.
diff --git a/datafusion/physical-plan/src/filter.rs b/datafusion/physical-plan/src/filter.rs
@@ -183,13 +183,12 @@ impl FilterExecBuilder {
 
     /// Set the expression analyzer registry for selectivity estimation.
     ///
-    /// Same limitation as [`ProjectionExprs::with_expression_analyzer_registry`]:
-    /// the planner injects this from `SessionState`, but filters created
-    /// by optimizer rules (e.g., filter pushdown into unions) fall back to
-    /// the default selectivity. An operator-level statistics registry is
-    /// needed for full coverage.
-    ///
-    /// [`ProjectionExprs::with_expression_analyzer_registry`]: datafusion_physical_expr::projection::ProjectionExprs::with_expression_analyzer_registry
+    /// The physical planner injects the registry from `SessionState` when
+    /// creating filters. When `use_statistics_registry` is also enabled,
+    /// [`FilterStatisticsProvider`](crate::operator_statistics::FilterStatisticsProvider)
+    /// uses this registry for all filters it handles. Filters created by
+    /// optimizer rules that do not call this method fall back to the
+    /// default selectivity.
     pub fn with_expression_analyzer_registry(
         mut self,
         registry: Arc<
@@ -338,6 +337,13 @@ impl FilterExec {
         &self.projection
     }
 
+    /// Returns the expression analyzer registry for selectivity estimation, or `None` if unset.
+    pub fn expression_analyzer_registry(
+        &self,
+    ) -> Option<&datafusion_physical_expr::expression_analyzer::ExpressionAnalyzerRegistry> {
+        self.expression_analyzer_registry.as_deref()
+    }
+
     /// Calculates `Statistics` for `FilterExec`, by applying selectivity (either default, or estimated) to input statistics.
     pub(crate) fn statistics_helper(
         schema: &SchemaRef,
diff --git a/datafusion/physical-plan/src/operator_statistics/mod.rs b/datafusion/physical-plan/src/operator_statistics/mod.rs
@@ -549,7 +549,7 @@ impl StatisticsProvider for FilterStatisticsProvider {
             input_stats,
             filter.predicate(),
             filter.default_selectivity(),
-            // TODO: pass filter.expression_analyzer_registry() once #21122 lands
+            filter.expression_analyzer_registry(),
         )?;
 
         // Adjust distinct_count for each column using the selectivity ratio
@@ -600,8 +600,6 @@ impl StatisticsProvider for ProjectionStatisticsProvider {
 
         let input_stats = (*child_stats[0].base).clone();
         let output_schema = proj.schema();
-        // TODO: pass proj.expression_analyzer_registry() once #21122 lands,
-        // so expression-level NDV/min/max feeds into projected column stats.
         let stats = proj
             .projection_expr()
             .project_statistics(input_stats, &output_schema)?;
diff --git a/datafusion/physical-plan/src/projection.rs b/datafusion/physical-plan/src/projection.rs
@@ -189,6 +189,14 @@ impl ProjectionExec {
         self
     }
 
+    /// Returns the expression analyzer registry held by this node's projector, if one is set.
+    pub fn expression_analyzer_registry(
+        &self,
+    ) -> Option<&datafusion_physical_expr::expression_analyzer::ExpressionAnalyzerRegistry>
+    {
+        self.projector.expression_analyzer_registry()
+    }
+
     /// This function creates the cache object that stores the plan properties such as schema, equivalence properties, ordering, partitioning, etc.
     fn compute_properties(
         input: &Arc<dyn ExecutionPlan>,
diff --git a/datafusion/sqllogictest/test_files/information_schema.slt b/datafusion/sqllogictest/test_files/information_schema.slt
@@ -300,7 +300,7 @@ datafusion.optimizer.default_filter_selectivity 20
 datafusion.optimizer.enable_aggregate_dynamic_filter_pushdown true
 datafusion.optimizer.enable_distinct_aggregation_soft_limit true
 datafusion.optimizer.enable_dynamic_filter_pushdown true
-datafusion.optimizer.enable_expression_analyzer false
+datafusion.optimizer.use_expression_analyzer false
 datafusion.optimizer.enable_join_dynamic_filter_pushdown true
 datafusion.optimizer.enable_leaf_expression_pushdown true
 datafusion.optimizer.enable_piecewise_merge_join false
@@ -446,7 +446,7 @@ datafusion.optimizer.default_filter_selectivity 20 The default filter selectivit
 datafusion.optimizer.enable_aggregate_dynamic_filter_pushdown true When set to true, the optimizer will attempt to push down Aggregate dynamic filters into the file scan phase.
 datafusion.optimizer.enable_distinct_aggregation_soft_limit true When set to true, the optimizer will push a limit operation into grouped aggregations which have no aggregate expressions, as a soft limit, emitting groups once the limit is reached, before all rows in the group are read.
 datafusion.optimizer.enable_dynamic_filter_pushdown true When set to true attempts to push down dynamic filters generated by operators (TopK, Join & Aggregate) into the file scan phase. For example, for a query such as `SELECT * FROM t ORDER BY timestamp DESC LIMIT 10`, the optimizer will attempt to push down the current top 10 timestamps that the TopK operator references into the file scans. This means that if we already have 10 timestamps in the year 2025 any files that only have timestamps in the year 2024 can be skipped / pruned at various stages in the scan. The config will suppress `enable_join_dynamic_filter_pushdown`, `enable_topk_dynamic_filter_pushdown` & `enable_aggregate_dynamic_filter_pushdown` So if you disable `enable_topk_dynamic_filter_pushdown`, then enable `enable_dynamic_filter_pushdown`, the `enable_topk_dynamic_filter_pushdown` will be overridden.
-datafusion.optimizer.enable_expression_analyzer false When set to true, the physical planner will use the ExpressionAnalyzer framework for expression-level statistics estimation (NDV, selectivity, min/max, null fraction). When false, existing behavior without expression-level statistics support is used.
+datafusion.optimizer.use_expression_analyzer false When set to true, the physical planner uses the ExpressionAnalyzer framework for expression-level statistics estimation (NDV, selectivity, min/max, null fraction). When `use_statistics_registry` is also enabled, the registry providers (filters, projections) also use it. When false, existing behavior is unchanged.
 datafusion.optimizer.enable_join_dynamic_filter_pushdown true When set to true, the optimizer will attempt to push down Join dynamic filters into the file scan phase.
 datafusion.optimizer.enable_leaf_expression_pushdown true When set to true, the optimizer will extract leaf expressions (such as `get_field`) from filter/sort/join nodes into projections closer to the leaf table scans, and push those projections down towards the leaf nodes.
 datafusion.optimizer.enable_piecewise_merge_join false When set to true, piecewise merge join is enabled. PiecewiseMergeJoin is currently experimental. Physical planner will opt for PiecewiseMergeJoin when there is only one range filter.
diff --git a/docs/source/user-guide/configs.md b/docs/source/user-guide/configs.md
@@ -145,7 +145,7 @@ The following configuration settings are available:
 | datafusion.optimizer.enable_join_dynamic_filter_pushdown                | true                      | When set to true, the optimizer will attempt to push down Join dynamic filters into the file scan phase.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     |
 | datafusion.optimizer.enable_aggregate_dynamic_filter_pushdown           | true                      | When set to true, the optimizer will attempt to push down Aggregate dynamic filters into the file scan phase.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                |
 | datafusion.optimizer.enable_dynamic_filter_pushdown                     | true                      | When set to true attempts to push down dynamic filters generated by operators (TopK, Join & Aggregate) into the file scan phase. For example, for a query such as `SELECT * FROM t ORDER BY timestamp DESC LIMIT 10`, the optimizer will attempt to push down the current top 10 timestamps that the TopK operator references into the file scans. This means that if we already have 10 timestamps in the year 2025 any files that only have timestamps in the year 2024 can be skipped / pruned at various stages in the scan. The config will suppress `enable_join_dynamic_filter_pushdown`, `enable_topk_dynamic_filter_pushdown` & `enable_aggregate_dynamic_filter_pushdown` So if you disable `enable_topk_dynamic_filter_pushdown`, then enable `enable_dynamic_filter_pushdown`, the `enable_topk_dynamic_filter_pushdown` will be overridden.                                                                                                                                                                                                                                                                                                                                                                                     |
-| datafusion.optimizer.enable_expression_analyzer                         | false                     | When set to true, the physical planner will use the ExpressionAnalyzer framework for expression-level statistics estimation (NDV, selectivity, min/max, null fraction). When false, existing behavior without expression-level statistics support is used.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   |
+| datafusion.optimizer.use_expression_analyzer                            | false                     | When set to true, the physical planner uses the ExpressionAnalyzer framework for expression-level statistics estimation (NDV, selectivity, min/max, null fraction). When `use_statistics_registry` is also enabled, the registry providers (filters, projections) also use it. When false, existing behavior is unchanged.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   |
 | datafusion.optimizer.filter_null_join_keys                              | false                     | When set to true, the optimizer will insert filters before a join between a nullable and non-nullable column to filter out nulls on the nullable side. This filter can add additional overhead when the file format does not fully support predicate push down.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              |
 | datafusion.optimizer.repartition_aggregations                           | true                      | Should DataFusion repartition data using the aggregate keys to execute aggregates in parallel using the provided `target_partitions` level                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   |
 | datafusion.optimizer.repartition_file_min_size                          | 10485760                  | Minimum total files size in bytes to perform file scan repartitioning.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       |