Skip to content

Commit cad3865

Browse files
authored
fix: correct weight handling in approx_percentile_cont_with_weight (#19941)
The approx_percentile_cont_with_weight function was producing incorrect results due to wrong weight handling in the TDigest implementation.

Root cause: In TDigest::new_with_centroid(), the count field was hardcoded to 1 regardless of the actual centroid weight, while the weight was correctly used in the sum calculation. This mismatch caused incorrect percentile calculations since estimate_quantile() uses count to compute the rank.

Changes:
- Changed TDigest::count from u64 to f64 to properly support fractional weights (consistent with ClickHouse's TDigest implementation)
- Fixed new_with_centroid() to use centroid.weight for count
- Updated state_fields() in approx_percentile_cont and approx_median to use Float64 for the count field
- Added early return in merge_digests() when all centroids have zero weight to prevent panic
- Updated test expectations to reflect correct weighted percentile behavior

## Which issue does this PR close?

- Closes #19940

## Rationale for this change

The `approx_percentile_cont_with_weight` function produces incorrect weighted percentile results. The bug is in the TDigest implementation where `new_with_centroid()` sets `count: 1` regardless of the actual centroid weight, while the weight is used elsewhere in centroid merging. This mismatch corrupts the percentile calculation.

## What changes are included in this PR?
- Changed `TDigest::count` from `u64` to `f64` to properly support fractional weights (consistent with [ClickHouse's TDigest implementation](https://github.com/ClickHouse/ClickHouse/blob/927af1255adb37ace1b95cc3ec4316553b4cb4b4/src/AggregateFunctions/QuantileTDigest.h#L71-L87))
- Fixed `new_with_centroid()` to use `centroid.weight` for count
- Updated `state_fields()` in `approx_percentile_cont` and `approx_median` to use `Float64` for the count field
- Added early return in `merge_digests()` when all centroids have zero weight to prevent panic
- Updated test expectations to reflect correct weighted percentile behavior

## Are these changes tested?

Yes.
- All existing unit tests in tdigest.rs pass (7 tests)
- All SQL logic tests for aggregate functions pass
- Manual testing confirms correct behavior with various weight distributions (equal weights, heavy low/high values, linear weights, fractional weights)

## Are there any user-facing changes?

Yes, this is a breaking change:
1. Result changes: approx_percentile_cont_with_weight now returns correct weighted percentiles. Queries relying on the previous (incorrect) behavior will see different results.
2. Serialized state format change: The TDigest state field count changes from UInt64 to Float64. Any existing serialized/checkpointed TDigest state will be incompatible and cannot be restored.
3. Edge case behavior change: When all weights are zero, the function now returns NULL instead of the previous undefined behavior.
1 parent f0de02f commit cad3865

4 files changed

Lines changed: 40 additions & 45 deletions

File tree

datafusion/functions-aggregate-common/src/tdigest.rs

Lines changed: 24 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -49,17 +49,6 @@ macro_rules! cast_scalar_f64 {
4949
};
5050
}
5151

52-
// Cast a non-null [`ScalarValue::UInt64`] to an [`u64`], or
53-
// panic.
54-
macro_rules! cast_scalar_u64 {
55-
($value:expr ) => {
56-
match &$value {
57-
ScalarValue::UInt64(Some(v)) => *v,
58-
v => panic!("invalid type {}", v),
59-
}
60-
};
61-
}
62-
6352
/// Centroid implementation to the cluster mentioned in the paper.
6453
#[derive(Debug, PartialEq, Clone)]
6554
pub struct Centroid {
@@ -110,7 +99,7 @@ pub struct TDigest {
11099
centroids: Vec<Centroid>,
111100
max_size: usize,
112101
sum: f64,
113-
count: u64,
102+
count: f64,
114103
max: f64,
115104
min: f64,
116105
}
@@ -120,8 +109,8 @@ impl TDigest {
120109
TDigest {
121110
centroids: Vec::new(),
122111
max_size,
123-
sum: 0_f64,
124-
count: 0,
112+
sum: 0.0,
113+
count: 0.0,
125114
max: f64::NAN,
126115
min: f64::NAN,
127116
}
@@ -133,14 +122,14 @@ impl TDigest {
133122
centroids: vec![centroid.clone()],
134123
max_size,
135124
sum: centroid.mean * centroid.weight,
136-
count: 1,
125+
count: centroid.weight,
137126
max: centroid.mean,
138127
min: centroid.mean,
139128
}
140129
}
141130

142131
#[inline]
143-
pub fn count(&self) -> u64 {
132+
pub fn count(&self) -> f64 {
144133
self.count
145134
}
146135

@@ -170,8 +159,8 @@ impl Default for TDigest {
170159
TDigest {
171160
centroids: Vec::new(),
172161
max_size: 100,
173-
sum: 0_f64,
174-
count: 0,
162+
sum: 0.0,
163+
count: 0.0,
175164
max: f64::NAN,
176165
min: f64::NAN,
177166
}
@@ -216,12 +205,12 @@ impl TDigest {
216205
}
217206

218207
let mut result = TDigest::new(self.max_size());
219-
result.count = self.count() + sorted_values.len() as u64;
208+
result.count = self.count() + sorted_values.len() as f64;
220209

221210
let maybe_min = *sorted_values.first().unwrap();
222211
let maybe_max = *sorted_values.last().unwrap();
223212

224-
if self.count() > 0 {
213+
if self.count() > 0.0 {
225214
result.min = self.min.min(maybe_min);
226215
result.max = self.max.max(maybe_max);
227216
} else {
@@ -233,7 +222,7 @@ impl TDigest {
233222

234223
let mut k_limit: u64 = 1;
235224
let mut q_limit_times_count =
236-
Self::k_to_q(k_limit, self.max_size) * result.count() as f64;
225+
Self::k_to_q(k_limit, self.max_size) * result.count();
237226
k_limit += 1;
238227

239228
let mut iter_centroids = self.centroids.iter().peekable();
@@ -281,7 +270,7 @@ impl TDigest {
281270

282271
compressed.push(curr.clone());
283272
q_limit_times_count =
284-
Self::k_to_q(k_limit, self.max_size) * result.count() as f64;
273+
Self::k_to_q(k_limit, self.max_size) * result.count();
285274
k_limit += 1;
286275
curr = next;
287276
}
@@ -353,7 +342,7 @@ impl TDigest {
353342
let mut centroids: Vec<Centroid> = Vec::with_capacity(n_centroids);
354343
let mut starts: Vec<usize> = Vec::with_capacity(digests.len());
355344

356-
let mut count = 0;
345+
let mut count = 0.0;
357346
let mut min = f64::INFINITY;
358347
let mut max = f64::NEG_INFINITY;
359348

@@ -362,7 +351,7 @@ impl TDigest {
362351
starts.push(start);
363352

364353
let curr_count = digest.count();
365-
if curr_count > 0 {
354+
if curr_count > 0.0 {
366355
min = min.min(digest.min);
367356
max = max.max(digest.max);
368357
count += curr_count;
@@ -373,6 +362,11 @@ impl TDigest {
373362
}
374363
}
375364

365+
// If no centroids were added (all digests had zero count), return default
366+
if centroids.is_empty() {
367+
return TDigest::default();
368+
}
369+
376370
let mut digests_per_block: usize = 1;
377371
while digests_per_block < starts.len() {
378372
for i in (0..starts.len()).step_by(digests_per_block * 2) {
@@ -397,7 +391,7 @@ impl TDigest {
397391
let mut compressed: Vec<Centroid> = Vec::with_capacity(max_size);
398392

399393
let mut k_limit = 1;
400-
let mut q_limit_times_count = Self::k_to_q(k_limit, max_size) * count as f64;
394+
let mut q_limit_times_count = Self::k_to_q(k_limit, max_size) * count;
401395

402396
let mut iter_centroids = centroids.iter_mut();
403397
let mut curr = iter_centroids.next().unwrap();
@@ -416,7 +410,7 @@ impl TDigest {
416410
sums_to_merge = 0_f64;
417411
weights_to_merge = 0_f64;
418412
compressed.push(curr.clone());
419-
q_limit_times_count = Self::k_to_q(k_limit, max_size) * count as f64;
413+
q_limit_times_count = Self::k_to_q(k_limit, max_size) * count;
420414
k_limit += 1;
421415
curr = centroid;
422416
}
@@ -440,7 +434,7 @@ impl TDigest {
440434
return 0.0;
441435
}
442436

443-
let rank = q * self.count as f64;
437+
let rank = q * self.count;
444438

445439
let mut pos: usize;
446440
let mut t;
@@ -450,7 +444,7 @@ impl TDigest {
450444
}
451445

452446
pos = 0;
453-
t = self.count as f64;
447+
t = self.count;
454448

455449
for (k, centroid) in self.centroids.iter().enumerate().rev() {
456450
t -= centroid.weight();
@@ -563,7 +557,7 @@ impl TDigest {
563557
vec![
564558
ScalarValue::UInt64(Some(self.max_size as u64)),
565559
ScalarValue::Float64(Some(self.sum)),
566-
ScalarValue::UInt64(Some(self.count)),
560+
ScalarValue::Float64(Some(self.count)),
567561
ScalarValue::Float64(Some(self.max)),
568562
ScalarValue::Float64(Some(self.min)),
569563
ScalarValue::List(arr),
@@ -611,7 +605,7 @@ impl TDigest {
611605
Self {
612606
max_size,
613607
sum: cast_scalar_f64!(state[1]),
614-
count: cast_scalar_u64!(&state[2]),
608+
count: cast_scalar_f64!(state[2]),
615609
max,
616610
min,
617611
centroids,

datafusion/functions-aggregate/src/approx_median.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -110,7 +110,7 @@ impl AggregateUDFImpl for ApproxMedian {
110110
Ok(vec![
111111
Field::new(format_state_name(args.name, "max_size"), UInt64, false),
112112
Field::new(format_state_name(args.name, "sum"), Float64, false),
113-
Field::new(format_state_name(args.name, "count"), UInt64, false),
113+
Field::new(format_state_name(args.name, "count"), Float64, false),
114114
Field::new(format_state_name(args.name, "max"), Float64, false),
115115
Field::new(format_state_name(args.name, "min"), Float64, false),
116116
Field::new_list(

datafusion/functions-aggregate/src/approx_percentile_cont.rs

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -259,7 +259,7 @@ impl AggregateUDFImpl for ApproxPercentileCont {
259259
),
260260
Field::new(
261261
format_state_name(args.name, "count"),
262-
DataType::UInt64,
262+
DataType::Float64,
263263
false,
264264
),
265265
Field::new(
@@ -436,7 +436,7 @@ impl Accumulator for ApproxPercentileAccumulator {
436436
}
437437

438438
fn evaluate(&mut self) -> Result<ScalarValue> {
439-
if self.digest.count() == 0 {
439+
if self.digest.count() == 0.0 {
440440
return ScalarValue::try_from(self.return_type.clone());
441441
}
442442
let q = self.digest.estimate_quantile(self.percentile);
@@ -513,8 +513,8 @@ mod tests {
513513
ApproxPercentileAccumulator::new_with_max_size(0.5, DataType::Float64, 100);
514514

515515
accumulator.merge_digests(&[t1]);
516-
assert_eq!(accumulator.digest.count(), 50_000);
516+
assert_eq!(accumulator.digest.count(), 50_000.0);
517517
accumulator.merge_digests(&[t2]);
518-
assert_eq!(accumulator.digest.count(), 100_000);
518+
assert_eq!(accumulator.digest.count(), 100_000.0);
519519
}
520520
}

datafusion/sqllogictest/test_files/aggregate.slt

Lines changed: 11 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -2029,11 +2029,12 @@ statement ok
20292029
INSERT INTO t1 VALUES (TRUE);
20302030

20312031
# ISSUE: https://github.com/apache/datafusion/issues/12716
2032-
# This test verifies that approx_percentile_cont_with_weight does not panic when given 'NaN' and returns 'inf'
2032+
# This test verifies that approx_percentile_cont_with_weight does not panic when given 'NaN'
2033+
# With weight=0, the data point does not contribute, so result is NULL
20332034
query R
20342035
SELECT approx_percentile_cont_with_weight(0, 0) WITHIN GROUP (ORDER BY 'NaN'::DOUBLE) FROM t1 WHERE t1.v1;
20352036
----
2036-
Infinity
2037+
NULL
20372038

20382039
statement ok
20392040
DROP TABLE t1;
@@ -2352,21 +2353,21 @@ e 115
23522353
query TI
23532354
SELECT c1, approx_percentile_cont_with_weight(c2, 0.95) WITHIN GROUP (ORDER BY c3) AS c3_p95 FROM aggregate_test_100 GROUP BY 1 ORDER BY 1
23542355
----
2355-
a 74
2356+
a 65
23562357
b 68
2357-
c 123
2358-
d 124
2359-
e 115
2358+
c 122
2359+
d 123
2360+
e 110
23602361

23612362
# approx_percentile_cont_with_weight with centroids
23622363
query TI
23632364
SELECT c1, approx_percentile_cont_with_weight(c2, 0.95, 200) WITHIN GROUP (ORDER BY c3) AS c3_p95 FROM aggregate_test_100 GROUP BY 1 ORDER BY 1
23642365
----
2365-
a 74
2366+
a 65
23662367
b 68
2367-
c 123
2368-
d 124
2369-
e 115
2368+
c 122
2369+
d 123
2370+
e 110
23702371

23712372
# csv_query_sum_crossjoin
23722373
query TTI

0 commit comments

Comments
 (0)