From ff67596a82ef683c0097d75fefd1b5938ad0320f Mon Sep 17 00:00:00 2001
From: beorn7 <beorn@grafana.com>
Date: Wed, 7 May 2025 15:59:48 +0200
Subject: [PATCH] promql: Simplify avg aggregation and avg_over_time

As it turns out, if we combine Kahan summation and incremental mean
calculation properly, it works quite well and we do not need to switch
between simple mean calculation and incremental calculation based on
overflow.

This simplifies the code quite a bit.

Signed-off-by: beorn7 <beorn@grafana.com>
---
 promql/engine.go    | 62 ++++++++++++++----------------------------
 promql/functions.go | 65 ++++++++++++++-------------------------------
 2 files changed, 40 insertions(+), 87 deletions(-)

diff --git a/promql/engine.go b/promql/engine.go
index fec2bbf761..888df311d0 100644
--- a/promql/engine.go
+++ b/promql/engine.go
@@ -2981,7 +2981,6 @@ type groupedAggregation struct {
 	hasHistogram           bool // Has at least 1 histogram sample aggregated.
 	incompatibleHistograms bool // If true, group has seen mixed exponential and custom buckets, or incompatible custom buckets.
 	groupAggrComplete      bool // Used by LIMITK to short-cut series loop when we've reached K elem on every group.
-	incrementalMean        bool // True after reverting to incremental calculation of the mean value.
 }
 
 // aggregation evaluates sum, avg, count, stdvar, stddev or quantile at one timestep on inputMatrix.
@@ -3080,6 +3079,21 @@ func (ev *evaluator) aggregation(e *parser.AggregateExpr, q float64, inputMatrix
 			}
 
 		case parser.AVG:
+			// For the average calculation, we use incremental mean
+			// calculation. In particular in combination with Kahan
+			// summation (which we do for floats, but not yet for
+			// histograms, see issue #14105), this is quite accurate
+			// and only breaks in extreme cases (see testdata for
+			// avg_over_time). One might assume that simple direct
+			// mean calculation works better in some cases, but so
+			// far, our conclusion is that we fare best with the
+			// incremental approach plus Kahan summation (for
+			// floats). For a relevant discussion, see
+			// https://stackoverflow.com/questions/61665473/is-it-beneficial-for-precision-to-calculate-the-incremental-mean-average
+			// Additional note: For even better numerical accuracy,
+			// we would need to process the values in a particular
+			// order, but that would be very hard to implement given
+			// how the PromQL engine works.
 			group.groupCount++
 			if h != nil {
 				group.hasHistogram = true
@@ -3104,45 +3118,11 @@ func (ev *evaluator) aggregation(e *parser.AggregateExpr, q float64, inputMatrix
 				// point in copying the histogram in that case.
 			} else {
 				group.hasFloat = true
-				if !group.incrementalMean {
-					newV, newC := kahanSumInc(f, group.floatValue, group.floatKahanC)
-					if !math.IsInf(newV, 0) {
-						// The sum doesn't overflow, so we propagate it to the
-						// group struct and continue with the regular
-						// calculation of the mean value.
-						group.floatValue, group.floatKahanC = newV, newC
-						break
-					}
-					// If we are here, we know that the sum _would_ overflow. So
-					// instead of continue to sum up, we revert to incremental
-					// calculation of the mean value from here on.
-					group.incrementalMean = true
-					group.floatMean = group.floatValue / (group.groupCount - 1)
-					group.floatKahanC /= group.groupCount - 1
-				}
-				if math.IsInf(group.floatMean, 0) {
-					if math.IsInf(f, 0) && (group.floatMean > 0) == (f > 0) {
-						// The `floatMean` and `s.F` values are `Inf` of the same sign.  They
-						// can't be subtracted, but the value of `floatMean` is correct
-						// already.
-						break
-					}
-					if !math.IsInf(f, 0) && !math.IsNaN(f) {
-						// At this stage, the mean is an infinite. If the added
-						// value is neither an Inf or a Nan, we can keep that mean
-						// value.
-						// This is required because our calculation below removes
-						// the mean value, which would look like Inf += x - Inf and
-						// end up as a NaN.
-						break
-					}
-				}
-				currentMean := group.floatMean + group.floatKahanC
+				q := (group.groupCount - 1) / group.groupCount
 				group.floatMean, group.floatKahanC = kahanSumInc(
-					// Divide each side of the `-` by `group.groupCount` to avoid float64 overflows.
-					f/group.groupCount-currentMean/group.groupCount,
-					group.floatMean,
-					group.floatKahanC,
+					f/group.groupCount,
+					q*group.floatMean,
+					q*group.floatKahanC,
 				)
 			}
 
@@ -3215,10 +3195,8 @@ func (ev *evaluator) aggregation(e *parser.AggregateExpr, q float64, inputMatrix
 				continue
 			case aggr.hasHistogram:
 				aggr.histogramValue = aggr.histogramValue.Compact(0)
-			case aggr.incrementalMean:
-				aggr.floatValue = aggr.floatMean + aggr.floatKahanC
 			default:
-				aggr.floatValue = (aggr.floatValue + aggr.floatKahanC) / aggr.groupCount
+				aggr.floatValue = aggr.floatMean + aggr.floatKahanC
 			}
 
 		case parser.COUNT:
diff --git a/promql/functions.go b/promql/functions.go
index e4699f3914..7e5205844a 100644
--- a/promql/functions.go
+++ b/promql/functions.go
@@ -671,6 +671,20 @@ func funcAvgOverTime(vals []parser.Value, args parser.Expressions, enh *EvalNode
 		metricName := firstSeries.Metric.Get(labels.MetricName)
 		return enh.Out, annotations.New().Add(annotations.NewMixedFloatsHistogramsWarning(metricName, args[0].PositionRange()))
 	}
+	// For the average calculation, we use incremental mean calculation. In
+	// particular in combination with Kahan summation (which we do for
+	// floats, but not yet for histograms, see issue #14105), this is quite
+	// accurate and only breaks in extreme cases (see testdata). One might
+	// assume that simple direct mean calculation works better in some
+	// cases, but so far, our conclusion is that we fare best with the
+	// incremental approach plus Kahan summation (for floats). For a
+	// relevant discussion, see
+	// https://stackoverflow.com/questions/61665473/is-it-beneficial-for-precision-to-calculate-the-incremental-mean-average
+	// Additional note: For even better numerical accuracy, we would need to
+	// process the values in a particular order. For avg_over_time, that
+	// would be more or less feasible, but it would be more expensivo, and
+	// it would also be much harder for the avg aggregator, given how the
+	// PromQL engine works.
 	if len(firstSeries.Floats) == 0 {
 		// The passed values only contain histograms.
 		vec, err := aggrHistOverTime(vals, enh, func(s Series) (*histogram.FloatHistogram, error) {
@@ -702,52 +716,13 @@ func funcAvgOverTime(vals []parser.Value, args parser.Expressions, enh *EvalNode
 		return vec, nil
 	}
 	return aggrOverTime(vals, enh, func(s Series) float64 {
-		var (
-			sum, mean, count, kahanC float64
-			incrementalMean          bool
-		)
-		for _, f := range s.Floats {
-			count++
-			if !incrementalMean {
-				newSum, newC := kahanSumInc(f.F, sum, kahanC)
-				// Perform regular mean calculation as long as
-				// the sum doesn't overflow and (in any case)
-				// for the first iteration (even if we start
-				// with ±Inf) to not run into division-by-zero
-				// problems below.
-				if count == 1 || !math.IsInf(newSum, 0) {
-					sum, kahanC = newSum, newC
-					continue
-				}
-				// Handle overflow by reverting to incremental calculation of the mean value.
-				incrementalMean = true
-				mean = sum / (count - 1)
-				kahanC /= count - 1
-			}
-			if math.IsInf(mean, 0) {
-				if math.IsInf(f.F, 0) && (mean > 0) == (f.F > 0) {
-					// The `mean` and `f.F` values are `Inf` of the same sign.  They
-					// can't be subtracted, but the value of `mean` is correct
-					// already.
-					continue
-				}
-				if !math.IsInf(f.F, 0) && !math.IsNaN(f.F) {
-					// At this stage, the mean is an infinite. If the added
-					// value is neither an Inf or a Nan, we can keep that mean
-					// value.
-					// This is required because our calculation below removes
-					// the mean value, which would look like Inf += x - Inf and
-					// end up as a NaN.
-					continue
-				}
-			}
-			correctedMean := mean + kahanC
-			mean, kahanC = kahanSumInc(f.F/count-correctedMean/count, mean, kahanC)
+		var mean, kahanC float64
+		for i, f := range s.Floats {
+			count := float64(i + 1)
+			q := float64(i) / count
+			mean, kahanC = kahanSumInc(f.F/count, q*mean, q*kahanC)
 		}
-		if incrementalMean {
-			return mean + kahanC
-		}
-		return (sum + kahanC) / count
+		return mean + kahanC
 	}), nil
 }