Merge pull request #18813 from miguelbernadi/improve-hostogram-allocations

tsdb/record: eliminate prev pointer escapes in V2 histogram WAL decoder
2026-06-08 16:12:16 -04:00 · 2026-06-01 10:54:02 +01:00 · 2026-06-01 10:54:02 +01:00 · 178c53d83c
commit 178c53d83c
parent 489d90e717 423c7878de
3 changed files with 170 additions and 31 deletions
--- a/tsdb/head_test.go
+++ b/tsdb/head_test.go
@ -205,6 +205,16 @@ func BenchmarkLoadWLs(b *testing.B) {
 		// The first oooSamplesPct*samplesPerSeries samples in an OOO series are written as OOO samples.
 		oooSamplesPct float64
 		oooCapMax     int64
+		// histogramSeriesPct is the fraction of series that emit native
+		// histogram samples instead of float samples. 0 means all float
+		// (the default for existing cases), 1 means all histograms.
+		// Histogram series use the last histogramSeriesPct*seriesPerBatch
+		// refs in each batch so existing float-only shapes are unaffected.
+		histogramSeriesPct float64
+		// bucketsPerHistogram is the number of positive buckets written
+		// per native histogram sample. Each bucket adds one span entry
+		// and one bucket delta to the encoded histogram.
+		bucketsPerHistogram int
 	}{
 		{ // Less series and more samples. 2 hour WAL with 1 second scrape interval.
 			batches:          10,
@ -252,6 +262,27 @@ func BenchmarkLoadWLs(b *testing.B) {
 			oooSamplesPct:    0.3,
 			oooCapMax:        DefaultOutOfOrderCapMax,
 		},
+		{ // All-histogram WAL, matching the "In between" float shape.
+			// Exercises the native-histogram decode hot path (DecodeHistogram
+			// + histogramSamplesV1/V2) which is shared by WAL replay,
+			// WAL watcher (remote write), and checkpoint creation.
+			// bucketsPerHistogram=8 is representative of a moderately
+			// complex exponential histogram seen in practice.
+			batches:             10,
+			seriesPerBatch:      1000,
+			samplesPerSeries:    480,
+			histogramSeriesPct:  1.0,
+			bucketsPerHistogram: 8,
+		},
+		{ // Mixed WAL: 50% float series, 50% native histogram series.
+			// Models a deployment that is partway through migrating metrics
+			// to native histograms.
+			batches:             10,
+			seriesPerBatch:      1000,
+			samplesPerSeries:    480,
+			histogramSeriesPct:  0.5,
+			bucketsPerHistogram: 8,
+		},
 	}

 	labelsPerSeries := 5
@ -270,7 +301,11 @@ func BenchmarkLoadWLs(b *testing.B) {
 						continue
 					}
 					lastExemplarsPerSeries = exemplarsPerSeries
-					b.Run(fmt.Sprintf("batches=%d,seriesPerBatch=%d,samplesPerSeries=%d,exemplarsPerSeries=%d,mmappedChunkT=%d,oooSeriesPct=%.3f,oooSamplesPct=%.3f,oooCapMax=%d,missingSeriesPct=%.3f,stStorage=%v", c.batches, c.seriesPerBatch, c.samplesPerSeries, exemplarsPerSeries, c.mmappedChunkT, c.oooSeriesPct, c.oooSamplesPct, c.oooCapMax, missingSeriesPct, enableSTStorage),
+					name := fmt.Sprintf("batches=%d,seriesPerBatch=%d,samplesPerSeries=%d,exemplarsPerSeries=%d,mmappedChunkT=%d,oooSeriesPct=%.3f,oooSamplesPct=%.3f,oooCapMax=%d,missingSeriesPct=%.3f,stStorage=%v", c.batches, c.seriesPerBatch, c.samplesPerSeries, exemplarsPerSeries, c.mmappedChunkT, c.oooSeriesPct, c.oooSamplesPct, c.oooCapMax, missingSeriesPct, enableSTStorage)
+					if c.histogramSeriesPct > 0 {
+						name += fmt.Sprintf(",histogramSeriesPct=%.3f,bucketsPerHistogram=%d", c.histogramSeriesPct, c.bucketsPerHistogram)
+					}
+					b.Run(name,
 						func(b *testing.B) {
 							dir := b.TempDir()

@ -312,30 +347,77 @@ func BenchmarkLoadWLs(b *testing.B) {
 								buf = populateTestWL(b, wal, []any{writeSeries}, buf, enableSTStorage)
 							}

-							// Write samples.
-							refSamples := make([]record.RefSample, 0, c.seriesPerBatch)
+							// Write samples. Series are split into float and
+							// histogram series: the last histogramSeriesPerBatch
+							// refs in each batch emit RefHistogramSample records;
+							// the rest emit RefSample records. This mirrors how
+							// real Prometheus deployments work — a given series is
+							// committed to one type.
+							histogramSeriesPerBatch := int(float64(c.seriesPerBatch) * c.histogramSeriesPct)
+							floatSeriesPerBatch := c.seriesPerBatch - histogramSeriesPerBatch
+
+							refSamples := make([]record.RefSample, 0, floatSeriesPerBatch)
+							refHistSamples := make([]record.RefHistogramSample, 0, histogramSeriesPerBatch)

 							oooSeriesPerBatch := int(float64(c.seriesPerBatch) * c.oooSeriesPct)
 							oooSamplesPerSeries := int(float64(c.samplesPerSeries) * c.oooSamplesPct)

+							// Build a reusable histogram template with the configured
+							// bucket count. All histogram series share the same shape;
+							// only the value (Sum/Count) changes per sample.
+							var histTemplate *histogram.Histogram
+							if histogramSeriesPerBatch > 0 {
+								spans := make([]histogram.Span, c.bucketsPerHistogram)
+								for idx := range spans {
+									spans[idx] = histogram.Span{Offset: int32(idx), Length: 1}
+								}
+								buckets := make([]int64, c.bucketsPerHistogram)
+								for idx := range buckets {
+									buckets[idx] = int64(idx + 1)
+								}
+								histTemplate = &histogram.Histogram{
+									Schema:          1,
+									PositiveSpans:   spans,
+									PositiveBuckets: buckets,
+								}
+							}
+
 							for i := 0; i < c.samplesPerSeries; i++ {
 								for j := 0; j < c.batches; j++ {
 									refSamples = refSamples[:0]
+									refHistSamples = refHistSamples[:0]

+									// Float series occupy refs [j*seriesPerBatch, j*seriesPerBatch+floatSeriesPerBatch).
 									k := j * c.seriesPerBatch
-									// Skip appending the first oooSamplesPerSeries samples for the series in the batch that
-									// should have OOO samples. OOO samples are appended after all the in-order samples.
 									if i < oooSamplesPerSeries {
 										k += oooSeriesPerBatch
 									}
-									for ; k < (j+1)*c.seriesPerBatch; k++ {
+									floatEnd := j*c.seriesPerBatch + floatSeriesPerBatch
+									for ; k < floatEnd; k++ {
 										refSamples = append(refSamples, record.RefSample{
 											Ref: chunks.HeadSeriesRef(k) * 101,
 											T:   int64(i) * 10,
 											V:   float64(i) * 100,
 										})
 									}
-									buf = populateTestWL(b, wal, []any{refSamples}, buf, enableSTStorage)
+									if len(refSamples) > 0 {
+										buf = populateTestWL(b, wal, []any{refSamples}, buf, enableSTStorage)
+									}
+
+									// Histogram series occupy refs [j*seriesPerBatch+floatSeriesPerBatch, (j+1)*seriesPerBatch).
+									for k = floatEnd; k < (j+1)*c.seriesPerBatch; k++ {
+										h := *histTemplate
+										h.Count = uint64(i + 1)
+										h.Sum = float64(i) * 100
+										refHistSamples = append(refHistSamples, record.RefHistogramSample{
+											Ref: chunks.HeadSeriesRef(k) * 101,
+											T:   int64(i) * 10,
+											H:   &h,
+										})
+									}
+									if len(refHistSamples) > 0 {
+										buf = populateTestWL(b, wal, []any{refHistSamples}, buf, enableSTStorage)
+									}
 								}
 							}

--- a/tsdb/record/record.go
+++ b/tsdb/record/record.go
@ -593,22 +593,21 @@ func (d *Decoder) histogramSamplesV2(dec *encoding.Decbuf, histograms []RefHisto
 	firstRef := chunks.HeadSeriesRef(dec.Varint64())
 	firstT := dec.Varint64()
 	firstST := dec.Varint64()
-	var prev *RefHistogramSample
+	var (
+		prevRef chunks.HeadSeriesRef
+		prevST  int64
+	)
+	hasPrev := false

 	for len(dec.B) > 0 && dec.Err() == nil {
 		var ref, t, st int64
-		if prev == nil {
-			prev = &RefHistogramSample{
-				Ref: firstRef,
-				ST:  firstST,
-			}
-			ref = int64(firstRef)
-			t = firstT
-			st = firstST
+		if !hasPrev {
+			ref, t, st = int64(firstRef), firstT, firstST
+			hasPrev = true
 		} else {
-			ref = int64(prev.Ref) + dec.Varint64()
+			ref = int64(prevRef) + dec.Varint64()
 			t = firstT + dec.Varint64()
-			st = readSTMarker(dec, prev.ST, firstST)
+			st = readSTMarker(dec, prevST, firstST)
 		}

 		rh := RefHistogramSample{
@ -617,7 +616,7 @@ func (d *Decoder) histogramSamplesV2(dec *encoding.Decbuf, histograms []RefHisto
 			T:   t,
 			H:   &histogram.Histogram{},
 		}
-		prev = &rh
+		prevRef, prevST = rh.Ref, rh.ST
 		DecodeHistogram(dec, rh.H)

 		if !histogram.IsKnownSchema(rh.H.Schema) {
@ -768,22 +767,19 @@ func (d *Decoder) floatHistogramSamplesV2(dec *encoding.Decbuf, histograms []Ref
 	firstRef := chunks.HeadSeriesRef(dec.Varint64())
 	firstT := dec.Varint64()
 	firstST := dec.Varint64()
-	var prev *RefFloatHistogramSample
+	var prevRef chunks.HeadSeriesRef
+	var prevST int64
+	hasPrev := false

 	for len(dec.B) > 0 && dec.Err() == nil {
 		var ref, t, st int64
-		if prev == nil {
-			prev = &RefFloatHistogramSample{
-				Ref: firstRef,
-				ST:  firstST,
-			}
-			ref = int64(firstRef)
-			t = firstT
-			st = firstST
+		if !hasPrev {
+			ref, t, st = int64(firstRef), firstT, firstST
+			hasPrev = true
 		} else {
-			ref = int64(prev.Ref) + dec.Varint64()
+			ref = int64(prevRef) + dec.Varint64()
 			t = firstT + dec.Varint64()
-			st = readSTMarker(dec, prev.ST, firstST)
+			st = readSTMarker(dec, prevST, firstST)
 		}

 		rfh := RefFloatHistogramSample{
@ -792,7 +788,7 @@ func (d *Decoder) floatHistogramSamplesV2(dec *encoding.Decbuf, histograms []Ref
 			T:   t,
 			FH:  &histogram.FloatHistogram{},
 		}
-		prev = &rfh
+		prevRef, prevST = rfh.Ref, rfh.ST
 		DecodeFloatHistogram(dec, rfh.FH)

 		if !histogram.IsKnownSchema(rfh.FH.Schema) {
--- a/tsdb/record/record_test.go
+++ b/tsdb/record/record_test.go
@ -1323,3 +1323,64 @@ func BenchmarkWAL_HistogramEncoding(b *testing.B) {
 		}
 	}
 }
+
+// BenchmarkDecodeHistogramSamples measures per-sample allocation cost for
+// histogram WAL decoding — both V1 and V2 paths. This exercises the hot
+// path identified in finding B/C of the allocation analysis: one
+// *histogram.Histogram allocation per sample (unavoidable with current
+// design) plus one escaped *RefHistogramSample per V2 iteration (fixable).
+func BenchmarkDecodeHistogramSamples(b *testing.B) {
+	const numSamples = 1000
+
+	makeHistogram := func(buckets int) *histogram.Histogram {
+		spans := make([]histogram.Span, buckets)
+		for i := range spans {
+			spans[i] = histogram.Span{Offset: int32(i), Length: 1}
+		}
+		bkts := make([]int64, buckets)
+		for i := range bkts {
+			bkts[i] = int64(i + 1)
+		}
+		return &histogram.Histogram{
+			Schema:          1,
+			Count:           uint64(buckets * 10),
+			Sum:             float64(buckets),
+			PositiveSpans:   spans,
+			PositiveBuckets: bkts,
+		}
+	}
+
+	for _, buckets := range []int{0, 4, 16} {
+		for _, version := range []string{"v1", "v2"} {
+			b.Run(fmt.Sprintf("buckets=%d/%s", buckets, version), func(b *testing.B) {
+				samples := make([]RefHistogramSample, numSamples)
+				for i := range samples {
+					samples[i] = RefHistogramSample{
+						Ref: chunks.HeadSeriesRef(i),
+						T:   int64(i) * 1000,
+						H:   makeHistogram(buckets),
+					}
+				}
+
+				var raw []byte
+				if version == "v1" {
+					enc := Encoder{}
+					raw, _ = enc.HistogramSamples(samples, raw)
+				} else {
+					enc := Encoder{EnableSTStorage: true}
+					raw, _ = enc.HistogramSamples(samples, raw)
+				}
+
+				dec := NewDecoder(labels.NewSymbolTable(), promslog.NewNopLogger())
+				buf := make([]RefHistogramSample, 0, numSamples)
+
+				b.ResetTimer()
+				b.ReportAllocs()
+				for b.Loop() {
+					buf, _ = dec.HistogramSamples(raw, buf[:0])
+				}
+				_ = buf
+			})
+		}
+	}
+}