From 6942f9e454fc3ecc27dbcedf8e6d2ae234918fb5 Mon Sep 17 00:00:00 2001
From: beorn7 <beorn@grafana.com>
Date: Wed, 5 Oct 2022 15:37:48 +0200
Subject: [PATCH] sparse buckets: Fix handling of +Inf/-Inf/NaN observations
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

NaN observations now go to no bucket, but increment count (and
effectively set sum to NaN, too).

±Inf observations now go to the bucket following the bucket that would
have received math.MaxFloat64. The former is now the last bucket that
can be created.

The getLe is modified to return math.MaxFloat64 for the penultimate
possible bucket.

Also add a test for getLe.

Signed-off-by: beorn7 <beorn@grafana.com>
---
 prometheus/histogram.go      | 85 ++++++++++++++++++++++++++------
 prometheus/histogram_test.go | 95 +++++++++++++++++++++++++++++++++++-
 2 files changed, 162 insertions(+), 18 deletions(-)

diff --git a/prometheus/histogram.go b/prometheus/histogram.go
index eb80fad..6665328 100644
--- a/prometheus/histogram.go
+++ b/prometheus/histogram.go
@@ -577,21 +577,27 @@ func (hc *histogramCounts) observe(v float64, bucket int, doSparse bool) {
 		atomic.AddUint64(&hc.buckets[bucket], 1)
 	}
 	atomicAddFloat(&hc.sumBits, v)
-	if doSparse {
+	if doSparse && !math.IsNaN(v) {
 		var (
-			sparseKey           int
-			sparseSchema        = atomic.LoadInt32(&hc.sparseSchema)
-			sparseZeroThreshold = math.Float64frombits(atomic.LoadUint64(&hc.sparseZeroThresholdBits))
-			frac, exp           = math.Frexp(math.Abs(v))
-			bucketCreated       bool
+			sparseKey            int
+			sparseSchema         = atomic.LoadInt32(&hc.sparseSchema)
+			sparseZeroThreshold  = math.Float64frombits(atomic.LoadUint64(&hc.sparseZeroThresholdBits))
+			bucketCreated, isInf bool
 		)
-		switch {
-		case math.IsInf(v, 0):
-			sparseKey = math.MaxInt32 // Largest possible sparseKey.
-		case sparseSchema > 0:
+		if math.IsInf(v, 0) {
+			// Pretend v is MaxFloat64 but later increment sparseKey by one.
+			if math.IsInf(v, +1) {
+				v = math.MaxFloat64
+			} else {
+				v = -math.MaxFloat64
+			}
+			isInf = true
+		}
+		frac, exp := math.Frexp(math.Abs(v))
+		if sparseSchema > 0 {
 			bounds := sparseBounds[sparseSchema]
 			sparseKey = sort.SearchFloat64s(bounds, frac) + (exp-1)*len(bounds)
-		default:
+		} else {
 			sparseKey = exp
 			if frac == 0.5 {
 				sparseKey--
@@ -599,6 +605,9 @@ func (hc *histogramCounts) observe(v float64, bucket int, doSparse bool) {
 			div := 1 << -sparseSchema
 			sparseKey = (sparseKey + div - 1) / div
 		}
+		if isInf {
+			sparseKey++
+		}
 		switch {
 		case v > sparseZeroThreshold:
 			bucketCreated = addToSparseBucket(&hc.sparseBucketsPositive, sparseKey, 1)
@@ -1062,7 +1071,8 @@ func (v *HistogramVec) GetMetricWith(labels Labels) (Observer, error) {
 // WithLabelValues works as GetMetricWithLabelValues, but panics where
 // GetMetricWithLabelValues would have returned an error. Not returning an
 // error allows shortcuts like
-//     myVec.WithLabelValues("404", "GET").Observe(42.21)
+//
+//	myVec.WithLabelValues("404", "GET").Observe(42.21)
 func (v *HistogramVec) WithLabelValues(lvs ...string) Observer {
 	h, err := v.GetMetricWithLabelValues(lvs...)
 	if err != nil {
@@ -1073,7 +1083,8 @@ func (v *HistogramVec) WithLabelValues(lvs ...string) Observer {
 
 // With works as GetMetricWith but panics where GetMetricWithLabels would have
 // returned an error. Not returning an error allows shortcuts like
-//     myVec.With(prometheus.Labels{"code": "404", "method": "GET"}).Observe(42.21)
+//
+//	myVec.With(prometheus.Labels{"code": "404", "method": "GET"}).Observe(42.21)
 func (v *HistogramVec) With(labels Labels) Observer {
 	h, err := v.GetMetricWith(labels)
 	if err != nil {
@@ -1219,8 +1230,8 @@ func (s buckSort) Less(i, j int) bool {
 // 2^(2^-n) is less or equal the provided bucketFactor.
 //
 // Special cases:
-//     - bucketFactor <= 1: panics.
-//     - bucketFactor < 2^(2^-8) (but > 1): still returns 8.
+//   - bucketFactor <= 1: panics.
+//   - bucketFactor < 2^(2^-8) (but > 1): still returns 8.
 func pickSparseSchema(bucketFactor float64) int32 {
 	if bucketFactor <= 1 {
 		panic(fmt.Errorf("bucketFactor %f is <=1", bucketFactor))
@@ -1346,13 +1357,55 @@ func findSmallestKey(m *sync.Map) int {
 }
 
 func getLe(key int, schema int32) float64 {
+	// Here a bit of context about the behavior for the last bucket counting
+	// regular numbers (called simply "last bucket" below) and the bucket
+	// counting observations of ±Inf (called "inf bucket" below, with a key
+	// one higher than that of the "last bucket"):
+	//
+	// If we apply the usual formula to the last bucket, its upper bound
+	// would be calculated as +Inf. The reason is that the max possible
+	// regular float64 number (math.MaxFloat64) doesn't coincide with one of
+	// the calculated bucket boundaries. So the calculated boundary has to
+	// be larger than math.MaxFloat64, and the only float64 larger than
+	// math.MaxFloat64 is +Inf. However, we want to count actual
+	// observations of ±Inf in the inf bucket. Therefore, we have to treat
+	// the upper bound of the last bucket specially and set it to
+	// math.MaxFloat64. (The upper bound of the inf bucket, with its key
+	// being one higher than that of the last bucket, naturally comes out as
+	// +Inf by the usual formula. So that's fine.)
+	//
+	// math.MaxFloat64 has a frac of 0.9999999999999999 and an exp of
+	// 1024. If there were a float64 number following math.MaxFloat64, it
+	// would have a frac of 1.0 and an exp of 1024, or equivalently a frac
+	// of 0.5 and an exp of 1025. However, since frac must be smaller than
+	// 1, and exp must be smaller than 1025, either representation overflows
+	// a float64. (Which, in turn, is the reason that math.MaxFloat64 is the
+	// largest possible float64. Q.E.D.) However, the formula for
+	// calculating the upper bound from the idx and schema of the last
+	// bucket results in precisely that. It is either frac=1.0 & exp=1024
+	// (for schema < 0) or frac=0.5 & exp=1025 (for schema >=0). (This is,
+	// by the way, a power of two where the exponent itself is a power of
+	// two, 2¹⁰ in fact, which coinicides with a bucket boundary in all
+	// schemas.) So these are the special cases we have to catch below.
 	if schema < 0 {
-		return math.Ldexp(1, key<<(-schema))
+		exp := key << -schema
+		if exp == 1024 {
+			// This is the last bucket before the overflow bucket
+			// (for ±Inf observations). Return math.MaxFloat64 as
+			// explained above.
+			return math.MaxFloat64
+		}
+		return math.Ldexp(1, exp)
 	}
 
 	fracIdx := key & ((1 << schema) - 1)
 	frac := sparseBounds[schema][fracIdx]
 	exp := (key >> schema) + 1
+	if frac == 0.5 && exp == 1025 {
+		// This is the last bucket before the overflow bucket (for ±Inf
+		// observations). Return math.MaxFloat64 as explained above.
+		return math.MaxFloat64
+	}
 	return math.Ldexp(frac, exp)
 }
 
diff --git a/prometheus/histogram_test.go b/prometheus/histogram_test.go
index 5b26fb4..fa80249 100644
--- a/prometheus/histogram_test.go
+++ b/prometheus/histogram_test.go
@@ -548,13 +548,13 @@ func TestSparseHistogram(t *testing.T) {
 			name:         "+Inf observation",
 			observations: []float64{0, 1, 1.2, 1.4, 1.8, 2, math.Inf(+1)},
 			factor:       1.2,
-			want:         `sample_count:7 sample_sum:inf schema:2 zero_threshold:2.938735877055719e-39 zero_count:1 positive_span:<offset:0 length:5 > positive_span:<offset:2147483642 length:1 > positive_delta:1 positive_delta:-1 positive_delta:2 positive_delta:-2 positive_delta:2 positive_delta:-1 `,
+			want:         `sample_count:7 sample_sum:inf schema:2 zero_threshold:2.938735877055719e-39 zero_count:1 positive_span:<offset:0 length:5 > positive_span:<offset:4092 length:1 > positive_delta:1 positive_delta:-1 positive_delta:2 positive_delta:-2 positive_delta:2 positive_delta:-1 `,
 		},
 		{
 			name:         "-Inf observation",
 			observations: []float64{0, 1, 1.2, 1.4, 1.8, 2, math.Inf(-1)},
 			factor:       1.2,
-			want:         `sample_count:7 sample_sum:-inf schema:2 zero_threshold:2.938735877055719e-39 zero_count:1 negative_span:<offset:2147483647 length:1 > negative_delta:1 positive_span:<offset:0 length:5 > positive_delta:1 positive_delta:-1 positive_delta:2 positive_delta:-2 positive_delta:2 `,
+			want:         `sample_count:7 sample_sum:-inf schema:2 zero_threshold:2.938735877055719e-39 zero_count:1 negative_span:<offset:4097 length:1 > negative_delta:1 positive_span:<offset:0 length:5 > positive_delta:1 positive_delta:-1 positive_delta:2 positive_delta:-2 positive_delta:2 `,
 		},
 		{
 			name:         "limited buckets but nothing triggered",
@@ -782,3 +782,94 @@ func TestSparseHistogramConcurrency(t *testing.T) {
 		t.Error(err)
 	}
 }
+
+func TestGetLe(t *testing.T) {
+	scenarios := []struct {
+		key    int
+		schema int32
+		want   float64
+	}{
+		{
+			key:    -1,
+			schema: -1,
+			want:   0.25,
+		},
+		{
+			key:    0,
+			schema: -1,
+			want:   1,
+		},
+		{
+			key:    1,
+			schema: -1,
+			want:   4,
+		},
+		{
+			key:    512,
+			schema: -1,
+			want:   math.MaxFloat64,
+		},
+		{
+			key:    513,
+			schema: -1,
+			want:   math.Inf(+1),
+		},
+		{
+			key:    -1,
+			schema: 0,
+			want:   0.5,
+		},
+		{
+			key:    0,
+			schema: 0,
+			want:   1,
+		},
+		{
+			key:    1,
+			schema: 0,
+			want:   2,
+		},
+		{
+			key:    1024,
+			schema: 0,
+			want:   math.MaxFloat64,
+		},
+		{
+			key:    1025,
+			schema: 0,
+			want:   math.Inf(+1),
+		},
+		{
+			key:    -1,
+			schema: 2,
+			want:   0.8408964152537144,
+		},
+		{
+			key:    0,
+			schema: 2,
+			want:   1,
+		},
+		{
+			key:    1,
+			schema: 2,
+			want:   1.189207115002721,
+		},
+		{
+			key:    4096,
+			schema: 2,
+			want:   math.MaxFloat64,
+		},
+		{
+			key:    4097,
+			schema: 2,
+			want:   math.Inf(+1),
+		},
+	}
+
+	for i, s := range scenarios {
+		got := getLe(s.key, s.schema)
+		if s.want != got {
+			t.Errorf("%d. key %d, schema %d, want upper bound of %g, got %g", i, s.key, s.schema, s.want, got)
+		}
+	}
+}