Merge pull request #1150 from prometheus/sparsehistogram

Merge sparsehistogram branch into main
2022-10-31 16:55:36 +01:00 · 2022-10-31 16:55:36 +01:00 · 5f202eefdb
parent 0859bb8f37 fffb76cafe
commit 5f202eefdb
5 changed files with 1322 additions and 93 deletions
--- a/examples/random/main.go
+++ b/examples/random/main.go
@ -48,14 +48,22 @@ func NewMetrics(reg prometheus.Registerer, normMean, normDomain float64) *metric
 			},
 			[]string{"service"},
 		),
-		// The same as above, but now as a histogram, and only for the normal
-		// distribution. The buckets are targeted to the parameters of the
-		// normal distribution, with 20 buckets centered on the mean, each
-		// half-sigma wide.
+		// The same as above, but now as a histogram, and only for the
+		// normal distribution. The histogram features both conventional
+		// buckets as well as sparse buckets, the latter needed for the
+		// experimental native histograms (ingested by a Prometheus
+		// server v2.40 with the corresponding feature flag
+		// enabled). The conventional buckets are targeted to the
+		// parameters of the normal distribution, with 20 buckets
+		// centered on the mean, each half-sigma wide. The sparse
+		// buckets are always centered on zero, with a growth factor of
+		// one bucket to the next of (at most) 1.1. (The precise factor
+		// is 2^2^-3 = 1.0905077...)
 		rpcDurationsHistogram: prometheus.NewHistogram(prometheus.HistogramOpts{
 			Name:                        "rpc_durations_histogram_seconds",
 			Help:                        "RPC latency distributions.",
 			Buckets:                     prometheus.LinearBuckets(normMean-5*normDomain, .5*normDomain, 20),
+			NativeHistogramBucketFactor: 1.1,
 		}),
 	}
 	reg.MustRegister(m.rpcDurations)
--- a/go.mod
+++ b/go.mod
@ -8,7 +8,7 @@ require (
 	github.com/davecgh/go-spew v1.1.1
 	github.com/golang/protobuf v1.5.2
 	github.com/json-iterator/go v1.1.12
-	github.com/prometheus/client_model v0.2.0
+	github.com/prometheus/client_model v0.3.0
 	github.com/prometheus/common v0.37.0
 	github.com/prometheus/procfs v0.8.0
 	golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a
--- a/go.sum
+++ b/go.sum
@ -134,8 +134,9 @@ github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f/go.mod h1:qRW
 github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
 github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
 github.com/prometheus/client_model v0.0.0-20190812154241-14fe0d1b01d4/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA=
-github.com/prometheus/client_model v0.2.0 h1:uq5h0d+GuxiXLJLNABMgp2qUWDPiLvgCzz2dUR+/W/M=
 github.com/prometheus/client_model v0.2.0/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA=
+github.com/prometheus/client_model v0.3.0 h1:UBgGFHqYdG/TPFD1B1ogZywDqEkwp3fBMvqdiQ7Xew4=
+github.com/prometheus/client_model v0.3.0/go.mod h1:LDGWKZIo7rky3hgvBe+caln+Dr3dPggB5dvjtD7w9+w=
 github.com/prometheus/common v0.37.0 h1:ccBbHCgIiT9uSoFY0vX8H3zsNR5eLt17/RQLUvn8pXE=
 github.com/prometheus/common v0.37.0/go.mod h1:phzohg0JFMnBEFGxTDbfu3QyL5GI8gTQJFhYO5B3mfA=
 github.com/prometheus/procfs v0.8.0 h1:ODq8ZFEaYeCaZOJlZZdJA2AbQR98dSHSM1KW/You5mo=
--- a/prometheus/histogram.go
+++ b/prometheus/histogram.go
--- a/prometheus/histogram_test.go
+++ b/prometheus/histogram_test.go
@ -20,6 +20,7 @@ import (
 	"runtime"
 	"sort"
 	"sync"
+	"sync/atomic"
 	"testing"
 	"testing/quick"
 	"time"
@ -167,7 +168,7 @@ func TestHistogramConcurrency(t *testing.T) {
 		start.Add(1)
 		end.Add(concLevel)

-		sum := NewHistogram(HistogramOpts{
+		his := NewHistogram(HistogramOpts{
 			Name:    "test_histogram",
 			Help:    "helpless",
 			Buckets: testBuckets,
@ -188,9 +189,9 @@ func TestHistogramConcurrency(t *testing.T) {
 				start.Wait()
 				for _, v := range vals {
 					if n%2 == 0 {
-						sum.Observe(v)
+						his.Observe(v)
 					} else {
-						sum.(ExemplarObserver).ObserveWithExemplar(v, Labels{"foo": "bar"})
+						his.(ExemplarObserver).ObserveWithExemplar(v, Labels{"foo": "bar"})
 					}
 				}
 				end.Done()
@ -201,7 +202,7 @@ func TestHistogramConcurrency(t *testing.T) {
 		end.Wait()

 		m := &dto.Metric{}
-		sum.Write(m)
+		his.Write(m)
 		if got, want := int(*m.Histogram.SampleCount), total; got != want {
 			t.Errorf("got sample count %d, want %d", got, want)
 		}
@ -467,3 +468,408 @@ func TestHistogramExemplar(t *testing.T) {
 		}
 	}
 }
+
+func TestSparseHistogram(t *testing.T) {
+	scenarios := []struct {
+		name             string
+		observations     []float64 // With simulated interval of 1m.
+		factor           float64
+		zeroThreshold    float64
+		maxBuckets       uint32
+		minResetDuration time.Duration
+		maxZeroThreshold float64
+		want             string // String representation of protobuf.
+	}{
+		{
+			name:         "no sparse buckets",
+			observations: []float64{1, 2, 3},
+			factor:       1,
+			want:         `sample_count:3 sample_sum:6 bucket:<cumulative_count:0 upper_bound:0.005 > bucket:<cumulative_count:0 upper_bound:0.01 > bucket:<cumulative_count:0 upper_bound:0.025 > bucket:<cumulative_count:0 upper_bound:0.05 > bucket:<cumulative_count:0 upper_bound:0.1 > bucket:<cumulative_count:0 upper_bound:0.25 > bucket:<cumulative_count:0 upper_bound:0.5 > bucket:<cumulative_count:1 upper_bound:1 > bucket:<cumulative_count:2 upper_bound:2.5 > bucket:<cumulative_count:3 upper_bound:5 > bucket:<cumulative_count:3 upper_bound:10 > `, // Has conventional buckets because there are no sparse buckets.
+		},
+		{
+			name:         "factor 1.1 results in schema 3",
+			observations: []float64{0, 1, 2, 3},
+			factor:       1.1,
+			want:         `sample_count:4 sample_sum:6 schema:3 zero_threshold:2.938735877055719e-39 zero_count:1 positive_span:<offset:0 length:1 > positive_span:<offset:7 length:1 > positive_span:<offset:4 length:1 > positive_delta:1 positive_delta:0 positive_delta:0 `,
+		},
+		{
+			name:         "factor 1.2 results in schema 2",
+			observations: []float64{0, 1, 1.2, 1.4, 1.8, 2},
+			factor:       1.2,
+			want:         `sample_count:6 sample_sum:7.4 schema:2 zero_threshold:2.938735877055719e-39 zero_count:1 positive_span:<offset:0 length:5 > positive_delta:1 positive_delta:-1 positive_delta:2 positive_delta:-2 positive_delta:2 `,
+		},
+		{
+			name: "factor 4 results in schema -1",
+			observations: []float64{
+				0.5, 1, // Bucket 0: (0.25, 1]
+				1.5, 2, 3, 3.5, // Bucket 1: (1, 4]
+				5, 6, 7, // Bucket 2: (4, 16]
+				33.33, // Bucket 3: (16, 64]
+			},
+			factor: 4,
+			want:   `sample_count:10 sample_sum:62.83 schema:-1 zero_threshold:2.938735877055719e-39 zero_count:0 positive_span:<offset:0 length:4 > positive_delta:2 positive_delta:2 positive_delta:-1 positive_delta:-2 `,
+		},
+		{
+			name: "factor 17 results in schema -2",
+			observations: []float64{
+				0.5, 1, // Bucket 0: (0.0625, 1]
+				1.5, 2, 3, 3.5, 5, 6, 7, // Bucket 1: (1, 16]
+				33.33, // Bucket 2: (16, 256]
+			},
+			factor: 17,
+			want:   `sample_count:10 sample_sum:62.83 schema:-2 zero_threshold:2.938735877055719e-39 zero_count:0 positive_span:<offset:0 length:3 > positive_delta:2 positive_delta:5 positive_delta:-6 `,
+		},
+		{
+			name:         "negative buckets",
+			observations: []float64{0, -1, -1.2, -1.4, -1.8, -2},
+			factor:       1.2,
+			want:         `sample_count:6 sample_sum:-7.4 schema:2 zero_threshold:2.938735877055719e-39 zero_count:1 negative_span:<offset:0 length:5 > negative_delta:1 negative_delta:-1 negative_delta:2 negative_delta:-2 negative_delta:2 `,
+		},
+		{
+			name:         "negative and positive buckets",
+			observations: []float64{0, -1, -1.2, -1.4, -1.8, -2, 1, 1.2, 1.4, 1.8, 2},
+			factor:       1.2,
+			want:         `sample_count:11 sample_sum:0 schema:2 zero_threshold:2.938735877055719e-39 zero_count:1 negative_span:<offset:0 length:5 > negative_delta:1 negative_delta:-1 negative_delta:2 negative_delta:-2 negative_delta:2 positive_span:<offset:0 length:5 > positive_delta:1 positive_delta:-1 positive_delta:2 positive_delta:-2 positive_delta:2 `,
+		},
+		{
+			name:          "wide zero bucket",
+			observations:  []float64{0, -1, -1.2, -1.4, -1.8, -2, 1, 1.2, 1.4, 1.8, 2},
+			factor:        1.2,
+			zeroThreshold: 1.4,
+			want:          `sample_count:11 sample_sum:0 schema:2 zero_threshold:1.4 zero_count:7 negative_span:<offset:4 length:1 > negative_delta:2 positive_span:<offset:4 length:1 > positive_delta:2 `,
+		},
+		{
+			name:         "NaN observation",
+			observations: []float64{0, 1, 1.2, 1.4, 1.8, 2, math.NaN()},
+			factor:       1.2,
+			want:         `sample_count:7 sample_sum:nan schema:2 zero_threshold:2.938735877055719e-39 zero_count:1 positive_span:<offset:0 length:5 > positive_delta:1 positive_delta:-1 positive_delta:2 positive_delta:-2 positive_delta:2 `,
+		},
+		{
+			name:         "+Inf observation",
+			observations: []float64{0, 1, 1.2, 1.4, 1.8, 2, math.Inf(+1)},
+			factor:       1.2,
+			want:         `sample_count:7 sample_sum:inf schema:2 zero_threshold:2.938735877055719e-39 zero_count:1 positive_span:<offset:0 length:5 > positive_span:<offset:4092 length:1 > positive_delta:1 positive_delta:-1 positive_delta:2 positive_delta:-2 positive_delta:2 positive_delta:-1 `,
+		},
+		{
+			name:         "-Inf observation",
+			observations: []float64{0, 1, 1.2, 1.4, 1.8, 2, math.Inf(-1)},
+			factor:       1.2,
+			want:         `sample_count:7 sample_sum:-inf schema:2 zero_threshold:2.938735877055719e-39 zero_count:1 negative_span:<offset:4097 length:1 > negative_delta:1 positive_span:<offset:0 length:5 > positive_delta:1 positive_delta:-1 positive_delta:2 positive_delta:-2 positive_delta:2 `,
+		},
+		{
+			name:         "limited buckets but nothing triggered",
+			observations: []float64{0, 1, 1.2, 1.4, 1.8, 2},
+			factor:       1.2,
+			maxBuckets:   4,
+			want:         `sample_count:6 sample_sum:7.4 schema:2 zero_threshold:2.938735877055719e-39 zero_count:1 positive_span:<offset:0 length:5 > positive_delta:1 positive_delta:-1 positive_delta:2 positive_delta:-2 positive_delta:2 `,
+		},
+		{
+			name:         "buckets limited by halving resolution",
+			observations: []float64{0, 1, 1.1, 1.2, 1.4, 1.8, 2, 3},
+			factor:       1.2,
+			maxBuckets:   4,
+			want:         `sample_count:8 sample_sum:11.5 schema:1 zero_threshold:2.938735877055719e-39 zero_count:1 positive_span:<offset:0 length:5 > positive_delta:1 positive_delta:2 positive_delta:-1 positive_delta:-2 positive_delta:1 `,
+		},
+		{
+			name:             "buckets limited by widening the zero bucket",
+			observations:     []float64{0, 1, 1.1, 1.2, 1.4, 1.8, 2, 3},
+			factor:           1.2,
+			maxBuckets:       4,
+			maxZeroThreshold: 1.2,
+			want:             `sample_count:8 sample_sum:11.5 schema:2 zero_threshold:1 zero_count:2 positive_span:<offset:1 length:7 > positive_delta:1 positive_delta:1 positive_delta:-2 positive_delta:2 positive_delta:-2 positive_delta:0 positive_delta:1 `,
+		},
+		{
+			name:             "buckets limited by widening the zero bucket twice",
+			observations:     []float64{0, 1, 1.1, 1.2, 1.4, 1.8, 2, 3, 4},
+			factor:           1.2,
+			maxBuckets:       4,
+			maxZeroThreshold: 1.2,
+			want:             `sample_count:9 sample_sum:15.5 schema:2 zero_threshold:1.189207115002721 zero_count:3 positive_span:<offset:2 length:7 > positive_delta:2 positive_delta:-2 positive_delta:2 positive_delta:-2 positive_delta:0 positive_delta:1 positive_delta:0 `,
+		},
+		{
+			name:             "buckets limited by reset",
+			observations:     []float64{0, 1, 1.1, 1.2, 1.4, 1.8, 2, 3, 4},
+			factor:           1.2,
+			maxBuckets:       4,
+			maxZeroThreshold: 1.2,
+			minResetDuration: 5 * time.Minute,
+			want:             `sample_count:2 sample_sum:7 schema:2 zero_threshold:2.938735877055719e-39 zero_count:0 positive_span:<offset:7 length:2 > positive_delta:1 positive_delta:0 `,
+		},
+		{
+			name:         "limited buckets but nothing triggered, negative observations",
+			observations: []float64{0, -1, -1.2, -1.4, -1.8, -2},
+			factor:       1.2,
+			maxBuckets:   4,
+			want:         `sample_count:6 sample_sum:-7.4 schema:2 zero_threshold:2.938735877055719e-39 zero_count:1 negative_span:<offset:0 length:5 > negative_delta:1 negative_delta:-1 negative_delta:2 negative_delta:-2 negative_delta:2 `,
+		},
+		{
+			name:         "buckets limited by halving resolution, negative observations",
+			observations: []float64{0, -1, -1.1, -1.2, -1.4, -1.8, -2, -3},
+			factor:       1.2,
+			maxBuckets:   4,
+			want:         `sample_count:8 sample_sum:-11.5 schema:1 zero_threshold:2.938735877055719e-39 zero_count:1 negative_span:<offset:0 length:5 > negative_delta:1 negative_delta:2 negative_delta:-1 negative_delta:-2 negative_delta:1 `,
+		},
+		{
+			name:             "buckets limited by widening the zero bucket, negative observations",
+			observations:     []float64{0, -1, -1.1, -1.2, -1.4, -1.8, -2, -3},
+			factor:           1.2,
+			maxBuckets:       4,
+			maxZeroThreshold: 1.2,
+			want:             `sample_count:8 sample_sum:-11.5 schema:2 zero_threshold:1 zero_count:2 negative_span:<offset:1 length:7 > negative_delta:1 negative_delta:1 negative_delta:-2 negative_delta:2 negative_delta:-2 negative_delta:0 negative_delta:1 `,
+		},
+		{
+			name:             "buckets limited by widening the zero bucket twice, negative observations",
+			observations:     []float64{0, -1, -1.1, -1.2, -1.4, -1.8, -2, -3, -4},
+			factor:           1.2,
+			maxBuckets:       4,
+			maxZeroThreshold: 1.2,
+			want:             `sample_count:9 sample_sum:-15.5 schema:2 zero_threshold:1.189207115002721 zero_count:3 negative_span:<offset:2 length:7 > negative_delta:2 negative_delta:-2 negative_delta:2 negative_delta:-2 negative_delta:0 negative_delta:1 negative_delta:0 `,
+		},
+		{
+			name:             "buckets limited by reset, negative observations",
+			observations:     []float64{0, -1, -1.1, -1.2, -1.4, -1.8, -2, -3, -4},
+			factor:           1.2,
+			maxBuckets:       4,
+			maxZeroThreshold: 1.2,
+			minResetDuration: 5 * time.Minute,
+			want:             `sample_count:2 sample_sum:-7 schema:2 zero_threshold:2.938735877055719e-39 zero_count:0 negative_span:<offset:7 length:2 > negative_delta:1 negative_delta:0 `,
+		},
+		{
+			name:             "buckets limited by halving resolution, then reset",
+			observations:     []float64{0, 1, 1.1, 1.2, 1.4, 1.8, 2, 5, 5.1, 3, 4},
+			factor:           1.2,
+			maxBuckets:       4,
+			minResetDuration: 9 * time.Minute,
+			want:             `sample_count:2 sample_sum:7 schema:2 zero_threshold:2.938735877055719e-39 zero_count:0 positive_span:<offset:7 length:2 > positive_delta:1 positive_delta:0 `,
+		},
+		{
+			name:             "buckets limited by widening the zero bucket, then reset",
+			observations:     []float64{0, 1, 1.1, 1.2, 1.4, 1.8, 2, 5, 5.1, 3, 4},
+			factor:           1.2,
+			maxBuckets:       4,
+			maxZeroThreshold: 1.2,
+			minResetDuration: 9 * time.Minute,
+			want:             `sample_count:2 sample_sum:7 schema:2 zero_threshold:2.938735877055719e-39 zero_count:0 positive_span:<offset:7 length:2 > positive_delta:1 positive_delta:0 `,
+		},
+	}
+
+	for _, s := range scenarios {
+		t.Run(s.name, func(t *testing.T) {
+			his := NewHistogram(HistogramOpts{
+				Name:                            "name",
+				Help:                            "help",
+				NativeHistogramBucketFactor:     s.factor,
+				NativeHistogramZeroThreshold:    s.zeroThreshold,
+				NativeHistogramMaxBucketNumber:  s.maxBuckets,
+				NativeHistogramMinResetDuration: s.minResetDuration,
+				NativeHistogramMaxZeroThreshold: s.maxZeroThreshold,
+			})
+			ts := time.Now().Add(30 * time.Second)
+			now := func() time.Time {
+				return ts
+			}
+			his.(*histogram).now = now
+			for _, o := range s.observations {
+				his.Observe(o)
+				ts = ts.Add(time.Minute)
+			}
+			m := &dto.Metric{}
+			if err := his.Write(m); err != nil {
+				t.Fatal("unexpected error writing metric", err)
+			}
+			got := m.Histogram.String()
+			if s.want != got {
+				t.Errorf("want histogram %q, got %q", s.want, got)
+			}
+		})
+	}
+}
+
+func TestSparseHistogramConcurrency(t *testing.T) {
+	if testing.Short() {
+		t.Skip("Skipping test in short mode.")
+	}
+
+	rand.Seed(42)
+
+	it := func(n uint32) bool {
+		mutations := int(n%1e4 + 1e4)
+		concLevel := int(n%5 + 1)
+		total := mutations * concLevel
+
+		var start, end sync.WaitGroup
+		start.Add(1)
+		end.Add(concLevel)
+
+		his := NewHistogram(HistogramOpts{
+			Name:                            "test_sparse_histogram",
+			Help:                            "This help is sparse.",
+			NativeHistogramBucketFactor:     1.05,
+			NativeHistogramZeroThreshold:    0.0000001,
+			NativeHistogramMaxBucketNumber:  50,
+			NativeHistogramMinResetDuration: time.Hour, // Comment out to test for totals below.
+			NativeHistogramMaxZeroThreshold: 0.001,
+		})
+
+		ts := time.Now().Add(30 * time.Second).Unix()
+		now := func() time.Time {
+			return time.Unix(atomic.LoadInt64(&ts), 0)
+		}
+		his.(*histogram).now = now
+
+		allVars := make([]float64, total)
+		var sampleSum float64
+		for i := 0; i < concLevel; i++ {
+			vals := make([]float64, mutations)
+			for j := 0; j < mutations; j++ {
+				v := rand.NormFloat64()
+				vals[j] = v
+				allVars[i*mutations+j] = v
+				sampleSum += v
+			}
+
+			go func(vals []float64) {
+				start.Wait()
+				for _, v := range vals {
+					// An observation every 1 to 10 seconds.
+					atomic.AddInt64(&ts, rand.Int63n(10)+1)
+					his.Observe(v)
+				}
+				end.Done()
+			}(vals)
+		}
+		sort.Float64s(allVars)
+		start.Done()
+		end.Wait()
+
+		m := &dto.Metric{}
+		his.Write(m)
+
+		// Uncomment these tests for totals only if you have disabled histogram resets above.
+		//
+		// if got, want := int(*m.Histogram.SampleCount), total; got != want {
+		// 	t.Errorf("got sample count %d, want %d", got, want)
+		// }
+		// if got, want := *m.Histogram.SampleSum, sampleSum; math.Abs((got-want)/want) > 0.001 {
+		// 	t.Errorf("got sample sum %f, want %f", got, want)
+		// }
+
+		sumBuckets := int(m.Histogram.GetZeroCount())
+		current := 0
+		for _, delta := range m.Histogram.GetNegativeDelta() {
+			current += int(delta)
+			if current < 0 {
+				t.Fatalf("negative bucket population negative: %d", current)
+			}
+			sumBuckets += current
+		}
+		current = 0
+		for _, delta := range m.Histogram.GetPositiveDelta() {
+			current += int(delta)
+			if current < 0 {
+				t.Fatalf("positive bucket population negative: %d", current)
+			}
+			sumBuckets += current
+		}
+		if got, want := sumBuckets, int(*m.Histogram.SampleCount); got != want {
+			t.Errorf("got bucket population sum %d, want %d", got, want)
+		}
+
+		return true
+	}
+
+	if err := quick.Check(it, nil); err != nil {
+		t.Error(err)
+	}
+}
+
+func TestGetLe(t *testing.T) {
+	scenarios := []struct {
+		key    int
+		schema int32
+		want   float64
+	}{
+		{
+			key:    -1,
+			schema: -1,
+			want:   0.25,
+		},
+		{
+			key:    0,
+			schema: -1,
+			want:   1,
+		},
+		{
+			key:    1,
+			schema: -1,
+			want:   4,
+		},
+		{
+			key:    512,
+			schema: -1,
+			want:   math.MaxFloat64,
+		},
+		{
+			key:    513,
+			schema: -1,
+			want:   math.Inf(+1),
+		},
+		{
+			key:    -1,
+			schema: 0,
+			want:   0.5,
+		},
+		{
+			key:    0,
+			schema: 0,
+			want:   1,
+		},
+		{
+			key:    1,
+			schema: 0,
+			want:   2,
+		},
+		{
+			key:    1024,
+			schema: 0,
+			want:   math.MaxFloat64,
+		},
+		{
+			key:    1025,
+			schema: 0,
+			want:   math.Inf(+1),
+		},
+		{
+			key:    -1,
+			schema: 2,
+			want:   0.8408964152537144,
+		},
+		{
+			key:    0,
+			schema: 2,
+			want:   1,
+		},
+		{
+			key:    1,
+			schema: 2,
+			want:   1.189207115002721,
+		},
+		{
+			key:    4096,
+			schema: 2,
+			want:   math.MaxFloat64,
+		},
+		{
+			key:    4097,
+			schema: 2,
+			want:   math.Inf(+1),
+		},
+	}
+
+	for i, s := range scenarios {
+		got := getLe(s.key, s.schema)
+		if s.want != got {
+			t.Errorf("%d. key %d, schema %d, want upper bound of %g, got %g", i, s.key, s.schema, s.want, got)
+		}
+	}
+}