Merge pull request #1150 from prometheus/sparsehistogram

Merge sparsehistogram branch into main
This commit is contained in:
Björn Rabenstein 2022-10-31 16:55:36 +01:00 committed by GitHub
commit 5f202eefdb
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 1322 additions and 93 deletions

View File

@ -48,14 +48,22 @@ func NewMetrics(reg prometheus.Registerer, normMean, normDomain float64) *metric
},
[]string{"service"},
),
// The same as above, but now as a histogram, and only for the normal
// distribution. The buckets are targeted to the parameters of the
// normal distribution, with 20 buckets centered on the mean, each
// half-sigma wide.
// The same as above, but now as a histogram, and only for the
// normal distribution. The histogram features both conventional
// buckets as well as sparse buckets, the latter needed for the
// experimental native histograms (ingested by a Prometheus
// server v2.40 with the corresponding feature flag
// enabled). The conventional buckets are targeted to the
// parameters of the normal distribution, with 20 buckets
// centered on the mean, each half-sigma wide. The sparse
// buckets are always centered on zero, with a growth factor of
// one bucket to the next of (at most) 1.1. (The precise factor
// is 2^2^-3 = 1.0905077...)
rpcDurationsHistogram: prometheus.NewHistogram(prometheus.HistogramOpts{
Name: "rpc_durations_histogram_seconds",
Help: "RPC latency distributions.",
Buckets: prometheus.LinearBuckets(normMean-5*normDomain, .5*normDomain, 20),
Name: "rpc_durations_histogram_seconds",
Help: "RPC latency distributions.",
Buckets: prometheus.LinearBuckets(normMean-5*normDomain, .5*normDomain, 20),
NativeHistogramBucketFactor: 1.1,
}),
}
reg.MustRegister(m.rpcDurations)

2
go.mod
View File

@ -8,7 +8,7 @@ require (
github.com/davecgh/go-spew v1.1.1
github.com/golang/protobuf v1.5.2
github.com/json-iterator/go v1.1.12
github.com/prometheus/client_model v0.2.0
github.com/prometheus/client_model v0.3.0
github.com/prometheus/common v0.37.0
github.com/prometheus/procfs v0.8.0
golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a

3
go.sum
View File

@ -134,8 +134,9 @@ github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f/go.mod h1:qRW
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/prometheus/client_model v0.0.0-20190812154241-14fe0d1b01d4/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA=
github.com/prometheus/client_model v0.2.0 h1:uq5h0d+GuxiXLJLNABMgp2qUWDPiLvgCzz2dUR+/W/M=
github.com/prometheus/client_model v0.2.0/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA=
github.com/prometheus/client_model v0.3.0 h1:UBgGFHqYdG/TPFD1B1ogZywDqEkwp3fBMvqdiQ7Xew4=
github.com/prometheus/client_model v0.3.0/go.mod h1:LDGWKZIo7rky3hgvBe+caln+Dr3dPggB5dvjtD7w9+w=
github.com/prometheus/common v0.37.0 h1:ccBbHCgIiT9uSoFY0vX8H3zsNR5eLt17/RQLUvn8pXE=
github.com/prometheus/common v0.37.0/go.mod h1:phzohg0JFMnBEFGxTDbfu3QyL5GI8gTQJFhYO5B3mfA=
github.com/prometheus/procfs v0.8.0 h1:ODq8ZFEaYeCaZOJlZZdJA2AbQR98dSHSM1KW/You5mo=

File diff suppressed because it is too large Load Diff

View File

@ -20,6 +20,7 @@ import (
"runtime"
"sort"
"sync"
"sync/atomic"
"testing"
"testing/quick"
"time"
@ -167,7 +168,7 @@ func TestHistogramConcurrency(t *testing.T) {
start.Add(1)
end.Add(concLevel)
sum := NewHistogram(HistogramOpts{
his := NewHistogram(HistogramOpts{
Name: "test_histogram",
Help: "helpless",
Buckets: testBuckets,
@ -188,9 +189,9 @@ func TestHistogramConcurrency(t *testing.T) {
start.Wait()
for _, v := range vals {
if n%2 == 0 {
sum.Observe(v)
his.Observe(v)
} else {
sum.(ExemplarObserver).ObserveWithExemplar(v, Labels{"foo": "bar"})
his.(ExemplarObserver).ObserveWithExemplar(v, Labels{"foo": "bar"})
}
}
end.Done()
@ -201,7 +202,7 @@ func TestHistogramConcurrency(t *testing.T) {
end.Wait()
m := &dto.Metric{}
sum.Write(m)
his.Write(m)
if got, want := int(*m.Histogram.SampleCount), total; got != want {
t.Errorf("got sample count %d, want %d", got, want)
}
@ -467,3 +468,408 @@ func TestHistogramExemplar(t *testing.T) {
}
}
}
func TestSparseHistogram(t *testing.T) {
scenarios := []struct {
name string
observations []float64 // With simulated interval of 1m.
factor float64
zeroThreshold float64
maxBuckets uint32
minResetDuration time.Duration
maxZeroThreshold float64
want string // String representation of protobuf.
}{
{
name: "no sparse buckets",
observations: []float64{1, 2, 3},
factor: 1,
want: `sample_count:3 sample_sum:6 bucket:<cumulative_count:0 upper_bound:0.005 > bucket:<cumulative_count:0 upper_bound:0.01 > bucket:<cumulative_count:0 upper_bound:0.025 > bucket:<cumulative_count:0 upper_bound:0.05 > bucket:<cumulative_count:0 upper_bound:0.1 > bucket:<cumulative_count:0 upper_bound:0.25 > bucket:<cumulative_count:0 upper_bound:0.5 > bucket:<cumulative_count:1 upper_bound:1 > bucket:<cumulative_count:2 upper_bound:2.5 > bucket:<cumulative_count:3 upper_bound:5 > bucket:<cumulative_count:3 upper_bound:10 > `, // Has conventional buckets because there are no sparse buckets.
},
{
name: "factor 1.1 results in schema 3",
observations: []float64{0, 1, 2, 3},
factor: 1.1,
want: `sample_count:4 sample_sum:6 schema:3 zero_threshold:2.938735877055719e-39 zero_count:1 positive_span:<offset:0 length:1 > positive_span:<offset:7 length:1 > positive_span:<offset:4 length:1 > positive_delta:1 positive_delta:0 positive_delta:0 `,
},
{
name: "factor 1.2 results in schema 2",
observations: []float64{0, 1, 1.2, 1.4, 1.8, 2},
factor: 1.2,
want: `sample_count:6 sample_sum:7.4 schema:2 zero_threshold:2.938735877055719e-39 zero_count:1 positive_span:<offset:0 length:5 > positive_delta:1 positive_delta:-1 positive_delta:2 positive_delta:-2 positive_delta:2 `,
},
{
name: "factor 4 results in schema -1",
observations: []float64{
0.5, 1, // Bucket 0: (0.25, 1]
1.5, 2, 3, 3.5, // Bucket 1: (1, 4]
5, 6, 7, // Bucket 2: (4, 16]
33.33, // Bucket 3: (16, 64]
},
factor: 4,
want: `sample_count:10 sample_sum:62.83 schema:-1 zero_threshold:2.938735877055719e-39 zero_count:0 positive_span:<offset:0 length:4 > positive_delta:2 positive_delta:2 positive_delta:-1 positive_delta:-2 `,
},
{
name: "factor 17 results in schema -2",
observations: []float64{
0.5, 1, // Bucket 0: (0.0625, 1]
1.5, 2, 3, 3.5, 5, 6, 7, // Bucket 1: (1, 16]
33.33, // Bucket 2: (16, 256]
},
factor: 17,
want: `sample_count:10 sample_sum:62.83 schema:-2 zero_threshold:2.938735877055719e-39 zero_count:0 positive_span:<offset:0 length:3 > positive_delta:2 positive_delta:5 positive_delta:-6 `,
},
{
name: "negative buckets",
observations: []float64{0, -1, -1.2, -1.4, -1.8, -2},
factor: 1.2,
want: `sample_count:6 sample_sum:-7.4 schema:2 zero_threshold:2.938735877055719e-39 zero_count:1 negative_span:<offset:0 length:5 > negative_delta:1 negative_delta:-1 negative_delta:2 negative_delta:-2 negative_delta:2 `,
},
{
name: "negative and positive buckets",
observations: []float64{0, -1, -1.2, -1.4, -1.8, -2, 1, 1.2, 1.4, 1.8, 2},
factor: 1.2,
want: `sample_count:11 sample_sum:0 schema:2 zero_threshold:2.938735877055719e-39 zero_count:1 negative_span:<offset:0 length:5 > negative_delta:1 negative_delta:-1 negative_delta:2 negative_delta:-2 negative_delta:2 positive_span:<offset:0 length:5 > positive_delta:1 positive_delta:-1 positive_delta:2 positive_delta:-2 positive_delta:2 `,
},
{
name: "wide zero bucket",
observations: []float64{0, -1, -1.2, -1.4, -1.8, -2, 1, 1.2, 1.4, 1.8, 2},
factor: 1.2,
zeroThreshold: 1.4,
want: `sample_count:11 sample_sum:0 schema:2 zero_threshold:1.4 zero_count:7 negative_span:<offset:4 length:1 > negative_delta:2 positive_span:<offset:4 length:1 > positive_delta:2 `,
},
{
name: "NaN observation",
observations: []float64{0, 1, 1.2, 1.4, 1.8, 2, math.NaN()},
factor: 1.2,
want: `sample_count:7 sample_sum:nan schema:2 zero_threshold:2.938735877055719e-39 zero_count:1 positive_span:<offset:0 length:5 > positive_delta:1 positive_delta:-1 positive_delta:2 positive_delta:-2 positive_delta:2 `,
},
{
name: "+Inf observation",
observations: []float64{0, 1, 1.2, 1.4, 1.8, 2, math.Inf(+1)},
factor: 1.2,
want: `sample_count:7 sample_sum:inf schema:2 zero_threshold:2.938735877055719e-39 zero_count:1 positive_span:<offset:0 length:5 > positive_span:<offset:4092 length:1 > positive_delta:1 positive_delta:-1 positive_delta:2 positive_delta:-2 positive_delta:2 positive_delta:-1 `,
},
{
name: "-Inf observation",
observations: []float64{0, 1, 1.2, 1.4, 1.8, 2, math.Inf(-1)},
factor: 1.2,
want: `sample_count:7 sample_sum:-inf schema:2 zero_threshold:2.938735877055719e-39 zero_count:1 negative_span:<offset:4097 length:1 > negative_delta:1 positive_span:<offset:0 length:5 > positive_delta:1 positive_delta:-1 positive_delta:2 positive_delta:-2 positive_delta:2 `,
},
{
name: "limited buckets but nothing triggered",
observations: []float64{0, 1, 1.2, 1.4, 1.8, 2},
factor: 1.2,
maxBuckets: 4,
want: `sample_count:6 sample_sum:7.4 schema:2 zero_threshold:2.938735877055719e-39 zero_count:1 positive_span:<offset:0 length:5 > positive_delta:1 positive_delta:-1 positive_delta:2 positive_delta:-2 positive_delta:2 `,
},
{
name: "buckets limited by halving resolution",
observations: []float64{0, 1, 1.1, 1.2, 1.4, 1.8, 2, 3},
factor: 1.2,
maxBuckets: 4,
want: `sample_count:8 sample_sum:11.5 schema:1 zero_threshold:2.938735877055719e-39 zero_count:1 positive_span:<offset:0 length:5 > positive_delta:1 positive_delta:2 positive_delta:-1 positive_delta:-2 positive_delta:1 `,
},
{
name: "buckets limited by widening the zero bucket",
observations: []float64{0, 1, 1.1, 1.2, 1.4, 1.8, 2, 3},
factor: 1.2,
maxBuckets: 4,
maxZeroThreshold: 1.2,
want: `sample_count:8 sample_sum:11.5 schema:2 zero_threshold:1 zero_count:2 positive_span:<offset:1 length:7 > positive_delta:1 positive_delta:1 positive_delta:-2 positive_delta:2 positive_delta:-2 positive_delta:0 positive_delta:1 `,
},
{
name: "buckets limited by widening the zero bucket twice",
observations: []float64{0, 1, 1.1, 1.2, 1.4, 1.8, 2, 3, 4},
factor: 1.2,
maxBuckets: 4,
maxZeroThreshold: 1.2,
want: `sample_count:9 sample_sum:15.5 schema:2 zero_threshold:1.189207115002721 zero_count:3 positive_span:<offset:2 length:7 > positive_delta:2 positive_delta:-2 positive_delta:2 positive_delta:-2 positive_delta:0 positive_delta:1 positive_delta:0 `,
},
{
name: "buckets limited by reset",
observations: []float64{0, 1, 1.1, 1.2, 1.4, 1.8, 2, 3, 4},
factor: 1.2,
maxBuckets: 4,
maxZeroThreshold: 1.2,
minResetDuration: 5 * time.Minute,
want: `sample_count:2 sample_sum:7 schema:2 zero_threshold:2.938735877055719e-39 zero_count:0 positive_span:<offset:7 length:2 > positive_delta:1 positive_delta:0 `,
},
{
name: "limited buckets but nothing triggered, negative observations",
observations: []float64{0, -1, -1.2, -1.4, -1.8, -2},
factor: 1.2,
maxBuckets: 4,
want: `sample_count:6 sample_sum:-7.4 schema:2 zero_threshold:2.938735877055719e-39 zero_count:1 negative_span:<offset:0 length:5 > negative_delta:1 negative_delta:-1 negative_delta:2 negative_delta:-2 negative_delta:2 `,
},
{
name: "buckets limited by halving resolution, negative observations",
observations: []float64{0, -1, -1.1, -1.2, -1.4, -1.8, -2, -3},
factor: 1.2,
maxBuckets: 4,
want: `sample_count:8 sample_sum:-11.5 schema:1 zero_threshold:2.938735877055719e-39 zero_count:1 negative_span:<offset:0 length:5 > negative_delta:1 negative_delta:2 negative_delta:-1 negative_delta:-2 negative_delta:1 `,
},
{
name: "buckets limited by widening the zero bucket, negative observations",
observations: []float64{0, -1, -1.1, -1.2, -1.4, -1.8, -2, -3},
factor: 1.2,
maxBuckets: 4,
maxZeroThreshold: 1.2,
want: `sample_count:8 sample_sum:-11.5 schema:2 zero_threshold:1 zero_count:2 negative_span:<offset:1 length:7 > negative_delta:1 negative_delta:1 negative_delta:-2 negative_delta:2 negative_delta:-2 negative_delta:0 negative_delta:1 `,
},
{
name: "buckets limited by widening the zero bucket twice, negative observations",
observations: []float64{0, -1, -1.1, -1.2, -1.4, -1.8, -2, -3, -4},
factor: 1.2,
maxBuckets: 4,
maxZeroThreshold: 1.2,
want: `sample_count:9 sample_sum:-15.5 schema:2 zero_threshold:1.189207115002721 zero_count:3 negative_span:<offset:2 length:7 > negative_delta:2 negative_delta:-2 negative_delta:2 negative_delta:-2 negative_delta:0 negative_delta:1 negative_delta:0 `,
},
{
name: "buckets limited by reset, negative observations",
observations: []float64{0, -1, -1.1, -1.2, -1.4, -1.8, -2, -3, -4},
factor: 1.2,
maxBuckets: 4,
maxZeroThreshold: 1.2,
minResetDuration: 5 * time.Minute,
want: `sample_count:2 sample_sum:-7 schema:2 zero_threshold:2.938735877055719e-39 zero_count:0 negative_span:<offset:7 length:2 > negative_delta:1 negative_delta:0 `,
},
{
name: "buckets limited by halving resolution, then reset",
observations: []float64{0, 1, 1.1, 1.2, 1.4, 1.8, 2, 5, 5.1, 3, 4},
factor: 1.2,
maxBuckets: 4,
minResetDuration: 9 * time.Minute,
want: `sample_count:2 sample_sum:7 schema:2 zero_threshold:2.938735877055719e-39 zero_count:0 positive_span:<offset:7 length:2 > positive_delta:1 positive_delta:0 `,
},
{
name: "buckets limited by widening the zero bucket, then reset",
observations: []float64{0, 1, 1.1, 1.2, 1.4, 1.8, 2, 5, 5.1, 3, 4},
factor: 1.2,
maxBuckets: 4,
maxZeroThreshold: 1.2,
minResetDuration: 9 * time.Minute,
want: `sample_count:2 sample_sum:7 schema:2 zero_threshold:2.938735877055719e-39 zero_count:0 positive_span:<offset:7 length:2 > positive_delta:1 positive_delta:0 `,
},
}
for _, s := range scenarios {
t.Run(s.name, func(t *testing.T) {
his := NewHistogram(HistogramOpts{
Name: "name",
Help: "help",
NativeHistogramBucketFactor: s.factor,
NativeHistogramZeroThreshold: s.zeroThreshold,
NativeHistogramMaxBucketNumber: s.maxBuckets,
NativeHistogramMinResetDuration: s.minResetDuration,
NativeHistogramMaxZeroThreshold: s.maxZeroThreshold,
})
ts := time.Now().Add(30 * time.Second)
now := func() time.Time {
return ts
}
his.(*histogram).now = now
for _, o := range s.observations {
his.Observe(o)
ts = ts.Add(time.Minute)
}
m := &dto.Metric{}
if err := his.Write(m); err != nil {
t.Fatal("unexpected error writing metric", err)
}
got := m.Histogram.String()
if s.want != got {
t.Errorf("want histogram %q, got %q", s.want, got)
}
})
}
}
// TestSparseHistogramConcurrency observes normally distributed random values
// from several goroutines at once while a shared fake clock is advanced
// atomically. It then checks that the written native histogram is internally
// consistent: no decoded bucket population is negative, and the zero bucket
// plus all negative and positive bucket populations add up to the reported
// sample count.
//
// The test is skipped in short mode. It is driven by quick.Check, which calls
// the inner function with random uint32 values that determine the number of
// observations per goroutine (10000 to 19999) and the number of goroutines
// (1 to 5).
func TestSparseHistogramConcurrency(t *testing.T) {
	if testing.Short() {
		t.Skip("Skipping test in short mode.")
	}

	rand.Seed(42)

	it := func(n uint32) bool {
		mutations := int(n%1e4 + 1e4)
		concLevel := int(n%5 + 1)
		total := mutations * concLevel

		var start, end sync.WaitGroup
		start.Add(1)
		end.Add(concLevel)

		his := NewHistogram(HistogramOpts{
			Name:                            "test_sparse_histogram",
			Help:                            "This help is sparse.",
			NativeHistogramBucketFactor:     1.05,
			NativeHistogramZeroThreshold:    0.0000001,
			NativeHistogramMaxBucketNumber:  50,
			NativeHistogramMinResetDuration: time.Hour, // Comment out to test for totals below.
			NativeHistogramMaxZeroThreshold: 0.001,
		})

		// Fake clock, stored as Unix seconds so that the observing
		// goroutines can advance it atomically.
		ts := time.Now().Add(30 * time.Second).Unix()
		now := func() time.Time {
			return time.Unix(atomic.LoadInt64(&ts), 0)
		}
		his.(*histogram).now = now

		// NOTE(review): allVars is filled and sorted below but not read
		// by any of the active assertions in this function.
		allVars := make([]float64, total)
		var sampleSum float64
		for i := 0; i < concLevel; i++ {
			vals := make([]float64, mutations)
			for j := 0; j < mutations; j++ {
				v := rand.NormFloat64()
				vals[j] = v
				allVars[i*mutations+j] = v
				sampleSum += v
			}

			go func(vals []float64) {
				// Block until all goroutines have been set up.
				start.Wait()
				for _, v := range vals {
					// An observation every 1 to 10 seconds.
					atomic.AddInt64(&ts, rand.Int63n(10)+1)
					his.Observe(v)
				}
				end.Done()
			}(vals)
		}
		sort.Float64s(allVars)
		start.Done()
		end.Wait()

		m := &dto.Metric{}
		// Do not drop the error: a failed Write would leave m unpopulated
		// and make the checks below meaningless.
		if err := his.Write(m); err != nil {
			t.Fatal("unexpected error writing metric", err)
		}

		// Uncomment these tests for totals only if you have disabled histogram resets above.
		//
		// if got, want := int(*m.Histogram.SampleCount), total; got != want {
		// 	t.Errorf("got sample count %d, want %d", got, want)
		// }
		// if got, want := *m.Histogram.SampleSum, sampleSum; math.Abs((got-want)/want) > 0.001 {
		// 	t.Errorf("got sample sum %f, want %f", got, want)
		// }

		// Bucket counts are delta-encoded: the running sum of the deltas
		// yields the population of each consecutive bucket.
		sumBuckets := int(m.Histogram.GetZeroCount())
		current := 0
		for _, delta := range m.Histogram.GetNegativeDelta() {
			current += int(delta)
			if current < 0 {
				t.Fatalf("negative bucket population negative: %d", current)
			}
			sumBuckets += current
		}
		current = 0
		for _, delta := range m.Histogram.GetPositiveDelta() {
			current += int(delta)
			if current < 0 {
				t.Fatalf("positive bucket population negative: %d", current)
			}
			sumBuckets += current
		}
		// Use the nil-safe getter, like the other field accesses above,
		// rather than dereferencing the optional field directly.
		if got, want := sumBuckets, int(m.Histogram.GetSampleCount()); got != want {
			t.Errorf("got bucket population sum %d, want %d", got, want)
		}

		return true
	}

	if err := quick.Check(it, nil); err != nil {
		t.Error(err)
	}
}
// TestGetLe checks the upper bound returned by getLe for a selection of
// bucket keys under schemas -1, 0, and 2, including the keys for which
// math.MaxFloat64 and +Inf are expected.
func TestGetLe(t *testing.T) {
	cases := []struct {
		key    int
		schema int32
		want   float64
	}{
		// Schema -1.
		{key: -1, schema: -1, want: 0.25},
		{key: 0, schema: -1, want: 1},
		{key: 1, schema: -1, want: 4},
		{key: 512, schema: -1, want: math.MaxFloat64},
		{key: 513, schema: -1, want: math.Inf(+1)},
		// Schema 0.
		{key: -1, schema: 0, want: 0.5},
		{key: 0, schema: 0, want: 1},
		{key: 1, schema: 0, want: 2},
		{key: 1024, schema: 0, want: math.MaxFloat64},
		{key: 1025, schema: 0, want: math.Inf(+1)},
		// Schema 2.
		{key: -1, schema: 2, want: 0.8408964152537144},
		{key: 0, schema: 2, want: 1},
		{key: 1, schema: 2, want: 1.189207115002721},
		{key: 4096, schema: 2, want: math.MaxFloat64},
		{key: 4097, schema: 2, want: math.Inf(+1)},
	}

	for idx, tc := range cases {
		if got := getLe(tc.key, tc.schema); tc.want != got {
			t.Errorf("%d. key %d, schema %d, want upper bound of %g, got %g", idx, tc.key, tc.schema, tc.want, got)
		}
	}
}