Reduce granularity of histogram buckets for Go 1.17 collector (#974)

The Go runtime/metrics package currently exports extremely granular
histograms. Exponentially bucket any histogram with unit "seconds"
or "bytes" instead to dramatically reduce the number of buckets, and
thus the number of metrics.

This change also adds a test to check for expected cardinality to
prevent cardinality surprises in the future.

Signed-off-by: Michael Anthony Knyszek <mknyszek@google.com>
This commit is contained in:
Michael Knyszek 2022-01-27 23:46:45 -05:00 committed by Kemal Akkoyun
parent 5a529ae06b
commit 9b785b0349
5 changed files with 205 additions and 27 deletions

View File

@ -21,7 +21,9 @@ import (
"fmt" "fmt"
"go/format" "go/format"
"log" "log"
"math"
"os" "os"
"runtime"
"runtime/metrics" "runtime/metrics"
"strconv" "strconv"
"strings" "strings"
@ -35,6 +37,10 @@ func main() {
if len(os.Args) != 2 { if len(os.Args) != 2 {
log.Fatal("requires Go version (e.g. go1.17) as an argument") log.Fatal("requires Go version (e.g. go1.17) as an argument")
} }
toolVersion := runtime.Version()
if majorVersion := toolVersion[:strings.LastIndexByte(toolVersion, '.')]; majorVersion != os.Args[1] {
log.Fatalf("using Go version %q but expected Go version %q", majorVersion, os.Args[1])
}
version, err := parseVersion(os.Args[1]) version, err := parseVersion(os.Args[1])
if err != nil { if err != nil {
log.Fatalf("parsing Go version: %v", err) log.Fatalf("parsing Go version: %v", err)
@ -45,9 +51,11 @@ func main() {
err = testFile.Execute(&buf, struct { err = testFile.Execute(&buf, struct {
Descriptions []metrics.Description Descriptions []metrics.Description
GoVersion goVersion GoVersion goVersion
Cardinality int
}{ }{
Descriptions: metrics.All(), Descriptions: metrics.All(),
GoVersion: version, GoVersion: version,
Cardinality: rmCardinality(),
}) })
if err != nil { if err != nil {
log.Fatalf("executing template: %v", err) log.Fatalf("executing template: %v", err)
@ -85,6 +93,46 @@ func parseVersion(s string) (goVersion, error) {
return goVersion(i), err return goVersion(i), err
} }
// rmCardinality returns the total number of Prometheus series produced by
// the runtime/metrics-derived metrics, counting each histogram bucket
// (plus its sum, count, and implicit +Inf bucket) individually.
func rmCardinality() int {
	// Every non-histogram metric contributes exactly one series. Histogram
	// samples are gathered first so that their bucket boundaries — which
	// the API guarantees are fixed for the lifetime of the process — can
	// be inspected below.
	total := 0
	var histSamples []metrics.Sample
	for _, desc := range metrics.All() {
		if desc.Kind != metrics.KindFloat64Histogram {
			total++
			continue
		}
		histSamples = append(histSamples, metrics.Sample{Name: desc.Name})
	}

	// Read once to populate the histogram bucket boundaries.
	metrics.Read(histSamples)
	for _, s := range histSamples {
		unit := s.Name[strings.IndexRune(s.Name, ':')+1:]
		buckets := internal.RuntimeMetricsBucketsForUnit(
			s.Value.Float64Histogram().Buckets,
			unit,
		)
		// runtime/metrics boundaries are lower-bound-inclusive and list one
		// more value than there are buckets (each value is a *boundary*),
		// while Prometheus buckets map one-to-one: the bottom bucket extends
		// to -Inf and the top +Inf bucket is implicit. So a boundary list of
		// length n yields n-1 explicit buckets, plus the implicit +Inf
		// bucket, plus the _sum and _count series: (n-1) + 1 + 2 = n + 2.
		total += len(buckets) + 2
		if buckets[len(buckets)-1] == math.Inf(1) {
			// A trailing +Inf boundary duplicates the implicit +Inf bucket
			// already counted above; don't count it twice.
			total--
		}
	}
	return total
}
var testFile = template.Must(template.New("testFile").Funcs(map[string]interface{}{ var testFile = template.Must(template.New("testFile").Funcs(map[string]interface{}{
"rm2prom": func(d metrics.Description) string { "rm2prom": func(d metrics.Description) string {
ns, ss, n, ok := internal.RuntimeMetricsToProm(&d) ns, ss, n, ok := internal.RuntimeMetricsToProm(&d)
@ -112,4 +160,6 @@ var expectedRuntimeMetrics = map[string]string{
{{- end -}} {{- end -}}
{{end}} {{end}}
} }
const expectedRuntimeMetricsCardinality = {{.Cardinality}}
`)) `))

View File

@ -20,6 +20,7 @@ import (
"math" "math"
"runtime" "runtime"
"runtime/metrics" "runtime/metrics"
"strings"
"sync" "sync"
//nolint:staticcheck // Ignore SA1019. Need to keep deprecated package for compatibility. //nolint:staticcheck // Ignore SA1019. Need to keep deprecated package for compatibility.
@ -56,9 +57,20 @@ type goCollector struct {
// Deprecated: Use collectors.NewGoCollector instead. // Deprecated: Use collectors.NewGoCollector instead.
func NewGoCollector() Collector { func NewGoCollector() Collector {
descriptions := metrics.All() descriptions := metrics.All()
descMap := make(map[string]*metrics.Description)
for i := range descriptions { // Collect all histogram samples so that we can get their buckets.
descMap[descriptions[i].Name] = &descriptions[i] // The API guarantees that the buckets are always fixed for the lifetime
// of the process.
var histograms []metrics.Sample
for _, d := range descriptions {
if d.Kind == metrics.KindFloat64Histogram {
histograms = append(histograms, metrics.Sample{Name: d.Name})
}
}
metrics.Read(histograms)
bucketsMap := make(map[string][]float64)
for i := range histograms {
bucketsMap[histograms[i].Name] = histograms[i].Value.Float64Histogram().Buckets
} }
// Generate a Desc and ValueType for each runtime/metrics metric. // Generate a Desc and ValueType for each runtime/metrics metric.
@ -83,6 +95,7 @@ func NewGoCollector() Collector {
var m collectorMetric var m collectorMetric
if d.Kind == metrics.KindFloat64Histogram { if d.Kind == metrics.KindFloat64Histogram {
_, hasSum := rmExactSumMap[d.Name] _, hasSum := rmExactSumMap[d.Name]
unit := d.Name[strings.IndexRune(d.Name, ':')+1:]
m = newBatchHistogram( m = newBatchHistogram(
NewDesc( NewDesc(
BuildFQName(namespace, subsystem, name), BuildFQName(namespace, subsystem, name),
@ -90,6 +103,7 @@ func NewGoCollector() Collector {
nil, nil,
nil, nil,
), ),
internal.RuntimeMetricsBucketsForUnit(bucketsMap[d.Name], unit),
hasSum, hasSum,
) )
} else if d.Cumulative { } else if d.Cumulative {
@ -299,13 +313,27 @@ type batchHistogram struct {
// but Write calls may operate concurrently with updates. // but Write calls may operate concurrently with updates.
// Contention between these two sources should be rare. // Contention between these two sources should be rare.
mu sync.Mutex mu sync.Mutex
buckets []float64 // Inclusive lower bounds. buckets []float64 // Inclusive lower bounds, like runtime/metrics.
counts []uint64 counts []uint64
sum float64 // Used if hasSum is true. sum float64 // Used if hasSum is true.
} }
func newBatchHistogram(desc *Desc, hasSum bool) *batchHistogram { // newBatchHistogram creates a new batch histogram value with the given
h := &batchHistogram{desc: desc, hasSum: hasSum} // Desc, buckets, and whether or not it has an exact sum available.
//
// buckets must always be from the runtime/metrics package, following
// the same conventions.
func newBatchHistogram(desc *Desc, buckets []float64, hasSum bool) *batchHistogram {
h := &batchHistogram{
desc: desc,
buckets: buckets,
// Because buckets follows runtime/metrics conventions, there's
// 1 more value in the buckets list than there are buckets represented,
// because in runtime/metrics, the bucket values represent *boundaries*,
// and non-Inf boundaries are inclusive lower bounds for that bucket.
counts: make([]uint64, len(buckets)-1),
hasSum: hasSum,
}
h.init(h) h.init(h)
return h return h
} }
@ -313,28 +341,25 @@ func newBatchHistogram(desc *Desc, hasSum bool) *batchHistogram {
// update updates the batchHistogram from a runtime/metrics histogram. // update updates the batchHistogram from a runtime/metrics histogram.
// //
// sum must be provided if the batchHistogram was created to have an exact sum. // sum must be provided if the batchHistogram was created to have an exact sum.
// h.buckets must be a strict subset of his.Buckets.
func (h *batchHistogram) update(his *metrics.Float64Histogram, sum float64) { func (h *batchHistogram) update(his *metrics.Float64Histogram, sum float64) {
counts, buckets := his.Counts, his.Buckets counts, buckets := his.Counts, his.Buckets
// Skip a -Inf bucket altogether. It's not clear how to represent that.
if math.IsInf(buckets[0], -1) {
buckets = buckets[1:]
counts = counts[1:]
}
h.mu.Lock() h.mu.Lock()
defer h.mu.Unlock() defer h.mu.Unlock()
// Check if we're initialized. // Clear buckets.
if h.buckets == nil { for i := range h.counts {
// Make copies of counts and buckets. It's really important h.counts[i] = 0
// that we don't retain his.Counts or his.Buckets anywhere since }
// it's going to get reused. // Copy and reduce buckets.
h.buckets = make([]float64, len(buckets)) var j int
copy(h.buckets, buckets) for i, count := range counts {
h.counts[j] += count
h.counts = make([]uint64, len(counts)) if buckets[i+1] == h.buckets[j+1] {
j++
}
} }
copy(h.counts, counts)
if h.hasSum { if h.hasSum {
h.sum = sum h.sum = sum
} }

View File

@ -140,9 +140,13 @@ func TestBatchHistogram(t *testing.T) {
} }
metrics.Read(s) metrics.Read(s)
rmHist := s[0].Value.Float64Histogram() rmHist := s[0].Value.Float64Histogram()
// runtime/metrics histograms always have -Inf and +Inf buckets. wantBuckets := internal.RuntimeMetricsBucketsForUnit(rmHist.Buckets, "bytes")
// We never handle -Inf and +Inf is implicit. // runtime/metrics histograms always have a +Inf bucket and are lower
wantBuckets := len(rmHist.Buckets) - 2 // bound inclusive. In contrast, we have an implicit +Inf bucket and
// are upper bound inclusive, so we can chop off the first bucket
// (since the conversion to upper bound inclusive will shift all buckets
// down one index) and the +Inf for the last bucket.
wantBuckets = wantBuckets[1 : len(wantBuckets)-1]
// Check to make sure the output proto makes sense. // Check to make sure the output proto makes sense.
pb := &dto.Metric{} pb := &dto.Metric{}
@ -151,14 +155,14 @@ func TestBatchHistogram(t *testing.T) {
if math.IsInf(pb.Histogram.Bucket[len(pb.Histogram.Bucket)-1].GetUpperBound(), +1) { if math.IsInf(pb.Histogram.Bucket[len(pb.Histogram.Bucket)-1].GetUpperBound(), +1) {
t.Errorf("found +Inf bucket") t.Errorf("found +Inf bucket")
} }
if got := len(pb.Histogram.Bucket); got != wantBuckets { if got := len(pb.Histogram.Bucket); got != len(wantBuckets) {
t.Errorf("got %d buckets in protobuf, want %d", got, wantBuckets) t.Errorf("got %d buckets in protobuf, want %d", got, len(wantBuckets))
} }
for i, bucket := range pb.Histogram.Bucket { for i, bucket := range pb.Histogram.Bucket {
// runtime/metrics histograms are lower-bound inclusive, but we're // runtime/metrics histograms are lower-bound inclusive, but we're
// upper-bound inclusive. So just make sure the new inclusive upper // upper-bound inclusive. So just make sure the new inclusive upper
// bound is somewhere close by (in some cases it's equal). // bound is somewhere close by (in some cases it's equal).
wantBound := rmHist.Buckets[i+1] wantBound := wantBuckets[i]
if gotBound := *bucket.UpperBound; (wantBound-gotBound)/wantBound > 0.001 { if gotBound := *bucket.UpperBound; (wantBound-gotBound)/wantBound > 0.001 {
t.Errorf("got bound %f, want within 0.1%% of %f", gotBound, wantBound) t.Errorf("got bound %f, want within 0.1%% of %f", gotBound, wantBound)
} }
@ -244,6 +248,7 @@ func TestExpectedRuntimeMetrics(t *testing.T) {
descs := metrics.All() descs := metrics.All()
rmSet := make(map[string]struct{}) rmSet := make(map[string]struct{})
// Iterate over runtime-reported descriptions to find new metrics.
for i := range descs { for i := range descs {
rmName := descs[i].Name rmName := descs[i].Name
rmSet[rmName] = struct{}{} rmSet[rmName] = struct{}{}
@ -263,6 +268,8 @@ func TestExpectedRuntimeMetrics(t *testing.T) {
continue continue
} }
} }
// Now iterate over the expected metrics and look for removals.
cardinality := 0
for rmName, fqName := range expectedRuntimeMetrics { for rmName, fqName := range expectedRuntimeMetrics {
if _, ok := rmSet[rmName]; !ok { if _, ok := rmSet[rmName]; !ok {
t.Errorf("runtime/metrics metric %s removed", rmName) t.Errorf("runtime/metrics metric %s removed", rmName)
@ -272,6 +279,30 @@ func TestExpectedRuntimeMetrics(t *testing.T) {
t.Errorf("runtime/metrics metric %s not appearing under expected name %s", rmName, fqName) t.Errorf("runtime/metrics metric %s not appearing under expected name %s", rmName, fqName)
continue continue
} }
// While we're at it, check to make sure expected cardinality lines
// up, but at the point of the protobuf write to get as close to the
// real deal as possible.
//
// Note that we filter out non-runtime/metrics metrics here, because
// those are manually managed.
var m dto.Metric
if err := goMetricSet[fqName].Write(&m); err != nil {
t.Errorf("writing metric %s: %v", fqName, err)
continue
}
// N.B. These are the only fields populated by runtime/metrics metrics specifically.
// Other fields are populated by e.g. GCStats metrics.
switch {
case m.Counter != nil:
fallthrough
case m.Gauge != nil:
cardinality++
case m.Histogram != nil:
cardinality += len(m.Histogram.Bucket) + 3 // + sum, count, and +inf
default:
t.Errorf("unexpected protobuf structure for metric %s", fqName)
}
} }
if t.Failed() { if t.Failed() {
@ -279,6 +310,11 @@ func TestExpectedRuntimeMetrics(t *testing.T) {
t.Log("\tgo run gen_go_collector_metrics_set.go go1.X") t.Log("\tgo run gen_go_collector_metrics_set.go go1.X")
t.Log("where X is the Go version you are currently using") t.Log("where X is the Go version you are currently using")
} }
expectCardinality := expectedRuntimeMetricsCardinality
if cardinality != expectCardinality {
t.Errorf("unexpected cardinality for runtime/metrics metrics: got %d, want %d", cardinality, expectCardinality)
}
} }
func TestGoCollectorConcurrency(t *testing.T) { func TestGoCollectorConcurrency(t *testing.T) {

View File

@ -37,3 +37,5 @@ var expectedRuntimeMetrics = map[string]string{
"/sched/goroutines:goroutines": "go_sched_goroutines_goroutines", "/sched/goroutines:goroutines": "go_sched_goroutines_goroutines",
"/sched/latencies:seconds": "go_sched_latencies_seconds", "/sched/latencies:seconds": "go_sched_latencies_seconds",
} }
const expectedRuntimeMetricsCardinality = 79

View File

@ -17,6 +17,7 @@
package internal package internal
import ( import (
"math"
"path" "path"
"runtime/metrics" "runtime/metrics"
"strings" "strings"
@ -75,3 +76,67 @@ func RuntimeMetricsToProm(d *metrics.Description) (string, string, string, bool)
} }
return namespace, subsystem, name, valid return namespace, subsystem, name, valid
} }
// RuntimeMetricsBucketsForUnit takes a set of buckets obtained for a runtime/metrics histogram
// type (so, lower-bound inclusive) and a unit from a runtime/metrics name, and produces
// a reduced set of buckets. Buckets for the units "bytes" and "seconds" are downsampled
// to a roughly exponential scheme; any other unit is returned unchanged.
//
// Note that a leading -Inf boundary is preserved here to maintain runtime/metrics
// conventions; it is the caller's responsibility to strip it when converting to
// Prometheus' upper-bound-inclusive representation.
func RuntimeMetricsBucketsForUnit(buckets []float64, unit string) []float64 {
	// Nothing to reduce. This also guards rebucketExp against indexing
	// into an empty boundary list.
	if len(buckets) == 0 {
		return buckets
	}
	switch unit {
	case "bytes":
		// Rebucket as powers of 2.
		return rebucketExp(buckets, 2)
	case "seconds":
		// Rebucket as powers of 10 and then merge all buckets greater
		// than 1 second into the +Inf bucket.
		b := rebucketExp(buckets, 10)
		for i := range b {
			if b[i] <= 1 {
				continue
			}
			b[i] = math.Inf(1)
			b = b[:i+1]
			break
		}
		return b
	}
	return buckets
}

// rebucketExp takes a list of bucket boundaries (lower bound inclusive) and
// downsamples the buckets to those a multiple of base apart. The end result
// is a roughly exponential (in many cases, perfectly exponential) bucketing
// scheme.
func rebucketExp(buckets []float64, base float64) []float64 {
	// Zero or one boundary cannot be reduced any further. This also avoids
	// indexing past the end of the list after stripping a lone -Inf below.
	if len(buckets) <= 1 {
		return buckets
	}
	bucket := buckets[0]
	var newBuckets []float64
	// We may see a -Inf here, in which case, add it and skip it
	// since we risk producing NaNs otherwise.
	//
	// We need to preserve -Inf values to maintain runtime/metrics
	// conventions. We'll strip it out later.
	if bucket == math.Inf(-1) {
		newBuckets = append(newBuckets, bucket)
		buckets = buckets[1:]
		bucket = buckets[0]
	}
	// From now on, bucket should always have a non-Inf value because
	// Infs are only ever at the ends of the bucket lists, so
	// arithmetic operations on it are non-NaN.
	for i := 1; i < len(buckets); i++ {
		if bucket >= 0 && buckets[i] < bucket*base {
			// The next boundary we want to keep is at least bucket*base,
			// so skip this one.
			continue
		} else if bucket < 0 && buckets[i] < bucket/base {
			// In this case the bucket we're targeting is negative, and since
			// we're ascending through buckets here, we need to divide to get
			// closer to zero exponentially.
			continue
		}
		// The +Inf boundary will always be the last one, and we'll always
		// end up including it here, because no finite bucket*base (or
		// bucket/base) ever exceeds +Inf, so neither skip branch above
		// can trigger for it.
		newBuckets = append(newBuckets, bucket)
		bucket = buckets[i]
	}
	return append(newBuckets, bucket)
}