From 7cf09554214c5d2a1cc31d78629b92bd97edebe4 Mon Sep 17 00:00:00 2001 From: Bjoern Rabenstein Date: Sat, 4 May 2019 01:00:59 +0200 Subject: [PATCH] Handle long ReadMemStats duration in Go collector tl;dr: Return previous memstats if reading new ones takes longer than 1s. See the doc comment of NewGoCollector for details. Signed-off-by: beorn7 --- prometheus/go_collector.go | 86 +++++++++++++++++++++++++++++---- prometheus/go_collector_test.go | 85 ++++++++++++++++++++++++++++++++ 2 files changed, 161 insertions(+), 10 deletions(-) diff --git a/prometheus/go_collector.go b/prometheus/go_collector.go index fcb5ccb..b108ec5 100644 --- a/prometheus/go_collector.go +++ b/prometheus/go_collector.go @@ -16,6 +16,7 @@ package prometheus import ( "runtime" "runtime/debug" + "sync" "time" ) @@ -25,16 +26,41 @@ type goCollector struct { gcDesc *Desc goInfoDesc *Desc - // metrics to describe and collect - metrics memStatsMetrics + // ms... are memstats related. + msLast *runtime.MemStats // Previously collected memstats. + msLastTimestamp time.Time + msMtx sync.Mutex // Protects msLast and msLastTimestamp. + msMetrics memStatsMetrics + msRead func(*runtime.MemStats) // For mocking in tests. + msMaxWait time.Duration // Wait time for fresh memstats. + msMaxAge time.Duration // Maximum allowed age of old memstats. } // NewGoCollector returns a collector which exports metrics about the current Go // process. This includes memory stats. To collect those, runtime.ReadMemStats -// is called. This causes a stop-the-world, which is very short with Go1.9+ -// (~25µs). However, with older Go versions, the stop-the-world duration depends -// on the heap size and can be quite significant (~1.7 ms/GiB as per +// is called. This requires to “stop the world”, which usually only happens for +// garbage collection (GC). Take the following implications into account when +// deciding whether to use the Go collector: +// +// 1. The performance impact of stopping the world is the more relevant the more +// frequently metrics are collected. However, with Go1.9 or later the +// stop-the-world time per metrics collection is very short (~25µs) so that the +// performance impact will only matter in rare cases. However, with older Go +// versions, the stop-the-world duration depends on the heap size and can be +// quite significant (~1.7 ms/GiB as per // https://go-review.googlesource.com/c/go/+/34937). +// +// 2. During an ongoing GC, nothing else can stop the world. Therefore, if the +// metrics collection happens to coincide with GC, it will only complete after +// GC has finished. Usually, GC is fast enough to not cause problems. However, +// with a very large heap, GC might take multiple seconds, which is enough to +// cause scrape timeouts in common setups. To avoid this problem, the Go +// collector will use the memstats from a previous collection if +// runtime.ReadMemStats takes more than 1s. However, if there are no previously +// collected memstats, or their collection is more than 5m ago, the collection +// will block until runtime.ReadMemStats succeeds. (The problem might be solved +// in Go1.13, see https://github.com/golang/go/issues/19812 for the related Go +// issue.) func NewGoCollector() Collector { return &goCollector{ goroutinesDesc: NewDesc( @@ -53,7 +79,11 @@ func NewGoCollector() Collector { "go_info", "Information about the Go environment.", nil, Labels{"version": runtime.Version()}), - metrics: memStatsMetrics{ + msLast: &runtime.MemStats{}, + msRead: runtime.ReadMemStats, + msMaxWait: time.Second, + msMaxAge: 5 * time.Minute, + msMetrics: memStatsMetrics{ { desc: NewDesc( memstatNamespace("alloc_bytes"), @@ -261,13 +291,27 @@ func (c *goCollector) Describe(ch chan<- *Desc) { ch <- c.threadsDesc ch <- c.gcDesc ch <- c.goInfoDesc - for _, i := range c.metrics { + for _, i := range c.msMetrics { ch <- i.desc } } // Collect returns the current state of all metrics of the collector. func (c *goCollector) Collect(ch chan<- Metric) { + var ( + ms = &runtime.MemStats{} + done = make(chan struct{}) + ) + // Start reading memstats first as it might take a while. + go func() { + c.msRead(ms) + c.msMtx.Lock() + c.msLast = ms + c.msLastTimestamp = time.Now() + c.msMtx.Unlock() + close(done) + }() + ch <- MustNewConstMetric(c.goroutinesDesc, GaugeValue, float64(runtime.NumGoroutine())) n, _ := runtime.ThreadCreateProfile(nil) ch <- MustNewConstMetric(c.threadsDesc, GaugeValue, float64(n)) @@ -285,9 +329,31 @@ func (c *goCollector) Collect(ch chan<- Metric) { ch <- MustNewConstMetric(c.goInfoDesc, GaugeValue, 1) - ms := &runtime.MemStats{} - runtime.ReadMemStats(ms) - for _, i := range c.metrics { + timer := time.NewTimer(c.msMaxWait) + select { + case <-done: // Our own ReadMemStats succeeded in time. Use it. + timer.Stop() // Important for high collection frequencies to not pile up timers. + c.msCollect(ch, ms) + return + case <-timer.C: // Time out, use last memstats if possible. Continue below. + } + c.msMtx.Lock() + if time.Since(c.msLastTimestamp) < c.msMaxAge { + // Last memstats are recent enough. Collect from them under the lock. + c.msCollect(ch, c.msLast) + c.msMtx.Unlock() + return + } + // If we are here, the last memstats are too old or don't exist. We have + // to wait until our own ReadMemStats finally completes. For that to + // happen, we have to release the lock. + c.msMtx.Unlock() + <-done + c.msCollect(ch, ms) +} + +func (c *goCollector) msCollect(ch chan<- Metric, ms *runtime.MemStats) { + for _, i := range c.msMetrics { ch <- MustNewConstMetric(i.desc, i.valType, i.eval(ms)) } } diff --git a/prometheus/go_collector_test.go b/prometheus/go_collector_test.go index 7ab3968..5a89b25 100644 --- a/prometheus/go_collector_test.go +++ b/prometheus/go_collector_test.go @@ -150,3 +150,88 @@ func TestGoCollectorGC(t *testing.T) { break } } + +func TestGoCollectorMemStats(t *testing.T) { + var ( + c = NewGoCollector().(*goCollector) + got uint64 + ) + + checkCollect := func(want uint64) { + metricCh := make(chan Metric) + endCh := make(chan struct{}) + + go func() { + c.Collect(metricCh) + close(endCh) + }() + Collect: + for { + select { + case metric := <-metricCh: + if metric.Desc().fqName != "go_memstats_alloc_bytes" { + continue Collect + } + pb := &dto.Metric{} + metric.Write(pb) + got = uint64(pb.GetGauge().GetValue()) + case <-endCh: + break Collect + } + } + if want != got { + t.Errorf("unexpected value of go_memstats_alloc_bytes, want %d, got %d", want, got) + } + } + + // Speed up the timing to make the tast faster. + c.msMaxWait = time.Millisecond + c.msMaxAge = 10 * time.Millisecond + + // Scenario 1: msRead responds slowly, no previous memstats available, + // msRead is executed anyway. + c.msRead = func(ms *runtime.MemStats) { + time.Sleep(3 * time.Millisecond) + ms.Alloc = 1 + } + checkCollect(1) + // Now msLast is set. + if want, got := uint64(1), c.msLast.Alloc; want != got { + t.Errorf("unexpected of msLast.Alloc, want %d, got %d", want, got) + } + + // Scenario 2: msRead responds fast, previous memstats available, new + // value collected. + c.msRead = func(ms *runtime.MemStats) { + ms.Alloc = 2 + } + checkCollect(2) + // msLast is set, too. + if want, got := uint64(2), c.msLast.Alloc; want != got { + t.Errorf("unexpected of msLast.Alloc, want %d, got %d", want, got) + } + + // Scenario 3: msRead responds slowly, previous memstats available, old + // value collected. + c.msRead = func(ms *runtime.MemStats) { + time.Sleep(3 * time.Millisecond) + ms.Alloc = 3 + } + checkCollect(2) + // After waiting, new value is still set in msLast. + time.Sleep(12 * time.Millisecond) + if want, got := uint64(3), c.msLast.Alloc; want != got { + t.Errorf("unexpected of msLast.Alloc, want %d, got %d", want, got) + } + + // Scenario 4: msRead responds slowly, previous memstats is too old, new + // value collected. + c.msRead = func(ms *runtime.MemStats) { + time.Sleep(3 * time.Millisecond) + ms.Alloc = 4 + } + checkCollect(4) + if want, got := uint64(4), c.msLast.Alloc; want != got { + t.Errorf("unexpected of msLast.Alloc, want %d, got %d", want, got) + } +}