From 016273b1f98f93a268e8eba12a5548c315fc0a02 Mon Sep 17 00:00:00 2001 From: beorn7 Date: Sun, 5 May 2019 23:44:22 +0200 Subject: [PATCH 1/3] Fix and tweak Go collector tests Signed-off-by: beorn7 --- prometheus/go_collector_test.go | 86 +++++++++++++++++++-------------- 1 file changed, 51 insertions(+), 35 deletions(-) diff --git a/prometheus/go_collector_test.go b/prometheus/go_collector_test.go index f93dcdc..7ab3968 100644 --- a/prometheus/go_collector_test.go +++ b/prometheus/go_collector_test.go @@ -21,28 +21,40 @@ import ( dto "github.com/prometheus/client_model/go" ) -func TestGoCollector(t *testing.T) { +func TestGoCollectorGoroutines(t *testing.T) { var ( - c = NewGoCollector() - ch = make(chan Metric) - waitc = make(chan struct{}) - closec = make(chan struct{}) - old = -1 + c = NewGoCollector() + metricCh = make(chan Metric) + waitCh = make(chan struct{}) + endGoroutineCh = make(chan struct{}) + endCollectionCh = make(chan struct{}) + old = -1 ) - defer close(closec) + defer func() { + close(endGoroutineCh) + // Drain the collect channel to prevent goroutine leak. + for { + select { + case <-metricCh: + case <-endCollectionCh: + return + } + } + }() go func() { - c.Collect(ch) + c.Collect(metricCh) go func(c <-chan struct{}) { <-c - }(closec) - <-waitc - c.Collect(ch) + }(endGoroutineCh) + <-waitCh + c.Collect(metricCh) + close(endCollectionCh) }() for { select { - case m := <-ch: + case m := <-metricCh: // m can be Gauge or Counter, // currently just test the go_goroutines Gauge // and ignore others. @@ -57,7 +69,7 @@ func TestGoCollector(t *testing.T) { if old == -1 { old = int(pb.GetGauge().GetValue()) - close(waitc) + close(waitCh) continue } @@ -65,43 +77,47 @@ func TestGoCollector(t *testing.T) { // TODO: This is flaky in highly concurrent situations. t.Errorf("want 1 new goroutine, got %d", diff) } - - // GoCollector performs three sends per call. - // On line 27 we need to receive three more sends - // to shut down cleanly. - <-ch - <-ch - <-ch - return case <-time.After(1 * time.Second): t.Fatalf("expected collect timed out") } + break } } -func TestGCCollector(t *testing.T) { +func TestGoCollectorGC(t *testing.T) { var ( - c = NewGoCollector() - ch = make(chan Metric) - waitc = make(chan struct{}) - closec = make(chan struct{}) - oldGC uint64 - oldPause float64 + c = NewGoCollector() + metricCh = make(chan Metric) + waitCh = make(chan struct{}) + endCollectionCh = make(chan struct{}) + oldGC uint64 + oldPause float64 ) - defer close(closec) go func() { - c.Collect(ch) + c.Collect(metricCh) // force GC runtime.GC() - <-waitc - c.Collect(ch) + <-waitCh + c.Collect(metricCh) + close(endCollectionCh) + }() + + defer func() { + // Drain the collect channel to prevent goroutine leak. + for { + select { + case <-metricCh: + case <-endCollectionCh: + return + } + } }() first := true for { select { - case metric := <-ch: + case metric := <-metricCh: pb := &dto.Metric{} metric.Write(pb) if pb.GetSummary() == nil { @@ -119,7 +135,7 @@ func TestGCCollector(t *testing.T) { first = false oldGC = *pb.GetSummary().SampleCount oldPause = *pb.GetSummary().SampleSum - close(waitc) + close(waitCh) continue } if diff := *pb.GetSummary().SampleCount - oldGC; diff != 1 { @@ -128,9 +144,9 @@ func TestGCCollector(t *testing.T) { if diff := *pb.GetSummary().SampleSum - oldPause; diff <= 0 { t.Errorf("want moar pause, got %f", diff) } - return case <-time.After(1 * time.Second): t.Fatalf("expected collect timed out") } + break } } From 547c945a6226045c65f4f8a840d96a456db97651 Mon Sep 17 00:00:00 2001 From: beorn7 Date: Mon, 6 May 2019 00:39:02 +0200 Subject: [PATCH 2/3] Replace fmt.Sprintf by simple concatenation In this simple case, it's the fastest and easiest. Signed-off-by: beorn7 --- prometheus/go_collector.go | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/prometheus/go_collector.go b/prometheus/go_collector.go index ba3b933..fcb5ccb 100644 --- a/prometheus/go_collector.go +++ b/prometheus/go_collector.go @@ -14,7 +14,6 @@ package prometheus import ( - "fmt" "runtime" "runtime/debug" "time" @@ -253,7 +252,7 @@ func NewGoCollector() Collector { } func memstatNamespace(s string) string { - return fmt.Sprintf("go_memstats_%s", s) + return "go_memstats_" + s } // Describe returns all descriptions of the collector. From 7cf09554214c5d2a1cc31d78629b92bd97edebe4 Mon Sep 17 00:00:00 2001 From: Bjoern Rabenstein Date: Sat, 4 May 2019 01:00:59 +0200 Subject: [PATCH 3/3] Handle long ReadMemStats duration in Go collector tl;dr: Return previous memstats if reading new ones takes longer than 1s. See the doc comment of NewGoCollector for details. Signed-off-by: beorn7 --- prometheus/go_collector.go | 86 +++++++++++++++++++++++++++++---- prometheus/go_collector_test.go | 85 ++++++++++++++++++++++++++++++++ 2 files changed, 161 insertions(+), 10 deletions(-) diff --git a/prometheus/go_collector.go b/prometheus/go_collector.go index fcb5ccb..b108ec5 100644 --- a/prometheus/go_collector.go +++ b/prometheus/go_collector.go @@ -16,6 +16,7 @@ package prometheus import ( "runtime" "runtime/debug" + "sync" "time" ) @@ -25,16 +26,41 @@ type goCollector struct { gcDesc *Desc goInfoDesc *Desc - // metrics to describe and collect - metrics memStatsMetrics + // ms... are memstats related. + msLast *runtime.MemStats // Previously collected memstats. + msLastTimestamp time.Time + msMtx sync.Mutex // Protects msLast and msLastTimestamp. + msMetrics memStatsMetrics + msRead func(*runtime.MemStats) // For mocking in tests. + msMaxWait time.Duration // Wait time for fresh memstats. + msMaxAge time.Duration // Maximum allowed age of old memstats. } // NewGoCollector returns a collector which exports metrics about the current Go // process. This includes memory stats. To collect those, runtime.ReadMemStats -// is called. This causes a stop-the-world, which is very short with Go1.9+ -// (~25µs). However, with older Go versions, the stop-the-world duration depends -// on the heap size and can be quite significant (~1.7 ms/GiB as per +// is called. This requires to “stop the world”, which usually only happens for +// garbage collection (GC). Take the following implications into account when +// deciding whether to use the Go collector: +// +// 1. The performance impact of stopping the world is the more relevant the more +// frequently metrics are collected. However, with Go1.9 or later the +// stop-the-world time per metrics collection is very short (~25µs) so that the +// performance impact will only matter in rare cases. However, with older Go +// versions, the stop-the-world duration depends on the heap size and can be +// quite significant (~1.7 ms/GiB as per // https://go-review.googlesource.com/c/go/+/34937). +// +// 2. During an ongoing GC, nothing else can stop the world. Therefore, if the +// metrics collection happens to coincide with GC, it will only complete after +// GC has finished. Usually, GC is fast enough to not cause problems. However, +// with a very large heap, GC might take multiple seconds, which is enough to +// cause scrape timeouts in common setups. To avoid this problem, the Go +// collector will use the memstats from a previous collection if +// runtime.ReadMemStats takes more than 1s. However, if there are no previously +// collected memstats, or their collection is more than 5m ago, the collection +// will block until runtime.ReadMemStats succeeds. (The problem might be solved +// in Go1.13, see https://github.com/golang/go/issues/19812 for the related Go +// issue.) func NewGoCollector() Collector { return &goCollector{ goroutinesDesc: NewDesc( @@ -53,7 +79,11 @@ func NewGoCollector() Collector { "go_info", "Information about the Go environment.", nil, Labels{"version": runtime.Version()}), - metrics: memStatsMetrics{ + msLast: &runtime.MemStats{}, + msRead: runtime.ReadMemStats, + msMaxWait: time.Second, + msMaxAge: 5 * time.Minute, + msMetrics: memStatsMetrics{ { desc: NewDesc( memstatNamespace("alloc_bytes"), @@ -261,13 +291,27 @@ func (c *goCollector) Describe(ch chan<- *Desc) { ch <- c.threadsDesc ch <- c.gcDesc ch <- c.goInfoDesc - for _, i := range c.metrics { + for _, i := range c.msMetrics { ch <- i.desc } } // Collect returns the current state of all metrics of the collector. func (c *goCollector) Collect(ch chan<- Metric) { + var ( + ms = &runtime.MemStats{} + done = make(chan struct{}) + ) + // Start reading memstats first as it might take a while. + go func() { + c.msRead(ms) + c.msMtx.Lock() + c.msLast = ms + c.msLastTimestamp = time.Now() + c.msMtx.Unlock() + close(done) + }() + ch <- MustNewConstMetric(c.goroutinesDesc, GaugeValue, float64(runtime.NumGoroutine())) n, _ := runtime.ThreadCreateProfile(nil) ch <- MustNewConstMetric(c.threadsDesc, GaugeValue, float64(n)) @@ -285,9 +329,31 @@ func (c *goCollector) Collect(ch chan<- Metric) { ch <- MustNewConstMetric(c.goInfoDesc, GaugeValue, 1) - ms := &runtime.MemStats{} - runtime.ReadMemStats(ms) - for _, i := range c.metrics { + timer := time.NewTimer(c.msMaxWait) + select { + case <-done: // Our own ReadMemStats succeeded in time. Use it. + timer.Stop() // Important for high collection frequencies to not pile up timers. + c.msCollect(ch, ms) + return + case <-timer.C: // Time out, use last memstats if possible. Continue below. + } + c.msMtx.Lock() + if time.Since(c.msLastTimestamp) < c.msMaxAge { + // Last memstats are recent enough. Collect from them under the lock. + c.msCollect(ch, c.msLast) + c.msMtx.Unlock() + return + } + // If we are here, the last memstats are too old or don't exist. We have + // to wait until our own ReadMemStats finally completes. For that to + // happen, we have to release the lock. + c.msMtx.Unlock() + <-done + c.msCollect(ch, ms) +} + +func (c *goCollector) msCollect(ch chan<- Metric, ms *runtime.MemStats) { + for _, i := range c.msMetrics { ch <- MustNewConstMetric(i.desc, i.valType, i.eval(ms)) } } diff --git a/prometheus/go_collector_test.go b/prometheus/go_collector_test.go index 7ab3968..5a89b25 100644 --- a/prometheus/go_collector_test.go +++ b/prometheus/go_collector_test.go @@ -150,3 +150,88 @@ func TestGoCollectorGC(t *testing.T) { break } } + +func TestGoCollectorMemStats(t *testing.T) { + var ( + c = NewGoCollector().(*goCollector) + got uint64 + ) + + checkCollect := func(want uint64) { + metricCh := make(chan Metric) + endCh := make(chan struct{}) + + go func() { + c.Collect(metricCh) + close(endCh) + }() + Collect: + for { + select { + case metric := <-metricCh: + if metric.Desc().fqName != "go_memstats_alloc_bytes" { + continue Collect + } + pb := &dto.Metric{} + metric.Write(pb) + got = uint64(pb.GetGauge().GetValue()) + case <-endCh: + break Collect + } + } + if want != got { + t.Errorf("unexpected value of go_memstats_alloc_bytes, want %d, got %d", want, got) + } + } + + // Speed up the timing to make the tast faster. + c.msMaxWait = time.Millisecond + c.msMaxAge = 10 * time.Millisecond + + // Scenario 1: msRead responds slowly, no previous memstats available, + // msRead is executed anyway. + c.msRead = func(ms *runtime.MemStats) { + time.Sleep(3 * time.Millisecond) + ms.Alloc = 1 + } + checkCollect(1) + // Now msLast is set. + if want, got := uint64(1), c.msLast.Alloc; want != got { + t.Errorf("unexpected of msLast.Alloc, want %d, got %d", want, got) + } + + // Scenario 2: msRead responds fast, previous memstats available, new + // value collected. + c.msRead = func(ms *runtime.MemStats) { + ms.Alloc = 2 + } + checkCollect(2) + // msLast is set, too. + if want, got := uint64(2), c.msLast.Alloc; want != got { + t.Errorf("unexpected of msLast.Alloc, want %d, got %d", want, got) + } + + // Scenario 3: msRead responds slowly, previous memstats available, old + // value collected. + c.msRead = func(ms *runtime.MemStats) { + time.Sleep(3 * time.Millisecond) + ms.Alloc = 3 + } + checkCollect(2) + // After waiting, new value is still set in msLast. + time.Sleep(12 * time.Millisecond) + if want, got := uint64(3), c.msLast.Alloc; want != got { + t.Errorf("unexpected of msLast.Alloc, want %d, got %d", want, got) + } + + // Scenario 4: msRead responds slowly, previous memstats is too old, new + // value collected. + c.msRead = func(ms *runtime.MemStats) { + time.Sleep(3 * time.Millisecond) + ms.Alloc = 4 + } + checkCollect(4) + if want, got := uint64(4), c.msLast.Alloc; want != got { + t.Errorf("unexpected of msLast.Alloc, want %d, got %d", want, got) + } +}