diff --git a/prometheus/promhttp/http.go b/prometheus/promhttp/http.go index b137c88..cea5a90 100644 --- a/prometheus/promhttp/http.go +++ b/prometheus/promhttp/http.go @@ -84,10 +84,32 @@ func Handler() http.Handler { // instrumentation. Use the InstrumentMetricHandler function to apply the same // kind of instrumentation as it is used by the Handler function. func HandlerFor(reg prometheus.Gatherer, opts HandlerOpts) http.Handler { - var inFlightSem chan struct{} + var ( + inFlightSem chan struct{} + errCnt = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Name: "promhttp_metric_handler_errors_total", + Help: "Total number of internal errors encountered by the promhttp metric handler.", + }, + []string{"cause"}, + ) + ) + if opts.MaxRequestsInFlight > 0 { inFlightSem = make(chan struct{}, opts.MaxRequestsInFlight) } + if opts.Registry != nil { + // Initialize all possibilites that can occur below. + errCnt.WithLabelValues("gathering") + errCnt.WithLabelValues("encoding") + if err := opts.Registry.Register(errCnt); err != nil { + if are, ok := err.(prometheus.AlreadyRegisteredError); ok { + errCnt = are.ExistingCollector.(*prometheus.CounterVec) + } else { + panic(err) + } + } + } h := http.HandlerFunc(func(rsp http.ResponseWriter, req *http.Request) { if inFlightSem != nil { @@ -106,6 +128,7 @@ func HandlerFor(reg prometheus.Gatherer, opts HandlerOpts) http.Handler { if opts.ErrorLog != nil { opts.ErrorLog.Println("error gathering metrics:", err) } + errCnt.WithLabelValues("gathering").Inc() switch opts.ErrorHandling { case PanicOnError: panic(err) @@ -146,6 +169,7 @@ func HandlerFor(reg prometheus.Gatherer, opts HandlerOpts) http.Handler { if opts.ErrorLog != nil { opts.ErrorLog.Println("error encoding and sending metric family:", err) } + errCnt.WithLabelValues("encoding").Inc() switch opts.ErrorHandling { case PanicOnError: panic(err) @@ -236,9 +260,12 @@ const ( // Ignore errors and try to serve as many metrics as possible. However, // if no metrics can be served, serve an HTTP status code 500 and the // last error message in the body. Only use this in deliberate "best - // effort" metrics collection scenarios. It is recommended to at least - // log errors (by providing an ErrorLog in HandlerOpts) to not mask - // errors completely. + // effort" metrics collection scenarios. In this case, it is highly + // recommended to provide other means of detecting errors: By setting an + // ErrorLog in HandlerOpts, the errors are logged. By providing a + // Registry in HandlerOpts, the exposed metrics include an error counter + // "promhttp_metric_handler_errors_total", which can be used for + // alerts. ContinueOnError // Panic upon the first error encountered (useful for "crash only" apps). PanicOnError @@ -261,6 +288,18 @@ type HandlerOpts struct { // logged regardless of the configured ErrorHandling provided ErrorLog // is not nil. ErrorHandling HandlerErrorHandling + // If Registry is not nil, it is used to register a metric + // "promhttp_metric_handler_errors_total", partitioned by "cause". A + // failed registration causes a panic. Note that this error counter is + // different from the instrumentation you get from the various + // InstrumentHandler... helpers. It counts errors that don't necessarily + // result in a non-2xx HTTP status code. There are two typical cases: + // (1) Encoding errors that only happen after streaming of the HTTP body + // has already started (and the status code 200 has been sent). This + // should only happen with custom collectors. (2) Collection errors with + // no effect on the HTTP status code because ErrorHandling is set to + // ContinueOnError. + Registry prometheus.Registerer // If DisableCompression is true, the handler will never compress the // response, even if requested by the client. DisableCompression bool diff --git a/prometheus/promhttp/http_test.go b/prometheus/promhttp/http_test.go index 6e23e6c..781ea8f 100644 --- a/prometheus/promhttp/http_test.go +++ b/prometheus/promhttp/http_test.go @@ -59,7 +59,8 @@ func (b blockingCollector) Collect(ch chan<- prometheus.Metric) { func TestHandlerErrorHandling(t *testing.T) { // Create a registry that collects a MetricFamily with two elements, - // another with one, and reports an error. + // another with one, and reports an error. Further down, we'll use the + // same registry in the HandlerOpts. reg := prometheus.NewRegistry() cnt := prometheus.NewCounter(prometheus.CounterOpts{ @@ -92,14 +93,17 @@ func TestHandlerErrorHandling(t *testing.T) { errorHandler := HandlerFor(reg, HandlerOpts{ ErrorLog: logger, ErrorHandling: HTTPErrorOnError, + Registry: reg, }) continueHandler := HandlerFor(reg, HandlerOpts{ ErrorLog: logger, ErrorHandling: ContinueOnError, + Registry: reg, }) panicHandler := HandlerFor(reg, HandlerOpts{ ErrorLog: logger, ErrorHandling: PanicOnError, + Registry: reg, }) wantMsg := `error gathering metrics: error collecting metric Desc{fqName: "invalid_metric", help: "not helpful", constLabels: {}, variableLabels: []}: collect error ` @@ -107,10 +111,29 @@ func TestHandlerErrorHandling(t *testing.T) { error collecting metric Desc{fqName: "invalid_metric", help: "not helpful", constLabels: {}, variableLabels: []}: collect error ` - wantOKBody := `# HELP name docstring + wantOKBody1 := `# HELP name docstring # TYPE name counter name{constname="constvalue",labelname="val1"} 1 name{constname="constvalue",labelname="val2"} 1 +# HELP promhttp_metric_handler_errors_total Total number of internal errors encountered by the promhttp metric handler. +# TYPE promhttp_metric_handler_errors_total counter +promhttp_metric_handler_errors_total{cause="encoding"} 0 +promhttp_metric_handler_errors_total{cause="gathering"} 1 +# HELP the_count Ah-ah-ah! Thunder and lightning! +# TYPE the_count counter +the_count 0 +` + // It might happen that counting the gathering error makes it to the + // promhttp_metric_handler_errors_total counter before it is gathered + // itself. Thus, we have to bodies that are acceptable for the test. + wantOKBody2 := `# HELP name docstring +# TYPE name counter +name{constname="constvalue",labelname="val1"} 1 +name{constname="constvalue",labelname="val2"} 1 +# HELP promhttp_metric_handler_errors_total Total number of internal errors encountered by the promhttp metric handler. +# TYPE promhttp_metric_handler_errors_total counter +promhttp_metric_handler_errors_total{cause="encoding"} 0 +promhttp_metric_handler_errors_total{cause="gathering"} 2 # HELP the_count Ah-ah-ah! Thunder and lightning! # TYPE the_count counter the_count 0 @@ -137,8 +160,8 @@ the_count 0 if got := logBuf.String(); got != wantMsg { t.Errorf("got log message %q, want %q", got, wantMsg) } - if got := writer.Body.String(); got != wantOKBody { - t.Errorf("got body %q, want %q", got, wantOKBody) + if got := writer.Body.String(); got != wantOKBody1 && got != wantOKBody2 { + t.Errorf("got body %q, want either %q or %q", got, wantOKBody1, wantOKBody2) } defer func() {