av/cmd/vidforward/notifier.go

/*
DESCRIPTION
  notifier.go provides a tool for notifying a systemd watchdog under healthy
  operation of the vidforward service.

AUTHORS
  Saxon A. Nelson-Milton <saxon@ausocean.org>

LICENSE
  Copyright (C) 2022 the Australian Ocean Lab (AusOcean)

  It is free software: you can redistribute it and/or modify them
  under the terms of the GNU General Public License as published by the
  Free Software Foundation, either version 3 of the License, or (at your
  option) any later version.

  It is distributed in the hope that it will be useful, but WITHOUT
  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
  for more details.

  You should have received a copy of the GNU General Public License
  along with revid in gpl.txt. If not, see http://www.gnu.org/licenses.
*/

package main

import (
	"errors"
	"log"
	"os"
	"os/signal"
	"sync"
	"syscall"
	"time"

	"bitbucket.org/ausocean/utils/logging"
	"github.com/coreos/go-systemd/daemon"
)

// notifyWatchdog controls whether a systemd watchdog is notified. By default we
// assume we should be notifying one. This can be toggled off by using the
// nowatchdog build tag (see nowatchdog.go file).
var notifyWatchdog = true

// watchdogNotifier keeps track of the watchdog interval from the external
// systemd service settings, the currently active request handlers and a curId
// field that is incremented to generate new handler ids for storage.
//
// Fields:
//   - watchdogInterval: the watchdog interval; notify uses half of this as its
//     notification period.
//   - activeHandlers: the currently active handlers, keyed by the id assigned
//     in handlerInvoked.
//   - curId: the id that will be given to the next registered handler.
//   - termCallback: called by notify when a SIGINT or SIGTERM is received.
//   - log: the logger used for warnings and fatal errors.
//   - mu: guards activeHandlers and curId.
type watchdogNotifier struct {
	watchdogInterval time.Duration
	activeHandlers   map[int]handlerInfo
	curId            int
	termCallback     func()
	log              logging.Logger
	mu               sync.Mutex
}

// handlerInfo keeps track of a handler's name (for any logging purposes) and
// the time at which the handler was invoked, which is later used to calculate
// how long it has been active and therefore its health.
type handlerInfo struct {
	name string
	time time.Time
}

// newWatchdogNotifier creates a new watchdogNotifier with the provided logger
// and termination callback that is called if a SIGINT or SIGTERM signal is
// received. Recommended use of this is an attempted state save.
//
// If notifyWatchdog is true the watchdog interval is obtained from
// daemon.SdWatchdogEnabled, otherwise a default of one minute is used. An error
// is returned if the interval cannot be obtained, or if no watchdog is enabled
// for this process; in the latter case the reported interval is zero, which
// cannot be used to drive the notification ticker.
func newWatchdogNotifier(l logging.Logger, termCallback func()) (*watchdogNotifier, error) {
	const defaultInterval = 1 * time.Minute
	interval := defaultInterval

	if notifyWatchdog {
		const clearEnvVars = false
		var err error
		interval, err = daemon.SdWatchdogEnabled(clearEnvVars)
		if err != nil {
			return nil, err
		}

		// SdWatchdogEnabled reports (0, nil) when the watchdog is not enabled or
		// this is not the watched PID. Fail here with a clear error rather than
		// letting a zero interval reach time.NewTicker, which would panic.
		if interval <= 0 {
			return nil, errors.New("systemd watchdog not enabled for this process")
		}
	}

	return &watchdogNotifier{
		activeHandlers:   make(map[int]handlerInfo),
		watchdogInterval: interval,
		log:              l,
		termCallback:     termCallback,
	}, nil
}

// notify is to be called as a routine. On every tick of half the watchdog
// interval it checks that the handlers are healthy and then notifies the
// watchdog. If any handler is unhealthy it waits and re-checks every second
// until they are all healthy again before notifying. If the handlers take too
// long to become healthy, we risk exceeding the watchdog interval causing a
// process restart.
//
// notify also starts a routine to monitor for a SIGINT or SIGTERM, upon which
// the callback provided at initialisation is called. Only the first such signal
// is handled. If no callback was provided the signals are left with their
// default behaviour.
func (n *watchdogNotifier) notify() {
	// time.NewTicker panics on a non-positive period, so report a bad interval
	// through the logger instead.
	notifyPeriod := n.watchdogInterval / 2
	if notifyPeriod <= 0 {
		n.log.Fatal("invalid watchdog interval", "interval", n.watchdogInterval.String())
		return
	}
	notifyTicker := time.NewTicker(notifyPeriod)

	// Only intercept termination signals if there is something to call;
	// registering for them otherwise would just swallow the signals.
	if n.termCallback != nil {
		go func() {
			sigs := make(chan os.Signal, 1)
			signal.Notify(sigs, syscall.SIGINT, syscall.SIGTERM)
			sig := <-sigs
			n.log.Warning("received termination signal, calling termination callback", "signal", sig.String())
			n.termCallback()
		}()
	}

	// The ticker is never stopped, so this loop runs until the process exits.
	for range notifyTicker.C {
		// Check health after the tick, immediately before notifying, so that the
		// watchdog is never notified on the basis of a stale health check.
		for n.handlersUnhealthy() {
			const unhealthyHandlerWait = 1 * time.Second
			time.Sleep(unhealthyHandlerWait)
		}

		if !notifyWatchdog {
			continue
		}

		// If this fails for any reason it indicates a systemd service configuration
		// issue, and therefore programmer error, so do fatal log to cause crash.
		supported, err := daemon.SdNotify(false, daemon.SdNotifyWatchdog)
		if err != nil {
			n.log.Fatal("error from systemd watchdog notify", "error", err)
		}

		if !supported {
			n.log.Fatal("watchdog notification not supported")
		}
	}
}

// handlersUnhealthy reports whether any active handler has been handling for
// longer than unhealthyHandleDuration. The first such handler found is logged
// as a warning. The active handlers map is read while holding the notifier's
// mutex.
func (n *watchdogNotifier) handlersUnhealthy() bool {
	const unhealthyHandleDuration = 30 * time.Second

	n.mu.Lock()
	defer n.mu.Unlock()

	for _, h := range n.activeHandlers {
		if age := time.Since(h.time); age <= unhealthyHandleDuration {
			continue
		}
		n.log.Warning("handler unhealthy", "name", h.name)
		return true
	}
	return false
}

// handlerInvoked is to be called at the start of a request handler to indicate
// that handling has begun. The name and start time of the handler is recorded
// in the active handlers map with a unique ID as the key. A function is returned
// that must be called at exit of the handler to indicate that handling has
// finished. It is recommended this be done using a defer statement immediately
// after receiving it.
//
// The returned function removes the handler from the active handlers map. If
// the handler's ID is not in the map when it is called, for example because it
// has already been called once, this is logged as a fatal error using the
// standard library logger.
func (n *watchdogNotifier) handlerInvoked(name string) func() {
	n.mu.Lock()
	defer n.mu.Unlock()

	id := n.curId
	n.curId++
	n.activeHandlers[id] = handlerInfo{time: time.Now(), name: name}

	return func() {
		n.mu.Lock()
		defer n.mu.Unlock()

		if _, ok := n.activeHandlers[id]; !ok {
			// Use Fatalf with an explicit format; log.Fatal joins string operands
			// without any separator, which garbles the message.
			log.Fatalf("handler id not in map, id: %d, name: %s", id, name)
		}

		delete(n.activeHandlers, id)
	}
}
cmd/vidforward: add watchdog notifier This change adds a "watchdog notifier" utility which tracks the health of request handlers and notifies an external systemd watchdog if everything looks good. This allows us to cause a termination if any request handlers get hung. 2022-11-27 02:52:57 +03:00			`/*`
			`DESCRIPTION`
			`notifier.go provides a tool for notifying a systemd watchdog under healthy`
			`operation of the vidforward service.`

			`AUTHORS`
			`Saxon A. Nelson-Milton <saxon@ausocean.org>`

			`LICENSE`
			`Copyright (C) 2022 the Australian Ocean Lab (AusOcean)`

			`It is free software: you can redistribute it and/or modify them`
			`under the terms of the GNU General Public License as published by the`
			`Free Software Foundation, either version 3 of the License, or (at your`
			`option) any later version.`

			`It is distributed in the hope that it will be useful, but WITHOUT`
			`ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or`
			`FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License`
			`for more details.`

			`You should have received a copy of the GNU General Public License`
			`along with revid in gpl.txt. If not, see http://www.gnu.org/licenses.`
			`*/`

			`package main`

			`import (`
			`"log"`
			`"os"`
			`"os/signal"`
			`"sync"`
			`"syscall"`
			`"time"`

			`"bitbucket.org/ausocean/utils/logging"`
			`"github.com/coreos/go-systemd/daemon"`
			`)`

			`// By default we assume we should be notifying a systemd watchdog. This can be`
			`// toggled off by using the nowatchdog build tag (see nowatchdog.go file).`
			`var notifyWatchdog = true`

			`// watchdogNotifier keeps track of the watchdog interval from the external`
			`// sysd service settings, the currently active request handlers and a curId`
			`// field that is is incremented to generate new handler ids for storage.`
			`type watchdogNotifier struct {`
			`watchdogInterval time.Duration`
			`activeHandlers map[int]handlerInfo`
			`curId int`
			`termCallback func()`
			`log logging.Logger`
			`mu sync.Mutex`
			`}`

			`// handlerInfo keeps track of a handlers name (for any logging purposes) and`
			`// time at which the handler was invoked, which is later used to calculate time`
			`// active and therefore heatlh.`
			`type handlerInfo struct {`
			`name string`
			`time time.Time`
			`}`

			`// newWatchdogNotifier creates a new watchdogNotifier with the provided logger`
			`// and termination callback that is called if a SIGINT or SIGTERM signal is`
			`// received. Recommended use of this is an attempted state save.`
			`func newWatchdogNotifier(l logging.Logger, termCallback func()) (*watchdogNotifier, error) {`
			`var (`
			`interval = 1 * time.Minute`
			`err error`
			`)`

			`if notifyWatchdog {`
			`const clearEnvVars = false`
			`interval, err = daemon.SdWatchdogEnabled(clearEnvVars)`
			`if err != nil {`
			`return nil, err`
			`}`
			`}`

			`return &watchdogNotifier{`
			`activeHandlers: make(map[int]handlerInfo),`
			`watchdogInterval: interval,`
			`log: l,`
			`termCallback: termCallback,`
			`}, nil`
			`}`

			`// notify is to be called as a routine. This is responsible for checking if the`
			`// handlers are healthy and then notifying the watchdog if so, otherwise we`
			`// wait, continue and check again until they are. If the handlers take too long to`
			`// become healthy, we risk exceeding the watchdog interval causing a process restart.`
			`// notify also starts a routine to monitor for any SIGINT or SIGTERM, upon which`
			`// a callback that's provided at initialisation is called.`
			`func (n *watchdogNotifier) notify() {`
			`notifyTicker := time.NewTicker(n.watchdogInterval / 2)`

			`go func() {`
			`sigs := make(chan os.Signal, 1)`
			`signal.Notify(sigs, syscall.SIGINT, syscall.SIGTERM)`
			`sig := <-sigs`
			`n.log.Warning("received termination signal, calling termination callback", "signal", sig.String())`
			`n.termCallback()`
			`}()`

			`for {`
			`if n.handlersUnhealthy() {`
			`const unhealthyHandlerWait = 1 * time.Second`
			`time.Sleep(unhealthyHandlerWait)`
			`continue`
			`}`

			`<-notifyTicker.C`

			`if !notifyWatchdog {`
			`continue`
			`}`

			`// If this fails for any reason it indicates a systemd service configuration`
			`// issue, and therefore programmer error, so do fatal log to cause crash.`
			`supported, err := daemon.SdNotify(false, daemon.SdNotifyWatchdog)`
			`if err != nil {`
			`n.log.Fatal("error from systemd watchdog notify", "error", err)`
			`}`

			`if !supported {`
			`n.log.Fatal("watchdog notification not supported")`
			`}`
			`}`
			`}`

			`// handlersUnhealthy returns true if it is detected that any handlers are unhealthy,`
			`// that is, if they have been handling for longer than the unhealthyHandleDuration.`
			`func (n *watchdogNotifier) handlersUnhealthy() bool {`
			`n.mu.Lock()`
			`defer n.mu.Unlock()`
			`for _, info := range n.activeHandlers {`
			`const unhealthyHandleDuration = 30 * time.Second`
			`if time.Now().Sub(info.time) > unhealthyHandleDuration {`
			`n.log.Warning("handler unhealthy", "name", info.name)`
			`return true`
			`}`
			`}`
			`return false`
			`}`

			`// handlerInvoked is to be called at the start of a request handler to indicate`
			`// that handling has begun. The name and start time of the handler is recorded`
			`// in the active handlers map with a unique ID as the key. A function is returned`
			`// that must be called at exit of the handler to indicate that handling has`
			`// finished. It is recommended this be done using a defer statement immediately`
			`// after receiveing it.`
			`func (n *watchdogNotifier) handlerInvoked(name string) func() {`
			`n.mu.Lock()`
			`defer n.mu.Unlock()`

			`id := n.curId`
			`n.curId++`
			`n.activeHandlers[id] = handlerInfo{time: time.Now(), name: name}`

			`return func() {`
			`n.mu.Lock()`
			`defer n.mu.Unlock()`

			`if _, ok := n.activeHandlers[id]; !ok {`
			`log.Fatal("handler id not in map", "name", name)`
			`}`

			`delete(n.activeHandlers, id)`
			`}`
			`}`