/*
DESCRIPTION
  notifier.go provides a tool for notifying a systemd watchdog under healthy
  operation of the vidforward service.

AUTHORS
  Saxon A. Nelson-Milton

LICENSE
  Copyright (C) 2022 the Australian Ocean Lab (AusOcean)

  It is free software: you can redistribute it and/or modify them
  under the terms of the GNU General Public License as published by the
  Free Software Foundation, either version 3 of the License, or (at your
  option) any later version.

  It is distributed in the hope that it will be useful, but WITHOUT
  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
  for more details.

  You should have received a copy of the GNU General Public License
  along with revid in gpl.txt. If not, see http://www.gnu.org/licenses.
*/

package main

import (
	"log"
	"os"
	"os/signal"
	"sync"
	"syscall"
	"time"

	"bitbucket.org/ausocean/utils/logging"
	"github.com/coreos/go-systemd/daemon"
)

// By default we assume we should be notifying a systemd watchdog. This can be
// toggled off by using the nowatchdog build tag (see nowatchdog.go file).
var notifyWatchdog = true

// watchdogNotifier keeps track of the watchdog interval from the external
// systemd service settings, the currently active request handlers and a curId
// field that is incremented to generate new handler ids for storage.
type watchdogNotifier struct {
	// watchdogInterval is the watchdog period; notify ticks at half of it.
	watchdogInterval time.Duration

	// activeHandlers holds the handlers currently in flight, keyed by the id
	// handed out from curId when the handler was registered.
	activeHandlers map[int]handlerInfo

	// curId is the next id to assign to a newly invoked handler.
	curId int

	// termCallback is called by notify when SIGINT or SIGTERM is received.
	termCallback func()

	log logging.Logger

	// mu guards activeHandlers and curId.
	mu sync.Mutex
}

// handlerInfo keeps track of a handler's name (for any logging purposes) and
// the time at which the handler was invoked, which is later used to calculate
// time active and therefore health.
type handlerInfo struct {
	name string
	time time.Time
}

// newWatchdogNotifier creates a new watchdogNotifier with the provided logger
// and termination callback that is called if a SIGINT or SIGTERM signal is
// received. Recommended use of this is an attempted state save.
// When notifyWatchdog is true the watchdog interval is obtained from
// daemon.SdWatchdogEnabled and any error from that call is returned; otherwise
// an interval of one minute is used.
func newWatchdogNotifier(l logging.Logger, termCallback func()) (*watchdogNotifier, error) { var ( interval = 1 * time.Minute err error ) if notifyWatchdog { const clearEnvVars = false interval, err = daemon.SdWatchdogEnabled(clearEnvVars) if err != nil { return nil, err } } return &watchdogNotifier{ activeHandlers: make(map[int]handlerInfo), watchdogInterval: interval, log: l, termCallback: termCallback, }, nil } // notify is to be called as a routine. This is responsible for checking if the // handlers are healthy and then notifying the watchdog if so, otherwise we // wait, continue and check again until they are. If the handlers take too long to // become healthy, we risk exceeding the watchdog interval causing a process restart. // notify also starts a routine to monitor for any SIGINT or SIGTERM, upon which // a callback that's provided at initialisation is called. func (n *watchdogNotifier) notify() { notifyTicker := time.NewTicker(n.watchdogInterval / 2) go func() { sigs := make(chan os.Signal, 1) signal.Notify(sigs, syscall.SIGINT, syscall.SIGTERM) sig := <-sigs n.log.Warning("received termination signal, calling termination callback", "signal", sig.String()) n.termCallback() }() for { if n.handlersUnhealthy() { const unhealthyHandlerWait = 1 * time.Second time.Sleep(unhealthyHandlerWait) continue } <-notifyTicker.C if !notifyWatchdog { continue } // If this fails for any reason it indicates a systemd service configuration // issue, and therefore programmer error, so do fatal log to cause crash. supported, err := daemon.SdNotify(false, daemon.SdNotifyWatchdog) if err != nil { n.log.Fatal("error from systemd watchdog notify", "error", err) } if !supported { n.log.Fatal("watchdog notification not supported") } } } // handlersUnhealthy returns true if it is detected that any handlers are unhealthy, // that is, if they have been handling for longer than the unhealthyHandleDuration. 
func (n *watchdogNotifier) handlersUnhealthy() bool { n.mu.Lock() defer n.mu.Unlock() for _, info := range n.activeHandlers { const unhealthyHandleDuration = 30 * time.Second if time.Now().Sub(info.time) > unhealthyHandleDuration { n.log.Warning("handler unhealthy", "name", info.name) return true } } return false } // handlerInvoked is to be called at the start of a request handler to indicate // that handling has begun. The name and start time of the handler is recorded // in the active handlers map with a unique ID as the key. A function is returned // that must be called at exit of the handler to indicate that handling has // finished. It is recommended this be done using a defer statement immediately // after receiveing it. func (n *watchdogNotifier) handlerInvoked(name string) func() { n.mu.Lock() defer n.mu.Unlock() id := n.curId n.curId++ n.activeHandlers[id] = handlerInfo{time: time.Now(), name: name} return func() { n.mu.Lock() defer n.mu.Unlock() if _, ok := n.activeHandlers[id]; !ok { log.Fatal("handler id not in map", "name", name) } delete(n.activeHandlers, id) } }