From 64f972fbeae401e52a2c066a0e1c922af617e15c Mon Sep 17 00:00:00 2001
From: Ben Keith <benkeith@splunk.com>
Date: Fri, 1 Jul 2022 10:14:52 -0400
Subject: [PATCH] fix: disregard failed pings in updateLatency() for cluster
 nodes

A ping command can fail very quickly, for example on an immediate
connection-refused error or some other server failure condition. In
that case the measured latency is not reliable and should not be used
to compare this node to other nodes in the cluster.

This change counts only successful Ping commands toward the average
latency. If no Ping succeeds, the latency is set to an arbitrarily
high value (one minute) so that this node gets the lowest priority
when routing by latency.
---
 cluster.go | 17 ++++++++++++++---
 1 file changed, 14 insertions(+), 3 deletions(-)

diff --git a/cluster.go b/cluster.go
index e3a344fc..bd8fdd73 100644
--- a/cluster.go
+++ b/cluster.go
@@ -204,15 +204,26 @@ func (n *clusterNode) updateLatency() {
 	const numProbe = 10
 	var dur uint64
 
+	successes := 0
 	for i := 0; i < numProbe; i++ {
 		time.Sleep(time.Duration(10+rand.Intn(10)) * time.Millisecond)
 
 		start := time.Now()
-		n.Client.Ping(context.TODO())
-		dur += uint64(time.Since(start) / time.Microsecond)
+		err := n.Client.Ping(context.TODO()).Err()
+		if err == nil {
+			dur += uint64(time.Since(start) / time.Microsecond)
+			successes++
+		}
 	}
 
-	latency := float64(dur) / float64(numProbe)
+	var latency float64
+	if successes == 0 {
+		// If none of the pings worked, set latency to some arbitrarily high value so this node gets
+		// least priority.
+		latency = float64((1 * time.Minute) / time.Microsecond)
+	} else {
+		latency = float64(dur) / float64(successes)
+	}
 	atomic.StoreUint32(&n.latency, uint32(latency+0.5))
 }