fix: disregard failed pings in updateLatency() for cluster nodes

A ping command can fail very quickly, for example because of an immediate "connection refused" error or some other server failure condition. In that case the measured latency is not reliable and should not be used to compare this node with other nodes in the cluster. This change counts only successful pings toward the average latency; if no ping succeeds, the latency is set to an arbitrarily high value (one minute) so that this node gets the lowest priority when routing by latency.
2022-07-01 10:14:52 -04:00 · 2022-07-01 10:14:52 -04:00 · 64f972fbea
parent 89d6dfe09a
commit 64f972fbea
1 changed files with 14 additions and 3 deletions
--- a/cluster.go
+++ b/cluster.go
@@ -204,15 +204,26 @@ func (n *clusterNode) updateLatency() {
 	const numProbe = 10
 	var dur uint64

+	successes := 0
 	for i := 0; i < numProbe; i++ {
 		time.Sleep(time.Duration(10+rand.Intn(10)) * time.Millisecond)

 		start := time.Now()
-		n.Client.Ping(context.TODO())
-		dur += uint64(time.Since(start) / time.Microsecond)
+		err := n.Client.Ping(context.TODO()).Err()
+		if err == nil {
+			dur += uint64(time.Since(start) / time.Microsecond)
+			successes++
+		}
 	}

-	latency := float64(dur) / float64(numProbe)
+	var latency float64
+	if successes == 0 {
+		// If none of the pings worked, set latency to some arbitrarily high value so this node gets
+		// least priority.
+		latency = float64((1 * time.Minute) / time.Microsecond)
+	} else {
+		latency = float64(dur) / float64(successes)
+	}
 	atomic.StoreUint32(&n.latency, uint32(latency+0.5))
 }