Merge fe465ecd54 into f1ffb55c9a

Only check latencies once every 10 seconds with `routeByLatency` (#2795)
* Only check latencies once every 10 seconds with `routeByLatency`. `routeByLatency` currently checks latencies any time a server returns a MOVED or READONLY reply. When a shard is down, the ClusterClient issues the request to a random server, which returns a MOVED reply. This causes a state refresh and a latency update on all servers, which can lead to significant ping load on clusters with a large number of clients. This change introduces logic to ping at most once every 10 seconds: a node's latency is updated during the `GC` function only if its last measurement was taken more than 10 seconds ago. Fixes https://github.com/redis/go-redis/issues/2782 * Use UnixNano instead of Unix for better precision --------- Co-authored-by: ofekshenawa <104765379+ofekshenawa@users.noreply.github.com>
2024-11-20 12:59:27 -08:00 · 2024-11-20 14:36:39 +02:00 · 2024-09-16 23:24:49 +02:00 · 2024-09-16 22:55:09 +02:00 · 2024-09-16 22:37:00 +02:00
2 changed files with 25 additions and 2 deletions
--- a/options.go
+++ b/options.go
@ -85,7 +85,7 @@ type Options struct {
 	DialTimeout time.Duration
 	// Timeout for socket reads. If reached, commands will fail
 	// with a timeout instead of blocking. Supported values:
-	//   - `0` - default timeout (3 seconds).
+	//   - `0` - default timeout (5 seconds).
 	//   - `-1` - no timeout (block indefinitely).
 	//   - `-2` - disables SetReadDeadline calls completely.
 	ReadTimeout time.Duration
--- a/osscluster.go
+++ b/osscluster.go
@ -21,6 +21,10 @@ import (
 	"github.com/redis/go-redis/v9/internal/rand"
 )

+const (
+	minLatencyMeasurementInterval = 10 * time.Second
+)
+
 var errClusterNoNodes = fmt.Errorf("redis: cluster has no nodes")

 // ClusterOptions are used to configure a cluster client and should be
@ -316,6 +320,10 @@ type clusterNode struct {
 	latency    uint32 // atomic
 	generation uint32 // atomic
 	failing    uint32 // atomic
+
+	// last time the latency measurement was performed for the node, stored in nanoseconds
+	// from epoch
+	lastLatencyMeasurement int64 // atomic
 }

 func newClusterNode(clOpt *ClusterOptions, addr string) *clusterNode {
@ -368,6 +376,7 @@ func (n *clusterNode) updateLatency() {
 		latency = float64(dur) / float64(successes)
 	}
 	atomic.StoreUint32(&n.latency, uint32(latency+0.5))
+	n.SetLastLatencyMeasurement(time.Now())
 }

 func (n *clusterNode) Latency() time.Duration {
@ -397,6 +406,10 @@ func (n *clusterNode) Generation() uint32 {
 	return atomic.LoadUint32(&n.generation)
 }

+func (n *clusterNode) LastLatencyMeasurement() int64 {
+	return atomic.LoadInt64(&n.lastLatencyMeasurement)
+}
+
 func (n *clusterNode) SetGeneration(gen uint32) {
 	for {
 		v := atomic.LoadUint32(&n.generation)
@ -406,6 +419,15 @@ func (n *clusterNode) SetGeneration(gen uint32) {
 	}
 }

+func (n *clusterNode) SetLastLatencyMeasurement(t time.Time) {
+	for {
+		v := atomic.LoadInt64(&n.lastLatencyMeasurement)
+		if t.UnixNano() < v || atomic.CompareAndSwapInt64(&n.lastLatencyMeasurement, v, t.UnixNano()) {
+			break
+		}
+	}
+}
+
 //------------------------------------------------------------------------------

 type clusterNodes struct {
@ -493,10 +515,11 @@ func (c *clusterNodes) GC(generation uint32) {
 	c.mu.Lock()

 	c.activeAddrs = c.activeAddrs[:0]
+	now := time.Now()
 	for addr, node := range c.nodes {
 		if node.Generation() >= generation {
 			c.activeAddrs = append(c.activeAddrs, addr)
-			if c.opt.RouteByLatency {
+			if c.opt.RouteByLatency && node.LastLatencyMeasurement() < now.Add(-minLatencyMeasurementInterval).UnixNano() {
 				go node.updateLatency()
 			}
 			continue
Author	SHA1	Message	Date
kooskoos20	f4302004e2	Merge `fe465ecd54` into `f1ffb55c9a`	2024-11-20 12:59:27 -08:00
Justin	f1ffb55c9a	Only check latencies once every 10 seconds with `routeByLatency` (#2795) * Only check latencies once every 10 seconds with `routeByLatency`. `routeByLatency` currently checks latencies any time a server returns a MOVED or READONLY reply. When a shard is down, the ClusterClient issues the request to a random server, which returns a MOVED reply. This causes a state refresh and a latency update on all servers, which can lead to significant ping load on clusters with a large number of clients. This change introduces logic to ping at most once every 10 seconds: a node's latency is updated during the `GC` function only if its last measurement was taken more than 10 seconds ago. Fixes https://github.com/redis/go-redis/issues/2782 * Use UnixNano instead of Unix for better precision --------- Co-authored-by: ofekshenawa <104765379+ofekshenawa@users.noreply.github.com>	2024-11-20 14:36:39 +02:00
Omkar Birade	fe465ecd54	Revert "Revert "docs: update default timeout from 3 seconds to 5 seconds"" This reverts commit `36c8ee43d4`.	2024-09-16 23:24:49 +02:00
Omkar Birade	36c8ee43d4	Revert "docs: update default timeout from 3 seconds to 5 seconds" This reverts commit `85b4dbfe15`.	2024-09-16 22:55:09 +02:00
Omkar Birade	85b4dbfe15	docs: update default timeout from 3 seconds to 5 seconds	2024-09-16 22:37:00 +02:00