From fa7f64f7f27348658ecfa71dd71dfb2d112e2f86 Mon Sep 17 00:00:00 2001 From: Vladimir Mihailenco Date: Thu, 15 Feb 2018 13:00:54 +0200 Subject: [PATCH] Rework retrying --- cluster.go | 252 ++++++++++++++------------ cluster_test.go | 28 +-- internal/error.go | 4 + internal/singleflight/singleflight.go | 64 +++++++ 4 files changed, 219 insertions(+), 129 deletions(-) create mode 100644 internal/singleflight/singleflight.go diff --git a/cluster.go b/cluster.go index 628f513e..ea677ae2 100644 --- a/cluster.go +++ b/cluster.go @@ -13,10 +13,10 @@ import ( "github.com/go-redis/redis/internal/hashtag" "github.com/go-redis/redis/internal/pool" "github.com/go-redis/redis/internal/proto" + "github.com/go-redis/redis/internal/singleflight" ) var errClusterNoNodes = fmt.Errorf("redis: cluster has no nodes") -var errNilClusterState = fmt.Errorf("redis: cannot load cluster slots") // ClusterOptions are used to configure a cluster client and should be // passed to NewClusterClient. @@ -122,8 +122,8 @@ type clusterNode struct { Client *Client latency uint32 // atomic - generation uint32 - loading int64 // atomic + generation uint32 // atomic + loading int64 // atomic } func newClusterNode(clOpt *ClusterOptions, addr string) *clusterNode { @@ -141,6 +141,14 @@ func newClusterNode(clOpt *ClusterOptions, addr string) *clusterNode { return &node } +func (n *clusterNode) Close() error { + return n.Client.Close() +} + +func (n *clusterNode) Test() error { + return n.Client.ClusterInfo().Err() +} + func (n *clusterNode) updateLatency() { const probes = 10 @@ -154,14 +162,6 @@ func (n *clusterNode) updateLatency() { atomic.StoreUint32(&n.latency, latency) } -func (n *clusterNode) Close() error { - return n.Client.Close() -} - -func (n *clusterNode) Test() error { - return n.Client.ClusterInfo().Err() -} - func (n *clusterNode) Latency() time.Duration { latency := atomic.LoadUint32(&n.latency) return time.Duration(latency) * time.Microsecond @@ -186,14 +186,16 @@ func (n *clusterNode) Loading() bool { } func (n *clusterNode) Generation() uint32 { - return n.generation + return atomic.LoadUint32(&n.generation) } func (n *clusterNode) SetGeneration(gen uint32) { - if gen < n.generation { - panic("gen < n.generation") + for { + v := atomic.LoadUint32(&n.generation) + if gen < v || atomic.CompareAndSwapUint32(&n.generation, v, gen) { + break + } } - n.generation = gen } //------------------------------------------------------------------------------ @@ -201,18 +203,23 @@ func (n *clusterNode) SetGeneration(gen uint32) { type clusterNodes struct { opt *ClusterOptions - mu sync.RWMutex - addrs []string - nodes map[string]*clusterNode - closed bool + mu sync.RWMutex + allAddrs []string + addrs []string + nodes map[string]*clusterNode + closed bool + + nodeCreateGroup singleflight.Group generation uint32 } func newClusterNodes(opt *ClusterOptions) *clusterNodes { return &clusterNodes{ - opt: opt, - nodes: make(map[string]*clusterNode), + opt: opt, + + allAddrs: opt.Addrs, + nodes: make(map[string]*clusterNode), } } @@ -231,6 +238,7 @@ func (c *clusterNodes) Close() error { firstErr = err } } + c.addrs = nil c.nodes = nil @@ -238,9 +246,16 @@ func (c *clusterNodes) Close() error { } func (c *clusterNodes) Addrs() ([]string, error) { + var addrs []string c.mu.RLock() closed := c.closed - addrs := c.addrs + if !closed { + if len(c.addrs) > 0 { + addrs = c.addrs + } else { + addrs = c.allAddrs + } + } c.mu.RUnlock() if closed { @@ -310,6 +325,14 @@ func (c *clusterNodes) GetOrCreate(addr string) (*clusterNode, error) { 
return node, nil } + v, err := c.nodeCreateGroup.Do(addr, func() (interface{}, error) { + node := newClusterNode(c.opt, addr) + return node, node.Test() + }) + if err != nil { + return nil, err + } + c.mu.Lock() defer c.mu.Unlock() @@ -319,12 +342,15 @@ func (c *clusterNodes) GetOrCreate(addr string) (*clusterNode, error) { node, ok = c.nodes[addr] if ok { + _ = v.(*clusterNode).Close() return node, nil } + node = v.(*clusterNode) + c.allAddrs = appendIfNotExists(c.allAddrs, addr) c.addrs = append(c.addrs, addr) - node = newClusterNode(c.opt, addr) c.nodes[addr] = node + return node, nil } @@ -334,20 +360,8 @@ func (c *clusterNodes) Random() (*clusterNode, error) { return nil, err } - var nodeErr error - for i := 0; i <= c.opt.MaxRedirects; i++ { - n := rand.Intn(len(addrs)) - node, err := c.GetOrCreate(addrs[n]) - if err != nil { - return nil, err - } - - nodeErr = node.Test() - if nodeErr == nil { - return node, nil - } - } - return nil, nodeErr + n := rand.Intn(len(addrs)) + return c.GetOrCreate(addrs[n]) } //------------------------------------------------------------------------------ @@ -470,9 +484,12 @@ func (c *clusterState) slotNodes(slot int) []*clusterNode { type ClusterClient struct { cmdable - opt *ClusterOptions - nodes *clusterNodes - _state atomic.Value + opt *ClusterOptions + nodes *clusterNodes + + _state atomic.Value + stateErrMu sync.RWMutex + stateErr error cmdsInfoOnce internal.Once cmdsInfo map[string]*CommandInfo @@ -501,20 +518,7 @@ func NewClusterClient(opt *ClusterOptions) *ClusterClient { c.cmdable.setProcessor(c.Process) - // Add initial nodes. - for _, addr := range opt.Addrs { - _, _ = c.nodes.GetOrCreate(addr) - } - - // Preload cluster slots. - for i := 0; i < 10; i++ { - state, err := c.reloadState() - if err == nil { - c._state.Store(state) - break - } - } - + c.reloadState() if opt.IdleCheckFrequency > 0 { go c.reaper(opt.IdleCheckFrequency) } @@ -531,21 +535,6 @@ func (c *ClusterClient) retryBackoff(attempt int) time.Duration { return internal.RetryBackoff(attempt, c.opt.MinRetryBackoff, c.opt.MaxRetryBackoff) } -func (c *ClusterClient) state() (*clusterState, error) { - v := c._state.Load() - if v != nil { - return v.(*clusterState), nil - } - - _, err := c.nodes.Addrs() - if err != nil { - return nil, err - } - - c.lazyReloadState() - return nil, errNilClusterState -} - func (c *ClusterClient) cmdInfo(name string) *CommandInfo { err := c.cmdsInfoOnce.Do(func() error { node, err := c.nodes.Random() @@ -584,7 +573,12 @@ func (c *ClusterClient) cmdSlot(cmd Cmder) int { return cmdSlot(cmd, cmdFirstKeyPos(cmd, cmdInfo)) } -func (c *ClusterClient) cmdSlotAndNode(state *clusterState, cmd Cmder) (int, *clusterNode, error) { +func (c *ClusterClient) cmdSlotAndNode(cmd Cmder) (int, *clusterNode, error) { + state, err := c.state() + if err != nil { + return 0, nil, err + } + cmdInfo := c.cmdInfo(cmd.Name()) slot := cmdSlot(cmd, cmdFirstKeyPos(cmd, cmdInfo)) @@ -602,16 +596,24 @@ func (c *ClusterClient) cmdSlotAndNode(state *clusterState, cmd Cmder) (int, *cl return slot, node, err } +func (c *ClusterClient) slotMasterNode(slot int) (*clusterNode, error) { + state, err := c.state() + if err != nil { + return nil, err + } + + nodes := state.slotNodes(slot) + if len(nodes) > 0 { + return nodes[0], nil + } + return c.nodes.Random() +} + func (c *ClusterClient) Watch(fn func(*Tx) error, keys ...string) error { if len(keys) == 0 { return fmt.Errorf("redis: keys don't hash to the same slot") } - state, err := c.state() - if err != nil { - return err - } - slot := 
hashtag.Slot(keys[0]) for _, key := range keys[1:] { if hashtag.Slot(key) != slot { @@ -619,7 +621,7 @@ func (c *ClusterClient) Watch(fn func(*Tx) error, keys ...string) error { } } - node, err := state.slotMasterNode(slot) + node, err := c.slotMasterNode(slot) if err != nil { return err } @@ -649,10 +651,11 @@ func (c *ClusterClient) Watch(fn func(*Tx) error, keys ...string) error { } if err == pool.ErrClosed { - node, err = state.slotMasterNode(slot) + node, err = c.slotMasterNode(slot) if err != nil { return err } + continue } return err @@ -683,13 +686,7 @@ func (c *ClusterClient) Process(cmd Cmder) error { } func (c *ClusterClient) defaultProcess(cmd Cmder) error { - state, err := c.state() - if err != nil { - cmd.setErr(err) - return err - } - - _, node, err := c.cmdSlotAndNode(state, cmd) + _, node, err := c.cmdSlotAndNode(cmd) if err != nil { cmd.setErr(err) return err @@ -747,11 +744,12 @@ func (c *ClusterClient) defaultProcess(cmd Cmder) error { } if err == pool.ErrClosed { - _, node, err = c.cmdSlotAndNode(state, cmd) + _, node, err = c.cmdSlotAndNode(cmd) if err != nil { cmd.setErr(err) - return err + break } + continue } break @@ -903,31 +901,37 @@ func (c *ClusterClient) lazyReloadState() { if !atomic.CompareAndSwapUint32(&c.reloading, 0, 1) { return } - go func() { - defer atomic.StoreUint32(&c.reloading, 0) - - for { - state, err := c.reloadState() - if err == pool.ErrClosed { - return - } - - if err != nil { - time.Sleep(time.Millisecond) - continue - } - - c._state.Store(state) - time.Sleep(5 * time.Second) - c.nodes.GC(state.generation) - break + if c.reloadState() { + time.Sleep(time.Second) } + atomic.StoreUint32(&c.reloading, 0) }() } -// Not thread-safe. -func (c *ClusterClient) reloadState() (*clusterState, error) { +func (c *ClusterClient) reloadState() bool { + for attempt := 0; attempt <= c.opt.MaxRedirects; attempt++ { + if attempt > 0 { + time.Sleep(c.retryBackoff(attempt)) + } + + state, err := c.loadState() + if err == nil { + c._state.Store(state) + c.nodes.GC(state.generation) + return true + } + + c.setStateErr(err) + switch err { + case pool.ErrClosed, errClusterNoNodes: + return false + } + } + return false +} + +func (c *ClusterClient) loadState() (*clusterState, error) { node, err := c.nodes.Random() if err != nil { return nil, err @@ -941,6 +945,27 @@ func (c *ClusterClient) reloadState() (*clusterState, error) { return newClusterState(c.nodes, slots, node.Client.opt.Addr) } +func (c *ClusterClient) state() (*clusterState, error) { + v := c._state.Load() + if v != nil { + return v.(*clusterState), nil + } + return nil, c.getStateErr() +} + +func (c *ClusterClient) setStateErr(err error) { + c.stateErrMu.Lock() + c.stateErr = err + c.stateErrMu.Unlock() +} + +func (c *ClusterClient) getStateErr() error { + c.stateErrMu.RLock() + err := c.stateErr + c.stateErrMu.RUnlock() + return err +} + // reaper closes idle connections to the cluster. func (c *ClusterClient) reaper(idleCheckFrequency time.Duration) { ticker := time.NewTicker(idleCheckFrequency) @@ -1055,15 +1080,17 @@ func (c *ClusterClient) remapCmds(cmds []Cmder, failedCmds map[*clusterNode][]Cm func (c *ClusterClient) pipelineProcessCmds( node *clusterNode, cn *pool.Conn, cmds []Cmder, failedCmds map[*clusterNode][]Cmder, ) error { - cn.SetWriteTimeout(c.opt.WriteTimeout) - if err := writeCmd(cn, cmds...); err != nil { + _ = cn.SetWriteTimeout(c.opt.WriteTimeout) + + err := writeCmd(cn, cmds...) 
+ if err != nil { setCmdsErr(cmds, err) failedCmds[node] = cmds return err } // Set read timeout for all commands. - cn.SetReadTimeout(c.opt.ReadTimeout) + _ = cn.SetReadTimeout(c.opt.ReadTimeout) return c.pipelineReadCmds(cn, cmds, failedCmds) } @@ -1280,12 +1307,7 @@ func (c *ClusterClient) pubSub(channels []string) *PubSub { slot = -1 } - state, err := c.state() - if err != nil { - return nil, err - } - - masterNode, err := state.slotMasterNode(slot) + masterNode, err := c.slotMasterNode(slot) if err != nil { return nil, err } diff --git a/cluster_test.go b/cluster_test.go index 8ca922c4..61cac998 100644 --- a/cluster_test.go +++ b/cluster_test.go @@ -190,13 +190,11 @@ var _ = Describe("ClusterClient", func() { assertClusterClient := func() { It("should GET/SET/DEL", func() { - val, err := client.Get("A").Result() + err := client.Get("A").Err() Expect(err).To(Equal(redis.Nil)) - Expect(val).To(Equal("")) - val, err = client.Set("A", "VALUE", 0).Result() + err = client.Set("A", "VALUE", 0).Err() Expect(err).NotTo(HaveOccurred()) - Expect(val).To(Equal("OK")) Eventually(func() string { return client.Get("A").Val() @@ -295,9 +293,9 @@ var _ = Describe("ClusterClient", func() { } wg.Wait() - n, err := client.Get("key").Int64() - Expect(err).NotTo(HaveOccurred()) - Expect(n).To(Equal(int64(100))) + Eventually(func() string { + return client.Get("key").Val() + }, 30*time.Second).Should(Equal("100")) }) Describe("pipelining", func() { @@ -578,12 +576,13 @@ var _ = Describe("ClusterClient", func() { opt = redisClusterOptions() client = cluster.clusterClient(opt) - _ = client.ForEachMaster(func(master *redis.Client) error { - return master.FlushDB().Err() - }) - _ = client.ForEachSlave(func(slave *redis.Client) error { defer GinkgoRecover() + + _ = client.ForEachMaster(func(master *redis.Client) error { + return master.FlushDB().Err() + }) + Eventually(func() int64 { return slave.DBSize().Val() }, 30*time.Second).Should(Equal(int64(0))) @@ -670,7 +669,7 @@ var _ = Describe("ClusterClient without valid nodes", func() { It("returns an error", func() { err := client.Ping().Err() - Expect(err).To(MatchError("redis: cannot load cluster slots")) + Expect(err).To(MatchError("ERR This instance has cluster support disabled")) }) It("pipeline returns an error", func() { @@ -678,7 +677,7 @@ var _ = Describe("ClusterClient without valid nodes", func() { pipe.Ping() return nil }) - Expect(err).To(MatchError("redis: cannot load cluster slots")) + Expect(err).To(MatchError("ERR This instance has cluster support disabled")) }) }) @@ -743,7 +742,8 @@ var _ = Describe("ClusterClient timeout", func() { }) AfterEach(func() { - client.ForEachNode(func(client *redis.Client) error { + _ = client.ForEachNode(func(client *redis.Client) error { + defer GinkgoRecover() Eventually(func() error { return client.Ping().Err() }, 2*pause).ShouldNot(HaveOccurred()) diff --git a/internal/error.go b/internal/error.go index 0898eeb6..c190f469 100644 --- a/internal/error.go +++ b/internal/error.go @@ -42,6 +42,10 @@ func IsNetworkError(err error) bool { return ok } +func IsReadOnlyError(err error) bool { + return strings.HasPrefix(err.Error(), "READONLY ") +} + func IsBadConn(err error, allowTimeout bool) bool { if err == nil { return false diff --git a/internal/singleflight/singleflight.go b/internal/singleflight/singleflight.go new file mode 100644 index 00000000..3b174172 --- /dev/null +++ b/internal/singleflight/singleflight.go @@ -0,0 +1,64 @@ +/* +Copyright 2013 Google Inc. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// Package singleflight provides a duplicate function call suppression +// mechanism. +package singleflight + +import "sync" + +// call is an in-flight or completed Do call +type call struct { + wg sync.WaitGroup + val interface{} + err error +} + +// Group represents a class of work and forms a namespace in which +// units of work can be executed with duplicate suppression. +type Group struct { + mu sync.Mutex // protects m + m map[string]*call // lazily initialized +} + +// Do executes and returns the results of the given function, making +// sure that only one execution is in-flight for a given key at a +// time. If a duplicate comes in, the duplicate caller waits for the +// original to complete and receives the same results. +func (g *Group) Do(key string, fn func() (interface{}, error)) (interface{}, error) { + g.mu.Lock() + if g.m == nil { + g.m = make(map[string]*call) + } + if c, ok := g.m[key]; ok { + g.mu.Unlock() + c.wg.Wait() + return c.val, c.err + } + c := new(call) + c.wg.Add(1) + g.m[key] = c + g.mu.Unlock() + + c.val, c.err = fn() + c.wg.Done() + + g.mu.Lock() + delete(g.m, key) + g.mu.Unlock() + + return c.val, c.err +}
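
For reference, a minimal usage sketch of the new singleflight.Group (hypothetical, not part of the patch): a test file that could sit next to singleflight.go and drives Do the same way clusterNodes.GetOrCreate now does, with many concurrent callers for one key sharing a single underlying call. The test name, the key string, and the "ready" value are illustrative only.

package singleflight

import (
	"sync"
	"sync/atomic"
	"testing"
)

// TestDoSharesInFlightCall is a hypothetical example (not part of the patch
// above). It exercises Group.Do the way clusterNodes.GetOrCreate does in
// cluster.go: many goroutines ask for the same key at once, and duplicate
// callers wait for the in-flight call and share its result instead of each
// running the expensive function themselves.
func TestDoSharesInFlightCall(t *testing.T) {
	var g Group
	var calls int32 // how many times the expensive function actually ran

	const workers = 50
	var wg sync.WaitGroup
	wg.Add(workers)

	for i := 0; i < workers; i++ {
		go func() {
			defer wg.Done()
			v, err := g.Do("node:127.0.0.1:7000", func() (interface{}, error) {
				// Stand-in for the newClusterNode + node.Test() work that
				// GetOrCreate wraps in Do.
				atomic.AddInt32(&calls, 1)
				return "ready", nil
			})
			if err != nil {
				t.Errorf("Do returned error: %v", err)
			}
			if v.(string) != "ready" {
				t.Errorf("Do returned %v, want \"ready\"", v)
			}
		}()
	}
	wg.Wait()

	// Do forgets a key once its call completes, so callers that arrive after
	// completion start a fresh execution; the count is therefore anywhere
	// between 1 and workers, but strictly concurrent duplicates never run
	// the function more than once.
	if n := atomic.LoadInt32(&calls); n < 1 || int(n) > workers {
		t.Errorf("function ran %d times, want between 1 and %d", n, workers)
	}
}

This is also why GetOrCreate routes node creation through Do before taking the write lock: a burst of commands that all discover the same new node at once ends up dialing and testing it only once, and the losers close their duplicate client.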