ledisdb/server/replication.go

373 lines
6.1 KiB
Go
Raw Normal View History

package server
2014-06-08 12:43:59 +04:00
import (
2014-06-09 13:23:32 +04:00
"bufio"
"bytes"
"errors"
"fmt"
2014-09-24 08:34:21 +04:00
"github.com/siddontang/go/hack"
2014-09-24 05:46:36 +04:00
"github.com/siddontang/go/log"
2014-10-05 13:24:44 +04:00
"github.com/siddontang/go/num"
2014-06-09 13:23:32 +04:00
"github.com/siddontang/ledisdb/ledis"
"github.com/siddontang/ledisdb/rpl"
2014-06-09 13:23:32 +04:00
"net"
2014-06-08 12:43:59 +04:00
"os"
"path"
2014-06-09 13:23:32 +04:00
"strconv"
"sync"
"time"
2014-06-08 12:43:59 +04:00
)
2014-06-09 13:23:32 +04:00
// errConnectMaster is a sentinel error whose message is "connect master error".
// It is not referenced by the code visible in this part of the file.
var errConnectMaster = errors.New("connect master error")
type master struct {
sync.Mutex
conn net.Conn
rb *bufio.Reader
2014-06-09 13:23:32 +04:00
app *App
quit chan struct{}
addr string
2014-06-09 13:23:32 +04:00
wg sync.WaitGroup
syncBuf bytes.Buffer
}
func newMaster(app *App) *master {
m := new(master)
m.app = app
2014-06-10 06:41:50 +04:00
m.quit = make(chan struct{}, 1)
2014-06-09 13:23:32 +04:00
return m
2014-06-08 12:43:59 +04:00
}
2014-06-09 13:23:32 +04:00
func (m *master) Close() {
2014-09-24 17:31:26 +04:00
ledis.AsyncNotify(m.quit)
2014-06-09 13:23:32 +04:00
if m.conn != nil {
m.conn.Close()
m.conn = nil
2014-06-09 13:23:32 +04:00
}
m.wg.Wait()
2014-06-08 12:43:59 +04:00
}
2014-06-09 13:23:32 +04:00
func (m *master) connect() error {
if len(m.addr) == 0 {
2014-06-09 13:23:32 +04:00
return fmt.Errorf("no assign master addr")
}
if m.conn != nil {
m.conn.Close()
m.conn = nil
2014-06-08 12:43:59 +04:00
}
if conn, err := net.Dial("tcp", m.addr); err != nil {
2014-06-09 13:23:32 +04:00
return err
} else {
m.conn = conn
2014-06-09 13:23:32 +04:00
m.rb = bufio.NewReaderSize(m.conn, 4096)
2014-06-09 13:23:32 +04:00
}
return nil
}
2014-06-09 13:23:32 +04:00
// stopReplication shuts down any running replication by delegating to Close.
// It always returns nil.
func (m *master) stopReplication() error {
	m.Close()

	return nil
}
func (m *master) startReplication(masterAddr string, restart bool) error {
2014-06-09 13:23:32 +04:00
//stop last replcation, if avaliable
m.Close()
m.addr = masterAddr
2014-06-09 13:23:32 +04:00
2014-06-10 06:41:50 +04:00
m.quit = make(chan struct{}, 1)
2014-06-09 13:23:32 +04:00
2014-10-10 05:49:16 +04:00
m.app.cfg.Readonly = true
m.wg.Add(1)
go m.runReplication(restart)
2014-06-09 13:23:32 +04:00
return nil
}
func (m *master) runReplication(restart bool) {
2014-06-09 13:23:32 +04:00
defer m.wg.Done()
for {
select {
case <-m.quit:
return
default:
if err := m.connect(); err != nil {
log.Error("connect master %s error %s, try 2s later", m.addr, err.Error())
2014-06-09 13:23:32 +04:00
time.Sleep(2 * time.Second)
continue
}
}
if restart {
if err := m.fullSync(); err != nil {
if m.conn != nil {
//if conn == nil, other close the replication, not error
log.Error("restart fullsync error %s", err.Error())
}
return
}
}
2014-06-09 13:23:32 +04:00
for {
if err := m.sync(); err != nil {
if m.conn != nil {
//if conn == nil, other close the replication, not error
2014-09-25 12:03:29 +04:00
log.Error("sync error %s", err.Error())
2014-06-10 06:41:50 +04:00
}
return
2014-06-10 06:41:50 +04:00
}
select {
2014-06-09 13:23:32 +04:00
case <-m.quit:
return
default:
2014-06-10 06:41:50 +04:00
break
2014-06-09 13:23:32 +04:00
}
}
}
return
}
// fullSyncCmd is the pre-encoded one-element request "fullsync" written to
// the master by fullSync.
var fullSyncCmd = []byte("*1\r\n$8\r\nfullsync\r\n")

// syncCmdFormat is the fmt template for the two-element request
// "sync <logid>". Its verbs take the byte length of the log ID string (%d)
// followed by the log ID string itself (%s).
var syncCmdFormat = "*2\r\n$4\r\nsync\r\n$%d\r\n%s\r\n"
2014-06-09 13:23:32 +04:00
func (m *master) fullSync() error {
log.Info("begin full sync")
if _, err := m.conn.Write(fullSyncCmd); err != nil {
2014-06-09 13:23:32 +04:00
return err
}
dumpPath := path.Join(m.app.cfg.DataDir, "master.dump")
2014-09-18 17:27:43 +04:00
f, err := os.OpenFile(dumpPath, os.O_CREATE|os.O_WRONLY, 0644)
2014-06-09 13:23:32 +04:00
if err != nil {
return err
}
defer os.Remove(dumpPath)
2014-07-11 06:43:39 +04:00
err = ReadBulkTo(m.rb, f)
2014-06-09 13:23:32 +04:00
f.Close()
if err != nil {
log.Error("read dump data error %s", err.Error())
return err
}
if _, err = m.app.ldb.LoadDumpFile(dumpPath); err != nil {
2014-06-09 13:23:32 +04:00
log.Error("load dump file error %s", err.Error())
return err
}
return nil
2014-06-09 13:23:32 +04:00
}
2014-09-25 18:33:09 +04:00
// nextSyncLogID returns the ID of the next log to request from the master:
// one past the larger of LastID and CommitID as reported by the local
// database's ReplicationStat. Errors from ReplicationStat are returned as is.
func (m *master) nextSyncLogID() (uint64, error) {
	stat, err := m.app.ldb.ReplicationStat()
	if err != nil {
		return 0, err
	}

	newest := stat.CommitID
	if stat.LastID > newest {
		newest = stat.LastID
	}
	return newest + 1, nil
}
2014-06-09 13:23:32 +04:00
func (m *master) sync() error {
var err error
var syncID uint64
2014-09-25 18:33:09 +04:00
if syncID, err = m.nextSyncLogID(); err != nil {
return err
}
2014-06-09 13:23:32 +04:00
logIDStr := strconv.FormatUint(syncID, 10)
2014-09-24 08:34:21 +04:00
cmd := hack.Slice(fmt.Sprintf(syncCmdFormat, len(logIDStr),
logIDStr))
if _, err := m.conn.Write(cmd); err != nil {
2014-06-09 13:23:32 +04:00
return err
}
m.syncBuf.Reset()
if err = ReadBulkTo(m.rb, &m.syncBuf); err != nil {
switch err.Error() {
case ledis.ErrLogMissed.Error():
return m.fullSync()
case ledis.ErrRplNotSupport.Error():
m.stopReplication()
return nil
default:
return err
}
2014-06-09 13:23:32 +04:00
}
2014-09-27 06:08:45 +04:00
buf := m.syncBuf.Bytes()
2014-06-09 13:23:32 +04:00
if len(buf) == 0 {
2014-06-09 13:23:32 +04:00
return nil
}
if err = m.app.ldb.StoreLogsFromData(buf); err != nil {
2014-06-09 13:23:32 +04:00
return err
}
return nil
2014-06-09 13:23:32 +04:00
}
2014-10-10 05:49:16 +04:00
func (app *App) slaveof(masterAddr string, restart bool, readonly bool) error {
2014-06-09 13:23:32 +04:00
app.m.Lock()
defer app.m.Unlock()
2014-10-10 05:49:16 +04:00
//in master mode and no slaveof, only set readonly
if len(app.cfg.SlaveOf) == 0 && len(masterAddr) == 0 {
app.cfg.Readonly = readonly
return nil
}
if !app.ldb.ReplicationUsed() {
return fmt.Errorf("slaveof must enable replication")
}
app.cfg.SlaveOf = masterAddr
2014-06-09 13:23:32 +04:00
if len(masterAddr) == 0 {
2014-09-25 12:03:29 +04:00
if err := app.m.stopReplication(); err != nil {
return err
}
2014-10-10 05:49:16 +04:00
app.cfg.Readonly = readonly
2014-06-09 13:23:32 +04:00
} else {
return app.m.startReplication(masterAddr, restart)
2014-06-08 12:43:59 +04:00
}
return nil
}
// tryReSlaveof resumes replication from the master recorded in the config,
// always requesting a full sync first. It is a no-op returning nil when
// replication is not enabled or no master is configured.
func (app *App) tryReSlaveof() error {
	app.m.Lock()
	defer app.m.Unlock()

	if !app.ldb.ReplicationUsed() || app.cfg.SlaveOf == "" {
		return nil
	}

	return app.m.startReplication(app.cfg.SlaveOf, true)
}
// addSlave records c in the set of connected slaves, under app.slock.
func (app *App) addSlave(c *client) {
	app.slock.Lock()
	defer app.slock.Unlock()

	// The map is used as a set; the value carries no information.
	app.slaves[c] = struct{}{}
}
func (app *App) removeSlave(c *client) {
app.slock.Lock()
defer app.slock.Unlock()
2014-10-09 07:47:14 +04:00
if _, ok := app.slaves[c]; ok {
delete(app.slaves, c)
log.Info("remove slave %s", c.remoteAddr)
}
if c.ack != nil {
2014-09-24 17:31:26 +04:00
asyncNotifyUint64(c.ack.ch, c.lastLogID)
}
}
// asyncNotifyUint64 attempts to send v on ch without ever blocking. If the
// send cannot proceed immediately (full buffer, no ready receiver, or a nil
// channel) the value is silently dropped.
func asyncNotifyUint64(ch chan uint64, v uint64) {
	select {
	case ch <- v:
		// Delivered.
	default:
		// Would block: drop the notification.
	}
}
func (app *App) publishNewLog(l *rpl.Log) {
if !app.cfg.Replication.Sync {
//no sync replication, we will do async
return
}
ss := make([]*client, 0, 4)
app.slock.Lock()
logId := l.ID
for s, _ := range app.slaves {
2014-09-23 13:53:52 +04:00
if s.lastLogID >= logId {
2014-10-09 07:47:14 +04:00
//slave has already owned this log
ss = []*client{}
break
} else {
ss = append(ss, s)
}
}
app.slock.Unlock()
if len(ss) == 0 {
return
}
ack := &syncAck{
logId, make(chan uint64, len(ss)),
}
for _, s := range ss {
s.ack = ack
}
2014-10-05 13:24:44 +04:00
total := (len(ss) + 1) / 2
if app.cfg.Replication.WaitMaxSlaveAcks > 0 {
total = num.MinInt(total, app.cfg.Replication.WaitMaxSlaveAcks)
}
done := make(chan struct{}, 1)
go func(total int) {
n := 0
for i := 0; i < len(ss); i++ {
id := <-ack.ch
if id > logId {
n++
if n >= total {
break
}
}
}
done <- struct{}{}
2014-10-05 13:24:44 +04:00
}(total)
select {
case <-done:
case <-time.After(time.Duration(app.cfg.Replication.WaitSyncTime) * time.Second):
2014-10-05 13:24:44 +04:00
log.Info("replication wait timeout")
}
}