diff --git a/cluster/cluster.go b/cluster/cluster.go
index 8f07707bc..a89534404 100644
--- a/cluster/cluster.go
+++ b/cluster/cluster.go
@@ -217,7 +217,8 @@ type Cluster struct {
 	crcTable               *crc64.Table
 	SlavesOldestMasterFile SlavesOldestMasterFile
 	SlavesConnected        int
-	clog                   *clog.Logger `json:"-"`
+	clog                   *clog.Logger         `json:"-"`
+	MDevIssues             *config.MDevIssueMap `json:"-"`
 	*ClusterGraphite
 }

@@ -321,6 +322,7 @@ func (cluster *Cluster) Init(confs *config.ConfVersion, cfgGroup string, tlog *s
 	cluster.runUUID = runUUID
 	cluster.repmgrHostname = repmgrHostname
 	cluster.repmgrVersion = repmgrVersion
+	cluster.MDevIssues = config.NewMDevIssueMap()
 	cluster.InitFromConf()

 	cluster.NewClusterGraphite()
diff --git a/cluster/cluster_chk.go b/cluster/cluster_chk.go
index 2e1ee0e57..90d245be1 100644
--- a/cluster/cluster_chk.go
+++ b/cluster/cluster_chk.go
@@ -90,6 +90,12 @@ func (cluster *Cluster) isSlaveElectableForSwitchover(sl *ServerMonitor, forcingLog
 		// }
 		return false
 	}
+
+	// Skip election if the replica is affected by a known replication blocker bug
+	if !cluster.runOnceAfterTopology && cluster.Conf.FailoverCheckBlocker && !cluster.CheckBlockerState(sl, forcingLog) {
+		return false
+	}
+
 	if cluster.Conf.SwitchGtidCheck && cluster.IsCurrentGTIDSync(sl, cluster.master) == false && cluster.Conf.RplChecks == true {
 		// if cluster.Conf.LogLevel > 1 || forcingLog {
 		cluster.LogModulePrintf(forcingLog, config.ConstLogModGeneral, config.LvlWarn, "Equal-GTID option is enabled and GTID position on slave %s differs from master. Skipping", sl.URL)
@@ -201,7 +207,7 @@ func (cluster *Cluster) isOneSlaveHeartbeatIncreasing() bool {
 		cluster.LogModulePrintf(cluster.Conf.Verbose, config.ConstLogModGeneral, config.LvlDbg, "SLAVE_RECEIVED_HEARTBEATS %d", status2["SLAVE_RECEIVED_HEARTBEATS"])
 		// }
 		if status2["SLAVE_RECEIVED_HEARTBEATS"] > saveheartbeats {
-			cluster.SetState("ERR00028", state.State{ErrType: config.LvlErr, ErrDesc: fmt.Sprintf(clusterError["ERR00028"], s.URL), ErrFrom: "CHECK"})
+			cluster.SetState("ERR00028", state.State{ErrType: config.LvlErr, ErrDesc: clusterError["ERR00028"], ErrFrom: "CHECK", ServerUrl: s.URL})
 			return true
 		}
 	}
@@ -637,7 +643,7 @@ func (cluster *Cluster) CheckTableChecksum(schema string, table string) {
 			if slaveSeq >= masterSeq {
 				break
 			} else {
-				cluster.SetState("WARN0086", state.State{ErrType: "WARNING", ErrDesc: fmt.Sprintf(clusterError["WARN0086"], s.URL), ErrFrom: "MON", ServerUrl: s.URL})
+				cluster.SetState("WARN0086", state.State{ErrType: "WARNING", ErrDesc: clusterError["WARN0086"], ErrFrom: "MON", ServerUrl: s.URL})
 			}
 			time.Sleep(1 * time.Second)
 		}
@@ -856,3 +862,23 @@ func (cluster *Cluster) CheckDefaultUser(i bool) {
 		cluster.SetState("WARN0108", state.State{ErrType: "WARNING", ErrDesc: fmt.Sprintf(clusterError["WARN0108"], out), ErrFrom: "CLUSTER"})
 	}
 }
+
+// CheckBlockerState checks the replica for known replication blocker bugs; it returns true when the replica is electable or the check is not activated
+func (cluster *Cluster) CheckBlockerState(sl *ServerMonitor, forcingLog bool) bool {
+	// Nothing to do when the regression check is disabled
+	if !cluster.Conf.FailoverCheckBlocker {
+		return true
+	}
+
+	blockers := []string{
+		"MDEV-28310",
+	}
+
+	for _, mdev := range blockers {
+		if sl.MDevIssues.HasMdevBug(mdev) {
+			return false
+		}
+	}
+
+	return true
+}
diff --git a/cluster/cluster_fail.go b/cluster/cluster_fail.go
index da2a9c93a..adb834e14 100644
--- a/cluster/cluster_fail.go
+++ b/cluster/cluster_fail.go
@@ -643,7 +643,7 @@ func (cluster *Cluster) electSwitchoverGroupReplicationCandidate(l []*ServerMoni
 	// Return one not ignored not full , not prefered
 	for i, sl := range l {
 		if sl.IsIgnored() {
-
cluster.SetState("ERR00037", state.State{ErrType: config.LvlWarn, ErrDesc: fmt.Sprintf(clusterError["ERR00037"], sl.URL), ServerUrl: sl.URL, ErrFrom: "CHECK"}) + cluster.SetState("ERR00037", state.State{ErrType: config.LvlWarn, ErrDesc: clusterError["ERR00037"], ServerUrl: sl.URL, ErrFrom: "CHECK"}) continue } if cluster.IsInPreferedHosts(sl) { @@ -671,7 +671,7 @@ func (cluster *Cluster) electSwitchoverCandidate(l []*ServerMonitor, forcingLog /* If server is in the ignore list, do not elect it in switchover */ if sl.IsIgnored() { - cluster.SetState("ERR00037", state.State{ErrType: config.LvlWarn, ErrDesc: fmt.Sprintf(clusterError["ERR00037"], sl.URL), ServerUrl: sl.URL, ErrFrom: "CHECK"}) + cluster.SetState("ERR00037", state.State{ErrType: config.LvlWarn, ErrDesc: clusterError["ERR00037"], ServerUrl: sl.URL, ErrFrom: "CHECK"}) continue } if sl.IsFull { @@ -679,27 +679,27 @@ func (cluster *Cluster) electSwitchoverCandidate(l []*ServerMonitor, forcingLog } //Need comment// if sl.IsRelay { - cluster.SetState("ERR00036", state.State{ErrType: config.LvlWarn, ErrDesc: fmt.Sprintf(clusterError["ERR00036"], sl.URL), ServerUrl: sl.URL, ErrFrom: "CHECK"}) + cluster.SetState("ERR00036", state.State{ErrType: config.LvlWarn, ErrDesc: clusterError["ERR00036"], ServerUrl: sl.URL, ErrFrom: "CHECK"}) continue } if !sl.HasBinlog() && !sl.IsIgnored() { - cluster.SetState("ERR00013", state.State{ErrType: config.LvlWarn, ErrDesc: fmt.Sprintf(clusterError["ERR00013"], sl.URL), ErrFrom: "CHECK", ServerUrl: sl.URL}) + cluster.SetState("ERR00013", state.State{ErrType: config.LvlWarn, ErrDesc: clusterError["ERR00013"], ErrFrom: "CHECK", ServerUrl: sl.URL}) continue } if cluster.Conf.MultiMaster == true && sl.State == stateMaster { - cluster.SetState("ERR00035", state.State{ErrType: config.LvlWarn, ErrDesc: fmt.Sprintf(clusterError["ERR00035"], sl.URL), ServerUrl: sl.URL, ErrFrom: "CHECK"}) + cluster.SetState("ERR00035", state.State{ErrType: config.LvlWarn, ErrDesc: clusterError["ERR00035"], ServerUrl: sl.URL, ErrFrom: "CHECK"}) continue } // The tests below should run only in case of a switchover as they require the master to be up. 
if cluster.isSlaveElectableForSwitchover(sl, forcingLog) == false { - cluster.SetState("ERR00034", state.State{ErrType: config.LvlWarn, ErrDesc: fmt.Sprintf(clusterError["ERR00034"], sl.URL), ServerUrl: sl.URL, ErrFrom: "CHECK"}) + cluster.SetState("ERR00034", state.State{ErrType: config.LvlWarn, ErrDesc: clusterError["ERR00034"], ServerUrl: sl.URL, ErrFrom: "CHECK"}) continue } /* binlog + ping */ if cluster.isSlaveElectable(sl, forcingLog) == false { - cluster.SetState("ERR00039", state.State{ErrType: config.LvlWarn, ErrDesc: fmt.Sprintf(clusterError["ERR00039"], sl.URL), ServerUrl: sl.URL, ErrFrom: "CHECK"}) + cluster.SetState("ERR00039", state.State{ErrType: config.LvlWarn, ErrDesc: clusterError["ERR00039"], ServerUrl: sl.URL, ErrFrom: "CHECK"}) continue } @@ -712,14 +712,14 @@ func (cluster *Cluster) electSwitchoverCandidate(l []*ServerMonitor, forcingLog return i } if sl.HaveNoMasterOnStart == true && cluster.Conf.FailRestartUnsafe == false { - cluster.SetState("ERR00084", state.State{ErrType: config.LvlWarn, ErrDesc: fmt.Sprintf(clusterError["ERR00084"], sl.URL), ServerUrl: sl.URL, ErrFrom: "CHECK"}) + cluster.SetState("ERR00084", state.State{ErrType: config.LvlWarn, ErrDesc: clusterError["ERR00084"], ServerUrl: sl.URL, ErrFrom: "CHECK"}) continue } ss, errss := sl.GetSlaveStatus(sl.ReplicationSourceName) // not a slave if errss != nil && cluster.Conf.FailRestartUnsafe == false { //Skip slave in election %s have no master log file, slave might have failed - cluster.SetState("ERR00033", state.State{ErrType: config.LvlWarn, ErrDesc: fmt.Sprintf(clusterError["ERR00033"], sl.URL), ServerUrl: sl.URL, ErrFrom: "CHECK"}) + cluster.SetState("ERR00033", state.State{ErrType: config.LvlWarn, ErrDesc: clusterError["ERR00033"], ServerUrl: sl.URL, ErrFrom: "CHECK"}) continue } // Fake position if none as new slave @@ -815,23 +815,23 @@ func (cluster *Cluster) electFailoverCandidate(l []*ServerMonitor, forcingLog bo //Need comment// if sl.IsRelay { - cluster.SetState("ERR00036", state.State{ErrType: config.LvlWarn, ErrDesc: fmt.Sprintf(clusterError["ERR00036"], sl.URL), ErrFrom: "CHECK", ServerUrl: sl.URL}) + cluster.SetState("ERR00036", state.State{ErrType: config.LvlWarn, ErrDesc: clusterError["ERR00036"], ErrFrom: "CHECK", ServerUrl: sl.URL}) continue } if sl.IsFull { continue } if cluster.Conf.MultiMaster == true && sl.State == stateMaster { - cluster.SetState("ERR00035", state.State{ErrType: config.LvlWarn, ErrDesc: fmt.Sprintf(clusterError["ERR00035"], sl.URL), ErrFrom: "CHECK", ServerUrl: sl.URL}) + cluster.SetState("ERR00035", state.State{ErrType: config.LvlWarn, ErrDesc: clusterError["ERR00035"], ErrFrom: "CHECK", ServerUrl: sl.URL}) trackposList[i].Ignoredmultimaster = true continue } if sl.HaveNoMasterOnStart == true && cluster.Conf.FailRestartUnsafe == false { - cluster.SetState("ERR00084", state.State{ErrType: config.LvlWarn, ErrDesc: fmt.Sprintf(clusterError["ERR00084"], sl.URL), ServerUrl: sl.URL, ErrFrom: "CHECK"}) + cluster.SetState("ERR00084", state.State{ErrType: config.LvlWarn, ErrDesc: clusterError["ERR00084"], ServerUrl: sl.URL, ErrFrom: "CHECK"}) continue } if !sl.HasBinlog() && !sl.IsIgnored() { - cluster.SetState("ERR00013", state.State{ErrType: config.LvlWarn, ErrDesc: fmt.Sprintf(clusterError["ERR00013"], sl.URL), ErrFrom: "CHECK", ServerUrl: sl.URL}) + cluster.SetState("ERR00013", state.State{ErrType: config.LvlWarn, ErrDesc: clusterError["ERR00013"], ErrFrom: "CHECK", ServerUrl: sl.URL}) continue } if cluster.GetTopology() == topoMultiMasterWsrep && cluster.vmaster != 
nil { @@ -850,7 +850,7 @@ func (cluster *Cluster) electFailoverCandidate(l []*ServerMonitor, forcingLog bo ss, errss := sl.GetSlaveStatus(sl.ReplicationSourceName) // not a slave if errss != nil && cluster.Conf.FailRestartUnsafe == false { - cluster.SetState("ERR00033", state.State{ErrType: config.LvlWarn, ErrDesc: fmt.Sprintf(clusterError["ERR00033"], sl.URL), ErrFrom: "CHECK", ServerUrl: sl.URL}) + cluster.SetState("ERR00033", state.State{ErrType: config.LvlWarn, ErrDesc: clusterError["ERR00033"], ErrFrom: "CHECK", ServerUrl: sl.URL}) trackposList[i].Ignoredreplication = true continue } @@ -1028,7 +1028,7 @@ func (cluster *Cluster) isSlaveElectable(sl *ServerMonitor, forcingLog bool) boo } //if master is alived and IO Thread stops then not a good candidate and not forced if ss.SlaveIORunning.String == "No" && cluster.Conf.RplChecks && !cluster.IsMasterFailed() { - cluster.SetState("ERR00087", state.State{ErrType: "WARNING", ErrDesc: fmt.Sprintf(clusterError["ERR00087"], sl.URL), ErrFrom: "CHECK", ServerUrl: sl.URL}) + cluster.SetState("ERR00087", state.State{ErrType: "WARNING", ErrDesc: clusterError["ERR00087"], ErrFrom: "CHECK", ServerUrl: sl.URL}) // if cluster.Conf.LogLevel > 1 || forcingLog { cluster.LogModulePrintf(forcingLog, config.ConstLogModWriterElection, config.LvlWarn, "Unsafe failover condition. Slave %s IO Thread is stopped %s. Skipping", sl.URL, ss.LastIOError.String) // } @@ -1037,14 +1037,14 @@ func (cluster *Cluster) isSlaveElectable(sl *ServerMonitor, forcingLog bool) boo /* binlog + ping */ if dbhelper.CheckSlavePrerequisites(sl.Conn, sl.Host, sl.DBVersion) == false { - cluster.SetState("ERR00040", state.State{ErrType: "WARNING", ErrDesc: fmt.Sprintf(clusterError["ERR00040"], sl.URL), ErrFrom: "CHECK", ServerUrl: sl.URL}) + cluster.SetState("ERR00040", state.State{ErrType: "WARNING", ErrDesc: clusterError["ERR00040"], ErrFrom: "CHECK", ServerUrl: sl.URL}) // if cluster.Conf.LogLevel > 1 || forcingLog { cluster.LogModulePrintf(forcingLog, config.ConstLogModWriterElection, config.LvlWarn, "Slave %s does not ping or has no binlogs. Skipping", sl.URL) // } return false } if sl.IsMaintenance { - cluster.SetState("ERR00047", state.State{ErrType: "WARNING", ErrDesc: fmt.Sprintf(clusterError["ERR00047"], sl.URL), ErrFrom: "CHECK", ServerUrl: sl.URL}) + cluster.SetState("ERR00047", state.State{ErrType: "WARNING", ErrDesc: clusterError["ERR00047"], ErrFrom: "CHECK", ServerUrl: sl.URL}) // if cluster.Conf.LogLevel > 1 || forcingLog { cluster.LogModulePrintf(forcingLog, config.ConstLogModWriterElection, config.LvlWarn, "Slave %s is in maintenance. Skipping", sl.URL) // } @@ -1061,24 +1061,29 @@ func (cluster *Cluster) isSlaveElectable(sl *ServerMonitor, forcingLog bool) boo } if ss.SlaveSQLRunning.String == "No" && cluster.Conf.RplChecks { - cluster.SetState("ERR00042", state.State{ErrType: "WARNING", ErrDesc: fmt.Sprintf(clusterError["ERR00042"], sl.URL), ErrFrom: "CHECK", ServerUrl: sl.URL}) + cluster.SetState("ERR00042", state.State{ErrType: "WARNING", ErrDesc: clusterError["ERR00042"], ErrFrom: "CHECK", ServerUrl: sl.URL}) // if cluster.Conf.LogLevel > 1 || forcingLog { cluster.LogModulePrintf(forcingLog, config.ConstLogModWriterElection, config.LvlWarn, "Unsafe failover condition. Slave %s SQL Thread is stopped. 
Skipping", sl.URL) // } return false } + // If cluster have bug in replication + if !cluster.runOnceAfterTopology && cluster.Conf.FailoverCheckBlocker && !cluster.CheckBlockerState(sl, forcingLog) { + return false + } + //if master is alived and connection issues, we have to refetch password from vault if ss.SlaveIORunning.String == "Connecting" && !cluster.IsMasterFailed() { cluster.LogModulePrintf(forcingLog, config.ConstLogModWriterElection, config.LvlDbg, "isSlaveElect lastIOErrno: %s", ss.LastIOErrno.String) if ss.LastIOErrno.String == "1045" { - cluster.SetState("ERR00088", state.State{ErrType: "WARNING", ErrDesc: fmt.Sprintf(clusterError["ERR00088"], sl.URL), ErrFrom: "CHECK", ServerUrl: sl.URL}) + cluster.SetState("ERR00088", state.State{ErrType: "WARNING", ErrDesc: clusterError["ERR00088"], ErrFrom: "CHECK", ServerUrl: sl.URL}) sl.SetReplicationCredentialsRotation(ss) } } if sl.HaveSemiSync && sl.SemiSyncSlaveStatus == false && cluster.Conf.FailSync && cluster.Conf.RplChecks { - cluster.SetState("ERR00043", state.State{ErrType: "WARNING", ErrDesc: fmt.Sprintf(clusterError["ERR00043"], sl.URL), ErrFrom: "CHECK", ServerUrl: sl.URL}) + cluster.SetState("ERR00043", state.State{ErrType: "WARNING", ErrDesc: clusterError["ERR00043"], ErrFrom: "CHECK", ServerUrl: sl.URL}) // if cluster.Conf.LogLevel > 1 || forcingLog { cluster.LogModulePrintf(forcingLog, config.ConstLogModWriterElection, config.LvlWarn, "Semi-sync slave %s is out of sync. Skipping", sl.URL) // } @@ -1101,7 +1106,7 @@ func (cluster *Cluster) isSlaveValidReader(sl *ServerMonitor, forcingLog bool) b } if sl.IsMaintenance { - cluster.SetState("ERR00047", state.State{ErrType: "WARNING", ErrDesc: fmt.Sprintf(clusterError["ERR00047"], sl.URL), ErrFrom: "CHECK", ServerUrl: sl.URL}) + cluster.SetState("ERR00047", state.State{ErrType: "WARNING", ErrDesc: clusterError["ERR00047"], ErrFrom: "CHECK", ServerUrl: sl.URL}) // if cluster.Conf.LogLevel > 1 || forcingLog { cluster.LogModulePrintf(forcingLog, config.ConstLogModWriterElection, config.LvlWarn, "Slave %s is in maintenance. Skipping", sl.URL) // } @@ -1117,7 +1122,7 @@ func (cluster *Cluster) isSlaveValidReader(sl *ServerMonitor, forcingLog bool) b return false } if sl.HaveSemiSync && sl.SemiSyncSlaveStatus == false && cluster.Conf.FailSync && cluster.Conf.RplChecks { - cluster.SetState("ERR00043", state.State{ErrType: "WARNING", ErrDesc: fmt.Sprintf(clusterError["ERR00043"], sl.URL), ErrFrom: "CHECK", ServerUrl: sl.URL}) + cluster.SetState("ERR00043", state.State{ErrType: "WARNING", ErrDesc: clusterError["ERR00043"], ErrFrom: "CHECK", ServerUrl: sl.URL}) if cluster.Conf.LogLevel > 1 || forcingLog { cluster.LogModulePrintf(forcingLog, config.ConstLogModGeneral,LvlWarn, "Semi-sync slave %s is out of sync. Skipping", sl.URL) } @@ -1125,7 +1130,7 @@ func (cluster *Cluster) isSlaveValidReader(sl *ServerMonitor, forcingLog bool) b } */ if ss.SlaveSQLRunning.String == "No" { - cluster.SetState("ERR00042", state.State{ErrType: "WARNING", ErrDesc: fmt.Sprintf(clusterError["ERR00042"], sl.URL), ErrFrom: "CHECK", ServerUrl: sl.URL}) + cluster.SetState("ERR00042", state.State{ErrType: "WARNING", ErrDesc: clusterError["ERR00042"], ErrFrom: "CHECK", ServerUrl: sl.URL}) // if cluster.Conf.LogLevel > 1 || forcingLog { cluster.LogModulePrintf(forcingLog, config.ConstLogModWriterElection, config.LvlWarn, "Unsafe failover condition. Slave %s SQL Thread is stopped. 
Skipping", sl.URL) // } @@ -1383,7 +1388,7 @@ func (cluster *Cluster) electVirtualCandidate(oldMaster *ServerMonitor, forcingL for i, sl := range cluster.Servers { /* If server is in the ignore list, do not elect it */ if sl.IsIgnored() { - cluster.SetState("ERR00037", state.State{ErrType: "WARNING", ErrDesc: fmt.Sprintf(clusterError["ERR00037"], sl.URL), ErrFrom: "CHECK"}) + cluster.SetState("ERR00037", state.State{ErrType: "WARNING", ErrDesc: clusterError["ERR00037"], ErrFrom: "CHECK", ServerUrl: sl.URL}) // if cluster.Conf.LogLevel > 1 || forcingLog { cluster.LogModulePrintf(forcingLog, config.ConstLogModGeneral, config.LvlDbg, "%s is in the ignore list. Skipping", sl.URL) // } diff --git a/cluster/cluster_log.go b/cluster/cluster_log.go index a52c5b472..0a30f6dad 100644 --- a/cluster/cluster_log.go +++ b/cluster/cluster_log.go @@ -459,14 +459,21 @@ func (cluster *Cluster) LogPrintState(st state.State, resolved bool) int { tag := config.GetTagsForLog(config.ConstLogModGeneral) cliformat := format - format = "[" + cluster.Name + "] [" + tag + "] " + padright(level, " ", 5) + " - " + format + format = "[" + cluster.Name + "][" + tag + "] " + padright(level, " ", 5) + " - " + format + if st.ServerUrl != "" { + format = format + " [" + st.ServerUrl + "]" + } if cluster.tlog != nil && cluster.tlog.Len > 0 { cluster.tlog.Add(format) } if cluster.Conf.HttpServ { + httpformat := fmt.Sprintf("[%s] %s", tag, cliformat) + if st.ServerUrl != "" { + httpformat = fmt.Sprintf("[%s] %s. Servers: [%s]", tag, cliformat, st.ServerUrl) + } msg := s18log.HttpMessage{ Group: cluster.Name, Level: level, @@ -478,11 +485,15 @@ func (cluster *Cluster) LogPrintState(st state.State, resolved bool) int { } if cluster.Conf.Daemon { + sURL := "none" + if st.ServerUrl != "" { + sURL = st.ServerUrl + } // wrap logrus levels if resolved { - log.WithFields(log.Fields{"cluster": cluster.Name, "type": "state", "status": "RESOLV", "code": st.ErrKey, "channel": "StdOut"}).Warnf(st.ErrDesc) + log.WithFields(log.Fields{"cluster": cluster.Name, "type": "state", "status": "RESOLV", "code": st.ErrKey, "channel": "StdOut", "server": sURL}).Warnf(st.ErrDesc) } else { - log.WithFields(log.Fields{"cluster": cluster.Name, "type": "state", "status": "OPENED", "code": st.ErrKey, "channel": "StdOut"}).Warnf(st.ErrDesc) + log.WithFields(log.Fields{"cluster": cluster.Name, "type": "state", "status": "OPENED", "code": st.ErrKey, "channel": "StdOut", "server": sURL}).Warnf(st.ErrDesc) } if cluster.Conf.TeamsUrl != "" && cluster.Conf.TeamsAlertState != "" { diff --git a/cluster/cluster_set.go b/cluster/cluster_set.go index 8312faa8e..7e61df3cc 100644 --- a/cluster/cluster_set.go +++ b/cluster/cluster_set.go @@ -1767,6 +1767,12 @@ func (cluster *Cluster) SetMonitorCaptureTrigger(value string) { cluster.Unlock() } +func (cluster *Cluster) SetMDevList(value *config.MDevIssueMap) { + cluster.Lock() + cluster.MDevIssues = value + cluster.Unlock() +} + func (cluster *Cluster) SetMasterNil() { cluster.master = nil } diff --git a/cluster/cluster_tgl.go b/cluster/cluster_tgl.go index 588af73ae..cd48a18f3 100644 --- a/cluster/cluster_tgl.go +++ b/cluster/cluster_tgl.go @@ -609,3 +609,7 @@ func (cluster *Cluster) SwitchDynamicTopology() { func (cluster *Cluster) SwitchReplicationNoRelay() { cluster.Conf.ReplicationNoRelay = !cluster.Conf.ReplicationNoRelay } + +func (cluster *Cluster) SwitchFailoverCheckBlocker() { + cluster.Conf.FailoverCheckBlocker = !cluster.Conf.FailoverCheckBlocker +} diff --git a/cluster/cluster_topo.go b/cluster/cluster_topo.go 
index 022891b7c..ecb813ecd 100644 --- a/cluster/cluster_topo.go +++ b/cluster/cluster_topo.go @@ -297,7 +297,7 @@ func (cluster *Cluster) TopologyDiscover(wcg *sync.WaitGroup) error { if sl.HasCycling() { hasCycling = true if cluster.Conf.MultiMaster == false && len(cluster.Servers) == 2 { - cluster.SetState("ERR00011", state.State{ErrType: "WARNING", ErrDesc: fmt.Sprintf(clusterError["ERR00011"]), ErrFrom: "TOPO", ServerUrl: sl.URL}) + cluster.SetState("ERR00011", state.State{ErrType: "WARNING", ErrDesc: clusterError["ERR00011"], ErrFrom: "TOPO", ServerUrl: sl.URL}) // if cluster.Conf.DynamicTopology { cluster.Conf.MultiMaster = true cluster.Topology = topoMultiMaster @@ -435,7 +435,7 @@ func (cluster *Cluster) TopologyDiscover(wcg *sync.WaitGroup) error { if cluster.master == nil { // could not detect master if cluster.GetMaster() == nil { - cluster.SetState("ERR00012", state.State{ErrType: "ERROR", ErrDesc: fmt.Sprintf(clusterError["ERR00012"]), ErrFrom: "TOPO"}) + cluster.SetState("ERR00012", state.State{ErrType: "ERROR", ErrDesc: clusterError["ERR00012"], ErrFrom: "TOPO"}) } } else { cluster.master.HaveHealthyReplica = false diff --git a/cluster/srv.go b/cluster/srv.go index e0d49fb2e..3352e6c6f 100644 --- a/cluster/srv.go +++ b/cluster/srv.go @@ -187,6 +187,8 @@ type ServerMonitor struct { DelayStat *ServerDelayStat `json:"delayStat"` SlaveVariables SlaveVariables `json:"slaveVariables"` IsReseeding bool `json:"isReseeding"` + MDevIssues ServerBug `json:"mdevIssues"` + IsCheckedForMDevIssues bool `json:"isCheckedForMdevIssues"` IsInSlowQueryCapture bool IsInPFSQueryCapture bool InPurgingBinaryLog bool @@ -197,6 +199,27 @@ type ServerMonitor struct { DBDataDir string } +type ServerBug struct { + Replication []string + Service []string +} + +func (sb *ServerBug) HasMdevBug(key string) bool { + for _, r := range sb.Replication { + if r == key { + return true + } + } + + for _, s := range sb.Service { + if s == key { + return true + } + } + + return false +} + type SlaveVariables struct { SlaveParallelMaxQueued int `json:"slaveParallelMaxQueued"` SlaveParallelMode string `json:"slaveParallelMode"` @@ -752,7 +775,7 @@ func (server *ServerMonitor) Refresh() error { server.EventStatus, logs, err = dbhelper.GetEventStatus(server.Conn, server.DBVersion) cluster.LogSQL(logs, err, server.URL, "Monitor", config.LvlDbg, "Could not get events status %s %s", server.URL, err) if err != nil { - cluster.SetState("ERR00073", state.State{ErrType: config.LvlErr, ErrDesc: fmt.Sprintf(clusterError["ERR00073"], server.URL), ErrFrom: "MON"}) + cluster.SetState("ERR00073", state.State{ErrType: config.LvlErr, ErrDesc: clusterError["ERR00073"], ErrFrom: "MON", ServerUrl: server.URL}) } if cluster.StateMachine.GetHeartbeats()%30 == 0 { server.SaveInfos() @@ -780,7 +803,7 @@ func (server *ServerMonitor) Refresh() error { server.CurrentWorkLoad() server.AvgWorkLoad() server.MaxWorkLoad() - + cluster.StateMachine.PreserveGroup("MDEV") } // end not postgress // get Users @@ -800,7 +823,7 @@ func (server *ServerMonitor) Refresh() error { } } if server.InCaptureMode { - cluster.SetState("WARN0085", state.State{ErrType: config.LvlInfo, ErrDesc: fmt.Sprintf(clusterError["WARN0085"], server.URL), ServerUrl: server.URL, ErrFrom: "MON"}) + cluster.SetState("WARN0085", state.State{ErrType: config.LvlInfo, ErrDesc: clusterError["WARN0085"], ServerUrl: server.URL, ErrFrom: "MON"}) } logs := "" diff --git a/cluster/srv_chk.go b/cluster/srv_chk.go index 881cb6122..3746e5b93 100644 --- a/cluster/srv_chk.go +++ 
b/cluster/srv_chk.go @@ -27,7 +27,7 @@ func (server *ServerMonitor) CheckMaxConnections() { maxCx, _ := strconv.ParseInt(server.Variables.Get("MAX_CONNECTIONS"), 10, 64) curCx, _ := strconv.ParseInt(server.Status.Get("THREADS_CONNECTED"), 10, 64) if curCx > maxCx*80/100 { - cluster.SetState("ERR00076", state.State{ErrType: config.LvlWarn, ErrDesc: fmt.Sprintf(clusterError["ERR00076"], server.URL), ErrFrom: "MON", ServerUrl: server.URL}) + cluster.SetState("ERR00076", state.State{ErrType: config.LvlWarn, ErrDesc: clusterError["ERR00076"], ErrFrom: "MON", ServerUrl: server.URL}) } } @@ -37,10 +37,15 @@ func (server *ServerMonitor) CheckVersion() { cluster.SetState("MDEV20821", state.State{ErrType: "WARNING", ErrDesc: fmt.Sprintf(clusterError["MDEV20821"], server.URL), ErrFrom: "MON", ServerUrl: server.URL}) } - if server.DBVersion.IsMariaDB() && !server.HasBinlogRow() && server.DBVersion.LowerReleaseList("10.2.44", "10.3.35", "10.4.25", "10.5.16", "10.6.8", "10.7.4", "10.8.3", "10.9.1") { - cluster.SetState("MDEV28310", state.State{ErrType: "WARNING", ErrDesc: fmt.Sprintf(clusterError["MDEV28310"], server.URL), ErrFrom: "MON", ServerUrl: server.URL}) - } + // Already logged with srv_mdev + // if server.DBVersion.IsMariaDB() && !server.HasBinlogRow() && server.DBVersion.LowerReleaseList("10.2.44", "10.3.35", "10.4.25", "10.5.16", "10.6.8", "10.7.4", "10.8.3", "10.9.1") { + // cluster.SetState("MDEV28310", state.State{ErrType: "WARNING", ErrDesc: fmt.Sprintf(clusterError["MDEV28310"], server.URL), ErrFrom: "MON", ServerUrl: server.URL}) + // } + //Only check once + if !server.IsCheckedForMDevIssues { + server.CheckMDevIssues() + } } // CheckDisks check mariadb disk plugin ti see if it get free space @@ -48,7 +53,7 @@ func (server *ServerMonitor) CheckDisks() { cluster := server.ClusterGroup for _, d := range server.Disks { if d.Used/d.Total*100 > int32(cluster.Conf.MonitorDiskUsagePct) { - cluster.SetState("ERR00079", state.State{ErrType: "WARNING", ErrDesc: fmt.Sprintf(clusterError["ERR00079"], server.URL), ErrFrom: "MON", ServerUrl: server.URL}) + cluster.SetState("ERR00079", state.State{ErrType: "WARNING", ErrDesc: clusterError["ERR00079"], ErrFrom: "MON", ServerUrl: server.URL}) } } } @@ -177,7 +182,7 @@ func (server *ServerMonitor) CheckSlaveSettings() { cluster.LogModulePrintf(cluster.Conf.Verbose, config.ConstLogModGeneral, "DEBUG", "Enforce semisync on slave %s", sl.URL) dbhelper.InstallSemiSync(sl.Conn, server.DBVersion) } else if sl.IsIgnored() == false && sl.HaveSemiSync == false && cluster.GetTopology() != topoMultiMasterWsrep { - cluster.SetState("WARN0048", state.State{ErrType: config.LvlWarn, ErrDesc: fmt.Sprintf(clusterError["WARN0048"], sl.URL), ErrFrom: "TOPO", ServerUrl: sl.URL}) + cluster.SetState("WARN0048", state.State{ErrType: config.LvlWarn, ErrDesc: clusterError["WARN0048"], ErrFrom: "TOPO", ServerUrl: sl.URL}) } if cluster.Conf.ForceBinlogRow && sl.HaveBinlogRow == false { @@ -186,7 +191,7 @@ func (server *ServerMonitor) CheckSlaveSettings() { cluster.LogModulePrintf(cluster.Conf.Verbose, config.ConstLogModGeneral, "INFO", "Enforce binlog format ROW on slave %s", sl.URL) } else if sl.IsIgnored() == false && sl.HaveBinlogRow == false && (cluster.Conf.AutorejoinFlashback == true || cluster.GetTopology() == topoMultiMasterWsrep) { //galera or binlog flashback need row based binlog - cluster.SetState("WARN0049", state.State{ErrType: config.LvlWarn, ErrDesc: fmt.Sprintf(clusterError["WARN0049"], sl.URL), ErrFrom: "TOPO", ServerUrl: sl.URL}) + cluster.SetState("WARN0049", 
state.State{ErrType: config.LvlWarn, ErrDesc: clusterError["WARN0049"], ErrFrom: "TOPO", ServerUrl: sl.URL}) } if cluster.Conf.ForceSlaveReadOnly && sl.ReadOnly == "OFF" && !server.IsIgnoredReadonly() && !cluster.IsMultiMaster() { // In non-multimaster mode, enforce read-only flag if the option is set @@ -197,32 +202,32 @@ func (server *ServerMonitor) CheckSlaveSettings() { dbhelper.SetSlaveHeartbeat(sl.Conn, "1", cluster.Conf.MasterConn, server.DBVersion) cluster.LogModulePrintf(cluster.Conf.Verbose, config.ConstLogModGeneral, "INFO", "Enforce heartbeat to 1s on slave %s", sl.URL) } else if sl.IsIgnored() == false && sl.GetReplicationHearbeatPeriod() > 1 { - cluster.SetState("WARN0050", state.State{ErrType: config.LvlWarn, ErrDesc: fmt.Sprintf(clusterError["WARN0050"], sl.URL), ErrFrom: "TOPO", ServerUrl: sl.URL}) + cluster.SetState("WARN0050", state.State{ErrType: config.LvlWarn, ErrDesc: clusterError["WARN0050"], ErrFrom: "TOPO", ServerUrl: sl.URL}) } if cluster.Conf.ForceSlaveGtid && sl.GetReplicationUsingGtid() == "No" { dbhelper.SetSlaveGTIDMode(sl.Conn, "slave_pos", cluster.Conf.MasterConn, server.DBVersion) cluster.LogModulePrintf(cluster.Conf.Verbose, config.ConstLogModGeneral, "INFO", "Enforce GTID replication on slave %s", sl.URL) } else if sl.IsIgnored() == false && sl.GetReplicationUsingGtid() == "No" && cluster.GetTopology() != topoMultiMasterWsrep && server.IsMariaDB() { - cluster.SetState("WARN0051", state.State{ErrType: config.LvlWarn, ErrDesc: fmt.Sprintf(clusterError["WARN0051"], sl.URL), ErrFrom: "TOPO", ServerUrl: sl.URL}) + cluster.SetState("WARN0051", state.State{ErrType: config.LvlWarn, ErrDesc: clusterError["WARN0051"], ErrFrom: "TOPO", ServerUrl: sl.URL}) } if cluster.Conf.ForceSlaveGtidStrict && !sl.IsReplicationUsingGtidStrict() && cluster.GetTopology() != topoMultiMasterWsrep && server.IsMariaDB() { dbhelper.SetSlaveGTIDModeStrict(sl.Conn, server.DBVersion) cluster.LogModulePrintf(cluster.Conf.Verbose, config.ConstLogModGeneral, "INFO", "Enforce GTID strict mode on slave %s", sl.URL) } else if !sl.IsIgnored() && !sl.IsReplicationUsingGtidStrict() && cluster.GetTopology() != topoMultiMasterWsrep && server.IsMariaDB() { - cluster.SetState("WARN0058", state.State{ErrType: config.LvlWarn, ErrDesc: fmt.Sprintf(clusterError["WARN0058"], sl.URL), ErrFrom: "TOPO", ServerUrl: sl.URL}) + cluster.SetState("WARN0058", state.State{ErrType: config.LvlWarn, ErrDesc: clusterError["WARN0058"], ErrFrom: "TOPO", ServerUrl: sl.URL}) } if cluster.Conf.ForceSlaveIdempotent && !sl.HaveSlaveIdempotent && cluster.GetTopology() != topoMultiMasterWsrep && server.IsMariaDB() { dbhelper.SetSlaveExecMode(sl.Conn, "IDEMPOTENT", cluster.Conf.MasterConn, server.DBVersion) cluster.LogModulePrintf(cluster.Conf.Verbose, config.ConstLogModGeneral, "INFO", "Enforce replication mode idempotent on slave %s", sl.URL) } /* else if !sl.IsIgnored() && cluster.Conf.ForceSlaveIdempotent && sl.HaveSlaveIdempotent && cluster.GetTopology() != topoMultiMasterWsrep && server.IsMariaDB() { - cluster.SetState("WARN0103", state.State{ErrType: config.LvlWarn, ErrDesc: fmt.Sprintf(clusterError["WARN0103"], sl.URL), ErrFrom: "TOPO", ServerUrl: sl.URL}) + cluster.SetState("WARN0103", state.State{ErrType: config.LvlWarn, ErrDesc: clusterError["WARN0103"], ErrFrom: "TOPO", ServerUrl: sl.URL}) }*/ if cluster.Conf.ForceSlaveStrict && sl.HaveSlaveIdempotent && cluster.GetTopology() != topoMultiMasterWsrep && server.IsMariaDB() { dbhelper.SetSlaveExecMode(sl.Conn, "STRICT", cluster.Conf.MasterConn, server.DBVersion) 
cluster.LogModulePrintf(cluster.Conf.Verbose, config.ConstLogModGeneral, "INFO", "Enforce replication mode strict on slave %s", sl.URL) } /*else if !sl.IsIgnored() && cluster.Conf.ForceSlaveStrict && && cluster.GetTopology() != topoMultiMasterWsrep && server.IsMariaDB() { - cluster.SetState("WARN0104", state.State{ErrType: config.LvlWarn, ErrDesc: fmt.Sprintf(clusterError["WARN0103"], sl.URL), ErrFrom: "TOPO", ServerUrl: sl.URL}) + cluster.SetState("WARN0104", state.State{ErrType: config.LvlWarn, ErrDesc: clusterError["WARN0103"], ErrFrom: "TOPO", ServerUrl: sl.URL}) } */ if strings.ToUpper(cluster.Conf.ForceSlaveParallelMode) == "OPTIMISTIC" && !sl.HaveSlaveOptimistic && cluster.GetTopology() != topoMultiMasterWsrep && server.IsMariaDB() { dbhelper.SetSlaveParallelMode(sl.Conn, "OPTIMISTIC", cluster.Conf.MasterConn, server.DBVersion) @@ -248,35 +253,35 @@ func (server *ServerMonitor) CheckSlaveSettings() { dbhelper.SetSyncInnodb(sl.Conn) cluster.LogModulePrintf(cluster.Conf.Verbose, config.ConstLogModGeneral, "INFO", "Enforce InnoDB durability on slave %s", sl.URL) } else if sl.IsIgnored() == false && sl.HaveInnodbTrxCommit == false { - cluster.SetState("WARN0052", state.State{ErrType: config.LvlWarn, ErrDesc: fmt.Sprintf(clusterError["WARN0052"], sl.URL), ErrFrom: "TOPO", ServerUrl: sl.URL}) + cluster.SetState("WARN0052", state.State{ErrType: config.LvlWarn, ErrDesc: clusterError["WARN0052"], ErrFrom: "TOPO", ServerUrl: sl.URL}) } if cluster.Conf.ForceBinlogChecksum && sl.HaveChecksum == false { dbhelper.SetBinlogChecksum(sl.Conn) cluster.LogModulePrintf(cluster.Conf.Verbose, config.ConstLogModGeneral, "INFO", "Enforce checksum on slave %s", sl.URL) } else if sl.IsIgnored() == false && sl.HaveChecksum == false { - cluster.SetState("WARN0053", state.State{ErrType: config.LvlWarn, ErrDesc: fmt.Sprintf(clusterError["WARN0053"], sl.URL), ErrFrom: "TOPO", ServerUrl: sl.URL}) + cluster.SetState("WARN0053", state.State{ErrType: config.LvlWarn, ErrDesc: clusterError["WARN0053"], ErrFrom: "TOPO", ServerUrl: sl.URL}) } if cluster.Conf.ForceBinlogSlowqueries && sl.HaveBinlogSlowqueries == false { dbhelper.SetBinlogSlowqueries(sl.Conn) cluster.LogModulePrintf(cluster.Conf.Verbose, config.ConstLogModGeneral, "INFO", "Enforce log slow queries of replication on slave %s", sl.URL) } else if sl.IsIgnored() == false && sl.HaveBinlogSlowqueries == false { - cluster.SetState("WARN0054", state.State{ErrType: config.LvlWarn, ErrDesc: fmt.Sprintf(clusterError["WARN0054"], sl.URL), ErrFrom: "TOPO", ServerUrl: sl.URL}) + cluster.SetState("WARN0054", state.State{ErrType: config.LvlWarn, ErrDesc: clusterError["WARN0054"], ErrFrom: "TOPO", ServerUrl: sl.URL}) } if cluster.Conf.ForceBinlogAnnotate && sl.HaveBinlogAnnotate == false && server.IsMariaDB() { dbhelper.SetBinlogAnnotate(sl.Conn) cluster.LogModulePrintf(cluster.Conf.Verbose, config.ConstLogModGeneral, "INFO", "Enforce annotate on slave %s", sl.URL) } else if sl.IsIgnored() == false && sl.HaveBinlogAnnotate == false && server.IsMariaDB() { - cluster.SetState("WARN0055", state.State{ErrType: config.LvlWarn, ErrDesc: fmt.Sprintf(clusterError["WARN0055"], sl.URL), ErrFrom: "TOPO", ServerUrl: sl.URL}) + cluster.SetState("WARN0055", state.State{ErrType: config.LvlWarn, ErrDesc: clusterError["WARN0055"], ErrFrom: "TOPO", ServerUrl: sl.URL}) } if cluster.Conf.ForceBinlogCompress && sl.HaveBinlogCompress == false && sl.DBVersion.IsMariaDB() && sl.DBVersion.Major >= 10 && sl.DBVersion.Minor >= 2 { dbhelper.SetBinlogCompress(sl.Conn) 
cluster.LogModulePrintf(cluster.Conf.Verbose, config.ConstLogModGeneral, "INFO", "Enforce binlog compression on slave %s", sl.URL) } else if sl.IsIgnored() == false && sl.HaveBinlogCompress == false && sl.DBVersion.IsMariaDB() && sl.DBVersion.Major >= 10 && sl.DBVersion.Minor >= 2 { - cluster.SetState("WARN0056", state.State{ErrType: config.LvlWarn, ErrDesc: fmt.Sprintf(clusterError["WARN0056"], sl.URL), ErrFrom: "TOPO", ServerUrl: sl.URL}) + cluster.SetState("WARN0056", state.State{ErrType: config.LvlWarn, ErrDesc: clusterError["WARN0056"], ErrFrom: "TOPO", ServerUrl: sl.URL}) } if sl.IsIgnored() == false && sl.HaveBinlogSlaveUpdates == false { - cluster.SetState("WARN0057", state.State{ErrType: config.LvlWarn, ErrDesc: fmt.Sprintf(clusterError["WARN0057"], sl.URL), ErrFrom: "TOPO", ServerUrl: sl.URL}) + cluster.SetState("WARN0057", state.State{ErrType: config.LvlWarn, ErrDesc: clusterError["WARN0057"], ErrFrom: "TOPO", ServerUrl: sl.URL}) } if server.IsAcid() == false && cluster.IsDiscovered() { @@ -292,49 +297,49 @@ func (server *ServerMonitor) CheckMasterSettings() { cluster.LogModulePrintf(cluster.Conf.Verbose, config.ConstLogModGeneral, "INFO", "Enforce semisync on Master %s", server.URL) dbhelper.InstallSemiSync(server.Conn, server.DBVersion) } else if server.HaveSemiSync == false && cluster.GetTopology() != topoMultiMasterWsrep && cluster.GetTopology() != topoMultiMasterGrouprep { - cluster.SetState("WARN0060", state.State{ErrType: "WARNING", ErrDesc: fmt.Sprintf(clusterError["WARN0060"], server.URL), ErrFrom: "TOPO", ServerUrl: server.URL}) + cluster.SetState("WARN0060", state.State{ErrType: "WARNING", ErrDesc: clusterError["WARN0060"], ErrFrom: "TOPO", ServerUrl: server.URL}) } if cluster.Conf.ForceBinlogRow && server.HaveBinlogRow == false { dbhelper.SetBinlogFormat(server.Conn, "ROW") cluster.LogModulePrintf(cluster.Conf.Verbose, config.ConstLogModGeneral, "INFO", "Enforce binlog format ROW on Master %s", server.URL) } else if server.HaveBinlogRow == false && cluster.Conf.AutorejoinFlashback == true { - cluster.SetState("WARN0061", state.State{ErrType: "WARNING", ErrDesc: fmt.Sprintf(clusterError["WARN0061"], server.URL), ErrFrom: "TOPO", ServerUrl: server.URL}) + cluster.SetState("WARN0061", state.State{ErrType: "WARNING", ErrDesc: clusterError["WARN0061"], ErrFrom: "TOPO", ServerUrl: server.URL}) } if cluster.Conf.ForceSyncBinlog && server.HaveBinlogSync == false { dbhelper.SetSyncBinlog(server.Conn) cluster.LogModulePrintf(cluster.Conf.Verbose, config.ConstLogModGeneral, "INFO", "Enforce sync binlog on Master %s", server.URL) } else if server.HaveBinlogSync == false { - cluster.SetState("WARN0062", state.State{ErrType: "WARNING", ErrDesc: fmt.Sprintf(clusterError["WARN0062"], server.URL), ErrFrom: "TOPO", ServerUrl: server.URL}) + cluster.SetState("WARN0062", state.State{ErrType: "WARNING", ErrDesc: clusterError["WARN0062"], ErrFrom: "TOPO", ServerUrl: server.URL}) } if cluster.Conf.ForceSyncInnoDB && server.HaveBinlogSync == false { dbhelper.SetSyncInnodb(server.Conn) cluster.LogModulePrintf(cluster.Conf.Verbose, config.ConstLogModGeneral, "INFO", "Enforce innodb durability on Master %s", server.URL) } else if server.HaveBinlogSync == false { - cluster.SetState("WARN0064", state.State{ErrType: "WARNING", ErrDesc: fmt.Sprintf(clusterError["WARN0064"], server.URL), ErrFrom: "TOPO", ServerUrl: server.URL}) + cluster.SetState("WARN0064", state.State{ErrType: "WARNING", ErrDesc: clusterError["WARN0064"], ErrFrom: "TOPO", ServerUrl: server.URL}) } if 
cluster.Conf.ForceBinlogAnnotate && server.HaveBinlogAnnotate == false && server.IsMariaDB() { dbhelper.SetBinlogAnnotate(server.Conn) cluster.LogModulePrintf(cluster.Conf.Verbose, config.ConstLogModGeneral, "INFO", "Enforce binlog annotate on master %s", server.URL) } else if server.HaveBinlogAnnotate == false && server.IsMariaDB() { - cluster.SetState("WARN0067", state.State{ErrType: "WARNING", ErrDesc: fmt.Sprintf(clusterError["WARN0067"], server.URL), ErrFrom: "TOPO", ServerUrl: server.URL}) + cluster.SetState("WARN0067", state.State{ErrType: "WARNING", ErrDesc: clusterError["WARN0067"], ErrFrom: "TOPO", ServerUrl: server.URL}) } if cluster.Conf.ForceBinlogChecksum && server.HaveChecksum == false { dbhelper.SetBinlogChecksum(server.Conn) cluster.LogModulePrintf(cluster.Conf.Verbose, config.ConstLogModGeneral, "INFO", "Enforce ckecksum annotate on master %s", server.URL) } else if server.HaveChecksum == false { - cluster.SetState("WARN0065", state.State{ErrType: "WARNING", ErrDesc: fmt.Sprintf(clusterError["WARN0065"], server.URL), ErrFrom: "TOPO", ServerUrl: server.URL}) + cluster.SetState("WARN0065", state.State{ErrType: "WARNING", ErrDesc: clusterError["WARN0065"], ErrFrom: "TOPO", ServerUrl: server.URL}) } if cluster.Conf.ForceBinlogCompress && server.HaveBinlogCompress == false && server.IsMariaDB() && server.DBVersion.Major >= 10 && server.DBVersion.Minor >= 2 { dbhelper.SetBinlogCompress(server.Conn) cluster.LogModulePrintf(cluster.Conf.Verbose, config.ConstLogModGeneral, "INFO", "Enforce binlog compression on master %s", server.URL) } else if server.HaveBinlogCompress == false && server.DBVersion.IsMariaDB() && server.DBVersion.Major >= 10 && server.DBVersion.Minor >= 2 { - cluster.SetState("WARN0068", state.State{ErrType: "WARNING", ErrDesc: fmt.Sprintf(clusterError["WARN0068"], server.URL), ErrFrom: "TOPO", ServerUrl: server.URL}) + cluster.SetState("WARN0068", state.State{ErrType: "WARNING", ErrDesc: clusterError["WARN0068"], ErrFrom: "TOPO", ServerUrl: server.URL}) } if server.HaveBinlogSlaveUpdates == false { - cluster.SetState("WARN0069", state.State{ErrType: "WARNING", ErrDesc: fmt.Sprintf(clusterError["WARN0069"], server.URL), ErrFrom: "TOPO", ServerUrl: server.URL}) + cluster.SetState("WARN0069", state.State{ErrType: "WARNING", ErrDesc: clusterError["WARN0069"], ErrFrom: "TOPO", ServerUrl: server.URL}) } if server.HaveGtidStrictMode == false && server.DBVersion.Flavor == "MariaDB" && cluster.GetTopology() != topoMultiMasterWsrep && cluster.GetTopology() != topoMultiMasterGrouprep { - cluster.SetState("WARN0070", state.State{ErrType: "WARNING", ErrDesc: fmt.Sprintf(clusterError["WARN0070"], server.URL), ErrFrom: "TOPO", ServerUrl: server.URL}) + cluster.SetState("WARN0070", state.State{ErrType: "WARNING", ErrDesc: clusterError["WARN0070"], ErrFrom: "TOPO", ServerUrl: server.URL}) } if server.IsAcid() == false && cluster.IsDiscovered() { cluster.SetState("WARN0007", state.State{ErrType: "WARNING", ErrDesc: "At least one server is not ACID-compliant. 
Please make sure that sync_binlog and innodb_flush_log_at_trx_commit are set to 1", ErrFrom: "CONF", ServerUrl: server.URL}) @@ -381,13 +386,13 @@ func (server *ServerMonitor) CheckPrivileges() { cluster.SetState("ERR00005", state.State{ErrType: "ERROR", ErrDesc: fmt.Sprintf(clusterError["ERR00005"], cluster.GetDbUser(), cluster.repmgrHostname, err), ErrFrom: "CONF", ServerUrl: server.URL}) } if priv.Repl_client_priv == "N" { - cluster.SetState("ERR00006", state.State{ErrType: "ERROR", ErrDesc: fmt.Sprintf(clusterError["ERR00006"], server.URL), ErrFrom: "CONF", ServerUrl: server.URL}) + cluster.SetState("ERR00006", state.State{ErrType: "ERROR", ErrDesc: clusterError["ERR00006"], ErrFrom: "CONF", ServerUrl: server.URL}) } if priv.Super_priv == "N" { - cluster.SetState("ERR00008", state.State{ErrType: "ERROR", ErrDesc: fmt.Sprintf(clusterError["ERR00008"], server.URL), ErrFrom: "CONF", ServerUrl: server.URL}) + cluster.SetState("ERR00008", state.State{ErrType: "ERROR", ErrDesc: clusterError["ERR00008"], ErrFrom: "CONF", ServerUrl: server.URL}) } if priv.Reload_priv == "N" { - cluster.SetState("ERR00009", state.State{ErrType: "ERROR", ErrDesc: fmt.Sprintf(clusterError["ERR00009"], server.URL), ErrFrom: "CONF", ServerUrl: server.URL}) + cluster.SetState("ERR00009", state.State{ErrType: "ERROR", ErrDesc: clusterError["ERR00009"], ErrFrom: "CONF", ServerUrl: server.URL}) } } // Check replication user has correct privs. @@ -400,7 +405,7 @@ func (server *ServerMonitor) CheckPrivileges() { cluster.SetState("ERR00015", state.State{ErrType: "ERROR", ErrDesc: fmt.Sprintf(clusterError["ERR00015"], cluster.GetRplUser(), sv2.URL, err), ErrFrom: "CONF", ServerUrl: sv2.URL}) } if rpriv.Repl_slave_priv == "N" { - cluster.SetState("ERR00007", state.State{ErrType: "ERROR", ErrDesc: fmt.Sprintf(clusterError["ERR00007"], sv2.URL), ErrFrom: "CONF", ServerUrl: sv2.URL}) + cluster.SetState("ERR00007", state.State{ErrType: "ERROR", ErrDesc: clusterError["ERR00007"], ErrFrom: "CONF", ServerUrl: sv2.URL}) } } } diff --git a/cluster/srv_get.go b/cluster/srv_get.go index 7cb0d070d..c1852e965 100644 --- a/cluster/srv_get.go +++ b/cluster/srv_get.go @@ -534,7 +534,7 @@ func (server *ServerMonitor) GetNewDBConn() (*sqlx.DB, error) { server.SetDSN() conn, err := sqlx.Connect("mysql", server.DSN) if err == nil { - server.ClusterGroup.SetState("ERR00080", state.State{ErrType: config.LvlErr, ErrDesc: fmt.Sprintf(clusterError["ERR00080"], server.URL), ServerUrl: server.URL, ErrFrom: "MON"}) + server.ClusterGroup.SetState("ERR00080", state.State{ErrType: config.LvlErr, ErrDesc: clusterError["ERR00080"], ServerUrl: server.URL, ErrFrom: "MON"}) } else { server.TLSConfigUsed = ConstTLSNoConfig server.SetDSN() diff --git a/cluster/srv_has.go b/cluster/srv_has.go index 8e9fc5c2b..1d74a2329 100644 --- a/cluster/srv_has.go +++ b/cluster/srv_has.go @@ -189,6 +189,14 @@ func (server *ServerMonitor) HasBinlogRow() bool { return server.Variables.Get("BINLOG_FORMAT") == "ROW" } +func (server *ServerMonitor) HasBinlogMixed() bool { + return server.Variables.Get("BINLOG_FORMAT") == "MIXED" +} + +func (server *ServerMonitor) HasBinlogStatement() bool { + return server.Variables.Get("BINLOG_FORMAT") == "STATEMENT" +} + func (server *ServerMonitor) HasBinlogRowAnnotate() bool { return server.Variables.Get("BINLOG_ANNOTATE_ROW_EVENTS") == "ON" } diff --git a/cluster/srv_job.go b/cluster/srv_job.go index af77a4453..0ee143372 100644 --- a/cluster/srv_job.go +++ b/cluster/srv_job.go @@ -94,7 +94,7 @@ func (server *ServerMonitor) JobBackupPhysical() 
(int64, error) { cluster := server.ClusterGroup if cluster.IsInBackup() && cluster.Conf.BackupRestic { - cluster.SetState("WARN0110", state.State{ErrType: "WARNING", ErrDesc: fmt.Sprintf(cluster.GetErrorList()["WARN0110"], "Physical", cluster.Conf.BackupPhysicalType, server.URL), ErrFrom: "JOB", ServerUrl: server.URL}) + cluster.SetState("WARN0110", state.State{ErrType: "WARNING", ErrDesc: fmt.Sprintf(clusterError["WARN0110"], "Physical", cluster.Conf.BackupPhysicalType, server.URL), ErrFrom: "JOB", ServerUrl: server.URL}) time.Sleep(1 * time.Second) return server.JobBackupPhysical() @@ -691,7 +691,7 @@ func (server *ServerMonitor) JobsCheckRunning() error { rows.Scan(&task.task, &task.ct, &task.id) if task.ct > 0 { if task.ct > 10 { - cluster.SetState("ERR00060", state.State{ErrType: "WARNING", ErrDesc: fmt.Sprintf(cluster.GetErrorList()["ERR00060"], server.URL), ErrFrom: "JOB", ServerUrl: server.URL}) + cluster.SetState("ERR00060", state.State{ErrType: "WARNING", ErrDesc: clusterError["ERR00060"], ErrFrom: "JOB", ServerUrl: server.URL}) purge := "DELETE from replication_manager_schema.jobs WHERE task='" + task.task + "' AND done=0 AND result IS NULL order by start asc limit " + strconv.Itoa(task.ct-1) err := server.ExecQueryNoBinLog(purge) if err != nil { @@ -699,31 +699,31 @@ func (server *ServerMonitor) JobsCheckRunning() error { } } else { if task.task == "optimized" { - cluster.SetState("WARN0072", state.State{ErrType: "WARNING", ErrDesc: fmt.Sprintf(cluster.GetErrorList()["WARN0072"], server.URL), ErrFrom: "JOB", ServerUrl: server.URL}) + cluster.SetState("WARN0072", state.State{ErrType: "WARNING", ErrDesc: clusterError["WARN0072"], ErrFrom: "JOB", ServerUrl: server.URL}) } else if task.task == "restart" { - cluster.SetState("WARN0096", state.State{ErrType: "WARNING", ErrDesc: fmt.Sprintf(cluster.GetErrorList()["WARN0096"], server.URL), ErrFrom: "JOB", ServerUrl: server.URL}) + cluster.SetState("WARN0096", state.State{ErrType: "WARNING", ErrDesc: clusterError["WARN0096"], ErrFrom: "JOB", ServerUrl: server.URL}) } else if task.task == "stop" { - cluster.SetState("WARN0097", state.State{ErrType: "WARNING", ErrDesc: fmt.Sprintf(cluster.GetErrorList()["WARN0097"], server.URL), ErrFrom: "JOB", ServerUrl: server.URL}) + cluster.SetState("WARN0097", state.State{ErrType: "WARNING", ErrDesc: clusterError["WARN0097"], ErrFrom: "JOB", ServerUrl: server.URL}) } else if task.task == "xtrabackup" { - cluster.SetState("WARN0073", state.State{ErrType: "WARNING", ErrDesc: fmt.Sprintf(cluster.GetErrorList()["WARN0073"], cluster.Conf.BackupPhysicalType, server.URL), ErrFrom: "JOB", ServerUrl: server.URL}) + cluster.SetState("WARN0073", state.State{ErrType: "WARNING", ErrDesc: fmt.Sprintf(clusterError["WARN0073"], cluster.Conf.BackupPhysicalType), ErrFrom: "JOB", ServerUrl: server.URL}) } else if task.task == "mariabackup" { - cluster.SetState("WARN0073", state.State{ErrType: "WARNING", ErrDesc: fmt.Sprintf(cluster.GetErrorList()["WARN0073"], cluster.Conf.BackupPhysicalType, server.URL), ErrFrom: "JOB", ServerUrl: server.URL}) + cluster.SetState("WARN0073", state.State{ErrType: "WARNING", ErrDesc: fmt.Sprintf(clusterError["WARN0073"], cluster.Conf.BackupPhysicalType), ErrFrom: "JOB", ServerUrl: server.URL}) } else if task.task == "reseedxtrabackup" { - cluster.SetState("WARN0074", state.State{ErrType: "WARNING", ErrDesc: fmt.Sprintf(cluster.GetErrorList()["WARN0074"], cluster.Conf.BackupPhysicalType, server.URL), ErrFrom: "JOB", ServerUrl: server.URL}) + cluster.SetState("WARN0074", 
state.State{ErrType: "WARNING", ErrDesc: fmt.Sprintf(clusterError["WARN0074"], cluster.Conf.BackupPhysicalType), ErrFrom: "JOB", ServerUrl: server.URL}) } else if task.task == "reseedmariabackup" { - cluster.SetState("WARN0074", state.State{ErrType: "WARNING", ErrDesc: fmt.Sprintf(cluster.GetErrorList()["WARN0074"], cluster.Conf.BackupPhysicalType, server.URL), ErrFrom: "JOB", ServerUrl: server.URL}) + cluster.SetState("WARN0074", state.State{ErrType: "WARNING", ErrDesc: fmt.Sprintf(clusterError["WARN0074"], cluster.Conf.BackupPhysicalType), ErrFrom: "JOB", ServerUrl: server.URL}) } else if task.task == "reseedmysqldump" { - cluster.SetState("WARN0075", state.State{ErrType: "WARNING", ErrDesc: fmt.Sprintf(cluster.GetErrorList()["WARN0075"], cluster.Conf.BackupLogicalType, server.URL), ErrFrom: "JOB", ServerUrl: server.URL}) + cluster.SetState("WARN0075", state.State{ErrType: "WARNING", ErrDesc: fmt.Sprintf(clusterError["WARN0075"], cluster.Conf.BackupLogicalType), ErrFrom: "JOB", ServerUrl: server.URL}) } else if task.task == "reseedmydumper" { - cluster.SetState("WARN0075", state.State{ErrType: "WARNING", ErrDesc: fmt.Sprintf(cluster.GetErrorList()["WARN0075"], cluster.Conf.BackupLogicalType, server.URL), ErrFrom: "JOB", ServerUrl: server.URL}) + cluster.SetState("WARN0075", state.State{ErrType: "WARNING", ErrDesc: fmt.Sprintf(clusterError["WARN0075"], cluster.Conf.BackupLogicalType), ErrFrom: "JOB", ServerUrl: server.URL}) } else if task.task == "flashbackxtrabackup" { - cluster.SetState("WARN0076", state.State{ErrType: "WARNING", ErrDesc: fmt.Sprintf(cluster.GetErrorList()["WARN0076"], cluster.Conf.BackupPhysicalType, server.URL), ErrFrom: "JOB", ServerUrl: server.URL}) + cluster.SetState("WARN0076", state.State{ErrType: "WARNING", ErrDesc: fmt.Sprintf(clusterError["WARN0076"], cluster.Conf.BackupPhysicalType), ErrFrom: "JOB", ServerUrl: server.URL}) } else if task.task == "flashbackmariabackup" { - cluster.SetState("WARN0076", state.State{ErrType: "WARNING", ErrDesc: fmt.Sprintf(cluster.GetErrorList()["WARN0076"], cluster.Conf.BackupPhysicalType, server.URL), ErrFrom: "JOB", ServerUrl: server.URL}) + cluster.SetState("WARN0076", state.State{ErrType: "WARNING", ErrDesc: fmt.Sprintf(clusterError["WARN0076"], cluster.Conf.BackupPhysicalType), ErrFrom: "JOB", ServerUrl: server.URL}) } else if task.task == "flashbackmydumper" { - cluster.SetState("WARN0077", state.State{ErrType: "WARNING", ErrDesc: fmt.Sprintf(cluster.GetErrorList()["WARN0077"], cluster.Conf.BackupLogicalType, server.URL), ErrFrom: "JOB", ServerUrl: server.URL}) + cluster.SetState("WARN0077", state.State{ErrType: "WARNING", ErrDesc: fmt.Sprintf(clusterError["WARN0077"], cluster.Conf.BackupLogicalType), ErrFrom: "JOB", ServerUrl: server.URL}) } else if task.task == "flashbackmysqldump" { - cluster.SetState("WARN0077", state.State{ErrType: "WARNING", ErrDesc: fmt.Sprintf(cluster.GetErrorList()["WARN0077"], cluster.Conf.BackupLogicalType, server.URL), ErrFrom: "JOB", ServerUrl: server.URL}) + cluster.SetState("WARN0077", state.State{ErrType: "WARNING", ErrDesc: fmt.Sprintf(clusterError["WARN0077"], cluster.Conf.BackupLogicalType), ErrFrom: "JOB", ServerUrl: server.URL}) } else { //Skip adding to active task if not defined continue @@ -1059,7 +1059,7 @@ func (server *ServerMonitor) JobBackupLogical() error { //Wait for previous restic backup if cluster.IsInBackup() && cluster.Conf.BackupRestic { - cluster.SetState("WARN0110", state.State{ErrType: "WARNING", ErrDesc: fmt.Sprintf(cluster.GetErrorList()["WARN0110"], "Logical", 
cluster.Conf.BackupLogicalType, server.URL), ErrFrom: "JOB", ServerUrl: server.URL}) + cluster.SetState("WARN0110", state.State{ErrType: "WARNING", ErrDesc: fmt.Sprintf(clusterError["WARN0110"], "Logical", cluster.Conf.BackupLogicalType), ErrFrom: "JOB", ServerUrl: server.URL}) time.Sleep(1 * time.Second) return server.JobBackupLogical() @@ -1313,7 +1313,7 @@ func (server *ServerMonitor) JobBackupBinlog(binlogfile string, isPurge bool) er //Skip setting in backup state due to batch purging if !isPurge { if cluster.IsInBackup() && cluster.Conf.BackupRestic { - cluster.SetState("WARN0110", state.State{ErrType: "WARNING", ErrDesc: fmt.Sprintf(cluster.GetErrorList()["WARN0110"], "Binary Log", cluster.Conf.BinlogCopyMode, server.URL), ErrFrom: "JOB", ServerUrl: server.URL}) + cluster.SetState("WARN0110", state.State{ErrType: "WARNING", ErrDesc: fmt.Sprintf(clusterError["WARN0110"], "Binary Log", cluster.Conf.BinlogCopyMode), ErrFrom: "JOB", ServerUrl: server.URL}) time.Sleep(1 * time.Second) return server.JobBackupBinlog(binlogfile, isPurge) @@ -1362,7 +1362,7 @@ func (server *ServerMonitor) JobBackupBinlogPurge(binlogfile string) error { } if cluster.IsInBackup() && cluster.Conf.BackupRestic { - cluster.SetState("WARN0110", state.State{ErrType: "WARNING", ErrDesc: fmt.Sprintf(cluster.GetErrorList()["WARN0110"], "Binary Log", cluster.Conf.BinlogCopyMode, server.URL), ErrFrom: "JOB", ServerUrl: server.URL}) + cluster.SetState("WARN0110", state.State{ErrType: "WARNING", ErrDesc: fmt.Sprintf(clusterError["WARN0110"], "Binary Log", cluster.Conf.BinlogCopyMode), ErrFrom: "JOB", ServerUrl: server.URL}) time.Sleep(1 * time.Second) return server.JobBackupBinlogPurge(binlogfile) @@ -1550,7 +1550,7 @@ func (server *ServerMonitor) JobBackupBinlogSSH(binlogfile string, isPurge bool) //Skip setting in backup state due to batch purging if !isPurge { if cluster.IsInBackup() && cluster.Conf.BackupRestic { - cluster.SetState("WARN0110", state.State{ErrType: "WARNING", ErrDesc: fmt.Sprintf(cluster.GetErrorList()["WARN0110"], "Binary Log", cluster.Conf.BinlogCopyMode, server.URL), ErrFrom: "JOB", ServerUrl: server.URL}) + cluster.SetState("WARN0110", state.State{ErrType: "WARNING", ErrDesc: fmt.Sprintf(clusterError["WARN0110"], "Binary Log", cluster.Conf.BinlogCopyMode), ErrFrom: "JOB", ServerUrl: server.URL}) time.Sleep(1 * time.Second) return server.JobBackupBinlogSSH(binlogfile, isPurge) diff --git a/cluster/srv_mdev.go b/cluster/srv_mdev.go new file mode 100644 index 000000000..a5c16a26e --- /dev/null +++ b/cluster/srv_mdev.go @@ -0,0 +1,114 @@ +// replication-manager - Replication Manager Monitoring and CLI for MariaDB and MySQL +// Copyright 2017-2021 SIGNAL18 CLOUD SAS +// Authors: Guillaume Lefranc +// +// Stephane Varoqui +// +// This source code is licensed under the GNU General Public License, version 3. +// Redistribution/Reuse of this code is permitted under the GNU v3 license, as +// an additional term, ALL code must carry the original Author(s) credit in comment form. +// See LICENSE in this directory for the integral text. 
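The new cluster/srv_mdev.go below fills the per-server MDevIssues lists: CheckMDevIssues runs once per server that is neither suspect nor failed (guarded by IsCheckedForMDevIssues in CheckVersion) and walks the cluster-wide MDevIssueMap through Callback, while SearchMDevIssue matches each config.MDevIssue against the server version, binlog settings and issue components, appends the MDEV key to the ServerBug Replication or Service list, and raises a state named after the issue with the dash removed (e.g. MDEV28310). A small illustration of how the recorded ServerBug value, defined in the srv.go hunk above, is consumed later; the literal contents here are hypothetical:

	bug := ServerBug{
		Replication: []string{"MDEV-28310"}, // e.g. recorded for MIXED binlog_format on an affected release
		Service:     []string{},
	}
	if bug.HasMdevBug("MDEV-28310") {
		// CheckBlockerState sees this entry and reports the replica as not
		// electable while failover-check-blocker is enabled.
	}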
+package cluster
+
+import (
+	"fmt"
+	"strings"
+
+	"github.com/signal18/replication-manager/config"
+	"github.com/signal18/replication-manager/utils/state"
+)
+
+func (server *ServerMonitor) CheckMDevIssues() {
+
+	cluster := server.ClusterGroup
+
+	if server.MDevIssues.Replication == nil {
+		server.MDevIssues.Replication = make([]string, 0)
+		server.MDevIssues.Service = make([]string, 0)
+	}
+
+	if !server.IsSuspect() && !server.IsFailed() {
+		chkf := func(key string, issue *config.MDevIssue) bool {
+			server.SearchMDevIssue(issue)
+
+			// Always true
+			return true
+		}
+		cluster.MDevIssues.Callback(chkf)
+		server.IsCheckedForMDevIssues = true
+	}
+}
+
+func (server *ServerMonitor) SearchMDevIssue(issue *config.MDevIssue) bool {
+	var hasIssue bool
+	cluster := server.ClusterGroup
+	ver := server.DBVersion
+	strState := strings.Replace(issue.Key, "-", "", 1)
+	mdstate := state.State{
+		ErrType:   "WARNING",
+		ErrFrom:   "MDEV",
+		ErrDesc:   fmt.Sprintf(config.BugString, strings.Join(issue.Components, ","), issue.GetURL()),
+		ServerUrl: server.URL,
+	}
+	// Will also check unresolved cases
+	if ver.GreaterEqualReleaseList(issue.Versions...) && (issue.Status == "Unresolved" || ver.LowerReleaseList(issue.FixVersions...)) {
+		found := false
+		isReplication := false
+		isGalera := false
+		isSpider := false
+		for _, c := range issue.Components {
+			if c == "Replication" {
+				isReplication = true
+			}
+			if c == "Storage Engine - Spider" {
+				isSpider = true
+			}
+			if strings.Contains(c, "Galera") {
+				isGalera = true
+			}
+		}
+
+		// Replication Bug
+		if isReplication {
+			if isSpider && cluster.Conf.Spider {
+				found = true
+			} else if isGalera && server.HaveWsrep {
+				found = true
+			} else {
+				switch issue.Key {
+				case "MDEV-27512":
+					if server.Variables.Get(strings.ToUpper("slave_skip_errors")) == "ALL" {
+						found = true
+					}
+				case "MDEV-28310":
+					if server.HasBinlogMixed() {
+						found = true
+					}
+				default:
+					found = true
+				}
+			}
+			// Append to slices
+			if found {
+				server.MDevIssues.Replication = append(server.MDevIssues.Replication, issue.Key)
+			}
+		} else {
+			// Critical Area (Can affect service due to locking/crash)
+			switch issue.Key {
+			default:
+				found = true
+			}
+			// Append to slices
+			if found {
+				server.MDevIssues.Service = append(server.MDevIssues.Service, issue.Key)
+			}
+		}
+
+		// Record the match and set the state for the server
+		if hasIssue = found; found {
+			cluster.SetState(strState, mdstate)
+		}
+	}
+
+	return hasIssue
+}
diff --git a/cluster/srv_rejoin.go b/cluster/srv_rejoin.go
index d8165c176..ab7c5c5c2 100644
--- a/cluster/srv_rejoin.go
+++ b/cluster/srv_rejoin.go
@@ -63,7 +63,7 @@ func (server *ServerMonitor) RejoinMaster() error {

 	if cluster.master != nil {
 		if server.URL != cluster.master.URL {
-			cluster.SetState("WARN0022", state.State{ErrType: "WARNING", ErrDesc: fmt.Sprintf(clusterError["WARN0022"], server.URL, cluster.master.URL), ErrFrom: "REJOIN"})
+			cluster.SetState("WARN0022", state.State{ErrType: "WARNING", ErrDesc: fmt.Sprintf(clusterError["WARN0022"], cluster.master.URL), ErrFrom: "REJOIN", ServerUrl: server.URL})
 			server.RejoinScript()
 			if cluster.Conf.MultiMasterGrouprep {
 				cluster.LogModulePrintf(cluster.Conf.Verbose, config.ConstLogModGeneral, "INFO", "Group replication rejoin %s server to PRIMARY ", server.URL)
@@ -77,7 +77,7 @@
 		}
 		crash := cluster.getCrashFromJoiner(server.URL)
 		if crash == nil {
-
cluster.SetState("ERR00066", state.State{ErrType: "ERROR", ErrDesc: fmt.Sprintf(clusterError["ERR00066"], cluster.master.URL), ErrFrom: "REJOIN", ServerUrl: server.URL}) if cluster.oldMaster != nil { if cluster.oldMaster.URL == server.URL { server.RejoinMasterSST() @@ -519,7 +519,7 @@ func (server *ServerMonitor) rejoinSlave(ss dbhelper.SlaveStatus) error { if server.HasGTIDReplication() { crash := cluster.getCrashFromMaster(cluster.master.URL) if crash == nil { - cluster.SetState("ERR00065", state.State{ErrType: "ERROR", ErrDesc: fmt.Sprintf(clusterError["ERR00065"], server.URL, cluster.master.URL), ErrFrom: "REJOIN"}) + cluster.SetState("ERR00065", state.State{ErrType: "ERROR", ErrDesc: fmt.Sprintf(clusterError["ERR00065"], cluster.master.URL), ErrFrom: "REJOIN", ServerUrl: server.URL}) return errors.New("No Crash info on current master") } cluster.LogModulePrintf(cluster.Conf.Verbose, config.ConstLogModGeneral, "INFO", "Crash info on current master %s", crash) diff --git a/config/config.go b/config/config.go index 629f785c9..b3e6e0a68 100644 --- a/config/config.go +++ b/config/config.go @@ -203,6 +203,7 @@ type Config struct { CheckFalsePositiveExternal bool `mapstructure:"failover-falsepositive-external" toml:"failover-falsepositive-external" json:"failoverFalsePositiveExternal"` CheckFalsePositiveExternalPort int `mapstructure:"failover-falsepositive-external-port" toml:"failover-falsepositive-external-port" json:"failoverFalsePositiveExternalPort"` FailoverLogFileKeep int `mapstructure:"failover-log-file-keep" toml:"failover-log-file-keep" json:"failoverLogFileKeep"` + FailoverCheckBlocker bool `mapstructure:"failover-check-blocker" toml:"failover-check-blocker" json:"failoverCheckBlocker"` FailoverSwitchToPrefered bool `mapstructure:"failover-switch-to-prefered" toml:"failover-switch-to-prefered" json:"failoverSwithToPrefered"` DelayStatCapture bool `mapstructure:"delay-stat-capture" toml:"delay-stat-capture" json:"delayStatCapture"` PrintDelayStat bool `mapstructure:"print-delay-stat" toml:"print-delay-stat" json:"printDelayStat"` diff --git a/config/error.go b/config/error.go index 48f4df580..31e6eff45 100644 --- a/config/error.go +++ b/config/error.go @@ -15,14 +15,14 @@ var ClusterError = map[string]string{ "ERR00002": "Waiting for a user manual failover", "ERR00004": "Database %s access denied: %s", "ERR00005": "Could not get privileges for user %s@%s: %s", - "ERR00006": "User must have REPLICATION CLIENT privilege on %s", - "ERR00007": "User must have REPLICATION SLAVE privilege on %s", - "ERR00008": "User must have SUPER privilege %s", - "ERR00009": "User must have RELOAD privilege %s", + "ERR00006": "User must have REPLICATION CLIENT privilege ", + "ERR00007": "User must have REPLICATION SLAVE privilege ", + "ERR00008": "User must have SUPER privilege", + "ERR00009": "User must have RELOAD privilege", "ERR00010": "Could not find a slave in topology", "ERR00011": "Found multiple masters in topology but not explicitely setup", "ERR00012": "Could not find a master in topology", - "ERR00013": "Binary log disabled on slave: %s", + "ERR00013": "Binary log disabled on slave", "ERR00014": "Could not get binlog dump count on server %s: %s", "ERR00015": "Could not get privileges for user %s on server %s: %s", "ERR00016": "Master is unreachable but slaves are replicating", @@ -37,26 +37,26 @@ var ClusterError = map[string]string{ "ERR00025": "Could not get MaxScale maxinfo server list: %s", "ERR00026": "First node restarted is a slave, non-interactive mode", "ERR00027": "Number of 
-	"ERR00028": "Slave %s can still communicate with the master",
+	"ERR00028": "Slave can still communicate with the master",
 	"ERR00029": "Time between failovers too short",
 	"ERR00030": "Maxscale %s can still communicate with the master",
 	"ERR00031": "External API can still communicate with the master",
 	"ERR00032": "No candidates found in slaves list",
-	"ERR00033": "Skip slave in election %s have no master log file, slave might have failed",
-	"ERR00034": "Skip slave in election %s repl not electable for switchover",
-	"ERR00035": "Skip slave in election %s multi-master and is already the master",
-	"ERR00036": "Skip slave in election %s is relay",
-	"ERR00037": "Skip slave in election %s in ignore list",
-	"ERR00038": "Skip slave in election %s repl not electable for failover",
-	"ERR00039": "Skip slave in election %s repl not electable",
-	"ERR00040": "Skip slave in election %s does not ping or has no binlogs",
+	"ERR00033": "Skip slave in election - has no master log file, slave might have failed",
+	"ERR00034": "Skip slave in election - repl not electable for switchover",
+	"ERR00035": "Skip slave in election - multi-master and is already the master",
+	"ERR00036": "Skip slave in election - is relay",
+	"ERR00037": "Skip slave in election - in ignore list",
+	"ERR00038": "Skip slave in election - repl not electable for failover",
+	"ERR00039": "Skip slave in election - repl not electable",
+	"ERR00040": "Skip slave in election - does not ping or has no binlogs",
 	"ERR00041": "Skip slave in election %s has more than %d seconds of replication delay (%d)",
-	"ERR00042": "Skip slave in election %s SQL Thread is stopped",
-	"ERR00043": "Skip slave in election %s Semisync report unsynced",
+	"ERR00042": "Skip slave in election - SQL Thread is stopped",
+	"ERR00043": "Skip slave in election - Semisync report unsynced",
 	"ERR00044": "Can't connect to OpenSVC collector %s",
 	"ERR00045": "Found forbidden relay topology, trying to fix",
 	"ERR00046": "Can't fix relay topology: high replication delay",
-	"ERR00047": "Skip slave in election %s - Maintenance mode",
+	"ERR00047": "Skip slave in election - Maintenance mode",
 	"ERR00048": "Broken muti master ring",
 	"ERR00049": "Waiting old master to rejoin in positional mode to rejoin slave",
 	"ERR00050": "Can't connect to proxy %s",
@@ -69,13 +69,13 @@ var ClusterError = map[string]string{
 	"ERR00057": "Database duplicate users not allowed in proxysql %s",
 	"ERR00058": "Sphinx connection error: %s",
 	"ERR00059": "Ignored server %s not found in configured server list",
-	"ERR00060": "To many non closed task in scheduler, donor may not work on server %s",
+	"ERR00060": "Too many non-closed tasks in scheduler, donor may not work on server",
 	"ERR00061": "No user:password credential specified",
 	"ERR00062": "DNS resolution for host %s error %s",
 	"ERR00063": "Extra master in master slave topology rejoin it after split brain",
 	"ERR00064": "Server %s is not a slave of declared master %s, and replication no relay is enable: Pointing to %s",
-	"ERR00065": "No crash found on current master when rejoining slave %s to %s",
-	"ERR00066": "No crash found on current master when rejoining standalone %s to %s",
+	"ERR00065": "No crash found on current master when rejoining slave to %s",
+	"ERR00066": "No crash found on current master when rejoining standalone to %s",
 	"ERR00067": "Found slave to rejoin %s slave was previously in state %s replication io thread %s, pointing currently to %s",
 	"ERR00068": "Arbitration looser",
 	"ERR00069": "ProxySQL could not set %s as reader (%s) different state OFFLINE_HARD",
@@ -88,54 +88,54 @@ var ClusterError = map[string]string{
 	"ERR00076": "Connections reach 80 pourcent threshold: %s",
 	"ERR00077": "All databases state down",
 	"ERR00078": "Could not resolve IP from connection %s@%s: with hostname %s on server %s",
-	"ERR00079": "Disk %s usage high on %s",
-	"ERR00080": "Connection use old TLS keys on %s",
-	"ERR00081": "Connection use no TLS keys on %s",
+	"ERR00079": "Disk %s usage high",
+	"ERR00080": "Connection use old TLS keys",
+	"ERR00081": "Connection use no TLS keys",
 	"ERR00082": "Could not get agents from orchestrator %s",
 	"ERR00083": "Different cluster uuid found on %s:%s %s:%s",
-	"ERR00084": "Cluster have no master when slave %s was started",
+	"ERR00084": "Cluster has no master when slave was started",
 	"ERR00085": "No replica found for routing reads",
 	"ERR00086": "Sharding proxy refresh no database monitor yet initialize",
-	"ERR00087": "Skip slave in election %s IO Thread is stopped with valid leader",
+	"ERR00087": "Skip slave in election - IO Thread is stopped with valid leader",
 	"ERR00088": "Authentification error in replication IO thread",
 	"ERR00089": "Authentification error to Vault %s",
 	"ERR00090": "Monitoring save config enable but no encryption key for password, see the keygen command",
 	"ERR00091": "Proxysql %s found active server %s as OFFLINE_SOFT in ProxySQL.",
 	"ERR00092": "Cluster [%s] topology (%s) is not same with target topology (%s).",
-	"WARN0022": "Rejoining standalone server %s to master %s",
+	"WARN0022": "Rejoining standalone server to master %s",
 	"WARN0023": "Number of failed master ping has been reached",
 	"WARN0045": "Provision task is in queue",
 	"WARN0046": "Provision task is waiting",
 	"WARN0047": "Entreprise provision of MariaDB Sharding Cluster not yet implemented",
-	"WARN0048": "No semisync settings on slave %s",
-	"WARN0049": "No binlog format ROW on slave %s and flashback activated",
-	"WARN0050": "No Heartbeat <= 1s on slave %s",
-	"WARN0051": "No GTID replication on slave %s",
-	"WARN0052": "No InnoDB durability on slave %s",
-	"WARN0053": "No replication checksum on slave %s",
-	"WARN0054": "No log of replication queries in slow query on slave %s",
-	"WARN0055": "ROW or MIXED binlog format and replicate_annotate_row_events is off on slave %s",
-	"WARN0056": "No compression of binlog on slave %s",
-	"WARN0057": "No log-slave-updates on slave %s",
-	"WARN0058": "No GTID strict mode on slave %s",
-	"WARN0059": "No replication crash-safe settings on slave %s",
-	"WARN0060": "No semisync settings on master %s",
-	"WARN0061": "No binlog format ROW on master %s and flashback activated",
-	"WARN0062": "No Heartbeat <= 1s on master %s",
-	"WARN0064": "No InnoDB durability on master %s",
-	"WARN0065": "No replication checksum on master %s",
-	"WARN0066": "No log of replication queries in slow query on master %s",
-	"WARN0067": "RBR is on and Binlog Annotation is off on master %s",
-	"WARN0068": "No compression of binlog on slave %s",
-	"WARN0069": "No log-slave-updates on master %s",
-	"WARN0070": "No GTID strict mode on master %s",
-	"WARN0071": "No replication crash-safe settings on master %s",
-	"WARN0072": "Running optimize table %s on server %s",
-	"WARN0073": "Running physical backup %s on server %s",
-	"WARN0074": "Reseeding physical backup %s on server %s",
-	"WARN0075": "Reseeding logical backup %s on server %s",
-	"WARN0076": "Flashback physical backup %s on server %s",
-	"WARN0077": "Flashback logical backup %s on server %s",
+	"WARN0048": "No semisync settings on slave",
+	"WARN0049": "No binlog format ROW on slave and flashback activated",
+	"WARN0050": "No Heartbeat <= 1s on slave",
+	"WARN0051": "No GTID replication on slave",
+	"WARN0052": "No InnoDB durability on slave",
+	"WARN0053": "No replication checksum on slave",
+	"WARN0054": "No log of replication queries in slow query on slave",
+	"WARN0055": "ROW or MIXED binlog format and replicate_annotate_row_events is off on slave",
+	"WARN0056": "No compression of binlog on slave",
+	"WARN0057": "No log-slave-updates on slave",
+	"WARN0058": "No GTID strict mode on slave",
+	"WARN0059": "No replication crash-safe settings on slave",
+	"WARN0060": "No semisync settings on master",
+	"WARN0061": "No binlog format ROW on master and flashback activated",
+	"WARN0062": "No Heartbeat <= 1s on master",
+	"WARN0064": "No InnoDB durability on master",
+	"WARN0065": "No replication checksum on master",
+	"WARN0066": "No log of replication queries in slow query on master",
+	"WARN0067": "RBR is on and Binlog Annotation is off on master",
+	"WARN0068": "No compression of binlog on slave",
+	"WARN0069": "No log-slave-updates on master",
+	"WARN0070": "No GTID strict mode on master",
+	"WARN0071": "No replication crash-safe settings on master",
+	"WARN0072": "Running optimize table %s on server",
+	"WARN0073": "Running physical backup %s on server",
+	"WARN0074": "Reseeding physical backup %s on server",
+	"WARN0075": "Reseeding logical backup %s on server",
+	"WARN0076": "Flashback physical backup %s on server",
+	"WARN0077": "Flashback logical backup %s on server",
 	"WARN0078": "Haproxy version to old to get statistics",
 	"WARN0079": "Cluster is split brain",
 	"WARN0080": "Cluster lost majority",
@@ -143,8 +143,8 @@ var ClusterError = map[string]string{
 	"WARN0082": "Cluster arbitrator error in arbitration %s",
 	"WARN0083": "Arbitration winner",
 	"WARN0084": "Variable diff:\n %s",
-	"WARN0085": "Server %s in capture mode",
-	"WARN0086": "Checksum table waiting replication sync on slave %s",
+	"WARN0085": "Server in capture mode",
+	"WARN0086": "Checksum table waiting replication sync on slave",
 	"WARN0087": "Cluster same server_id %s %s",
 	"WARN0088": "High number of slow queries %s ",
 	"WARN0089": "ShardProxy Could not fetch master schemas %s",
@@ -157,17 +157,18 @@ var ClusterError = map[string]string{
 	"WARN0096": "Restart database server via job request %s",
 	"WARN0097": "Stop database server via job request %s",
 	"WARN0098": "ProxySQL could not load global variables from runtime (%s)",
+	"WARN0099": "MariaDB version has replication issue https://jira.mariadb.org/browse/MDEV-20821",
 	"WARN0100": "No space left on device pn %s",
 	"WARN0101": "Cluster does not have backup",
 	"WARN0102": "The config file must be merge because an immutable parameter has been changed. Use the config-merge command to save your changes.",
-	"WARN0103": "Enforce replication mode idempotent but strict on server %s",
-	"WARN0104": "Enforce replication mode strict but idempotent on server %s",
+	"WARN0103": "Enforce replication mode idempotent but strict on server",
+	"WARN0104": "Enforce replication mode strict but idempotent on server",
 	"WARN0105": "Force Binlog Purge is not yet available in multi master. Skipping",
 	"WARN0106": "Minimum number of connected replica(s) is not enough to initiate force purging. Minimum replicas: %d. Skipping",
 	"WARN0107": "Force Binlog Purge can not continue. Oldest binlog is still used by slaves: %s.%d. Skipping",
 	"WARN0108": "Default users still use default password. Please change the credentials for users: (%s)",
 	"WARN0109": "Error while checking master log file for purging in slave [%s] (%s) : %s\n",
-	"WARN0110": "Pending %s backup using %s for [%s] due to another job. Waiting...",
+	"WARN0110": "Pending %s backup using %s due to another job. Waiting...",
 	"WARN0111": "Cluster does not have logical backup",
 	"WARN0112": "Cluster does not have physical backup",
 	"MDEV20821": "MariaDB version has replication issue https://jira.mariadb.org/browse/MDEV-20821",
diff --git a/config/mdev.go b/config/mdev.go
new file mode 100644
index 000000000..047313266
--- /dev/null
+++ b/config/mdev.go
@@ -0,0 +1,400 @@
+package config
+
+import (
+	"encoding/csv"
+	"encoding/json"
+	"io"
+	"os"
+	"sync"
+	"time"
+
+	"github.com/signal18/replication-manager/share"
+	log "github.com/sirupsen/logrus"
+)
+
+var JiraURL string = "https://jira.mariadb.org/browse/"
+var BugString string = "This MariaDB version has a bug in [%s], source: (%s)"
+
+const labelKey string = "Issue key"
+const labelStatus string = "Status"
+const labelUpdated string = "Updated"
+const labelComp string = "Component/s"
+const labelVersionAffected string = "Affects Version/s"
+const labelVersionFixed string = "Fix Version/s"
+
+type MDevIssue struct {
+	Key string `json:"key"`
+	Status string `json:"status"`
+	Updated int64 `json:"updated"`
+	Components []string `json:"components"`
+	Versions []string `json:"versions"`
+	FixVersions []string `json:"fixVersions"`
+}
+
+type MDevIssueList map[string]MDevIssue
+
+type IndexRange struct {
+	Min int
+	Max int
+}
+
+type MDevIssueExists struct {
+	Key bool
+	Status bool
+	Updated bool
+	Components bool
+	Versions bool
+	FixVersions bool
+}
+
+type MDevIssueIndex struct {
+	Key int
+	Status int
+	Updated int
+	Components IndexRange
+	Versions IndexRange
+	FixVersions IndexRange
+	Found MDevIssueExists
+}
+
+func (issue *MDevIssue) GetURL() string {
+	return JiraURL + issue.Key
+}
+
+func (idx *MDevIssueIndex) parseHeader(line []string) {
+	prev := ""
+	for i, v := range line {
+		switch v {
+		case labelKey:
+			idx.Key = i
+			idx.Found.Key = true
+		case labelStatus:
+			idx.Status = i
+			idx.Found.Status = true
+		case labelUpdated:
+			idx.Updated = i
+			idx.Found.Updated = true
+		case labelComp:
+			if prev != labelComp {
+				idx.Components.Min = i
+				prev = labelComp
+			}
+			idx.Components.Max = i
+			idx.Found.Components = true
+		case labelVersionAffected:
+			if prev != labelVersionAffected {
+				idx.Versions.Min = i
+				prev = labelVersionAffected
+			}
+			idx.Versions.Max = i
+			idx.Found.Versions = true
+		case labelVersionFixed:
+			if prev != labelVersionFixed {
+				idx.FixVersions.Min = i
+				prev = labelVersionFixed
+			}
+			idx.FixVersions.Max = i
+			idx.Found.FixVersions = true
+		}
+	}
+}
+
+func (issue *MDevIssue) parseContent(line []string, idx *MDevIssueIndex) error {
+	issue.Key = line[idx.Key]
+	issue.Status = line[idx.Status]
+
+	comps := make([]string, 0)
+	for i := idx.Components.Min; i <= idx.Components.Max; i++ {
+		if len(line[i]) > 0 {
+			comps = append(comps, line[i])
+		}
+	}
+	issue.Components = comps
+
+	vers := make([]string, 0)
+	for i := idx.Versions.Min; i <= idx.Versions.Max; i++ {
+		if len(line[i]) > 0 {
+			vers = append(vers, line[i])
+		}
+	}
+	issue.Versions = vers
+
+	fixs := make([]string, 0)
+	for i := idx.FixVersions.Min; i <= idx.FixVersions.Max; i++ {
+		if len(line[i]) > 0 {
+			fixs = append(fixs, line[i])
+		}
+	}
+	issue.FixVersions = fixs
+
+	if u, err := time.Parse("2006-01-02 15:04", line[idx.Updated]); err == nil {
+		issue.Updated = u.Unix()
+	} else {
+		log.WithFields(log.Fields{"cluster": "none", "module": "mdev"}).Errorf("[MDEV-Parser] Error parsing content: %s", err.Error())
+		return err
+	}
+
+	return nil
+}
+
+type MDevIssueMap struct {
+	*sync.Map
+}
+
+func NewMDevIssueMap() *MDevIssueMap {
+	s := new(sync.Map)
+	m := &MDevIssueMap{Map: s}
+	return m
+}
+
+func (m *MDevIssueMap) Get(key string) *MDevIssue {
+	if v, ok := m.Load(key); ok {
+		return v.(*MDevIssue)
+	}
+	return nil
+}
+
+func (m *MDevIssueMap) CheckAndGet(key string) (*MDevIssue, bool) {
+	v, ok := m.Load(key)
+	if ok {
+		return v.(*MDevIssue), true
+	}
+	return nil, false
+}
+
+func (m *MDevIssueMap) Set(key string, value *MDevIssue) {
+	m.Store(key, value)
+}
+
+func (m *MDevIssueMap) ToNormalMap(c map[string]*MDevIssue) {
+	// Clear the old values in the output map
+	for k := range c {
+		delete(c, k)
+	}
+
+	// Insert all values from the MDevIssueMap to the output map
+	m.Callback(func(key string, value *MDevIssue) bool {
+		c[key] = value
+		return true
+	})
+}
+
+func (m *MDevIssueMap) ToNewMap() map[string]*MDevIssue {
+	result := make(map[string]*MDevIssue)
+	m.Range(func(k, v any) bool {
+		result[k.(string)] = v.(*MDevIssue)
+		return true
+	})
+	return result
+}
+
+func (m *MDevIssueMap) Callback(f func(key string, value *MDevIssue) bool) {
+	m.Range(func(k, v any) bool {
+		return f(k.(string), v.(*MDevIssue))
+	})
+}
+
+func (m *MDevIssueMap) Clear() {
+	m.Range(func(key, value any) bool {
+		m.Delete(key.(string))
+		return true
+	})
+}
+
+func FromNormalMDevIssueMap(m *MDevIssueMap, c map[string]*MDevIssue) *MDevIssueMap {
+	if m == nil {
+		m = NewMDevIssueMap()
+	} else {
+		m.Clear()
+	}
+
+	for k, v := range c {
+		m.Set(k, v)
+	}
+
+	return m
+}
+
+func FromMDevIssueMap(m *MDevIssueMap, c *MDevIssueMap) *MDevIssueMap {
+	if m == nil {
+		m = NewMDevIssueMap()
+	} else {
+		m.Clear()
+	}
+
+	if c != nil {
+		c.Callback(func(key string, value *MDevIssue) bool {
+			m.Set(key, value)
+			return true
+		})
+	}
+
+	return m
+}
+
+func (m *MDevIssueMap) MDevParseCSV(filename string, replace bool) error {
+	log.WithFields(log.Fields{"cluster": "none", "module": "mdev"}).Infof("[MDEV-Parser] Opening csv in shared repo dir : %s", filename)
+
+	file, err := os.Open(filename)
+	if err != nil {
+		log.WithFields(log.Fields{"cluster": "none", "module": "mdev"}).Errorf("[MDEV-Parser] failed to open csv in shared repo dir : %s", err.Error())
+		return err
+	}
+	defer file.Close()
+
+	csvr := csv.NewReader(file)
+	header := true
+	idx := new(MDevIssueIndex)
+
+	csvr.Comma = ';'
+
+	for {
+		line, err := csvr.Read()
+		if err != nil {
+			if err != io.EOF {
+				log.Error(err)
+			}
+			break
+		}
+		ln, _ := csvr.FieldPos(0)
+
+		if header {
+			// Parse header
+			idx.parseHeader(line)
+			log.WithFields(log.Fields{"cluster": "none", "module": "mdev"}).Infof("[MDEV-Parser] Indexes : %v", idx)
+			header = false
+		} else {
+			// Parse content
+			issue := new(MDevIssue)
+			if err = issue.parseContent(line, idx); err == nil {
+				if replace {
+					m.Store(issue.Key, issue)
+				} else {
+					m.LoadOrStore(issue.Key, issue)
+				}
+				if log.GetLevel() == log.DebugLevel {
+					jsline, _ := json.MarshalIndent(issue, "", "\t")
+					log.WithFields(log.Fields{"cluster": "none", "module": "mdev"}).Debugf("[MDEV-Parser] Line:%d source:(%v)", ln, line)
+					log.WithFields(log.Fields{"cluster": "none", "module": "mdev"}).Debugf("[MDEV-Parser] Line:%d result:(%s)", ln, jsline)
+				}
+			} else {
+				log.WithFields(log.Fields{"cluster": "none", "module": "mdev"}).Errorf("[MDEV-Parser] Skip line number: %d", ln)
+			}
+		}
+	}
+
+	return nil
+}
+
+func (m *MDevIssueMap) MDevWriteJSONFile(filename string) error {
+	file, err := os.Create(filename)
+	if err != nil {
+		log.WithFields(log.Fields{"cluster": "none", "module": "mdev"}).Errorf("[MDEV-Parser] failed to create JSON file : %s", err.Error())
+		return err
+	}
+	defer file.Close()
+
+	enc := json.NewEncoder(file)
+	enc.SetIndent("", "\t")
+	err = enc.Encode(m.ToNewMap())
+	if err != nil {
+		log.WithFields(log.Fields{"cluster": "none", "module": "mdev"}).Errorf("[MDEV-Parser] failed to encode JSON file : %s", err.Error())
+		return err
+	}
+
+	return nil
+}
+
+func (m *MDevIssueMap) MDevLoadJSONFile(filename string) error {
+	var err error
+	var content []byte = make([]byte, 0)
+
+	log.WithFields(log.Fields{"cluster": "none", "module": "mdev"}).Infof("[MDEV-Parser] Loading JSON MDev file at %s", filename)
+	content, err = os.ReadFile(filename)
+	if err != nil {
+		if os.IsNotExist(err) {
+			log.WithFields(log.Fields{"cluster": "none", "module": "mdev"}).Errorf("[MDEV-Parser] No JSON file found. Initializing empty JSON file: %s", filename)
+			var file *os.File
+			file, err = os.Create(filename)
+			file.Close()
+		}
+		if err != nil {
+			log.WithFields(log.Fields{"cluster": "none", "module": "mdev"}).Errorf("[MDEV-Parser] failed to open json : %s", err.Error())
+			return err
+		}
+	}
+	tmp := make(map[string]MDevIssue)
+	if len(content) > 0 {
+		err = json.Unmarshal(content, &tmp)
+		if err != nil {
+			log.WithFields(log.Fields{"cluster": "none", "module": "mdev"}).Errorf("[MDEV-Parser] failed to parse JSON : %s", err.Error())
+			return err
+		}
+
+		for k, v := range tmp {
+			// Copy the loop variable so each stored pointer refers to a distinct issue
+			issue := v
+			m.Store(k, &issue)
+		}
+	} else {
+		log.WithFields(log.Fields{"cluster": "none", "module": "mdev"}).Warn("[MDEV-Parser] Skip parsing empty JSON file")
+	}
+
+	return nil
+}
+
+func (conf *Config) UpdateMDevJSONFile(csvfile string, replace bool, verbose bool) error {
+	var mdev *MDevIssueMap = NewMDevIssueMap()
+	var err error
+	var jsonfile string = conf.WorkingDir + "/mdev.json"
+
+	if verbose {
+		log.Info("Log Verbose")
+		log.SetLevel(log.DebugLevel)
+	}
+
+	conf.InitMDevJSONFile(jsonfile)
+	// Populate existing list from JSON
+	err = mdev.MDevLoadJSONFile(jsonfile)
+	if err != nil {
+		return err
+	}
+	// Merge issues parsed from the CSV file
+	err = mdev.MDevParseCSV(csvfile, replace)
+	if err != nil {
+		return err
+	}
+	// Write back to JSON file
+	err = mdev.MDevWriteJSONFile(jsonfile)
+	if err != nil {
+		return err
+	}
+	return nil
+}
+
+func (conf *Config) InitMDevJSONFile(filename string) error {
+	var err error
+	var content []byte = make([]byte, 0)
+
+	// Init if not exists
+	if _, err := os.Stat(filename); err != nil {
+		if os.IsNotExist(err) {
+			if conf.WithEmbed == "ON" {
+				content, err = share.EmbededDbModuleFS.ReadFile("repo/mdev.json")
+			} else {
+				content, err = os.ReadFile(conf.ShareDir + "/repo/mdev.json")
+			}
+			if err != nil {
+				log.WithFields(log.Fields{"cluster": "none", "module": "mdev"}).Errorf("[MDEV-Parser] failed to read JSON from shared dir: %s", err.Error())
+				log.WithFields(log.Fields{"cluster": "none", "module": "mdev"}).Infof("[MDEV-Parser] Init empty json file at: %s", filename)
+			}
+
+			err = os.WriteFile(filename, content, 0644)
+			if err != nil {
+				log.WithFields(log.Fields{"cluster": "none", "module": "mdev"}).Errorf("[MDEV-Parser] failed to write JSON file: %s", err.Error())
+			}
+		} else {
+			log.WithFields(log.Fields{"cluster": "none", "module": "mdev"}).Errorf("[MDEV-Parser] failed to read JSON file: %s", err.Error())
+		}
+	}
+	return err
+}
diff --git a/server/api_cluster.go b/server/api_cluster.go
index bcb4f5f4b..07e86010e 100644
--- a/server/api_cluster.go
+++ b/server/api_cluster.go
@@ -1245,6 +1245,8 @@ func (repman *ReplicationManager) switchSettings(mycluster *cluster.Cluster, set
 		mycluster.SwitchDynamicTopology()
 	case "replication-no-relay":
 		mycluster.SwitchReplicationNoRelay()
+	case "failover-check-blocker":
+		mycluster.SwitchFailoverCheckBlocker()
 	}
 }
diff --git a/server/server.go b/server/server.go
index 7bdc07cee..67af80958 100644
--- a/server/server.go
+++ b/server/server.go
@@ -112,6 +112,7 @@ type ReplicationManager struct {
 	v3Config Repmanv3Config `json:"-"`
 	cloud18CheckSum hash.Hash `json:"-"`
 	clog *clog.Logger `json:"-"`
+	MDevIssues *config.MDevIssueMap
 	repmanv3.UnimplementedClusterPublicServiceServer `json:"-"`
 	repmanv3.UnimplementedClusterServiceServer `json:"-"`
 	sync.Mutex
@@ -383,6 +384,8 @@ func (repman *ReplicationManager) AddFlags(flags *pflag.FlagSet, conf *config.Co
 	flags.IntVar(&conf.CheckFalsePositiveExternalPort, "failover-falsepositive-external-port", 80, "Failover checks external port")
 	flags.IntVar(&conf.MaxFail, "failover-falsepositive-ping-counter", 5, "Failover after this number of ping failures (interval 1s)")
 	flags.IntVar(&conf.FailoverLogFileKeep, "failover-log-file-keep", 5, "Purge log files taken during failover")
+	flags.BoolVar(&conf.FailoverCheckBlocker, "failover-check-blocker", true, "Failover checks for known replication blocker bugs")
+	flags.BoolVar(&conf.FailoverCheckDelayStat, "failover-check-delay-stat", false, "Use delay avg statistic for failover decision")
 	flags.BoolVar(&conf.DelayStatCapture, "delay-stat-capture", false, "Capture hourly statistic for delay average")
 	flags.BoolVar(&conf.PrintDelayStat, "print-delay-stat", false, "Print captured delay statistic")
@@ -1266,7 +1269,6 @@ func (repman *ReplicationManager) GetClusterConfig(fistRead *viper.Viper, Immuab
 			if v != nil {
 				clustImmuableMap[f] = v
 			}
-
 		}

 		//set the default config
@@ -1421,6 +1423,16 @@ func (repman *ReplicationManager) InitRestic() error {
 	return nil
 }

+func (repman *ReplicationManager) InitMDevIssues() error {
+	repman.MDevIssues = config.NewMDevIssueMap()
+	filename := repman.Conf.WorkingDir + "/mdev.json"
+	// Only for premium users
+	// if repman.Conf.Cloud18 {
+	repman.MDevIssues.MDevLoadJSONFile(filename)
+	// }
+	return nil
+}
+
 func (repman *ReplicationManager) Run() error {

 	var err error
@@ -1439,7 +1451,6 @@ func (repman *ReplicationManager) Run() error {
 			log.Fatal(err)
 		}
 		pprof.StartCPUProfile(fcpupprof)
-
 	}

 	repman.Clusters = make(map[string]*cluster.Cluster)
@@ -1577,6 +1588,7 @@ func (repman *ReplicationManager) Run() error {
 	log.Infof("repman.Conf.WorkingDir : %s", repman.Conf.WorkingDir)
 	log.Infof("repman.Conf.ShareDir : %s", repman.Conf.ShareDir)

+	repman.InitMDevIssues()
 	// If there's an existing encryption key, decrypt the passwords
 	for _, gl := range repman.ClusterList {
@@ -1584,8 +1596,8 @@ func (repman *ReplicationManager) Run() error {
 	}
 	for _, cluster := range repman.Clusters {
 		cluster.SetClusterList(repman.Clusters)
-
 		cluster.SetCarbonLogger(repman.clog)
+		cluster.SetMDevList(repman.MDevIssues)
 	}

 	// repman.currentCluster.SetCfgGroupDisplay(strClusters)
diff --git a/server/server_mdev.go b/server/server_mdev.go
new file mode 100644
index 000000000..ef353a122
--- /dev/null
+++ b/server/server_mdev.go
@@ -0,0 +1,47 @@
+//go:build !clients
+// +build !clients
+
+// replication-manager - Replication Manager Monitoring and CLI for MariaDB and MySQL
+// Copyright 2017-2021 SIGNAL18 CLOUD SAS
+// Authors: Guillaume Lefranc
+//          Stephane Varoqui
+// This source code is licensed under the GNU General Public License, version 3.
+
+package server
+
+import (
+	"fmt"
+
+	"github.com/spf13/cobra"
+)
+
+var mdevCsv string
+var verbose bool
+
+func init() {
+	rootCmd.AddCommand(mdevUpdateCmd)
+	mdevUpdateCmd.Flags().BoolVar(&overwrite, "overwrite", false, "Overwrite JSON records with latest CSV file")
+	mdevUpdateCmd.Flags().BoolVar(&verbose, "debug", false, "Log each parsed line for debugging")
+	mdevUpdateCmd.Flags().StringVar(&mdevCsv, "csv-path", "/usr/share/replication-manager/repo/mdev.csv", "MDEV list csv file")
+}
+
+var mdevUpdateCmd = &cobra.Command{
+	Use:   "mdev",
+	Short: "Update MDEV blocker list",
+	Long:  `Update the MDEV blocker list by merging the issues from the CSV file with the existing list in the MDEV JSON file.`,
+	Run: func(cmd *cobra.Command, args []string) {
+		fmt.Printf("Start mdev command !\n")
+		RepMan = new(ReplicationManager)
+		RepMan.CommandLineFlag = GetCommandLineFlag(cmd)
+		RepMan.DefaultFlagMap = defaultFlagMap
+		RepMan.InitConfig(conf)
+		fmt.Printf("Config : %s\n", RepMan.Conf.ConfigFile)
+		fmt.Printf("Verbose : %v\n", verbose)
+		err := RepMan.Conf.UpdateMDevJSONFile(mdevCsv, overwrite, verbose)
+		if err != nil {
+			fmt.Printf("Config mdev update command fail: %s\n", err)
+			return
+		}
+		fmt.Println("Success executing mdev update command!")
+	},
+}
diff --git a/share/dashboard/static/card-setting-replication.html b/share/dashboard/static/card-setting-replication.html
index e2ce434ae..53b0e78a6 100644
--- a/share/dashboard/static/card-setting-replication.html
+++ b/share/dashboard/static/card-setting-replication.html
@@ -26,6 +26,19 @@
+
+        Checks MDEV Blocker for failover & switchover
+
+
+
+
+          On
+          Off
+
+
diff --git a/share/repo/alerts.csv b/share/repo/mdev.csv
similarity index 100%
rename from share/repo/alerts.csv
rename to share/repo/mdev.csv
diff --git a/utils/dbhelper/version_test.go b/utils/dbhelper/version_test.go
index ce1f17bcc..f87323b61 100644
--- a/utils/dbhelper/version_test.go
+++ b/utils/dbhelper/version_test.go
@@ -10,7 +10,10 @@ package dbhelper

 import (
+	"fmt"
+	"math/rand"
 	"testing"
+	"time"
 )

 func TestMySQLVersion(t *testing.T) {
@@ -252,3 +255,21 @@ func TestMariaDBVersion(t *testing.T) {
 	}

 }
+
+func TestMariaDBVersionBinlogFormat(t *testing.T) {
+
+	for i := 0; i < 10; i++ {
+		vmin := rand.Intn(11)
+		vrel := rand.Intn(40)
+		tstring := fmt.Sprintf("10.%d.%d-MariaDB-1:10.%d.%d+maria~ubu2204-log", vmin, vrel, vmin, vrel)
+		cstring := "MariaDB"
+		mv, _ := NewMySQLVersion(tstring, cstring)
+
+		if mv.LowerReleaseList("10.2.44", "10.3.35", "10.4.25", "10.5.16", "10.6.8", "10.7.4", "10.8.3", "10.9.1") {
+			t.Logf("Version %s is unsafe", mv.ToString())
+		} else {
+			t.Logf("Version %s is safe", mv.ToString())
+		}
+		time.Sleep(10 * time.Millisecond)
+	}
+}
diff --git a/utils/state/state.go b/utils/state/state.go
index b64c85218..da37e8ee1 100644
--- a/utils/state/state.go
+++ b/utils/state/state.go
@@ -14,6 +14,7 @@ import (
 	"slices"
 	"sort"
 	"strconv"
+	"strings"
 	"sync"
 	"time"
 )
@@ -60,11 +61,13 @@ func NewMap() *Map {
 }

 func (m Map) Add(key string, s State) {
-
-	_, ok := m[key]
-	if !ok {
+	if ms, ok := m[key]; !ok {
 		m[key] = s
-
+	} else {
+		if !strings.Contains(ms.ServerUrl, s.ServerUrl) {
+			ms.ServerUrl = ms.ServerUrl + "," + s.ServerUrl
+			m[key] = ms
+		}
 	}
 }

@@ -79,7 +82,6 @@ func (m Map) Search(key string) bool {
 	} else {
 		return false
 	}
-
 }

 type StateMachine struct {
@@ -334,8 +336,26 @@ func (SM *StateMachine) GetLastResolvedStates() map[string]State {
 	SM.Lock()
 	//every thing in OldState that can't be found in curstate
 	for key, state := range *SM.OldState {
-		if !SM.CurState.Search(key) {
+		if cs, ok := (*SM.CurState)[key]; !ok {
 			resolved[key] = state
+		} else if len(cs.ServerUrl) != len(state.ServerUrl) {
+			svUrl := ""
+			for _, sUrl := range strings.Split(state.ServerUrl, ",") {
+				if !strings.Contains(cs.ServerUrl, sUrl) {
+					if svUrl == "" {
+						svUrl = sUrl
+					} else {
+						svUrl = svUrl + "," + sUrl
+					}
+				}
+			}
+			resolved[key] = State{
+				ErrFrom: state.ErrFrom,
+				ErrKey: state.ErrKey,
+				ErrDesc: state.ErrDesc,
+				ErrType: state.ErrType,
+				ServerUrl: svUrl,
+			}
 		}
 	}
 	SM.Unlock()
@@ -345,22 +365,52 @@ func (SM *StateMachine) GetLastResolvedStates() map[string]State {
 func (SM *StateMachine) GetLastOpenedStates() map[string]State {
 	opened := make(map[string]State)
 	SM.Lock()
-	//every thing in OldState that can't be found in curstate
+	// everything in CurState that can't be found in OldState
 	for key, state := range *SM.CurState {
-		if !SM.OldState.Search(key) {
+		if old, ok := (*SM.OldState)[key]; !ok {
 			opened[key] = state
+		} else if len(old.ServerUrl) != len(state.ServerUrl) {
+			svUrl := ""
+			for _, sUrl := range strings.Split(state.ServerUrl, ",") {
+				if !strings.Contains(old.ServerUrl, sUrl) {
+					if svUrl == "" {
+						svUrl = sUrl
+					} else {
+						svUrl = svUrl + "," + sUrl
+					}
+				}
+			}
+			opened[key] = State{
+				ErrFrom: state.ErrFrom,
+				ErrKey: state.ErrKey,
+				ErrDesc: state.ErrDesc,
+				ErrType: state.ErrType,
+				ServerUrl: svUrl,
+			}
 		}
 	}
 	SM.Unlock()
 	return opened
 }

+// The ServerUrl is split into individual entries so they can be used with GetServerFromURL later
+// This returns the resolved states for each server
 func (SM *StateMachine) GetResolvedStates() []State {
 	var log []State
 	SM.Lock()
 	for key, state := range *SM.OldState {
-		if !SM.CurState.Search(key) {
-			log = append(log, state)
+		if cs, ok := (*SM.CurState)[key]; !ok || len(cs.ServerUrl) != len(state.ServerUrl) {
+			for _, sUrl := range strings.Split(state.ServerUrl, ",") {
+				if !ok || !strings.Contains(cs.ServerUrl, sUrl) {
+					log = append(log, State{
+						ErrFrom: state.ErrFrom,
+						ErrKey: state.ErrKey,
+						ErrDesc: state.ErrDesc,
+						ErrType: state.ErrType,
+						ServerUrl: sUrl,
+					})
+				}
+			}
 		}
 	}

@@ -430,6 +480,14 @@ func (SM *StateMachine) PreserveState(key string) {
 	}
 }

+func (SM *StateMachine) PreserveGroup(prefix string) {
+	for key, value := range *SM.OldState {
+		if strings.HasPrefix(key, prefix) {
+			SM.AddState(key, value)
+		}
+	}
+}
+
 func (SM *StateMachine) AddToCapturedState(key string, cstate *CapturedState) {
 	_, ok := SM.CapturedState.Load(key)
 	if !ok {