diff --git a/go.mod b/go.mod index fcea7fa65f9..40f2b19e449 100644 --- a/go.mod +++ b/go.mod @@ -102,7 +102,7 @@ require ( github.com/kr/text v0.2.0 github.com/mitchellh/mapstructure v1.5.0 github.com/nsf/jsondiff v0.0.0-20210926074059-1e845ec5d249 - github.com/slackhq/vitess-addons v0.19.0 + github.com/slackhq/vitess-addons v0.19.1 github.com/slok/noglog v0.2.0 github.com/spf13/afero v1.11.0 github.com/spf13/jwalterweatherman v1.1.0 diff --git a/go/vt/vtorc/logic/topology_recovery.go b/go/vt/vtorc/logic/topology_recovery.go index c1fc2c8f9fb..7f106f25221 100644 --- a/go/vt/vtorc/logic/topology_recovery.go +++ b/go/vt/vtorc/logic/topology_recovery.go @@ -21,9 +21,11 @@ import ( "encoding/json" "fmt" "math/rand" + "os" "time" "github.com/patrickmn/go-cache" + "github.com/slackhq/vitess-addons/go/external" "vitess.io/vitess/go/stats" "vitess.io/vitess/go/vt/log" @@ -81,6 +83,8 @@ var ( // recoveriesFailureCounter counts the number of failed recoveries that VTOrc has performed recoveriesFailureCounter = stats.NewCountersWithSingleLabel("FailedRecoveries", "Count of the different failed recoveries performed", "RecoveryType", actionableRecoveriesNames...) + + vtopsExec = external.NewExecVTOps(os.Getenv("VTOPS_PATH"), os.Getenv("VTOPS_HTTP_PROXY")) ) // recoveryFunction is the code of the recovery function to be used @@ -297,6 +301,7 @@ func postErsCompletion(topologyRecovery *TopologyRecovery, analysisEntry *inst.R _ = AuditTopologyRecovery(topologyRecovery, message) _ = inst.AuditOperation(recoveryName, analysisEntry.AnalyzedInstanceAlias, message) _ = AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("%v: successfully promoted %+v", recoveryName, promotedReplica.InstanceAlias)) + vtopsExec.RaiseProblem(analysisEntry.AnalyzedInstanceHostname, "orc-dead-tablet", true) } } @@ -590,7 +595,6 @@ func runEmergentOperations(analysisEntry *inst.ReplicationAnalysis) { func executeCheckAndRecoverFunction(analysisEntry *inst.ReplicationAnalysis) (err error) { countPendingRecoveries.Add(1) defer countPendingRecoveries.Add(-1) - checkAndRecoverFunctionCode := getCheckAndRecoverFunctionCode(analysisEntry.Analysis, analysisEntry.AnalyzedInstanceAlias) isActionableRecovery := hasActionableRecovery(checkAndRecoverFunctionCode) analysisEntry.IsActionableRecovery = isActionableRecovery @@ -605,8 +609,11 @@ func executeCheckAndRecoverFunction(analysisEntry *inst.ReplicationAnalysis) (er } } + vtopsExec.SendSlackMessage(fmt.Sprintf("[VTOrc] No recovery available for %s for problem %s.", analysisEntry.AnalyzedInstanceHostname, analysisEntry.Analysis), true) + return nil } + // we have a recovery function; its execution still depends on filters if not disabled. if isActionableRecovery || util.ClearToLog("executeCheckAndRecoverFunction: detection", analysisEntry.AnalyzedInstanceAlias) { log.Infof("executeCheckAndRecoverFunction: proceeding with %+v detection on %+v; isActionable?: %+v", analysisEntry.Analysis, analysisEntry.AnalyzedInstanceAlias, isActionableRecovery) @@ -707,6 +714,11 @@ func executeCheckAndRecoverFunction(analysisEntry *inst.ReplicationAnalysis) (er if isActionableRecovery || util.ClearToLog("executeCheckAndRecoverFunction: recovery", analysisEntry.AnalyzedInstanceAlias) { log.Infof("executeCheckAndRecoverFunction: proceeding with %+v recovery on %+v; isRecoverable?: %+v", analysisEntry.Analysis, analysisEntry.AnalyzedInstanceAlias, isActionableRecovery) } + + if !isActionableRecovery { + vtopsExec.SendSlackMessage(fmt.Sprintf("No actionable recovery on %s for problem %s.", analysisEntry.AnalyzedInstanceHostname, analysisEntry.Analysis), true) + } + recoveryAttempted, topologyRecovery, err := getCheckAndRecoverFunction(checkAndRecoverFunctionCode)(ctx, analysisEntry) if !recoveryAttempted { return err @@ -714,8 +726,10 @@ func executeCheckAndRecoverFunction(analysisEntry *inst.ReplicationAnalysis) (er recoveryName := getRecoverFunctionName(checkAndRecoverFunctionCode) recoveriesCounter.Add(recoveryName, 1) if err != nil { + vtopsExec.SendSlackMessage(fmt.Sprintf("Recovery failed on %s for problem %s. Error: %s", analysisEntry.AnalyzedInstanceHostname, analysisEntry.Analysis, err.Error()), true) recoveriesFailureCounter.Add(recoveryName, 1) } else { + vtopsExec.SendSlackMessage(fmt.Sprintf("Recovery succeeded on %s for problem %s.", analysisEntry.AnalyzedInstanceHostname, analysisEntry.Analysis), true) recoveriesSuccessfulCounter.Add(recoveryName, 1) } if topologyRecovery == nil { @@ -813,6 +827,7 @@ func postPrsCompletion(topologyRecovery *TopologyRecovery, analysisEntry *inst.R _ = AuditTopologyRecovery(topologyRecovery, message) _ = inst.AuditOperation(string(analysisEntry.Analysis), analysisEntry.AnalyzedInstanceAlias, message) _ = AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("%+v: successfully promoted %+v", analysisEntry.Analysis, promotedReplica.InstanceAlias)) + vtopsExec.RaiseProblem(analysisEntry.AnalyzedInstanceHostname, "orc-dead-tablet", true) } }