From 8850e8ef3086179d7046e46277e40601922410b9 Mon Sep 17 00:00:00 2001
From: Manan Gupta <35839558+GuptaManan100@users.noreply.github.com>
Date: Wed, 27 Mar 2024 23:32:57 +0530
Subject: [PATCH] Remove reading emergently instances (#15580)

Signed-off-by: Manan Gupta
---
 go/vt/vtorc/logic/topology_recovery.go      | 113 --------------------
 go/vt/vtorc/logic/topology_recovery_test.go |   9 --
 2 files changed, 122 deletions(-)

diff --git a/go/vt/vtorc/logic/topology_recovery.go b/go/vt/vtorc/logic/topology_recovery.go
index 1ee82bb3560..8a1c3af5ac9 100644
--- a/go/vt/vtorc/logic/topology_recovery.go
+++ b/go/vt/vtorc/logic/topology_recovery.go
@@ -23,8 +23,6 @@ import (
 	"math/rand/v2"
 	"time"
 
-	"github.com/patrickmn/go-cache"
-
 	"vitess.io/vitess/go/stats"
 	"vitess.io/vitess/go/vt/log"
 	"vitess.io/vitess/go/vt/logutil"
@@ -160,20 +158,12 @@ func NewTopologyRecoveryStep(uid string, message string) *TopologyRecoveryStep {
 	}
 }
 
-var emergencyReadTopologyInstanceMap *cache.Cache
-var emergencyRestartReplicaTopologyInstanceMap *cache.Cache
-var emergencyOperationGracefulPeriodMap *cache.Cache
-
 func init() {
 	go initializeTopologyRecoveryPostConfiguration()
 }
 
 func initializeTopologyRecoveryPostConfiguration() {
 	config.WaitForConfigurationToBeLoaded()
-
-	emergencyReadTopologyInstanceMap = cache.New(time.Second, time.Millisecond*250)
-	emergencyRestartReplicaTopologyInstanceMap = cache.New(time.Second*30, time.Second)
-	emergencyOperationGracefulPeriodMap = cache.New(time.Second*5, time.Millisecond*500)
 }
 
 // AuditTopologyRecovery audits a single step in a topology recovery process.
@@ -310,80 +300,6 @@ func checkAndRecoverGenericProblem(ctx context.Context, analysisEntry *inst.Repl
 	return false, nil, nil
 }
 
-// Force a re-read of a topology instance; this is done because we need to substantiate a suspicion
-// that we may have a failover scenario. we want to speed up reading the complete picture.
-func emergentlyReadTopologyInstance(tabletAlias string, analysisCode inst.AnalysisCode) (instance *inst.Instance) {
-	if existsInCacheError := emergencyReadTopologyInstanceMap.Add(tabletAlias, true, cache.DefaultExpiration); existsInCacheError != nil {
-		// Just recently attempted
-		return nil
-	}
-	instance, _ = inst.ReadTopologyInstance(tabletAlias)
-	_ = inst.AuditOperation("emergently-read-topology-instance", tabletAlias, string(analysisCode))
-	return instance
-}
-
-// Force reading of replicas of given instance. This is because we suspect the instance is dead, and want to speed up
-// detection of replication failure from its replicas.
-func emergentlyReadTopologyInstanceReplicas(primaryHost string, primaryPort int, analysisCode inst.AnalysisCode) {
-	replicas, err := inst.ReadReplicaInstancesIncludingBinlogServerSubReplicas(primaryHost, primaryPort)
-	if err != nil {
-		return
-	}
-	for _, replica := range replicas {
-		go emergentlyReadTopologyInstance(replica.InstanceAlias, analysisCode)
-	}
-}
-
-// emergentlyRestartReplicationOnTopologyInstance forces a RestartReplication on a given instance.
-func emergentlyRestartReplicationOnTopologyInstance(tabletAlias string, analysisCode inst.AnalysisCode) {
-	if existsInCacheError := emergencyRestartReplicaTopologyInstanceMap.Add(tabletAlias, true, cache.DefaultExpiration); existsInCacheError != nil {
-		// Just recently attempted on this specific replica
-		return
-	}
-	go inst.ExecuteOnTopology(func() {
-		_ = restartReplication(tabletAlias)
-		_ = inst.AuditOperation("emergently-restart-replication-topology-instance", tabletAlias, string(analysisCode))
-	})
-}
-
-func beginEmergencyOperationGracefulPeriod(tabletAlias string) {
-	emergencyOperationGracefulPeriodMap.Set(tabletAlias, true, cache.DefaultExpiration)
-}
-
-func isInEmergencyOperationGracefulPeriod(tabletAlias string) bool {
-	_, found := emergencyOperationGracefulPeriodMap.Get(tabletAlias)
-	return found
-}
-
-// emergentlyRestartReplicationOnTopologyInstanceReplicas forces a stop slave + start slave on
-// replicas of a given instance, in an attempt to cause them to re-evaluate their replication state.
-// This can be useful in scenarios where the primary has Too Many Connections, but long-time connected
-// replicas are not seeing this; when they stop+start replication, they need to re-authenticate and
-// that's where we hope they realize the primary is bad.
-func emergentlyRestartReplicationOnTopologyInstanceReplicas(primaryHost string, primaryPort int, tabletAlias string, analysisCode inst.AnalysisCode) {
-	if existsInCacheError := emergencyRestartReplicaTopologyInstanceMap.Add(tabletAlias, true, cache.DefaultExpiration); existsInCacheError != nil {
-		// While each replica's RestartReplication() is throttled on its own, it's also wasteful to
-		// iterate all replicas all the time. This is the reason why we do grand-throttle check.
-		return
-	}
-	beginEmergencyOperationGracefulPeriod(tabletAlias)
-
-	replicas, err := inst.ReadReplicaInstancesIncludingBinlogServerSubReplicas(primaryHost, primaryPort)
-	if err != nil {
-		return
-	}
-	for _, replica := range replicas {
-		go emergentlyRestartReplicationOnTopologyInstance(replica.InstanceAlias, analysisCode)
-	}
-}
-
-func emergentlyRecordStaleBinlogCoordinates(tabletAlias string, binlogCoordinates *inst.BinlogCoordinates) {
-	err := inst.RecordStaleInstanceBinlogCoordinates(tabletAlias, binlogCoordinates)
-	if err != nil {
-		log.Error(err)
-	}
-}
-
 // checkAndExecuteFailureDetectionProcesses tries to register for failure detection and potentially executes
 // failure-detection processes.
 func checkAndExecuteFailureDetectionProcesses(analysisEntry *inst.ReplicationAnalysis) (detectionRegistrationSuccess bool, processesExecutionAttempted bool, err error) {
@@ -407,9 +323,6 @@ func getCheckAndRecoverFunctionCode(analysisCode inst.AnalysisCode, tabletAlias
 			log.Infof("VTOrc not configured to run ERS, skipping recovering %v", analysisCode)
 			return noRecoveryFunc
 		}
-		if isInEmergencyOperationGracefulPeriod(tabletAlias) {
-			return recoverGenericProblemFunc
-		}
 		return recoverDeadPrimaryFunc
 	case inst.PrimaryTabletDeleted:
 		// If ERS is disabled, we have no way of repairing the cluster.
@@ -417,9 +330,6 @@ func getCheckAndRecoverFunctionCode(analysisCode inst.AnalysisCode, tabletAlias
 			log.Infof("VTOrc not configured to run ERS, skipping recovering %v", analysisCode)
 			return noRecoveryFunc
 		}
-		if isInEmergencyOperationGracefulPeriod(tabletAlias) {
-			return recoverGenericProblemFunc
-		}
 		return recoverPrimaryTabletDeletedFunc
 	case inst.ErrantGTIDDetected:
 		if !config.ConvertTabletWithErrantGTIDs() {
@@ -430,9 +340,6 @@ func getCheckAndRecoverFunctionCode(analysisCode inst.AnalysisCode, tabletAlias
 	case inst.PrimaryHasPrimary:
 		return recoverPrimaryHasPrimaryFunc
 	case inst.LockedSemiSyncPrimary:
-		if isInEmergencyOperationGracefulPeriod(tabletAlias) {
-			return recoverGenericProblemFunc
-		}
 		return recoverLockedSemiSyncPrimaryFunc
 	case inst.ClusterHasNoPrimary:
 		return electNewPrimaryFunc
@@ -566,25 +473,6 @@ func analysisEntriesHaveSameRecovery(prevAnalysis, newAnalysis *inst.Replication
 	return prevRecoveryFunctionCode == newRecoveryFunctionCode
 }
 
-func runEmergentOperations(analysisEntry *inst.ReplicationAnalysis) {
-	switch analysisEntry.Analysis {
-	case inst.DeadPrimaryAndReplicas:
-		go emergentlyReadTopologyInstance(analysisEntry.AnalyzedInstancePrimaryAlias, analysisEntry.Analysis)
-	case inst.UnreachablePrimary:
-		go emergentlyReadTopologyInstance(analysisEntry.AnalyzedInstanceAlias, analysisEntry.Analysis)
-		go emergentlyReadTopologyInstanceReplicas(analysisEntry.AnalyzedInstanceHostname, analysisEntry.AnalyzedInstancePort, analysisEntry.Analysis)
-	case inst.UnreachablePrimaryWithLaggingReplicas:
-		go emergentlyRestartReplicationOnTopologyInstanceReplicas(analysisEntry.AnalyzedInstanceHostname, analysisEntry.AnalyzedInstancePort, analysisEntry.AnalyzedInstanceAlias, analysisEntry.Analysis)
-	case inst.LockedSemiSyncPrimaryHypothesis:
-		go emergentlyReadTopologyInstance(analysisEntry.AnalyzedInstanceAlias, analysisEntry.Analysis)
-		go emergentlyRecordStaleBinlogCoordinates(analysisEntry.AnalyzedInstanceAlias, &analysisEntry.AnalyzedInstanceBinlogCoordinates)
-	case inst.AllPrimaryReplicasNotReplicating:
-		go emergentlyReadTopologyInstance(analysisEntry.AnalyzedInstanceAlias, analysisEntry.Analysis)
-	case inst.AllPrimaryReplicasNotReplicatingOrDead:
-		go emergentlyReadTopologyInstance(analysisEntry.AnalyzedInstanceAlias, analysisEntry.Analysis)
-	}
-}
-
 // executeCheckAndRecoverFunction will choose the correct check & recovery function based on analysis.
 // It executes the function synchronously
 func executeCheckAndRecoverFunction(analysisEntry *inst.ReplicationAnalysis) (err error) {
@@ -594,7 +482,6 @@ func executeCheckAndRecoverFunction(analysisEntry *inst.ReplicationAnalysis) (er
 	checkAndRecoverFunctionCode := getCheckAndRecoverFunctionCode(analysisEntry.Analysis, analysisEntry.AnalyzedInstanceAlias)
 	isActionableRecovery := hasActionableRecovery(checkAndRecoverFunctionCode)
 	analysisEntry.IsActionableRecovery = isActionableRecovery
-	runEmergentOperations(analysisEntry)
 
 	if checkAndRecoverFunctionCode == noRecoveryFunc {
 		// Unhandled problem type
diff --git a/go/vt/vtorc/logic/topology_recovery_test.go b/go/vt/vtorc/logic/topology_recovery_test.go
index d517649fd13..f636a194283 100644
--- a/go/vt/vtorc/logic/topology_recovery_test.go
+++ b/go/vt/vtorc/logic/topology_recovery_test.go
@@ -19,9 +19,7 @@ package logic
 import (
 	"context"
 	"testing"
-	"time"
 
-	"github.com/patrickmn/go-cache"
 	"github.com/stretchr/testify/require"
 
 	topodatapb "vitess.io/vitess/go/vt/proto/topodata"
@@ -88,7 +86,6 @@ func TestAnalysisEntriesHaveSameRecovery(t *testing.T) {
 			shouldBeEqual: true,
 		},
 	}
-	emergencyOperationGracefulPeriodMap = cache.New(time.Second*5, time.Millisecond*500)
 	t.Parallel()
 	for _, tt := range tests {
 		t.Run(string(tt.prevAnalysisCode)+","+string(tt.newAnalysisCode), func(t *testing.T) {
@@ -256,12 +253,6 @@ func TestGetCheckAndRecoverFunctionCode(t *testing.T) {
 		},
 	}
 
-	// Needed for the test to work
-	oldMap := emergencyOperationGracefulPeriodMap
-	emergencyOperationGracefulPeriodMap = cache.New(time.Second*5, time.Millisecond*500)
-	defer func() {
-		emergencyOperationGracefulPeriodMap = oldMap
-	}()
 	for _, tt := range tests {
 		t.Run(tt.name, func(t *testing.T) {
 			prevVal := config.ERSEnabled()