diff --git a/go/test/endtoend/reparent/newfeaturetest/reparent_test.go b/go/test/endtoend/reparent/newfeaturetest/reparent_test.go index 9382a746385..ad798d61792 100644 --- a/go/test/endtoend/reparent/newfeaturetest/reparent_test.go +++ b/go/test/endtoend/reparent/newfeaturetest/reparent_test.go @@ -157,3 +157,23 @@ func TestChangeTypeWithoutSemiSync(t *testing.T) { err = clusterInstance.VtctldClientProcess.ExecuteCommand("ChangeTabletType", replica.Alias, "replica") require.NoError(t, err) } + +// TestERSWithWriteInPromoteReplica tests that ERS doesn't fail even if there is a +// write that happens when PromoteReplica is called. +func TestERSWithWriteInPromoteReplica(t *testing.T) { + defer cluster.PanicHandler(t) + clusterInstance := utils.SetupReparentCluster(t, "semi_sync") + defer utils.TeardownCluster(clusterInstance) + tablets := clusterInstance.Keyspaces[0].Shards[0].Vttablets + utils.ConfirmReplication(t, tablets[0], []*cluster.Vttablet{tablets[1], tablets[2], tablets[3]}) + + // Drop a table so that when sidecardb changes are checked, we run a DML query. + utils.RunSQLs(context.Background(), t, []string{ + "set sql_log_bin=0", + `SET @@global.super_read_only=0`, + `DROP TABLE _vt.heartbeat`, + "set sql_log_bin=1", + }, tablets[3]) + _, err := utils.Ers(clusterInstance, tablets[3], "60s", "30s") + require.NoError(t, err, "ERS should not fail even if there is a sidecardb change") +} diff --git a/go/vt/vtctl/grpcvtctldserver/server_slow_test.go b/go/vt/vtctl/grpcvtctldserver/server_slow_test.go index 8170e7d8cd3..4d7c5aa1943 100644 --- a/go/vt/vtctl/grpcvtctldserver/server_slow_test.go +++ b/go/vt/vtctl/grpcvtctldserver/server_slow_test.go @@ -107,8 +107,9 @@ func TestEmergencyReparentShardSlow(t *testing.T) { }, }, }, - PopulateReparentJournalDelays: map[string]time.Duration{ - "zone1-0000000200": time.Second * 29, + SetReplicationSourceDelays: map[string]time.Duration{ + "zone1-0000000100": time.Second * 29, + "zone1-0000000101": time.Second * 29, }, PopulateReparentJournalResults: map[string]error{ "zone1-0000000200": nil, @@ -224,8 +225,9 @@ func TestEmergencyReparentShardSlow(t *testing.T) { }, }, }, - PopulateReparentJournalDelays: map[string]time.Duration{ - "zone1-0000000200": time.Second * 31, + SetReplicationSourceDelays: map[string]time.Duration{ + "zone1-0000000100": time.Second * 31, + "zone1-0000000101": time.Second * 31, }, PopulateReparentJournalResults: map[string]error{ "zone1-0000000200": nil, diff --git a/go/vt/vtctl/reparentutil/emergency_reparenter.go b/go/vt/vtctl/reparentutil/emergency_reparenter.go index 3d7af37d932..a744012982a 100644 --- a/go/vt/vtctl/reparentutil/emergency_reparenter.go +++ b/go/vt/vtctl/reparentutil/emergency_reparenter.go @@ -308,7 +308,7 @@ func (erp *EmergencyReparenter) reparentShardLocked(ctx context.Context, ev *eve // Since the new primary tablet belongs to the validCandidateTablets list, we no longer need any additional constraint checks // Final step is to promote our primary candidate - err = erp.promoteNewPrimary(ctx, ev, newPrimary, opts, tabletMap, stoppedReplicationSnapshot.statusMap) + _, err = erp.reparentReplicas(ctx, ev, newPrimary, tabletMap, stoppedReplicationSnapshot.statusMap, opts, false /* intermediateReparent */) if err != nil { return err } @@ -458,7 +458,7 @@ func (erp *EmergencyReparenter) promoteIntermediateSource( // we reparent all the other valid tablets to start replication from our new source // we wait for all the replicas so that we can choose a better candidate from the ones that started replication later - reachableTablets, err := erp.reparentReplicas(ctx, ev, source, validTabletMap, statusMap, opts, true /* waitForAllReplicas */, false /* populateReparentJournal */) + reachableTablets, err := erp.reparentReplicas(ctx, ev, source, validTabletMap, statusMap, opts, true /* intermediateReparent */) if err != nil { return nil, err } @@ -487,8 +487,10 @@ func (erp *EmergencyReparenter) reparentReplicas( tabletMap map[string]*topo.TabletInfo, statusMap map[string]*replicationdatapb.StopReplicationStatus, opts EmergencyReparentOptions, - waitForAllReplicas bool, - populateReparentJournal bool, + intermediateReparent bool, // intermediateReparent represents whether the reparenting of the replicas is the final reparent or not. + // Since ERS can sometimes promote a tablet, which isn't a candidate for promotion, if it is the most advanced, we don't want to + // call PromoteReplica on it. We just want to get all replicas to replicate from it to get caught up, after which we'll promote the primary + // candidate separately. During the final promotion, we call `PromoteReplica` and `PopulateReparentJournal`. ) ([]*topodatapb.Tablet, error) { var ( @@ -497,6 +499,8 @@ func (erp *EmergencyReparenter) reparentReplicas( ) replCtx, replCancel := context.WithTimeout(context.Background(), opts.WaitReplicasTimeout) + primaryCtx, primaryCancel := context.WithTimeout(ctx, topo.RemoteOperationTimeout) + defer primaryCancel() event.DispatchUpdate(ev, "reparenting all tablets") @@ -518,13 +522,26 @@ func (erp *EmergencyReparenter) reparentReplicas( rec := concurrency.AllErrorRecorder{} handlePrimary := func(alias string, tablet *topodatapb.Tablet) error { - position, err := erp.tmc.PrimaryPosition(replCtx, tablet) - if err != nil { - return err - } - if populateReparentJournal { + if !intermediateReparent { + var position string + var err error + if ev.ShardInfo.PrimaryAlias == nil { + erp.logger.Infof("setting up %v as new primary for an uninitialized cluster", alias) + // we call InitPrimary when the PrimaryAlias in the ShardInfo is empty. This happens when we have an uninitialized cluster. + position, err = erp.tmc.InitPrimary(primaryCtx, tablet, SemiSyncAckers(opts.durability, tablet) > 0) + } else { + erp.logger.Infof("starting promotion for the new primary - %v", alias) + // we call PromoteReplica which changes the tablet type, fixes the semi-sync, set the primary to read-write and flushes the binlogs + position, err = erp.tmc.PromoteReplica(primaryCtx, tablet, SemiSyncAckers(opts.durability, tablet) > 0) + } + if err != nil { + return vterrors.Wrapf(err, "primary-elect tablet %v failed to be upgraded to primary: %v", alias, err) + } erp.logger.Infof("populating reparent journal on new primary %v", alias) - return erp.tmc.PopulateReparentJournal(replCtx, tablet, now, opts.lockAction, newPrimaryTablet.Alias, position) + err = erp.tmc.PopulateReparentJournal(primaryCtx, tablet, now, opts.lockAction, tablet.Alias, position) + if err != nil { + return vterrors.Wrapf(err, "failed to PopulateReparentJournal on primary: %v", err) + } } return nil } @@ -559,8 +576,8 @@ func (erp *EmergencyReparenter) reparentReplicas( replicaMutex.Unlock() // Signal that at least one goroutine succeeded to SetReplicationSource. - // We do this only when we do not want to wait for all the replicas - if !waitForAllReplicas { + // We do this only when we do not want to wait for all the replicas. + if !intermediateReparent { replSuccessCancel() } } @@ -594,10 +611,10 @@ func (erp *EmergencyReparenter) reparentReplicas( primaryErr := handlePrimary(topoproto.TabletAliasString(newPrimaryTablet.Alias), newPrimaryTablet) if primaryErr != nil { - erp.logger.Warningf("primary failed to PopulateReparentJournal") + erp.logger.Errorf("failed to promote %s to primary", topoproto.TabletAliasString(newPrimaryTablet.Alias)) replCancel() - return nil, vterrors.Wrapf(primaryErr, "failed to PopulateReparentJournal on primary: %v", primaryErr) + return nil, vterrors.Wrapf(primaryErr, "failed to promote %v to primary", topoproto.TabletAliasString(newPrimaryTablet.Alias)) } // We should only cancel the context that all the replicas are using when they are done. @@ -709,36 +726,6 @@ func (erp *EmergencyReparenter) identifyPrimaryCandidate( return nil, vterrors.Errorf(vtrpc.Code_INTERNAL, "unreachable - did not find a valid primary candidate even though the valid candidate list was non-empty") } -func (erp *EmergencyReparenter) promoteNewPrimary( - ctx context.Context, - ev *events.Reparent, - newPrimary *topodatapb.Tablet, - opts EmergencyReparentOptions, - tabletMap map[string]*topo.TabletInfo, - statusMap map[string]*replicationdatapb.StopReplicationStatus, -) error { - var err error - if ev.ShardInfo.PrimaryAlias == nil { - erp.logger.Infof("setting up %v as new primary for an uninitialized cluster", newPrimary.Alias) - // we call InitPrimary when the PrimaryAlias in the ShardInfo is empty. This happens when we have an uninitialized cluster. - _, err = erp.tmc.InitPrimary(ctx, newPrimary, SemiSyncAckers(opts.durability, newPrimary) > 0) - } else { - erp.logger.Infof("starting promotion for the new primary - %v", newPrimary.Alias) - // we call PromoteReplica which changes the tablet type, fixes the semi-sync, set the primary to read-write and flushes the binlogs - _, err = erp.tmc.PromoteReplica(ctx, newPrimary, SemiSyncAckers(opts.durability, newPrimary) > 0) - } - if err != nil { - return vterrors.Wrapf(err, "primary-elect tablet %v failed to be upgraded to primary: %v", newPrimary.Alias, err) - } - // we now reparent all the replicas to the new primary we have promoted. - // Here we do not need to wait for all the replicas, We can finish early when even 1 succeeds. - _, err = erp.reparentReplicas(ctx, ev, newPrimary, tabletMap, statusMap, opts, false /* waitForAllReplicas */, true /* populateReparentJournal */) - if err != nil { - return err - } - return nil -} - // filterValidCandidates filters valid tablets, keeping only the ones which can successfully be promoted without any constraint failures and can make forward progress on being promoted func (erp *EmergencyReparenter) filterValidCandidates(validTablets []*topodatapb.Tablet, tabletsReachable []*topodatapb.Tablet, prevPrimary *topodatapb.Tablet, opts EmergencyReparentOptions) ([]*topodatapb.Tablet, error) { var restrictedValidTablets []*topodatapb.Tablet diff --git a/go/vt/vtctl/reparentutil/emergency_reparenter_test.go b/go/vt/vtctl/reparentutil/emergency_reparenter_test.go index 8976a8d1b8f..6c73254586a 100644 --- a/go/vt/vtctl/reparentutil/emergency_reparenter_test.go +++ b/go/vt/vtctl/reparentutil/emergency_reparenter_test.go @@ -141,14 +141,6 @@ func TestEmergencyReparenter_reparentShardLocked(t *testing.T) { Error: nil, }, }, - PrimaryPositionResults: map[string]struct { - Position string - Error error - }{ - "zone1-0000000102": { - Error: nil, - }, - }, SetReplicationSourceResults: map[string]error{ "zone1-0000000100": nil, "zone1-0000000101": nil, @@ -258,14 +250,6 @@ func TestEmergencyReparenter_reparentShardLocked(t *testing.T) { Error: nil, }, }, - PrimaryPositionResults: map[string]struct { - Position string - Error error - }{ - "zone1-0000000102": { - Error: nil, - }, - }, SetReplicationSourceResults: map[string]error{ "zone1-0000000100": nil, "zone1-0000000101": nil, @@ -394,14 +378,6 @@ func TestEmergencyReparenter_reparentShardLocked(t *testing.T) { PopulateReparentJournalResults: map[string]error{ "zone1-0000000101": nil, }, - PrimaryPositionResults: map[string]struct { - Position string - Error error - }{ - "zone1-0000000101": { - Error: nil, - }, - }, PromoteReplicaResults: map[string]struct { Result string Error error @@ -517,14 +493,6 @@ func TestEmergencyReparenter_reparentShardLocked(t *testing.T) { }, }, }, - PrimaryPositionResults: map[string]struct { - Position string - Error error - }{ - "zone1-0000000102": { - Error: nil, - }, - }, PopulateReparentJournalResults: map[string]error{ "zone1-0000000102": nil, }, @@ -1064,9 +1032,6 @@ func TestEmergencyReparenter_reparentShardLocked(t *testing.T) { Position: "MySQL56/3E11FA47-71CA-11E1-9E33-C80AA9429562:1-21", Error: nil, }, - "zone1-0000000102": { - Error: nil, - }, }, StopReplicationAndGetStatusResults: map[string]struct { StopStatus *replicationdatapb.StopReplicationStatus @@ -1170,14 +1135,6 @@ func TestEmergencyReparenter_reparentShardLocked(t *testing.T) { Error: assert.AnError, }, }, - PrimaryPositionResults: map[string]struct { - Position string - Error error - }{ - "zone1-0000000102": { - Error: nil, - }, - }, PopulateReparentJournalResults: map[string]error{ "zone1-0000000102": nil, }, @@ -1292,14 +1249,6 @@ func TestEmergencyReparenter_reparentShardLocked(t *testing.T) { Error: nil, }, }, - PrimaryPositionResults: map[string]struct { - Position string - Error error - }{ - "zone1-0000000102": { - Error: nil, - }, - }, SetReplicationSourceResults: map[string]error{ "zone1-0000000100": nil, "zone1-0000000101": nil, @@ -1412,14 +1361,6 @@ func TestEmergencyReparenter_reparentShardLocked(t *testing.T) { Error: nil, }, }, - PrimaryPositionResults: map[string]struct { - Position string - Error error - }{ - "zone1-0000000102": { - Error: nil, - }, - }, SetReplicationSourceResults: map[string]error{ "zone1-0000000100": nil, "zone1-0000000101": nil, @@ -1527,14 +1468,6 @@ func TestEmergencyReparenter_reparentShardLocked(t *testing.T) { Error: nil, }, }, - PrimaryPositionResults: map[string]struct { - Position string - Error error - }{ - "zone1-0000000102": { - Error: nil, - }, - }, SetReplicationSourceResults: map[string]error{ "zone1-0000000100": nil, "zone1-0000000101": nil, @@ -1660,14 +1593,6 @@ func TestEmergencyReparenter_reparentShardLocked(t *testing.T) { Error: nil, }, }, - PrimaryPositionResults: map[string]struct { - Position string - Error error - }{ - "zone1-0000000102": { - Error: nil, - }, - }, SetReplicationSourceResults: map[string]error{ "zone1-0000000100": nil, "zone1-0000000101": nil, @@ -1792,14 +1717,6 @@ func TestEmergencyReparenter_reparentShardLocked(t *testing.T) { Error: nil, }, }, - PrimaryPositionResults: map[string]struct { - Position string - Error error - }{ - "zone1-0000000102": { - Error: nil, - }, - }, SetReplicationSourceResults: map[string]error{ "zone1-0000000100": nil, "zone1-0000000101": nil, @@ -1943,7 +1860,7 @@ func TestEmergencyReparenter_reparentShardLocked(t *testing.T) { } } -func TestEmergencyReparenter_promoteNewPrimary(t *testing.T) { +func TestEmergencyReparenter_promotionOfNewPrimary(t *testing.T) { t.Parallel() tests := []struct { @@ -1968,14 +1885,6 @@ func TestEmergencyReparenter_promoteNewPrimary(t *testing.T) { PopulateReparentJournalResults: map[string]error{ "zone1-0000000100": nil, }, - PrimaryPositionResults: map[string]struct { - Position string - Error error - }{ - "zone1-0000000100": { - Error: nil, - }, - }, PromoteReplicaResults: map[string]struct { Result string Error error @@ -2047,17 +1956,9 @@ func TestEmergencyReparenter_promoteNewPrimary(t *testing.T) { shouldErr: false, }, { - name: "PrimaryPosition error", + name: "PromoteReplica error", emergencyReparentOps: EmergencyReparentOptions{}, tmc: &testutil.TabletManagerClient{ - PrimaryPositionResults: map[string]struct { - Position string - Error error - }{ - "zone1-0000000100": { - Error: fmt.Errorf("primary position error"), - }, - }, PromoteReplicaResults: map[string]struct { Result string Error error @@ -2099,14 +2000,6 @@ func TestEmergencyReparenter_promoteNewPrimary(t *testing.T) { PopulateReparentJournalResults: map[string]error{ "zone1-0000000100": assert.AnError, }, - PrimaryPositionResults: map[string]struct { - Position string - Error error - }{ - "zone1-0000000100": { - Error: nil, - }, - }, PromoteReplicaResults: map[string]struct { Result string Error error @@ -2148,14 +2041,6 @@ func TestEmergencyReparenter_promoteNewPrimary(t *testing.T) { PopulateReparentJournalResults: map[string]error{ "zone1-0000000100": nil, }, - PrimaryPositionResults: map[string]struct { - Position string - Error error - }{ - "zone1-0000000100": { - Error: nil, - }, - }, PromoteReplicaResults: map[string]struct { Result string Error error @@ -2211,14 +2096,6 @@ func TestEmergencyReparenter_promoteNewPrimary(t *testing.T) { PopulateReparentJournalResults: map[string]error{ "zone1-0000000100": nil, }, - PrimaryPositionResults: map[string]struct { - Position string - Error error - }{ - "zone1-0000000100": { - Error: nil, - }, - }, PromoteReplicaResults: map[string]struct { Result string Error error @@ -2278,14 +2155,6 @@ func TestEmergencyReparenter_promoteNewPrimary(t *testing.T) { PopulateReparentJournalResults: map[string]error{ "zone1-0000000100": nil, }, - PrimaryPositionResults: map[string]struct { - Position string - Error error - }{ - "zone1-0000000100": { - Error: nil, - }, - }, PromoteReplicaResults: map[string]struct { Result string Error error @@ -2339,14 +2208,6 @@ func TestEmergencyReparenter_promoteNewPrimary(t *testing.T) { PopulateReparentJournalResults: map[string]error{ "zone1-0000000100": nil, }, - PrimaryPositionResults: map[string]struct { - Position string - Error error - }{ - "zone1-0000000100": { - Error: nil, - }, - }, InitPrimaryResults: map[string]struct { Result string Error error @@ -2467,7 +2328,7 @@ func TestEmergencyReparenter_promoteNewPrimary(t *testing.T) { tt.emergencyReparentOps.durability = durability erp := NewEmergencyReparenter(ts, tt.tmc, logger) - err := erp.promoteNewPrimary(ctx, ev, tabletInfo.Tablet, tt.emergencyReparentOps, tt.tabletMap, tt.statusMap) + _, err := erp.reparentReplicas(ctx, ev, tabletInfo.Tablet, tt.tabletMap, tt.statusMap, tt.emergencyReparentOps, false) if tt.shouldErr { assert.Error(t, err) assert.Contains(t, err.Error(), tt.errShouldContain) @@ -2744,14 +2605,6 @@ func TestEmergencyReparenterStats(t *testing.T) { Error: nil, }, }, - PrimaryPositionResults: map[string]struct { - Position string - Error error - }{ - "zone1-0000000102": { - Error: nil, - }, - }, SetReplicationSourceResults: map[string]error{ "zone1-0000000100": nil, "zone1-0000000101": nil, @@ -3149,8 +3002,6 @@ func TestEmergencyReparenter_findMostAdvanced(t *testing.T) { } func TestEmergencyReparenter_reparentReplicas(t *testing.T) { - t.Parallel() - tests := []struct { name string emergencyReparentOps EmergencyReparentOptions @@ -3164,6 +3015,7 @@ func TestEmergencyReparenter_reparentReplicas(t *testing.T) { statusMap map[string]*replicationdatapb.StopReplicationStatus shouldErr bool errShouldContain string + remoteOpTimeout time.Duration }{ { name: "success", @@ -3172,9 +3024,9 @@ func TestEmergencyReparenter_reparentReplicas(t *testing.T) { PopulateReparentJournalResults: map[string]error{ "zone1-0000000100": nil, }, - PrimaryPositionResults: map[string]struct { - Position string - Error error + PromoteReplicaResults: map[string]struct { + Result string + Error error }{ "zone1-0000000100": { Error: nil, @@ -3243,12 +3095,12 @@ func TestEmergencyReparenter_reparentReplicas(t *testing.T) { shouldErr: false, }, { - name: "PrimaryPosition error", + name: "PromoteReplica error", emergencyReparentOps: EmergencyReparentOptions{}, tmc: &testutil.TabletManagerClient{ - PrimaryPositionResults: map[string]struct { - Position string - Error error + PromoteReplicaResults: map[string]struct { + Result string + Error error }{ "zone1-0000000100": { Error: fmt.Errorf("primary position error"), @@ -3287,9 +3139,9 @@ func TestEmergencyReparenter_reparentReplicas(t *testing.T) { PopulateReparentJournalResults: map[string]error{ "zone1-0000000100": assert.AnError, }, - PrimaryPositionResults: map[string]struct { - Position string - Error error + PromoteReplicaResults: map[string]struct { + Result string + Error error }{ "zone1-0000000100": { Error: nil, @@ -3328,9 +3180,9 @@ func TestEmergencyReparenter_reparentReplicas(t *testing.T) { PopulateReparentJournalResults: map[string]error{ "zone1-0000000100": nil, }, - PrimaryPositionResults: map[string]struct { - Position string - Error error + PromoteReplicaResults: map[string]struct { + Result string + Error error }{ "zone1-0000000100": { Error: nil, @@ -3383,9 +3235,9 @@ func TestEmergencyReparenter_reparentReplicas(t *testing.T) { PopulateReparentJournalResults: map[string]error{ "zone1-0000000100": nil, }, - PrimaryPositionResults: map[string]struct { - Position string - Error error + PromoteReplicaResults: map[string]struct { + Result string + Error error }{ "zone1-0000000100": { Error: nil, @@ -3441,9 +3293,9 @@ func TestEmergencyReparenter_reparentReplicas(t *testing.T) { PopulateReparentJournalResults: map[string]error{ "zone1-0000000100": nil, }, - PrimaryPositionResults: map[string]struct { - Position string - Error error + PromoteReplicaResults: map[string]struct { + Result string + Error error }{ "zone1-0000000100": { Error: nil, @@ -3492,9 +3344,9 @@ func TestEmergencyReparenter_reparentReplicas(t *testing.T) { PopulateReparentJournalResults: map[string]error{ "zone1-0000000100": nil, }, - PrimaryPositionResults: map[string]struct { - Position string - Error error + PromoteReplicaResults: map[string]struct { + Result string + Error error }{ "zone1-0000000100": { Error: nil, @@ -3528,15 +3380,111 @@ func TestEmergencyReparenter_reparentReplicas(t *testing.T) { shard: "-", shouldErr: false, }, + { + name: "primary promotion gets infinitely stuck", + emergencyReparentOps: EmergencyReparentOptions{}, + tmc: &testutil.TabletManagerClient{ + PopulateReparentJournalResults: map[string]error{ + "zone1-0000000100": nil, + }, + PromoteReplicaResults: map[string]struct { + Result string + Error error + }{ + "zone1-0000000100": { + Error: nil, + }, + }, + PromoteReplicaDelays: map[string]time.Duration{ + "zone1-0000000100": 500 * time.Hour, + }, + SetReplicationSourceResults: map[string]error{ + "zone1-0000000101": nil, + "zone1-0000000102": nil, + }, + }, + remoteOpTimeout: 100 * time.Millisecond, + newPrimaryTabletAlias: "zone1-0000000100", + tabletMap: map[string]*topo.TabletInfo{ + "zone1-0000000100": { + Tablet: &topodatapb.Tablet{ + Alias: &topodatapb.TabletAlias{ + Cell: "zone1", + Uid: 100, + }, + Hostname: "primary-elect", + }, + }, + "zone1-0000000101": { + Tablet: &topodatapb.Tablet{ + Alias: &topodatapb.TabletAlias{ + Cell: "zone1", + Uid: 101, + }, + }, + }, + "zone1-0000000102": { + Tablet: &topodatapb.Tablet{ + Alias: &topodatapb.TabletAlias{ + Cell: "zone1", + Uid: 102, + }, + Hostname: "requires force start", + }, + }, + "zone1-0000000404": { + Tablet: &topodatapb.Tablet{ + Alias: &topodatapb.TabletAlias{ + Cell: "zone1", + Uid: 404, + }, + Hostname: "ignored tablet", + }, + }, + }, + statusMap: map[string]*replicationdatapb.StopReplicationStatus{ + "zone1-0000000101": { // forceStart = false + Before: &replicationdatapb.Status{ + IoState: int32(replication.ReplicationStateStopped), + SqlState: int32(replication.ReplicationStateStopped), + }, + }, + "zone1-0000000102": { // forceStart = true + Before: &replicationdatapb.Status{ + IoState: int32(replication.ReplicationStateRunning), + SqlState: int32(replication.ReplicationStateRunning), + }, + }, + }, + keyspace: "testkeyspace", + shard: "-", + shouldErr: true, + errShouldContain: "failed to promote zone1-0000000100 to primary: primary-elect tablet zone1-0000000100 failed to be upgraded to primary: context deadline exceeded", + }, } durability, _ := GetDurabilityPolicy("none") for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - t.Parallel() + if tt.remoteOpTimeout != 0 { + oldTimeout := topo.RemoteOperationTimeout + topo.RemoteOperationTimeout = tt.remoteOpTimeout + defer func() { + topo.RemoteOperationTimeout = oldTimeout + }() + } logger := logutil.NewMemoryLogger() - ev := &events.Reparent{} + ev := &events.Reparent{ + ShardInfo: topo.ShardInfo{ + Shard: &topodatapb.Shard{ + PrimaryAlias: &topodatapb.TabletAlias{ + Cell: "zone1", + Uid: 000, + }, + }, + }, + } ctx, cancel := context.WithCancel(context.Background()) defer cancel() @@ -3567,7 +3515,7 @@ func TestEmergencyReparenter_reparentReplicas(t *testing.T) { tt.emergencyReparentOps.durability = durability erp := NewEmergencyReparenter(ts, tt.tmc, logger) - _, err := erp.reparentReplicas(ctx, ev, tabletInfo.Tablet, tt.tabletMap, tt.statusMap, tt.emergencyReparentOps, false /* waitForAllReplicas */, true /* populateReparentJournal */) + _, err := erp.reparentReplicas(ctx, ev, tabletInfo.Tablet, tt.tabletMap, tt.statusMap, tt.emergencyReparentOps, false /* intermediateReparent */) if tt.shouldErr { assert.Error(t, err) assert.Contains(t, err.Error(), tt.errShouldContain) @@ -3605,14 +3553,6 @@ func TestEmergencyReparenter_promoteIntermediateSource(t *testing.T) { PopulateReparentJournalResults: map[string]error{ "zone1-0000000100": nil, }, - PrimaryPositionResults: map[string]struct { - Position string - Error error - }{ - "zone1-0000000100": { - Error: nil, - }, - }, SetReplicationSourceResults: map[string]error{ "zone1-0000000101": nil, "zone1-0000000102": nil, @@ -3730,14 +3670,6 @@ func TestEmergencyReparenter_promoteIntermediateSource(t *testing.T) { PopulateReparentJournalResults: map[string]error{ "zone1-0000000100": nil, }, - PrimaryPositionResults: map[string]struct { - Position string - Error error - }{ - "zone1-0000000100": { - Error: nil, - }, - }, SetReplicationSourceResults: map[string]error{ "zone1-0000000101": nil, }, @@ -3826,14 +3758,6 @@ func TestEmergencyReparenter_promoteIntermediateSource(t *testing.T) { PopulateReparentJournalResults: map[string]error{ "zone1-0000000100": nil, }, - PrimaryPositionResults: map[string]struct { - Position string - Error error - }{ - "zone1-0000000100": { - Error: nil, - }, - }, SetReplicationSourceResults: map[string]error{ "zone1-0000000101": fmt.Errorf("An error"), }, @@ -3902,15 +3826,6 @@ func TestEmergencyReparenter_promoteIntermediateSource(t *testing.T) { PopulateReparentJournalResults: map[string]error{ "zone1-0000000100": nil, }, - PrimaryPositionResults: map[string]struct { - Position string - Error error - }{ - "zone1-0000000100": { - Error: nil, - }, - }, - SetReplicationSourceResults: map[string]error{ // everyone fails, we all fail "zone1-0000000101": assert.AnError, @@ -3978,14 +3893,6 @@ func TestEmergencyReparenter_promoteIntermediateSource(t *testing.T) { PopulateReparentJournalResults: map[string]error{ "zone1-0000000100": nil, }, - PrimaryPositionResults: map[string]struct { - Position string - Error error - }{ - "zone1-0000000100": { - Error: nil, - }, - }, SetReplicationSourceResults: map[string]error{ "zone1-0000000101": nil, // this one succeeds, so we're good "zone1-0000000102": assert.AnError, @@ -4064,14 +3971,6 @@ func TestEmergencyReparenter_promoteIntermediateSource(t *testing.T) { PopulateReparentJournalResults: map[string]error{ "zone1-0000000100": nil, }, - PrimaryPositionResults: map[string]struct { - Position string - Error error - }{ - "zone1-0000000100": { - Error: nil, - }, - }, SetReplicationSourceResults: map[string]error{ "zone1-0000000101": nil, "zone1-0000000102": nil, @@ -4423,14 +4322,6 @@ func TestParentContextCancelled(t *testing.T) { emergencyReparentOps := EmergencyReparentOptions{IgnoreReplicas: sets.New[string]("zone1-0000000404"), WaitReplicasTimeout: time.Minute, durability: durability} // Make the replica tablet return its results after 3 seconds tmc := &testutil.TabletManagerClient{ - PrimaryPositionResults: map[string]struct { - Position string - Error error - }{ - "zone1-0000000100": { - Error: nil, - }, - }, SetReplicationSourceResults: map[string]error{ "zone1-0000000101": nil, }, @@ -4490,7 +4381,7 @@ func TestParentContextCancelled(t *testing.T) { time.Sleep(time.Second) cancel() }() - _, err = erp.reparentReplicas(ctx, ev, tabletMap[newPrimaryTabletAlias].Tablet, tabletMap, statusMap, emergencyReparentOps, false, false) + _, err = erp.reparentReplicas(ctx, ev, tabletMap[newPrimaryTabletAlias].Tablet, tabletMap, statusMap, emergencyReparentOps, true) require.NoError(t, err) }