From 6a181783b66b0fffd80a5250863b01e824a5acd4 Mon Sep 17 00:00:00 2001 From: Manan Gupta <35839558+GuptaManan100@users.noreply.github.com> Date: Tue, 26 Mar 2024 09:53:24 +0530 Subject: [PATCH] Filter tablet map using valid candidates before reparenting to intermediate source (#15540) --- .../reparentutil/emergency_reparenter.go | 11 +- .../reparentutil/emergency_reparenter_test.go | 180 +++++++++++++++++- 2 files changed, 188 insertions(+), 3 deletions(-) diff --git a/go/vt/vtctl/reparentutil/emergency_reparenter.go b/go/vt/vtctl/reparentutil/emergency_reparenter.go index 607fec700b5..631ea709016 100644 --- a/go/vt/vtctl/reparentutil/emergency_reparenter.go +++ b/go/vt/vtctl/reparentutil/emergency_reparenter.go @@ -449,9 +449,16 @@ func (erp *EmergencyReparenter) promoteIntermediateSource( validCandidateTablets []*topodatapb.Tablet, opts EmergencyReparentOptions, ) ([]*topodatapb.Tablet, error) { - // we reparent all the other tablets to start replication from our new source + // Create a tablet map from all the valid replicas + validTabletMap := map[string]*topo.TabletInfo{} + for _, candidate := range validCandidateTablets { + alias := topoproto.TabletAliasString(candidate.Alias) + validTabletMap[alias] = tabletMap[alias] + } + + // we reparent all the other valid tablets to start replication from our new source // we wait for all the replicas so that we can choose a better candidate from the ones that started replication later - reachableTablets, err := erp.reparentReplicas(ctx, ev, source, tabletMap, statusMap, opts, true /* waitForAllReplicas */, false /* populateReparentJournal */) + reachableTablets, err := erp.reparentReplicas(ctx, ev, source, validTabletMap, statusMap, opts, true /* waitForAllReplicas */, false /* populateReparentJournal */) if err != nil { return nil, err } diff --git a/go/vt/vtctl/reparentutil/emergency_reparenter_test.go b/go/vt/vtctl/reparentutil/emergency_reparenter_test.go index 8e793bcf6d5..8976a8d1b8f 100644 --- a/go/vt/vtctl/reparentutil/emergency_reparenter_test.go +++ b/go/vt/vtctl/reparentutil/emergency_reparenter_test.go @@ -3715,6 +3715,184 @@ func TestEmergencyReparenter_promoteIntermediateSource(t *testing.T) { }, Hostname: "requires force start", }, + { + Alias: &topodatapb.TabletAlias{ + Cell: "zone1", + Uid: 404, + }, + }, + }, + }, + { + name: "success - filter with valid tablets before", + emergencyReparentOps: EmergencyReparentOptions{}, + tmc: &testutil.TabletManagerClient{ + PopulateReparentJournalResults: map[string]error{ + "zone1-0000000100": nil, + }, + PrimaryPositionResults: map[string]struct { + Position string + Error error + }{ + "zone1-0000000100": { + Error: nil, + }, + }, + SetReplicationSourceResults: map[string]error{ + "zone1-0000000101": nil, + }, + }, + newSourceTabletAlias: "zone1-0000000100", + tabletMap: map[string]*topo.TabletInfo{ + "zone1-0000000100": { + Tablet: &topodatapb.Tablet{ + Alias: &topodatapb.TabletAlias{ + Cell: "zone1", + Uid: 100, + }, + Hostname: "primary-elect", + }, + }, + "zone1-0000000101": { + Tablet: &topodatapb.Tablet{ + Alias: &topodatapb.TabletAlias{ + Cell: "zone1", + Uid: 101, + }, + }, + }, + "zone1-0000000102": { + Tablet: &topodatapb.Tablet{ + Alias: &topodatapb.TabletAlias{ + Cell: "zone1", + Uid: 102, + }, + Hostname: "requires force start", + }, + }, + "zone1-0000000404": { + Tablet: &topodatapb.Tablet{ + Alias: &topodatapb.TabletAlias{ + Cell: "zone1", + Uid: 404, + }, + Hostname: "ignored tablet", + }, + }, + }, + statusMap: map[string]*replicationdatapb.StopReplicationStatus{ + "zone1-0000000101": { // forceStart = false + Before: &replicationdatapb.Status{ + IoState: int32(replication.ReplicationStateStopped), + SqlState: int32(replication.ReplicationStateStopped), + }, + }, + }, + keyspace: "testkeyspace", + shard: "-", + shouldErr: false, + result: []*topodatapb.Tablet{ + { + Alias: &topodatapb.TabletAlias{ + Cell: "zone1", + Uid: 100, + }, + Hostname: "primary-elect", + }, { + Alias: &topodatapb.TabletAlias{ + Cell: "zone1", + Uid: 101, + }, + }, + }, + validCandidateTablets: []*topodatapb.Tablet{ + { + Alias: &topodatapb.TabletAlias{ + Cell: "zone1", + Uid: 100, + }, + Hostname: "primary-elect", + }, { + Alias: &topodatapb.TabletAlias{ + Cell: "zone1", + Uid: 101, + }, + }, + }, + }, { + name: "success - only 2 tablets and they error", + emergencyReparentOps: EmergencyReparentOptions{}, + tmc: &testutil.TabletManagerClient{ + PopulateReparentJournalResults: map[string]error{ + "zone1-0000000100": nil, + }, + PrimaryPositionResults: map[string]struct { + Position string + Error error + }{ + "zone1-0000000100": { + Error: nil, + }, + }, + SetReplicationSourceResults: map[string]error{ + "zone1-0000000101": fmt.Errorf("An error"), + }, + }, + newSourceTabletAlias: "zone1-0000000100", + tabletMap: map[string]*topo.TabletInfo{ + "zone1-0000000100": { + Tablet: &topodatapb.Tablet{ + Alias: &topodatapb.TabletAlias{ + Cell: "zone1", + Uid: 100, + }, + Hostname: "primary-elect", + }, + }, + "zone1-0000000101": { + Tablet: &topodatapb.Tablet{ + Alias: &topodatapb.TabletAlias{ + Cell: "zone1", + Uid: 101, + }, + }, + }, + "zone1-0000000102": { + Tablet: &topodatapb.Tablet{ + Alias: &topodatapb.TabletAlias{ + Cell: "zone1", + Uid: 102, + }, + Hostname: "requires force start", + }, + }, + }, + statusMap: map[string]*replicationdatapb.StopReplicationStatus{}, + keyspace: "testkeyspace", + shard: "-", + shouldErr: false, + result: []*topodatapb.Tablet{ + { + Alias: &topodatapb.TabletAlias{ + Cell: "zone1", + Uid: 100, + }, + Hostname: "primary-elect", + }, + }, + validCandidateTablets: []*topodatapb.Tablet{ + { + Alias: &topodatapb.TabletAlias{ + Cell: "zone1", + Uid: 100, + }, + Hostname: "primary-elect", + }, { + Alias: &topodatapb.TabletAlias{ + Cell: "zone1", + Uid: 101, + }, + }, }, }, { @@ -3757,7 +3935,7 @@ func TestEmergencyReparenter_promoteIntermediateSource(t *testing.T) { }, }, }, - "zone1-00000000102": { + "zone1-0000000102": { Tablet: &topodatapb.Tablet{ Alias: &topodatapb.TabletAlias{ Cell: "zone1",