Skip to content

Commit

Permalink
do not demote a new primary after backup completion (vitessio#12856)
Browse files Browse the repository at this point in the history
* do not demote a new primary after backup completion

Signed-off-by: Olga Shestopalova <oshestopalova@hubspot.com>

* move set replication source out of vtctld and into tablet

Signed-off-by: Olga Shestopalova <oshestopalova@hubspot.com>

* remove set replication source in vtctld

Signed-off-by: Olga Shestopalova <oshestopalova@hubspot.com>

* fix ctx, and wip test

Signed-off-by: Olga Shestopalova <oshestopalova@hubspot.com>

* add e2e test instead of unit test

Signed-off-by: Olga Shestopalova <oshestopalova@hubspot.com>

* use built in methods

Signed-off-by: Olga Shestopalova <oshestopalova@hubspot.com>

* omit this in test

Signed-off-by: Olga Shestopalova <oshestopalova@hubspot.com>

* Update go/vt/vttablet/tabletmanager/rpc_backup.go

Co-authored-by: Shlomi Noach <2607934+shlomi-noach@users.noreply.github.com>
Signed-off-by: Olga Shestopalova <olgash@mit.edu>
Signed-off-by: Olga Shestopalova <oshestopalova@hubspot.com>

* move wg done to top of func

Signed-off-by: Olga Shestopalova <oshestopalova@hubspot.com>

* fix method change

Signed-off-by: Olga Shestopalova <oshestopalova@hubspot.com>
Signed-off-by: <oshestopalova@hubspot.com>
Signed-off-by: Olga Shestopalova <oshestopalova@hubspot.com>

* try running this in isolation

Signed-off-by: Olga Shestopalova <oshestopalova@hubspot.com>

* try running this in isolation, dont run with others

Signed-off-by: Olga Shestopalova <oshestopalova@hubspot.com>

* add debug logging

Signed-off-by: Olga Shestopalova <oshestopalova@hubspot.com>

* add debug logging

Signed-off-by: Olga Shestopalova <oshestopalova@hubspot.com>

* Update go/test/endtoend/backup/vtctlbackup/backup_utils.go

Co-authored-by: Manan Gupta <35839558+GuptaManan100@users.noreply.github.com>
Signed-off-by: Olga Shestopalova <olgash@mit.edu>

* Update go/test/endtoend/backup/vtctlbackup/backup_utils.go

Co-authored-by: Manan Gupta <35839558+GuptaManan100@users.noreply.github.com>
Signed-off-by: Olga Shestopalova <olgash@mit.edu>

* run goimports

Signed-off-by: Olga Shestopalova <oshestopalova@hubspot.com>

* remove dupe

Signed-off-by: Olga Shestopalova <oshestopalova@hubspot.com>

* remove generated file

Signed-off-by: Olga Shestopalova <oshestopalova@hubspot.com>

---------

Signed-off-by: Olga Shestopalova <oshestopalova@hubspot.com>
Signed-off-by: Olga Shestopalova <olgash@mit.edu>
Signed-off-by: <oshestopalova@hubspot.com>
Co-authored-by: Olga Shestopalova <oshestopalova@hubspot.com>
Co-authored-by: Shlomi Noach <2607934+shlomi-noach@users.noreply.github.com>
Co-authored-by: Manan Gupta <35839558+GuptaManan100@users.noreply.github.com>
  • Loading branch information
4 people authored and tanjinx committed Jun 24, 2024
1 parent 0474726 commit 442af43
Show file tree
Hide file tree
Showing 4 changed files with 150 additions and 16 deletions.
64 changes: 64 additions & 0 deletions go/test/endtoend/backup/vtctlbackup/backup_utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ import (
"os/exec"
"path"
"strings"
"sync"
"syscall"
"testing"
"time"
Expand All @@ -36,6 +37,7 @@ import (
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"

"vitess.io/vitess/go/json2"
"vitess.io/vitess/go/test/endtoend/cluster"
)

Expand Down Expand Up @@ -329,6 +331,10 @@ func TestBackup(t *testing.T, setupType int, streamMode string, stripes int, cDe
name: "TestTerminatedRestore",
method: terminatedRestore,
}, //
{
name: "DoNotDemoteNewlyPromotedPrimaryIfReparentingDuringBackup",
method: doNotDemoteNewlyPromotedPrimaryIfReparentingDuringBackup,
}, //
}

defer cluster.PanicHandler(t)
Expand All @@ -344,6 +350,10 @@ func TestBackup(t *testing.T, setupType int, streamMode string, stripes int, cDe
if len(runSpecific) > 0 && !isRegistered(test.name, runSpecific) {
continue
}
// don't run this one unless specified
if len(runSpecific) == 0 && test.name == "DoNotDemoteNewlyPromotedPrimaryIfReparentingDuringBackup" {
continue
}
if retVal := t.Run(test.name, test.method); !retVal {
return vterrors.Errorf(vtrpc.Code_UNKNOWN, "test failure: %s", test.name)
}
Expand Down Expand Up @@ -749,6 +759,60 @@ func terminatedRestore(t *testing.T) {
stopAllTablets()
}

func checkTabletType(t *testing.T, alias string, tabletType topodata.TabletType) {
// for loop for 15 seconds to check if tablet type is correct
for i := 0; i < 15; i++ {
output, err := localCluster.VtctldClientProcess.ExecuteCommandWithOutput("GetTablet", alias)
require.Nil(t, err)
var tabletPB topodata.Tablet
err = json2.Unmarshal([]byte(output), &tabletPB)
require.NoError(t, err)
if tabletType == tabletPB.Type {
return
}
time.Sleep(1 * time.Second)
}
require.Failf(t, "checkTabletType failed.", "Tablet type is not correct. Expected: %v", tabletType)
}

func doNotDemoteNewlyPromotedPrimaryIfReparentingDuringBackup(t *testing.T) {
var wg sync.WaitGroup
wg.Add(2)

// Start the backup on a replica
go func() {
defer wg.Done()
// ensure this is a primary first
checkTabletType(t, primary.Alias, topodata.TabletType_PRIMARY)

// now backup
err := localCluster.VtctlclientProcess.ExecuteCommand("Backup", replica1.Alias)
require.Nil(t, err)
}()

// Perform a graceful reparent operation
go func() {
defer wg.Done()
// ensure this is a primary first
checkTabletType(t, primary.Alias, topodata.TabletType_PRIMARY)

// now reparent
_, err := localCluster.VtctlclientProcess.ExecuteCommandWithOutput(
"PlannedReparentShard", "--",
"--keyspace_shard", fmt.Sprintf("%s/%s", keyspaceName, shardName),
"--new_primary", replica1.Alias)
require.Nil(t, err)

// check that we reparented
checkTabletType(t, replica1.Alias, topodata.TabletType_PRIMARY)
}()

wg.Wait()

// check that this is still a primary
checkTabletType(t, replica1.Alias, topodata.TabletType_PRIMARY)
}

// test_backup will:
// - create a shard with primary and replica1 only
// - run InitShardPrimary
Expand Down
28 changes: 28 additions & 0 deletions go/test/endtoend/backup/xtrabackup/xtrabackup_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,34 @@ func TestXtrabackWithZstdCompression(t *testing.T) {
backup.TestBackup(t, backup.XtraBackup, "tar", 0, cDetails, []string{"TestReplicaBackup"})
}

func TestXtrabackupWithExternalZstdCompression(t *testing.T) {
defer setDefaultCompressionFlag()
cDetails := &backup.CompressionDetails{
CompressorEngineName: "external",
ExternalCompressorCmd: "zstd",
ExternalCompressorExt: ".zst",
ExternalDecompressorCmd: "zstd -d",
}

backup.TestBackup(t, backup.XtraBackup, "tar", 0, cDetails, []string{"TestReplicaBackup"})
}

func TestXtrabackupWithExternalZstdCompressionAndManifestedDecompressor(t *testing.T) {
defer setDefaultCompressionFlag()
cDetails := &backup.CompressionDetails{
CompressorEngineName: "external",
ExternalCompressorCmd: "zstd",
ExternalCompressorExt: ".zst",
ExternalDecompressorCmd: "zstd -d",
}

backup.TestBackup(t, backup.XtraBackup, "tar", 0, cDetails, []string{"TestReplicaBackup"})
}

func TestDoNotDemoteNewlyPromotedPrimaryIfReparentingDuringBackup(t *testing.T) {
backup.TestBackup(t, backup.XtraBackup, "xbstream", 0, nil, []string{"DoNotDemoteNewlyPromotedPrimaryIfReparentingDuringBackup"})
}

func setDefaultCompressionFlag() {
mysqlctl.CompressionEngineName = "pgzip"
mysqlctl.ExternalCompressorCmd = ""
Expand Down
15 changes: 1 addition & 14 deletions go/vt/vtctl/grpcvtctldserver/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -478,20 +478,7 @@ func (s *VtctldServer) backupTablet(ctx context.Context, tablet *topodatapb.Tabl
logger.Errorf("failed to send stream response %+v: %v", resp, err)
}
case io.EOF:
// Do not do anything for primary tablets and when active reparenting is disabled
if mysqlctl.DisableActiveReparents || tablet.Type == topodatapb.TabletType_PRIMARY {
return nil
}

// Otherwise we find the correct primary tablet and set the replication source,
// since the primary could have changed while we executed the backup which can
// also affect whether we want to send semi sync acks or not.
tabletInfo, err := s.ts.GetTablet(ctx, tablet.Alias)
if err != nil {
return err
}

return reparentutil.SetReplicationSource(ctx, s.ts, s.tmc, tabletInfo.Tablet)
return nil
default:
return err
}
Expand Down
59 changes: 57 additions & 2 deletions go/vt/vttablet/tabletmanager/rpc_backup.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,9 @@ import (

"context"

"vitess.io/vitess/go/vt/topotools"
"vitess.io/vitess/go/vt/vtctl/reparentutil"

"vitess.io/vitess/go/vt/logutil"
"vitess.io/vitess/go/vt/mysqlctl"
"vitess.io/vitess/go/vt/topo/topoproto"
Expand Down Expand Up @@ -73,6 +76,9 @@ func (tm *TabletManager) Backup(ctx context.Context, logger logutil.Logger, req
}
defer tm.endBackup(backupMode)

// create the loggers: tee to console and source
l := logutil.NewTeeLogger(logutil.NewConsoleLogger(), logger)

var originalType topodatapb.TabletType
if engine.ShouldDrainForBackup() {
if err := tm.lock(ctx); err != nil {
Expand All @@ -89,6 +95,7 @@ func (tm *TabletManager) Backup(ctx context.Context, logger logutil.Logger, req
if err := tm.changeTypeLocked(ctx, topodatapb.TabletType_BACKUP, DBActionNone, SemiSyncActionUnset); err != nil {
return err
}

// Tell Orchestrator we're stopped on purpose for some Vitess task.
// Do this in the background, as it's best-effort.
go func() {
Expand All @@ -99,9 +106,57 @@ func (tm *TabletManager) Backup(ctx context.Context, logger logutil.Logger, req
logger.Warningf("Orchestrator BeginMaintenance failed: %v", err)
}
}()

// Adding defer to original value in case of any failures.
defer func() {
bgCtx := context.Background()
// Change our type back to the original value.
// Original type could be primary so pass in a real value for PrimaryTermStartTime
if err := tm.changeTypeLocked(bgCtx, originalType, DBActionNone, SemiSyncActionNone); err != nil {
l.Errorf("Failed to change tablet type from %v to %v, error: %v", topodatapb.TabletType_BACKUP, originalType, err)
return
}

// Find the correct primary tablet and set the replication source,
// since the primary could have changed while we executed the backup which can
// also affect whether we want to send semi sync acks or not.
tabletInfo, err := tm.TopoServer.GetTablet(bgCtx, tablet.Alias)
if err != nil {
l.Errorf("Failed to fetch updated tablet info, error: %v", err)
return
}

// Do not do anything for primary tablets or when active reparenting is disabled
if mysqlctl.DisableActiveReparents || tabletInfo.Type == topodatapb.TabletType_PRIMARY {
return
}

shardPrimary, err := topotools.GetShardPrimaryForTablet(bgCtx, tm.TopoServer, tablet.Tablet)
if err != nil {
return
}

durabilityName, err := tm.TopoServer.GetKeyspaceDurability(bgCtx, tablet.Keyspace)
if err != nil {
l.Errorf("Failed to get durability policy, error: %v", err)
return
}
durability, err := reparentutil.GetDurabilityPolicy(durabilityName)
if err != nil {
l.Errorf("Failed to get durability with name %v, error: %v", durabilityName, err)
}

isSemiSync := reparentutil.IsReplicaSemiSync(durability, shardPrimary.Tablet, tabletInfo.Tablet)
semiSyncAction, err := tm.convertBoolToSemiSyncAction(isSemiSync)
if err != nil {
l.Errorf("Failed to convert bool to semisync action, error: %v", err)
return
}
if err := tm.setReplicationSourceLocked(bgCtx, shardPrimary.Alias, 0, "", false, semiSyncAction); err != nil {
l.Errorf("Failed to set replication source, error: %v", err)
}
}()
}
// create the loggers: tee to console and source
l := logutil.NewTeeLogger(logutil.NewConsoleLogger(), logger)

// now we can run the backup
backupParams := mysqlctl.BackupParams{
Expand Down

0 comments on commit 442af43

Please sign in to comment.