From a1edaee10af2f8b6ae4d3b737f5b2875c6c9b39f Mon Sep 17 00:00:00 2001 From: Manan Gupta <35839558+GuptaManan100@users.noreply.github.com> Date: Thu, 23 May 2024 00:12:22 +0530 Subject: [PATCH] Add timeout to all the contexts used for RPC calls in vtorc (#15991) Signed-off-by: Manan Gupta --- .../testutil/test_tmclient.go | 16 +- go/vt/vtorc/logic/tablet_discovery.go | 20 +- go/vt/vtorc/logic/tablet_discovery_test.go | 259 ++++++++++++++++++ 3 files changed, 289 insertions(+), 6 deletions(-) diff --git a/go/vt/vtctl/grpcvtctldserver/testutil/test_tmclient.go b/go/vt/vtctl/grpcvtctldserver/testutil/test_tmclient.go index 3f13a0d9ff9..9f10ab6c04c 100644 --- a/go/vt/vtctl/grpcvtctldserver/testutil/test_tmclient.go +++ b/go/vt/vtctl/grpcvtctldserver/testutil/test_tmclient.go @@ -186,6 +186,7 @@ type TabletManagerClient struct { } // keyed by tablet alias. ChangeTabletTypeResult map[string]error + ChangeTabletTypeDelays map[string]time.Duration // keyed by tablet alias. DemotePrimaryDelays map[string]time.Duration // keyed by tablet alias. @@ -468,7 +469,20 @@ func (fake *TabletManagerClient) Backup(ctx context.Context, tablet *topodatapb. // ChangeType is part of the tmclient.TabletManagerClient interface. func (fake *TabletManagerClient) ChangeType(ctx context.Context, tablet *topodatapb.Tablet, newType topodatapb.TabletType, semiSync bool) error { - if result, ok := fake.ChangeTabletTypeResult[topoproto.TabletAliasString(tablet.Alias)]; ok { + key := topoproto.TabletAliasString(tablet.Alias) + + if fake.ChangeTabletTypeDelays != nil { + if delay, ok := fake.ChangeTabletTypeDelays[key]; ok { + select { + case <-ctx.Done(): + return ctx.Err() + case <-time.After(delay): + // proceed to results + } + } + } + + if result, ok := fake.ChangeTabletTypeResult[key]; ok { return result } diff --git a/go/vt/vtorc/logic/tablet_discovery.go b/go/vt/vtorc/logic/tablet_discovery.go index acfc7efaae9..593b846a72e 100644 --- a/go/vt/vtorc/logic/tablet_discovery.go +++ b/go/vt/vtorc/logic/tablet_discovery.go @@ -285,27 +285,37 @@ func LockShard(ctx context.Context, tabletAlias string, lockAction string) (cont // tabletUndoDemotePrimary calls the said RPC for the given tablet. func tabletUndoDemotePrimary(ctx context.Context, tablet *topodatapb.Tablet, semiSync bool) error { - return tmc.UndoDemotePrimary(ctx, tablet, semiSync) + tmcCtx, tmcCancel := context.WithTimeout(ctx, topo.RemoteOperationTimeout) + defer tmcCancel() + return tmc.UndoDemotePrimary(tmcCtx, tablet, semiSync) } // setReadOnly calls the said RPC for the given tablet func setReadOnly(ctx context.Context, tablet *topodatapb.Tablet) error { - return tmc.SetReadOnly(ctx, tablet) + tmcCtx, tmcCancel := context.WithTimeout(ctx, topo.RemoteOperationTimeout) + defer tmcCancel() + return tmc.SetReadOnly(tmcCtx, tablet) } // changeTabletType calls the said RPC for the given tablet with the given parameters. func changeTabletType(ctx context.Context, tablet *topodatapb.Tablet, tabletType topodatapb.TabletType, semiSync bool) error { - return tmc.ChangeType(ctx, tablet, tabletType, semiSync) + tmcCtx, tmcCancel := context.WithTimeout(ctx, topo.RemoteOperationTimeout) + defer tmcCancel() + return tmc.ChangeType(tmcCtx, tablet, tabletType, semiSync) } // resetReplicationParameters resets the replication parameters on the given tablet. func resetReplicationParameters(ctx context.Context, tablet *topodatapb.Tablet) error { - return tmc.ResetReplicationParameters(ctx, tablet) + tmcCtx, tmcCancel := context.WithTimeout(ctx, topo.RemoteOperationTimeout) + defer tmcCancel() + return tmc.ResetReplicationParameters(tmcCtx, tablet) } // setReplicationSource calls the said RPC with the parameters provided func setReplicationSource(ctx context.Context, replica *topodatapb.Tablet, primary *topodatapb.Tablet, semiSync bool, heartbeatInterval float64) error { - return tmc.SetReplicationSource(ctx, replica, primary.Alias, 0, "", true, semiSync, heartbeatInterval) + tmcCtx, tmcCancel := context.WithTimeout(ctx, topo.RemoteOperationTimeout) + defer tmcCancel() + return tmc.SetReplicationSource(tmcCtx, replica, primary.Alias, 0, "", true, semiSync, heartbeatInterval) } // shardPrimary finds the primary of the given keyspace-shard by reading the vtorc backend diff --git a/go/vt/vtorc/logic/tablet_discovery_test.go b/go/vt/vtorc/logic/tablet_discovery_test.go index d7e645ba66d..7acb29dcc5b 100644 --- a/go/vt/vtorc/logic/tablet_discovery_test.go +++ b/go/vt/vtorc/logic/tablet_discovery_test.go @@ -21,6 +21,7 @@ import ( "fmt" "sync/atomic" "testing" + "time" "github.com/google/go-cmp/cmp" "github.com/stretchr/testify/assert" @@ -30,8 +31,10 @@ import ( "vitess.io/vitess/go/vt/external/golib/sqlutils" topodatapb "vitess.io/vitess/go/vt/proto/topodata" "vitess.io/vitess/go/vt/proto/vttime" + "vitess.io/vitess/go/vt/topo" "vitess.io/vitess/go/vt/topo/memorytopo" "vitess.io/vitess/go/vt/topo/topoproto" + "vitess.io/vitess/go/vt/vtctl/grpcvtctldserver/testutil" "vitess.io/vitess/go/vt/vtorc/db" "vitess.io/vitess/go/vt/vtorc/inst" "vitess.io/vitess/go/vt/vtorc/process" @@ -362,3 +365,259 @@ func TestProcessHealth(t *testing.T) { _, discoveredOnce = process.HealthTest() require.True(t, discoveredOnce) } + +func TestSetReadOnly(t *testing.T) { + tests := []struct { + name string + tablet *topodatapb.Tablet + tmc *testutil.TabletManagerClient + remoteOpTimeout time.Duration + errShouldContain string + }{ + { + name: "Success", + tablet: tab100, + tmc: &testutil.TabletManagerClient{ + SetReadOnlyResults: map[string]error{ + "zone-1-0000000100": nil, + }, + }, + }, { + name: "Failure", + tablet: tab100, + tmc: &testutil.TabletManagerClient{ + SetReadOnlyResults: map[string]error{ + "zone-1-0000000100": fmt.Errorf("testing error"), + }, + }, + errShouldContain: "testing error", + }, { + name: "Timeout", + tablet: tab100, + remoteOpTimeout: 100 * time.Millisecond, + tmc: &testutil.TabletManagerClient{ + SetReadOnlyResults: map[string]error{ + "zone-1-0000000100": nil, + }, + SetReadOnlyDelays: map[string]time.Duration{ + "zone-1-0000000100": 200 * time.Millisecond, + }, + }, + errShouldContain: "context deadline exceeded", + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + oldTmc := tmc + oldRemoteOpTimeout := topo.RemoteOperationTimeout + defer func() { + tmc = oldTmc + topo.RemoteOperationTimeout = oldRemoteOpTimeout + }() + + tmc = tt.tmc + if tt.remoteOpTimeout != 0 { + topo.RemoteOperationTimeout = tt.remoteOpTimeout + } + + err := setReadOnly(context.Background(), tt.tablet) + if tt.errShouldContain == "" { + require.NoError(t, err) + return + } + require.ErrorContains(t, err, tt.errShouldContain) + }) + } +} + +func TestTabletUndoDemotePrimary(t *testing.T) { + tests := []struct { + name string + tablet *topodatapb.Tablet + tmc *testutil.TabletManagerClient + remoteOpTimeout time.Duration + errShouldContain string + }{ + { + name: "Success", + tablet: tab100, + tmc: &testutil.TabletManagerClient{ + UndoDemotePrimaryResults: map[string]error{ + "zone-1-0000000100": nil, + }, + }, + }, { + name: "Failure", + tablet: tab100, + tmc: &testutil.TabletManagerClient{ + UndoDemotePrimaryResults: map[string]error{ + "zone-1-0000000100": fmt.Errorf("testing error"), + }, + }, + errShouldContain: "testing error", + }, { + name: "Timeout", + tablet: tab100, + remoteOpTimeout: 100 * time.Millisecond, + tmc: &testutil.TabletManagerClient{ + UndoDemotePrimaryResults: map[string]error{ + "zone-1-0000000100": nil, + }, + UndoDemotePrimaryDelays: map[string]time.Duration{ + "zone-1-0000000100": 200 * time.Millisecond, + }, + }, + errShouldContain: "context deadline exceeded", + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + oldTmc := tmc + oldRemoteOpTimeout := topo.RemoteOperationTimeout + defer func() { + tmc = oldTmc + topo.RemoteOperationTimeout = oldRemoteOpTimeout + }() + + tmc = tt.tmc + if tt.remoteOpTimeout != 0 { + topo.RemoteOperationTimeout = tt.remoteOpTimeout + } + + err := tabletUndoDemotePrimary(context.Background(), tt.tablet, false) + if tt.errShouldContain == "" { + require.NoError(t, err) + return + } + require.ErrorContains(t, err, tt.errShouldContain) + }) + } +} + +func TestChangeTabletType(t *testing.T) { + tests := []struct { + name string + tablet *topodatapb.Tablet + tmc *testutil.TabletManagerClient + remoteOpTimeout time.Duration + errShouldContain string + }{ + { + name: "Success", + tablet: tab100, + tmc: &testutil.TabletManagerClient{ + ChangeTabletTypeResult: map[string]error{ + "zone-1-0000000100": nil, + }, + }, + }, { + name: "Failure", + tablet: tab100, + tmc: &testutil.TabletManagerClient{ + ChangeTabletTypeResult: map[string]error{ + "zone-1-0000000100": fmt.Errorf("testing error"), + }, + }, + errShouldContain: "testing error", + }, { + name: "Timeout", + tablet: tab100, + remoteOpTimeout: 100 * time.Millisecond, + tmc: &testutil.TabletManagerClient{ + ChangeTabletTypeResult: map[string]error{ + "zone-1-0000000100": nil, + }, + ChangeTabletTypeDelays: map[string]time.Duration{ + "zone-1-0000000100": 200 * time.Millisecond, + }, + }, + errShouldContain: "context deadline exceeded", + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + oldTmc := tmc + oldRemoteOpTimeout := topo.RemoteOperationTimeout + defer func() { + tmc = oldTmc + topo.RemoteOperationTimeout = oldRemoteOpTimeout + }() + + tmc = tt.tmc + if tt.remoteOpTimeout != 0 { + topo.RemoteOperationTimeout = tt.remoteOpTimeout + } + + err := changeTabletType(context.Background(), tt.tablet, topodatapb.TabletType_REPLICA, false) + if tt.errShouldContain == "" { + require.NoError(t, err) + return + } + require.ErrorContains(t, err, tt.errShouldContain) + }) + } +} + +func TestSetReplicationSource(t *testing.T) { + tests := []struct { + name string + tablet *topodatapb.Tablet + tmc *testutil.TabletManagerClient + remoteOpTimeout time.Duration + errShouldContain string + }{ + { + name: "Success", + tablet: tab100, + tmc: &testutil.TabletManagerClient{ + SetReplicationSourceResults: map[string]error{ + "zone-1-0000000100": nil, + }, + }, + }, { + name: "Failure", + tablet: tab100, + tmc: &testutil.TabletManagerClient{ + SetReplicationSourceResults: map[string]error{ + "zone-1-0000000100": fmt.Errorf("testing error"), + }, + }, + errShouldContain: "testing error", + }, { + name: "Timeout", + tablet: tab100, + remoteOpTimeout: 100 * time.Millisecond, + tmc: &testutil.TabletManagerClient{ + SetReplicationSourceResults: map[string]error{ + "zone-1-0000000100": nil, + }, + SetReplicationSourceDelays: map[string]time.Duration{ + "zone-1-0000000100": 200 * time.Millisecond, + }, + }, + errShouldContain: "context deadline exceeded", + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + oldTmc := tmc + oldRemoteOpTimeout := topo.RemoteOperationTimeout + defer func() { + tmc = oldTmc + topo.RemoteOperationTimeout = oldRemoteOpTimeout + }() + + tmc = tt.tmc + if tt.remoteOpTimeout != 0 { + topo.RemoteOperationTimeout = tt.remoteOpTimeout + } + + err := setReplicationSource(context.Background(), tt.tablet, tab101, false, 0) + if tt.errShouldContain == "" { + require.NoError(t, err) + return + } + require.ErrorContains(t, err, tt.errShouldContain) + }) + } +}