From 1e32824ec64abbd18f60a1faefd549895e0a1dd1 Mon Sep 17 00:00:00 2001
From: Manan Gupta <35839558+GuptaManan100@users.noreply.github.com>
Date: Wed, 19 Jul 2023 09:54:51 +0530
Subject: [PATCH] [release-15.0]: Fix flakiness in VTOrc tests (#13489) (#13529)

Signed-off-by: Manan Gupta
---
 go/test/endtoend/cluster/vtorc_process.go     | 22 ++++++++++--------
 go/test/endtoend/vtorc/general/vtorc_test.go  |  9 ++++++++
 .../primaryfailure/primary_failure_test.go    | 14 ++++++++++-
 .../vtorc/readtopologyinstance/main_test.go   |  3 ++-
 go/test/endtoend/vtorc/utils/utils.go         | 23 +++++++++++++++++++
 5 files changed, 60 insertions(+), 11 deletions(-)

diff --git a/go/test/endtoend/cluster/vtorc_process.go b/go/test/endtoend/cluster/vtorc_process.go
index 14f475fa48c..17a4aa5c54e 100644
--- a/go/test/endtoend/cluster/vtorc_process.go
+++ b/go/test/endtoend/cluster/vtorc_process.go
@@ -36,14 +36,15 @@ import (
 // vtorc as a separate process for testing
 type VTOrcProcess struct {
 	VtctlProcess
-	Port       int
-	LogDir     string
-	ExtraArgs  []string
-	ConfigPath string
-	Config     VTOrcConfiguration
-	WebPort    int
-	proc       *exec.Cmd
-	exit       chan error
+	Port        int
+	LogDir      string
+	LogFileName string
+	ExtraArgs   []string
+	ConfigPath  string
+	Config      VTOrcConfiguration
+	WebPort     int
+	proc        *exec.Cmd
+	exit        chan error
 }

 type VTOrcConfiguration struct {
@@ -123,7 +124,10 @@ func (orc *VTOrcProcess) Setup() (err error) {
 	orc.proc.Args = append(orc.proc.Args, orc.ExtraArgs...)
 	orc.proc.Args = append(orc.proc.Args, "--alsologtostderr")

-	errFile, _ := os.Create(path.Join(orc.LogDir, fmt.Sprintf("orc-stderr-%d.txt", timeNow)))
+	if orc.LogFileName == "" {
+		orc.LogFileName = fmt.Sprintf("orc-stderr-%d.txt", timeNow)
+	}
+	errFile, _ := os.Create(path.Join(orc.LogDir, orc.LogFileName))
 	orc.proc.Stderr = errFile

 	orc.proc.Env = append(orc.proc.Env, os.Environ()...)
diff --git a/go/test/endtoend/vtorc/general/vtorc_test.go b/go/test/endtoend/vtorc/general/vtorc_test.go
index 4254606dd94..c0a845a5699 100644
--- a/go/test/endtoend/vtorc/general/vtorc_test.go
+++ b/go/test/endtoend/vtorc/general/vtorc_test.go
@@ -37,6 +37,7 @@ import (
 // verify replication is setup
 // verify that with multiple vtorc instances, we still only have 1 PlannedReparentShard call
 func TestPrimaryElection(t *testing.T) {
+	defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance)
 	defer cluster.PanicHandler(t)
 	utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 2, 1, nil, cluster.VTOrcConfiguration{
 		PreventCrossDataCenterPrimaryFailover: true,
@@ -64,6 +65,7 @@ func TestPrimaryElection(t *testing.T) {
 // verify rdonly is not elected, only replica
 // verify replication is setup
 func TestSingleKeyspace(t *testing.T) {
+	defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance)
 	defer cluster.PanicHandler(t)
 	utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 1, 1, []string{"--clusters_to_watch", "ks"}, cluster.VTOrcConfiguration{
 		PreventCrossDataCenterPrimaryFailover: true,
@@ -81,6 +83,7 @@ func TestSingleKeyspace(t *testing.T) {
 // verify rdonly is not elected, only replica
 // verify replication is setup
 func TestKeyspaceShard(t *testing.T) {
+	defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance)
 	defer cluster.PanicHandler(t)
 	utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 1, 1, []string{"--clusters_to_watch", "ks/0"}, cluster.VTOrcConfiguration{
 		PreventCrossDataCenterPrimaryFailover: true,
@@ -100,6 +103,7 @@ func TestKeyspaceShard(t *testing.T) {
 // 4. setup replication from non-primary, let vtorc repair
 // 5. make instance A replicates from B and B from A, wait for repair
 func TestVTOrcRepairs(t *testing.T) {
+	defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance)
 	defer cluster.PanicHandler(t)
 	utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 3, 0, nil, cluster.VTOrcConfiguration{
 		PreventCrossDataCenterPrimaryFailover: true,
@@ -216,6 +220,7 @@ func TestVTOrcRepairs(t *testing.T) {
 func TestRepairAfterTER(t *testing.T) {
 	// test fails intermittently on CI, skip until it can be fixed.
 	t.SkipNow()
+	defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance)
 	defer cluster.PanicHandler(t)
 	utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 2, 0, nil, cluster.VTOrcConfiguration{
 		PreventCrossDataCenterPrimaryFailover: true,
@@ -252,6 +257,7 @@ func TestSemiSync(t *testing.T) {
 	// stop any vtorc instance running due to a previous test.
 	utils.StopVTOrcs(t, clusterInfo)
 	newCluster := utils.SetupNewClusterSemiSync(t)
+	defer utils.PrintVTOrcLogsOnFailure(t, newCluster.ClusterInstance)
 	utils.StartVTOrcs(t, newCluster, nil, cluster.VTOrcConfiguration{
 		PreventCrossDataCenterPrimaryFailover: true,
 	}, 1)
@@ -316,6 +322,7 @@ func TestSemiSync(t *testing.T) {

 // TestVTOrcWithPrs tests that VTOrc works fine even when PRS is called from vtctld
 func TestVTOrcWithPrs(t *testing.T) {
+	defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance)
 	defer cluster.PanicHandler(t)
 	utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 4, 0, nil, cluster.VTOrcConfiguration{
 		PreventCrossDataCenterPrimaryFailover: true,
@@ -364,6 +371,7 @@ func TestVTOrcWithPrs(t *testing.T) {

 // TestMultipleDurabilities tests that VTOrc works with 2 keyspaces having 2 different durability policies
 func TestMultipleDurabilities(t *testing.T) {
+	defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance)
 	defer cluster.PanicHandler(t)
 	// Setup a normal cluster and start vtorc
 	utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 1, 1, nil, cluster.VTOrcConfiguration{}, 1, "")
@@ -388,6 +396,7 @@ func TestDurabilityPolicySetLater(t *testing.T) {
 	// stop any vtorc instance running due to a previous test.
 	utils.StopVTOrcs(t, clusterInfo)
 	newCluster := utils.SetupNewClusterSemiSync(t)
+	defer utils.PrintVTOrcLogsOnFailure(t, newCluster.ClusterInstance)
 	keyspace := &newCluster.ClusterInstance.Keyspaces[0]
 	shard0 := &keyspace.Shards[0]
 	// Before starting VTOrc we explicity want to set the durability policy of the keyspace to an empty string
diff --git a/go/test/endtoend/vtorc/primaryfailure/primary_failure_test.go b/go/test/endtoend/vtorc/primaryfailure/primary_failure_test.go
index 4dcbe0a92ed..43d1b31e0f2 100644
--- a/go/test/endtoend/vtorc/primaryfailure/primary_failure_test.go
+++ b/go/test/endtoend/vtorc/primaryfailure/primary_failure_test.go
@@ -32,8 +32,12 @@ import (
 // covers the test case master-failover from orchestrator
 // Also tests that VTOrc can handle multiple failures, if the durability policies allow it
 func TestDownPrimary(t *testing.T) {
+	defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance)
 	defer cluster.PanicHandler(t)
-	utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 2, 1, nil, cluster.VTOrcConfiguration{
+	// We specify the --wait-replicas-timeout to a small value because we spawn a cross-cell replica later in the test.
+	// If that replica is more advanced than the same-cell-replica, then we try to promote the cross-cell replica as an intermediate source.
+	// If we don't specify a small value of --wait-replicas-timeout, then we would end up waiting for 30 seconds for the dead-primary to respond, failing this test.
+	utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 2, 1, []string{"--remote_operation_timeout=10s", "--wait-replicas-timeout=5s"}, cluster.VTOrcConfiguration{
 		PreventCrossDataCenterPrimaryFailover: true,
 	}, 1, "semi_sync")
 	keyspace := &clusterInfo.ClusterInstance.Keyspaces[0]
@@ -86,6 +90,7 @@ func TestDownPrimary(t *testing.T) {
 // Failover should not be cross data centers, according to the configuration file
 // covers part of the test case master-failover-lost-replicas from orchestrator
 func TestCrossDataCenterFailure(t *testing.T) {
+	defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance)
 	defer cluster.PanicHandler(t)
 	utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 2, 1, nil, cluster.VTOrcConfiguration{
 		PreventCrossDataCenterPrimaryFailover: true,
@@ -131,6 +136,7 @@ func TestCrossDataCenterFailure(t *testing.T) {
 // Failover should not be cross data centers, according to the configuration file
 // In case of no viable candidates, we should error out
 func TestCrossDataCenterFailureError(t *testing.T) {
+	defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance)
 	defer cluster.PanicHandler(t)
 	utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 1, 1, nil, cluster.VTOrcConfiguration{
 		PreventCrossDataCenterPrimaryFailover: true,
@@ -177,6 +183,7 @@ func TestLostRdonlyOnPrimaryFailure(t *testing.T) {
 	// Earlier any replicas that were not able to replicate from the previous primary
 	// were detected by vtorc and could be configured to have their sources detached
 	t.Skip()
+	defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance)
 	defer cluster.PanicHandler(t)
 	utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 2, 2, nil, cluster.VTOrcConfiguration{
 		PreventCrossDataCenterPrimaryFailover: true,
@@ -258,6 +265,7 @@ func TestLostRdonlyOnPrimaryFailure(t *testing.T) {
 // This test checks that the promotion of a tablet succeeds if it passes the promotion lag test
 // covers the test case master-failover-fail-promotion-lag-minutes-success from orchestrator
 func TestPromotionLagSuccess(t *testing.T) {
+	defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance)
 	defer cluster.PanicHandler(t)
 	utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 2, 1, nil, cluster.VTOrcConfiguration{
 		ReplicationLagQuery: "select 59",
@@ -307,6 +315,7 @@ func TestPromotionLagFailure(t *testing.T) {
 	// Earlier vtorc used to check that the promotion lag between the new primary and the old one
 	// was smaller than the configured value, otherwise it would fail the promotion
 	t.Skip()
+	defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance)
 	defer cluster.PanicHandler(t)
 	utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 3, 1, nil, cluster.VTOrcConfiguration{
 		ReplicationLagQuery: "select 61",
@@ -359,6 +368,7 @@ func TestPromotionLagFailure(t *testing.T) {
 // We explicitly set one of the replicas to Prefer promotion rule.
 // That is the replica which should be promoted in case of primary failure
 func TestDownPrimaryPromotionRule(t *testing.T) {
+	defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance)
 	defer cluster.PanicHandler(t)
 	utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 2, 1, nil, cluster.VTOrcConfiguration{
 		LockShardTimeoutSeconds: 5,
@@ -406,6 +416,7 @@ func TestDownPrimaryPromotionRule(t *testing.T) {
 // That is the replica which should be promoted in case of primary failure
 // It should also be caught up when it is promoted
 func TestDownPrimaryPromotionRuleWithLag(t *testing.T) {
+	defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance)
 	defer cluster.PanicHandler(t)
 	utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 2, 1, nil, cluster.VTOrcConfiguration{
 		LockShardTimeoutSeconds: 5,
@@ -485,6 +496,7 @@ func TestDownPrimaryPromotionRuleWithLag(t *testing.T) {
 // We let a replica in our own cell lag. That is the replica which should be promoted in case of primary failure
 // It should also be caught up when it is promoted
 func TestDownPrimaryPromotionRuleWithLagCrossCenter(t *testing.T) {
+	defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance)
 	defer cluster.PanicHandler(t)
 	utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 2, 1, nil, cluster.VTOrcConfiguration{
 		LockShardTimeoutSeconds: 5,
diff --git a/go/test/endtoend/vtorc/readtopologyinstance/main_test.go b/go/test/endtoend/vtorc/readtopologyinstance/main_test.go
index 8dbd4761f93..63ef7f042ad 100644
--- a/go/test/endtoend/vtorc/readtopologyinstance/main_test.go
+++ b/go/test/endtoend/vtorc/readtopologyinstance/main_test.go
@@ -40,6 +40,7 @@ func TestReadTopologyInstanceBufferable(t *testing.T) {
 	defer func() {
 		clusterInfo.ClusterInstance.Teardown()
 	}()
+	defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance)
 	keyspace := &clusterInfo.ClusterInstance.Keyspaces[0]
 	shard0 := &keyspace.Shards[0]
 	oldArgs := os.Args
@@ -147,7 +148,7 @@ func TestReadTopologyInstanceBufferable(t *testing.T) {
 	assert.Equal(t, replicaInstance.ReadBinlogCoordinates.LogFile, primaryInstance.SelfBinlogCoordinates.LogFile)
 	assert.Greater(t, replicaInstance.ReadBinlogCoordinates.LogPos, int64(0))
 	assert.Equal(t, replicaInstance.ExecBinlogCoordinates.LogFile, primaryInstance.SelfBinlogCoordinates.LogFile)
-	assert.LessOrEqual(t, replicaInstance.ExecBinlogCoordinates.LogPos, replicaInstance.ReadBinlogCoordinates.LogPos)
+	assert.Greater(t, replicaInstance.ExecBinlogCoordinates.LogPos, int64(0))
 	assert.Contains(t, replicaInstance.RelaylogCoordinates.LogFile, fmt.Sprintf("vt-0000000%d-relay", replica.TabletUID))
 	assert.Greater(t, replicaInstance.RelaylogCoordinates.LogPos, int64(0))
 	assert.Empty(t, replicaInstance.LastIOError)
diff --git a/go/test/endtoend/vtorc/utils/utils.go b/go/test/endtoend/vtorc/utils/utils.go
index 770f4d9fdff..25d3a8445c6 100644
--- a/go/test/endtoend/vtorc/utils/utils.go
+++ b/go/test/endtoend/vtorc/utils/utils.go
@@ -944,3 +944,26 @@ func WaitForSuccessfulRecoveryCount(t *testing.T, vtorcInstance *cluster.VTOrcPr
 	successCount := successfulRecoveriesMap[recoveryName]
 	assert.EqualValues(t, countExpected, successCount)
 }
+
+// PrintVTOrcLogsOnFailure prints the VTOrc logs on failure of the test.
+// This function is supposed to be called as the first defer command from the vtorc tests.
+func PrintVTOrcLogsOnFailure(t *testing.T, clusterInstance *cluster.LocalProcessCluster) {
+	// If the test has not failed, then we don't need to print anything.
+	if !t.Failed() {
+		return
+	}
+
+	log.Errorf("Printing VTOrc logs")
+	for _, vtorc := range clusterInstance.VTOrcProcesses {
+		if vtorc == nil || vtorc.LogFileName == "" {
+			continue
+		}
+		filePath := path.Join(vtorc.LogDir, vtorc.LogFileName)
+		log.Errorf("Printing file - %s", filePath)
+		content, err := os.ReadFile(filePath)
+		if err != nil {
+			log.Errorf("Error while reading the file - %v", err)
+		}
+		log.Errorf("%s", string(content))
+	}
+}
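
For reference, a minimal sketch of how a test in these suites is expected to wire in the new helper. The test name and configuration values below are illustrative only; PrintVTOrcLogsOnFailure, PanicHandler, and SetupVttabletsAndVTOrcs come from the packages patched above, and clusterInfo is the package-level cluster handle the existing vtorc tests already use.

func TestExampleRecovery(t *testing.T) {
	// Registered as the first defer, so it runs last among this function's
	// defers; it prints each VTOrc log only when t.Failed() reports true.
	defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance)
	defer cluster.PanicHandler(t)
	utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 2, 1, nil, cluster.VTOrcConfiguration{
		PreventCrossDataCenterPrimaryFailover: true,
	}, 1, "")
	// ... exercise VTOrc; on any failed assertion the deferred helper dumps
	// each VTOrc instance's log file (LogDir/LogFileName) via log.Errorf.
}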