From 149d2db9e492f00fc324ec2c0a524ecea85036be Mon Sep 17 00:00:00 2001 From: Shlomi Noach <2607934+shlomi-noach@users.noreply.github.com> Date: Wed, 5 Jun 2024 06:58:21 +0300 Subject: [PATCH] CI upgrade/downgrade tests for Online DDL / throttler / vreplication flow (#16017) Signed-off-by: Shlomi Noach <2607934+shlomi-noach@users.noreply.github.com> Co-authored-by: Florent Poinsard <35779988+frouioui@users.noreply.github.com> --- .../upgrade_downgrade_test_onlineddl_flow.yml | 258 +++++++ go/test/endtoend/cluster/cluster_process.go | 12 + .../onlineddl/flow/onlineddl_flow_test.go | 661 ++++++++++++++++++ test/config.json | 9 + 4 files changed, 940 insertions(+) create mode 100644 .github/workflows/upgrade_downgrade_test_onlineddl_flow.yml create mode 100644 go/test/endtoend/onlineddl/flow/onlineddl_flow_test.go diff --git a/.github/workflows/upgrade_downgrade_test_onlineddl_flow.yml b/.github/workflows/upgrade_downgrade_test_onlineddl_flow.yml new file mode 100644 index 00000000000..d05283e482b --- /dev/null +++ b/.github/workflows/upgrade_downgrade_test_onlineddl_flow.yml @@ -0,0 +1,258 @@ +name: Online DDL flow - Upgrade Downgrade Testing +on: + push: + pull_request: + +concurrency: + group: format('{0}-{1}', ${{ github.ref }}, 'Upgrade Downgrade Testing Online DDL flow') + cancel-in-progress: true + +permissions: read-all + +# This test ensures that our Online DDL + VReplication + throttler components +# work using primary and replica vttablets built on different versions. + +jobs: + + upgrade_downgrade_test: + name: Run Upgrade Downgrade Test - Online DDL flow + runs-on: gh-hosted-runners-16cores-1 + + steps: + - name: Skip CI + run: | + if [[ "${{contains( github.event.pull_request.labels.*.name, 'Skip CI')}}" == "true" ]]; then + echo "skipping CI due to the 'Skip CI' label" + exit 1 + fi + + - name: Check if workflow needs to be skipped + id: skip-workflow + run: | + skip='false' + if [[ "${{github.event.pull_request}}" == "" ]] && [[ "${{github.ref}}" != "refs/heads/main" ]] && [[ ! "${{github.ref}}" =~ ^refs/heads/release-[0-9]+\.[0-9]$ ]] && [[ ! 
"${{github.ref}}" =~ "refs/tags/.*" ]]; then + skip='true' + fi + echo Skip ${skip} + echo "skip-workflow=${skip}" >> $GITHUB_OUTPUT + + - name: Check out commit's code + if: steps.skip-workflow.outputs.skip-workflow == 'false' + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Check for changes in relevant files + if: steps.skip-workflow.outputs.skip-workflow == 'false' + uses: dorny/paths-filter@v3.0.1 + id: changes + with: + token: '' + filters: | + end_to_end: + - 'go/**' + - 'go/**/*.go' + - 'test.go' + - 'Makefile' + - 'build.env' + - 'go.sum' + - 'go.mod' + - 'proto/*.proto' + - 'tools/**' + - 'config/**' + - 'bootstrap.sh' + - '.github/workflows/upgrade_downgrade_test_onlineddl_flow.yml' + + - name: Set output with latest release branch + id: output-previous-release-ref + if: steps.skip-workflow.outputs.skip-workflow == 'false' + run: | + previous_release_ref=$(./tools/get_previous_release.sh ${{github.base_ref}} ${{github.ref}}) + echo $previous_release_ref + echo "previous_release_ref=${previous_release_ref}" >> $GITHUB_OUTPUT + + - name: Set output with next release branch + if: steps.skip-workflow.outputs.skip-workflow == 'false' + id: output-next-release-ref + run: | + next_release_ref=$(./tools/get_next_release.sh ${{github.base_ref}} ${{github.ref}}) + echo $next_release_ref + echo "next_release_ref=${next_release_ref}" >> $GITHUB_OUTPUT + + - name: Set up Go + if: steps.skip-workflow.outputs.skip-workflow == 'false' && steps.changes.outputs.end_to_end == 'true' + uses: actions/setup-go@v5 + with: + go-version: 1.22.3 + + - name: Set up python + if: steps.skip-workflow.outputs.skip-workflow == 'false' && steps.changes.outputs.end_to_end == 'true' + uses: actions/setup-python@v5 + + - name: Tune the OS + if: steps.skip-workflow.outputs.skip-workflow == 'false' && steps.changes.outputs.end_to_end == 'true' + run: | + sudo sysctl -w net.ipv4.ip_local_port_range="22768 65535" + + - name: Get base dependencies + if: steps.skip-workflow.outputs.skip-workflow == 'false' && steps.changes.outputs.end_to_end == 'true' + run: | + sudo DEBIAN_FRONTEND="noninteractive" apt-get update + # Uninstall any previously installed MySQL first + sudo systemctl stop apparmor + sudo DEBIAN_FRONTEND="noninteractive" apt-get remove -y --purge mysql-server mysql-client mysql-common + sudo apt-get -y autoremove + sudo apt-get -y autoclean + sudo deluser mysql + sudo rm -rf /var/lib/mysql + sudo rm -rf /etc/mysql + # Install mysql80 + sudo apt-key adv --keyserver keyserver.ubuntu.com --recv-keys A8D3785C + wget -c https://dev.mysql.com/get/mysql-apt-config_0.8.29-1_all.deb + echo mysql-apt-config mysql-apt-config/select-server select mysql-8.0 | sudo debconf-set-selections + sudo DEBIAN_FRONTEND="noninteractive" dpkg -i mysql-apt-config* + sudo apt-get update + sudo DEBIAN_FRONTEND="noninteractive" apt-get install -y mysql-server mysql-client + # Install everything else we need, and configure + sudo apt-get install -y make unzip g++ etcd curl git wget eatmydata + sudo service mysql stop + sudo service etcd stop + sudo bash -c "echo '/usr/sbin/mysqld { }' > /etc/apparmor.d/usr.sbin.mysqld" # https://bugs.launchpad.net/ubuntu/+source/mariadb-10.1/+bug/1806263 + sudo ln -s /etc/apparmor.d/usr.sbin.mysqld /etc/apparmor.d/disable/ + sudo apparmor_parser -R /etc/apparmor.d/usr.sbin.mysqld || echo "could not remove mysqld profile" + + # install JUnit report formatter + go install github.com/vitessio/go-junit-report@HEAD + + wget https://repo.percona.com/apt/percona-release_latest.$(lsb_release 
-sc)_all.deb + sudo apt-get install -y gnupg2 + sudo dpkg -i percona-release_latest.$(lsb_release -sc)_all.deb + sudo apt-get update + sudo apt-get install -y percona-xtrabackup-24 + + # Checkout to the last release of Vitess + - name: Check out last version's code (${{ steps.output-previous-release-ref.outputs.previous_release_ref }}) + if: steps.skip-workflow.outputs.skip-workflow == 'false' && steps.changes.outputs.end_to_end == 'true' + uses: actions/checkout@v4 + with: + ref: ${{ steps.output-previous-release-ref.outputs.previous_release_ref }} + + - name: Get dependencies for the last release + if: steps.skip-workflow.outputs.skip-workflow == 'false' && steps.changes.outputs.end_to_end == 'true' + run: | + go mod download + + - name: Building last release's binaries + if: steps.skip-workflow.outputs.skip-workflow == 'false' && steps.changes.outputs.end_to_end == 'true' + timeout-minutes: 10 + run: | + source build.env + make build + mkdir -p /tmp/vitess-build-last/ + cp -R bin /tmp/vitess-build-last/ + rm -Rf bin/* + + # Checkout to the next release of Vitess + - name: Check out next version's code (${{ steps.output-next-release-ref.outputs.next_release_ref }}) + if: steps.skip-workflow.outputs.skip-workflow == 'false' && steps.changes.outputs.end_to_end == 'true' + uses: actions/checkout@v4 + with: + ref: ${{ steps.output-next-release-ref.outputs.next_release_ref }} + + - name: Get dependencies for the next release + if: steps.skip-workflow.outputs.skip-workflow == 'false' && steps.changes.outputs.end_to_end == 'true' + run: | + go mod download + + - name: Building next release's binaries + if: steps.skip-workflow.outputs.skip-workflow == 'false' && steps.changes.outputs.end_to_end == 'true' + timeout-minutes: 10 + run: | + source build.env + NOVTADMINBUILD=1 make build + mkdir -p /tmp/vitess-build-next/ + cp -R bin /tmp/vitess-build-next/ + rm -Rf bin/* + + # Checkout to this build's commit + - name: Check out commit's code + if: steps.skip-workflow.outputs.skip-workflow == 'false' && steps.changes.outputs.end_to_end == 'true' + uses: actions/checkout@v4 + + - name: Get dependencies for this commit + if: steps.skip-workflow.outputs.skip-workflow == 'false' && steps.changes.outputs.end_to_end == 'true' + run: | + go mod download + + - name: Building the binaries for this commit + if: steps.skip-workflow.outputs.skip-workflow == 'false' && steps.changes.outputs.end_to_end == 'true' + timeout-minutes: 10 + run: | + source build.env + make build + mkdir -p /tmp/vitess-build-current/ + cp -R bin /tmp/vitess-build-current/ + + # Copy vttablet and related binaries under new names + - name: Use current version Vtctl, and other version VTTablet + if: steps.skip-workflow.outputs.skip-workflow == 'false' && steps.changes.outputs.end_to_end == 'true' + run: | + source build.env + + cp /tmp/vitess-build-last/bin/vttablet $PWD/bin/vttablet-last + cp /tmp/vitess-build-last/bin/mysqlctl $PWD/bin/mysqlctl-last + cp /tmp/vitess-build-last/bin/mysqlctld $PWD/bin/mysqlctld-last + cp /tmp/vitess-build-next/bin/vttablet $PWD/bin/vttablet-next + cp /tmp/vitess-build-next/bin/mysqlctl $PWD/bin/mysqlctl-next + cp /tmp/vitess-build-next/bin/mysqlctld $PWD/bin/mysqlctld-next + $PWD/bin/vttablet-last --version + $PWD/bin/vttablet --version + $PWD/bin/vttablet-next --version + + # Running a test with primary tablet at version n (current SHA) and replica vttablet at version n-1 + - name: Run Online DDL tests (primary=N, replica=N-1) + if: steps.skip-workflow.outputs.skip-workflow == 'false' && 
steps.changes.outputs.end_to_end == 'true' + run: | + rm -rf /tmp/vtdataroot + mkdir -p /tmp/vtdataroot + + source build.env + export PRIMARY_TABLET_BINARY_SUFFIX="" + export REPLICA_TABLET_BINARY_SUFFIX="-last" + eatmydata -- go run test.go -skip-build -keep-data=false -docker=false -print-log -follow -tag upgrade_downgrade_onlineddl_flow + + # Running a test with primary tablet at version n-1 and replica vttablet at version n (current SHA) + - name: Run Online DDL tests (primary=N-1, replica=N) + if: steps.skip-workflow.outputs.skip-workflow == 'false' && steps.changes.outputs.end_to_end == 'true' + run: | + rm -rf /tmp/vtdataroot + mkdir -p /tmp/vtdataroot + + source build.env + export PRIMARY_TABLET_BINARY_SUFFIX="-last" + export REPLICA_TABLET_BINARY_SUFFIX="" + eatmydata -- go run test.go -skip-build -keep-data=false -docker=false -print-log -follow -tag upgrade_downgrade_onlineddl_flow + + # Running a test with primary tablet at version n+1 and replica vttablet at version n (current SHA) + - name: Run Online DDL tests (primary=N+1, replica=N) + if: steps.skip-workflow.outputs.skip-workflow == 'false' && steps.changes.outputs.end_to_end == 'true' + run: | + rm -rf /tmp/vtdataroot + mkdir -p /tmp/vtdataroot + + source build.env + export PRIMARY_TABLET_BINARY_SUFFIX="-next" + export REPLICA_TABLET_BINARY_SUFFIX="" + eatmydata -- go run test.go -skip-build -keep-data=false -docker=false -print-log -follow -tag upgrade_downgrade_onlineddl_flow + + # Running a test with primary tablet at version n (current SHA) and replica vttablet at version n+1 + - name: Run Online DDL tests (primary=N, replica=N+1) + if: steps.skip-workflow.outputs.skip-workflow == 'false' && steps.changes.outputs.end_to_end == 'true' + run: | + rm -rf /tmp/vtdataroot + mkdir -p /tmp/vtdataroot + + source build.env + export PRIMARY_TABLET_BINARY_SUFFIX="" + export REPLICA_TABLET_BINARY_SUFFIX="-next" + eatmydata -- go run test.go -skip-build -keep-data=false -docker=false -print-log -follow -tag upgrade_downgrade_onlineddl_flow diff --git a/go/test/endtoend/cluster/cluster_process.go b/go/test/endtoend/cluster/cluster_process.go index 0fc5edef1bb..44636b3cdb6 100644 --- a/go/test/endtoend/cluster/cluster_process.go +++ b/go/test/endtoend/cluster/cluster_process.go @@ -403,6 +403,12 @@ func (cluster *LocalProcessCluster) startKeyspace(keyspace Keyspace, shardNames if err != nil { return err } + switch tablet.Type { + case "primary": + mysqlctlProcess.Binary += os.Getenv("PRIMARY_TABLET_BINARY_SUFFIX") + case "replica": + mysqlctlProcess.Binary += os.Getenv("REPLICA_TABLET_BINARY_SUFFIX") + } tablet.MysqlctlProcess = *mysqlctlProcess proc, err := tablet.MysqlctlProcess.StartProcess() if err != nil { @@ -426,6 +432,12 @@ func (cluster *LocalProcessCluster) startKeyspace(keyspace Keyspace, shardNames cluster.TmpDirectory, cluster.VtTabletExtraArgs, cluster.DefaultCharset) + switch tablet.Type { + case "primary": + tablet.VttabletProcess.Binary += os.Getenv("PRIMARY_TABLET_BINARY_SUFFIX") + case "replica": + tablet.VttabletProcess.Binary += os.Getenv("REPLICA_TABLET_BINARY_SUFFIX") + } tablet.Alias = tablet.VttabletProcess.TabletPath if cluster.ReusingVTDATAROOT { tablet.VttabletProcess.ServingStatus = "SERVING" diff --git a/go/test/endtoend/onlineddl/flow/onlineddl_flow_test.go b/go/test/endtoend/onlineddl/flow/onlineddl_flow_test.go new file mode 100644 index 00000000000..772a5fa6fd0 --- /dev/null +++ b/go/test/endtoend/onlineddl/flow/onlineddl_flow_test.go @@ -0,0 +1,661 @@ +/* +Copyright 2024 The Vitess Authors. 
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+	http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+// This test is designed to test the flow of a single online DDL migration, with tablet throttler
+// enabled. It tests the following:
+// - A primary + replica setup
+// - Creating and populating a table
+// - Enabling the tablet (lag) throttler
+// - Running a workload that generates DMLs, and which checks the throttler
+// - Running an online DDL migration:
+//   - Using `online --postpone-completion` to use vreplication
+//   - vreplication configured (by default) to read from the replica
+//   - vreplication by nature also checks the throttler
+//   - meanwhile, the workload generates DMLs, giving the migration some run time
+//   - proactively throttle and then unthrottle the migration
+//   - complete the migration
+//
+// - Validate sufficient DML has been applied
+// - Validate the migration completed, and validate the new schema is instated
+//
+// The test is designed with upgrade/downgrade in mind. In particular, we wish to test
+// different Vitess versions for `primary` and `replica` tablets. Thus, we validate:
+// - Cross-tablet and cross-version throttler communication
+// - Cross-version vreplication
+
+package flow
+
+import (
+	"context"
+	"flag"
+	"fmt"
+	"io"
+	"math/rand/v2"
+	"net/http"
+	"os"
+	"path"
+	"runtime"
+	"strings"
+	"sync"
+	"sync/atomic"
+	"testing"
+	"time"
+
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+
+	"vitess.io/vitess/go/mysql"
+	"vitess.io/vitess/go/test/endtoend/cluster"
+	"vitess.io/vitess/go/test/endtoend/onlineddl"
+	"vitess.io/vitess/go/test/endtoend/throttler"
+	"vitess.io/vitess/go/vt/log"
+	"vitess.io/vitess/go/vt/schema"
+	"vitess.io/vitess/go/vt/vttablet"
+	throttlebase "vitess.io/vitess/go/vt/vttablet/tabletserver/throttle/base"
+	"vitess.io/vitess/go/vt/vttablet/tabletserver/throttle/throttlerapp"
+)
+
+var (
+	clusterInstance       *cluster.LocalProcessCluster
+	shards                []cluster.Shard
+	vtParams              mysql.ConnParams
+	primaryTablet         *cluster.Vttablet
+	replicaTablet         *cluster.Vttablet
+	tablets               []*cluster.Vttablet
+	httpClient            = throttlebase.SetupHTTPClient(time.Second)
+	throttleWorkload      atomic.Bool
+	totalAppliedDML       atomic.Int64
+
+	hostname              = "localhost"
+	keyspaceName          = "ks"
+	cell                  = "zone1"
+	schemaChangeDirectory = ""
+	tableName             = `stress_test`
+	createStatement       = `
+		CREATE TABLE stress_test (
+			id bigint(20) not null,
+			rand_val varchar(32) null default '',
+			hint_col varchar(64) not null default '',
+			created_timestamp timestamp not null default current_timestamp,
+			updates int unsigned not null default 0,
+			PRIMARY KEY (id),
+			key created_idx(created_timestamp),
+			key updates_idx(updates)
+		) ENGINE=InnoDB
+	`
+	alterHintStatement = `
+		ALTER TABLE stress_test modify hint_col varchar(64) not null default '%s'
+	`
+	insertRowStatement = `
+		INSERT IGNORE INTO stress_test (id, rand_val) VALUES (%d, left(md5(rand()), 8))
+	`
+	updateRowStatement = `
+		UPDATE stress_test SET updates=updates+1 WHERE id=%d
+	`
+	deleteRowStatement = `
+		DELETE FROM stress_test WHERE id=%d AND updates=1
+	`
+)
+
+var (
+	countIterations = 5
+)
+
+const (
+	maxTableRows         = 4096
+	workloadDuration     = 5 * time.Second
+	migrationWaitTimeout = 60 * time.Second
+)
+
+func TestMain(m *testing.M) {
+	defer cluster.PanicHandler(nil)
+	flag.Parse()
+
+	exitcode, err := func() (int, error) {
+		clusterInstance = cluster.NewCluster(cell, hostname)
+		schemaChangeDirectory = path.Join("/tmp", fmt.Sprintf("schema_change_dir_%d", clusterInstance.GetAndReserveTabletUID()))
+		defer os.RemoveAll(schemaChangeDirectory)
+		defer clusterInstance.Teardown()
+
+		if _, err := os.Stat(schemaChangeDirectory); os.IsNotExist(err) {
+			_ = os.Mkdir(schemaChangeDirectory, 0700)
+		}
+
+		clusterInstance.VtctldExtraArgs = []string{
+			"--schema_change_dir", schemaChangeDirectory,
+			"--schema_change_controller", "local",
+			"--schema_change_check_interval", "1s",
+		}
+
+		clusterInstance.VtTabletExtraArgs = []string{
+			"--heartbeat_interval", "250ms",
+			"--heartbeat_on_demand_duration", "5s",
+			"--migration_check_interval", "2s",
+			"--watch_replication_stream",
+			// Test VPlayer batching mode.
+			fmt.Sprintf("--vreplication_experimental_flags=%d",
+				vttablet.VReplicationExperimentalFlagAllowNoBlobBinlogRowImage|vttablet.VReplicationExperimentalFlagOptimizeInserts|vttablet.VReplicationExperimentalFlagVPlayerBatching),
+		}
+		clusterInstance.VtGateExtraArgs = []string{
+			"--ddl_strategy", "online",
+		}
+
+		if err := clusterInstance.StartTopo(); err != nil {
+			return 1, err
+		}
+
+		// Start keyspace
+		keyspace := &cluster.Keyspace{
+			Name: keyspaceName,
+		}
+
+		// Start the keyspace with one primary and one replica: this flow relies on
+		// vreplication reading from the replica tablet.
+		if err := clusterInstance.StartKeyspace(*keyspace, []string{"1"}, 1, false); err != nil {
+			return 1, err
+		}
+
+		// Identify the primary and replica tablets
+		tablets = clusterInstance.Keyspaces[0].Shards[0].Vttablets
+		for _, tablet := range tablets {
+			if tablet.Type == "primary" {
+				primaryTablet = tablet
+			} else {
+				replicaTablet = tablet
+			}
+		}
+
+		vtgateInstance := clusterInstance.NewVtgateInstance()
+		// Start vtgate
+		if err := vtgateInstance.Setup(); err != nil {
+			return 1, err
+		}
+		// ensure it is torn down during cluster TearDown
+		clusterInstance.VtgateProcess = *vtgateInstance
+		vtParams = mysql.ConnParams{
+			Host: clusterInstance.Hostname,
+			Port: clusterInstance.VtgateMySQLPort,
+		}
+
+		return m.Run(), nil
+	}()
+	if err != nil {
+		fmt.Printf("%v\n", err)
+		os.Exit(1)
+	} else {
+		os.Exit(exitcode)
+	}
+}
+
+func TestSchemaChange(t *testing.T) {
+	defer cluster.PanicHandler(t)
+	ctx := context.Background()
+
+	require.NotNil(t, clusterInstance)
+	require.NotNil(t, primaryTablet)
+	require.NotNil(t, replicaTablet)
+	require.Equal(t, 2, len(tablets))
+
+	// This test is designed with upgrade/downgrade in mind. Do some logging to show
+	// the configuration used for this test.
+	if binarySuffix := os.Getenv("PRIMARY_TABLET_BINARY_SUFFIX"); binarySuffix != "" {
+		t.Logf("Using PRIMARY_TABLET_BINARY_SUFFIX: %s", binarySuffix)
+	}
+	if binarySuffix := os.Getenv("REPLICA_TABLET_BINARY_SUFFIX"); binarySuffix != "" {
+		t.Logf("Using REPLICA_TABLET_BINARY_SUFFIX: %s", binarySuffix)
+	}
+
+	require.NotEmpty(t, clusterInstance.Keyspaces)
+	shards = clusterInstance.Keyspaces[0].Shards
+	require.Equal(t, 1, len(shards))
+
+	throttler.EnableLagThrottlerAndWaitForStatus(t, clusterInstance, time.Second)
+
+	t.Run("flow", func(t *testing.T) {
+		t.Run("create schema", func(t *testing.T) {
+			testWithInitialSchema(t)
+		})
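
Aside (not part of the patch): TestMain above assembles --vreplication_experimental_flags by OR-ing bit flags into a single integer. A minimal sketch of that computation with stand-in constants; the real values are the vttablet.VReplicationExperimentalFlag* constants in the Vitess codebase, and the numbers below are illustrative only.

    package main

    import "fmt"

    // Stand-in bit flags; illustrative values only. The real constants are
    // vttablet.VReplicationExperimentalFlagOptimizeInserts and friends.
    const (
        flagOptimizeInserts           = 1 << iota // 1
        flagAllowNoBlobBinlogRowImage             // 2
        flagVPlayerBatching                       // 4
    )

    func main() {
        flags := flagAllowNoBlobBinlogRowImage | flagOptimizeInserts | flagVPlayerBatching
        // With these stand-in values, the flag renders as "=7".
        fmt.Printf("--vreplication_experimental_flags=%d\n", flags)
    }
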
+		t.Run("init table", func(t *testing.T) {
+			// Populates the table. Makes work for vcopier.
+			initTable(t)
+		})
+		t.Run("migrate", func(t *testing.T) {
+			ctx, cancel := context.WithCancel(ctx)
+			defer cancel()
+
+			workloadCtx, cancelWorkload := context.WithCancel(ctx)
+			defer cancelWorkload()
+
+			t.Run("routine throttler check", func(t *testing.T) {
+				go func() {
+					ticker := time.NewTicker(500 * time.Millisecond)
+					defer ticker.Stop()
+					for {
+						_, statusCode, err := throttlerCheck(primaryTablet.VttabletProcess, throttlerapp.OnlineDDLName)
+						assert.NoError(t, err)
+						throttleWorkload.Store(statusCode != http.StatusOK)
+						select {
+						case <-ticker.C:
+						case <-workloadCtx.Done():
+							t.Logf("Terminating routine throttler check")
+							return
+						}
+					}
+				}()
+			})
+
+			var wg sync.WaitGroup
+			t.Run("generate workload", func(t *testing.T) {
+				// Create work for vplayer.
+				// This workload considers the throttling state and avoids generating DMLs while throttled.
+				wg.Add(1)
+				go func() {
+					defer cancel()
+					defer t.Logf("Terminating workload")
+					defer wg.Done()
+					runMultipleConnections(workloadCtx, t)
+				}()
+			})
+			appliedDMLStart := totalAppliedDML.Load()
+
+			hint := "post_completion_hint"
+			var uuid string
+			t.Run("submit migration", func(t *testing.T) {
+				uuid = testOnlineDDLStatement(t, fmt.Sprintf(alterHintStatement, hint), "online --postpone-completion", "", true)
+			})
+			t.Run("wait for ready_to_complete", func(t *testing.T) {
+				waitForReadyToComplete(t, uuid, true)
+			})
+			t.Run("validating running status", func(t *testing.T) {
+				onlineddl.CheckMigrationStatus(t, &vtParams, shards, uuid, schema.OnlineDDLStatusRunning)
+			})
+			t.Run("throttle online-ddl", func(t *testing.T) {
+				onlineddl.ThrottleAllMigrations(t, &vtParams)
+				onlineddl.CheckThrottledApps(t, &vtParams, throttlerapp.OnlineDDLName, true)
+
+				for _, tab := range tablets {
+					body, err := throttleApp(tab.VttabletProcess, throttlerapp.OnlineDDLName)
+					assert.NoError(t, err)
+					assert.Contains(t, body, throttlerapp.OnlineDDLName)
+				}
+				waitForThrottleCheckStatus(t, throttlerapp.OnlineDDLName, primaryTablet, http.StatusExpectationFailed)
+			})
+			t.Run("unthrottle online-ddl", func(t *testing.T) {
+				onlineddl.UnthrottleAllMigrations(t, &vtParams)
+				onlineddl.CheckThrottledApps(t, &vtParams, throttlerapp.OnlineDDLName, false)
+
+				for _, tab := range tablets {
+					body, err := unthrottleApp(tab.VttabletProcess, throttlerapp.OnlineDDLName)
+					assert.NoError(t, err)
+					assert.Contains(t, body, throttlerapp.OnlineDDLName)
+				}
+				waitForThrottleCheckStatus(t, throttlerapp.OnlineDDLName, primaryTablet, http.StatusOK)
+			})
+			t.Run("additional wait", func(t *testing.T) {
+				// Waiting just so that we generate more DMLs, and give migration/vreplication
+				// more "opportunities" to throttle or to make progress.
+				select {
+				case <-time.After(3 * time.Second):
+				case <-ctx.Done():
+					require.Fail(t, "context cancelled")
+				}
+			})
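
Aside (not part of the patch): the "routine throttler check" above implements a simple gating pattern, where a poller flips an atomic flag from the tablet's /throttler/check HTTP status, and workers skip DML while the flag is set. A self-contained sketch of that pattern follows; the URL and port are hypothetical stand-ins for the endpoint the test builds from the tablet's host and port.

    package main

    import (
        "context"
        "net/http"
        "sync/atomic"
        "time"
    )

    // throttled mirrors the test's throttleWorkload flag.
    var throttled atomic.Bool

    // pollThrottler re-evaluates the flag every 500ms, treating any non-200
    // response as "hold off" (the same statusCode != http.StatusOK condition
    // used in the test).
    func pollThrottler(ctx context.Context, checkURL string) {
        ticker := time.NewTicker(500 * time.Millisecond)
        defer ticker.Stop()
        for {
            if resp, err := http.Get(checkURL); err == nil {
                throttled.Store(resp.StatusCode != http.StatusOK)
                resp.Body.Close()
            }
            select {
            case <-ticker.C:
            case <-ctx.Done():
                return
            }
        }
    }

    func main() {
        ctx, cancel := context.WithTimeout(context.Background(), 3*time.Second)
        defer cancel()
        go pollThrottler(ctx, "http://localhost:15100/throttler/check?app=online-ddl")
        for ctx.Err() == nil {
            if throttled.Load() {
                time.Sleep(time.Millisecond) // back off rather than issue DML
                continue
            }
            // ... issue one DML here, as runSingleConnection does ...
            time.Sleep(2 * time.Millisecond)
        }
    }
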
+			t.Run("validate applied DML", func(t *testing.T) {
+				// Validate that during Online DDL, and even with throttling, we were
+				// able to produce meaningful traffic.
+				appliedDMLEnd := totalAppliedDML.Load()
+				assert.Greater(t, appliedDMLEnd, appliedDMLStart)
+				assert.GreaterOrEqual(t, appliedDMLEnd-appliedDMLStart, int64(maxTableRows))
+				t.Logf("Applied DML: %d", appliedDMLEnd-appliedDMLStart)
+			})
+			t.Run("attempt to complete", func(t *testing.T) {
+				onlineddl.CheckCompleteMigration(t, &vtParams, shards, uuid, true)
+			})
+			isComplete := false
+			t.Run("optimistic wait for migration completion", func(t *testing.T) {
+				status := onlineddl.WaitForMigrationStatus(t, &vtParams, shards, uuid, migrationWaitTimeout, schema.OnlineDDLStatusRunning, schema.OnlineDDLStatusComplete)
+				isComplete = (status == schema.OnlineDDLStatusComplete)
+				fmt.Printf("# Migration status (for debug purposes): <%s>\n", status)
+			})
+			if !isComplete {
+				t.Run("force complete cut-over", func(t *testing.T) {
+					onlineddl.CheckForceMigrationCutOver(t, &vtParams, shards, uuid, true)
+				})
+				t.Run("another optimistic wait for migration completion", func(t *testing.T) {
+					status := onlineddl.WaitForMigrationStatus(t, &vtParams, shards, uuid, migrationWaitTimeout, schema.OnlineDDLStatusRunning, schema.OnlineDDLStatusComplete)
+					isComplete = (status == schema.OnlineDDLStatusComplete)
+					fmt.Printf("# Migration status (for debug purposes): <%s>\n", status)
+				})
+			}
+			if !isComplete {
+				t.Run("terminate workload", func(t *testing.T) {
+					// It seems the workload is too high, preventing the migration from completing.
+					// We can't go on forever. It's nice to have normal completion under workload,
+					// but it's not strictly what this test is designed for. We terminate the
+					// workload so as to allow the migration to complete.
+					cancelWorkload()
+				})
+			}
+			t.Run("wait for migration completion", func(t *testing.T) {
+				status := onlineddl.WaitForMigrationStatus(t, &vtParams, shards, uuid, migrationWaitTimeout, schema.OnlineDDLStatusComplete)
+				fmt.Printf("# Migration status (for debug purposes): <%s>\n", status)
+				onlineddl.CheckMigrationStatus(t, &vtParams, shards, uuid, schema.OnlineDDLStatusComplete)
+			})
+			t.Run("validate table schema", func(t *testing.T) {
+				checkMigratedTable(t, tableName, hint)
+			})
+
+			cancelWorkload() // Early break
+			cancel()         // Early break
+			wg.Wait()
+		})
+	})
+}
+
+func testWithInitialSchema(t *testing.T) {
+	// Create the stress table
+	err := clusterInstance.VtctldClientProcess.ApplySchema(keyspaceName, createStatement)
+	require.Nil(t, err)
+
+	// Check that the table was created
+	checkTable(t, tableName)
+}
+
+// testOnlineDDLStatement runs an online DDL ALTER statement
+func testOnlineDDLStatement(t *testing.T, alterStatement string, ddlStrategy string, expectHint string, skipWait bool) (uuid string) {
+	row := onlineddl.VtgateExecDDL(t, &vtParams, ddlStrategy, alterStatement, "").Named().Row()
+	require.NotNil(t, row)
+	uuid = row.AsString("uuid", "")
+	uuid = strings.TrimSpace(uuid)
+	require.NotEmpty(t, uuid)
+	fmt.Println("# Generated UUID (for debug purposes):")
+	fmt.Printf("<%s>\n", uuid)
+
+	strategySetting, err := schema.ParseDDLStrategy(ddlStrategy)
+	assert.NoError(t, err)
+
+	if !strategySetting.Strategy.IsDirect() && !skipWait && uuid != "" {
+		status := onlineddl.WaitForMigrationStatus(t, &vtParams, shards, uuid, migrationWaitTimeout, schema.OnlineDDLStatusComplete, schema.OnlineDDLStatusFailed)
+		fmt.Printf("# Migration status (for debug purposes): <%s>\n", status)
+	}
+
+	if expectHint != "" {
+		checkMigratedTable(t, tableName, expectHint)
+	}
+	return uuid
+}
+
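
Aside (not part of the patch): migration state is queryable through vtgate with `show vitess_migrations like '<uuid>'`, which is what helpers such as onlineddl.WaitForMigrationStatus poll under the hood. A rough, self-contained sketch of such a polling loop follows; waitForStatus, the port, and the UUID are illustrative, not Vitess API.

    package main

    import (
        "context"
        "fmt"
        "time"

        "vitess.io/vitess/go/mysql"
    )

    // waitForStatus polls the migration_status column until it matches one of
    // wantStatuses, or until ctx expires.
    func waitForStatus(ctx context.Context, conn *mysql.Conn, uuid string, wantStatuses ...string) (string, error) {
        ticker := time.NewTicker(time.Second)
        defer ticker.Stop()
        for {
            qr, err := conn.ExecuteFetch(fmt.Sprintf("show vitess_migrations like '%s'", uuid), 1000, true)
            if err != nil {
                return "", err
            }
            for _, row := range qr.Named().Rows {
                status := row.AsString("migration_status", "")
                for _, want := range wantStatuses {
                    if status == want {
                        return status, nil
                    }
                }
            }
            select {
            case <-ticker.C:
            case <-ctx.Done():
                return "", ctx.Err()
            }
        }
    }

    func main() {
        ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second)
        defer cancel()
        // Assumes a vtgate listening locally, as with the test's vtParams.
        conn, err := mysql.Connect(ctx, &mysql.ConnParams{Host: "127.0.0.1", Port: 15306})
        if err != nil {
            fmt.Println(err)
            return
        }
        defer conn.Close()
        fmt.Println(waitForStatus(ctx, conn, "aaaaaaaa_1111_2222_3333_444444444444", "complete", "failed"))
    }
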
+// checkTable checks that the table exists on the first tablet of each shard
+func checkTable(t *testing.T, showTableName string) {
+	for i := range clusterInstance.Keyspaces[0].Shards {
+		checkTablesCount(t, clusterInstance.Keyspaces[0].Shards[i].Vttablets[0], showTableName, 1)
+	}
+}
+
+// checkTablesCount checks the number of tables in the given tablet
+func checkTablesCount(t *testing.T, tablet *cluster.Vttablet, showTableName string, expectCount int) {
+	query := fmt.Sprintf(`show tables like '%%%s%%';`, showTableName)
+	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
+	defer cancel()
+	ticker := time.NewTicker(time.Second)
+	defer ticker.Stop()
+
+	rowcount := 0
+
+	for {
+		queryResult, err := tablet.VttabletProcess.QueryTablet(query, keyspaceName, true)
+		require.Nil(t, err)
+		rowcount = len(queryResult.Rows)
+		if rowcount > 0 {
+			break
+		}
+
+		select {
+		case <-ticker.C:
+			continue // Keep looping
+		case <-ctx.Done():
+			// Break below to the assertion
+		}
+
+		break
+	}
+
+	assert.Equal(t, expectCount, rowcount)
+}
+
+// checkMigratedTable checks the CREATE STATEMENT of a table after migration
+func checkMigratedTable(t *testing.T, tableName, expectHint string) {
+	for i := range clusterInstance.Keyspaces[0].Shards {
+		createStatement := getCreateTableStatement(t, clusterInstance.Keyspaces[0].Shards[i].Vttablets[0], tableName)
+		assert.Contains(t, createStatement, expectHint)
+	}
+}
+
+// getCreateTableStatement returns the CREATE TABLE statement for a given table
+func getCreateTableStatement(t *testing.T, tablet *cluster.Vttablet, tableName string) (statement string) {
+	queryResult, err := tablet.VttabletProcess.QueryTablet(fmt.Sprintf("show create table %s;", tableName), keyspaceName, true)
+	require.Nil(t, err)
+
+	assert.Equal(t, len(queryResult.Rows), 1)
+	assert.Equal(t, len(queryResult.Rows[0]), 2) // table name, create statement
+	statement = queryResult.Rows[0][1].ToString()
+	return statement
+}
+
+func waitForReadyToComplete(t *testing.T, uuid string, expected bool) bool {
+	ctx, cancel := context.WithTimeout(context.Background(), migrationWaitTimeout)
+	defer cancel()
+
+	ticker := time.NewTicker(time.Second)
+	defer ticker.Stop()
+	for {
+		rs := onlineddl.ReadMigrations(t, &vtParams, uuid)
+		require.NotNil(t, rs)
+		for _, row := range rs.Named().Rows {
+			readyToComplete := row.AsInt64("ready_to_complete", 0)
+			if expected == (readyToComplete > 0) {
+				// all good. This is what we waited for
+				if expected {
+					// if the migration is ready to complete, the timestamp should be non-null
+					assert.False(t, row["ready_to_complete_timestamp"].IsNull())
+				} else {
+					assert.True(t, row["ready_to_complete_timestamp"].IsNull())
+				}
+				return true
+			}
+		}
+		select {
+		case <-ticker.C:
+		case <-ctx.Done():
+			assert.NoError(t, ctx.Err(), "timeout waiting for ready_to_complete")
+			return false
+		}
+	}
+}
+
+func generateInsert(t *testing.T, conn *mysql.Conn) error {
+	id := rand.Int32N(int32(maxTableRows))
+	query := fmt.Sprintf(insertRowStatement, id)
+	_, err := conn.ExecuteFetch(query, 1, false)
+	if err == nil {
+		totalAppliedDML.Add(1)
+	}
+
+	return err
+}
+
+func generateUpdate(t *testing.T, conn *mysql.Conn) error {
+	id := rand.Int32N(int32(maxTableRows))
+	query := fmt.Sprintf(updateRowStatement, id)
+	_, err := conn.ExecuteFetch(query, 1, false)
+	if err == nil {
+		totalAppliedDML.Add(1)
+	}
+
+	return err
+}
+
+func generateDelete(t *testing.T, conn *mysql.Conn) error {
+	id := rand.Int32N(int32(maxTableRows))
+	query := fmt.Sprintf(deleteRowStatement, id)
+	_, err := conn.ExecuteFetch(query, 1, false)
+	if err == nil {
+		totalAppliedDML.Add(1)
+	}
+
+	return err
+}
+
+func runSingleConnection(ctx context.Context, t *testing.T, sleepInterval time.Duration) {
+	log.Infof("Running single connection")
+	conn, err := mysql.Connect(ctx, &vtParams)
+	require.Nil(t, err)
+	defer conn.Close()
+
+	_, err = conn.ExecuteFetch("set autocommit=1", 1000, true)
+	require.Nil(t, err)
+	_, err = conn.ExecuteFetch("set transaction isolation level read committed", 1000, true)
+	require.Nil(t, err)
+
+	ticker := time.NewTicker(sleepInterval)
+	defer ticker.Stop()
+
+	for {
+		if !throttleWorkload.Load() {
+			switch rand.Int32N(3) {
+			case 0:
+				err = generateInsert(t, conn)
+			case 1:
+				err = generateUpdate(t, conn)
+			case 2:
+				err = generateDelete(t, conn)
+			}
+		}
+		select {
+		case <-ctx.Done():
+			log.Infof("Terminating single connection")
+			return
+		case <-ticker.C:
+		}
+		assert.Nil(t, err)
+	}
+}
+
+func runMultipleConnections(ctx context.Context, t *testing.T) {
+	// The workload for a 16 vCPU machine is:
+	// - Concurrency of 16
+	// - 2ms interval between queries for each connection
+	// As the number of vCPUs decreases, we decrease concurrency and increase the interval accordingly.
+	// For example, on an 8 vCPU machine we run concurrency of 8 with an interval of 4ms;
+	// on a 4 vCPU machine, concurrency of 4 with an interval of 8ms.
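
Aside (not part of the patch): a quick numeric check of the scaling rule described in the comment above. Connections track NumCPU while the per-connection interval scales by 16/NumCPU, so aggregate pressure stays roughly constant at 16 connections x 1 query per 2ms, about 8,000 queries/sec.

    package main

    import (
        "fmt"
        "time"
    )

    // params reproduces the arithmetic used in runMultipleConnections.
    func params(numCPU int) (conns int, interval time.Duration) {
        base := 2 * time.Millisecond
        modifier := 16.0 / float64(numCPU)
        return numCPU, time.Duration(float64(base.Nanoseconds()) * modifier)
    }

    func main() {
        for _, cpus := range []int{16, 8, 4} {
            conns, interval := params(cpus)
            // 16 cpus -> 2ms, 8 cpus -> 4ms, 4 cpus -> 8ms
            fmt.Printf("cpus=%2d -> connections=%2d, interval=%v\n", cpus, conns, interval)
        }
    }
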
+ maxConcurrency := runtime.NumCPU() + sleepModifier := 16.0 / float64(maxConcurrency) + baseSleepInterval := 2 * time.Millisecond + singleConnectionSleepIntervalNanoseconds := float64(baseSleepInterval.Nanoseconds()) * sleepModifier + sleepInterval := time.Duration(int64(singleConnectionSleepIntervalNanoseconds)) + + log.Infof("Running multiple connections: maxConcurrency=%v, sleep interval=%v", maxConcurrency, sleepInterval) + var wg sync.WaitGroup + for i := 0; i < maxConcurrency; i++ { + wg.Add(1) + go func() { + defer wg.Done() + runSingleConnection(ctx, t, sleepInterval) + }() + } + wg.Wait() + log.Infof("Running multiple connections: done") +} + +func initTable(t *testing.T) { + log.Infof("initTable begin") + defer log.Infof("initTable complete") + + ctx := context.Background() + conn, err := mysql.Connect(ctx, &vtParams) + require.Nil(t, err) + defer conn.Close() + + appliedDMLStart := totalAppliedDML.Load() + + for i := 0; i < maxTableRows/2; i++ { + generateInsert(t, conn) + } + for i := 0; i < maxTableRows/4; i++ { + generateUpdate(t, conn) + } + for i := 0; i < maxTableRows/4; i++ { + generateDelete(t, conn) + } + appliedDMLEnd := totalAppliedDML.Load() + assert.Greater(t, appliedDMLEnd, appliedDMLStart) + assert.GreaterOrEqual(t, appliedDMLEnd-appliedDMLStart, int64(maxTableRows)) +} + +func throttleResponse(tablet *cluster.VttabletProcess, path string) (respBody string, err error) { + apiURL := fmt.Sprintf("http://%s:%d/%s", tablet.TabletHostname, tablet.Port, path) + resp, err := httpClient.Get(apiURL) + if err != nil { + return "", err + } + defer resp.Body.Close() + b, err := io.ReadAll(resp.Body) + respBody = string(b) + return respBody, err +} + +func throttleApp(tablet *cluster.VttabletProcess, throttlerApp throttlerapp.Name) (string, error) { + return throttleResponse(tablet, fmt.Sprintf("throttler/throttle-app?app=%s&duration=1h", throttlerApp.String())) +} + +func unthrottleApp(tablet *cluster.VttabletProcess, throttlerApp throttlerapp.Name) (string, error) { + return throttleResponse(tablet, fmt.Sprintf("throttler/unthrottle-app?app=%s", throttlerApp.String())) +} + +func throttlerCheck(tablet *cluster.VttabletProcess, throttlerApp throttlerapp.Name) (respBody string, statusCode int, err error) { + apiURL := fmt.Sprintf("http://%s:%d/throttler/check?app=%s", tablet.TabletHostname, tablet.Port, throttlerApp.String()) + resp, err := httpClient.Get(apiURL) + if err != nil { + return "", 0, err + } + defer resp.Body.Close() + statusCode = resp.StatusCode + b, err := io.ReadAll(resp.Body) + respBody = string(b) + return respBody, statusCode, err +} + +// waitForThrottleCheckStatus waits for the tablet to return the provided HTTP code in a throttle check +func waitForThrottleCheckStatus(t *testing.T, throttlerApp throttlerapp.Name, tablet *cluster.Vttablet, wantCode int) { + ctx, cancel := context.WithTimeout(context.Background(), migrationWaitTimeout) + defer cancel() + ticker := time.NewTicker(time.Second) + defer ticker.Stop() + + for { + respBody, statusCode, err := throttlerCheck(tablet.VttabletProcess, throttlerApp) + require.NoError(t, err) + + if wantCode == statusCode { + return + } + select { + case <-ctx.Done(): + assert.Equalf(t, wantCode, statusCode, "body: %s", respBody) + return + case <-ticker.C: + } + } +} diff --git a/test/config.json b/test/config.json index 2e612e57ca5..21c16af6caf 100644 --- a/test/config.json +++ b/test/config.json @@ -313,6 +313,15 @@ "RetryMax": 1, "Tags": [] }, + "onlineddl_flow": { + "File": "unused.go", + "Args": 
["vitess.io/vitess/go/test/endtoend/onlineddl/flow", "-timeout", "30m"], + "Command": [], + "Manual": false, + "Shard": "onlineddl_flow", + "RetryMax": 1, + "Tags": ["upgrade_downgrade_onlineddl_flow"] + }, "schemadiff_vrepl": { "File": "unused.go", "Args": ["vitess.io/vitess/go/test/endtoend/schemadiff/vrepl", "-timeout", "30m"],