diff --git a/changelog/22.0/22.0.0/summary.md b/changelog/22.0/22.0.0/summary.md index 7c7257bfae3..c95c1b44dd8 100644 --- a/changelog/22.0/22.0.0/summary.md +++ b/changelog/22.0/22.0.0/summary.md @@ -5,6 +5,7 @@ - **[Major Changes](#major-changes)** - **[RPC Changes](#rpc-changes)** - **[Prefer not promoting a replica that is currently taking a backup](#reparents-prefer-not-backing-up)** + - **[VTOrc Config File Changes](#vtorc-config-file-changes)** ## Major Changes @@ -25,4 +26,25 @@ For planned reparents, hosts taking backups with a backup engine other than `bui valid candidates. This means they will never get promoted - not even if there's no other candidates. Note that behavior for `builtin` backups remains unchanged: a replica that is currently taking a `builtin` backup will -never be promoted, neither by planned nor by emergency reparents. \ No newline at end of file +never be promoted, neither by planned nor by emergency reparents. + +### VTOrc Config File Changes + +The configuration file for VTOrc has been updated to now support dynamic fields. The old `--config` parameter has been removed. The alternative is to use the `--config-file` parameter. The configuration can now be provided in both json, yaml or any other format that [viper](https://github.com/spf13/viper) supports. + +The following fields can be dynamically changed - +1. `InstancePollTime` +2. `PreventCrossCellFailover` +3. `SnapshotTopologyInterval` +4. `ReasonableReplicationLag` +5. `AuditToBackend` +6. `AuditToSyslog` +7. `AuditPurgeDuration` +8. `WaitReplicasTimeout` +9. `TolerableReplicationLag` +10. `TopoInformationRefreshDuration` +11. `RecoveryPollDuration` +12. `AllowEmergencyReparent` +13. `ChangeTabletsWithErrantGtidToDrained` + +To upgrade to the newer version of the configuration file, first switch to using the flags in your current deployment before upgrading. Then you can switch to using the configuration file in the newer release. diff --git a/examples/common/scripts/vtorc-up.sh b/examples/common/scripts/vtorc-up.sh index 23ca4e62b48..807f522b1f7 100755 --- a/examples/common/scripts/vtorc-up.sh +++ b/examples/common/scripts/vtorc-up.sh @@ -11,7 +11,9 @@ vtorc \ $TOPOLOGY_FLAGS \ --logtostderr \ --alsologtostderr \ - --config="${script_dir}/../vtorc/config.json" \ + --config-path="${script_dir}/../vtorc/" \ + --config-name="config.yaml" \ + --config-type="yml" \ --port $port \ > "${log_dir}/vtorc.out" 2>&1 & diff --git a/examples/common/vtorc/config.json b/examples/common/vtorc/config.json deleted file mode 100644 index 53b012c2162..00000000000 --- a/examples/common/vtorc/config.json +++ /dev/null @@ -1,4 +0,0 @@ -{ - "RecoveryPeriodBlockSeconds": 1, - "InstancePollSeconds": 1 -} \ No newline at end of file diff --git a/examples/common/vtorc/config.yaml b/examples/common/vtorc/config.yaml new file mode 100644 index 00000000000..e615f6814f4 --- /dev/null +++ b/examples/common/vtorc/config.yaml @@ -0,0 +1 @@ +InstancePollTime: 1s diff --git a/go/cmd/vtorc/cli/cli.go b/go/cmd/vtorc/cli/cli.go index 1233c1e2ac2..b79793c6492 100644 --- a/go/cmd/vtorc/cli/cli.go +++ b/go/cmd/vtorc/cli/cli.go @@ -20,6 +20,7 @@ import ( "github.com/spf13/cobra" "vitess.io/vitess/go/acl" + "vitess.io/vitess/go/viperutil/debug" "vitess.io/vitess/go/vt/log" "vitess.io/vitess/go/vt/servenv" "vitess.io/vitess/go/vt/vtorc/config" @@ -29,8 +30,7 @@ import ( ) var ( - configFile string - Main = &cobra.Command{ + Main = &cobra.Command{ Use: "vtorc", Short: "VTOrc is the automated fault detection and repair tool in Vitess.", Example: `vtorc \ @@ -51,22 +51,16 @@ var ( func run(cmd *cobra.Command, args []string) { servenv.Init() - config.UpdateConfigValuesFromFlags() inst.RegisterStats() log.Info("starting vtorc") - if len(configFile) > 0 { - config.ForceRead(configFile) - } else { - config.Read("/etc/vtorc.conf.json", "conf/vtorc.conf.json", "vtorc.conf.json") - } - if config.Config.AuditToSyslog { + if config.GetAuditToSyslog() { inst.EnableAuditSyslog() } config.MarkConfigurationLoaded() // Log final config values to debug if something goes wrong. - config.LogConfigValues() + log.Infof("Running with Configuration - %v", debug.AllSettings()) server.StartVTOrcDiscovery() server.RegisterVTOrcAPIEndpoints() @@ -96,7 +90,5 @@ func init() { servenv.MoveFlagsToCobraCommand(Main) logic.RegisterFlags(Main.Flags()) - config.RegisterFlags(Main.Flags()) acl.RegisterFlags(Main.Flags()) - Main.Flags().StringVar(&configFile, "config", "", "config file name") } diff --git a/go/flags/endtoend/vtorc.txt b/go/flags/endtoend/vtorc.txt index d34b4404df7..efccb0afdfc 100644 --- a/go/flags/endtoend/vtorc.txt +++ b/go/flags/endtoend/vtorc.txt @@ -25,7 +25,6 @@ Flags: --catch-sigpipe catch and ignore SIGPIPE on stdout and stderr if specified --change-tablets-with-errant-gtid-to-drained Whether VTOrc should be changing the type of tablets with errant GTIDs to DRAINED --clusters_to_watch strings Comma-separated list of keyspaces or keyspace/shards that this instance will monitor and repair. Defaults to all clusters in the topology. Example: "ks1,ks2/-80" - --config string config file name --config-file string Full path of the config file (with extension) to use. If set, --config-path, --config-type, and --config-name are ignored. --config-file-not-found-handling ConfigFileNotFoundHandling Behavior when a config file is not found. (Options: error, exit, ignore, warn) (default warn) --config-name string Name of the config file (without extension) to search for. (default "vtconfig") diff --git a/go/test/endtoend/cluster/cluster_process.go b/go/test/endtoend/cluster/cluster_process.go index 6f800a70a49..b89e007b4f2 100644 --- a/go/test/endtoend/cluster/cluster_process.go +++ b/go/test/endtoend/cluster/cluster_process.go @@ -1301,7 +1301,6 @@ func (cluster *LocalProcessCluster) NewVTOrcProcess(config VTOrcConfiguration) * VtctlProcess: *base, LogDir: cluster.TmpDirectory, Config: config, - WebPort: cluster.GetAndReservePort(), Port: cluster.GetAndReservePort(), } } diff --git a/go/test/endtoend/cluster/vtorc_process.go b/go/test/endtoend/cluster/vtorc_process.go index 4fcb68e292d..af101a8bebd 100644 --- a/go/test/endtoend/cluster/vtorc_process.go +++ b/go/test/endtoend/cluster/vtorc_process.go @@ -43,20 +43,28 @@ type VTOrcProcess struct { ExtraArgs []string ConfigPath string Config VTOrcConfiguration - WebPort int + NoOverride bool proc *exec.Cmd exit chan error } type VTOrcConfiguration struct { - Debug bool - ListenAddress string - RecoveryPeriodBlockSeconds int - TopologyRefreshSeconds int `json:",omitempty"` - PreventCrossDataCenterPrimaryFailover bool `json:",omitempty"` - LockShardTimeoutSeconds int `json:",omitempty"` - ReplicationLagQuery string `json:",omitempty"` - FailPrimaryPromotionOnLagMinutes int `json:",omitempty"` + InstancePollTime string `json:"instance-poll-time,omitempty"` + SnapshotTopologyInterval string `json:"snapshot-topology-interval,omitempty"` + PreventCrossCellFailover bool `json:"prevent-cross-cell-failover,omitempty"` + ReasonableReplicationLag string `json:"reasonable-replication-lag,omitempty"` + AuditToBackend bool `json:"audit-to-backend,omitempty"` + AuditToSyslog bool `json:"audit-to-syslog,omitempty"` + AuditPurgeDuration string `json:"audit-purge-duration,omitempty"` + WaitReplicasTimeout string `json:"wait-replicas-timeout,omitempty"` + TolerableReplicationLag string `json:"tolerable-replication-lag,omitempty"` + TopoInformationRefreshDuration string `json:"topo-information-refresh-duration,omitempty"` + RecoveryPollDuration string `json:"recovery-poll-duration,omitempty"` + AllowEmergencyReparent string `json:"allow-emergency-reparent,omitempty"` + ChangeTabletsWithErrantGtidToDrained bool `json:"change-tablets-with-errant-gtid-to-drained,omitempty"` + LockShardTimeoutSeconds int `json:",omitempty"` + ReplicationLagQuery string `json:",omitempty"` + FailPrimaryPromotionOnLagMinutes int `json:",omitempty"` } // ToJSONString will marshal this configuration as JSON @@ -65,12 +73,12 @@ func (config *VTOrcConfiguration) ToJSONString() string { return string(b) } -func (config *VTOrcConfiguration) AddDefaults(webPort int) { - config.Debug = true - if config.RecoveryPeriodBlockSeconds == 0 { - config.RecoveryPeriodBlockSeconds = 1 - } - config.ListenAddress = fmt.Sprintf(":%d", webPort) +func (config *VTOrcConfiguration) addValuesToCheckOverride() { + config.InstancePollTime = "10h" +} + +func (orc *VTOrcProcess) RewriteConfiguration() error { + return os.WriteFile(orc.ConfigPath, []byte(orc.Config.ToJSONString()), 0644) } // Setup starts orc process with required arguements @@ -91,7 +99,9 @@ func (orc *VTOrcProcess) Setup() (err error) { orc.ConfigPath = configFile.Name() // Add the default configurations and print them out - orc.Config.AddDefaults(orc.WebPort) + if !orc.NoOverride { + orc.Config.addValuesToCheckOverride() + } log.Errorf("configuration - %v", orc.Config.ToJSONString()) _, err = configFile.WriteString(orc.Config.ToJSONString()) if err != nil { @@ -111,15 +121,18 @@ func (orc *VTOrcProcess) Setup() (err error) { "--topo_implementation", orc.TopoImplementation, "--topo_global_server_address", orc.TopoGlobalAddress, "--topo_global_root", orc.TopoGlobalRoot, - "--config", orc.ConfigPath, + "--config-file", orc.ConfigPath, "--port", fmt.Sprintf("%d", orc.Port), - // This parameter is overriden from the config file, added here to just verify that we indeed use the config file paramter over the flag - "--recovery-period-block-duration", "10h", - "--instance-poll-time", "1s", - // Faster topo information refresh speeds up the tests. This doesn't add any significant load either - "--topo-information-refresh-duration", "3s", "--bind-address", "127.0.0.1", ) + if !orc.NoOverride { + orc.proc.Args = append(orc.proc.Args, + // This parameter is overriden from the config file. This verifies that we indeed use the flag value over the config file. + "--instance-poll-time", "1s", + // Faster topo information refresh speeds up the tests. This doesn't add any significant load either. + "--topo-information-refresh-duration", "3s", + ) + } if *isCoverage { orc.proc.Args = append(orc.proc.Args, "--test.coverprofile="+getCoveragePath("orc.out")) diff --git a/go/test/endtoend/vtorc/api/api_test.go b/go/test/endtoend/vtorc/api/api_test.go index 670e8c803fa..638ea5fa72e 100644 --- a/go/test/endtoend/vtorc/api/api_test.go +++ b/go/test/endtoend/vtorc/api/api_test.go @@ -35,8 +35,7 @@ import ( func TestAPIEndpoints(t *testing.T) { defer cluster.PanicHandler(t) utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 2, 1, nil, cluster.VTOrcConfiguration{ - PreventCrossDataCenterPrimaryFailover: true, - RecoveryPeriodBlockSeconds: 5, + PreventCrossCellFailover: true, }, 1, "") keyspace := &clusterInfo.ClusterInstance.Keyspaces[0] shard0 := &keyspace.Shards[0] diff --git a/go/test/endtoend/vtorc/api/config_test.go b/go/test/endtoend/vtorc/api/config_test.go new file mode 100644 index 00000000000..71cc6291be7 --- /dev/null +++ b/go/test/endtoend/vtorc/api/config_test.go @@ -0,0 +1,204 @@ +/* +Copyright 2024 The Vitess Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +*/ + +package api + +import ( + "strings" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "vitess.io/vitess/go/test/endtoend/cluster" + "vitess.io/vitess/go/test/endtoend/vtorc/utils" +) + +// TestDynamicConfigs tests the dyanamic configurations that VTOrc offers. +func TestDynamicConfigs(t *testing.T) { + defer cluster.PanicHandler(t) + utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 2, 1, nil, cluster.VTOrcConfiguration{}, 1, "") + vtorc := clusterInfo.ClusterInstance.VTOrcProcesses[0] + + // Restart VTOrc without any flag overrides so that all the configurations can be tested. + err := vtorc.TearDown() + require.NoError(t, err) + vtorc.Config = cluster.VTOrcConfiguration{} + vtorc.NoOverride = true + err = vtorc.Setup() + require.NoError(t, err) + + // Call API with retry to ensure VTOrc is up + status, resp := utils.MakeAPICallRetry(t, vtorc, "/debug/health", func(code int, response string) bool { + return code != 200 + }) + // Verify when VTOrc is healthy, it has also run the first discovery. + assert.Equal(t, 200, status) + assert.Contains(t, resp, `"Healthy": true,`) + + t.Run("InstancePollTime", func(t *testing.T) { + // Get configuration and verify the output. + waitForConfig(t, vtorc, `"instance-poll-time": 5000000000`) + // Update configuration and verify the output. + vtorc.Config.InstancePollTime = "10h" + err := vtorc.RewriteConfiguration() + assert.NoError(t, err) + // Wait until the config has been updated and seen. + waitForConfig(t, vtorc, `"instance-poll-time": "10h"`) + }) + + t.Run("PreventCrossCellFailover", func(t *testing.T) { + // Get configuration and verify the output. + waitForConfig(t, vtorc, `"prevent-cross-cell-failover": false`) + // Update configuration and verify the output. + vtorc.Config.PreventCrossCellFailover = true + err := vtorc.RewriteConfiguration() + assert.NoError(t, err) + // Wait until the config has been updated and seen. + waitForConfig(t, vtorc, `"prevent-cross-cell-failover": true`) + }) + + t.Run("SnapshotTopologyInterval", func(t *testing.T) { + // Get configuration and verify the output. + waitForConfig(t, vtorc, `"snapshot-topology-interval": 0`) + // Update configuration and verify the output. + vtorc.Config.SnapshotTopologyInterval = "10h" + err := vtorc.RewriteConfiguration() + assert.NoError(t, err) + // Wait until the config has been updated and seen. + waitForConfig(t, vtorc, `"snapshot-topology-interval": "10h"`) + }) + + t.Run("ReasonableReplicationLag", func(t *testing.T) { + // Get configuration and verify the output. + waitForConfig(t, vtorc, `"reasonable-replication-lag": 10000000000`) + // Update configuration and verify the output. + vtorc.Config.ReasonableReplicationLag = "10h" + err := vtorc.RewriteConfiguration() + assert.NoError(t, err) + // Wait until the config has been updated and seen. + waitForConfig(t, vtorc, `"reasonable-replication-lag": "10h"`) + }) + + t.Run("AuditToBackend", func(t *testing.T) { + // Get configuration and verify the output. + waitForConfig(t, vtorc, `"audit-to-backend": false`) + // Update configuration and verify the output. + vtorc.Config.AuditToBackend = true + err := vtorc.RewriteConfiguration() + assert.NoError(t, err) + // Wait until the config has been updated and seen. + waitForConfig(t, vtorc, `"audit-to-backend": true`) + }) + + t.Run("AuditToSyslog", func(t *testing.T) { + // Get configuration and verify the output. + waitForConfig(t, vtorc, `"audit-to-syslog": false`) + // Update configuration and verify the output. + vtorc.Config.AuditToSyslog = true + err := vtorc.RewriteConfiguration() + assert.NoError(t, err) + // Wait until the config has been updated and seen. + waitForConfig(t, vtorc, `"audit-to-syslog": true`) + }) + + t.Run("AuditPurgeDuration", func(t *testing.T) { + // Get configuration and verify the output. + waitForConfig(t, vtorc, `"audit-purge-duration": 604800000000000`) + // Update configuration and verify the output. + vtorc.Config.AuditPurgeDuration = "10h" + err := vtorc.RewriteConfiguration() + assert.NoError(t, err) + // Wait until the config has been updated and seen. + waitForConfig(t, vtorc, `"audit-purge-duration": "10h"`) + }) + + t.Run("WaitReplicasTimeout", func(t *testing.T) { + // Get configuration and verify the output. + waitForConfig(t, vtorc, `"wait-replicas-timeout": 30000000000`) + // Update configuration and verify the output. + vtorc.Config.WaitReplicasTimeout = "10h" + err := vtorc.RewriteConfiguration() + assert.NoError(t, err) + // Wait until the config has been updated and seen. + waitForConfig(t, vtorc, `"wait-replicas-timeout": "10h"`) + }) + + t.Run("TolerableReplicationLag", func(t *testing.T) { + // Get configuration and verify the output. + waitForConfig(t, vtorc, `"tolerable-replication-lag": 0`) + // Update configuration and verify the output. + vtorc.Config.TolerableReplicationLag = "10h" + err := vtorc.RewriteConfiguration() + assert.NoError(t, err) + // Wait until the config has been updated and seen. + waitForConfig(t, vtorc, `"tolerable-replication-lag": "10h"`) + }) + + t.Run("TopoInformationRefreshDuration", func(t *testing.T) { + // Get configuration and verify the output. + waitForConfig(t, vtorc, `"topo-information-refresh-duration": 15000000000`) + // Update configuration and verify the output. + vtorc.Config.TopoInformationRefreshDuration = "10h" + err := vtorc.RewriteConfiguration() + assert.NoError(t, err) + // Wait until the config has been updated and seen. + waitForConfig(t, vtorc, `"topo-information-refresh-duration": "10h"`) + }) + + t.Run("RecoveryPollDuration", func(t *testing.T) { + // Get configuration and verify the output. + waitForConfig(t, vtorc, `"recovery-poll-duration": 1000000000`) + // Update configuration and verify the output. + vtorc.Config.RecoveryPollDuration = "10h" + err := vtorc.RewriteConfiguration() + assert.NoError(t, err) + // Wait until the config has been updated and seen. + waitForConfig(t, vtorc, `"recovery-poll-duration": "10h"`) + }) + + t.Run("AllowEmergencyReparent", func(t *testing.T) { + // Get configuration and verify the output. + waitForConfig(t, vtorc, `"allow-emergency-reparent": true`) + // Update configuration and verify the output. + vtorc.Config.AllowEmergencyReparent = "false" + err := vtorc.RewriteConfiguration() + assert.NoError(t, err) + // Wait until the config has been updated and seen. + waitForConfig(t, vtorc, `"allow-emergency-reparent": "false"`) + }) + + t.Run("ChangeTabletsWithErrantGtidToDrained", func(t *testing.T) { + // Get configuration and verify the output. + waitForConfig(t, vtorc, `"change-tablets-with-errant-gtid-to-drained": false`) + // Update configuration and verify the output. + vtorc.Config.ChangeTabletsWithErrantGtidToDrained = true + err := vtorc.RewriteConfiguration() + assert.NoError(t, err) + // Wait until the config has been updated and seen. + waitForConfig(t, vtorc, `"change-tablets-with-errant-gtid-to-drained": true`) + }) +} + +// waitForConfig waits for the expectedConfig to be present in the VTOrc configuration. +func waitForConfig(t *testing.T, vtorc *cluster.VTOrcProcess, expectedConfig string) { + t.Helper() + status, _ := utils.MakeAPICallRetry(t, vtorc, "/api/config", func(_ int, response string) bool { + return !strings.Contains(response, expectedConfig) + }) + require.EqualValues(t, 200, status) +} diff --git a/go/test/endtoend/vtorc/general/vtorc_test.go b/go/test/endtoend/vtorc/general/vtorc_test.go index 329601deb0c..f83ede48f44 100644 --- a/go/test/endtoend/vtorc/general/vtorc_test.go +++ b/go/test/endtoend/vtorc/general/vtorc_test.go @@ -42,7 +42,7 @@ func TestPrimaryElection(t *testing.T) { defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance) defer cluster.PanicHandler(t) utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 2, 1, nil, cluster.VTOrcConfiguration{ - PreventCrossDataCenterPrimaryFailover: true, + PreventCrossCellFailover: true, }, 2, "") keyspace := &clusterInfo.ClusterInstance.Keyspaces[0] shard0 := &keyspace.Shards[0] @@ -70,7 +70,7 @@ func TestSingleKeyspace(t *testing.T) { defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance) defer cluster.PanicHandler(t) utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 1, 1, []string{"--clusters_to_watch", "ks"}, cluster.VTOrcConfiguration{ - PreventCrossDataCenterPrimaryFailover: true, + PreventCrossCellFailover: true, }, 1, "") keyspace := &clusterInfo.ClusterInstance.Keyspaces[0] shard0 := &keyspace.Shards[0] @@ -89,7 +89,7 @@ func TestKeyspaceShard(t *testing.T) { defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance) defer cluster.PanicHandler(t) utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 1, 1, []string{"--clusters_to_watch", "ks/0"}, cluster.VTOrcConfiguration{ - PreventCrossDataCenterPrimaryFailover: true, + PreventCrossCellFailover: true, }, 1, "") keyspace := &clusterInfo.ClusterInstance.Keyspaces[0] shard0 := &keyspace.Shards[0] @@ -111,7 +111,7 @@ func TestVTOrcRepairs(t *testing.T) { defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance) defer cluster.PanicHandler(t) utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 3, 0, []string{"--change-tablets-with-errant-gtid-to-drained"}, cluster.VTOrcConfiguration{ - PreventCrossDataCenterPrimaryFailover: true, + PreventCrossCellFailover: true, }, 1, "") keyspace := &clusterInfo.ClusterInstance.Keyspaces[0] shard0 := &keyspace.Shards[0] @@ -290,7 +290,7 @@ func TestRepairAfterTER(t *testing.T) { defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance) defer cluster.PanicHandler(t) utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 2, 0, nil, cluster.VTOrcConfiguration{ - PreventCrossDataCenterPrimaryFailover: true, + PreventCrossCellFailover: true, }, 1, "") keyspace := &clusterInfo.ClusterInstance.Keyspaces[0] shard0 := &keyspace.Shards[0] @@ -326,7 +326,7 @@ func TestSemiSync(t *testing.T) { newCluster := utils.SetupNewClusterSemiSync(t) defer utils.PrintVTOrcLogsOnFailure(t, newCluster.ClusterInstance) utils.StartVTOrcs(t, newCluster, nil, cluster.VTOrcConfiguration{ - PreventCrossDataCenterPrimaryFailover: true, + PreventCrossCellFailover: true, }, 1) defer func() { utils.StopVTOrcs(t, newCluster) @@ -424,7 +424,7 @@ func TestVTOrcWithPrs(t *testing.T) { defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance) defer cluster.PanicHandler(t) utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 4, 0, nil, cluster.VTOrcConfiguration{ - PreventCrossDataCenterPrimaryFailover: true, + PreventCrossCellFailover: true, }, 1, "") keyspace := &clusterInfo.ClusterInstance.Keyspaces[0] shard0 := &keyspace.Shards[0] @@ -456,8 +456,6 @@ func TestVTOrcWithPrs(t *testing.T) { "--new-primary", replica.Alias) require.NoError(t, err, "error in PlannedReparentShard output - %s", output) - time.Sleep(40 * time.Second) - // check that the replica gets promoted utils.CheckPrimaryTablet(t, clusterInfo, replica, true) // Verify that VTOrc didn't run any other recovery @@ -560,7 +558,7 @@ func TestDurabilityPolicySetLater(t *testing.T) { // Now start the vtorc instances utils.StartVTOrcs(t, newCluster, nil, cluster.VTOrcConfiguration{ - PreventCrossDataCenterPrimaryFailover: true, + PreventCrossCellFailover: true, }, 1) defer func() { utils.StopVTOrcs(t, newCluster) @@ -587,7 +585,7 @@ func TestFullStatusConnectionPooling(t *testing.T) { utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 4, 0, []string{ "--tablet_manager_grpc_concurrency=1", }, cluster.VTOrcConfiguration{ - PreventCrossDataCenterPrimaryFailover: true, + PreventCrossCellFailover: true, }, 1, "") keyspace := &clusterInfo.ClusterInstance.Keyspaces[0] shard0 := &keyspace.Shards[0] diff --git a/go/test/endtoend/vtorc/primaryfailure/primary_failure_test.go b/go/test/endtoend/vtorc/primaryfailure/primary_failure_test.go index 886aa3a580a..a46e3789730 100644 --- a/go/test/endtoend/vtorc/primaryfailure/primary_failure_test.go +++ b/go/test/endtoend/vtorc/primaryfailure/primary_failure_test.go @@ -44,7 +44,7 @@ func TestDownPrimary(t *testing.T) { // If that replica is more advanced than the same-cell-replica, then we try to promote the cross-cell replica as an intermediate source. // If we don't specify a small value of --wait-replicas-timeout, then we would end up waiting for 30 seconds for the dead-primary to respond, failing this test. utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 2, 1, []string{"--remote_operation_timeout=10s", "--wait-replicas-timeout=5s"}, cluster.VTOrcConfiguration{ - PreventCrossDataCenterPrimaryFailover: true, + PreventCrossCellFailover: true, }, 1, "semi_sync") keyspace := &clusterInfo.ClusterInstance.Keyspaces[0] shard0 := &keyspace.Shards[0] @@ -150,7 +150,7 @@ func TestDownPrimaryBeforeVTOrc(t *testing.T) { // Start a VTOrc instance utils.StartVTOrcs(t, clusterInfo, []string{"--remote_operation_timeout=10s"}, cluster.VTOrcConfiguration{ - PreventCrossDataCenterPrimaryFailover: true, + PreventCrossCellFailover: true, }, 1) vtOrcProcess := clusterInfo.ClusterInstance.VTOrcProcesses[0] @@ -244,7 +244,7 @@ func TestDeadPrimaryRecoversImmediately(t *testing.T) { // If that replica is more advanced than the same-cell-replica, then we try to promote the cross-cell replica as an intermediate source. // If we don't specify a small value of --wait-replicas-timeout, then we would end up waiting for 30 seconds for the dead-primary to respond, failing this test. utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 2, 1, []string{"--remote_operation_timeout=10s", "--wait-replicas-timeout=5s"}, cluster.VTOrcConfiguration{ - PreventCrossDataCenterPrimaryFailover: true, + PreventCrossCellFailover: true, }, 1, "semi_sync") keyspace := &clusterInfo.ClusterInstance.Keyspaces[0] shard0 := &keyspace.Shards[0] @@ -324,7 +324,7 @@ func TestCrossDataCenterFailure(t *testing.T) { defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance) defer cluster.PanicHandler(t) utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 2, 1, nil, cluster.VTOrcConfiguration{ - PreventCrossDataCenterPrimaryFailover: true, + PreventCrossCellFailover: true, }, 1, "") keyspace := &clusterInfo.ClusterInstance.Keyspaces[0] shard0 := &keyspace.Shards[0] @@ -370,7 +370,7 @@ func TestCrossDataCenterFailureError(t *testing.T) { defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance) defer cluster.PanicHandler(t) utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 1, 1, nil, cluster.VTOrcConfiguration{ - PreventCrossDataCenterPrimaryFailover: true, + PreventCrossCellFailover: true, }, 1, "") keyspace := &clusterInfo.ClusterInstance.Keyspaces[0] shard0 := &keyspace.Shards[0] @@ -417,7 +417,7 @@ func TestLostRdonlyOnPrimaryFailure(t *testing.T) { defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance) defer cluster.PanicHandler(t) utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 2, 2, nil, cluster.VTOrcConfiguration{ - PreventCrossDataCenterPrimaryFailover: true, + PreventCrossCellFailover: true, }, 1, "") keyspace := &clusterInfo.ClusterInstance.Keyspaces[0] shard0 := &keyspace.Shards[0] @@ -728,8 +728,8 @@ func TestDownPrimaryPromotionRuleWithLagCrossCenter(t *testing.T) { defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance) defer cluster.PanicHandler(t) utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 2, 1, nil, cluster.VTOrcConfiguration{ - LockShardTimeoutSeconds: 5, - PreventCrossDataCenterPrimaryFailover: true, + LockShardTimeoutSeconds: 5, + PreventCrossCellFailover: true, }, 1, "test") keyspace := &clusterInfo.ClusterInstance.Keyspaces[0] shard0 := &keyspace.Shards[0] diff --git a/go/test/endtoend/vtorc/readtopologyinstance/main_test.go b/go/test/endtoend/vtorc/readtopologyinstance/main_test.go index fa8dc116782..823655ed785 100644 --- a/go/test/endtoend/vtorc/readtopologyinstance/main_test.go +++ b/go/test/endtoend/vtorc/readtopologyinstance/main_test.go @@ -55,8 +55,7 @@ func TestReadTopologyInstanceBufferable(t *testing.T) { "--topo_global_root", clusterInfo.ClusterInstance.VtctlProcess.TopoGlobalRoot, } servenv.ParseFlags("vtorc") - config.Config.RecoveryPeriodBlockSeconds = 1 - config.Config.InstancePollSeconds = 1 + config.SetInstancePollTime(1 * time.Second) config.MarkConfigurationLoaded() server.StartVTOrcDiscovery() diff --git a/go/test/endtoend/vtorc/utils/utils.go b/go/test/endtoend/vtorc/utils/utils.go index 680d1bfa39a..456d55518dd 100644 --- a/go/test/endtoend/vtorc/utils/utils.go +++ b/go/test/endtoend/vtorc/utils/utils.go @@ -376,7 +376,6 @@ func CheckPrimaryTablet(t *testing.T, clusterInfo *VTOrcClusterInfo, tablet *clu for { now := time.Now() if now.Sub(start) > time.Second*60 { - //log.Exitf("error") assert.FailNow(t, "failed to elect primary before timeout") } tabletInfo, err := clusterInfo.ClusterInstance.VtctldClientProcess.GetTablet(tablet.Alias) @@ -775,10 +774,10 @@ func MakeAPICallRetry(t *testing.T, vtorc *cluster.VTOrcProcess, url string, ret for { select { case <-timeout: - t.Fatal("timed out waiting for api to work") + t.Fatalf("timed out waiting for api to work. Last response - %s", response) return default: - status, response, _ := MakeAPICall(t, vtorc, url) + status, response, _ = MakeAPICall(t, vtorc, url) if retry(status, response) { time.Sleep(1 * time.Second) break diff --git a/go/viperutil/debug/debug.go b/go/viperutil/debug/debug.go index 66cbc7f2962..662634a5675 100644 --- a/go/viperutil/debug/debug.go +++ b/go/viperutil/debug/debug.go @@ -25,3 +25,13 @@ import ( func Debug() { registry.Combined().Debug() } + +// WriteConfigAs writes the config into the given filename. +func WriteConfigAs(filename string) error { + return registry.Combined().WriteConfigAs(filename) +} + +// AllSettings gets all the settings in the configuration. +func AllSettings() map[string]any { + return registry.Combined().AllSettings() +} diff --git a/go/vt/servenv/servenv.go b/go/vt/servenv/servenv.go index 4aa9818eb7d..22bf3523dfc 100644 --- a/go/vt/servenv/servenv.go +++ b/go/vt/servenv/servenv.go @@ -370,6 +370,14 @@ func moveFlags(name string, fs *pflag.FlagSet) { // functions. func CobraPreRunE(cmd *cobra.Command, args []string) error { _flag.TrickGlog() + // Register logging on config file change. + ch := make(chan struct{}) + viperutil.NotifyConfigReload(ch) + go func() { + for range ch { + log.Infof("Change in configuration - %v", viperdebug.AllSettings()) + } + }() watchCancel, err := viperutil.LoadConfig() if err != nil { @@ -377,6 +385,10 @@ func CobraPreRunE(cmd *cobra.Command, args []string) error { } OnTerm(watchCancel) + // Register a function to be called on termination that closes the channel. + // This is done after the watchCancel has registered to ensure that we don't end up + // sending on a closed channel. + OnTerm(func() { close(ch) }) HTTPHandleFunc("/debug/config", viperdebug.HandlerFunc) logutil.PurgeLogs() diff --git a/go/vt/vtorc/config/config.go b/go/vt/vtorc/config/config.go index 2d21e377cb6..cafff5acce8 100644 --- a/go/vt/vtorc/config/config.go +++ b/go/vt/vtorc/config/config.go @@ -17,14 +17,12 @@ package config import ( - "encoding/json" - "fmt" - "os" "time" "github.com/spf13/pflag" - "vitess.io/vitess/go/vt/log" + "vitess.io/vitess/go/viperutil" + "vitess.io/vitess/go/vt/servenv" ) var configurationLoaded = make(chan bool) @@ -42,200 +40,296 @@ const ( ) var ( - sqliteDataFile = "file::memory:?mode=memory&cache=shared" - instancePollTime = 5 * time.Second - snapshotTopologyInterval = 0 * time.Hour - reasonableReplicationLag = 10 * time.Second - auditFileLocation = "" - auditToBackend = false - auditToSyslog = false - auditPurgeDuration = 7 * 24 * time.Hour // Equivalent of 7 days - recoveryPeriodBlockDuration = 30 * time.Second - preventCrossCellFailover = false - waitReplicasTimeout = 30 * time.Second - tolerableReplicationLag = 0 * time.Second - topoInformationRefreshDuration = 15 * time.Second - recoveryPollDuration = 1 * time.Second - ersEnabled = true - convertTabletsWithErrantGTIDs = false + instancePollTime = viperutil.Configure( + "instance-poll-time", + viperutil.Options[time.Duration]{ + FlagName: "instance-poll-time", + Default: 5 * time.Second, + Dynamic: true, + }, + ) + + preventCrossCellFailover = viperutil.Configure( + "prevent-cross-cell-failover", + viperutil.Options[bool]{ + FlagName: "prevent-cross-cell-failover", + Default: false, + Dynamic: true, + }, + ) + + sqliteDataFile = viperutil.Configure( + "sqlite-data-file", + viperutil.Options[string]{ + FlagName: "sqlite-data-file", + Default: "file::memory:?mode=memory&cache=shared", + Dynamic: false, + }, + ) + + snapshotTopologyInterval = viperutil.Configure( + "snapshot-topology-interval", + viperutil.Options[time.Duration]{ + FlagName: "snapshot-topology-interval", + Default: 0 * time.Hour, + Dynamic: true, + }, + ) + + reasonableReplicationLag = viperutil.Configure( + "reasonable-replication-lag", + viperutil.Options[time.Duration]{ + FlagName: "reasonable-replication-lag", + Default: 10 * time.Second, + Dynamic: true, + }, + ) + + auditFileLocation = viperutil.Configure( + "audit-file-location", + viperutil.Options[string]{ + FlagName: "audit-file-location", + Default: "", + Dynamic: false, + }, + ) + + auditToBackend = viperutil.Configure( + "audit-to-backend", + viperutil.Options[bool]{ + FlagName: "audit-to-backend", + Default: false, + Dynamic: true, + }, + ) + + auditToSyslog = viperutil.Configure( + "audit-to-syslog", + viperutil.Options[bool]{ + FlagName: "audit-to-syslog", + Default: false, + Dynamic: true, + }, + ) + + auditPurgeDuration = viperutil.Configure( + "audit-purge-duration", + viperutil.Options[time.Duration]{ + FlagName: "audit-purge-duration", + Default: 7 * 24 * time.Hour, + Dynamic: true, + }, + ) + + waitReplicasTimeout = viperutil.Configure( + "wait-replicas-timeout", + viperutil.Options[time.Duration]{ + FlagName: "wait-replicas-timeout", + Default: 30 * time.Second, + Dynamic: true, + }, + ) + + tolerableReplicationLag = viperutil.Configure( + "tolerable-replication-lag", + viperutil.Options[time.Duration]{ + FlagName: "tolerable-replication-lag", + Default: 0 * time.Second, + Dynamic: true, + }, + ) + + topoInformationRefreshDuration = viperutil.Configure( + "topo-information-refresh-duration", + viperutil.Options[time.Duration]{ + FlagName: "topo-information-refresh-duration", + Default: 15 * time.Second, + Dynamic: true, + }, + ) + + recoveryPollDuration = viperutil.Configure( + "recovery-poll-duration", + viperutil.Options[time.Duration]{ + FlagName: "recovery-poll-duration", + Default: 1 * time.Second, + Dynamic: true, + }, + ) + + ersEnabled = viperutil.Configure( + "allow-emergency-reparent", + viperutil.Options[bool]{ + FlagName: "allow-emergency-reparent", + Default: true, + Dynamic: true, + }, + ) + + convertTabletsWithErrantGTIDs = viperutil.Configure( + "change-tablets-with-errant-gtid-to-drained", + viperutil.Options[bool]{ + FlagName: "change-tablets-with-errant-gtid-to-drained", + Default: false, + Dynamic: true, + }, + ) ) -// RegisterFlags registers the flags required by VTOrc -func RegisterFlags(fs *pflag.FlagSet) { - fs.StringVar(&sqliteDataFile, "sqlite-data-file", sqliteDataFile, "SQLite Datafile to use as VTOrc's database") - fs.DurationVar(&instancePollTime, "instance-poll-time", instancePollTime, "Timer duration on which VTOrc refreshes MySQL information") - fs.DurationVar(&snapshotTopologyInterval, "snapshot-topology-interval", snapshotTopologyInterval, "Timer duration on which VTOrc takes a snapshot of the current MySQL information it has in the database. Should be in multiple of hours") - fs.DurationVar(&reasonableReplicationLag, "reasonable-replication-lag", reasonableReplicationLag, "Maximum replication lag on replicas which is deemed to be acceptable") - fs.StringVar(&auditFileLocation, "audit-file-location", auditFileLocation, "File location where the audit logs are to be stored") - fs.BoolVar(&auditToBackend, "audit-to-backend", auditToBackend, "Whether to store the audit log in the VTOrc database") - fs.BoolVar(&auditToSyslog, "audit-to-syslog", auditToSyslog, "Whether to store the audit log in the syslog") - fs.DurationVar(&auditPurgeDuration, "audit-purge-duration", auditPurgeDuration, "Duration for which audit logs are held before being purged. Should be in multiples of days") - fs.DurationVar(&recoveryPeriodBlockDuration, "recovery-period-block-duration", recoveryPeriodBlockDuration, "Duration for which a new recovery is blocked on an instance after running a recovery") - fs.MarkDeprecated("recovery-period-block-duration", "As of v20 this is ignored and will be removed in a future release.") - fs.BoolVar(&preventCrossCellFailover, "prevent-cross-cell-failover", preventCrossCellFailover, "Prevent VTOrc from promoting a primary in a different cell than the current primary in case of a failover") - fs.DurationVar(&waitReplicasTimeout, "wait-replicas-timeout", waitReplicasTimeout, "Duration for which to wait for replica's to respond when issuing RPCs") - fs.DurationVar(&tolerableReplicationLag, "tolerable-replication-lag", tolerableReplicationLag, "Amount of replication lag that is considered acceptable for a tablet to be eligible for promotion when Vitess makes the choice of a new primary in PRS") - fs.DurationVar(&topoInformationRefreshDuration, "topo-information-refresh-duration", topoInformationRefreshDuration, "Timer duration on which VTOrc refreshes the keyspace and vttablet records from the topology server") - fs.DurationVar(&recoveryPollDuration, "recovery-poll-duration", recoveryPollDuration, "Timer duration on which VTOrc polls its database to run a recovery") - fs.BoolVar(&ersEnabled, "allow-emergency-reparent", ersEnabled, "Whether VTOrc should be allowed to run emergency reparent operation when it detects a dead primary") - fs.BoolVar(&convertTabletsWithErrantGTIDs, "change-tablets-with-errant-gtid-to-drained", convertTabletsWithErrantGTIDs, "Whether VTOrc should be changing the type of tablets with errant GTIDs to DRAINED") +func init() { + servenv.OnParseFor("vtorc", registerFlags) } -// Configuration makes for vtorc configuration input, which can be provided by user via JSON formatted file. -// Some of the parameters have reasonable default values, and some (like database credentials) are -// strictly expected from user. -// TODO(sougou): change this to yaml parsing, and possible merge with tabletenv. -type Configuration struct { - SQLite3DataFile string // full path to sqlite3 datafile - InstancePollSeconds uint // Number of seconds between instance reads - SnapshotTopologiesIntervalHours uint // Interval in hour between snapshot-topologies invocation. Default: 0 (disabled) - ReasonableReplicationLagSeconds int // Above this value is considered a problem - AuditLogFile string // Name of log file for audit operations. Disabled when empty. - AuditToSyslog bool // If true, audit messages are written to syslog - AuditToBackendDB bool // If true, audit messages are written to the backend DB's `audit` table (default: true) - AuditPurgeDays uint // Days after which audit entries are purged from the database - RecoveryPeriodBlockSeconds int // (overrides `RecoveryPeriodBlockMinutes`) The time for which an instance's recovery is kept "active", so as to avoid concurrent recoveries on smae instance as well as flapping - PreventCrossDataCenterPrimaryFailover bool // When true (default: false), cross-DC primary failover are not allowed, vtorc will do all it can to only fail over within same DC, or else not fail over at all. - WaitReplicasTimeoutSeconds int // Timeout on amount of time to wait for the replicas in case of ERS. Should be a small value because we should fail-fast. Should not be larger than LockTimeout since that is the total time we use for an ERS. - TolerableReplicationLagSeconds int // Amount of replication lag that is considered acceptable for a tablet to be eligible for promotion when Vitess makes the choice of a new primary in PRS. - TopoInformationRefreshSeconds int // Timer duration on which VTOrc refreshes the keyspace and vttablet records from the topo-server. - RecoveryPollSeconds int // Timer duration on which VTOrc recovery analysis runs +// registerFlags registers the flags required by VTOrc +func registerFlags(fs *pflag.FlagSet) { + fs.String("sqlite-data-file", sqliteDataFile.Default(), "SQLite Datafile to use as VTOrc's database") + fs.Duration("instance-poll-time", instancePollTime.Default(), "Timer duration on which VTOrc refreshes MySQL information") + fs.Duration("snapshot-topology-interval", snapshotTopologyInterval.Default(), "Timer duration on which VTOrc takes a snapshot of the current MySQL information it has in the database. Should be in multiple of hours") + fs.Duration("reasonable-replication-lag", reasonableReplicationLag.Default(), "Maximum replication lag on replicas which is deemed to be acceptable") + fs.String("audit-file-location", auditFileLocation.Default(), "File location where the audit logs are to be stored") + fs.Bool("audit-to-backend", auditToBackend.Default(), "Whether to store the audit log in the VTOrc database") + fs.Bool("audit-to-syslog", auditToSyslog.Default(), "Whether to store the audit log in the syslog") + fs.Duration("audit-purge-duration", auditPurgeDuration.Default(), "Duration for which audit logs are held before being purged. Should be in multiples of days") + fs.Bool("prevent-cross-cell-failover", preventCrossCellFailover.Default(), "Prevent VTOrc from promoting a primary in a different cell than the current primary in case of a failover") + fs.Duration("wait-replicas-timeout", waitReplicasTimeout.Default(), "Duration for which to wait for replica's to respond when issuing RPCs") + fs.Duration("tolerable-replication-lag", tolerableReplicationLag.Default(), "Amount of replication lag that is considered acceptable for a tablet to be eligible for promotion when Vitess makes the choice of a new primary in PRS") + fs.Duration("topo-information-refresh-duration", topoInformationRefreshDuration.Default(), "Timer duration on which VTOrc refreshes the keyspace and vttablet records from the topology server") + fs.Duration("recovery-poll-duration", recoveryPollDuration.Default(), "Timer duration on which VTOrc polls its database to run a recovery") + fs.Bool("allow-emergency-reparent", ersEnabled.Default(), "Whether VTOrc should be allowed to run emergency reparent operation when it detects a dead primary") + fs.Bool("change-tablets-with-errant-gtid-to-drained", convertTabletsWithErrantGTIDs.Default(), "Whether VTOrc should be changing the type of tablets with errant GTIDs to DRAINED") + + viperutil.BindFlags(fs, + instancePollTime, + preventCrossCellFailover, + sqliteDataFile, + snapshotTopologyInterval, + reasonableReplicationLag, + auditFileLocation, + auditToBackend, + auditToSyslog, + auditPurgeDuration, + waitReplicasTimeout, + tolerableReplicationLag, + topoInformationRefreshDuration, + recoveryPollDuration, + ersEnabled, + convertTabletsWithErrantGTIDs, + ) } -// ToJSONString will marshal this configuration as JSON -func (config *Configuration) ToJSONString() string { - b, _ := json.Marshal(config) - return string(b) +// GetInstancePollTime is a getter function. +func GetInstancePollTime() time.Duration { + return instancePollTime.Get() } -// Config is *the* configuration instance, used globally to get configuration data -var Config = newConfiguration() -var readFileNames []string - -// UpdateConfigValuesFromFlags is used to update the config values from the flags defined. -// This is done before we read any configuration files from the user. So the config files take precedence. -func UpdateConfigValuesFromFlags() { - Config.SQLite3DataFile = sqliteDataFile - Config.InstancePollSeconds = uint(instancePollTime / time.Second) - Config.InstancePollSeconds = uint(instancePollTime / time.Second) - Config.SnapshotTopologiesIntervalHours = uint(snapshotTopologyInterval / time.Hour) - Config.ReasonableReplicationLagSeconds = int(reasonableReplicationLag / time.Second) - Config.AuditLogFile = auditFileLocation - Config.AuditToBackendDB = auditToBackend - Config.AuditToSyslog = auditToSyslog - Config.AuditPurgeDays = uint(auditPurgeDuration / (time.Hour * 24)) - Config.RecoveryPeriodBlockSeconds = int(recoveryPeriodBlockDuration / time.Second) - Config.PreventCrossDataCenterPrimaryFailover = preventCrossCellFailover - Config.WaitReplicasTimeoutSeconds = int(waitReplicasTimeout / time.Second) - Config.TolerableReplicationLagSeconds = int(tolerableReplicationLag / time.Second) - Config.TopoInformationRefreshSeconds = int(topoInformationRefreshDuration / time.Second) - Config.RecoveryPollSeconds = int(recoveryPollDuration / time.Second) +// SetInstancePollTime is a setter function. +func SetInstancePollTime(v time.Duration) { + instancePollTime.Set(v) } -// ERSEnabled reports whether VTOrc is allowed to run ERS or not. -func ERSEnabled() bool { - return ersEnabled +// GetInstancePollSeconds gets the instance poll time but in seconds. +func GetInstancePollSeconds() uint { + return uint(instancePollTime.Get() / time.Second) } -// SetERSEnabled sets the value for the ersEnabled variable. This should only be used from tests. -func SetERSEnabled(val bool) { - ersEnabled = val +// GetPreventCrossCellFailover is a getter function. +func GetPreventCrossCellFailover() bool { + return preventCrossCellFailover.Get() } -// ConvertTabletWithErrantGTIDs reports whether VTOrc is allowed to change the tablet type of tablets with errant GTIDs to DRAINED. -func ConvertTabletWithErrantGTIDs() bool { - return convertTabletsWithErrantGTIDs +// GetSQLiteDataFile is a getter function. +func GetSQLiteDataFile() string { + return sqliteDataFile.Get() } -// SetConvertTabletWithErrantGTIDs sets the value for the convertTabletWithErrantGTIDs variable. This should only be used from tests. -func SetConvertTabletWithErrantGTIDs(val bool) { - convertTabletsWithErrantGTIDs = val +// GetReasonableReplicationLagSeconds gets the reasonable replication lag but in seconds. +func GetReasonableReplicationLagSeconds() int64 { + return int64(reasonableReplicationLag.Get() / time.Second) +} + +// GetSnapshotTopologyInterval is a getter function. +func GetSnapshotTopologyInterval() time.Duration { + return snapshotTopologyInterval.Get() } -// LogConfigValues is used to log the config values. -func LogConfigValues() { - b, _ := json.MarshalIndent(Config, "", "\t") - log.Infof("Running with Configuration - %v", string(b)) +// GetAuditFileLocation is a getter function. +func GetAuditFileLocation() string { + return auditFileLocation.Get() } -func newConfiguration() *Configuration { - return &Configuration{ - SQLite3DataFile: "file::memory:?mode=memory&cache=shared", - InstancePollSeconds: 5, - SnapshotTopologiesIntervalHours: 0, - ReasonableReplicationLagSeconds: 10, - AuditLogFile: "", - AuditToSyslog: false, - AuditToBackendDB: false, - AuditPurgeDays: 7, - RecoveryPeriodBlockSeconds: 30, - PreventCrossDataCenterPrimaryFailover: false, - WaitReplicasTimeoutSeconds: 30, - TopoInformationRefreshSeconds: 15, - RecoveryPollSeconds: 1, - } +// SetAuditFileLocation is a setter function. +func SetAuditFileLocation(v string) { + auditFileLocation.Set(v) } -func (config *Configuration) postReadAdjustments() error { - if config.SQLite3DataFile == "" { - return fmt.Errorf("SQLite3DataFile must be set") - } +// GetAuditToSyslog is a getter function. +func GetAuditToSyslog() bool { + return auditToSyslog.Get() +} + +// SetAuditToSyslog is a setter function. +func SetAuditToSyslog(v bool) { + auditToSyslog.Set(v) +} + +// GetAuditToBackend is a getter function. +func GetAuditToBackend() bool { + return auditToBackend.Get() +} + +// SetAuditToBackend is a setter function. +func SetAuditToBackend(v bool) { + auditToBackend.Set(v) +} - return nil +// GetAuditPurgeDays gets the audit purge duration but in days. +func GetAuditPurgeDays() int64 { + return int64(auditPurgeDuration.Get() / (24 * time.Hour)) } -// read reads configuration from given file, or silently skips if the file does not exist. -// If the file does exist, then it is expected to be in valid JSON format or the function bails out. -func read(fileName string) (*Configuration, error) { - if fileName == "" { - return Config, fmt.Errorf("Empty file name") - } - file, err := os.Open(fileName) - if err != nil { - return Config, err - } - decoder := json.NewDecoder(file) - err = decoder.Decode(Config) - if err == nil { - log.Infof("Read config: %s", fileName) - } else { - log.Fatal("Cannot read config file:", fileName, err) - } - if err := Config.postReadAdjustments(); err != nil { - log.Fatal(err) - } - return Config, err +// SetAuditPurgeDays sets the audit purge duration. +func SetAuditPurgeDays(days int64) { + auditPurgeDuration.Set(time.Duration(days) * 24 * time.Hour) } -// Read reads configuration from zero, either, some or all given files, in order of input. -// A file can override configuration provided in previous file. -func Read(fileNames ...string) *Configuration { - for _, fileName := range fileNames { - _, _ = read(fileName) - } - readFileNames = fileNames - return Config +// GetWaitReplicasTimeout is a getter function. +func GetWaitReplicasTimeout() time.Duration { + return waitReplicasTimeout.Get() } -// ForceRead reads configuration from given file name or bails out if it fails -func ForceRead(fileName string) *Configuration { - _, err := read(fileName) - if err != nil { - log.Fatal("Cannot read config file:", fileName, err) - } - readFileNames = []string{fileName} - return Config +// GetTolerableReplicationLag is a getter function. +func GetTolerableReplicationLag() time.Duration { + return tolerableReplicationLag.Get() } -// Reload re-reads configuration from last used files -func Reload(extraFileNames ...string) *Configuration { - for _, fileName := range readFileNames { - _, _ = read(fileName) - } - for _, fileName := range extraFileNames { - _, _ = read(fileName) - } - return Config +// GetTopoInformationRefreshDuration is a getter function. +func GetTopoInformationRefreshDuration() time.Duration { + return topoInformationRefreshDuration.Get() +} + +// GetRecoveryPollDuration is a getter function. +func GetRecoveryPollDuration() time.Duration { + return recoveryPollDuration.Get() +} + +// ERSEnabled reports whether VTOrc is allowed to run ERS or not. +func ERSEnabled() bool { + return ersEnabled.Get() +} + +// SetERSEnabled sets the value for the ersEnabled variable. This should only be used from tests. +func SetERSEnabled(val bool) { + ersEnabled.Set(val) +} + +// ConvertTabletWithErrantGTIDs reports whether VTOrc is allowed to change the tablet type of tablets with errant GTIDs to DRAINED. +func ConvertTabletWithErrantGTIDs() bool { + return convertTabletsWithErrantGTIDs.Get() +} + +// SetConvertTabletWithErrantGTIDs sets the value for the convertTabletWithErrantGTIDs variable. This should only be used from tests. +func SetConvertTabletWithErrantGTIDs(val bool) { + convertTabletsWithErrantGTIDs.Set(val) } // MarkConfigurationLoaded is called once configuration has first been loaded. diff --git a/go/vt/vtorc/config/config_test.go b/go/vt/vtorc/config/config_test.go deleted file mode 100644 index 2009b476f1d..00000000000 --- a/go/vt/vtorc/config/config_test.go +++ /dev/null @@ -1,234 +0,0 @@ -/* -Copyright 2022 The Vitess Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package config - -import ( - "testing" - "time" - - "github.com/stretchr/testify/require" -) - -func TestUpdateConfigValuesFromFlags(t *testing.T) { - t.Run("defaults", func(t *testing.T) { - // Restore the changes we make to the Config parameter - defer func() { - Config = newConfiguration() - }() - defaultConfig := newConfiguration() - UpdateConfigValuesFromFlags() - require.Equal(t, defaultConfig, Config) - }) - - t.Run("override auditPurgeDuration", func(t *testing.T) { - oldAuditPurgeDuration := auditPurgeDuration - auditPurgeDuration = 8 * time.Hour * 24 - auditPurgeDuration += time.Second + 4*time.Minute - // Restore the changes we make - defer func() { - Config = newConfiguration() - auditPurgeDuration = oldAuditPurgeDuration - }() - - testConfig := newConfiguration() - // auditPurgeDuration is supposed to be in multiples of days. - // If it is not, then we round down to the nearest number of days. - testConfig.AuditPurgeDays = 8 - UpdateConfigValuesFromFlags() - require.Equal(t, testConfig, Config) - }) - - t.Run("override sqliteDataFile", func(t *testing.T) { - oldSqliteDataFile := sqliteDataFile - sqliteDataFile = "newVal" - // Restore the changes we make - defer func() { - Config = newConfiguration() - sqliteDataFile = oldSqliteDataFile - }() - - testConfig := newConfiguration() - testConfig.SQLite3DataFile = "newVal" - UpdateConfigValuesFromFlags() - require.Equal(t, testConfig, Config) - }) - - t.Run("override instancePollTime", func(t *testing.T) { - oldInstancePollTime := instancePollTime - instancePollTime = 7 * time.Second - // Restore the changes we make - defer func() { - Config = newConfiguration() - instancePollTime = oldInstancePollTime - }() - - testConfig := newConfiguration() - testConfig.InstancePollSeconds = 7 - UpdateConfigValuesFromFlags() - require.Equal(t, testConfig, Config) - }) - - t.Run("override snapshotTopologyInterval", func(t *testing.T) { - oldSnapshotTopologyInterval := snapshotTopologyInterval - snapshotTopologyInterval = 1 * time.Hour - // Restore the changes we make - defer func() { - Config = newConfiguration() - snapshotTopologyInterval = oldSnapshotTopologyInterval - }() - - testConfig := newConfiguration() - testConfig.SnapshotTopologiesIntervalHours = 1 - UpdateConfigValuesFromFlags() - require.Equal(t, testConfig, Config) - }) - - t.Run("override reasonableReplicationLag", func(t *testing.T) { - oldReasonableReplicationLag := reasonableReplicationLag - reasonableReplicationLag = 15 * time.Second - // Restore the changes we make - defer func() { - Config = newConfiguration() - reasonableReplicationLag = oldReasonableReplicationLag - }() - - testConfig := newConfiguration() - testConfig.ReasonableReplicationLagSeconds = 15 - UpdateConfigValuesFromFlags() - require.Equal(t, testConfig, Config) - }) - - t.Run("override auditFileLocation", func(t *testing.T) { - oldAuditFileLocation := auditFileLocation - auditFileLocation = "newFile" - // Restore the changes we make - defer func() { - Config = newConfiguration() - auditFileLocation = oldAuditFileLocation - }() - - testConfig := newConfiguration() - testConfig.AuditLogFile = "newFile" - UpdateConfigValuesFromFlags() - require.Equal(t, testConfig, Config) - }) - - t.Run("override auditToBackend", func(t *testing.T) { - oldAuditToBackend := auditToBackend - auditToBackend = true - // Restore the changes we make - defer func() { - Config = newConfiguration() - auditToBackend = oldAuditToBackend - }() - - testConfig := newConfiguration() - testConfig.AuditToBackendDB = true - UpdateConfigValuesFromFlags() - require.Equal(t, testConfig, Config) - }) - - t.Run("override auditToSyslog", func(t *testing.T) { - oldAuditToSyslog := auditToSyslog - auditToSyslog = true - // Restore the changes we make - defer func() { - Config = newConfiguration() - auditToSyslog = oldAuditToSyslog - }() - - testConfig := newConfiguration() - testConfig.AuditToSyslog = true - UpdateConfigValuesFromFlags() - require.Equal(t, testConfig, Config) - }) - - t.Run("override recoveryPeriodBlockDuration", func(t *testing.T) { - oldRecoveryPeriodBlockDuration := recoveryPeriodBlockDuration - recoveryPeriodBlockDuration = 5 * time.Minute - // Restore the changes we make - defer func() { - Config = newConfiguration() - recoveryPeriodBlockDuration = oldRecoveryPeriodBlockDuration - }() - - testConfig := newConfiguration() - testConfig.RecoveryPeriodBlockSeconds = 300 - UpdateConfigValuesFromFlags() - require.Equal(t, testConfig, Config) - }) - - t.Run("override preventCrossCellFailover", func(t *testing.T) { - oldPreventCrossCellFailover := preventCrossCellFailover - preventCrossCellFailover = true - // Restore the changes we make - defer func() { - Config = newConfiguration() - preventCrossCellFailover = oldPreventCrossCellFailover - }() - - testConfig := newConfiguration() - testConfig.PreventCrossDataCenterPrimaryFailover = true - UpdateConfigValuesFromFlags() - require.Equal(t, testConfig, Config) - }) - - t.Run("override waitReplicasTimeout", func(t *testing.T) { - oldWaitReplicasTimeout := waitReplicasTimeout - waitReplicasTimeout = 3*time.Minute + 4*time.Second - // Restore the changes we make - defer func() { - Config = newConfiguration() - waitReplicasTimeout = oldWaitReplicasTimeout - }() - - testConfig := newConfiguration() - testConfig.WaitReplicasTimeoutSeconds = 184 - UpdateConfigValuesFromFlags() - require.Equal(t, testConfig, Config) - }) - - t.Run("override topoInformationRefreshDuration", func(t *testing.T) { - oldTopoInformationRefreshDuration := topoInformationRefreshDuration - topoInformationRefreshDuration = 1 * time.Second - // Restore the changes we make - defer func() { - Config = newConfiguration() - topoInformationRefreshDuration = oldTopoInformationRefreshDuration - }() - - testConfig := newConfiguration() - testConfig.TopoInformationRefreshSeconds = 1 - UpdateConfigValuesFromFlags() - require.Equal(t, testConfig, Config) - }) - - t.Run("override recoveryPollDuration", func(t *testing.T) { - oldRecoveryPollDuration := recoveryPollDuration - recoveryPollDuration = 15 * time.Second - // Restore the changes we make - defer func() { - Config = newConfiguration() - recoveryPollDuration = oldRecoveryPollDuration - }() - - testConfig := newConfiguration() - testConfig.RecoveryPollSeconds = 15 - UpdateConfigValuesFromFlags() - require.Equal(t, testConfig, Config) - }) -} diff --git a/go/vt/vtorc/db/db.go b/go/vt/vtorc/db/db.go index 64143477645..6be9da0c27b 100644 --- a/go/vt/vtorc/db/db.go +++ b/go/vt/vtorc/db/db.go @@ -44,9 +44,9 @@ func (m *vtorcDB) QueryVTOrc(query string, argsArray []any, onRow func(sqlutils. // OpenTopology returns the DB instance for the vtorc backed database func OpenVTOrc() (db *sql.DB, err error) { var fromCache bool - db, fromCache, err = sqlutils.GetSQLiteDB(config.Config.SQLite3DataFile) + db, fromCache, err = sqlutils.GetSQLiteDB(config.GetSQLiteDataFile()) if err == nil && !fromCache { - log.Infof("Connected to vtorc backend: sqlite on %v", config.Config.SQLite3DataFile) + log.Infof("Connected to vtorc backend: sqlite on %v", config.GetSQLiteDataFile()) _ = initVTOrcDB(db) } if db != nil { @@ -94,7 +94,7 @@ func deployStatements(db *sql.DB, queries []string) error { // ClearVTOrcDatabase is used to clear the VTOrc database. This function is meant to be used by tests to clear the // database to get a clean slate without starting a new one. func ClearVTOrcDatabase() { - db, _, _ := sqlutils.GetSQLiteDB(config.Config.SQLite3DataFile) + db, _, _ := sqlutils.GetSQLiteDB(config.GetSQLiteDataFile()) if db != nil { _ = initVTOrcDB(db) } diff --git a/go/vt/vtorc/discovery/queue.go b/go/vt/vtorc/discovery/queue.go index 95751c6ae25..4b18303959b 100644 --- a/go/vt/vtorc/discovery/queue.go +++ b/go/vt/vtorc/discovery/queue.go @@ -153,7 +153,7 @@ func (q *Queue) Consume() string { // alarm if have been waiting for too long timeOnQueue := time.Since(q.queuedKeys[key]) - if timeOnQueue > time.Duration(config.Config.InstancePollSeconds)*time.Second { + if timeOnQueue > config.GetInstancePollTime() { log.Warningf("key %v spent %.4fs waiting on a discoveryQueue", key, timeOnQueue.Seconds()) } diff --git a/go/vt/vtorc/inst/analysis.go b/go/vt/vtorc/inst/analysis.go index 66d6c6dd9ce..3e9e81c5c9f 100644 --- a/go/vt/vtorc/inst/analysis.go +++ b/go/vt/vtorc/inst/analysis.go @@ -144,5 +144,5 @@ func (replicationAnalysis *ReplicationAnalysis) MarshalJSON() ([]byte, error) { // ValidSecondsFromSeenToLastAttemptedCheck returns the maximum allowed elapsed time // between last_attempted_check to last_checked before we consider the instance as invalid. func ValidSecondsFromSeenToLastAttemptedCheck() uint { - return config.Config.InstancePollSeconds + 1 + return config.GetInstancePollSeconds() } diff --git a/go/vt/vtorc/inst/analysis_dao.go b/go/vt/vtorc/inst/analysis_dao.go index 25d93a6864b..98abdcc7188 100644 --- a/go/vt/vtorc/inst/analysis_dao.go +++ b/go/vt/vtorc/inst/analysis_dao.go @@ -47,7 +47,7 @@ func init() { func initializeAnalysisDaoPostConfiguration() { config.WaitForConfigurationToBeLoaded() - recentInstantAnalysis = cache.New(time.Duration(config.Config.RecoveryPollSeconds*2)*time.Second, time.Second) + recentInstantAnalysis = cache.New(config.GetRecoveryPollDuration()*2, time.Second) } type clusterAnalysis struct { @@ -68,7 +68,7 @@ func GetReplicationAnalysis(keyspace string, shard string, hints *ReplicationAna } // TODO(sougou); deprecate ReduceReplicationAnalysisCount - args := sqlutils.Args(config.Config.ReasonableReplicationLagSeconds, ValidSecondsFromSeenToLastAttemptedCheck(), config.Config.ReasonableReplicationLagSeconds, keyspace, shard) + args := sqlutils.Args(config.GetReasonableReplicationLagSeconds(), ValidSecondsFromSeenToLastAttemptedCheck(), config.GetReasonableReplicationLagSeconds(), keyspace, shard) query := ` SELECT vitess_tablet.info AS tablet_info, diff --git a/go/vt/vtorc/inst/audit_dao.go b/go/vt/vtorc/inst/audit_dao.go index 642fb187509..e500289a382 100644 --- a/go/vt/vtorc/inst/audit_dao.go +++ b/go/vt/vtorc/inst/audit_dao.go @@ -38,10 +38,10 @@ func AuditOperation(auditType string, tabletAlias string, message string) error } auditWrittenToFile := false - if config.Config.AuditLogFile != "" { + if config.GetAuditFileLocation() != "" { auditWrittenToFile = true go func() { - f, err := os.OpenFile(config.Config.AuditLogFile, os.O_RDWR|os.O_CREATE|os.O_APPEND, 0640) + f, err := os.OpenFile(config.GetAuditFileLocation(), os.O_RDWR|os.O_CREATE|os.O_APPEND, 0640) if err != nil { log.Error(err) return @@ -54,7 +54,7 @@ func AuditOperation(auditType string, tabletAlias string, message string) error } }() } - if config.Config.AuditToBackendDB { + if config.GetAuditToBackend() { _, err := db.ExecVTOrc(` insert into audit ( diff --git a/go/vt/vtorc/inst/audit_dao_test.go b/go/vt/vtorc/inst/audit_dao_test.go index 1d50de4c146..d22e9177dc3 100644 --- a/go/vt/vtorc/inst/audit_dao_test.go +++ b/go/vt/vtorc/inst/audit_dao_test.go @@ -35,13 +35,13 @@ import ( // This test also verifies that we are able to read the recent audits that are written to the databaes. func TestAuditOperation(t *testing.T) { // Restore original configurations - originalAuditSysLog := config.Config.AuditToSyslog - originalAuditLogFile := config.Config.AuditLogFile - originalAuditBackend := config.Config.AuditToBackendDB + originalAuditSysLog := config.GetAuditToSyslog() + originalAuditLogFile := config.GetAuditFileLocation() + originalAuditBackend := config.GetAuditToBackend() defer func() { - config.Config.AuditToSyslog = originalAuditSysLog - config.Config.AuditLogFile = originalAuditLogFile - config.Config.AuditToBackendDB = originalAuditBackend + config.SetAuditToSyslog(originalAuditSysLog) + config.SetAuditFileLocation(originalAuditLogFile) + config.SetAuditToBackend(originalAuditBackend) }() orcDb, err := db.OpenVTOrc() @@ -78,9 +78,9 @@ func TestAuditOperation(t *testing.T) { message := "test-message" t.Run("audit to backend", func(t *testing.T) { - config.Config.AuditLogFile = "" - config.Config.AuditToSyslog = false - config.Config.AuditToBackendDB = true + config.SetAuditFileLocation("") + config.SetAuditToSyslog(false) + config.SetAuditToBackend(true) // Auditing should succeed as expected err = AuditOperation(auditType, tab100Alias, message) @@ -106,13 +106,13 @@ func TestAuditOperation(t *testing.T) { }) t.Run("audit to File", func(t *testing.T) { - config.Config.AuditToBackendDB = false - config.Config.AuditToSyslog = false + config.SetAuditToBackend(false) + config.SetAuditToSyslog(false) file, err := os.CreateTemp("", "test-auditing-*") require.NoError(t, err) defer os.Remove(file.Name()) - config.Config.AuditLogFile = file.Name() + config.SetAuditFileLocation(file.Name()) err = AuditOperation(auditType, tab100Alias, message) require.NoError(t, err) diff --git a/go/vt/vtorc/inst/instance_dao.go b/go/vt/vtorc/inst/instance_dao.go index bd4438dd05f..985af855713 100644 --- a/go/vt/vtorc/inst/instance_dao.go +++ b/go/vt/vtorc/inst/instance_dao.go @@ -80,7 +80,7 @@ func init() { func initializeInstanceDao() { config.WaitForConfigurationToBeLoaded() - forgetAliases = cache.New(time.Duration(config.Config.InstancePollSeconds*3)*time.Second, time.Second) + forgetAliases = cache.New(config.GetInstancePollTime()*3, time.Second) cacheInitializationCompleted.Store(true) } @@ -116,7 +116,7 @@ func ExpireTableData(tableName string, timestampColumn string) error { writeFunc := func() error { _, err := db.ExecVTOrc( fmt.Sprintf("delete from %s where %s < datetime('now', printf('-%%d DAY', ?))", tableName, timestampColumn), - config.Config.AuditPurgeDays, + config.GetAuditPurgeDays(), ) return err } @@ -544,8 +544,8 @@ func readInstanceRow(m sqlutils.RowMap) *Instance { instance.ReplicationDepth = m.GetUint("replication_depth") instance.IsCoPrimary = m.GetBool("is_co_primary") instance.HasReplicationCredentials = m.GetBool("has_replication_credentials") - instance.IsUpToDate = (m.GetUint("seconds_since_last_checked") <= config.Config.InstancePollSeconds) - instance.IsRecentlyChecked = (m.GetUint("seconds_since_last_checked") <= config.Config.InstancePollSeconds*5) + instance.IsUpToDate = m.GetUint("seconds_since_last_checked") <= config.GetInstancePollSeconds() + instance.IsRecentlyChecked = m.GetUint("seconds_since_last_checked") <= config.GetInstancePollSeconds()*5 instance.LastSeenTimestamp = m.GetString("last_seen") instance.IsLastCheckValid = m.GetBool("is_last_check_valid") instance.SecondsSinceLastSeen = m.GetNullInt64("seconds_since_last_seen") @@ -562,7 +562,7 @@ func readInstanceRow(m sqlutils.RowMap) *Instance { instance.Problems = append(instance.Problems, "not_recently_checked") } else if instance.ReplicationThreadsExist() && !instance.ReplicaRunning() { instance.Problems = append(instance.Problems, "not_replicating") - } else if instance.ReplicationLagSeconds.Valid && util.AbsInt64(instance.ReplicationLagSeconds.Int64-int64(instance.SQLDelay)) > int64(config.Config.ReasonableReplicationLagSeconds) { + } else if instance.ReplicationLagSeconds.Valid && util.AbsInt64(instance.ReplicationLagSeconds.Int64-int64(instance.SQLDelay)) > int64(config.GetReasonableReplicationLagSeconds()) { instance.Problems = append(instance.Problems, "replication_lag") } if instance.GtidErrant != "" { @@ -646,7 +646,7 @@ func ReadProblemInstances(keyspace string, shard string) ([](*Instance), error) ) ` - args := sqlutils.Args(keyspace, keyspace, shard, shard, config.Config.InstancePollSeconds*5, config.Config.ReasonableReplicationLagSeconds, config.Config.ReasonableReplicationLagSeconds) + args := sqlutils.Args(keyspace, keyspace, shard, shard, config.GetInstancePollSeconds()*5, config.GetReasonableReplicationLagSeconds(), config.GetReasonableReplicationLagSeconds()) return readInstancesByCondition(condition, args, "") } @@ -716,7 +716,7 @@ func ReadOutdatedInstanceKeys() ([]string, error) { WHERE database_instance.alias IS NULL ` - args := sqlutils.Args(config.Config.InstancePollSeconds, 2*config.Config.InstancePollSeconds) + args := sqlutils.Args(config.GetInstancePollSeconds(), 2*config.GetInstancePollSeconds()) err := db.QueryVTOrc(query, args, func(m sqlutils.RowMap) error { tabletAlias := m.GetString("alias") @@ -1127,7 +1127,7 @@ func SnapshotTopologies() error { } func ExpireStaleInstanceBinlogCoordinates() error { - expireSeconds := config.Config.ReasonableReplicationLagSeconds * 2 + expireSeconds := config.GetReasonableReplicationLagSeconds() * 2 if expireSeconds < config.StaleInstanceCoordinatesExpireSeconds { expireSeconds = config.StaleInstanceCoordinatesExpireSeconds } diff --git a/go/vt/vtorc/inst/instance_dao_test.go b/go/vt/vtorc/inst/instance_dao_test.go index 2416c1abb90..bc3febf97e6 100644 --- a/go/vt/vtorc/inst/instance_dao_test.go +++ b/go/vt/vtorc/inst/instance_dao_test.go @@ -241,11 +241,11 @@ func TestReadProblemInstances(t *testing.T) { // We need to set InstancePollSeconds to a large value otherwise all the instances are reported as having problems since their last_checked is very old. // Setting this value to a hundred years, we ensure that this test doesn't fail with this issue for the next hundred years. - oldVal := config.Config.InstancePollSeconds + oldVal := config.GetInstancePollTime() defer func() { - config.Config.InstancePollSeconds = oldVal + config.SetInstancePollTime(oldVal) }() - config.Config.InstancePollSeconds = 60 * 60 * 24 * 365 * 100 + config.SetInstancePollTime(60 * 60 * 24 * 365 * 100 * time.Second) for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { @@ -325,11 +325,11 @@ func TestReadInstancesWithErrantGTIds(t *testing.T) { // We need to set InstancePollSeconds to a large value otherwise all the instances are reported as having problems since their last_checked is very old. // Setting this value to a hundred years, we ensure that this test doesn't fail with this issue for the next hundred years. - oldVal := config.Config.InstancePollSeconds + oldVal := config.GetInstancePollTime() defer func() { - config.Config.InstancePollSeconds = oldVal + config.SetInstancePollTime(oldVal) }() - config.Config.InstancePollSeconds = 60 * 60 * 24 * 365 * 100 + config.SetInstancePollTime(60 * 60 * 24 * 365 * 100 * time.Second) for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { @@ -459,13 +459,13 @@ func TestReadOutdatedInstanceKeys(t *testing.T) { waitForCacheInitialization() // We are setting InstancePollSeconds to 59 minutes, just for the test. - oldVal := config.Config.InstancePollSeconds + oldVal := config.GetInstancePollTime() oldCache := forgetAliases defer func() { forgetAliases = oldCache - config.Config.InstancePollSeconds = oldVal + config.SetInstancePollTime(oldVal) }() - config.Config.InstancePollSeconds = 60 * 25 + config.SetInstancePollTime(60 * 25 * time.Second) forgetAliases = cache.New(time.Minute, time.Minute) for _, tt := range tests { @@ -718,10 +718,10 @@ func TestGetDatabaseState(t *testing.T) { } func TestExpireTableData(t *testing.T) { - oldVal := config.Config.AuditPurgeDays - config.Config.AuditPurgeDays = 10 + oldVal := config.GetAuditPurgeDays() + config.SetAuditPurgeDays(10) defer func() { - config.Config.AuditPurgeDays = oldVal + config.SetAuditPurgeDays(oldVal) }() tests := []struct { diff --git a/go/vt/vtorc/logic/tablet_discovery.go b/go/vt/vtorc/logic/tablet_discovery.go index e9bbcee35cb..90100308f7f 100644 --- a/go/vt/vtorc/logic/tablet_discovery.go +++ b/go/vt/vtorc/logic/tablet_discovery.go @@ -71,7 +71,7 @@ func OpenTabletDiscovery() <-chan time.Time { } // We refresh all information from the topo once before we start the ticks to do it on a timer. populateAllInformation() - return time.Tick(time.Second * time.Duration(config.Config.TopoInformationRefreshSeconds)) //nolint SA1015: using time.Tick leaks the underlying ticker + return time.Tick(config.GetTopoInformationRefreshDuration()) //nolint SA1015: using time.Tick leaks the underlying ticker } // populateAllInformation initializes all the information for VTOrc to function. diff --git a/go/vt/vtorc/logic/topology_recovery.go b/go/vt/vtorc/logic/topology_recovery.go index aec137a45b4..f14eca624c9 100644 --- a/go/vt/vtorc/logic/topology_recovery.go +++ b/go/vt/vtorc/logic/topology_recovery.go @@ -21,7 +21,6 @@ import ( "encoding/json" "fmt" "math/rand/v2" - "time" "vitess.io/vitess/go/stats" "vitess.io/vitess/go/vt/log" @@ -235,8 +234,8 @@ func runEmergencyReparentOp(ctx context.Context, analysisEntry *inst.Replication tablet.Shard, reparentutil.EmergencyReparentOptions{ IgnoreReplicas: nil, - WaitReplicasTimeout: time.Duration(config.Config.WaitReplicasTimeoutSeconds) * time.Second, - PreventCrossCellPromotion: config.Config.PreventCrossDataCenterPrimaryFailover, + WaitReplicasTimeout: config.GetWaitReplicasTimeout(), + PreventCrossCellPromotion: config.GetPreventCrossCellFailover(), WaitAllTablets: waitForAllTablets, }, ) @@ -703,8 +702,8 @@ func electNewPrimary(ctx context.Context, analysisEntry *inst.ReplicationAnalysi analyzedTablet.Keyspace, analyzedTablet.Shard, reparentutil.PlannedReparentOptions{ - WaitReplicasTimeout: time.Duration(config.Config.WaitReplicasTimeoutSeconds) * time.Second, - TolerableReplLag: time.Duration(config.Config.TolerableReplicationLagSeconds) * time.Second, + WaitReplicasTimeout: config.GetWaitReplicasTimeout(), + TolerableReplLag: config.GetTolerableReplicationLag(), }, ) diff --git a/go/vt/vtorc/logic/topology_recovery_dao_test.go b/go/vt/vtorc/logic/topology_recovery_dao_test.go index 20dfb7e91e2..6a1d7c4c48f 100644 --- a/go/vt/vtorc/logic/topology_recovery_dao_test.go +++ b/go/vt/vtorc/logic/topology_recovery_dao_test.go @@ -70,10 +70,10 @@ func TestTopologyRecovery(t *testing.T) { } func TestExpireTableData(t *testing.T) { - oldVal := config.Config.AuditPurgeDays - config.Config.AuditPurgeDays = 10 + oldVal := config.GetAuditPurgeDays() + config.SetAuditPurgeDays(10) defer func() { - config.Config.AuditPurgeDays = oldVal + config.SetAuditPurgeDays(oldVal) }() tests := []struct { diff --git a/go/vt/vtorc/logic/vtorc.go b/go/vt/vtorc/logic/vtorc.go index 9a468d1508a..b8cf404d050 100644 --- a/go/vt/vtorc/logic/vtorc.go +++ b/go/vt/vtorc/logic/vtorc.go @@ -17,11 +17,8 @@ package logic import ( - "os" - "os/signal" "sync" "sync/atomic" - "syscall" "time" "github.com/patrickmn/go-cache" @@ -73,26 +70,6 @@ func init() { }) } -// used in several places -func instancePollSecondsDuration() time.Duration { - return time.Duration(config.Config.InstancePollSeconds) * time.Second -} - -// acceptSighupSignal registers for SIGHUP signal from the OS to reload the configuration files. -func acceptSighupSignal() { - c := make(chan os.Signal, 1) - - signal.Notify(c, syscall.SIGHUP) - go func() { - for range c { - log.Infof("Received SIGHUP. Reloading configuration") - _ = inst.AuditOperation("reload-configuration", "", "Triggered via SIGHUP") - config.Reload() - discoveryMetrics.SetExpirePeriod(time.Duration(config.DiscoveryCollectionRetentionSeconds) * time.Second) - } - }() -} - // closeVTOrc runs all the operations required to cleanly shutdown VTOrc func closeVTOrc() { log.Infof("Starting VTOrc shutdown") @@ -161,7 +138,7 @@ func DiscoverInstance(tabletAlias string, forceDiscovery bool) { defer func() { latency.Stop("total") discoveryTime := latency.Elapsed("total") - if discoveryTime > instancePollSecondsDuration() { + if discoveryTime > config.GetInstancePollTime() { instancePollSecondsExceededCounter.Add(1) log.Warningf("discoverInstance exceeded InstancePollSeconds for %+v, took %.4fs", tabletAlias, discoveryTime.Seconds()) if metric != nil { @@ -177,7 +154,7 @@ func DiscoverInstance(tabletAlias string, forceDiscovery bool) { // Calculate the expiry period each time as InstancePollSeconds // _may_ change during the run of the process (via SIGHUP) and // it is not possible to change the cache's default expiry.. - if existsInCacheError := recentDiscoveryOperationKeys.Add(tabletAlias, true, instancePollSecondsDuration()); existsInCacheError != nil && !forceDiscovery { + if existsInCacheError := recentDiscoveryOperationKeys.Add(tabletAlias, true, config.GetInstancePollTime()); existsInCacheError != nil && !forceDiscovery { // Just recently attempted return } @@ -271,24 +248,23 @@ func onHealthTick() { // nolint SA1015: using time.Tick leaks the underlying ticker func ContinuousDiscovery() { log.Infof("continuous discovery: setting up") - recentDiscoveryOperationKeys = cache.New(instancePollSecondsDuration(), time.Second) + recentDiscoveryOperationKeys = cache.New(config.GetInstancePollTime(), time.Second) go handleDiscoveryRequests() healthTick := time.Tick(config.HealthPollSeconds * time.Second) caretakingTick := time.Tick(time.Minute) - recoveryTick := time.Tick(time.Duration(config.Config.RecoveryPollSeconds) * time.Second) + recoveryTick := time.Tick(config.GetRecoveryPollDuration()) tabletTopoTick := OpenTabletDiscovery() var recoveryEntrance int64 var snapshotTopologiesTick <-chan time.Time - if config.Config.SnapshotTopologiesIntervalHours > 0 { - snapshotTopologiesTick = time.Tick(time.Duration(config.Config.SnapshotTopologiesIntervalHours) * time.Hour) + if config.GetSnapshotTopologyInterval() > 0 { + snapshotTopologiesTick = time.Tick(config.GetSnapshotTopologyInterval()) } go func() { _ = ometrics.InitMetrics() }() - go acceptSighupSignal() // On termination of the server, we should close VTOrc cleanly servenv.OnTermSync(closeVTOrc) diff --git a/go/vt/vtorc/server/api.go b/go/vt/vtorc/server/api.go index 5e9a84c0a29..177f2c80333 100644 --- a/go/vt/vtorc/server/api.go +++ b/go/vt/vtorc/server/api.go @@ -25,6 +25,7 @@ import ( "time" "vitess.io/vitess/go/acl" + "vitess.io/vitess/go/viperutil/debug" "vitess.io/vitess/go/vt/servenv" "vitess.io/vitess/go/vt/vtorc/collection" "vitess.io/vitess/go/vt/vtorc/discovery" @@ -46,6 +47,7 @@ const ( enableGlobalRecoveriesAPI = "/api/enable-global-recoveries" replicationAnalysisAPI = "/api/replication-analysis" databaseStateAPI = "/api/database-state" + configAPI = "/api/config" healthAPI = "/debug/health" AggregatedDiscoveryMetricsAPI = "/api/aggregated-discovery-metrics" @@ -62,6 +64,7 @@ var ( enableGlobalRecoveriesAPI, replicationAnalysisAPI, databaseStateAPI, + configAPI, healthAPI, AggregatedDiscoveryMetricsAPI, } @@ -90,6 +93,8 @@ func (v *vtorcAPI) ServeHTTP(response http.ResponseWriter, request *http.Request replicationAnalysisAPIHandler(response, request) case databaseStateAPI: databaseStateAPIHandler(response) + case configAPI: + configAPIHandler(response) case AggregatedDiscoveryMetricsAPI: AggregatedDiscoveryMetricsAPIHandler(response, request) default: @@ -106,7 +111,7 @@ func getACLPermissionLevelForAPI(apiEndpoint string) string { return acl.MONITORING case disableGlobalRecoveriesAPI, enableGlobalRecoveriesAPI: return acl.ADMIN - case replicationAnalysisAPI: + case replicationAnalysisAPI, configAPI: return acl.MONITORING case healthAPI, databaseStateAPI: return acl.MONITORING @@ -180,6 +185,17 @@ func databaseStateAPIHandler(response http.ResponseWriter) { writePlainTextResponse(response, ds, http.StatusOK) } +// configAPIHandler is the handler for the configAPI endpoint +func configAPIHandler(response http.ResponseWriter) { + settingsMap := debug.AllSettings() + jsonOut, err := json.MarshalIndent(settingsMap, "", "\t") + if err != nil { + http.Error(response, err.Error(), http.StatusInternalServerError) + return + } + writePlainTextResponse(response, string(jsonOut), http.StatusOK) +} + // AggregatedDiscoveryMetricsAPIHandler is the handler for the discovery metrics endpoint func AggregatedDiscoveryMetricsAPIHandler(response http.ResponseWriter, request *http.Request) { // return metrics for last x seconds diff --git a/go/vt/vtorc/server/api_test.go b/go/vt/vtorc/server/api_test.go index c352d1e600f..ab6b9eed9af 100644 --- a/go/vt/vtorc/server/api_test.go +++ b/go/vt/vtorc/server/api_test.go @@ -31,6 +31,9 @@ func TestGetACLPermissionLevelForAPI(t *testing.T) { }, { apiEndpoint: healthAPI, want: acl.MONITORING, + }, { + apiEndpoint: configAPI, + want: acl.MONITORING, }, { apiEndpoint: "gibberish", want: acl.ADMIN,