From b3e4aecba5feb0656e809d5cfb24c7c2a2bc76c7 Mon Sep 17 00:00:00 2001 From: Manan Gupta Date: Tue, 12 Nov 2024 08:58:35 +0530 Subject: [PATCH 01/18] feat: remove deprecated flag Signed-off-by: Manan Gupta --- go/test/endtoend/cluster/vtorc_process.go | 15 ++++----------- go/test/endtoend/vtorc/api/api_test.go | 1 - go/test/endtoend/vtorc/general/vtorc_test.go | 2 -- go/test/endtoend/vtorc/utils/utils.go | 1 - go/vt/vtorc/config/config.go | 7 ------- go/vt/vtorc/config/config_test.go | 15 --------------- 6 files changed, 4 insertions(+), 37 deletions(-) diff --git a/go/test/endtoend/cluster/vtorc_process.go b/go/test/endtoend/cluster/vtorc_process.go index 4fcb68e292d..d0e4a6b5f35 100644 --- a/go/test/endtoend/cluster/vtorc_process.go +++ b/go/test/endtoend/cluster/vtorc_process.go @@ -49,9 +49,7 @@ type VTOrcProcess struct { } type VTOrcConfiguration struct { - Debug bool - ListenAddress string - RecoveryPeriodBlockSeconds int + InstancePollSeconds int `json:",omitempty"` TopologyRefreshSeconds int `json:",omitempty"` PreventCrossDataCenterPrimaryFailover bool `json:",omitempty"` LockShardTimeoutSeconds int `json:",omitempty"` @@ -66,11 +64,7 @@ func (config *VTOrcConfiguration) ToJSONString() string { } func (config *VTOrcConfiguration) AddDefaults(webPort int) { - config.Debug = true - if config.RecoveryPeriodBlockSeconds == 0 { - config.RecoveryPeriodBlockSeconds = 1 - } - config.ListenAddress = fmt.Sprintf(":%d", webPort) + config.InstancePollSeconds = 1 } // Setup starts orc process with required arguements @@ -114,9 +108,8 @@ func (orc *VTOrcProcess) Setup() (err error) { "--config", orc.ConfigPath, "--port", fmt.Sprintf("%d", orc.Port), // This parameter is overriden from the config file, added here to just verify that we indeed use the config file paramter over the flag - "--recovery-period-block-duration", "10h", - "--instance-poll-time", "1s", - // Faster topo information refresh speeds up the tests. This doesn't add any significant load either + "--instance-poll-time", "10h", + // Faster topo information refresh speeds up the tests. This doesn't add any significant load either. "--topo-information-refresh-duration", "3s", "--bind-address", "127.0.0.1", ) diff --git a/go/test/endtoend/vtorc/api/api_test.go b/go/test/endtoend/vtorc/api/api_test.go index 670e8c803fa..c9fa8840814 100644 --- a/go/test/endtoend/vtorc/api/api_test.go +++ b/go/test/endtoend/vtorc/api/api_test.go @@ -36,7 +36,6 @@ func TestAPIEndpoints(t *testing.T) { defer cluster.PanicHandler(t) utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 2, 1, nil, cluster.VTOrcConfiguration{ PreventCrossDataCenterPrimaryFailover: true, - RecoveryPeriodBlockSeconds: 5, }, 1, "") keyspace := &clusterInfo.ClusterInstance.Keyspaces[0] shard0 := &keyspace.Shards[0] diff --git a/go/test/endtoend/vtorc/general/vtorc_test.go b/go/test/endtoend/vtorc/general/vtorc_test.go index 329601deb0c..854623f45c3 100644 --- a/go/test/endtoend/vtorc/general/vtorc_test.go +++ b/go/test/endtoend/vtorc/general/vtorc_test.go @@ -456,8 +456,6 @@ func TestVTOrcWithPrs(t *testing.T) { "--new-primary", replica.Alias) require.NoError(t, err, "error in PlannedReparentShard output - %s", output) - time.Sleep(40 * time.Second) - // check that the replica gets promoted utils.CheckPrimaryTablet(t, clusterInfo, replica, true) // Verify that VTOrc didn't run any other recovery diff --git a/go/test/endtoend/vtorc/utils/utils.go b/go/test/endtoend/vtorc/utils/utils.go index 680d1bfa39a..b00d618079b 100644 --- a/go/test/endtoend/vtorc/utils/utils.go +++ b/go/test/endtoend/vtorc/utils/utils.go @@ -376,7 +376,6 @@ func CheckPrimaryTablet(t *testing.T, clusterInfo *VTOrcClusterInfo, tablet *clu for { now := time.Now() if now.Sub(start) > time.Second*60 { - //log.Exitf("error") assert.FailNow(t, "failed to elect primary before timeout") } tabletInfo, err := clusterInfo.ClusterInstance.VtctldClientProcess.GetTablet(tablet.Alias) diff --git a/go/vt/vtorc/config/config.go b/go/vt/vtorc/config/config.go index 2d21e377cb6..3341d08a163 100644 --- a/go/vt/vtorc/config/config.go +++ b/go/vt/vtorc/config/config.go @@ -50,7 +50,6 @@ var ( auditToBackend = false auditToSyslog = false auditPurgeDuration = 7 * 24 * time.Hour // Equivalent of 7 days - recoveryPeriodBlockDuration = 30 * time.Second preventCrossCellFailover = false waitReplicasTimeout = 30 * time.Second tolerableReplicationLag = 0 * time.Second @@ -70,8 +69,6 @@ func RegisterFlags(fs *pflag.FlagSet) { fs.BoolVar(&auditToBackend, "audit-to-backend", auditToBackend, "Whether to store the audit log in the VTOrc database") fs.BoolVar(&auditToSyslog, "audit-to-syslog", auditToSyslog, "Whether to store the audit log in the syslog") fs.DurationVar(&auditPurgeDuration, "audit-purge-duration", auditPurgeDuration, "Duration for which audit logs are held before being purged. Should be in multiples of days") - fs.DurationVar(&recoveryPeriodBlockDuration, "recovery-period-block-duration", recoveryPeriodBlockDuration, "Duration for which a new recovery is blocked on an instance after running a recovery") - fs.MarkDeprecated("recovery-period-block-duration", "As of v20 this is ignored and will be removed in a future release.") fs.BoolVar(&preventCrossCellFailover, "prevent-cross-cell-failover", preventCrossCellFailover, "Prevent VTOrc from promoting a primary in a different cell than the current primary in case of a failover") fs.DurationVar(&waitReplicasTimeout, "wait-replicas-timeout", waitReplicasTimeout, "Duration for which to wait for replica's to respond when issuing RPCs") fs.DurationVar(&tolerableReplicationLag, "tolerable-replication-lag", tolerableReplicationLag, "Amount of replication lag that is considered acceptable for a tablet to be eligible for promotion when Vitess makes the choice of a new primary in PRS") @@ -94,7 +91,6 @@ type Configuration struct { AuditToSyslog bool // If true, audit messages are written to syslog AuditToBackendDB bool // If true, audit messages are written to the backend DB's `audit` table (default: true) AuditPurgeDays uint // Days after which audit entries are purged from the database - RecoveryPeriodBlockSeconds int // (overrides `RecoveryPeriodBlockMinutes`) The time for which an instance's recovery is kept "active", so as to avoid concurrent recoveries on smae instance as well as flapping PreventCrossDataCenterPrimaryFailover bool // When true (default: false), cross-DC primary failover are not allowed, vtorc will do all it can to only fail over within same DC, or else not fail over at all. WaitReplicasTimeoutSeconds int // Timeout on amount of time to wait for the replicas in case of ERS. Should be a small value because we should fail-fast. Should not be larger than LockTimeout since that is the total time we use for an ERS. TolerableReplicationLagSeconds int // Amount of replication lag that is considered acceptable for a tablet to be eligible for promotion when Vitess makes the choice of a new primary in PRS. @@ -117,14 +113,12 @@ var readFileNames []string func UpdateConfigValuesFromFlags() { Config.SQLite3DataFile = sqliteDataFile Config.InstancePollSeconds = uint(instancePollTime / time.Second) - Config.InstancePollSeconds = uint(instancePollTime / time.Second) Config.SnapshotTopologiesIntervalHours = uint(snapshotTopologyInterval / time.Hour) Config.ReasonableReplicationLagSeconds = int(reasonableReplicationLag / time.Second) Config.AuditLogFile = auditFileLocation Config.AuditToBackendDB = auditToBackend Config.AuditToSyslog = auditToSyslog Config.AuditPurgeDays = uint(auditPurgeDuration / (time.Hour * 24)) - Config.RecoveryPeriodBlockSeconds = int(recoveryPeriodBlockDuration / time.Second) Config.PreventCrossDataCenterPrimaryFailover = preventCrossCellFailover Config.WaitReplicasTimeoutSeconds = int(waitReplicasTimeout / time.Second) Config.TolerableReplicationLagSeconds = int(tolerableReplicationLag / time.Second) @@ -168,7 +162,6 @@ func newConfiguration() *Configuration { AuditToSyslog: false, AuditToBackendDB: false, AuditPurgeDays: 7, - RecoveryPeriodBlockSeconds: 30, PreventCrossDataCenterPrimaryFailover: false, WaitReplicasTimeoutSeconds: 30, TopoInformationRefreshSeconds: 15, diff --git a/go/vt/vtorc/config/config_test.go b/go/vt/vtorc/config/config_test.go index 2009b476f1d..5a99ba07bf5 100644 --- a/go/vt/vtorc/config/config_test.go +++ b/go/vt/vtorc/config/config_test.go @@ -157,21 +157,6 @@ func TestUpdateConfigValuesFromFlags(t *testing.T) { require.Equal(t, testConfig, Config) }) - t.Run("override recoveryPeriodBlockDuration", func(t *testing.T) { - oldRecoveryPeriodBlockDuration := recoveryPeriodBlockDuration - recoveryPeriodBlockDuration = 5 * time.Minute - // Restore the changes we make - defer func() { - Config = newConfiguration() - recoveryPeriodBlockDuration = oldRecoveryPeriodBlockDuration - }() - - testConfig := newConfiguration() - testConfig.RecoveryPeriodBlockSeconds = 300 - UpdateConfigValuesFromFlags() - require.Equal(t, testConfig, Config) - }) - t.Run("override preventCrossCellFailover", func(t *testing.T) { oldPreventCrossCellFailover := preventCrossCellFailover preventCrossCellFailover = true From 85913d118bc231e208e59eee7e9e52a9a21df62b Mon Sep 17 00:00:00 2001 From: Manan Gupta Date: Tue, 12 Nov 2024 11:30:41 +0530 Subject: [PATCH 02/18] feat: remove config file flag and start using viper instead for instance poll seconds Signed-off-by: Manan Gupta --- go/cmd/vtorc/cli/cli.go | 10 +-- go/flags/endtoend/vtorc.txt | 1 - go/test/endtoend/cluster/vtorc_process.go | 11 ++- .../vtorc/readtopologyinstance/main_test.go | 3 +- go/viperutil/debug/debug.go | 4 ++ go/vt/vtorc/config/config.go | 70 +++++++++++-------- go/vt/vtorc/config/config_test.go | 15 ---- go/vt/vtorc/discovery/queue.go | 2 +- go/vt/vtorc/inst/analysis.go | 2 +- go/vt/vtorc/inst/instance_dao.go | 10 +-- go/vt/vtorc/inst/instance_dao_test.go | 18 ++--- go/vt/vtorc/logic/vtorc.go | 11 +-- 12 files changed, 69 insertions(+), 88 deletions(-) diff --git a/go/cmd/vtorc/cli/cli.go b/go/cmd/vtorc/cli/cli.go index 1233c1e2ac2..c58eaf2853c 100644 --- a/go/cmd/vtorc/cli/cli.go +++ b/go/cmd/vtorc/cli/cli.go @@ -29,8 +29,7 @@ import ( ) var ( - configFile string - Main = &cobra.Command{ + Main = &cobra.Command{ Use: "vtorc", Short: "VTOrc is the automated fault detection and repair tool in Vitess.", Example: `vtorc \ @@ -55,11 +54,6 @@ func run(cmd *cobra.Command, args []string) { inst.RegisterStats() log.Info("starting vtorc") - if len(configFile) > 0 { - config.ForceRead(configFile) - } else { - config.Read("/etc/vtorc.conf.json", "conf/vtorc.conf.json", "vtorc.conf.json") - } if config.Config.AuditToSyslog { inst.EnableAuditSyslog() } @@ -96,7 +90,5 @@ func init() { servenv.MoveFlagsToCobraCommand(Main) logic.RegisterFlags(Main.Flags()) - config.RegisterFlags(Main.Flags()) acl.RegisterFlags(Main.Flags()) - Main.Flags().StringVar(&configFile, "config", "", "config file name") } diff --git a/go/flags/endtoend/vtorc.txt b/go/flags/endtoend/vtorc.txt index d34b4404df7..efccb0afdfc 100644 --- a/go/flags/endtoend/vtorc.txt +++ b/go/flags/endtoend/vtorc.txt @@ -25,7 +25,6 @@ Flags: --catch-sigpipe catch and ignore SIGPIPE on stdout and stderr if specified --change-tablets-with-errant-gtid-to-drained Whether VTOrc should be changing the type of tablets with errant GTIDs to DRAINED --clusters_to_watch strings Comma-separated list of keyspaces or keyspace/shards that this instance will monitor and repair. Defaults to all clusters in the topology. Example: "ks1,ks2/-80" - --config string config file name --config-file string Full path of the config file (with extension) to use. If set, --config-path, --config-type, and --config-name are ignored. --config-file-not-found-handling ConfigFileNotFoundHandling Behavior when a config file is not found. (Options: error, exit, ignore, warn) (default warn) --config-name string Name of the config file (without extension) to search for. (default "vtconfig") diff --git a/go/test/endtoend/cluster/vtorc_process.go b/go/test/endtoend/cluster/vtorc_process.go index d0e4a6b5f35..8ed358f12a3 100644 --- a/go/test/endtoend/cluster/vtorc_process.go +++ b/go/test/endtoend/cluster/vtorc_process.go @@ -49,8 +49,7 @@ type VTOrcProcess struct { } type VTOrcConfiguration struct { - InstancePollSeconds int `json:",omitempty"` - TopologyRefreshSeconds int `json:",omitempty"` + InstancePollTime string `json:",omitempty"` PreventCrossDataCenterPrimaryFailover bool `json:",omitempty"` LockShardTimeoutSeconds int `json:",omitempty"` ReplicationLagQuery string `json:",omitempty"` @@ -64,7 +63,7 @@ func (config *VTOrcConfiguration) ToJSONString() string { } func (config *VTOrcConfiguration) AddDefaults(webPort int) { - config.InstancePollSeconds = 1 + config.InstancePollTime = "10h" } // Setup starts orc process with required arguements @@ -105,10 +104,10 @@ func (orc *VTOrcProcess) Setup() (err error) { "--topo_implementation", orc.TopoImplementation, "--topo_global_server_address", orc.TopoGlobalAddress, "--topo_global_root", orc.TopoGlobalRoot, - "--config", orc.ConfigPath, + "--config-file", orc.ConfigPath, "--port", fmt.Sprintf("%d", orc.Port), - // This parameter is overriden from the config file, added here to just verify that we indeed use the config file paramter over the flag - "--instance-poll-time", "10h", + // This parameter is overriden from the config file. This verifies that we indeed use the flag value over the config file. + "--instance-poll-time", "1s", // Faster topo information refresh speeds up the tests. This doesn't add any significant load either. "--topo-information-refresh-duration", "3s", "--bind-address", "127.0.0.1", diff --git a/go/test/endtoend/vtorc/readtopologyinstance/main_test.go b/go/test/endtoend/vtorc/readtopologyinstance/main_test.go index fa8dc116782..823655ed785 100644 --- a/go/test/endtoend/vtorc/readtopologyinstance/main_test.go +++ b/go/test/endtoend/vtorc/readtopologyinstance/main_test.go @@ -55,8 +55,7 @@ func TestReadTopologyInstanceBufferable(t *testing.T) { "--topo_global_root", clusterInfo.ClusterInstance.VtctlProcess.TopoGlobalRoot, } servenv.ParseFlags("vtorc") - config.Config.RecoveryPeriodBlockSeconds = 1 - config.Config.InstancePollSeconds = 1 + config.SetInstancePollTime(1 * time.Second) config.MarkConfigurationLoaded() server.StartVTOrcDiscovery() diff --git a/go/viperutil/debug/debug.go b/go/viperutil/debug/debug.go index 66cbc7f2962..f2c6d66b479 100644 --- a/go/viperutil/debug/debug.go +++ b/go/viperutil/debug/debug.go @@ -25,3 +25,7 @@ import ( func Debug() { registry.Combined().Debug() } + +func WriteConfigAs(filename string) error { + return registry.Combined().WriteConfigAs(filename) +} diff --git a/go/vt/vtorc/config/config.go b/go/vt/vtorc/config/config.go index 3341d08a163..e9bd27b2d62 100644 --- a/go/vt/vtorc/config/config.go +++ b/go/vt/vtorc/config/config.go @@ -24,7 +24,9 @@ import ( "github.com/spf13/pflag" + "vitess.io/vitess/go/viperutil" "vitess.io/vitess/go/vt/log" + "vitess.io/vitess/go/vt/servenv" ) var configurationLoaded = make(chan bool) @@ -41,9 +43,19 @@ const ( UnseenInstanceForgetHours = 240 // Number of hours after which an unseen instance is forgotten ) +var ( + instancePollTime = viperutil.Configure( + "InstancePollTime", + viperutil.Options[time.Duration]{ + FlagName: "instance-poll-time", + Default: 5 * time.Second, + Dynamic: true, + }, + ) +) + var ( sqliteDataFile = "file::memory:?mode=memory&cache=shared" - instancePollTime = 5 * time.Second snapshotTopologyInterval = 0 * time.Hour reasonableReplicationLag = 10 * time.Second auditFileLocation = "" @@ -59,10 +71,14 @@ var ( convertTabletsWithErrantGTIDs = false ) -// RegisterFlags registers the flags required by VTOrc -func RegisterFlags(fs *pflag.FlagSet) { +func init() { + servenv.OnParseFor("vtorc", registerFlags) +} + +// registerFlags registers the flags required by VTOrc +func registerFlags(fs *pflag.FlagSet) { fs.StringVar(&sqliteDataFile, "sqlite-data-file", sqliteDataFile, "SQLite Datafile to use as VTOrc's database") - fs.DurationVar(&instancePollTime, "instance-poll-time", instancePollTime, "Timer duration on which VTOrc refreshes MySQL information") + fs.Duration("instance-poll-time", instancePollTime.Default(), "Timer duration on which VTOrc refreshes MySQL information") fs.DurationVar(&snapshotTopologyInterval, "snapshot-topology-interval", snapshotTopologyInterval, "Timer duration on which VTOrc takes a snapshot of the current MySQL information it has in the database. Should be in multiple of hours") fs.DurationVar(&reasonableReplicationLag, "reasonable-replication-lag", reasonableReplicationLag, "Maximum replication lag on replicas which is deemed to be acceptable") fs.StringVar(&auditFileLocation, "audit-file-location", auditFileLocation, "File location where the audit logs are to be stored") @@ -76,6 +92,10 @@ func RegisterFlags(fs *pflag.FlagSet) { fs.DurationVar(&recoveryPollDuration, "recovery-poll-duration", recoveryPollDuration, "Timer duration on which VTOrc polls its database to run a recovery") fs.BoolVar(&ersEnabled, "allow-emergency-reparent", ersEnabled, "Whether VTOrc should be allowed to run emergency reparent operation when it detects a dead primary") fs.BoolVar(&convertTabletsWithErrantGTIDs, "change-tablets-with-errant-gtid-to-drained", convertTabletsWithErrantGTIDs, "Whether VTOrc should be changing the type of tablets with errant GTIDs to DRAINED") + + viperutil.BindFlags(fs, + instancePollTime, + ) } // Configuration makes for vtorc configuration input, which can be provided by user via JSON formatted file. @@ -84,7 +104,6 @@ func RegisterFlags(fs *pflag.FlagSet) { // TODO(sougou): change this to yaml parsing, and possible merge with tabletenv. type Configuration struct { SQLite3DataFile string // full path to sqlite3 datafile - InstancePollSeconds uint // Number of seconds between instance reads SnapshotTopologiesIntervalHours uint // Interval in hour between snapshot-topologies invocation. Default: 0 (disabled) ReasonableReplicationLagSeconds int // Above this value is considered a problem AuditLogFile string // Name of log file for audit operations. Disabled when empty. @@ -106,13 +125,26 @@ func (config *Configuration) ToJSONString() string { // Config is *the* configuration instance, used globally to get configuration data var Config = newConfiguration() -var readFileNames []string + +// GetInstancePollTime is a getter function. +func GetInstancePollTime() time.Duration { + return instancePollTime.Get() +} + +// SetInstancePollTime is a setter function. +func SetInstancePollTime(v time.Duration) { + instancePollTime.Set(v) +} + +// GetInstancePollSeconds gets the instance poll time but in seconds. +func GetInstancePollSeconds() uint { + return uint(instancePollTime.Get() / time.Second) +} // UpdateConfigValuesFromFlags is used to update the config values from the flags defined. // This is done before we read any configuration files from the user. So the config files take precedence. func UpdateConfigValuesFromFlags() { Config.SQLite3DataFile = sqliteDataFile - Config.InstancePollSeconds = uint(instancePollTime / time.Second) Config.SnapshotTopologiesIntervalHours = uint(snapshotTopologyInterval / time.Hour) Config.ReasonableReplicationLagSeconds = int(reasonableReplicationLag / time.Second) Config.AuditLogFile = auditFileLocation @@ -155,7 +187,6 @@ func LogConfigValues() { func newConfiguration() *Configuration { return &Configuration{ SQLite3DataFile: "file::memory:?mode=memory&cache=shared", - InstancePollSeconds: 5, SnapshotTopologiesIntervalHours: 0, ReasonableReplicationLagSeconds: 10, AuditLogFile: "", @@ -200,31 +231,8 @@ func read(fileName string) (*Configuration, error) { return Config, err } -// Read reads configuration from zero, either, some or all given files, in order of input. -// A file can override configuration provided in previous file. -func Read(fileNames ...string) *Configuration { - for _, fileName := range fileNames { - _, _ = read(fileName) - } - readFileNames = fileNames - return Config -} - -// ForceRead reads configuration from given file name or bails out if it fails -func ForceRead(fileName string) *Configuration { - _, err := read(fileName) - if err != nil { - log.Fatal("Cannot read config file:", fileName, err) - } - readFileNames = []string{fileName} - return Config -} - // Reload re-reads configuration from last used files func Reload(extraFileNames ...string) *Configuration { - for _, fileName := range readFileNames { - _, _ = read(fileName) - } for _, fileName := range extraFileNames { _, _ = read(fileName) } diff --git a/go/vt/vtorc/config/config_test.go b/go/vt/vtorc/config/config_test.go index 5a99ba07bf5..a564c81e58f 100644 --- a/go/vt/vtorc/config/config_test.go +++ b/go/vt/vtorc/config/config_test.go @@ -67,21 +67,6 @@ func TestUpdateConfigValuesFromFlags(t *testing.T) { require.Equal(t, testConfig, Config) }) - t.Run("override instancePollTime", func(t *testing.T) { - oldInstancePollTime := instancePollTime - instancePollTime = 7 * time.Second - // Restore the changes we make - defer func() { - Config = newConfiguration() - instancePollTime = oldInstancePollTime - }() - - testConfig := newConfiguration() - testConfig.InstancePollSeconds = 7 - UpdateConfigValuesFromFlags() - require.Equal(t, testConfig, Config) - }) - t.Run("override snapshotTopologyInterval", func(t *testing.T) { oldSnapshotTopologyInterval := snapshotTopologyInterval snapshotTopologyInterval = 1 * time.Hour diff --git a/go/vt/vtorc/discovery/queue.go b/go/vt/vtorc/discovery/queue.go index 95751c6ae25..4b18303959b 100644 --- a/go/vt/vtorc/discovery/queue.go +++ b/go/vt/vtorc/discovery/queue.go @@ -153,7 +153,7 @@ func (q *Queue) Consume() string { // alarm if have been waiting for too long timeOnQueue := time.Since(q.queuedKeys[key]) - if timeOnQueue > time.Duration(config.Config.InstancePollSeconds)*time.Second { + if timeOnQueue > config.GetInstancePollTime() { log.Warningf("key %v spent %.4fs waiting on a discoveryQueue", key, timeOnQueue.Seconds()) } diff --git a/go/vt/vtorc/inst/analysis.go b/go/vt/vtorc/inst/analysis.go index 66d6c6dd9ce..3e9e81c5c9f 100644 --- a/go/vt/vtorc/inst/analysis.go +++ b/go/vt/vtorc/inst/analysis.go @@ -144,5 +144,5 @@ func (replicationAnalysis *ReplicationAnalysis) MarshalJSON() ([]byte, error) { // ValidSecondsFromSeenToLastAttemptedCheck returns the maximum allowed elapsed time // between last_attempted_check to last_checked before we consider the instance as invalid. func ValidSecondsFromSeenToLastAttemptedCheck() uint { - return config.Config.InstancePollSeconds + 1 + return config.GetInstancePollSeconds() } diff --git a/go/vt/vtorc/inst/instance_dao.go b/go/vt/vtorc/inst/instance_dao.go index bd4438dd05f..7697cf713c8 100644 --- a/go/vt/vtorc/inst/instance_dao.go +++ b/go/vt/vtorc/inst/instance_dao.go @@ -80,7 +80,7 @@ func init() { func initializeInstanceDao() { config.WaitForConfigurationToBeLoaded() - forgetAliases = cache.New(time.Duration(config.Config.InstancePollSeconds*3)*time.Second, time.Second) + forgetAliases = cache.New(config.GetInstancePollTime()*3, time.Second) cacheInitializationCompleted.Store(true) } @@ -544,8 +544,8 @@ func readInstanceRow(m sqlutils.RowMap) *Instance { instance.ReplicationDepth = m.GetUint("replication_depth") instance.IsCoPrimary = m.GetBool("is_co_primary") instance.HasReplicationCredentials = m.GetBool("has_replication_credentials") - instance.IsUpToDate = (m.GetUint("seconds_since_last_checked") <= config.Config.InstancePollSeconds) - instance.IsRecentlyChecked = (m.GetUint("seconds_since_last_checked") <= config.Config.InstancePollSeconds*5) + instance.IsUpToDate = m.GetUint("seconds_since_last_checked") <= config.GetInstancePollSeconds() + instance.IsRecentlyChecked = m.GetUint("seconds_since_last_checked") <= config.GetInstancePollSeconds()*5 instance.LastSeenTimestamp = m.GetString("last_seen") instance.IsLastCheckValid = m.GetBool("is_last_check_valid") instance.SecondsSinceLastSeen = m.GetNullInt64("seconds_since_last_seen") @@ -646,7 +646,7 @@ func ReadProblemInstances(keyspace string, shard string) ([](*Instance), error) ) ` - args := sqlutils.Args(keyspace, keyspace, shard, shard, config.Config.InstancePollSeconds*5, config.Config.ReasonableReplicationLagSeconds, config.Config.ReasonableReplicationLagSeconds) + args := sqlutils.Args(keyspace, keyspace, shard, shard, config.GetInstancePollSeconds()*5, config.Config.ReasonableReplicationLagSeconds, config.Config.ReasonableReplicationLagSeconds) return readInstancesByCondition(condition, args, "") } @@ -716,7 +716,7 @@ func ReadOutdatedInstanceKeys() ([]string, error) { WHERE database_instance.alias IS NULL ` - args := sqlutils.Args(config.Config.InstancePollSeconds, 2*config.Config.InstancePollSeconds) + args := sqlutils.Args(config.GetInstancePollSeconds(), 2*config.GetInstancePollSeconds()) err := db.QueryVTOrc(query, args, func(m sqlutils.RowMap) error { tabletAlias := m.GetString("alias") diff --git a/go/vt/vtorc/inst/instance_dao_test.go b/go/vt/vtorc/inst/instance_dao_test.go index 2416c1abb90..e71731f9f51 100644 --- a/go/vt/vtorc/inst/instance_dao_test.go +++ b/go/vt/vtorc/inst/instance_dao_test.go @@ -241,11 +241,11 @@ func TestReadProblemInstances(t *testing.T) { // We need to set InstancePollSeconds to a large value otherwise all the instances are reported as having problems since their last_checked is very old. // Setting this value to a hundred years, we ensure that this test doesn't fail with this issue for the next hundred years. - oldVal := config.Config.InstancePollSeconds + oldVal := config.GetInstancePollTime() defer func() { - config.Config.InstancePollSeconds = oldVal + config.SetInstancePollTime(oldVal) }() - config.Config.InstancePollSeconds = 60 * 60 * 24 * 365 * 100 + config.SetInstancePollTime(60 * 60 * 24 * 365 * 100 * time.Second) for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { @@ -325,11 +325,11 @@ func TestReadInstancesWithErrantGTIds(t *testing.T) { // We need to set InstancePollSeconds to a large value otherwise all the instances are reported as having problems since their last_checked is very old. // Setting this value to a hundred years, we ensure that this test doesn't fail with this issue for the next hundred years. - oldVal := config.Config.InstancePollSeconds + oldVal := config.GetInstancePollTime() defer func() { - config.Config.InstancePollSeconds = oldVal + config.SetInstancePollTime(oldVal) }() - config.Config.InstancePollSeconds = 60 * 60 * 24 * 365 * 100 + config.SetInstancePollTime(60 * 60 * 24 * 365 * 100 * time.Second) for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { @@ -459,13 +459,13 @@ func TestReadOutdatedInstanceKeys(t *testing.T) { waitForCacheInitialization() // We are setting InstancePollSeconds to 59 minutes, just for the test. - oldVal := config.Config.InstancePollSeconds + oldVal := config.GetInstancePollTime() oldCache := forgetAliases defer func() { forgetAliases = oldCache - config.Config.InstancePollSeconds = oldVal + config.SetInstancePollTime(oldVal) }() - config.Config.InstancePollSeconds = 60 * 25 + config.SetInstancePollTime(60 * 25 * time.Second) forgetAliases = cache.New(time.Minute, time.Minute) for _, tt := range tests { diff --git a/go/vt/vtorc/logic/vtorc.go b/go/vt/vtorc/logic/vtorc.go index 9a468d1508a..0a9f668835c 100644 --- a/go/vt/vtorc/logic/vtorc.go +++ b/go/vt/vtorc/logic/vtorc.go @@ -73,11 +73,6 @@ func init() { }) } -// used in several places -func instancePollSecondsDuration() time.Duration { - return time.Duration(config.Config.InstancePollSeconds) * time.Second -} - // acceptSighupSignal registers for SIGHUP signal from the OS to reload the configuration files. func acceptSighupSignal() { c := make(chan os.Signal, 1) @@ -161,7 +156,7 @@ func DiscoverInstance(tabletAlias string, forceDiscovery bool) { defer func() { latency.Stop("total") discoveryTime := latency.Elapsed("total") - if discoveryTime > instancePollSecondsDuration() { + if discoveryTime > config.GetInstancePollTime() { instancePollSecondsExceededCounter.Add(1) log.Warningf("discoverInstance exceeded InstancePollSeconds for %+v, took %.4fs", tabletAlias, discoveryTime.Seconds()) if metric != nil { @@ -177,7 +172,7 @@ func DiscoverInstance(tabletAlias string, forceDiscovery bool) { // Calculate the expiry period each time as InstancePollSeconds // _may_ change during the run of the process (via SIGHUP) and // it is not possible to change the cache's default expiry.. - if existsInCacheError := recentDiscoveryOperationKeys.Add(tabletAlias, true, instancePollSecondsDuration()); existsInCacheError != nil && !forceDiscovery { + if existsInCacheError := recentDiscoveryOperationKeys.Add(tabletAlias, true, config.GetInstancePollTime()); existsInCacheError != nil && !forceDiscovery { // Just recently attempted return } @@ -271,7 +266,7 @@ func onHealthTick() { // nolint SA1015: using time.Tick leaks the underlying ticker func ContinuousDiscovery() { log.Infof("continuous discovery: setting up") - recentDiscoveryOperationKeys = cache.New(instancePollSecondsDuration(), time.Second) + recentDiscoveryOperationKeys = cache.New(config.GetInstancePollTime(), time.Second) go handleDiscoveryRequests() From 0a27ba831d77e86a7464dfe8e722ebdc2e9d2f84 Mon Sep 17 00:00:00 2001 From: Manan Gupta Date: Tue, 12 Nov 2024 15:03:32 +0530 Subject: [PATCH 03/18] feat: move prevent cross cell promotion as well to viper Signed-off-by: Manan Gupta --- go/test/endtoend/cluster/vtorc_process.go | 10 +-- go/test/endtoend/vtorc/api/api_test.go | 2 +- go/test/endtoend/vtorc/general/vtorc_test.go | 18 ++--- .../primaryfailure/primary_failure_test.go | 16 ++--- go/viperutil/debug/debug.go | 6 ++ go/vt/vtorc/config/config.go | 67 +++++++++++-------- go/vt/vtorc/config/config_test.go | 15 ----- go/vt/vtorc/logic/topology_recovery.go | 2 +- 8 files changed, 69 insertions(+), 67 deletions(-) diff --git a/go/test/endtoend/cluster/vtorc_process.go b/go/test/endtoend/cluster/vtorc_process.go index 8ed358f12a3..6ac39024dca 100644 --- a/go/test/endtoend/cluster/vtorc_process.go +++ b/go/test/endtoend/cluster/vtorc_process.go @@ -49,11 +49,11 @@ type VTOrcProcess struct { } type VTOrcConfiguration struct { - InstancePollTime string `json:",omitempty"` - PreventCrossDataCenterPrimaryFailover bool `json:",omitempty"` - LockShardTimeoutSeconds int `json:",omitempty"` - ReplicationLagQuery string `json:",omitempty"` - FailPrimaryPromotionOnLagMinutes int `json:",omitempty"` + InstancePollTime string `json:",omitempty"` + PreventCrossCellFailover bool `json:",omitempty"` + LockShardTimeoutSeconds int `json:",omitempty"` + ReplicationLagQuery string `json:",omitempty"` + FailPrimaryPromotionOnLagMinutes int `json:",omitempty"` } // ToJSONString will marshal this configuration as JSON diff --git a/go/test/endtoend/vtorc/api/api_test.go b/go/test/endtoend/vtorc/api/api_test.go index c9fa8840814..638ea5fa72e 100644 --- a/go/test/endtoend/vtorc/api/api_test.go +++ b/go/test/endtoend/vtorc/api/api_test.go @@ -35,7 +35,7 @@ import ( func TestAPIEndpoints(t *testing.T) { defer cluster.PanicHandler(t) utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 2, 1, nil, cluster.VTOrcConfiguration{ - PreventCrossDataCenterPrimaryFailover: true, + PreventCrossCellFailover: true, }, 1, "") keyspace := &clusterInfo.ClusterInstance.Keyspaces[0] shard0 := &keyspace.Shards[0] diff --git a/go/test/endtoend/vtorc/general/vtorc_test.go b/go/test/endtoend/vtorc/general/vtorc_test.go index 854623f45c3..f83ede48f44 100644 --- a/go/test/endtoend/vtorc/general/vtorc_test.go +++ b/go/test/endtoend/vtorc/general/vtorc_test.go @@ -42,7 +42,7 @@ func TestPrimaryElection(t *testing.T) { defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance) defer cluster.PanicHandler(t) utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 2, 1, nil, cluster.VTOrcConfiguration{ - PreventCrossDataCenterPrimaryFailover: true, + PreventCrossCellFailover: true, }, 2, "") keyspace := &clusterInfo.ClusterInstance.Keyspaces[0] shard0 := &keyspace.Shards[0] @@ -70,7 +70,7 @@ func TestSingleKeyspace(t *testing.T) { defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance) defer cluster.PanicHandler(t) utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 1, 1, []string{"--clusters_to_watch", "ks"}, cluster.VTOrcConfiguration{ - PreventCrossDataCenterPrimaryFailover: true, + PreventCrossCellFailover: true, }, 1, "") keyspace := &clusterInfo.ClusterInstance.Keyspaces[0] shard0 := &keyspace.Shards[0] @@ -89,7 +89,7 @@ func TestKeyspaceShard(t *testing.T) { defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance) defer cluster.PanicHandler(t) utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 1, 1, []string{"--clusters_to_watch", "ks/0"}, cluster.VTOrcConfiguration{ - PreventCrossDataCenterPrimaryFailover: true, + PreventCrossCellFailover: true, }, 1, "") keyspace := &clusterInfo.ClusterInstance.Keyspaces[0] shard0 := &keyspace.Shards[0] @@ -111,7 +111,7 @@ func TestVTOrcRepairs(t *testing.T) { defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance) defer cluster.PanicHandler(t) utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 3, 0, []string{"--change-tablets-with-errant-gtid-to-drained"}, cluster.VTOrcConfiguration{ - PreventCrossDataCenterPrimaryFailover: true, + PreventCrossCellFailover: true, }, 1, "") keyspace := &clusterInfo.ClusterInstance.Keyspaces[0] shard0 := &keyspace.Shards[0] @@ -290,7 +290,7 @@ func TestRepairAfterTER(t *testing.T) { defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance) defer cluster.PanicHandler(t) utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 2, 0, nil, cluster.VTOrcConfiguration{ - PreventCrossDataCenterPrimaryFailover: true, + PreventCrossCellFailover: true, }, 1, "") keyspace := &clusterInfo.ClusterInstance.Keyspaces[0] shard0 := &keyspace.Shards[0] @@ -326,7 +326,7 @@ func TestSemiSync(t *testing.T) { newCluster := utils.SetupNewClusterSemiSync(t) defer utils.PrintVTOrcLogsOnFailure(t, newCluster.ClusterInstance) utils.StartVTOrcs(t, newCluster, nil, cluster.VTOrcConfiguration{ - PreventCrossDataCenterPrimaryFailover: true, + PreventCrossCellFailover: true, }, 1) defer func() { utils.StopVTOrcs(t, newCluster) @@ -424,7 +424,7 @@ func TestVTOrcWithPrs(t *testing.T) { defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance) defer cluster.PanicHandler(t) utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 4, 0, nil, cluster.VTOrcConfiguration{ - PreventCrossDataCenterPrimaryFailover: true, + PreventCrossCellFailover: true, }, 1, "") keyspace := &clusterInfo.ClusterInstance.Keyspaces[0] shard0 := &keyspace.Shards[0] @@ -558,7 +558,7 @@ func TestDurabilityPolicySetLater(t *testing.T) { // Now start the vtorc instances utils.StartVTOrcs(t, newCluster, nil, cluster.VTOrcConfiguration{ - PreventCrossDataCenterPrimaryFailover: true, + PreventCrossCellFailover: true, }, 1) defer func() { utils.StopVTOrcs(t, newCluster) @@ -585,7 +585,7 @@ func TestFullStatusConnectionPooling(t *testing.T) { utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 4, 0, []string{ "--tablet_manager_grpc_concurrency=1", }, cluster.VTOrcConfiguration{ - PreventCrossDataCenterPrimaryFailover: true, + PreventCrossCellFailover: true, }, 1, "") keyspace := &clusterInfo.ClusterInstance.Keyspaces[0] shard0 := &keyspace.Shards[0] diff --git a/go/test/endtoend/vtorc/primaryfailure/primary_failure_test.go b/go/test/endtoend/vtorc/primaryfailure/primary_failure_test.go index 886aa3a580a..a46e3789730 100644 --- a/go/test/endtoend/vtorc/primaryfailure/primary_failure_test.go +++ b/go/test/endtoend/vtorc/primaryfailure/primary_failure_test.go @@ -44,7 +44,7 @@ func TestDownPrimary(t *testing.T) { // If that replica is more advanced than the same-cell-replica, then we try to promote the cross-cell replica as an intermediate source. // If we don't specify a small value of --wait-replicas-timeout, then we would end up waiting for 30 seconds for the dead-primary to respond, failing this test. utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 2, 1, []string{"--remote_operation_timeout=10s", "--wait-replicas-timeout=5s"}, cluster.VTOrcConfiguration{ - PreventCrossDataCenterPrimaryFailover: true, + PreventCrossCellFailover: true, }, 1, "semi_sync") keyspace := &clusterInfo.ClusterInstance.Keyspaces[0] shard0 := &keyspace.Shards[0] @@ -150,7 +150,7 @@ func TestDownPrimaryBeforeVTOrc(t *testing.T) { // Start a VTOrc instance utils.StartVTOrcs(t, clusterInfo, []string{"--remote_operation_timeout=10s"}, cluster.VTOrcConfiguration{ - PreventCrossDataCenterPrimaryFailover: true, + PreventCrossCellFailover: true, }, 1) vtOrcProcess := clusterInfo.ClusterInstance.VTOrcProcesses[0] @@ -244,7 +244,7 @@ func TestDeadPrimaryRecoversImmediately(t *testing.T) { // If that replica is more advanced than the same-cell-replica, then we try to promote the cross-cell replica as an intermediate source. // If we don't specify a small value of --wait-replicas-timeout, then we would end up waiting for 30 seconds for the dead-primary to respond, failing this test. utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 2, 1, []string{"--remote_operation_timeout=10s", "--wait-replicas-timeout=5s"}, cluster.VTOrcConfiguration{ - PreventCrossDataCenterPrimaryFailover: true, + PreventCrossCellFailover: true, }, 1, "semi_sync") keyspace := &clusterInfo.ClusterInstance.Keyspaces[0] shard0 := &keyspace.Shards[0] @@ -324,7 +324,7 @@ func TestCrossDataCenterFailure(t *testing.T) { defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance) defer cluster.PanicHandler(t) utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 2, 1, nil, cluster.VTOrcConfiguration{ - PreventCrossDataCenterPrimaryFailover: true, + PreventCrossCellFailover: true, }, 1, "") keyspace := &clusterInfo.ClusterInstance.Keyspaces[0] shard0 := &keyspace.Shards[0] @@ -370,7 +370,7 @@ func TestCrossDataCenterFailureError(t *testing.T) { defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance) defer cluster.PanicHandler(t) utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 1, 1, nil, cluster.VTOrcConfiguration{ - PreventCrossDataCenterPrimaryFailover: true, + PreventCrossCellFailover: true, }, 1, "") keyspace := &clusterInfo.ClusterInstance.Keyspaces[0] shard0 := &keyspace.Shards[0] @@ -417,7 +417,7 @@ func TestLostRdonlyOnPrimaryFailure(t *testing.T) { defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance) defer cluster.PanicHandler(t) utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 2, 2, nil, cluster.VTOrcConfiguration{ - PreventCrossDataCenterPrimaryFailover: true, + PreventCrossCellFailover: true, }, 1, "") keyspace := &clusterInfo.ClusterInstance.Keyspaces[0] shard0 := &keyspace.Shards[0] @@ -728,8 +728,8 @@ func TestDownPrimaryPromotionRuleWithLagCrossCenter(t *testing.T) { defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance) defer cluster.PanicHandler(t) utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 2, 1, nil, cluster.VTOrcConfiguration{ - LockShardTimeoutSeconds: 5, - PreventCrossDataCenterPrimaryFailover: true, + LockShardTimeoutSeconds: 5, + PreventCrossCellFailover: true, }, 1, "test") keyspace := &clusterInfo.ClusterInstance.Keyspaces[0] shard0 := &keyspace.Shards[0] diff --git a/go/viperutil/debug/debug.go b/go/viperutil/debug/debug.go index f2c6d66b479..662634a5675 100644 --- a/go/viperutil/debug/debug.go +++ b/go/viperutil/debug/debug.go @@ -26,6 +26,12 @@ func Debug() { registry.Combined().Debug() } +// WriteConfigAs writes the config into the given filename. func WriteConfigAs(filename string) error { return registry.Combined().WriteConfigAs(filename) } + +// AllSettings gets all the settings in the configuration. +func AllSettings() map[string]any { + return registry.Combined().AllSettings() +} diff --git a/go/vt/vtorc/config/config.go b/go/vt/vtorc/config/config.go index e9bd27b2d62..3b9508cd571 100644 --- a/go/vt/vtorc/config/config.go +++ b/go/vt/vtorc/config/config.go @@ -25,6 +25,7 @@ import ( "github.com/spf13/pflag" "vitess.io/vitess/go/viperutil" + "vitess.io/vitess/go/viperutil/debug" "vitess.io/vitess/go/vt/log" "vitess.io/vitess/go/vt/servenv" ) @@ -52,6 +53,15 @@ var ( Dynamic: true, }, ) + + preventCrossCellFailover = viperutil.Configure( + "PreventCrossCellFailover", + viperutil.Options[bool]{ + FlagName: "prevent-cross-cell-failover", + Default: false, + Dynamic: true, + }, + ) ) var ( @@ -62,7 +72,6 @@ var ( auditToBackend = false auditToSyslog = false auditPurgeDuration = 7 * 24 * time.Hour // Equivalent of 7 days - preventCrossCellFailover = false waitReplicasTimeout = 30 * time.Second tolerableReplicationLag = 0 * time.Second topoInformationRefreshDuration = 15 * time.Second @@ -85,7 +94,7 @@ func registerFlags(fs *pflag.FlagSet) { fs.BoolVar(&auditToBackend, "audit-to-backend", auditToBackend, "Whether to store the audit log in the VTOrc database") fs.BoolVar(&auditToSyslog, "audit-to-syslog", auditToSyslog, "Whether to store the audit log in the syslog") fs.DurationVar(&auditPurgeDuration, "audit-purge-duration", auditPurgeDuration, "Duration for which audit logs are held before being purged. Should be in multiples of days") - fs.BoolVar(&preventCrossCellFailover, "prevent-cross-cell-failover", preventCrossCellFailover, "Prevent VTOrc from promoting a primary in a different cell than the current primary in case of a failover") + fs.Bool("prevent-cross-cell-failover", preventCrossCellFailover.Default(), "Prevent VTOrc from promoting a primary in a different cell than the current primary in case of a failover") fs.DurationVar(&waitReplicasTimeout, "wait-replicas-timeout", waitReplicasTimeout, "Duration for which to wait for replica's to respond when issuing RPCs") fs.DurationVar(&tolerableReplicationLag, "tolerable-replication-lag", tolerableReplicationLag, "Amount of replication lag that is considered acceptable for a tablet to be eligible for promotion when Vitess makes the choice of a new primary in PRS") fs.DurationVar(&topoInformationRefreshDuration, "topo-information-refresh-duration", topoInformationRefreshDuration, "Timer duration on which VTOrc refreshes the keyspace and vttablet records from the topology server") @@ -95,6 +104,7 @@ func registerFlags(fs *pflag.FlagSet) { viperutil.BindFlags(fs, instancePollTime, + preventCrossCellFailover, ) } @@ -103,18 +113,17 @@ func registerFlags(fs *pflag.FlagSet) { // strictly expected from user. // TODO(sougou): change this to yaml parsing, and possible merge with tabletenv. type Configuration struct { - SQLite3DataFile string // full path to sqlite3 datafile - SnapshotTopologiesIntervalHours uint // Interval in hour between snapshot-topologies invocation. Default: 0 (disabled) - ReasonableReplicationLagSeconds int // Above this value is considered a problem - AuditLogFile string // Name of log file for audit operations. Disabled when empty. - AuditToSyslog bool // If true, audit messages are written to syslog - AuditToBackendDB bool // If true, audit messages are written to the backend DB's `audit` table (default: true) - AuditPurgeDays uint // Days after which audit entries are purged from the database - PreventCrossDataCenterPrimaryFailover bool // When true (default: false), cross-DC primary failover are not allowed, vtorc will do all it can to only fail over within same DC, or else not fail over at all. - WaitReplicasTimeoutSeconds int // Timeout on amount of time to wait for the replicas in case of ERS. Should be a small value because we should fail-fast. Should not be larger than LockTimeout since that is the total time we use for an ERS. - TolerableReplicationLagSeconds int // Amount of replication lag that is considered acceptable for a tablet to be eligible for promotion when Vitess makes the choice of a new primary in PRS. - TopoInformationRefreshSeconds int // Timer duration on which VTOrc refreshes the keyspace and vttablet records from the topo-server. - RecoveryPollSeconds int // Timer duration on which VTOrc recovery analysis runs + SQLite3DataFile string // full path to sqlite3 datafile + SnapshotTopologiesIntervalHours uint // Interval in hour between snapshot-topologies invocation. Default: 0 (disabled) + ReasonableReplicationLagSeconds int // Above this value is considered a problem + AuditLogFile string // Name of log file for audit operations. Disabled when empty. + AuditToSyslog bool // If true, audit messages are written to syslog + AuditToBackendDB bool // If true, audit messages are written to the backend DB's `audit` table (default: true) + AuditPurgeDays uint // Days after which audit entries are purged from the database + WaitReplicasTimeoutSeconds int // Timeout on amount of time to wait for the replicas in case of ERS. Should be a small value because we should fail-fast. Should not be larger than LockTimeout since that is the total time we use for an ERS. + TolerableReplicationLagSeconds int // Amount of replication lag that is considered acceptable for a tablet to be eligible for promotion when Vitess makes the choice of a new primary in PRS. + TopoInformationRefreshSeconds int // Timer duration on which VTOrc refreshes the keyspace and vttablet records from the topo-server. + RecoveryPollSeconds int // Timer duration on which VTOrc recovery analysis runs } // ToJSONString will marshal this configuration as JSON @@ -141,6 +150,11 @@ func GetInstancePollSeconds() uint { return uint(instancePollTime.Get() / time.Second) } +// GetPreventCrossCellFailover is a getter function. +func GetPreventCrossCellFailover() bool { + return preventCrossCellFailover.Get() +} + // UpdateConfigValuesFromFlags is used to update the config values from the flags defined. // This is done before we read any configuration files from the user. So the config files take precedence. func UpdateConfigValuesFromFlags() { @@ -151,7 +165,6 @@ func UpdateConfigValuesFromFlags() { Config.AuditToBackendDB = auditToBackend Config.AuditToSyslog = auditToSyslog Config.AuditPurgeDays = uint(auditPurgeDuration / (time.Hour * 24)) - Config.PreventCrossDataCenterPrimaryFailover = preventCrossCellFailover Config.WaitReplicasTimeoutSeconds = int(waitReplicasTimeout / time.Second) Config.TolerableReplicationLagSeconds = int(tolerableReplicationLag / time.Second) Config.TopoInformationRefreshSeconds = int(topoInformationRefreshDuration / time.Second) @@ -180,23 +193,21 @@ func SetConvertTabletWithErrantGTIDs(val bool) { // LogConfigValues is used to log the config values. func LogConfigValues() { - b, _ := json.MarshalIndent(Config, "", "\t") - log.Infof("Running with Configuration - %v", string(b)) + log.Infof("Running with Configuration - %v", debug.AllSettings()) } func newConfiguration() *Configuration { return &Configuration{ - SQLite3DataFile: "file::memory:?mode=memory&cache=shared", - SnapshotTopologiesIntervalHours: 0, - ReasonableReplicationLagSeconds: 10, - AuditLogFile: "", - AuditToSyslog: false, - AuditToBackendDB: false, - AuditPurgeDays: 7, - PreventCrossDataCenterPrimaryFailover: false, - WaitReplicasTimeoutSeconds: 30, - TopoInformationRefreshSeconds: 15, - RecoveryPollSeconds: 1, + SQLite3DataFile: "file::memory:?mode=memory&cache=shared", + SnapshotTopologiesIntervalHours: 0, + ReasonableReplicationLagSeconds: 10, + AuditLogFile: "", + AuditToSyslog: false, + AuditToBackendDB: false, + AuditPurgeDays: 7, + WaitReplicasTimeoutSeconds: 30, + TopoInformationRefreshSeconds: 15, + RecoveryPollSeconds: 1, } } diff --git a/go/vt/vtorc/config/config_test.go b/go/vt/vtorc/config/config_test.go index a564c81e58f..82da551f0c2 100644 --- a/go/vt/vtorc/config/config_test.go +++ b/go/vt/vtorc/config/config_test.go @@ -142,21 +142,6 @@ func TestUpdateConfigValuesFromFlags(t *testing.T) { require.Equal(t, testConfig, Config) }) - t.Run("override preventCrossCellFailover", func(t *testing.T) { - oldPreventCrossCellFailover := preventCrossCellFailover - preventCrossCellFailover = true - // Restore the changes we make - defer func() { - Config = newConfiguration() - preventCrossCellFailover = oldPreventCrossCellFailover - }() - - testConfig := newConfiguration() - testConfig.PreventCrossDataCenterPrimaryFailover = true - UpdateConfigValuesFromFlags() - require.Equal(t, testConfig, Config) - }) - t.Run("override waitReplicasTimeout", func(t *testing.T) { oldWaitReplicasTimeout := waitReplicasTimeout waitReplicasTimeout = 3*time.Minute + 4*time.Second diff --git a/go/vt/vtorc/logic/topology_recovery.go b/go/vt/vtorc/logic/topology_recovery.go index aec137a45b4..7809f25e93b 100644 --- a/go/vt/vtorc/logic/topology_recovery.go +++ b/go/vt/vtorc/logic/topology_recovery.go @@ -236,7 +236,7 @@ func runEmergencyReparentOp(ctx context.Context, analysisEntry *inst.Replication reparentutil.EmergencyReparentOptions{ IgnoreReplicas: nil, WaitReplicasTimeout: time.Duration(config.Config.WaitReplicasTimeoutSeconds) * time.Second, - PreventCrossCellPromotion: config.Config.PreventCrossDataCenterPrimaryFailover, + PreventCrossCellPromotion: config.GetPreventCrossCellFailover(), WaitAllTablets: waitForAllTablets, }, ) From c6c89fb8eb5ebcc993a1ef71bc461ba492376a0c Mon Sep 17 00:00:00 2001 From: Manan Gupta Date: Tue, 12 Nov 2024 15:22:20 +0530 Subject: [PATCH 04/18] feat: add sqldatafile, snapshot time and reasonable replication log to viper Signed-off-by: Manan Gupta --- go/vt/vtorc/config/config.go | 104 ++++++++++++++++++------------ go/vt/vtorc/config/config_test.go | 45 ------------- go/vt/vtorc/db/db.go | 6 +- go/vt/vtorc/inst/analysis_dao.go | 2 +- go/vt/vtorc/inst/instance_dao.go | 6 +- go/vt/vtorc/logic/vtorc.go | 4 +- 6 files changed, 72 insertions(+), 95 deletions(-) diff --git a/go/vt/vtorc/config/config.go b/go/vt/vtorc/config/config.go index 3b9508cd571..447ee529bd5 100644 --- a/go/vt/vtorc/config/config.go +++ b/go/vt/vtorc/config/config.go @@ -62,12 +62,36 @@ var ( Dynamic: true, }, ) + + sqliteDataFile = viperutil.Configure( + "SQLiteDataFile", + viperutil.Options[string]{ + FlagName: "sqlite-data-file", + Default: "file::memory:?mode=memory&cache=shared", + Dynamic: false, + }, + ) + + snapshotTopologyInterval = viperutil.Configure( + "snapshotTopologyInterval", + viperutil.Options[time.Duration]{ + FlagName: "snapshot-topology-interval", + Default: 0 * time.Hour, + Dynamic: true, + }, + ) + + reasonableReplicationLag = viperutil.Configure( + "reasonableReplicationLag", + viperutil.Options[time.Duration]{ + FlagName: "reasonable-replication-lag", + Default: 10 * time.Second, + Dynamic: true, + }, + ) ) var ( - sqliteDataFile = "file::memory:?mode=memory&cache=shared" - snapshotTopologyInterval = 0 * time.Hour - reasonableReplicationLag = 10 * time.Second auditFileLocation = "" auditToBackend = false auditToSyslog = false @@ -86,10 +110,10 @@ func init() { // registerFlags registers the flags required by VTOrc func registerFlags(fs *pflag.FlagSet) { - fs.StringVar(&sqliteDataFile, "sqlite-data-file", sqliteDataFile, "SQLite Datafile to use as VTOrc's database") + fs.String("sqlite-data-file", sqliteDataFile.Default(), "SQLite Datafile to use as VTOrc's database") fs.Duration("instance-poll-time", instancePollTime.Default(), "Timer duration on which VTOrc refreshes MySQL information") - fs.DurationVar(&snapshotTopologyInterval, "snapshot-topology-interval", snapshotTopologyInterval, "Timer duration on which VTOrc takes a snapshot of the current MySQL information it has in the database. Should be in multiple of hours") - fs.DurationVar(&reasonableReplicationLag, "reasonable-replication-lag", reasonableReplicationLag, "Maximum replication lag on replicas which is deemed to be acceptable") + fs.Duration("snapshot-topology-interval", snapshotTopologyInterval.Default(), "Timer duration on which VTOrc takes a snapshot of the current MySQL information it has in the database. Should be in multiple of hours") + fs.Duration("reasonable-replication-lag", reasonableReplicationLag.Default(), "Maximum replication lag on replicas which is deemed to be acceptable") fs.StringVar(&auditFileLocation, "audit-file-location", auditFileLocation, "File location where the audit logs are to be stored") fs.BoolVar(&auditToBackend, "audit-to-backend", auditToBackend, "Whether to store the audit log in the VTOrc database") fs.BoolVar(&auditToSyslog, "audit-to-syslog", auditToSyslog, "Whether to store the audit log in the syslog") @@ -105,6 +129,9 @@ func registerFlags(fs *pflag.FlagSet) { viperutil.BindFlags(fs, instancePollTime, preventCrossCellFailover, + sqliteDataFile, + snapshotTopologyInterval, + reasonableReplicationLag, ) } @@ -113,17 +140,14 @@ func registerFlags(fs *pflag.FlagSet) { // strictly expected from user. // TODO(sougou): change this to yaml parsing, and possible merge with tabletenv. type Configuration struct { - SQLite3DataFile string // full path to sqlite3 datafile - SnapshotTopologiesIntervalHours uint // Interval in hour between snapshot-topologies invocation. Default: 0 (disabled) - ReasonableReplicationLagSeconds int // Above this value is considered a problem - AuditLogFile string // Name of log file for audit operations. Disabled when empty. - AuditToSyslog bool // If true, audit messages are written to syslog - AuditToBackendDB bool // If true, audit messages are written to the backend DB's `audit` table (default: true) - AuditPurgeDays uint // Days after which audit entries are purged from the database - WaitReplicasTimeoutSeconds int // Timeout on amount of time to wait for the replicas in case of ERS. Should be a small value because we should fail-fast. Should not be larger than LockTimeout since that is the total time we use for an ERS. - TolerableReplicationLagSeconds int // Amount of replication lag that is considered acceptable for a tablet to be eligible for promotion when Vitess makes the choice of a new primary in PRS. - TopoInformationRefreshSeconds int // Timer duration on which VTOrc refreshes the keyspace and vttablet records from the topo-server. - RecoveryPollSeconds int // Timer duration on which VTOrc recovery analysis runs + AuditLogFile string // Name of log file for audit operations. Disabled when empty. + AuditToSyslog bool // If true, audit messages are written to syslog + AuditToBackendDB bool // If true, audit messages are written to the backend DB's `audit` table (default: true) + AuditPurgeDays uint // Days after which audit entries are purged from the database + WaitReplicasTimeoutSeconds int // Timeout on amount of time to wait for the replicas in case of ERS. Should be a small value because we should fail-fast. Should not be larger than LockTimeout since that is the total time we use for an ERS. + TolerableReplicationLagSeconds int // Amount of replication lag that is considered acceptable for a tablet to be eligible for promotion when Vitess makes the choice of a new primary in PRS. + TopoInformationRefreshSeconds int // Timer duration on which VTOrc refreshes the keyspace and vttablet records from the topo-server. + RecoveryPollSeconds int // Timer duration on which VTOrc recovery analysis runs } // ToJSONString will marshal this configuration as JSON @@ -155,12 +179,24 @@ func GetPreventCrossCellFailover() bool { return preventCrossCellFailover.Get() } +// GetSQLiteDataFile is a getter function. +func GetSQLiteDataFile() string { + return sqliteDataFile.Get() +} + +// GetReasonableReplicationLagSeconds gets the reasonable replication lag but in seconds. +func GetReasonableReplicationLagSeconds() int64 { + return int64(reasonableReplicationLag.Get() / time.Second) +} + +// GetSnapshotTopologyInterval is a getter function. +func GetSnapshotTopologyInterval() time.Duration { + return snapshotTopologyInterval.Get() +} + // UpdateConfigValuesFromFlags is used to update the config values from the flags defined. // This is done before we read any configuration files from the user. So the config files take precedence. func UpdateConfigValuesFromFlags() { - Config.SQLite3DataFile = sqliteDataFile - Config.SnapshotTopologiesIntervalHours = uint(snapshotTopologyInterval / time.Hour) - Config.ReasonableReplicationLagSeconds = int(reasonableReplicationLag / time.Second) Config.AuditLogFile = auditFileLocation Config.AuditToBackendDB = auditToBackend Config.AuditToSyslog = auditToSyslog @@ -198,25 +234,14 @@ func LogConfigValues() { func newConfiguration() *Configuration { return &Configuration{ - SQLite3DataFile: "file::memory:?mode=memory&cache=shared", - SnapshotTopologiesIntervalHours: 0, - ReasonableReplicationLagSeconds: 10, - AuditLogFile: "", - AuditToSyslog: false, - AuditToBackendDB: false, - AuditPurgeDays: 7, - WaitReplicasTimeoutSeconds: 30, - TopoInformationRefreshSeconds: 15, - RecoveryPollSeconds: 1, - } -} - -func (config *Configuration) postReadAdjustments() error { - if config.SQLite3DataFile == "" { - return fmt.Errorf("SQLite3DataFile must be set") + AuditLogFile: "", + AuditToSyslog: false, + AuditToBackendDB: false, + AuditPurgeDays: 7, + WaitReplicasTimeoutSeconds: 30, + TopoInformationRefreshSeconds: 15, + RecoveryPollSeconds: 1, } - - return nil } // read reads configuration from given file, or silently skips if the file does not exist. @@ -236,9 +261,6 @@ func read(fileName string) (*Configuration, error) { } else { log.Fatal("Cannot read config file:", fileName, err) } - if err := Config.postReadAdjustments(); err != nil { - log.Fatal(err) - } return Config, err } diff --git a/go/vt/vtorc/config/config_test.go b/go/vt/vtorc/config/config_test.go index 82da551f0c2..4c95d276a89 100644 --- a/go/vt/vtorc/config/config_test.go +++ b/go/vt/vtorc/config/config_test.go @@ -52,51 +52,6 @@ func TestUpdateConfigValuesFromFlags(t *testing.T) { require.Equal(t, testConfig, Config) }) - t.Run("override sqliteDataFile", func(t *testing.T) { - oldSqliteDataFile := sqliteDataFile - sqliteDataFile = "newVal" - // Restore the changes we make - defer func() { - Config = newConfiguration() - sqliteDataFile = oldSqliteDataFile - }() - - testConfig := newConfiguration() - testConfig.SQLite3DataFile = "newVal" - UpdateConfigValuesFromFlags() - require.Equal(t, testConfig, Config) - }) - - t.Run("override snapshotTopologyInterval", func(t *testing.T) { - oldSnapshotTopologyInterval := snapshotTopologyInterval - snapshotTopologyInterval = 1 * time.Hour - // Restore the changes we make - defer func() { - Config = newConfiguration() - snapshotTopologyInterval = oldSnapshotTopologyInterval - }() - - testConfig := newConfiguration() - testConfig.SnapshotTopologiesIntervalHours = 1 - UpdateConfigValuesFromFlags() - require.Equal(t, testConfig, Config) - }) - - t.Run("override reasonableReplicationLag", func(t *testing.T) { - oldReasonableReplicationLag := reasonableReplicationLag - reasonableReplicationLag = 15 * time.Second - // Restore the changes we make - defer func() { - Config = newConfiguration() - reasonableReplicationLag = oldReasonableReplicationLag - }() - - testConfig := newConfiguration() - testConfig.ReasonableReplicationLagSeconds = 15 - UpdateConfigValuesFromFlags() - require.Equal(t, testConfig, Config) - }) - t.Run("override auditFileLocation", func(t *testing.T) { oldAuditFileLocation := auditFileLocation auditFileLocation = "newFile" diff --git a/go/vt/vtorc/db/db.go b/go/vt/vtorc/db/db.go index 64143477645..6be9da0c27b 100644 --- a/go/vt/vtorc/db/db.go +++ b/go/vt/vtorc/db/db.go @@ -44,9 +44,9 @@ func (m *vtorcDB) QueryVTOrc(query string, argsArray []any, onRow func(sqlutils. // OpenTopology returns the DB instance for the vtorc backed database func OpenVTOrc() (db *sql.DB, err error) { var fromCache bool - db, fromCache, err = sqlutils.GetSQLiteDB(config.Config.SQLite3DataFile) + db, fromCache, err = sqlutils.GetSQLiteDB(config.GetSQLiteDataFile()) if err == nil && !fromCache { - log.Infof("Connected to vtorc backend: sqlite on %v", config.Config.SQLite3DataFile) + log.Infof("Connected to vtorc backend: sqlite on %v", config.GetSQLiteDataFile()) _ = initVTOrcDB(db) } if db != nil { @@ -94,7 +94,7 @@ func deployStatements(db *sql.DB, queries []string) error { // ClearVTOrcDatabase is used to clear the VTOrc database. This function is meant to be used by tests to clear the // database to get a clean slate without starting a new one. func ClearVTOrcDatabase() { - db, _, _ := sqlutils.GetSQLiteDB(config.Config.SQLite3DataFile) + db, _, _ := sqlutils.GetSQLiteDB(config.GetSQLiteDataFile()) if db != nil { _ = initVTOrcDB(db) } diff --git a/go/vt/vtorc/inst/analysis_dao.go b/go/vt/vtorc/inst/analysis_dao.go index 25d93a6864b..a0be620b450 100644 --- a/go/vt/vtorc/inst/analysis_dao.go +++ b/go/vt/vtorc/inst/analysis_dao.go @@ -68,7 +68,7 @@ func GetReplicationAnalysis(keyspace string, shard string, hints *ReplicationAna } // TODO(sougou); deprecate ReduceReplicationAnalysisCount - args := sqlutils.Args(config.Config.ReasonableReplicationLagSeconds, ValidSecondsFromSeenToLastAttemptedCheck(), config.Config.ReasonableReplicationLagSeconds, keyspace, shard) + args := sqlutils.Args(config.GetReasonableReplicationLagSeconds(), ValidSecondsFromSeenToLastAttemptedCheck(), config.GetReasonableReplicationLagSeconds(), keyspace, shard) query := ` SELECT vitess_tablet.info AS tablet_info, diff --git a/go/vt/vtorc/inst/instance_dao.go b/go/vt/vtorc/inst/instance_dao.go index 7697cf713c8..00059293e1a 100644 --- a/go/vt/vtorc/inst/instance_dao.go +++ b/go/vt/vtorc/inst/instance_dao.go @@ -562,7 +562,7 @@ func readInstanceRow(m sqlutils.RowMap) *Instance { instance.Problems = append(instance.Problems, "not_recently_checked") } else if instance.ReplicationThreadsExist() && !instance.ReplicaRunning() { instance.Problems = append(instance.Problems, "not_replicating") - } else if instance.ReplicationLagSeconds.Valid && util.AbsInt64(instance.ReplicationLagSeconds.Int64-int64(instance.SQLDelay)) > int64(config.Config.ReasonableReplicationLagSeconds) { + } else if instance.ReplicationLagSeconds.Valid && util.AbsInt64(instance.ReplicationLagSeconds.Int64-int64(instance.SQLDelay)) > int64(config.GetReasonableReplicationLagSeconds()) { instance.Problems = append(instance.Problems, "replication_lag") } if instance.GtidErrant != "" { @@ -646,7 +646,7 @@ func ReadProblemInstances(keyspace string, shard string) ([](*Instance), error) ) ` - args := sqlutils.Args(keyspace, keyspace, shard, shard, config.GetInstancePollSeconds()*5, config.Config.ReasonableReplicationLagSeconds, config.Config.ReasonableReplicationLagSeconds) + args := sqlutils.Args(keyspace, keyspace, shard, shard, config.GetInstancePollSeconds()*5, config.GetReasonableReplicationLagSeconds(), config.GetReasonableReplicationLagSeconds()) return readInstancesByCondition(condition, args, "") } @@ -1127,7 +1127,7 @@ func SnapshotTopologies() error { } func ExpireStaleInstanceBinlogCoordinates() error { - expireSeconds := config.Config.ReasonableReplicationLagSeconds * 2 + expireSeconds := config.GetReasonableReplicationLagSeconds() * 2 if expireSeconds < config.StaleInstanceCoordinatesExpireSeconds { expireSeconds = config.StaleInstanceCoordinatesExpireSeconds } diff --git a/go/vt/vtorc/logic/vtorc.go b/go/vt/vtorc/logic/vtorc.go index 0a9f668835c..01acab33dc2 100644 --- a/go/vt/vtorc/logic/vtorc.go +++ b/go/vt/vtorc/logic/vtorc.go @@ -276,8 +276,8 @@ func ContinuousDiscovery() { tabletTopoTick := OpenTabletDiscovery() var recoveryEntrance int64 var snapshotTopologiesTick <-chan time.Time - if config.Config.SnapshotTopologiesIntervalHours > 0 { - snapshotTopologiesTick = time.Tick(time.Duration(config.Config.SnapshotTopologiesIntervalHours) * time.Hour) + if config.GetSnapshotTopologyInterval() > 0 { + snapshotTopologiesTick = time.Tick(config.GetSnapshotTopologyInterval()) } go func() { From 7b86cb3f9e5363e530868ce9d794110ce42f4eec Mon Sep 17 00:00:00 2001 From: Manan Gupta Date: Tue, 12 Nov 2024 15:36:08 +0530 Subject: [PATCH 05/18] feat: add audit file location to viper Signed-off-by: Manan Gupta --- go/vt/vtorc/config/config.go | 27 ++++++++++++++++++++++----- go/vt/vtorc/config/config_test.go | 15 --------------- go/vt/vtorc/inst/audit_dao.go | 4 ++-- go/vt/vtorc/inst/audit_dao_test.go | 8 ++++---- 4 files changed, 28 insertions(+), 26 deletions(-) diff --git a/go/vt/vtorc/config/config.go b/go/vt/vtorc/config/config.go index 447ee529bd5..12df1eaf7a8 100644 --- a/go/vt/vtorc/config/config.go +++ b/go/vt/vtorc/config/config.go @@ -46,7 +46,7 @@ const ( var ( instancePollTime = viperutil.Configure( - "InstancePollTime", + "instance-pPollTime", viperutil.Options[time.Duration]{ FlagName: "instance-poll-time", Default: 5 * time.Second, @@ -89,10 +89,18 @@ var ( Dynamic: true, }, ) + + auditFileLocation = viperutil.Configure( + "AuditFileLocation", + viperutil.Options[string]{ + FlagName: "audit-file-location", + Default: "", + Dynamic: false, + }, + ) ) var ( - auditFileLocation = "" auditToBackend = false auditToSyslog = false auditPurgeDuration = 7 * 24 * time.Hour // Equivalent of 7 days @@ -114,7 +122,7 @@ func registerFlags(fs *pflag.FlagSet) { fs.Duration("instance-poll-time", instancePollTime.Default(), "Timer duration on which VTOrc refreshes MySQL information") fs.Duration("snapshot-topology-interval", snapshotTopologyInterval.Default(), "Timer duration on which VTOrc takes a snapshot of the current MySQL information it has in the database. Should be in multiple of hours") fs.Duration("reasonable-replication-lag", reasonableReplicationLag.Default(), "Maximum replication lag on replicas which is deemed to be acceptable") - fs.StringVar(&auditFileLocation, "audit-file-location", auditFileLocation, "File location where the audit logs are to be stored") + fs.String("audit-file-location", auditFileLocation.Default(), "File location where the audit logs are to be stored") fs.BoolVar(&auditToBackend, "audit-to-backend", auditToBackend, "Whether to store the audit log in the VTOrc database") fs.BoolVar(&auditToSyslog, "audit-to-syslog", auditToSyslog, "Whether to store the audit log in the syslog") fs.DurationVar(&auditPurgeDuration, "audit-purge-duration", auditPurgeDuration, "Duration for which audit logs are held before being purged. Should be in multiples of days") @@ -132,6 +140,7 @@ func registerFlags(fs *pflag.FlagSet) { sqliteDataFile, snapshotTopologyInterval, reasonableReplicationLag, + auditFileLocation, ) } @@ -194,10 +203,19 @@ func GetSnapshotTopologyInterval() time.Duration { return snapshotTopologyInterval.Get() } +// GetAuditFileLocation is a getter function. +func GetAuditFileLocation() string { + return auditFileLocation.Get() +} + +// SetAuditFileLocation is a setter function. +func SetAuditFileLocation(v string) { + auditFileLocation.Set(v) +} + // UpdateConfigValuesFromFlags is used to update the config values from the flags defined. // This is done before we read any configuration files from the user. So the config files take precedence. func UpdateConfigValuesFromFlags() { - Config.AuditLogFile = auditFileLocation Config.AuditToBackendDB = auditToBackend Config.AuditToSyslog = auditToSyslog Config.AuditPurgeDays = uint(auditPurgeDuration / (time.Hour * 24)) @@ -234,7 +252,6 @@ func LogConfigValues() { func newConfiguration() *Configuration { return &Configuration{ - AuditLogFile: "", AuditToSyslog: false, AuditToBackendDB: false, AuditPurgeDays: 7, diff --git a/go/vt/vtorc/config/config_test.go b/go/vt/vtorc/config/config_test.go index 4c95d276a89..d60bdc9dec8 100644 --- a/go/vt/vtorc/config/config_test.go +++ b/go/vt/vtorc/config/config_test.go @@ -52,21 +52,6 @@ func TestUpdateConfigValuesFromFlags(t *testing.T) { require.Equal(t, testConfig, Config) }) - t.Run("override auditFileLocation", func(t *testing.T) { - oldAuditFileLocation := auditFileLocation - auditFileLocation = "newFile" - // Restore the changes we make - defer func() { - Config = newConfiguration() - auditFileLocation = oldAuditFileLocation - }() - - testConfig := newConfiguration() - testConfig.AuditLogFile = "newFile" - UpdateConfigValuesFromFlags() - require.Equal(t, testConfig, Config) - }) - t.Run("override auditToBackend", func(t *testing.T) { oldAuditToBackend := auditToBackend auditToBackend = true diff --git a/go/vt/vtorc/inst/audit_dao.go b/go/vt/vtorc/inst/audit_dao.go index 642fb187509..9a8048491d3 100644 --- a/go/vt/vtorc/inst/audit_dao.go +++ b/go/vt/vtorc/inst/audit_dao.go @@ -38,10 +38,10 @@ func AuditOperation(auditType string, tabletAlias string, message string) error } auditWrittenToFile := false - if config.Config.AuditLogFile != "" { + if config.GetAuditFileLocation() != "" { auditWrittenToFile = true go func() { - f, err := os.OpenFile(config.Config.AuditLogFile, os.O_RDWR|os.O_CREATE|os.O_APPEND, 0640) + f, err := os.OpenFile(config.GetAuditFileLocation(), os.O_RDWR|os.O_CREATE|os.O_APPEND, 0640) if err != nil { log.Error(err) return diff --git a/go/vt/vtorc/inst/audit_dao_test.go b/go/vt/vtorc/inst/audit_dao_test.go index 1d50de4c146..ea67d22dcb7 100644 --- a/go/vt/vtorc/inst/audit_dao_test.go +++ b/go/vt/vtorc/inst/audit_dao_test.go @@ -36,11 +36,11 @@ import ( func TestAuditOperation(t *testing.T) { // Restore original configurations originalAuditSysLog := config.Config.AuditToSyslog - originalAuditLogFile := config.Config.AuditLogFile + originalAuditLogFile := config.GetAuditFileLocation() originalAuditBackend := config.Config.AuditToBackendDB defer func() { config.Config.AuditToSyslog = originalAuditSysLog - config.Config.AuditLogFile = originalAuditLogFile + config.SetAuditFileLocation(originalAuditLogFile) config.Config.AuditToBackendDB = originalAuditBackend }() @@ -78,7 +78,7 @@ func TestAuditOperation(t *testing.T) { message := "test-message" t.Run("audit to backend", func(t *testing.T) { - config.Config.AuditLogFile = "" + config.SetAuditFileLocation("") config.Config.AuditToSyslog = false config.Config.AuditToBackendDB = true @@ -112,7 +112,7 @@ func TestAuditOperation(t *testing.T) { file, err := os.CreateTemp("", "test-auditing-*") require.NoError(t, err) defer os.Remove(file.Name()) - config.Config.AuditLogFile = file.Name() + config.SetAuditFileLocation(file.Name()) err = AuditOperation(auditType, tab100Alias, message) require.NoError(t, err) From 1ab5cc3a22a830116ff0302dc50f5d5eb7858662 Mon Sep 17 00:00:00 2001 From: Manan Gupta Date: Tue, 12 Nov 2024 15:48:53 +0530 Subject: [PATCH 06/18] feat: move remaining audit flags to viper Signed-off-by: Manan Gupta --- go/cmd/vtorc/cli/cli.go | 2 +- go/vt/vtorc/config/config.go | 87 ++++++++++++++----- go/vt/vtorc/config/config_test.go | 48 ---------- go/vt/vtorc/inst/audit_dao.go | 2 +- go/vt/vtorc/inst/audit_dao_test.go | 16 ++-- go/vt/vtorc/inst/instance_dao.go | 2 +- go/vt/vtorc/inst/instance_dao_test.go | 6 +- .../vtorc/logic/topology_recovery_dao_test.go | 6 +- 8 files changed, 84 insertions(+), 85 deletions(-) diff --git a/go/cmd/vtorc/cli/cli.go b/go/cmd/vtorc/cli/cli.go index c58eaf2853c..f2bd7383e01 100644 --- a/go/cmd/vtorc/cli/cli.go +++ b/go/cmd/vtorc/cli/cli.go @@ -54,7 +54,7 @@ func run(cmd *cobra.Command, args []string) { inst.RegisterStats() log.Info("starting vtorc") - if config.Config.AuditToSyslog { + if config.GetAuditToSyslog() { inst.EnableAuditSyslog() } config.MarkConfigurationLoaded() diff --git a/go/vt/vtorc/config/config.go b/go/vt/vtorc/config/config.go index 12df1eaf7a8..b663c2fd3a4 100644 --- a/go/vt/vtorc/config/config.go +++ b/go/vt/vtorc/config/config.go @@ -98,12 +98,36 @@ var ( Dynamic: false, }, ) + + auditToBackend = viperutil.Configure( + "AuditToBackend", + viperutil.Options[bool]{ + FlagName: "audit-to-backend", + Default: false, + Dynamic: true, + }, + ) + + auditToSyslog = viperutil.Configure( + "AuditToSyslog", + viperutil.Options[bool]{ + FlagName: "audit-to-syslog", + Default: false, + Dynamic: true, + }, + ) + + auditPurgeDuration = viperutil.Configure( + "AuditPurgeDuration", + viperutil.Options[time.Duration]{ + FlagName: "reasonable-replication-lag", + Default: 7 * 24 * time.Hour, + Dynamic: true, + }, + ) ) var ( - auditToBackend = false - auditToSyslog = false - auditPurgeDuration = 7 * 24 * time.Hour // Equivalent of 7 days waitReplicasTimeout = 30 * time.Second tolerableReplicationLag = 0 * time.Second topoInformationRefreshDuration = 15 * time.Second @@ -123,9 +147,9 @@ func registerFlags(fs *pflag.FlagSet) { fs.Duration("snapshot-topology-interval", snapshotTopologyInterval.Default(), "Timer duration on which VTOrc takes a snapshot of the current MySQL information it has in the database. Should be in multiple of hours") fs.Duration("reasonable-replication-lag", reasonableReplicationLag.Default(), "Maximum replication lag on replicas which is deemed to be acceptable") fs.String("audit-file-location", auditFileLocation.Default(), "File location where the audit logs are to be stored") - fs.BoolVar(&auditToBackend, "audit-to-backend", auditToBackend, "Whether to store the audit log in the VTOrc database") - fs.BoolVar(&auditToSyslog, "audit-to-syslog", auditToSyslog, "Whether to store the audit log in the syslog") - fs.DurationVar(&auditPurgeDuration, "audit-purge-duration", auditPurgeDuration, "Duration for which audit logs are held before being purged. Should be in multiples of days") + fs.Bool("audit-to-backend", auditToBackend.Default(), "Whether to store the audit log in the VTOrc database") + fs.Bool("audit-to-syslog", auditToSyslog.Default(), "Whether to store the audit log in the syslog") + fs.Duration("audit-purge-duration", auditPurgeDuration.Default(), "Duration for which audit logs are held before being purged. Should be in multiples of days") fs.Bool("prevent-cross-cell-failover", preventCrossCellFailover.Default(), "Prevent VTOrc from promoting a primary in a different cell than the current primary in case of a failover") fs.DurationVar(&waitReplicasTimeout, "wait-replicas-timeout", waitReplicasTimeout, "Duration for which to wait for replica's to respond when issuing RPCs") fs.DurationVar(&tolerableReplicationLag, "tolerable-replication-lag", tolerableReplicationLag, "Amount of replication lag that is considered acceptable for a tablet to be eligible for promotion when Vitess makes the choice of a new primary in PRS") @@ -141,6 +165,9 @@ func registerFlags(fs *pflag.FlagSet) { snapshotTopologyInterval, reasonableReplicationLag, auditFileLocation, + auditToBackend, + auditToSyslog, + auditPurgeDuration, ) } @@ -149,14 +176,10 @@ func registerFlags(fs *pflag.FlagSet) { // strictly expected from user. // TODO(sougou): change this to yaml parsing, and possible merge with tabletenv. type Configuration struct { - AuditLogFile string // Name of log file for audit operations. Disabled when empty. - AuditToSyslog bool // If true, audit messages are written to syslog - AuditToBackendDB bool // If true, audit messages are written to the backend DB's `audit` table (default: true) - AuditPurgeDays uint // Days after which audit entries are purged from the database - WaitReplicasTimeoutSeconds int // Timeout on amount of time to wait for the replicas in case of ERS. Should be a small value because we should fail-fast. Should not be larger than LockTimeout since that is the total time we use for an ERS. - TolerableReplicationLagSeconds int // Amount of replication lag that is considered acceptable for a tablet to be eligible for promotion when Vitess makes the choice of a new primary in PRS. - TopoInformationRefreshSeconds int // Timer duration on which VTOrc refreshes the keyspace and vttablet records from the topo-server. - RecoveryPollSeconds int // Timer duration on which VTOrc recovery analysis runs + WaitReplicasTimeoutSeconds int // Timeout on amount of time to wait for the replicas in case of ERS. Should be a small value because we should fail-fast. Should not be larger than LockTimeout since that is the total time we use for an ERS. + TolerableReplicationLagSeconds int // Amount of replication lag that is considered acceptable for a tablet to be eligible for promotion when Vitess makes the choice of a new primary in PRS. + TopoInformationRefreshSeconds int // Timer duration on which VTOrc refreshes the keyspace and vttablet records from the topo-server. + RecoveryPollSeconds int // Timer duration on which VTOrc recovery analysis runs } // ToJSONString will marshal this configuration as JSON @@ -213,12 +236,39 @@ func SetAuditFileLocation(v string) { auditFileLocation.Set(v) } +// GetAuditToSyslog is a getter function. +func GetAuditToSyslog() bool { + return auditToSyslog.Get() +} + +// SetAuditToSyslog is a setter function. +func SetAuditToSyslog(v bool) { + auditToSyslog.Set(v) +} + +// GetAuditToBackend is a getter function. +func GetAuditToBackend() bool { + return auditToBackend.Get() +} + +// SetAuditToBackend is a setter function. +func SetAuditToBackend(v bool) { + auditToBackend.Set(v) +} + +// GetAuditPurgeDays gets the audit purge duration but in days. +func GetAuditPurgeDays() int64 { + return int64(auditPurgeDuration.Get() / (24 * time.Hour)) +} + +// SetAuditPurgeDays sets the audit purge duration. +func SetAuditPurgeDays(days int64) { + auditPurgeDuration.Set(time.Duration(days) * 24 * time.Hour) +} + // UpdateConfigValuesFromFlags is used to update the config values from the flags defined. // This is done before we read any configuration files from the user. So the config files take precedence. func UpdateConfigValuesFromFlags() { - Config.AuditToBackendDB = auditToBackend - Config.AuditToSyslog = auditToSyslog - Config.AuditPurgeDays = uint(auditPurgeDuration / (time.Hour * 24)) Config.WaitReplicasTimeoutSeconds = int(waitReplicasTimeout / time.Second) Config.TolerableReplicationLagSeconds = int(tolerableReplicationLag / time.Second) Config.TopoInformationRefreshSeconds = int(topoInformationRefreshDuration / time.Second) @@ -252,9 +302,6 @@ func LogConfigValues() { func newConfiguration() *Configuration { return &Configuration{ - AuditToSyslog: false, - AuditToBackendDB: false, - AuditPurgeDays: 7, WaitReplicasTimeoutSeconds: 30, TopoInformationRefreshSeconds: 15, RecoveryPollSeconds: 1, diff --git a/go/vt/vtorc/config/config_test.go b/go/vt/vtorc/config/config_test.go index d60bdc9dec8..8b49c5b41cd 100644 --- a/go/vt/vtorc/config/config_test.go +++ b/go/vt/vtorc/config/config_test.go @@ -34,54 +34,6 @@ func TestUpdateConfigValuesFromFlags(t *testing.T) { require.Equal(t, defaultConfig, Config) }) - t.Run("override auditPurgeDuration", func(t *testing.T) { - oldAuditPurgeDuration := auditPurgeDuration - auditPurgeDuration = 8 * time.Hour * 24 - auditPurgeDuration += time.Second + 4*time.Minute - // Restore the changes we make - defer func() { - Config = newConfiguration() - auditPurgeDuration = oldAuditPurgeDuration - }() - - testConfig := newConfiguration() - // auditPurgeDuration is supposed to be in multiples of days. - // If it is not, then we round down to the nearest number of days. - testConfig.AuditPurgeDays = 8 - UpdateConfigValuesFromFlags() - require.Equal(t, testConfig, Config) - }) - - t.Run("override auditToBackend", func(t *testing.T) { - oldAuditToBackend := auditToBackend - auditToBackend = true - // Restore the changes we make - defer func() { - Config = newConfiguration() - auditToBackend = oldAuditToBackend - }() - - testConfig := newConfiguration() - testConfig.AuditToBackendDB = true - UpdateConfigValuesFromFlags() - require.Equal(t, testConfig, Config) - }) - - t.Run("override auditToSyslog", func(t *testing.T) { - oldAuditToSyslog := auditToSyslog - auditToSyslog = true - // Restore the changes we make - defer func() { - Config = newConfiguration() - auditToSyslog = oldAuditToSyslog - }() - - testConfig := newConfiguration() - testConfig.AuditToSyslog = true - UpdateConfigValuesFromFlags() - require.Equal(t, testConfig, Config) - }) - t.Run("override waitReplicasTimeout", func(t *testing.T) { oldWaitReplicasTimeout := waitReplicasTimeout waitReplicasTimeout = 3*time.Minute + 4*time.Second diff --git a/go/vt/vtorc/inst/audit_dao.go b/go/vt/vtorc/inst/audit_dao.go index 9a8048491d3..e500289a382 100644 --- a/go/vt/vtorc/inst/audit_dao.go +++ b/go/vt/vtorc/inst/audit_dao.go @@ -54,7 +54,7 @@ func AuditOperation(auditType string, tabletAlias string, message string) error } }() } - if config.Config.AuditToBackendDB { + if config.GetAuditToBackend() { _, err := db.ExecVTOrc(` insert into audit ( diff --git a/go/vt/vtorc/inst/audit_dao_test.go b/go/vt/vtorc/inst/audit_dao_test.go index ea67d22dcb7..d22e9177dc3 100644 --- a/go/vt/vtorc/inst/audit_dao_test.go +++ b/go/vt/vtorc/inst/audit_dao_test.go @@ -35,13 +35,13 @@ import ( // This test also verifies that we are able to read the recent audits that are written to the databaes. func TestAuditOperation(t *testing.T) { // Restore original configurations - originalAuditSysLog := config.Config.AuditToSyslog + originalAuditSysLog := config.GetAuditToSyslog() originalAuditLogFile := config.GetAuditFileLocation() - originalAuditBackend := config.Config.AuditToBackendDB + originalAuditBackend := config.GetAuditToBackend() defer func() { - config.Config.AuditToSyslog = originalAuditSysLog + config.SetAuditToSyslog(originalAuditSysLog) config.SetAuditFileLocation(originalAuditLogFile) - config.Config.AuditToBackendDB = originalAuditBackend + config.SetAuditToBackend(originalAuditBackend) }() orcDb, err := db.OpenVTOrc() @@ -79,8 +79,8 @@ func TestAuditOperation(t *testing.T) { t.Run("audit to backend", func(t *testing.T) { config.SetAuditFileLocation("") - config.Config.AuditToSyslog = false - config.Config.AuditToBackendDB = true + config.SetAuditToSyslog(false) + config.SetAuditToBackend(true) // Auditing should succeed as expected err = AuditOperation(auditType, tab100Alias, message) @@ -106,8 +106,8 @@ func TestAuditOperation(t *testing.T) { }) t.Run("audit to File", func(t *testing.T) { - config.Config.AuditToBackendDB = false - config.Config.AuditToSyslog = false + config.SetAuditToBackend(false) + config.SetAuditToSyslog(false) file, err := os.CreateTemp("", "test-auditing-*") require.NoError(t, err) diff --git a/go/vt/vtorc/inst/instance_dao.go b/go/vt/vtorc/inst/instance_dao.go index 00059293e1a..985af855713 100644 --- a/go/vt/vtorc/inst/instance_dao.go +++ b/go/vt/vtorc/inst/instance_dao.go @@ -116,7 +116,7 @@ func ExpireTableData(tableName string, timestampColumn string) error { writeFunc := func() error { _, err := db.ExecVTOrc( fmt.Sprintf("delete from %s where %s < datetime('now', printf('-%%d DAY', ?))", tableName, timestampColumn), - config.Config.AuditPurgeDays, + config.GetAuditPurgeDays(), ) return err } diff --git a/go/vt/vtorc/inst/instance_dao_test.go b/go/vt/vtorc/inst/instance_dao_test.go index e71731f9f51..bc3febf97e6 100644 --- a/go/vt/vtorc/inst/instance_dao_test.go +++ b/go/vt/vtorc/inst/instance_dao_test.go @@ -718,10 +718,10 @@ func TestGetDatabaseState(t *testing.T) { } func TestExpireTableData(t *testing.T) { - oldVal := config.Config.AuditPurgeDays - config.Config.AuditPurgeDays = 10 + oldVal := config.GetAuditPurgeDays() + config.SetAuditPurgeDays(10) defer func() { - config.Config.AuditPurgeDays = oldVal + config.SetAuditPurgeDays(oldVal) }() tests := []struct { diff --git a/go/vt/vtorc/logic/topology_recovery_dao_test.go b/go/vt/vtorc/logic/topology_recovery_dao_test.go index 20dfb7e91e2..6a1d7c4c48f 100644 --- a/go/vt/vtorc/logic/topology_recovery_dao_test.go +++ b/go/vt/vtorc/logic/topology_recovery_dao_test.go @@ -70,10 +70,10 @@ func TestTopologyRecovery(t *testing.T) { } func TestExpireTableData(t *testing.T) { - oldVal := config.Config.AuditPurgeDays - config.Config.AuditPurgeDays = 10 + oldVal := config.GetAuditPurgeDays() + config.SetAuditPurgeDays(10) defer func() { - config.Config.AuditPurgeDays = oldVal + config.SetAuditPurgeDays(oldVal) }() tests := []struct { From ec2ef8788ccd3287455e17fa2945fc7ca09c5aa1 Mon Sep 17 00:00:00 2001 From: Manan Gupta Date: Tue, 12 Nov 2024 16:12:45 +0530 Subject: [PATCH 07/18] feat: move remaining configs to viper Signed-off-by: Manan Gupta --- go/cmd/vtorc/cli/cli.go | 1 - go/vt/vtorc/config/config.go | 146 ++++++++++++------------- go/vt/vtorc/config/config_test.go | 64 ----------- go/vt/vtorc/inst/analysis_dao.go | 2 +- go/vt/vtorc/logic/tablet_discovery.go | 2 +- go/vt/vtorc/logic/topology_recovery.go | 7 +- go/vt/vtorc/logic/vtorc.go | 21 +--- 7 files changed, 73 insertions(+), 170 deletions(-) diff --git a/go/cmd/vtorc/cli/cli.go b/go/cmd/vtorc/cli/cli.go index f2bd7383e01..bfc1e6ba662 100644 --- a/go/cmd/vtorc/cli/cli.go +++ b/go/cmd/vtorc/cli/cli.go @@ -50,7 +50,6 @@ var ( func run(cmd *cobra.Command, args []string) { servenv.Init() - config.UpdateConfigValuesFromFlags() inst.RegisterStats() log.Info("starting vtorc") diff --git a/go/vt/vtorc/config/config.go b/go/vt/vtorc/config/config.go index b663c2fd3a4..47e4c1e8326 100644 --- a/go/vt/vtorc/config/config.go +++ b/go/vt/vtorc/config/config.go @@ -17,9 +17,6 @@ package config import ( - "encoding/json" - "fmt" - "os" "time" "github.com/spf13/pflag" @@ -46,7 +43,7 @@ const ( var ( instancePollTime = viperutil.Configure( - "instance-pPollTime", + "InstancePollTime", viperutil.Options[time.Duration]{ FlagName: "instance-poll-time", Default: 5 * time.Second, @@ -73,7 +70,7 @@ var ( ) snapshotTopologyInterval = viperutil.Configure( - "snapshotTopologyInterval", + "SnapshotTopologyInterval", viperutil.Options[time.Duration]{ FlagName: "snapshot-topology-interval", Default: 0 * time.Hour, @@ -82,7 +79,7 @@ var ( ) reasonableReplicationLag = viperutil.Configure( - "reasonableReplicationLag", + "ReasonableReplicationLag", viperutil.Options[time.Duration]{ FlagName: "reasonable-replication-lag", Default: 10 * time.Second, @@ -125,15 +122,47 @@ var ( Dynamic: true, }, ) + + waitReplicasTimeout = viperutil.Configure( + "WaitReplicasTimeout", + viperutil.Options[time.Duration]{ + FlagName: "wait-replicas-timeout", + Default: 30 * time.Second, + Dynamic: true, + }, + ) + + tolerableReplicationLag = viperutil.Configure( + "TolerableReplicationLag", + viperutil.Options[time.Duration]{ + FlagName: "tolerable-replication-lag", + Default: 0 * time.Second, + Dynamic: true, + }, + ) + + topoInformationRefreshDuration = viperutil.Configure( + "TopoInformationRefreshDuration", + viperutil.Options[time.Duration]{ + FlagName: "topo-information-refresh-duration", + Default: 15 * time.Second, + Dynamic: true, + }, + ) + + recoveryPollDuration = viperutil.Configure( + "RecoveryPollDuration", + viperutil.Options[time.Duration]{ + FlagName: "recovery-poll-duration", + Default: 1 * time.Second, + Dynamic: true, + }, + ) ) var ( - waitReplicasTimeout = 30 * time.Second - tolerableReplicationLag = 0 * time.Second - topoInformationRefreshDuration = 15 * time.Second - recoveryPollDuration = 1 * time.Second - ersEnabled = true - convertTabletsWithErrantGTIDs = false + ersEnabled = true + convertTabletsWithErrantGTIDs = false ) func init() { @@ -151,10 +180,10 @@ func registerFlags(fs *pflag.FlagSet) { fs.Bool("audit-to-syslog", auditToSyslog.Default(), "Whether to store the audit log in the syslog") fs.Duration("audit-purge-duration", auditPurgeDuration.Default(), "Duration for which audit logs are held before being purged. Should be in multiples of days") fs.Bool("prevent-cross-cell-failover", preventCrossCellFailover.Default(), "Prevent VTOrc from promoting a primary in a different cell than the current primary in case of a failover") - fs.DurationVar(&waitReplicasTimeout, "wait-replicas-timeout", waitReplicasTimeout, "Duration for which to wait for replica's to respond when issuing RPCs") - fs.DurationVar(&tolerableReplicationLag, "tolerable-replication-lag", tolerableReplicationLag, "Amount of replication lag that is considered acceptable for a tablet to be eligible for promotion when Vitess makes the choice of a new primary in PRS") - fs.DurationVar(&topoInformationRefreshDuration, "topo-information-refresh-duration", topoInformationRefreshDuration, "Timer duration on which VTOrc refreshes the keyspace and vttablet records from the topology server") - fs.DurationVar(&recoveryPollDuration, "recovery-poll-duration", recoveryPollDuration, "Timer duration on which VTOrc polls its database to run a recovery") + fs.Duration("wait-replicas-timeout", waitReplicasTimeout.Default(), "Duration for which to wait for replica's to respond when issuing RPCs") + fs.Duration("tolerable-replication-lag", tolerableReplicationLag.Default(), "Amount of replication lag that is considered acceptable for a tablet to be eligible for promotion when Vitess makes the choice of a new primary in PRS") + fs.Duration("topo-information-refresh-duration", topoInformationRefreshDuration.Default(), "Timer duration on which VTOrc refreshes the keyspace and vttablet records from the topology server") + fs.Duration("recovery-poll-duration", recoveryPollDuration.Default(), "Timer duration on which VTOrc polls its database to run a recovery") fs.BoolVar(&ersEnabled, "allow-emergency-reparent", ersEnabled, "Whether VTOrc should be allowed to run emergency reparent operation when it detects a dead primary") fs.BoolVar(&convertTabletsWithErrantGTIDs, "change-tablets-with-errant-gtid-to-drained", convertTabletsWithErrantGTIDs, "Whether VTOrc should be changing the type of tablets with errant GTIDs to DRAINED") @@ -168,29 +197,13 @@ func registerFlags(fs *pflag.FlagSet) { auditToBackend, auditToSyslog, auditPurgeDuration, + waitReplicasTimeout, + tolerableReplicationLag, + topoInformationRefreshDuration, + recoveryPollDuration, ) } -// Configuration makes for vtorc configuration input, which can be provided by user via JSON formatted file. -// Some of the parameters have reasonable default values, and some (like database credentials) are -// strictly expected from user. -// TODO(sougou): change this to yaml parsing, and possible merge with tabletenv. -type Configuration struct { - WaitReplicasTimeoutSeconds int // Timeout on amount of time to wait for the replicas in case of ERS. Should be a small value because we should fail-fast. Should not be larger than LockTimeout since that is the total time we use for an ERS. - TolerableReplicationLagSeconds int // Amount of replication lag that is considered acceptable for a tablet to be eligible for promotion when Vitess makes the choice of a new primary in PRS. - TopoInformationRefreshSeconds int // Timer duration on which VTOrc refreshes the keyspace and vttablet records from the topo-server. - RecoveryPollSeconds int // Timer duration on which VTOrc recovery analysis runs -} - -// ToJSONString will marshal this configuration as JSON -func (config *Configuration) ToJSONString() string { - b, _ := json.Marshal(config) - return string(b) -} - -// Config is *the* configuration instance, used globally to get configuration data -var Config = newConfiguration() - // GetInstancePollTime is a getter function. func GetInstancePollTime() time.Duration { return instancePollTime.Get() @@ -266,13 +279,24 @@ func SetAuditPurgeDays(days int64) { auditPurgeDuration.Set(time.Duration(days) * 24 * time.Hour) } -// UpdateConfigValuesFromFlags is used to update the config values from the flags defined. -// This is done before we read any configuration files from the user. So the config files take precedence. -func UpdateConfigValuesFromFlags() { - Config.WaitReplicasTimeoutSeconds = int(waitReplicasTimeout / time.Second) - Config.TolerableReplicationLagSeconds = int(tolerableReplicationLag / time.Second) - Config.TopoInformationRefreshSeconds = int(topoInformationRefreshDuration / time.Second) - Config.RecoveryPollSeconds = int(recoveryPollDuration / time.Second) +// GetWaitReplicasTimeout is a getter function. +func GetWaitReplicasTimeout() time.Duration { + return waitReplicasTimeout.Get() +} + +// GetTolerableReplicationLag is a getter function. +func GetTolerableReplicationLag() time.Duration { + return tolerableReplicationLag.Get() +} + +// GetTopoInformationRefreshDuration is a getter function. +func GetTopoInformationRefreshDuration() time.Duration { + return topoInformationRefreshDuration.Get() +} + +// GetRecoveryPollDuration is a getter function. +func GetRecoveryPollDuration() time.Duration { + return recoveryPollDuration.Get() } // ERSEnabled reports whether VTOrc is allowed to run ERS or not. @@ -300,42 +324,6 @@ func LogConfigValues() { log.Infof("Running with Configuration - %v", debug.AllSettings()) } -func newConfiguration() *Configuration { - return &Configuration{ - WaitReplicasTimeoutSeconds: 30, - TopoInformationRefreshSeconds: 15, - RecoveryPollSeconds: 1, - } -} - -// read reads configuration from given file, or silently skips if the file does not exist. -// If the file does exist, then it is expected to be in valid JSON format or the function bails out. -func read(fileName string) (*Configuration, error) { - if fileName == "" { - return Config, fmt.Errorf("Empty file name") - } - file, err := os.Open(fileName) - if err != nil { - return Config, err - } - decoder := json.NewDecoder(file) - err = decoder.Decode(Config) - if err == nil { - log.Infof("Read config: %s", fileName) - } else { - log.Fatal("Cannot read config file:", fileName, err) - } - return Config, err -} - -// Reload re-reads configuration from last used files -func Reload(extraFileNames ...string) *Configuration { - for _, fileName := range extraFileNames { - _, _ = read(fileName) - } - return Config -} - // MarkConfigurationLoaded is called once configuration has first been loaded. // Listeners on ConfigurationLoaded will get a notification func MarkConfigurationLoaded() { diff --git a/go/vt/vtorc/config/config_test.go b/go/vt/vtorc/config/config_test.go index 8b49c5b41cd..69abc75cde4 100644 --- a/go/vt/vtorc/config/config_test.go +++ b/go/vt/vtorc/config/config_test.go @@ -15,67 +15,3 @@ limitations under the License. */ package config - -import ( - "testing" - "time" - - "github.com/stretchr/testify/require" -) - -func TestUpdateConfigValuesFromFlags(t *testing.T) { - t.Run("defaults", func(t *testing.T) { - // Restore the changes we make to the Config parameter - defer func() { - Config = newConfiguration() - }() - defaultConfig := newConfiguration() - UpdateConfigValuesFromFlags() - require.Equal(t, defaultConfig, Config) - }) - - t.Run("override waitReplicasTimeout", func(t *testing.T) { - oldWaitReplicasTimeout := waitReplicasTimeout - waitReplicasTimeout = 3*time.Minute + 4*time.Second - // Restore the changes we make - defer func() { - Config = newConfiguration() - waitReplicasTimeout = oldWaitReplicasTimeout - }() - - testConfig := newConfiguration() - testConfig.WaitReplicasTimeoutSeconds = 184 - UpdateConfigValuesFromFlags() - require.Equal(t, testConfig, Config) - }) - - t.Run("override topoInformationRefreshDuration", func(t *testing.T) { - oldTopoInformationRefreshDuration := topoInformationRefreshDuration - topoInformationRefreshDuration = 1 * time.Second - // Restore the changes we make - defer func() { - Config = newConfiguration() - topoInformationRefreshDuration = oldTopoInformationRefreshDuration - }() - - testConfig := newConfiguration() - testConfig.TopoInformationRefreshSeconds = 1 - UpdateConfigValuesFromFlags() - require.Equal(t, testConfig, Config) - }) - - t.Run("override recoveryPollDuration", func(t *testing.T) { - oldRecoveryPollDuration := recoveryPollDuration - recoveryPollDuration = 15 * time.Second - // Restore the changes we make - defer func() { - Config = newConfiguration() - recoveryPollDuration = oldRecoveryPollDuration - }() - - testConfig := newConfiguration() - testConfig.RecoveryPollSeconds = 15 - UpdateConfigValuesFromFlags() - require.Equal(t, testConfig, Config) - }) -} diff --git a/go/vt/vtorc/inst/analysis_dao.go b/go/vt/vtorc/inst/analysis_dao.go index a0be620b450..98abdcc7188 100644 --- a/go/vt/vtorc/inst/analysis_dao.go +++ b/go/vt/vtorc/inst/analysis_dao.go @@ -47,7 +47,7 @@ func init() { func initializeAnalysisDaoPostConfiguration() { config.WaitForConfigurationToBeLoaded() - recentInstantAnalysis = cache.New(time.Duration(config.Config.RecoveryPollSeconds*2)*time.Second, time.Second) + recentInstantAnalysis = cache.New(config.GetRecoveryPollDuration()*2, time.Second) } type clusterAnalysis struct { diff --git a/go/vt/vtorc/logic/tablet_discovery.go b/go/vt/vtorc/logic/tablet_discovery.go index e9bbcee35cb..90100308f7f 100644 --- a/go/vt/vtorc/logic/tablet_discovery.go +++ b/go/vt/vtorc/logic/tablet_discovery.go @@ -71,7 +71,7 @@ func OpenTabletDiscovery() <-chan time.Time { } // We refresh all information from the topo once before we start the ticks to do it on a timer. populateAllInformation() - return time.Tick(time.Second * time.Duration(config.Config.TopoInformationRefreshSeconds)) //nolint SA1015: using time.Tick leaks the underlying ticker + return time.Tick(config.GetTopoInformationRefreshDuration()) //nolint SA1015: using time.Tick leaks the underlying ticker } // populateAllInformation initializes all the information for VTOrc to function. diff --git a/go/vt/vtorc/logic/topology_recovery.go b/go/vt/vtorc/logic/topology_recovery.go index 7809f25e93b..f14eca624c9 100644 --- a/go/vt/vtorc/logic/topology_recovery.go +++ b/go/vt/vtorc/logic/topology_recovery.go @@ -21,7 +21,6 @@ import ( "encoding/json" "fmt" "math/rand/v2" - "time" "vitess.io/vitess/go/stats" "vitess.io/vitess/go/vt/log" @@ -235,7 +234,7 @@ func runEmergencyReparentOp(ctx context.Context, analysisEntry *inst.Replication tablet.Shard, reparentutil.EmergencyReparentOptions{ IgnoreReplicas: nil, - WaitReplicasTimeout: time.Duration(config.Config.WaitReplicasTimeoutSeconds) * time.Second, + WaitReplicasTimeout: config.GetWaitReplicasTimeout(), PreventCrossCellPromotion: config.GetPreventCrossCellFailover(), WaitAllTablets: waitForAllTablets, }, @@ -703,8 +702,8 @@ func electNewPrimary(ctx context.Context, analysisEntry *inst.ReplicationAnalysi analyzedTablet.Keyspace, analyzedTablet.Shard, reparentutil.PlannedReparentOptions{ - WaitReplicasTimeout: time.Duration(config.Config.WaitReplicasTimeoutSeconds) * time.Second, - TolerableReplLag: time.Duration(config.Config.TolerableReplicationLagSeconds) * time.Second, + WaitReplicasTimeout: config.GetWaitReplicasTimeout(), + TolerableReplLag: config.GetTolerableReplicationLag(), }, ) diff --git a/go/vt/vtorc/logic/vtorc.go b/go/vt/vtorc/logic/vtorc.go index 01acab33dc2..b8cf404d050 100644 --- a/go/vt/vtorc/logic/vtorc.go +++ b/go/vt/vtorc/logic/vtorc.go @@ -17,11 +17,8 @@ package logic import ( - "os" - "os/signal" "sync" "sync/atomic" - "syscall" "time" "github.com/patrickmn/go-cache" @@ -73,21 +70,6 @@ func init() { }) } -// acceptSighupSignal registers for SIGHUP signal from the OS to reload the configuration files. -func acceptSighupSignal() { - c := make(chan os.Signal, 1) - - signal.Notify(c, syscall.SIGHUP) - go func() { - for range c { - log.Infof("Received SIGHUP. Reloading configuration") - _ = inst.AuditOperation("reload-configuration", "", "Triggered via SIGHUP") - config.Reload() - discoveryMetrics.SetExpirePeriod(time.Duration(config.DiscoveryCollectionRetentionSeconds) * time.Second) - } - }() -} - // closeVTOrc runs all the operations required to cleanly shutdown VTOrc func closeVTOrc() { log.Infof("Starting VTOrc shutdown") @@ -272,7 +254,7 @@ func ContinuousDiscovery() { healthTick := time.Tick(config.HealthPollSeconds * time.Second) caretakingTick := time.Tick(time.Minute) - recoveryTick := time.Tick(time.Duration(config.Config.RecoveryPollSeconds) * time.Second) + recoveryTick := time.Tick(config.GetRecoveryPollDuration()) tabletTopoTick := OpenTabletDiscovery() var recoveryEntrance int64 var snapshotTopologiesTick <-chan time.Time @@ -283,7 +265,6 @@ func ContinuousDiscovery() { go func() { _ = ometrics.InitMetrics() }() - go acceptSighupSignal() // On termination of the server, we should close VTOrc cleanly servenv.OnTermSync(closeVTOrc) From 2121e16526c85ea9977eca71ab6630a3bfba59cc Mon Sep 17 00:00:00 2001 From: Manan Gupta Date: Tue, 12 Nov 2024 17:23:22 +0530 Subject: [PATCH 08/18] feat: add api to read config and add a test for dynamic configuration Signed-off-by: Manan Gupta --- go/test/endtoend/cluster/vtorc_process.go | 5 +++++ go/test/endtoend/vtorc/api/api_test.go | 17 +++++++++++++++++ go/vt/vtorc/config/config_test.go | 17 ----------------- go/vt/vtorc/server/api.go | 18 +++++++++++++++++- go/vt/vtorc/server/api_test.go | 3 +++ 5 files changed, 42 insertions(+), 18 deletions(-) delete mode 100644 go/vt/vtorc/config/config_test.go diff --git a/go/test/endtoend/cluster/vtorc_process.go b/go/test/endtoend/cluster/vtorc_process.go index 6ac39024dca..60a1ff603f9 100644 --- a/go/test/endtoend/cluster/vtorc_process.go +++ b/go/test/endtoend/cluster/vtorc_process.go @@ -50,6 +50,7 @@ type VTOrcProcess struct { type VTOrcConfiguration struct { InstancePollTime string `json:",omitempty"` + SnapshotTopologyInterval string `json:",omitempty"` PreventCrossCellFailover bool `json:",omitempty"` LockShardTimeoutSeconds int `json:",omitempty"` ReplicationLagQuery string `json:",omitempty"` @@ -66,6 +67,10 @@ func (config *VTOrcConfiguration) AddDefaults(webPort int) { config.InstancePollTime = "10h" } +func (orc *VTOrcProcess) RewriteConfiguration() error { + return os.WriteFile(orc.ConfigPath, []byte(orc.Config.ToJSONString()), 0644) +} + // Setup starts orc process with required arguements func (orc *VTOrcProcess) Setup() (err error) { diff --git a/go/test/endtoend/vtorc/api/api_test.go b/go/test/endtoend/vtorc/api/api_test.go index 638ea5fa72e..eb62a8655c0 100644 --- a/go/test/endtoend/vtorc/api/api_test.go +++ b/go/test/endtoend/vtorc/api/api_test.go @@ -21,6 +21,7 @@ import ( "fmt" "math" "reflect" + "strings" "testing" "time" @@ -90,6 +91,22 @@ func TestAPIEndpoints(t *testing.T) { return response != "null" }) + t.Run("Dynamic Configuration", func(t *testing.T) { + // Get configuration and verify the output. + status, resp, err := utils.MakeAPICall(t, vtorc, "/api/config") + require.NoError(t, err) + assert.Equal(t, 200, status) + assert.Contains(t, resp, `"snapshottopologyinterval": 0`) + // Update configuration and verify the output. + vtorc.Config.SnapshotTopologyInterval = "10h" + err = vtorc.RewriteConfiguration() + assert.NoError(t, err) + // Wait until the config has been updated and seen. + utils.MakeAPICallRetry(t, vtorc, "/api/config", func(_ int, response string) bool { + return !strings.Contains(response, `"snapshottopologyinterval": "10h"`) + }) + }) + t.Run("Database State", func(t *testing.T) { // Get database state status, resp, err := utils.MakeAPICall(t, vtorc, "/api/database-state") diff --git a/go/vt/vtorc/config/config_test.go b/go/vt/vtorc/config/config_test.go deleted file mode 100644 index 69abc75cde4..00000000000 --- a/go/vt/vtorc/config/config_test.go +++ /dev/null @@ -1,17 +0,0 @@ -/* -Copyright 2022 The Vitess Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package config diff --git a/go/vt/vtorc/server/api.go b/go/vt/vtorc/server/api.go index 5e9a84c0a29..177f2c80333 100644 --- a/go/vt/vtorc/server/api.go +++ b/go/vt/vtorc/server/api.go @@ -25,6 +25,7 @@ import ( "time" "vitess.io/vitess/go/acl" + "vitess.io/vitess/go/viperutil/debug" "vitess.io/vitess/go/vt/servenv" "vitess.io/vitess/go/vt/vtorc/collection" "vitess.io/vitess/go/vt/vtorc/discovery" @@ -46,6 +47,7 @@ const ( enableGlobalRecoveriesAPI = "/api/enable-global-recoveries" replicationAnalysisAPI = "/api/replication-analysis" databaseStateAPI = "/api/database-state" + configAPI = "/api/config" healthAPI = "/debug/health" AggregatedDiscoveryMetricsAPI = "/api/aggregated-discovery-metrics" @@ -62,6 +64,7 @@ var ( enableGlobalRecoveriesAPI, replicationAnalysisAPI, databaseStateAPI, + configAPI, healthAPI, AggregatedDiscoveryMetricsAPI, } @@ -90,6 +93,8 @@ func (v *vtorcAPI) ServeHTTP(response http.ResponseWriter, request *http.Request replicationAnalysisAPIHandler(response, request) case databaseStateAPI: databaseStateAPIHandler(response) + case configAPI: + configAPIHandler(response) case AggregatedDiscoveryMetricsAPI: AggregatedDiscoveryMetricsAPIHandler(response, request) default: @@ -106,7 +111,7 @@ func getACLPermissionLevelForAPI(apiEndpoint string) string { return acl.MONITORING case disableGlobalRecoveriesAPI, enableGlobalRecoveriesAPI: return acl.ADMIN - case replicationAnalysisAPI: + case replicationAnalysisAPI, configAPI: return acl.MONITORING case healthAPI, databaseStateAPI: return acl.MONITORING @@ -180,6 +185,17 @@ func databaseStateAPIHandler(response http.ResponseWriter) { writePlainTextResponse(response, ds, http.StatusOK) } +// configAPIHandler is the handler for the configAPI endpoint +func configAPIHandler(response http.ResponseWriter) { + settingsMap := debug.AllSettings() + jsonOut, err := json.MarshalIndent(settingsMap, "", "\t") + if err != nil { + http.Error(response, err.Error(), http.StatusInternalServerError) + return + } + writePlainTextResponse(response, string(jsonOut), http.StatusOK) +} + // AggregatedDiscoveryMetricsAPIHandler is the handler for the discovery metrics endpoint func AggregatedDiscoveryMetricsAPIHandler(response http.ResponseWriter, request *http.Request) { // return metrics for last x seconds diff --git a/go/vt/vtorc/server/api_test.go b/go/vt/vtorc/server/api_test.go index c352d1e600f..ab6b9eed9af 100644 --- a/go/vt/vtorc/server/api_test.go +++ b/go/vt/vtorc/server/api_test.go @@ -31,6 +31,9 @@ func TestGetACLPermissionLevelForAPI(t *testing.T) { }, { apiEndpoint: healthAPI, want: acl.MONITORING, + }, { + apiEndpoint: configAPI, + want: acl.MONITORING, }, { apiEndpoint: "gibberish", want: acl.ADMIN, From 0420ce597aae55fb46486f6acfb1ece52a5d9e3f Mon Sep 17 00:00:00 2001 From: Manan Gupta Date: Tue, 12 Nov 2024 17:32:28 +0530 Subject: [PATCH 09/18] feat: add summary changes Signed-off-by: Manan Gupta --- changelog/22.0/22.0.0/summary.md | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/changelog/22.0/22.0.0/summary.md b/changelog/22.0/22.0.0/summary.md index 6846913bed5..5e8eb7b7c8f 100644 --- a/changelog/22.0/22.0.0/summary.md +++ b/changelog/22.0/22.0.0/summary.md @@ -4,6 +4,7 @@ - **[Major Changes](#major-changes)** - **[RPC Changes](#rpc-changes)** + - **[VTOrc Config File Changes](#vtorc-config-file-changes)** ## Major Changes @@ -13,3 +14,22 @@ These are the RPC changes made in this release - 1. `GetTransactionInfo` RPC has been added to both `VtctldServer`, and `TabletManagerClient` interface. These RPCs are used to fecilitate the users in reading the state of an unresolved distributed transaction. This can be useful in debugging what went wrong and how to fix the problem. + +### VTOrc Config File Changes + +The configuration file for VTOrc has been updated to now support dynamic fields. The old `--config` parameter has been removed. The alternative is to use the `--config-file` parameter. The configuration can now be provided in both json, yaml or any other format that [viper](https://github.com/spf13/viper) supports. + +The following fields can be dynamically changed - +1. `InstancePollTime` +2. `PreventCrossCellFailover` +3. `SnapshotTopologyInterval` +4. `ReasonableReplicationLag` +5. `AuditToBackend` +6. `AuditToSyslog` +7. `AuditPurgeDuration` +8. `WaitReplicasTimeout` +9. `TolerableReplicationLag` +10. `TopoInformationRefreshDuration` +11. `RecoveryPollDuration` + +To upgrade to the newer version of the configuration file, the users can first change to using the flags in the previous release before upgrading. They can then revert to using the configuration file in the newer release. From 897ab1065ebfe46eaaa16405b17fb8a8b6a5bed5 Mon Sep 17 00:00:00 2001 From: Manan Gupta Date: Wed, 13 Nov 2024 09:26:38 +0530 Subject: [PATCH 10/18] feat: fix vtorc config in local example Signed-off-by: Manan Gupta --- examples/common/scripts/vtorc-up.sh | 2 +- examples/common/vtorc/config.json | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/examples/common/scripts/vtorc-up.sh b/examples/common/scripts/vtorc-up.sh index 23ca4e62b48..6660f2a8077 100755 --- a/examples/common/scripts/vtorc-up.sh +++ b/examples/common/scripts/vtorc-up.sh @@ -11,7 +11,7 @@ vtorc \ $TOPOLOGY_FLAGS \ --logtostderr \ --alsologtostderr \ - --config="${script_dir}/../vtorc/config.json" \ + --config-path="${script_dir}/../vtorc/config.json" \ --port $port \ > "${log_dir}/vtorc.out" 2>&1 & diff --git a/examples/common/vtorc/config.json b/examples/common/vtorc/config.json index 53b012c2162..931d7effbe2 100644 --- a/examples/common/vtorc/config.json +++ b/examples/common/vtorc/config.json @@ -1,4 +1,3 @@ { - "RecoveryPeriodBlockSeconds": 1, - "InstancePollSeconds": 1 + "InstancePollTime": "1s" } \ No newline at end of file From d20979bfd9253bb82c3e14a8a01f6f90345f5717 Mon Sep 17 00:00:00 2001 From: Manan Gupta Date: Thu, 21 Nov 2024 20:04:50 +0530 Subject: [PATCH 11/18] feat: change way we use config in the e2e tests Signed-off-by: Manan Gupta --- examples/common/scripts/vtorc-up.sh | 4 +++- examples/common/vtorc/config.json | 3 --- examples/common/vtorc/config.yaml | 1 + 3 files changed, 4 insertions(+), 4 deletions(-) delete mode 100644 examples/common/vtorc/config.json create mode 100644 examples/common/vtorc/config.yaml diff --git a/examples/common/scripts/vtorc-up.sh b/examples/common/scripts/vtorc-up.sh index 6660f2a8077..807f522b1f7 100755 --- a/examples/common/scripts/vtorc-up.sh +++ b/examples/common/scripts/vtorc-up.sh @@ -11,7 +11,9 @@ vtorc \ $TOPOLOGY_FLAGS \ --logtostderr \ --alsologtostderr \ - --config-path="${script_dir}/../vtorc/config.json" \ + --config-path="${script_dir}/../vtorc/" \ + --config-name="config.yaml" \ + --config-type="yml" \ --port $port \ > "${log_dir}/vtorc.out" 2>&1 & diff --git a/examples/common/vtorc/config.json b/examples/common/vtorc/config.json deleted file mode 100644 index 931d7effbe2..00000000000 --- a/examples/common/vtorc/config.json +++ /dev/null @@ -1,3 +0,0 @@ -{ - "InstancePollTime": "1s" -} \ No newline at end of file diff --git a/examples/common/vtorc/config.yaml b/examples/common/vtorc/config.yaml new file mode 100644 index 00000000000..e615f6814f4 --- /dev/null +++ b/examples/common/vtorc/config.yaml @@ -0,0 +1 @@ +InstancePollTime: 1s From d5923cdbb6e7c9d5060051beba1b64ff9666a0f0 Mon Sep 17 00:00:00 2001 From: Manan Gupta Date: Thu, 21 Nov 2024 20:15:09 +0530 Subject: [PATCH 12/18] feat: add two more fields to viper Signed-off-by: Manan Gupta --- changelog/22.0/22.0.0/summary.md | 4 +++- go/vt/vtorc/config/config.go | 35 +++++++++++++++++++++++--------- 2 files changed, 28 insertions(+), 11 deletions(-) diff --git a/changelog/22.0/22.0.0/summary.md b/changelog/22.0/22.0.0/summary.md index 5e8eb7b7c8f..d382dc7df4e 100644 --- a/changelog/22.0/22.0.0/summary.md +++ b/changelog/22.0/22.0.0/summary.md @@ -31,5 +31,7 @@ The following fields can be dynamically changed - 9. `TolerableReplicationLag` 10. `TopoInformationRefreshDuration` 11. `RecoveryPollDuration` +12. `AllowEmergencyReparent` +13. `ChangeTabletsWithErrantGtidToDrained` -To upgrade to the newer version of the configuration file, the users can first change to using the flags in the previous release before upgrading. They can then revert to using the configuration file in the newer release. +To upgrade to the newer version of the configuration file, first switch to using the flags in your current deployment before upgrading. Then you can switch to using the configuration file in the newer release. diff --git a/go/vt/vtorc/config/config.go b/go/vt/vtorc/config/config.go index 47e4c1e8326..e4d445bfe69 100644 --- a/go/vt/vtorc/config/config.go +++ b/go/vt/vtorc/config/config.go @@ -158,11 +158,24 @@ var ( Dynamic: true, }, ) -) -var ( - ersEnabled = true - convertTabletsWithErrantGTIDs = false + ersEnabled = viperutil.Configure( + "AllowEmergencyReparent", + viperutil.Options[bool]{ + FlagName: "allow-emergency-reparent", + Default: true, + Dynamic: true, + }, + ) + + convertTabletsWithErrantGTIDs = viperutil.Configure( + "ChangeTabletsWithErrantGtidToDrained", + viperutil.Options[bool]{ + FlagName: "change-tablets-with-errant-gtid-to-drained", + Default: false, + Dynamic: true, + }, + ) ) func init() { @@ -184,8 +197,8 @@ func registerFlags(fs *pflag.FlagSet) { fs.Duration("tolerable-replication-lag", tolerableReplicationLag.Default(), "Amount of replication lag that is considered acceptable for a tablet to be eligible for promotion when Vitess makes the choice of a new primary in PRS") fs.Duration("topo-information-refresh-duration", topoInformationRefreshDuration.Default(), "Timer duration on which VTOrc refreshes the keyspace and vttablet records from the topology server") fs.Duration("recovery-poll-duration", recoveryPollDuration.Default(), "Timer duration on which VTOrc polls its database to run a recovery") - fs.BoolVar(&ersEnabled, "allow-emergency-reparent", ersEnabled, "Whether VTOrc should be allowed to run emergency reparent operation when it detects a dead primary") - fs.BoolVar(&convertTabletsWithErrantGTIDs, "change-tablets-with-errant-gtid-to-drained", convertTabletsWithErrantGTIDs, "Whether VTOrc should be changing the type of tablets with errant GTIDs to DRAINED") + fs.Bool("allow-emergency-reparent", ersEnabled.Default(), "Whether VTOrc should be allowed to run emergency reparent operation when it detects a dead primary") + fs.Bool("change-tablets-with-errant-gtid-to-drained", convertTabletsWithErrantGTIDs.Default(), "Whether VTOrc should be changing the type of tablets with errant GTIDs to DRAINED") viperutil.BindFlags(fs, instancePollTime, @@ -201,6 +214,8 @@ func registerFlags(fs *pflag.FlagSet) { tolerableReplicationLag, topoInformationRefreshDuration, recoveryPollDuration, + ersEnabled, + convertTabletsWithErrantGTIDs, ) } @@ -301,22 +316,22 @@ func GetRecoveryPollDuration() time.Duration { // ERSEnabled reports whether VTOrc is allowed to run ERS or not. func ERSEnabled() bool { - return ersEnabled + return ersEnabled.Get() } // SetERSEnabled sets the value for the ersEnabled variable. This should only be used from tests. func SetERSEnabled(val bool) { - ersEnabled = val + ersEnabled.Set(val) } // ConvertTabletWithErrantGTIDs reports whether VTOrc is allowed to change the tablet type of tablets with errant GTIDs to DRAINED. func ConvertTabletWithErrantGTIDs() bool { - return convertTabletsWithErrantGTIDs + return convertTabletsWithErrantGTIDs.Get() } // SetConvertTabletWithErrantGTIDs sets the value for the convertTabletWithErrantGTIDs variable. This should only be used from tests. func SetConvertTabletWithErrantGTIDs(val bool) { - convertTabletsWithErrantGTIDs = val + convertTabletsWithErrantGTIDs.Set(val) } // LogConfigValues is used to log the config values. From 682275279c009f0dc2a76371594ba7962d323182 Mon Sep 17 00:00:00 2001 From: Manan Gupta Date: Fri, 22 Nov 2024 14:41:53 +0530 Subject: [PATCH 13/18] feat: fix flag in config Signed-off-by: Manan Gupta --- go/vt/vtorc/config/config.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/go/vt/vtorc/config/config.go b/go/vt/vtorc/config/config.go index e4d445bfe69..2a2d12bb0ef 100644 --- a/go/vt/vtorc/config/config.go +++ b/go/vt/vtorc/config/config.go @@ -117,7 +117,7 @@ var ( auditPurgeDuration = viperutil.Configure( "AuditPurgeDuration", viperutil.Options[time.Duration]{ - FlagName: "reasonable-replication-lag", + FlagName: "audit-purge-duration", Default: 7 * 24 * time.Hour, Dynamic: true, }, From 8b1e52ba1e0a02828eade80f1dfada05aad74b9e Mon Sep 17 00:00:00 2001 From: Manan Gupta Date: Sun, 24 Nov 2024 13:06:06 +0530 Subject: [PATCH 14/18] test: start adding dynamic config tests in e2e fashion Signed-off-by: Manan Gupta --- go/test/endtoend/cluster/vtorc_process.go | 17 ++-- go/test/endtoend/vtorc/api/api_test.go | 17 ---- go/test/endtoend/vtorc/api/config_test.go | 94 +++++++++++++++++++++++ go/test/endtoend/vtorc/utils/utils.go | 4 +- 4 files changed, 108 insertions(+), 24 deletions(-) create mode 100644 go/test/endtoend/vtorc/api/config_test.go diff --git a/go/test/endtoend/cluster/vtorc_process.go b/go/test/endtoend/cluster/vtorc_process.go index 60a1ff603f9..444350aa7a7 100644 --- a/go/test/endtoend/cluster/vtorc_process.go +++ b/go/test/endtoend/cluster/vtorc_process.go @@ -43,6 +43,7 @@ type VTOrcProcess struct { ExtraArgs []string ConfigPath string Config VTOrcConfiguration + NoOverride bool WebPort int proc *exec.Cmd exit chan error @@ -89,7 +90,9 @@ func (orc *VTOrcProcess) Setup() (err error) { orc.ConfigPath = configFile.Name() // Add the default configurations and print them out - orc.Config.AddDefaults(orc.WebPort) + if !orc.NoOverride { + orc.Config.AddDefaults(orc.WebPort) + } log.Errorf("configuration - %v", orc.Config.ToJSONString()) _, err = configFile.WriteString(orc.Config.ToJSONString()) if err != nil { @@ -111,12 +114,16 @@ func (orc *VTOrcProcess) Setup() (err error) { "--topo_global_root", orc.TopoGlobalRoot, "--config-file", orc.ConfigPath, "--port", fmt.Sprintf("%d", orc.Port), - // This parameter is overriden from the config file. This verifies that we indeed use the flag value over the config file. - "--instance-poll-time", "1s", - // Faster topo information refresh speeds up the tests. This doesn't add any significant load either. - "--topo-information-refresh-duration", "3s", "--bind-address", "127.0.0.1", ) + if !orc.NoOverride { + orc.proc.Args = append(orc.proc.Args, + // This parameter is overriden from the config file. This verifies that we indeed use the flag value over the config file. + "--instance-poll-time", "1s", + // Faster topo information refresh speeds up the tests. This doesn't add any significant load either. + "--topo-information-refresh-duration", "3s", + ) + } if *isCoverage { orc.proc.Args = append(orc.proc.Args, "--test.coverprofile="+getCoveragePath("orc.out")) diff --git a/go/test/endtoend/vtorc/api/api_test.go b/go/test/endtoend/vtorc/api/api_test.go index eb62a8655c0..638ea5fa72e 100644 --- a/go/test/endtoend/vtorc/api/api_test.go +++ b/go/test/endtoend/vtorc/api/api_test.go @@ -21,7 +21,6 @@ import ( "fmt" "math" "reflect" - "strings" "testing" "time" @@ -91,22 +90,6 @@ func TestAPIEndpoints(t *testing.T) { return response != "null" }) - t.Run("Dynamic Configuration", func(t *testing.T) { - // Get configuration and verify the output. - status, resp, err := utils.MakeAPICall(t, vtorc, "/api/config") - require.NoError(t, err) - assert.Equal(t, 200, status) - assert.Contains(t, resp, `"snapshottopologyinterval": 0`) - // Update configuration and verify the output. - vtorc.Config.SnapshotTopologyInterval = "10h" - err = vtorc.RewriteConfiguration() - assert.NoError(t, err) - // Wait until the config has been updated and seen. - utils.MakeAPICallRetry(t, vtorc, "/api/config", func(_ int, response string) bool { - return !strings.Contains(response, `"snapshottopologyinterval": "10h"`) - }) - }) - t.Run("Database State", func(t *testing.T) { // Get database state status, resp, err := utils.MakeAPICall(t, vtorc, "/api/database-state") diff --git a/go/test/endtoend/vtorc/api/config_test.go b/go/test/endtoend/vtorc/api/config_test.go new file mode 100644 index 00000000000..9d4dc847ced --- /dev/null +++ b/go/test/endtoend/vtorc/api/config_test.go @@ -0,0 +1,94 @@ +/* +Copyright 2024 The Vitess Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +*/ + +package api + +import ( + "strings" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "vitess.io/vitess/go/test/endtoend/cluster" + "vitess.io/vitess/go/test/endtoend/vtorc/utils" +) + +// TestDynamicConfigs tests the dyanamic configurations that VTOrc offers. +func TestDynamicConfigs(t *testing.T) { + defer cluster.PanicHandler(t) + utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 2, 1, nil, cluster.VTOrcConfiguration{}, 1, "") + vtorc := clusterInfo.ClusterInstance.VTOrcProcesses[0] + + // Restart VTOrc without any flag overrides so that all the configurations can be tested. + err := vtorc.TearDown() + require.NoError(t, err) + vtorc.Config = cluster.VTOrcConfiguration{} + vtorc.NoOverride = true + err = vtorc.Setup() + require.NoError(t, err) + + // Call API with retry to ensure VTOrc is up + status, resp := utils.MakeAPICallRetry(t, vtorc, "/debug/health", func(code int, response string) bool { + return code != 200 + }) + // Verify when VTOrc is healthy, it has also run the first discovery. + assert.Equal(t, 200, status) + assert.Contains(t, resp, `"Healthy": true,`) + + t.Run("InstancePollTime", func(t *testing.T) { + // Get configuration and verify the output. + waitForConfig(t, vtorc, `"instance-poll-time": 5000000000`) + // Update configuration and verify the output. + vtorc.Config.InstancePollTime = "10h" + err := vtorc.RewriteConfiguration() + assert.NoError(t, err) + // Wait until the config has been updated and seen. + waitForConfig(t, vtorc, `"instance-poll-time": "10h"`) + }) + + t.Run("PreventCrossCellFailover", func(t *testing.T) { + // Get configuration and verify the output. + waitForConfig(t, vtorc, `"prevent-cross-cell-failover": false`) + // Update configuration and verify the output. + vtorc.Config.PreventCrossCellFailover = true + err := vtorc.RewriteConfiguration() + assert.NoError(t, err) + // Wait until the config has been updated and seen. + waitForConfig(t, vtorc, `"prevent-cross-cell-failover": true`) + }) + + t.Run("SnapshotTopologyInterval", func(t *testing.T) { + // Get configuration and verify the output. + waitForConfig(t, vtorc, `"snapshot-topology-interval": 0`) + // Update configuration and verify the output. + vtorc.Config.SnapshotTopologyInterval = "10h" + err := vtorc.RewriteConfiguration() + assert.NoError(t, err) + // Wait until the config has been updated and seen. + waitForConfig(t, vtorc, `"snapshot-topology-interval": "10h"`) + }) +} + +// waitForConfig waits for the expectedConfig to be present in the VTOrc configuration. +func waitForConfig(t *testing.T, vtorc *cluster.VTOrcProcess, expectedConfig string) { + t.Helper() + status, _ := utils.MakeAPICallRetry(t, vtorc, "/api/config", func(_ int, response string) bool { + return !strings.Contains(response, expectedConfig) + }) + require.EqualValues(t, 200, status) +} diff --git a/go/test/endtoend/vtorc/utils/utils.go b/go/test/endtoend/vtorc/utils/utils.go index b00d618079b..456d55518dd 100644 --- a/go/test/endtoend/vtorc/utils/utils.go +++ b/go/test/endtoend/vtorc/utils/utils.go @@ -774,10 +774,10 @@ func MakeAPICallRetry(t *testing.T, vtorc *cluster.VTOrcProcess, url string, ret for { select { case <-timeout: - t.Fatal("timed out waiting for api to work") + t.Fatalf("timed out waiting for api to work. Last response - %s", response) return default: - status, response, _ := MakeAPICall(t, vtorc, url) + status, response, _ = MakeAPICall(t, vtorc, url) if retry(status, response) { time.Sleep(1 * time.Second) break From 3a422fa5f22204956f1427c61d2e11c41091dd5a Mon Sep 17 00:00:00 2001 From: Manan Gupta Date: Sun, 24 Nov 2024 13:28:57 +0530 Subject: [PATCH 15/18] test: add all the remaining configs to the test Signed-off-by: Manan Gupta --- go/test/endtoend/cluster/cluster_process.go | 1 - go/test/endtoend/cluster/vtorc_process.go | 27 +++-- go/test/endtoend/vtorc/api/config_test.go | 110 ++++++++++++++++++++ 3 files changed, 128 insertions(+), 10 deletions(-) diff --git a/go/test/endtoend/cluster/cluster_process.go b/go/test/endtoend/cluster/cluster_process.go index 6f800a70a49..b89e007b4f2 100644 --- a/go/test/endtoend/cluster/cluster_process.go +++ b/go/test/endtoend/cluster/cluster_process.go @@ -1301,7 +1301,6 @@ func (cluster *LocalProcessCluster) NewVTOrcProcess(config VTOrcConfiguration) * VtctlProcess: *base, LogDir: cluster.TmpDirectory, Config: config, - WebPort: cluster.GetAndReservePort(), Port: cluster.GetAndReservePort(), } } diff --git a/go/test/endtoend/cluster/vtorc_process.go b/go/test/endtoend/cluster/vtorc_process.go index 444350aa7a7..bc099b8bfe0 100644 --- a/go/test/endtoend/cluster/vtorc_process.go +++ b/go/test/endtoend/cluster/vtorc_process.go @@ -44,18 +44,27 @@ type VTOrcProcess struct { ConfigPath string Config VTOrcConfiguration NoOverride bool - WebPort int proc *exec.Cmd exit chan error } type VTOrcConfiguration struct { - InstancePollTime string `json:",omitempty"` - SnapshotTopologyInterval string `json:",omitempty"` - PreventCrossCellFailover bool `json:",omitempty"` - LockShardTimeoutSeconds int `json:",omitempty"` - ReplicationLagQuery string `json:",omitempty"` - FailPrimaryPromotionOnLagMinutes int `json:",omitempty"` + InstancePollTime string `json:",omitempty"` + SnapshotTopologyInterval string `json:",omitempty"` + PreventCrossCellFailover bool `json:",omitempty"` + ReasonableReplicationLag string `json:",omitempty"` + AuditToBackend bool `json:",omitempty"` + AuditToSyslog bool `json:",omitempty"` + AuditPurgeDuration string `json:",omitempty"` + WaitReplicasTimeout string `json:",omitempty"` + TolerableReplicationLag string `json:",omitempty"` + TopoInformationRefreshDuration string `json:",omitempty"` + RecoveryPollDuration string `json:",omitempty"` + AllowEmergencyReparent string `json:",omitempty"` + ChangeTabletsWithErrantGtidToDrained bool `json:",omitempty"` + LockShardTimeoutSeconds int `json:",omitempty"` + ReplicationLagQuery string `json:",omitempty"` + FailPrimaryPromotionOnLagMinutes int `json:",omitempty"` } // ToJSONString will marshal this configuration as JSON @@ -64,7 +73,7 @@ func (config *VTOrcConfiguration) ToJSONString() string { return string(b) } -func (config *VTOrcConfiguration) AddDefaults(webPort int) { +func (config *VTOrcConfiguration) addValuesToCheckOverride() { config.InstancePollTime = "10h" } @@ -91,7 +100,7 @@ func (orc *VTOrcProcess) Setup() (err error) { // Add the default configurations and print them out if !orc.NoOverride { - orc.Config.AddDefaults(orc.WebPort) + orc.Config.addValuesToCheckOverride() } log.Errorf("configuration - %v", orc.Config.ToJSONString()) _, err = configFile.WriteString(orc.Config.ToJSONString()) diff --git a/go/test/endtoend/vtorc/api/config_test.go b/go/test/endtoend/vtorc/api/config_test.go index 9d4dc847ced..71cc6291be7 100644 --- a/go/test/endtoend/vtorc/api/config_test.go +++ b/go/test/endtoend/vtorc/api/config_test.go @@ -82,6 +82,116 @@ func TestDynamicConfigs(t *testing.T) { // Wait until the config has been updated and seen. waitForConfig(t, vtorc, `"snapshot-topology-interval": "10h"`) }) + + t.Run("ReasonableReplicationLag", func(t *testing.T) { + // Get configuration and verify the output. + waitForConfig(t, vtorc, `"reasonable-replication-lag": 10000000000`) + // Update configuration and verify the output. + vtorc.Config.ReasonableReplicationLag = "10h" + err := vtorc.RewriteConfiguration() + assert.NoError(t, err) + // Wait until the config has been updated and seen. + waitForConfig(t, vtorc, `"reasonable-replication-lag": "10h"`) + }) + + t.Run("AuditToBackend", func(t *testing.T) { + // Get configuration and verify the output. + waitForConfig(t, vtorc, `"audit-to-backend": false`) + // Update configuration and verify the output. + vtorc.Config.AuditToBackend = true + err := vtorc.RewriteConfiguration() + assert.NoError(t, err) + // Wait until the config has been updated and seen. + waitForConfig(t, vtorc, `"audit-to-backend": true`) + }) + + t.Run("AuditToSyslog", func(t *testing.T) { + // Get configuration and verify the output. + waitForConfig(t, vtorc, `"audit-to-syslog": false`) + // Update configuration and verify the output. + vtorc.Config.AuditToSyslog = true + err := vtorc.RewriteConfiguration() + assert.NoError(t, err) + // Wait until the config has been updated and seen. + waitForConfig(t, vtorc, `"audit-to-syslog": true`) + }) + + t.Run("AuditPurgeDuration", func(t *testing.T) { + // Get configuration and verify the output. + waitForConfig(t, vtorc, `"audit-purge-duration": 604800000000000`) + // Update configuration and verify the output. + vtorc.Config.AuditPurgeDuration = "10h" + err := vtorc.RewriteConfiguration() + assert.NoError(t, err) + // Wait until the config has been updated and seen. + waitForConfig(t, vtorc, `"audit-purge-duration": "10h"`) + }) + + t.Run("WaitReplicasTimeout", func(t *testing.T) { + // Get configuration and verify the output. + waitForConfig(t, vtorc, `"wait-replicas-timeout": 30000000000`) + // Update configuration and verify the output. + vtorc.Config.WaitReplicasTimeout = "10h" + err := vtorc.RewriteConfiguration() + assert.NoError(t, err) + // Wait until the config has been updated and seen. + waitForConfig(t, vtorc, `"wait-replicas-timeout": "10h"`) + }) + + t.Run("TolerableReplicationLag", func(t *testing.T) { + // Get configuration and verify the output. + waitForConfig(t, vtorc, `"tolerable-replication-lag": 0`) + // Update configuration and verify the output. + vtorc.Config.TolerableReplicationLag = "10h" + err := vtorc.RewriteConfiguration() + assert.NoError(t, err) + // Wait until the config has been updated and seen. + waitForConfig(t, vtorc, `"tolerable-replication-lag": "10h"`) + }) + + t.Run("TopoInformationRefreshDuration", func(t *testing.T) { + // Get configuration and verify the output. + waitForConfig(t, vtorc, `"topo-information-refresh-duration": 15000000000`) + // Update configuration and verify the output. + vtorc.Config.TopoInformationRefreshDuration = "10h" + err := vtorc.RewriteConfiguration() + assert.NoError(t, err) + // Wait until the config has been updated and seen. + waitForConfig(t, vtorc, `"topo-information-refresh-duration": "10h"`) + }) + + t.Run("RecoveryPollDuration", func(t *testing.T) { + // Get configuration and verify the output. + waitForConfig(t, vtorc, `"recovery-poll-duration": 1000000000`) + // Update configuration and verify the output. + vtorc.Config.RecoveryPollDuration = "10h" + err := vtorc.RewriteConfiguration() + assert.NoError(t, err) + // Wait until the config has been updated and seen. + waitForConfig(t, vtorc, `"recovery-poll-duration": "10h"`) + }) + + t.Run("AllowEmergencyReparent", func(t *testing.T) { + // Get configuration and verify the output. + waitForConfig(t, vtorc, `"allow-emergency-reparent": true`) + // Update configuration and verify the output. + vtorc.Config.AllowEmergencyReparent = "false" + err := vtorc.RewriteConfiguration() + assert.NoError(t, err) + // Wait until the config has been updated and seen. + waitForConfig(t, vtorc, `"allow-emergency-reparent": "false"`) + }) + + t.Run("ChangeTabletsWithErrantGtidToDrained", func(t *testing.T) { + // Get configuration and verify the output. + waitForConfig(t, vtorc, `"change-tablets-with-errant-gtid-to-drained": false`) + // Update configuration and verify the output. + vtorc.Config.ChangeTabletsWithErrantGtidToDrained = true + err := vtorc.RewriteConfiguration() + assert.NoError(t, err) + // Wait until the config has been updated and seen. + waitForConfig(t, vtorc, `"change-tablets-with-errant-gtid-to-drained": true`) + }) } // waitForConfig waits for the expectedConfig to be present in the VTOrc configuration. From ed96f7211ccbfe8516d383ceeef5eeb22398b8fc Mon Sep 17 00:00:00 2001 From: Manan Gupta Date: Mon, 25 Nov 2024 11:02:26 +0530 Subject: [PATCH 16/18] feat: change the keys of the viper configs to match the flag names Signed-off-by: Manan Gupta --- go/test/endtoend/cluster/vtorc_process.go | 26 ++++++++++---------- go/vt/vtorc/config/config.go | 30 +++++++++++------------ 2 files changed, 28 insertions(+), 28 deletions(-) diff --git a/go/test/endtoend/cluster/vtorc_process.go b/go/test/endtoend/cluster/vtorc_process.go index bc099b8bfe0..af101a8bebd 100644 --- a/go/test/endtoend/cluster/vtorc_process.go +++ b/go/test/endtoend/cluster/vtorc_process.go @@ -49,19 +49,19 @@ type VTOrcProcess struct { } type VTOrcConfiguration struct { - InstancePollTime string `json:",omitempty"` - SnapshotTopologyInterval string `json:",omitempty"` - PreventCrossCellFailover bool `json:",omitempty"` - ReasonableReplicationLag string `json:",omitempty"` - AuditToBackend bool `json:",omitempty"` - AuditToSyslog bool `json:",omitempty"` - AuditPurgeDuration string `json:",omitempty"` - WaitReplicasTimeout string `json:",omitempty"` - TolerableReplicationLag string `json:",omitempty"` - TopoInformationRefreshDuration string `json:",omitempty"` - RecoveryPollDuration string `json:",omitempty"` - AllowEmergencyReparent string `json:",omitempty"` - ChangeTabletsWithErrantGtidToDrained bool `json:",omitempty"` + InstancePollTime string `json:"instance-poll-time,omitempty"` + SnapshotTopologyInterval string `json:"snapshot-topology-interval,omitempty"` + PreventCrossCellFailover bool `json:"prevent-cross-cell-failover,omitempty"` + ReasonableReplicationLag string `json:"reasonable-replication-lag,omitempty"` + AuditToBackend bool `json:"audit-to-backend,omitempty"` + AuditToSyslog bool `json:"audit-to-syslog,omitempty"` + AuditPurgeDuration string `json:"audit-purge-duration,omitempty"` + WaitReplicasTimeout string `json:"wait-replicas-timeout,omitempty"` + TolerableReplicationLag string `json:"tolerable-replication-lag,omitempty"` + TopoInformationRefreshDuration string `json:"topo-information-refresh-duration,omitempty"` + RecoveryPollDuration string `json:"recovery-poll-duration,omitempty"` + AllowEmergencyReparent string `json:"allow-emergency-reparent,omitempty"` + ChangeTabletsWithErrantGtidToDrained bool `json:"change-tablets-with-errant-gtid-to-drained,omitempty"` LockShardTimeoutSeconds int `json:",omitempty"` ReplicationLagQuery string `json:",omitempty"` FailPrimaryPromotionOnLagMinutes int `json:",omitempty"` diff --git a/go/vt/vtorc/config/config.go b/go/vt/vtorc/config/config.go index 2a2d12bb0ef..4aa620f753d 100644 --- a/go/vt/vtorc/config/config.go +++ b/go/vt/vtorc/config/config.go @@ -43,7 +43,7 @@ const ( var ( instancePollTime = viperutil.Configure( - "InstancePollTime", + "instance-poll-time", viperutil.Options[time.Duration]{ FlagName: "instance-poll-time", Default: 5 * time.Second, @@ -52,7 +52,7 @@ var ( ) preventCrossCellFailover = viperutil.Configure( - "PreventCrossCellFailover", + "prevent-cross-cell-failover", viperutil.Options[bool]{ FlagName: "prevent-cross-cell-failover", Default: false, @@ -61,7 +61,7 @@ var ( ) sqliteDataFile = viperutil.Configure( - "SQLiteDataFile", + "sqlite-data-file", viperutil.Options[string]{ FlagName: "sqlite-data-file", Default: "file::memory:?mode=memory&cache=shared", @@ -70,7 +70,7 @@ var ( ) snapshotTopologyInterval = viperutil.Configure( - "SnapshotTopologyInterval", + "snapshot-topology-interval", viperutil.Options[time.Duration]{ FlagName: "snapshot-topology-interval", Default: 0 * time.Hour, @@ -79,7 +79,7 @@ var ( ) reasonableReplicationLag = viperutil.Configure( - "ReasonableReplicationLag", + "reasonable-replication-lag", viperutil.Options[time.Duration]{ FlagName: "reasonable-replication-lag", Default: 10 * time.Second, @@ -88,7 +88,7 @@ var ( ) auditFileLocation = viperutil.Configure( - "AuditFileLocation", + "audit-file-location", viperutil.Options[string]{ FlagName: "audit-file-location", Default: "", @@ -97,7 +97,7 @@ var ( ) auditToBackend = viperutil.Configure( - "AuditToBackend", + "audit-to-backend", viperutil.Options[bool]{ FlagName: "audit-to-backend", Default: false, @@ -106,7 +106,7 @@ var ( ) auditToSyslog = viperutil.Configure( - "AuditToSyslog", + "audit-to-syslog", viperutil.Options[bool]{ FlagName: "audit-to-syslog", Default: false, @@ -115,7 +115,7 @@ var ( ) auditPurgeDuration = viperutil.Configure( - "AuditPurgeDuration", + "audit-purge-duration", viperutil.Options[time.Duration]{ FlagName: "audit-purge-duration", Default: 7 * 24 * time.Hour, @@ -124,7 +124,7 @@ var ( ) waitReplicasTimeout = viperutil.Configure( - "WaitReplicasTimeout", + "wait-replicas-timeout", viperutil.Options[time.Duration]{ FlagName: "wait-replicas-timeout", Default: 30 * time.Second, @@ -133,7 +133,7 @@ var ( ) tolerableReplicationLag = viperutil.Configure( - "TolerableReplicationLag", + "tolerable-replication-lag", viperutil.Options[time.Duration]{ FlagName: "tolerable-replication-lag", Default: 0 * time.Second, @@ -142,7 +142,7 @@ var ( ) topoInformationRefreshDuration = viperutil.Configure( - "TopoInformationRefreshDuration", + "topo-information-refresh-duration", viperutil.Options[time.Duration]{ FlagName: "topo-information-refresh-duration", Default: 15 * time.Second, @@ -151,7 +151,7 @@ var ( ) recoveryPollDuration = viperutil.Configure( - "RecoveryPollDuration", + "recovery-poll-duration", viperutil.Options[time.Duration]{ FlagName: "recovery-poll-duration", Default: 1 * time.Second, @@ -160,7 +160,7 @@ var ( ) ersEnabled = viperutil.Configure( - "AllowEmergencyReparent", + "allow-emergency-reparent", viperutil.Options[bool]{ FlagName: "allow-emergency-reparent", Default: true, @@ -169,7 +169,7 @@ var ( ) convertTabletsWithErrantGTIDs = viperutil.Configure( - "ChangeTabletsWithErrantGtidToDrained", + "change-tablets-with-errant-gtid-to-drained", viperutil.Options[bool]{ FlagName: "change-tablets-with-errant-gtid-to-drained", Default: false, From 9f14d8b013cf27f690741047308879e5016c9c05 Mon Sep 17 00:00:00 2001 From: Manan Gupta Date: Mon, 25 Nov 2024 13:58:05 +0530 Subject: [PATCH 17/18] feat: add logging for configuration changes Signed-off-by: Manan Gupta --- go/cmd/vtorc/cli/cli.go | 3 ++- go/vt/servenv/servenv.go | 12 ++++++++++++ go/vt/vtorc/config/config.go | 7 ------- 3 files changed, 14 insertions(+), 8 deletions(-) diff --git a/go/cmd/vtorc/cli/cli.go b/go/cmd/vtorc/cli/cli.go index bfc1e6ba662..b79793c6492 100644 --- a/go/cmd/vtorc/cli/cli.go +++ b/go/cmd/vtorc/cli/cli.go @@ -20,6 +20,7 @@ import ( "github.com/spf13/cobra" "vitess.io/vitess/go/acl" + "vitess.io/vitess/go/viperutil/debug" "vitess.io/vitess/go/vt/log" "vitess.io/vitess/go/vt/servenv" "vitess.io/vitess/go/vt/vtorc/config" @@ -59,7 +60,7 @@ func run(cmd *cobra.Command, args []string) { config.MarkConfigurationLoaded() // Log final config values to debug if something goes wrong. - config.LogConfigValues() + log.Infof("Running with Configuration - %v", debug.AllSettings()) server.StartVTOrcDiscovery() server.RegisterVTOrcAPIEndpoints() diff --git a/go/vt/servenv/servenv.go b/go/vt/servenv/servenv.go index 4aa9818eb7d..22bf3523dfc 100644 --- a/go/vt/servenv/servenv.go +++ b/go/vt/servenv/servenv.go @@ -370,6 +370,14 @@ func moveFlags(name string, fs *pflag.FlagSet) { // functions. func CobraPreRunE(cmd *cobra.Command, args []string) error { _flag.TrickGlog() + // Register logging on config file change. + ch := make(chan struct{}) + viperutil.NotifyConfigReload(ch) + go func() { + for range ch { + log.Infof("Change in configuration - %v", viperdebug.AllSettings()) + } + }() watchCancel, err := viperutil.LoadConfig() if err != nil { @@ -377,6 +385,10 @@ func CobraPreRunE(cmd *cobra.Command, args []string) error { } OnTerm(watchCancel) + // Register a function to be called on termination that closes the channel. + // This is done after the watchCancel has registered to ensure that we don't end up + // sending on a closed channel. + OnTerm(func() { close(ch) }) HTTPHandleFunc("/debug/config", viperdebug.HandlerFunc) logutil.PurgeLogs() diff --git a/go/vt/vtorc/config/config.go b/go/vt/vtorc/config/config.go index 4aa620f753d..cafff5acce8 100644 --- a/go/vt/vtorc/config/config.go +++ b/go/vt/vtorc/config/config.go @@ -22,8 +22,6 @@ import ( "github.com/spf13/pflag" "vitess.io/vitess/go/viperutil" - "vitess.io/vitess/go/viperutil/debug" - "vitess.io/vitess/go/vt/log" "vitess.io/vitess/go/vt/servenv" ) @@ -334,11 +332,6 @@ func SetConvertTabletWithErrantGTIDs(val bool) { convertTabletsWithErrantGTIDs.Set(val) } -// LogConfigValues is used to log the config values. -func LogConfigValues() { - log.Infof("Running with Configuration - %v", debug.AllSettings()) -} - // MarkConfigurationLoaded is called once configuration has first been loaded. // Listeners on ConfigurationLoaded will get a notification func MarkConfigurationLoaded() { From b63d4f890eb4948b3e2f070b93f0792424aa19f1 Mon Sep 17 00:00:00 2001 From: Manan Gupta Date: Tue, 26 Nov 2024 14:59:42 +0530 Subject: [PATCH 18/18] feat: fix config in examples and update summary Signed-off-by: Manan Gupta --- changelog/22.0/22.0.0/summary.md | 26 +++++++++++++------------- examples/common/vtorc/config.yaml | 2 +- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/changelog/22.0/22.0.0/summary.md b/changelog/22.0/22.0.0/summary.md index c95c1b44dd8..b329749cc38 100644 --- a/changelog/22.0/22.0.0/summary.md +++ b/changelog/22.0/22.0.0/summary.md @@ -33,18 +33,18 @@ never be promoted, neither by planned nor by emergency reparents. The configuration file for VTOrc has been updated to now support dynamic fields. The old `--config` parameter has been removed. The alternative is to use the `--config-file` parameter. The configuration can now be provided in both json, yaml or any other format that [viper](https://github.com/spf13/viper) supports. The following fields can be dynamically changed - -1. `InstancePollTime` -2. `PreventCrossCellFailover` -3. `SnapshotTopologyInterval` -4. `ReasonableReplicationLag` -5. `AuditToBackend` -6. `AuditToSyslog` -7. `AuditPurgeDuration` -8. `WaitReplicasTimeout` -9. `TolerableReplicationLag` -10. `TopoInformationRefreshDuration` -11. `RecoveryPollDuration` -12. `AllowEmergencyReparent` -13. `ChangeTabletsWithErrantGtidToDrained` +1. `instance-poll-time` +2. `prevent-cross-cell-failover` +3. `snapshot-topology-interval` +4. `reasonable-replication-lag` +5. `audit-to-backend` +6. `audit-to-syslog` +7. `audit-purge-duration` +8. `wait-replicas-timeout` +9. `tolerable-replication-lag` +10. `topo-information-refresh-duration` +11. `recovery-poll-duration` +12. `allow-emergency-reparent` +13. `change-tablets-with-errant-gtid-to-drained` To upgrade to the newer version of the configuration file, first switch to using the flags in your current deployment before upgrading. Then you can switch to using the configuration file in the newer release. diff --git a/examples/common/vtorc/config.yaml b/examples/common/vtorc/config.yaml index e615f6814f4..4f8d63a45f6 100644 --- a/examples/common/vtorc/config.yaml +++ b/examples/common/vtorc/config.yaml @@ -1 +1 @@ -InstancePollTime: 1s +instance-poll-time: 1s