Skip to content

Commit

Permalink
VTOrc: Rework recovery registration (#15591)
Browse files Browse the repository at this point in the history
Signed-off-by: Manan Gupta <manan@planetscale.com>
Signed-off-by: Manan Gupta <35839558+GuptaManan100@users.noreply.github.com>
Co-authored-by: Deepthi Sigireddi <deepthi.sigireddi@gmail.com>
  • Loading branch information
GuptaManan100 and deepthi committed Apr 2, 2024
1 parent 37cc00a commit e55897b
Show file tree
Hide file tree
Showing 15 changed files with 259 additions and 514 deletions.
8 changes: 8 additions & 0 deletions changelog/20.0/20.0.0/summary.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
- **[Breaking changes](#breaking-changes)**
- [`shutdown_grace_period` Default Change](#shutdown-grace-period-default)
- [New `unmanaged` Flag and `disable_active_reparents` deprecation](#unmanaged-flag)
- [`recovery-period-block-duration` Flag deprecation](#recovery-block-deprecation)
- [`mysqlctld` `onterm-timeout` Default Change](#mysqlctld-onterm-timeout)
- [`Durabler` interface method renaming](#durabler-interface-method-renaming)
- **[Query Compatibility](#query-compatibility)**
Expand Down Expand Up @@ -40,6 +41,13 @@ New flag `--unmanaged` has been introduced in this release to make it easier to

Starting this release, all unmanaged tablets should specify this flag.


#### <a id="recovery-block-deprecation"/> `recovery-period-block-duration` Flag deprecation

The flag `--recovery-period-block-duration` has been deprecated in VTOrc from this release. Its value is now ignored and the flag will be removed in later releases.
VTOrc no longer blocks recoveries for a certain duration after a previous recovery has completed. Since VTOrc refreshes the required information after
acquiring a shard lock, blocking of recoveries is not required.

#### <a id="mysqlctld-onterm-timeout"/>`mysqlctld` `onterm_timeout` Default Change

The `--onterm_timeout` flag default value has changed for `mysqlctld`. It now is by default long enough to be able to wait for the default `--shutdown-wait-time` when shutting down on a `TERM` signal.
Expand Down
3 changes: 1 addition & 2 deletions go/cmd/vtorc/cli/cli.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,6 @@ var (
--topo_global_root /vitess/global \
--log_dir $VTDATAROOT/tmp \
--port 15000 \
--recovery-period-block-duration "10m" \
--instance-poll-time "1s" \
--topo-information-refresh-duration "30s" \
--alsologtostderr`,
Expand Down Expand Up @@ -85,7 +84,7 @@ func run(cmd *cobra.Command, args []string) {
// addStatusParts adds UI parts to the /debug/status page of VTOrc
func addStatusParts() {
servenv.AddStatusPart("Recent Recoveries", logic.TopologyRecoveriesTemplate, func() any {
recoveries, _ := logic.ReadRecentRecoveries(false, 0)
recoveries, _ := logic.ReadRecentRecoveries(0)
return recoveries
})
}
Expand Down
2 changes: 0 additions & 2 deletions go/flags/endtoend/vtorc.txt
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@ vtorc \
--topo_global_root /vitess/global \
--log_dir $VTDATAROOT/tmp \
--port 15000 \
--recovery-period-block-duration "10m" \
--instance-poll-time "1s" \
--topo-information-refresh-duration "30s" \
--alsologtostderr
Expand Down Expand Up @@ -65,7 +64,6 @@ Flags:
--prevent-cross-cell-failover Prevent VTOrc from promoting a primary in a different cell than the current primary in case of a failover
--purge_logs_interval duration how often try to remove old logs (default 1h0m0s)
--reasonable-replication-lag duration Maximum replication lag on replicas which is deemed to be acceptable (default 10s)
--recovery-period-block-duration duration Duration for which a new recovery is blocked on an instance after running a recovery (default 30s)
--recovery-poll-duration duration Timer duration on which VTOrc polls its database to run a recovery (default 1s)
--remote_operation_timeout duration time to wait for a remote operation (default 15s)
--security_policy string the name of a registered security policy to use for controlling access to URLs - empty means allow all for anyone (built-in policies: deny-all, read-only)
Expand Down
2 changes: 1 addition & 1 deletion go/vt/vtorc/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,6 @@ const (
DiscoveryQueueMaxStatisticsSize = 120
DiscoveryCollectionRetentionSeconds = 120
UnseenInstanceForgetHours = 240 // Number of hours after which an unseen instance is forgotten
FailureDetectionPeriodBlockMinutes = 60 // The time for which an instance's failure discovery is kept "active", so as to avoid concurrent "discoveries" of the instance's failure; this precedes any recovery process, if any.
)

var (
Expand Down Expand Up @@ -77,6 +76,7 @@ func RegisterFlags(fs *pflag.FlagSet) {
fs.BoolVar(&auditToSyslog, "audit-to-syslog", auditToSyslog, "Whether to store the audit log in the syslog")
fs.DurationVar(&auditPurgeDuration, "audit-purge-duration", auditPurgeDuration, "Duration for which audit logs are held before being purged. Should be in multiples of days")
fs.DurationVar(&recoveryPeriodBlockDuration, "recovery-period-block-duration", recoveryPeriodBlockDuration, "Duration for which a new recovery is blocked on an instance after running a recovery")
fs.MarkDeprecated("recovery-period-block-duration", "As of v20 this is ignored and will be removed in a future release.")
fs.BoolVar(&preventCrossCellFailover, "prevent-cross-cell-failover", preventCrossCellFailover, "Prevent VTOrc from promoting a primary in a different cell than the current primary in case of a failover")
fs.DurationVar(&waitReplicasTimeout, "wait-replicas-timeout", waitReplicasTimeout, "Duration for which to wait for replica's to respond when issuing RPCs")
fs.DurationVar(&tolerableReplicationLag, "tolerable-replication-lag", tolerableReplicationLag, "Amount of replication lag that is considered acceptable for a tablet to be eligible for promotion when Vitess makes the choice of a new primary in PRS")
Expand Down
76 changes: 11 additions & 65 deletions go/vt/vtorc/db/generate_base.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,7 @@ var TableNames = []string{
"topology_recovery",
"database_instance_topology_history",
"candidate_database_instance",
"topology_failure_detection",
"blocked_topology_recovery",
"recovery_detection",
"database_instance_last_analysis",
"database_instance_analysis_changelog",
"node_health_history",
Expand Down Expand Up @@ -172,35 +171,19 @@ DROP TABLE IF EXISTS topology_recovery
CREATE TABLE topology_recovery (
recovery_id integer,
alias varchar(256) NOT NULL,
in_active_period tinyint NOT NULL DEFAULT 0,
start_active_period timestamp not null default (''),
end_active_period_unixtime int,
start_recovery timestamp NOT NULL DEFAULT (''),
end_recovery timestamp NULL DEFAULT NULL,
processing_node_hostname varchar(128) NOT NULL,
processcing_node_token varchar(128) NOT NULL,
successor_alias varchar(256) DEFAULT NULL,
analysis varchar(128) not null default '',
keyspace varchar(128) NOT NULL,
shard varchar(128) NOT NULL,
count_affected_replicas int not null default 0,
is_successful TINYint NOT NULL DEFAULT 0,
acknowledged TINYint NOT NULL DEFAULT 0,
acknowledged_by varchar(128) not null default '',
acknowledge_comment text not null default '',
all_errors text not null default '',
acknowledged_at TIMESTAMP NULL,
last_detection_id bigint not null default 0,
uid varchar(128) not null default '',
detection_id bigint not null default 0,
PRIMARY KEY (recovery_id)
)`,
`
CREATE INDEX in_active_start_period_idx_topology_recovery ON topology_recovery (in_active_period, start_active_period)
`,
`
CREATE INDEX start_active_period_idx_topology_recovery ON topology_recovery (start_active_period)
`,
`
CREATE UNIQUE INDEX alias_active_period_uidx_topology_recovery ON topology_recovery (alias, in_active_period, end_active_period_unixtime)
CREATE INDEX start_recovery_idx_topology_recovery ON topology_recovery (start_recovery)
`,
`
DROP TABLE IF EXISTS database_instance_topology_history
Expand Down Expand Up @@ -236,44 +219,19 @@ CREATE TABLE candidate_database_instance (
CREATE INDEX last_suggested_idx_candidate_database_instance ON candidate_database_instance (last_suggested)
`,
`
DROP TABLE IF EXISTS topology_failure_detection
DROP TABLE IF EXISTS recovery_detection
`,
`
CREATE TABLE topology_failure_detection (
CREATE TABLE recovery_detection (
detection_id integer,
alias varchar(256) NOT NULL,
in_active_period tinyint NOT NULL DEFAULT '0',
start_active_period timestamp not null default (''),
end_active_period_unixtime int NOT NULL,
processing_node_hostname varchar(128) NOT NULL,
processcing_node_token varchar(128) NOT NULL,
analysis varchar(128) NOT NULL,
keyspace varchar(128) NOT NULL,
shard varchar(128) NOT NULL,
count_affected_replicas int NOT NULL,
is_actionable tinyint not null default 0,
detection_timestamp timestamp NOT NULL default (''),
PRIMARY KEY (detection_id)
)`,
`
CREATE INDEX in_active_start_period_idx_topology_failure_detection ON topology_failure_detection (in_active_period, start_active_period)
`,
`
DROP TABLE IF EXISTS blocked_topology_recovery
`,
`
CREATE TABLE blocked_topology_recovery (
alias varchar(256) NOT NULL,
keyspace varchar(128) NOT NULL,
shard varchar(128) NOT NULL,
analysis varchar(128) NOT NULL,
last_blocked_timestamp timestamp not null default (''),
blocking_recovery_id bigint,
PRIMARY KEY (alias)
)`,
`
CREATE INDEX keyspace_shard_blocked_idx_blocked_topology_recovery ON blocked_topology_recovery (keyspace, shard, last_blocked_timestamp)
`,
`
DROP TABLE IF EXISTS database_instance_last_analysis
`,
`
Expand Down Expand Up @@ -343,7 +301,7 @@ DROP TABLE IF EXISTS topology_recovery_steps
`
CREATE TABLE topology_recovery_steps (
recovery_step_id integer,
recovery_uid varchar(128) NOT NULL,
recovery_id integer NOT NULL,
audit_at timestamp not null default (''),
message text NOT NULL,
PRIMARY KEY (recovery_step_id)
Expand Down Expand Up @@ -409,33 +367,21 @@ CREATE TABLE vitess_shard (
CREATE INDEX source_host_port_idx_database_instance_database_instance on database_instance (source_host, source_port)
`,
`
CREATE INDEX keyspace_shard_in_active_idx_topology_recovery on topology_recovery (keyspace, shard, in_active_period)
CREATE INDEX keyspace_shard_idx_topology_recovery on topology_recovery (keyspace, shard)
`,
`
CREATE INDEX end_recovery_idx_topology_recovery on topology_recovery (end_recovery)
`,
`
CREATE INDEX acknowledged_idx_topology_recovery on topology_recovery (acknowledged, acknowledged_at)
`,
`
CREATE INDEX last_blocked_idx_blocked_topology_recovery on blocked_topology_recovery (last_blocked_timestamp)
`,
`
CREATE INDEX instance_timestamp_idx_database_instance_analysis_changelog on database_instance_analysis_changelog (alias, analysis_timestamp)
`,
`
CREATE INDEX last_detection_idx_topology_recovery on topology_recovery (last_detection_id)
CREATE INDEX detection_idx_topology_recovery on topology_recovery (detection_id)
`,
`
CREATE INDEX last_seen_active_idx_node_health on node_health (last_seen_active)
`,
`
CREATE INDEX uid_idx_topology_recovery ON topology_recovery(uid)
`,
`
CREATE INDEX recovery_uid_idx_topology_recovery_steps ON topology_recovery_steps(recovery_uid)
`,
`
CREATE UNIQUE INDEX alias_active_recoverable_uidx_topology_failure_detection ON topology_failure_detection (alias, in_active_period, end_active_period_unixtime, is_actionable)
CREATE INDEX recovery_id_idx_topology_recovery_steps ON topology_recovery_steps(recovery_id)
`,
}
4 changes: 1 addition & 3 deletions go/vt/vtorc/inst/analysis.go
Original file line number Diff line number Diff line change
Expand Up @@ -142,9 +142,7 @@ type ReplicationAnalysis struct {
CountDelayedReplicas uint
CountLaggingReplicas uint
IsActionableRecovery bool
ProcessingNodeHostname string
ProcessingNodeToken string
StartActivePeriod string
RecoveryId int64
GTIDMode string
MinReplicaGTIDMode string
MaxReplicaGTIDMode string
Expand Down
5 changes: 1 addition & 4 deletions go/vt/vtorc/inst/analysis_dao.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,6 @@ import (
"vitess.io/vitess/go/vt/vtctl/reparentutil"
"vitess.io/vitess/go/vt/vtorc/config"
"vitess.io/vitess/go/vt/vtorc/db"
"vitess.io/vitess/go/vt/vtorc/process"
"vitess.io/vitess/go/vt/vtorc/util"

"github.com/patrickmn/go-cache"
Expand Down Expand Up @@ -302,9 +301,7 @@ func GetReplicationAnalysis(keyspace string, shard string, hints *ReplicationAna
clusters := make(map[string]*clusterAnalysis)
err := db.Db.QueryVTOrc(query, args, func(m sqlutils.RowMap) error {
a := &ReplicationAnalysis{
Analysis: NoProblem,
ProcessingNodeHostname: process.ThisHostname,
ProcessingNodeToken: util.ProcessToken.Hash,
Analysis: NoProblem,
}

tablet := &topodatapb.Tablet{}
Expand Down
6 changes: 4 additions & 2 deletions go/vt/vtorc/inst/instance_dao.go
Original file line number Diff line number Diff line change
Expand Up @@ -119,9 +119,11 @@ func ExecDBWriteFunc(f func() error) error {
}

func ExpireTableData(tableName string, timestampColumn string) error {
query := fmt.Sprintf("delete from %s where %s < NOW() - INTERVAL ? DAY", tableName, timestampColumn)
writeFunc := func() error {
_, err := db.ExecVTOrc(query, config.Config.AuditPurgeDays)
_, err := db.ExecVTOrc(
fmt.Sprintf("delete from %s where %s < NOW() - INTERVAL ? DAY", tableName, timestampColumn),
config.Config.AuditPurgeDays,
)
return err
}
return ExecDBWriteFunc(writeFunc)
Expand Down
57 changes: 57 additions & 0 deletions go/vt/vtorc/inst/instance_dao_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -715,3 +715,60 @@ func TestGetDatabaseState(t *testing.T) {
require.NoError(t, err)
require.Contains(t, ds, `"alias": "zone1-0000000112"`)
}

func TestExpireTableData(t *testing.T) {
oldVal := config.Config.AuditPurgeDays
config.Config.AuditPurgeDays = 10
defer func() {
config.Config.AuditPurgeDays = oldVal
}()

tests := []struct {
name string
tableName string
insertQuery string
timestampColumn string
expectedRowCount int
}{
{
name: "ExpireAudit",
tableName: "audit",
timestampColumn: "audit_timestamp",
expectedRowCount: 1,
insertQuery: `insert into audit (audit_id, audit_timestamp, audit_type, alias, message, keyspace, shard) values
(1, NOW() - INTERVAL 50 DAY, 'a','a','a','a','a'),
(2, NOW() - INTERVAL 5 DAY, 'a','a','a','a','a')`,
},
{
name: "ExpireRecoveryDetectionHistory",
tableName: "recovery_detection",
timestampColumn: "detection_timestamp",
expectedRowCount: 2,
insertQuery: `insert into recovery_detection (detection_id, detection_timestamp, alias, analysis, keyspace, shard) values
(1, NOW() - INTERVAL 3 DAY,'a','a','a','a'),
(2, NOW() - INTERVAL 5 DAY,'a','a','a','a'),
(3, NOW() - INTERVAL 15 DAY,'a','a','a','a')`,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
// Clear the database after the test. The easiest way to do that is to run all the initialization commands again.
defer func() {
db.ClearVTOrcDatabase()
}()
_, err := db.ExecVTOrc(tt.insertQuery)
require.NoError(t, err)

err = ExpireTableData(tt.tableName, tt.timestampColumn)
require.NoError(t, err)

rowsCount := 0
err = db.QueryVTOrc(`select * from `+tt.tableName, nil, func(rowMap sqlutils.RowMap) error {
rowsCount++
return nil
})
require.NoError(t, err)
require.EqualValues(t, tt.expectedRowCount, rowsCount)
})
}
}
Loading

0 comments on commit e55897b

Please sign in to comment.