Skip to content

Commit

Permalink
Improve logging and renaming PrimaryTermStartTimestamp in vttablets (v…
Browse files Browse the repository at this point in the history
…itessio#13625)

Signed-off-by: Manan Gupta <manan@planetscale.com>
  • Loading branch information
GuptaManan100 authored Aug 1, 2023
1 parent 0a8e89e commit 80baafd
Show file tree
Hide file tree
Showing 19 changed files with 328 additions and 332 deletions.
2 changes: 1 addition & 1 deletion go/test/endtoend/reparent/utils/utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -612,7 +612,7 @@ func CheckReparentFromOutside(t *testing.T, clusterInstance *cluster.LocalProces
streamHealthResponse := shrs[0]

assert.Equal(t, streamHealthResponse.Target.TabletType, topodatapb.TabletType_PRIMARY)
assert.True(t, streamHealthResponse.TabletExternallyReparentedTimestamp >= baseTime)
assert.True(t, streamHealthResponse.PrimaryTermStartTimestamp >= baseTime)
}

// WaitForReplicationPosition waits for tablet B to catch up to the replication position of tablet A.
Expand Down
28 changes: 14 additions & 14 deletions go/test/endtoend/tabletmanager/primary/tablet_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -155,11 +155,11 @@ func TestRepeatedInitShardPrimary(t *testing.T) {
checkTabletType(t, replicaTablet.Alias, "REPLICA")
}

func TestPrimaryRestartSetsTERTimestamp(t *testing.T) {
func TestPrimaryRestartSetsPTSTimestamp(t *testing.T) {
defer cluster.PanicHandler(t)
// Test that TER timestamp is set when we restart the PRIMARY vttablet.
// TER = TabletExternallyReparented.
// See StreamHealthResponse.tablet_externally_reparented_timestamp for details.
// Test that PTS timestamp is set when we restart the PRIMARY vttablet.
// PTS = PrimaryTermStart.
// See StreamHealthResponse.primary_term_start_timestamp for details.

// Make replica as primary
err := clusterInstance.VtctlclientProcess.InitShardPrimary(keyspaceName, shardName, cell, replicaTablet.TabletUID)
Expand All @@ -168,7 +168,7 @@ func TestPrimaryRestartSetsTERTimestamp(t *testing.T) {
err = replicaTablet.VttabletProcess.WaitForTabletStatus("SERVING")
require.NoError(t, err)

// Capture the current TER.
// Capture the current PTS.
shrs, err := clusterInstance.StreamTabletHealth(context.Background(), &replicaTablet, 1)
require.NoError(t, err)

Expand All @@ -178,9 +178,9 @@ func TestPrimaryRestartSetsTERTimestamp(t *testing.T) {
got := fmt.Sprintf("%d", actualType)
want := fmt.Sprintf("%d", tabletType)
assert.Equal(t, want, got)
assert.NotNil(t, streamHealthRes1.GetTabletExternallyReparentedTimestamp())
assert.True(t, streamHealthRes1.GetTabletExternallyReparentedTimestamp() > 0,
"TER on PRIMARY must be set after InitShardPrimary")
assert.NotNil(t, streamHealthRes1.GetPrimaryTermStartTimestamp())
assert.True(t, streamHealthRes1.GetPrimaryTermStartTimestamp() > 0,
"PTS on PRIMARY must be set after InitShardPrimary")

// Restart the PRIMARY vttablet and test again

Expand All @@ -192,7 +192,7 @@ func TestPrimaryRestartSetsTERTimestamp(t *testing.T) {
err = clusterInstance.StartVttablet(&replicaTablet, "SERVING", false, cell, keyspaceName, hostname, shardName)
require.NoError(t, err)

// Make sure that the TER did not change
// Make sure that the PTS did not change
shrs, err = clusterInstance.StreamTabletHealth(context.Background(), &replicaTablet, 1)
require.NoError(t, err)

Expand All @@ -204,12 +204,12 @@ func TestPrimaryRestartSetsTERTimestamp(t *testing.T) {
want = fmt.Sprintf("%d", tabletType)
assert.Equal(t, want, got)

assert.NotNil(t, streamHealthRes2.GetTabletExternallyReparentedTimestamp())
assert.True(t, streamHealthRes2.GetTabletExternallyReparentedTimestamp() == streamHealthRes1.GetTabletExternallyReparentedTimestamp(),
assert.NotNil(t, streamHealthRes2.GetPrimaryTermStartTimestamp())
assert.True(t, streamHealthRes2.GetPrimaryTermStartTimestamp() == streamHealthRes1.GetPrimaryTermStartTimestamp(),
fmt.Sprintf("When the PRIMARY vttablet was restarted, "+
"the TER timestamp must be set by reading the old value from the tablet record. Old: %d, New: %d",
streamHealthRes1.GetTabletExternallyReparentedTimestamp(),
streamHealthRes2.GetTabletExternallyReparentedTimestamp()))
"the PTS timestamp must be set by reading the old value from the tablet record. Old: %d, New: %d",
streamHealthRes1.GetPrimaryTermStartTimestamp(),
streamHealthRes2.GetPrimaryTermStartTimestamp()))

// Reset primary
err = clusterInstance.VtctlclientProcess.InitShardPrimary(keyspaceName, shardName, cell, primaryTablet.TabletUID)
Expand Down
248 changes: 124 additions & 124 deletions go/vt/discovery/healthcheck_test.go

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion go/vt/discovery/tablet_health_check.go
Original file line number Diff line number Diff line change
Expand Up @@ -189,7 +189,7 @@ func (thc *tabletHealthCheck) processResponse(hc *HealthCheckImpl, shr *query.St
prevTarget.TabletType != topodata.TabletType_PRIMARY && prevTarget.TabletType == shr.Target.TabletType && thc.isTrivialReplagChange(shr.RealtimeStats)
thc.lastResponseTimestamp = time.Now()
thc.Target = shr.Target
thc.PrimaryTermStartTime = shr.TabletExternallyReparentedTimestamp
thc.PrimaryTermStartTime = shr.PrimaryTermStartTimestamp
thc.Stats = shr.RealtimeStats
thc.LastError = healthErr
reason := "healthCheck update"
Expand Down
248 changes: 123 additions & 125 deletions go/vt/proto/query/query.pb.go

Large diffs are not rendered by default.

14 changes: 7 additions & 7 deletions go/vt/proto/query/query_vtproto.pb.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 2 additions & 3 deletions go/vt/vtctld/tablet_data_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,13 +17,12 @@ limitations under the License.
package vtctld

import (
"context"
"io"
"sync"
"testing"
"time"

"context"

"google.golang.org/protobuf/proto"

"vitess.io/vitess/go/vt/logutil"
Expand Down Expand Up @@ -93,7 +92,7 @@ func (s *streamHealthTabletServer) streamHealthUnregister(id int) error {
// BroadcastHealth will broadcast the current health to all listeners
func (s *streamHealthTabletServer) BroadcastHealth() {
shr := &querypb.StreamHealthResponse{
TabletExternallyReparentedTimestamp: 42,
PrimaryTermStartTimestamp: 42,
RealtimeStats: &querypb.RealtimeStats{
HealthError: "testHealthError",
ReplicationLagSeconds: 72,
Expand Down
2 changes: 1 addition & 1 deletion go/vt/vttablet/tabletconntest/fakequeryservice.go
Original file line number Diff line number Diff line change
Expand Up @@ -655,7 +655,7 @@ var TestStreamHealthStreamHealthResponse = &querypb.StreamHealthResponse{
},
Serving: true,

TabletExternallyReparentedTimestamp: 1234589,
PrimaryTermStartTimestamp: 1234589,

RealtimeStats: &querypb.RealtimeStats{
CpuUsage: 1.0,
Expand Down
6 changes: 3 additions & 3 deletions go/vt/vttablet/tabletmanager/tm_state.go
Original file line number Diff line number Diff line change
Expand Up @@ -264,7 +264,7 @@ func (ts *tmState) updateLocked(ctx context.Context) error {
return nil
}

terTime := logutil.ProtoToTime(ts.tablet.PrimaryTermStartTime)
ptsTime := logutil.ProtoToTime(ts.tablet.PrimaryTermStartTime)

// Disable TabletServer first so the nonserving state gets advertised
// before other services are shutdown.
Expand All @@ -277,7 +277,7 @@ func (ts *tmState) updateLocked(ctx context.Context) error {
// always return error from 'SetServingType' and 'applyDenyList' to our client. It is up to them to handle it accordingly.
// UpdateLock is called from 'ChangeTabletType', 'Open' and 'RefreshFromTopoInfo'. For 'Open' and 'RefreshFromTopoInfo' we don't need
// to propagate error to client hence no changes there but we will propagate error from 'ChangeTabletType' to client.
if err := ts.tm.QueryServiceControl.SetServingType(ts.tablet.Type, terTime, false, reason); err != nil {
if err := ts.tm.QueryServiceControl.SetServingType(ts.tablet.Type, ptsTime, false, reason); err != nil {
errStr := fmt.Sprintf("SetServingType(serving=false) failed: %v", err)
log.Errorf(errStr)
// No need to short circuit. Apply all steps and return error in the end.
Expand Down Expand Up @@ -326,7 +326,7 @@ func (ts *tmState) updateLocked(ctx context.Context) error {

// Open TabletServer last so that it advertises serving after all other services are up.
if reason == "" {
if err := ts.tm.QueryServiceControl.SetServingType(ts.tablet.Type, terTime, true, ""); err != nil {
if err := ts.tm.QueryServiceControl.SetServingType(ts.tablet.Type, ptsTime, true, ""); err != nil {
errStr := fmt.Sprintf("Cannot start query service: %v", err)
log.Errorf(errStr)
returnErr = vterrors.Wrapf(err, errStr)
Expand Down
5 changes: 2 additions & 3 deletions go/vt/vttablet/tabletserver/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ package tabletserver

import (
"context"
"time"

"vitess.io/vitess/go/vt/dbconfigs"
"vitess.io/vitess/go/vt/mysqlctl"
Expand All @@ -28,8 +29,6 @@ import (
"vitess.io/vitess/go/vt/vttablet/tabletserver/tabletenv"
"vitess.io/vitess/go/vt/vttablet/tabletserver/throttle"

"time"

querypb "vitess.io/vitess/go/vt/proto/query"
topodatapb "vitess.io/vitess/go/vt/proto/topodata"
)
Expand All @@ -53,7 +52,7 @@ type Controller interface {

// SetServingType transitions the query service to the required serving type.
// Returns true if the state of QueryService or the tablet type changed.
SetServingType(tabletType topodatapb.TabletType, terTimestamp time.Time, serving bool, reason string) error
SetServingType(tabletType topodatapb.TabletType, ptsTimestamp time.Time, serving bool, reason string) error

// EnterLameduck causes tabletserver to enter the lameduck state.
EnterLameduck()
Expand Down
6 changes: 3 additions & 3 deletions go/vt/vttablet/tabletserver/health_streamer.go
Original file line number Diff line number Diff line change
Expand Up @@ -208,15 +208,15 @@ func (hs *healthStreamer) unregister(ch chan *querypb.StreamHealthResponse) {
delete(hs.clients, ch)
}

func (hs *healthStreamer) ChangeState(tabletType topodatapb.TabletType, terTimestamp time.Time, lag time.Duration, err error, serving bool) {
func (hs *healthStreamer) ChangeState(tabletType topodatapb.TabletType, ptsTimestamp time.Time, lag time.Duration, err error, serving bool) {
hs.mu.Lock()
defer hs.mu.Unlock()

hs.state.Target.TabletType = tabletType
if tabletType == topodatapb.TabletType_PRIMARY {
hs.state.TabletExternallyReparentedTimestamp = terTimestamp.Unix()
hs.state.PrimaryTermStartTimestamp = ptsTimestamp.Unix()
} else {
hs.state.TabletExternallyReparentedTimestamp = 0
hs.state.PrimaryTermStartTimestamp = 0
}
if err != nil {
hs.state.RealtimeStats.HealthError = err.Error()
Expand Down
6 changes: 3 additions & 3 deletions go/vt/vttablet/tabletserver/health_streamer_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -150,9 +150,9 @@ func TestHealthStreamerBroadcast(t *testing.T) {
Target: &querypb.Target{
TabletType: topodatapb.TabletType_PRIMARY,
},
TabletAlias: alias,
Serving: true,
TabletExternallyReparentedTimestamp: now.Unix(),
TabletAlias: alias,
Serving: true,
PrimaryTermStartTimestamp: now.Unix(),
RealtimeStats: &querypb.RealtimeStats{
FilteredReplicationLagSeconds: 1,
BinlogPlayersCount: 2,
Expand Down
16 changes: 8 additions & 8 deletions go/vt/vttablet/tabletserver/state_manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ type stateManager struct {
wantTabletType topodatapb.TabletType
state servingState
target *querypb.Target
terTimestamp time.Time
ptsTimestamp time.Time
retrying bool
replHealthy bool
lameduck bool
Expand Down Expand Up @@ -209,7 +209,7 @@ func (sm *stateManager) Init(env tabletenv.Env, target *querypb.Target) {
// be honored.
// If sm is already in the requested state, it returns stateChanged as
// false.
func (sm *stateManager) SetServingType(tabletType topodatapb.TabletType, terTimestamp time.Time, state servingState, reason string) error {
func (sm *stateManager) SetServingType(tabletType topodatapb.TabletType, ptsTimestamp time.Time, state servingState, reason string) error {
defer sm.ExitLameduck()

sm.hs.Open()
Expand All @@ -219,8 +219,8 @@ func (sm *stateManager) SetServingType(tabletType topodatapb.TabletType, terTime
state = StateNotConnected
}

log.Infof("Starting transition to %v %v, timestamp: %v", tabletType, state, terTimestamp)
if sm.mustTransition(tabletType, terTimestamp, state, reason) {
log.Infof("Starting transition to %v %v, primary term start timestamp: %v", tabletType, state, ptsTimestamp)
if sm.mustTransition(tabletType, ptsTimestamp, state, reason) {
return sm.execTransition(tabletType, state)
}
return nil
Expand All @@ -230,7 +230,7 @@ func (sm *stateManager) SetServingType(tabletType topodatapb.TabletType, terTime
// state. If so, it acquires the semaphore and returns true. If a transition is
// already in progress, it waits. If the desired state is already reached, it
// returns false without acquiring the semaphore.
func (sm *stateManager) mustTransition(tabletType topodatapb.TabletType, terTimestamp time.Time, state servingState, reason string) bool {
func (sm *stateManager) mustTransition(tabletType topodatapb.TabletType, ptsTimestamp time.Time, state servingState, reason string) bool {
if sm.transitioning.Acquire(context.Background(), 1) != nil {
return false
}
Expand All @@ -239,7 +239,7 @@ func (sm *stateManager) mustTransition(tabletType topodatapb.TabletType, terTime

sm.wantTabletType = tabletType
sm.wantState = state
sm.terTimestamp = terTimestamp
sm.ptsTimestamp = ptsTimestamp
sm.reason = reason
if sm.target.TabletType == tabletType && sm.state == state {
sm.transitioning.Release(1)
Expand Down Expand Up @@ -639,7 +639,7 @@ func (sm *stateManager) stateStringLocked(tabletType topodatapb.TabletType, stat
if tabletType != topodatapb.TabletType_PRIMARY {
return fmt.Sprintf("%v: %v", tabletType, state)
}
return fmt.Sprintf("%v: %v, %v", tabletType, state, sm.terTimestamp.Local().Format("Jan 2, 2006 at 15:04:05 (MST)"))
return fmt.Sprintf("%v: %v, %v", tabletType, state, sm.ptsTimestamp.Local().Format("Jan 2, 2006 at 15:04:05 (MST)"))
}

func (sm *stateManager) handleGracePeriod(tabletType topodatapb.TabletType) {
Expand Down Expand Up @@ -674,7 +674,7 @@ func (sm *stateManager) Broadcast() {
defer sm.mu.Unlock()

lag, err := sm.refreshReplHealthLocked()
sm.hs.ChangeState(sm.target.TabletType, sm.terTimestamp, lag, err, sm.isServingLocked())
sm.hs.ChangeState(sm.target.TabletType, sm.ptsTimestamp, lag, err, sm.isServingLocked())
}

func (sm *stateManager) refreshReplHealthLocked() (time.Duration, error) {
Expand Down
2 changes: 1 addition & 1 deletion go/vt/vttablet/tabletserver/state_manager_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ func TestStateManagerServePrimary(t *testing.T) {
require.NoError(t, err)

assert.Equal(t, false, sm.lameduck)
assert.Equal(t, testNow, sm.terTimestamp)
assert.Equal(t, testNow, sm.ptsTimestamp)

verifySubcomponent(t, 1, sm.watcher, testStateClosed)

Expand Down
4 changes: 2 additions & 2 deletions go/vt/vttablet/tabletserver/tabletserver.go
Original file line number Diff line number Diff line change
Expand Up @@ -371,12 +371,12 @@ func (tsv *TabletServer) InitACL(tableACLConfigFile string, enforceTableACLConfi
// SetServingType changes the serving type of the tabletserver. It starts or
// stops internal services as deemed necessary.
// Returns true if the state of QueryService or the tablet type changed.
func (tsv *TabletServer) SetServingType(tabletType topodatapb.TabletType, terTimestamp time.Time, serving bool, reason string) error {
func (tsv *TabletServer) SetServingType(tabletType topodatapb.TabletType, ptsTimestamp time.Time, serving bool, reason string) error {
state := StateNotServing
if serving {
state = StateServing
}
return tsv.sm.SetServingType(tabletType, terTimestamp, state, reason)
return tsv.sm.SetServingType(tabletType, ptsTimestamp, state, reason)
}

// StartService is a convenience function for InitDBConfig->SetServingType
Expand Down
2 changes: 1 addition & 1 deletion go/vt/vttablet/tabletservermock/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,7 @@ func (tqsc *Controller) InitDBConfig(target *querypb.Target, dbcfgs *dbconfigs.D
}

// SetServingType is part of the tabletserver.Controller interface
func (tqsc *Controller) SetServingType(tabletType topodatapb.TabletType, terTime time.Time, serving bool, reason string) error {
func (tqsc *Controller) SetServingType(tabletType topodatapb.TabletType, ptsTime time.Time, serving bool, reason string) error {
tqsc.mu.Lock()
defer tqsc.mu.Unlock()

Expand Down
8 changes: 4 additions & 4 deletions proto/query.proto
Original file line number Diff line number Diff line change
Expand Up @@ -922,8 +922,8 @@ message StreamHealthResponse {
// or if a replica should not be used because the keyspace is being resharded.
bool serving = 2;

// tablet_externally_reparented_timestamp can be interpreted as the
// last time we knew that this tablet was the PRIMARY of this shard
// primary_term_start_timestamp can be interpreted as the
// last time we knew that this tablet was promoted to a PRIMARY of this shard
// (if StreamHealthResponse describes a group of tablets, between
// two vtgates, only one primary will be present in the group, and
// this is this primary's value).
Expand All @@ -947,8 +947,8 @@ message StreamHealthResponse {
// as PRIMARY because it was recorded as the shard's current primary in the
// topology (see go/vt/vttablet/tabletmanager/init_tablet.go)
// OR
// d) 0 if the vttablet was never a PRIMARY.
int64 tablet_externally_reparented_timestamp = 3;
// d) 0 if the vttablet is not a PRIMARY.
int64 primary_term_start_timestamp = 3;

// realtime_stats contains information about the tablet status.
// It is only filled in if the information is about a tablet.
Expand Down
8 changes: 4 additions & 4 deletions web/vtadmin/src/proto/vtadmin.d.ts

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit 80baafd

Please sign in to comment.