diff --git a/changelog/27277.txt b/changelog/27277.txt new file mode 100644 index 000000000000..1a7abc5a6f05 --- /dev/null +++ b/changelog/27277.txt @@ -0,0 +1,4 @@ +```release-note:bug +storage/raft (enterprise): Fix a regression introduced in 1.15.8 that causes +autopilot to fail to discover new server versions and so not trigger an upgrade. +``` \ No newline at end of file diff --git a/physical/raft/raft.go b/physical/raft/raft.go index 8958ba78065d..e11cb2753eef 100644 --- a/physical/raft/raft.go +++ b/physical/raft/raft.go @@ -41,6 +41,7 @@ import ( "github.com/hashicorp/vault/sdk/logical" "github.com/hashicorp/vault/sdk/physical" "github.com/hashicorp/vault/vault/cluster" + "github.com/hashicorp/vault/version" etcdbolt "go.etcd.io/bbolt" ) @@ -590,6 +591,7 @@ func NewRaftBackend(conf map[string]string, logger log.Logger) (physical.Backend failGetInTxn: new(uint32), raftLogVerifierEnabled: backendConfig.RaftLogVerifierEnabled, raftLogVerificationInterval: backendConfig.RaftLogVerificationInterval, + effectiveSDKVersion: version.GetVersion().Version, }, nil } @@ -646,12 +648,6 @@ func (b *RaftBackend) FailGetInTxn(fail bool) { atomic.StoreUint32(b.failGetInTxn, val) } -func (b *RaftBackend) SetEffectiveSDKVersion(sdkVersion string) { - b.l.Lock() - b.effectiveSDKVersion = sdkVersion - b.l.Unlock() -} - func (b *RaftBackend) RedundancyZone() string { b.l.RLock() defer b.l.RUnlock() @@ -1075,6 +1071,11 @@ type SetupOpts struct { // RecoveryModeConfig is the configuration for the raft cluster in recovery // mode. RecoveryModeConfig *raft.Configuration + + // EffectiveSDKVersion is typically the version string baked into the binary. + // We pass it in though because it can be overridden in tests or via ENV in + // core. + EffectiveSDKVersion string } func (b *RaftBackend) StartRecoveryCluster(ctx context.Context, peer Peer) error { @@ -1118,6 +1119,11 @@ func (b *RaftBackend) SetupCluster(ctx context.Context, opts SetupOpts) error { return errors.New("no local node id configured") } + if opts.EffectiveSDKVersion != "" { + // Override the SDK version + b.effectiveSDKVersion = opts.EffectiveSDKVersion + } + // Setup the raft config raftConfig := raft.DefaultConfig() if err := b.applyConfigSettings(raftConfig); err != nil { diff --git a/physical/raft/raft_autopilot.go b/physical/raft/raft_autopilot.go index 1acd85cefc91..fd65ccbf2e2f 100644 --- a/physical/raft/raft_autopilot.go +++ b/physical/raft/raft_autopilot.go @@ -421,7 +421,10 @@ func (d *Delegate) KnownServers() map[raft.ServerID]*autopilot.Server { } followerVersion = leaderVersion } else { - delete(d.emptyVersionLogs, currentServerID) + if _, ok := d.emptyVersionLogs[currentServerID]; ok { + d.logger.Trace("received non-empty version in heartbeat state. no longer need to fake it", "id", id, "update_version", followerVersion) + delete(d.emptyVersionLogs, currentServerID) + } } d.dl.Unlock() diff --git a/vault/external_tests/raft/raft_autopilot_test.go b/vault/external_tests/raft/raft_autopilot_test.go index fa94fb3c510e..933c259f184e 100644 --- a/vault/external_tests/raft/raft_autopilot_test.go +++ b/vault/external_tests/raft/raft_autopilot_test.go @@ -41,6 +41,41 @@ func TestRaft_Autopilot_Disable(t *testing.T) { require.Nil(t, nil, state) } +// TestRaft_Autopilot_BinaryVersionPlumbing is an apparently trivial test that +// ensures that the default plumbing in Vault core to configure the binary +// version of the raft library is working. Hopefully this will trivially pass +// from now on, however it would have caught a regression in the past! +func TestRaft_Autopilot_BinaryVersionPlumbing(t *testing.T) { + t.Parallel() + + coreCfg, clusterOpts := raftClusterBuilder(t, &RaftClusterOpts{ + EnableAutopilot: true, + // We need 2 nodes because the code path that regressed was different on a + // standby vs active node so we'd not detect the problem if we only test on + // an active node. + NumCores: 2, + }) + + // Default options should not set EffectiveSDKVersion(Map) which would defeat + // the point of this test by plumbing versions via config. + require.Nil(t, clusterOpts.EffectiveSDKVersionMap) + require.Empty(t, coreCfg.EffectiveSDKVersion) + + c := vault.NewTestCluster(t, coreCfg, &clusterOpts) + defer c.Cleanup() + + // Wait for follower to be perf standby (in Ent, in CE it will only wait for + // active). In either Enterprise or CE case, this should pass if we've plumbed + // the version correctly so it's valuable for both. + testhelpers.WaitForActiveNodeAndStandbys(t, c) + for _, core := range c.Cores { + be := core.UnderlyingRawStorage.(*raft.RaftBackend) + require.Equal(t, version.GetVersion().Version, be.UpgradeVersion(), + "expected raft upgrade version to default to Vault version for core %q", + core.NodeID) + } +} + // TestRaft_Autopilot_Stabilization_And_State verifies that nodes get promoted // to be voters after the stabilization time has elapsed. Also checks that // the autopilot state is Healthy once all nodes are available. diff --git a/vault/raft.go b/vault/raft.go index f621d30b4f00..3aaab7a84745 100644 --- a/vault/raft.go +++ b/vault/raft.go @@ -149,9 +149,10 @@ func (c *Core) startRaftBackend(ctx context.Context) (retErr error) { raftBackend.SetRestoreCallback(c.raftSnapshotRestoreCallback(true, true)) if err := raftBackend.SetupCluster(ctx, raft.SetupOpts{ - TLSKeyring: raftTLS, - ClusterListener: c.getClusterListener(), - StartAsLeader: creating, + TLSKeyring: raftTLS, + ClusterListener: c.getClusterListener(), + StartAsLeader: creating, + EffectiveSDKVersion: c.effectiveSDKVersion, }); err != nil { return err } @@ -309,7 +310,6 @@ func (c *Core) setupRaftActiveNode(ctx context.Context) error { } c.logger.Info("starting raft active node") - raftBackend.SetEffectiveSDKVersion(c.effectiveSDKVersion) c.pendingRaftPeers = &sync.Map{} diff --git a/vault/request_forwarding_rpc.go b/vault/request_forwarding_rpc.go index 83c96cd34b0e..3da6fbba9bc5 100644 --- a/vault/request_forwarding_rpc.go +++ b/vault/request_forwarding_rpc.go @@ -100,6 +100,15 @@ func (s *forwardedRequestRPCServer) Echo(ctx context.Context, in *EchoRequest) ( } if in.RaftAppliedIndex > 0 && len(in.RaftNodeID) > 0 && s.raftFollowerStates != nil { + s.core.logger.Trace("forwarding RPC: echo received", + "node_id", in.RaftNodeID, + "applied_index", in.RaftAppliedIndex, + "term", in.RaftTerm, + "desired_suffrage", in.RaftDesiredSuffrage, + "sdk_version", in.SdkVersion, + "upgrade_version", in.RaftUpgradeVersion, + "redundancy_zone", in.RaftRedundancyZone) + s.raftFollowerStates.Update(&raft.EchoRequestUpdate{ NodeID: in.RaftNodeID, AppliedIndex: in.RaftAppliedIndex,