Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Backport of Fix AP upgrade version issue into release/1.16.x #27364

Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions changelog/27277.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
```release-note:bug
storage/raft (enterprise): Fix a regression introduced in 1.15.8 that causes
autopilot to fail to discover new server versions and so not trigger an upgrade.
```
18 changes: 12 additions & 6 deletions physical/raft/raft.go
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ import (
"github.com/hashicorp/vault/sdk/logical"
"github.com/hashicorp/vault/sdk/physical"
"github.com/hashicorp/vault/vault/cluster"
"github.com/hashicorp/vault/version"
etcdbolt "go.etcd.io/bbolt"
)

Expand Down Expand Up @@ -590,6 +591,7 @@ func NewRaftBackend(conf map[string]string, logger log.Logger) (physical.Backend
failGetInTxn: new(uint32),
raftLogVerifierEnabled: backendConfig.RaftLogVerifierEnabled,
raftLogVerificationInterval: backendConfig.RaftLogVerificationInterval,
effectiveSDKVersion: version.GetVersion().Version,
}, nil
}

Expand Down Expand Up @@ -646,12 +648,6 @@ func (b *RaftBackend) FailGetInTxn(fail bool) {
atomic.StoreUint32(b.failGetInTxn, val)
}

func (b *RaftBackend) SetEffectiveSDKVersion(sdkVersion string) {
b.l.Lock()
b.effectiveSDKVersion = sdkVersion
b.l.Unlock()
}

func (b *RaftBackend) RedundancyZone() string {
b.l.RLock()
defer b.l.RUnlock()
Expand Down Expand Up @@ -1075,6 +1071,11 @@ type SetupOpts struct {
// RecoveryModeConfig is the configuration for the raft cluster in recovery
// mode.
RecoveryModeConfig *raft.Configuration

// EffectiveSDKVersion is typically the version string baked into the binary.
// We pass it in though because it can be overridden in tests or via ENV in
// core.
EffectiveSDKVersion string
}

func (b *RaftBackend) StartRecoveryCluster(ctx context.Context, peer Peer) error {
Expand Down Expand Up @@ -1118,6 +1119,11 @@ func (b *RaftBackend) SetupCluster(ctx context.Context, opts SetupOpts) error {
return errors.New("no local node id configured")
}

if opts.EffectiveSDKVersion != "" {
// Override the SDK version
b.effectiveSDKVersion = opts.EffectiveSDKVersion
}

// Setup the raft config
raftConfig := raft.DefaultConfig()
if err := b.applyConfigSettings(raftConfig); err != nil {
Expand Down
5 changes: 4 additions & 1 deletion physical/raft/raft_autopilot.go
Original file line number Diff line number Diff line change
Expand Up @@ -421,7 +421,10 @@ func (d *Delegate) KnownServers() map[raft.ServerID]*autopilot.Server {
}
followerVersion = leaderVersion
} else {
delete(d.emptyVersionLogs, currentServerID)
if _, ok := d.emptyVersionLogs[currentServerID]; ok {
d.logger.Trace("received non-empty version in heartbeat state. no longer need to fake it", "id", id, "update_version", followerVersion)
delete(d.emptyVersionLogs, currentServerID)
}
}
d.dl.Unlock()

Expand Down
35 changes: 35 additions & 0 deletions vault/external_tests/raft/raft_autopilot_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,41 @@ func TestRaft_Autopilot_Disable(t *testing.T) {
require.Nil(t, nil, state)
}

// TestRaft_Autopilot_BinaryVersionPlumbing is an apparently trivial test that
// ensures that the default plumbing in Vault core to configure the binary
// version of the raft library is working. Hopefully this will trivially pass
// from now on, however it would have caught a regression in the past!
func TestRaft_Autopilot_BinaryVersionPlumbing(t *testing.T) {
t.Parallel()

coreCfg, clusterOpts := raftClusterBuilder(t, &RaftClusterOpts{
EnableAutopilot: true,
// We need 2 nodes because the code path that regressed was different on a
// standby vs active node so we'd not detect the problem if we only test on
// an active node.
NumCores: 2,
})

// Default options should not set EffectiveSDKVersion(Map) which would defeat
// the point of this test by plumbing versions via config.
require.Nil(t, clusterOpts.EffectiveSDKVersionMap)
require.Empty(t, coreCfg.EffectiveSDKVersion)

c := vault.NewTestCluster(t, coreCfg, &clusterOpts)
defer c.Cleanup()

// Wait for follower to be perf standby (in Ent, in CE it will only wait for
// active). In either Enterprise or CE case, this should pass if we've plumbed
// the version correctly so it's valuable for both.
testhelpers.WaitForActiveNodeAndStandbys(t, c)
for _, core := range c.Cores {
be := core.UnderlyingRawStorage.(*raft.RaftBackend)
require.Equal(t, version.GetVersion().Version, be.UpgradeVersion(),
"expected raft upgrade version to default to Vault version for core %q",
core.NodeID)
}
}

// TestRaft_Autopilot_Stabilization_And_State verifies that nodes get promoted
// to be voters after the stabilization time has elapsed. Also checks that
// the autopilot state is Healthy once all nodes are available.
Expand Down
8 changes: 4 additions & 4 deletions vault/raft.go
Original file line number Diff line number Diff line change
Expand Up @@ -149,9 +149,10 @@ func (c *Core) startRaftBackend(ctx context.Context) (retErr error) {
raftBackend.SetRestoreCallback(c.raftSnapshotRestoreCallback(true, true))

if err := raftBackend.SetupCluster(ctx, raft.SetupOpts{
TLSKeyring: raftTLS,
ClusterListener: c.getClusterListener(),
StartAsLeader: creating,
TLSKeyring: raftTLS,
ClusterListener: c.getClusterListener(),
StartAsLeader: creating,
EffectiveSDKVersion: c.effectiveSDKVersion,
}); err != nil {
return err
}
Expand Down Expand Up @@ -309,7 +310,6 @@ func (c *Core) setupRaftActiveNode(ctx context.Context) error {
}

c.logger.Info("starting raft active node")
raftBackend.SetEffectiveSDKVersion(c.effectiveSDKVersion)

c.pendingRaftPeers = &sync.Map{}

Expand Down
9 changes: 9 additions & 0 deletions vault/request_forwarding_rpc.go
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,15 @@ func (s *forwardedRequestRPCServer) Echo(ctx context.Context, in *EchoRequest) (
}

if in.RaftAppliedIndex > 0 && len(in.RaftNodeID) > 0 && s.raftFollowerStates != nil {
s.core.logger.Trace("forwarding RPC: echo received",
"node_id", in.RaftNodeID,
"applied_index", in.RaftAppliedIndex,
"term", in.RaftTerm,
"desired_suffrage", in.RaftDesiredSuffrage,
"sdk_version", in.SdkVersion,
"upgrade_version", in.RaftUpgradeVersion,
"redundancy_zone", in.RaftRedundancyZone)

s.raftFollowerStates.Update(&raft.EchoRequestUpdate{
NodeID: in.RaftNodeID,
AppliedIndex: in.RaftAppliedIndex,
Expand Down
Loading