GODRIVER-2101 Direct read/write retries to another mongos if possible #1358
Merged
Changes from all commits (11 commits):
691b976  GODRIVER-2101 Expand test to use pigeonhole principle (prestonvasquez)
354597e  GODRIVER-2101 Direct read/write retries to another mongos if possible (prestonvasquez)
44dd3f4  GODRIVER-2101 Revert unecessary changes (prestonvasquez)
ecea751  GODRIVER-2101 revert changes to collection and cursor (prestonvasquez)
7e74cd0  GORIVER-2101 resolve merge conflict (prestonvasquez)
3e242f8  GODRIVER-2101 Apply opServerSelector (prestonvasquez)
4787e20  GODRIVER-2101 Fix static analysis errors (prestonvasquez)
d189587  GODRIVER-2101 Remove empty line (prestonvasquez)
96c42ef  GODRIVER-2101 Use map 'ok' value (prestonvasquez)
da00d59  Merge branch 'master' into GODRIVER-2101 (prestonvasquez)
943ecbe  GODRIVER-2101 Resolve merge conflict (prestonvasquez)
@@ -322,8 +322,73 @@ func (op Operation) shouldEncrypt() bool {
     return op.Crypt != nil && !op.Crypt.BypassAutoEncryption()
 }
 
+// filterDeprioritizedServers will filter out the server candidates that have
+// been deprioritized by the operation due to failure.
+//
+// The server selector should try to select a server that is not in the
+// deprioritization list. However, if this is not possible (e.g. there are no
+// other healthy servers in the cluster), the selector may return a
+// deprioritized server.
+func filterDeprioritizedServers(candidates, deprioritized []description.Server) []description.Server {
+    if len(deprioritized) == 0 {
+        return candidates
+    }
+
+    dpaSet := make(map[address.Address]*description.Server)
+    for i, srv := range deprioritized {
+        dpaSet[srv.Addr] = &deprioritized[i]
+    }
+
+    allowed := []description.Server{}
+
+    // Iterate over the candidates and append them to the allowed slice if
+    // they are not in the deprioritized list.
+    for _, candidate := range candidates {
+        if srv, ok := dpaSet[candidate.Addr]; !ok || !srv.Equal(candidate) {
+            allowed = append(allowed, candidate)
+        }
+    }
+
+    // If nothing is allowed, then all available servers must have been
+    // deprioritized. In this case, return the candidates list as-is so that the
+    // selector can find a suitable server.
+    if len(allowed) == 0 {
+        return candidates
+    }
+
+    return allowed
+}
+
+// opServerSelector is a wrapper for the server selector that is assigned to the
+// operation. The purpose of this wrapper is to filter candidates with
+// operation-specific logic, such as deprioritizing failing servers.
+type opServerSelector struct {
+    selector             description.ServerSelector
+    deprioritizedServers []description.Server
+}
+
+// SelectServer will filter candidates with operation-specific logic before
+// passing them onto the user-defined or default selector.
+func (oss *opServerSelector) SelectServer(
+    topo description.Topology,
+    candidates []description.Server,
+) ([]description.Server, error) {
+    selectedServers, err := oss.selector.SelectServer(topo, candidates)
+    if err != nil {
+        return nil, err
+    }
+
+    filteredServers := filterDeprioritizedServers(selectedServers, oss.deprioritizedServers)
+
+    return filteredServers, nil
+}
+
 // selectServer handles performing server selection for an operation.
-func (op Operation) selectServer(ctx context.Context, requestID int32) (Server, error) {
+func (op Operation) selectServer(
+    ctx context.Context,
+    requestID int32,
+    deprioritized []description.Server,
+) (Server, error) {
     if err := op.Validate(); err != nil {
         return nil, err
     }

Inline review note on the filteredServers assignment: "This method was designed to add more filters if the need arises in the future."
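To make the filtering rule concrete, here is a minimal standalone sketch of the same behavior. It uses stand-in types rather than the driver's internal description and address packages: prefer candidates that are not deprioritized, but fall back to the full candidate list when every candidate has been deprioritized.

```go
package main

import "fmt"

// server is a stand-in for description.Server, reduced to an address.
type server struct{ addr string }

// filterDeprioritized mirrors filterDeprioritizedServers above: it removes
// deprioritized candidates unless doing so would leave nothing to select.
func filterDeprioritized(candidates, deprioritized []server) []server {
	if len(deprioritized) == 0 {
		return candidates
	}

	skip := make(map[string]bool, len(deprioritized))
	for _, s := range deprioritized {
		skip[s.addr] = true
	}

	allowed := make([]server, 0, len(candidates))
	for _, c := range candidates {
		if !skip[c.addr] {
			allowed = append(allowed, c)
		}
	}

	// Every candidate was deprioritized; return them all so selection can
	// still succeed.
	if len(allowed) == 0 {
		return candidates
	}
	return allowed
}

func main() {
	a := server{addr: "mongos-a:27017"}
	b := server{addr: "mongos-b:27017"}

	// mongos-a just failed, so the retry prefers mongos-b.
	fmt.Println(filterDeprioritized([]server{a, b}, []server{a})) // [{mongos-b:27017}]

	// If the failed mongos is the only candidate, it is still returned.
	fmt.Println(filterDeprioritized([]server{a}, []server{a})) // [{mongos-a:27017}]
}
```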
@@ -340,15 +405,24 @@ func (op Operation) selectServer(ctx context.Context, requestID int32) (Server, error) {
         })
     }
 
+    oss := &opServerSelector{
+        selector:             selector,
+        deprioritizedServers: deprioritized,
+    }
+
     ctx = logger.WithOperationName(ctx, op.Name)
     ctx = logger.WithOperationID(ctx, requestID)
 
-    return op.Deployment.SelectServer(ctx, selector)
+    return op.Deployment.SelectServer(ctx, oss)
 }
 
 // getServerAndConnection should be used to retrieve a Server and Connection to execute an operation.
-func (op Operation) getServerAndConnection(ctx context.Context, requestID int32) (Server, Connection, error) {
-    server, err := op.selectServer(ctx, requestID)
+func (op Operation) getServerAndConnection(
+    ctx context.Context,
+    requestID int32,
+    deprioritized []description.Server,
+) (Server, Connection, error) {
+    server, err := op.selectServer(ctx, requestID, deprioritized)
     if err != nil {
         if op.Client != nil &&
             !(op.Client.Committing || op.Client.Aborting) && op.Client.TransactionRunning() {
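The wiring in selectServer is a small decorator: the operation's configured selector still makes the actual selection decision, and opServerSelector only post-processes its output. A rough sketch of that shape, with a hypothetical one-method interface standing in for description.ServerSelector (names here are illustrative, not the driver's API):

```go
package main

import "fmt"

// selector is a hypothetical stand-in for description.ServerSelector.
type selector interface {
	Select(candidates []string) ([]string, error)
}

// acceptAll stands in for the user-defined or default selector.
type acceptAll struct{}

func (acceptAll) Select(candidates []string) ([]string, error) { return candidates, nil }

// opSelector wraps another selector, then drops a deprioritized address from
// whatever the inner selector picked, falling back to the inner result if
// nothing would remain.
type opSelector struct {
	inner         selector
	deprioritized string
}

func (s opSelector) Select(candidates []string) ([]string, error) {
	picked, err := s.inner.Select(candidates)
	if err != nil {
		return nil, err
	}
	var allowed []string
	for _, c := range picked {
		if c != s.deprioritized {
			allowed = append(allowed, c)
		}
	}
	if len(allowed) == 0 {
		return picked, nil // nothing left after filtering; keep the inner result
	}
	return allowed, nil
}

func main() {
	wrapped := opSelector{inner: acceptAll{}, deprioritized: "mongos-a:27017"}
	fmt.Println(wrapped.Select([]string{"mongos-a:27017", "mongos-b:27017"}))
}
```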
@@ -481,6 +555,11 @@ func (op Operation) Execute(ctx context.Context) error {
     first := true
     currIndex := 0
 
+    // deprioritizedServers are a running list of servers that should be
+    // deprioritized during server selection. Per the specifications, we should
+    // only ever deprioritize the "previous server".
+    var deprioritizedServers []description.Server
+
     // resetForRetry records the error that caused the retry, decrements retries, and resets the
     // retry loop variables to request a new server and a new connection for the next attempt.
     resetForRetry := func(err error) {
@@ -506,11 +585,18 @@ func (op Operation) Execute(ctx context.Context) error {
             }
         }
 
-        // If we got a connection, close it immediately to release pool resources for
-        // subsequent retries.
+        // If we got a connection, close it immediately to release pool resources
+        // for subsequent retries.
         if conn != nil {
+            // If we are dealing with a sharded cluster, then mark the failed server
+            // as "deprioritized".
+            if desc := conn.Description; desc != nil && op.Deployment.Kind() == description.Sharded {
+                deprioritizedServers = []description.Server{conn.Description()}
+            }
+
             conn.Close()
         }
 
         // Set the server and connection to nil to request a new server and connection.
         srvr = nil
         conn = nil
@@ -535,7 +621,7 @@ func (op Operation) Execute(ctx context.Context) error {
 
         // If the server or connection are nil, try to select a new server and get a new connection.
         if srvr == nil || conn == nil {
-            srvr, conn, err = op.getServerAndConnection(ctx, requestID)
+            srvr, conn, err = op.getServerAndConnection(ctx, requestID, deprioritizedServers)
             if err != nil {
                 // If the returned error is retryable and there are retries remaining (negative
                 // retries means retry indefinitely), then retry the operation. Set the server
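Putting the Execute changes together, the retry flow is roughly: when an attempt against a sharded deployment fails, remember the server the failed connection pointed at, and feed that single-element list back into the next round of server selection. A compressed, hypothetical sketch of that control flow (stand-in types and helpers, not the driver's Operation.Execute):

```go
package main

import (
	"errors"
	"fmt"
)

type server struct{ addr string }

// selectServer is a stand-in for op.getServerAndConnection: it prefers any
// candidate that is not deprioritized, but will reuse one if it has to.
func selectServer(candidates, deprioritized []server) server {
	for _, c := range candidates {
		skip := false
		for _, d := range deprioritized {
			if d.addr == c.addr {
				skip = true
			}
		}
		if !skip {
			return c
		}
	}
	return candidates[0]
}

// runCommand pretends that mongos-a is down and everything else is healthy.
func runCommand(s server) error {
	if s.addr == "mongos-a:27017" {
		return errors.New("connection reset")
	}
	return nil
}

func main() {
	mongoses := []server{{"mongos-a:27017"}, {"mongos-b:27017"}}
	retries := 1

	// deprioritized only ever holds the previous (failed) server.
	var deprioritized []server

	for {
		srv := selectServer(mongoses, deprioritized)
		err := runCommand(srv)
		if err == nil {
			fmt.Println("succeeded on", srv.addr)
			return
		}
		if retries == 0 {
			fmt.Println("giving up:", err)
			return
		}
		retries--
		// Sharded cluster: deprioritize the mongos that just failed so the
		// retry is directed at a different one when possible.
		deprioritized = []server{srv}
	}
}
```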
Review comment: Is there a task for updating the retryable reads/writes "deprioritized mongos" behavior to account for multiple retries (i.e. CSOT)? The vast majority of sharded clusters have >2 mongos nodes, so that seems like a questionably useful feature for drivers that support CSOT.
Reply: For now, there is no task to do this. Here are a couple of reasons from discussions with @comandeo: