1+ import { setTimeout } from 'node:timers/promises' ;
2+
13import { MIN_SUPPORTED_SNAPSHOT_READS_WIRE_VERSION } from '../cmap/wire_protocol/constants' ;
24import {
35 isRetryableReadError ,
@@ -26,6 +28,7 @@ import {
2628import type { Topology } from '../sdam/topology' ;
2729import type { ClientSession } from '../sessions' ;
2830import { TimeoutContext } from '../timeout' ;
31+ import { RETRY_COST , TOKEN_REFRESH_RATE } from '../token_bucket' ;
2932import { abortable , maxWireVersion , supportsRetryableWrites } from '../utils' ;
3033import { AggregateOperation } from './aggregate' ;
3134import { AbstractOperation , Aspect } from './operation' ;
@@ -232,11 +235,12 @@ async function tryOperation<T extends AbstractOperation, TResult = ResultTypeFro
232235 session . incrementTransactionNumber ( ) ;
233236 }
234237
235- const maxTries = willRetry ? ( timeoutContext . csotEnabled ( ) ? Infinity : 2 ) : 1 ;
238+ const MAX_ATTEMPTS = 5 ;
239+ const maxTries = willRetry ? MAX_ATTEMPTS : 1 ;
236240 let previousOperationError : MongoError | undefined ;
237241 let previousServer : ServerDescription | undefined ;
238242
239- for ( let tries = 0 ; tries < maxTries ; tries ++ ) {
243+ for ( let attempt = 0 ; attempt < maxTries ; attempt ++ ) {
240244 if ( previousOperationError ) {
241245 if ( hasWriteAspect && previousOperationError . code === MMAPv1_RETRY_WRITES_ERROR_CODE ) {
242246 throw new MongoServerError ( {
@@ -246,14 +250,40 @@ async function tryOperation<T extends AbstractOperation, TResult = ResultTypeFro
246250 } ) ;
247251 }
248252
249- if ( operation . hasAspect ( Aspect . COMMAND_BATCHING ) && ! operation . canRetryWrite ) {
253+ const isRetryableError =
254+ // any command with the RetryableError label is retryable
255+ previousOperationError . hasErrorLabel ( MongoErrorLabel . RetryableError ) ||
256+ // COMMAND_BATCHING commands are retryable depending on the contents of the batches
257+ ( operation . hasAspect ( Aspect . COMMAND_BATCHING ) && ! operation . canRetryWrite ) ||
258+ ( hasWriteAspect && isRetryableWriteError ( previousOperationError ) ) ||
259+ ( hasReadAspect && ! isRetryableReadError ( previousOperationError ) ) ;
260+
261+ if ( ! isRetryableError ) {
250262 throw previousOperationError ;
251263 }
252264
253- if ( hasWriteAspect && ! isRetryableWriteError ( previousOperationError ) )
254- throw previousOperationError ;
265+ const isSystemOverloadError = previousOperationError . hasErrorLabel (
266+ MongoErrorLabel . SystemOverloadError
267+ ) ;
268+ if ( isSystemOverloadError ) {
269+ const delay =
270+ Math . random ( ) *
271+ Math . min (
272+ 10_000 , // MAX_BACKOFF
273+ 100 * 2 ** attempt
274+ ) ;
275+
276+ // short circuit if CSOT would expire while sleeping
277+ if ( timeoutContext . csotEnabled ( ) && timeoutContext . remainingTimeMS < delay ) {
278+ throw previousOperationError ;
279+ }
280+
281+ await setTimeout ( delay , undefined , {
282+ // TODO: handle abort signal here?
283+ } ) ;
284+ }
255285
256- if ( hasReadAspect && ! isRetryableReadError ( previousOperationError ) ) {
286+ if ( ! topology . tokenBucket . consume ( RETRY_COST ) ) {
257287 throw previousOperationError ;
258288 }
259289
@@ -284,19 +314,35 @@ async function tryOperation<T extends AbstractOperation, TResult = ResultTypeFro
284314 operation . server = server ;
285315
286316 try {
287- // If tries > 0 and we are command batching we need to reset the batch.
288- if ( tries > 0 && operation . hasAspect ( Aspect . COMMAND_BATCHING ) ) {
317+ // If attempt > 0 and we are command batching we need to reset the batch.
318+ if ( attempt > 0 && operation . hasAspect ( Aspect . COMMAND_BATCHING ) ) {
289319 operation . resetBatch ( ) ;
290320 }
291321
292322 try {
293323 const result = await server . command ( operation , timeoutContext ) ;
324+
325+ // On success, deposit the refresh rate into the bucket. If the attempt
326+ // was a retry, also deposit the retry cost to allow for another retry.
327+ topology . tokenBucket . deposit ( TOKEN_REFRESH_RATE ) ;
328+ if ( attempt > 0 ) {
329+ topology . tokenBucket . deposit ( RETRY_COST ) ;
330+ }
331+
294332 return operation . handleOk ( result ) ;
295333 } catch ( error ) {
296334 return operation . handleError ( error ) ;
297335 }
298336 } catch ( operationError ) {
299337 if ( ! ( operationError instanceof MongoError ) ) throw operationError ;
338+
339+ // If a retry fails with a non-SystemOverloadError, this indicates that the server
340+ // either was not overloaded OR was overloaded but has recovered enough to let this
341+ // request through. Either way, deposit the retry cost to allow for another retry.
342+ if ( attempt > 0 && ! operationError . hasErrorLabel ( MongoErrorLabel . SystemOverloadError ) ) {
343+ topology . tokenBucket . deposit ( RETRY_COST ) ;
344+ }
345+
300346 if (
301347 previousOperationError != null &&
302348 operationError . hasErrorLabel ( MongoErrorLabel . NoWritesPerformed )
0 commit comments