Skip to content

Commit 561dd23

Browse files
RUBY-3803 Fix retries when an overload error is encountered (#3021)
1 parent 58f7791 commit 561dd23

5 files changed

Lines changed: 279 additions & 4 deletions

File tree

lib/mongo/retryable/read_worker.rb

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -332,13 +332,19 @@ def retry_read(original_error, session, server_selector, context: nil, failed_se
332332
# Each retry sleeps with jittered backoff, respects MAX_RETRIES,
333333
# and consumes a token from the bucket when adaptive retries
334334
# are enabled.
335+
#
336+
# Per the client-backpressure spec, backoff is applied if and only
337+
# if the error triggering the retry is an overload error. Non-overload
338+
# retryable errors that occur within this loop are retried immediately
339+
# (without backoff) but still count toward MAX_RETRIES.
335340
def overload_read_retry(last_error, session, server_selector, context, failed_server, error_count:)
341+
last_was_overload = true
336342
loop do
337-
delay = retry_policy.backoff_delay(error_count)
343+
delay = last_was_overload ? retry_policy.backoff_delay(error_count) : 0
338344
raise last_error unless retry_policy.should_retry_overload?(error_count, delay, context: context)
339345

340346
log_retry(last_error, message: 'Read retry (overload backoff)')
341-
sleep(delay)
347+
sleep(delay) if last_was_overload
342348

343349
begin
344350
server = select_server(
@@ -363,6 +369,7 @@ def overload_read_retry(last_error, session, server_selector, context, failed_se
363369
is_overload = retryable_overload_error?(e)
364370
raise e unless is_overload || is_retryable_exception?(e) || e.write_retryable?
365371

372+
last_was_overload = is_overload
366373
failed_server = server
367374
last_error = e
368375
end

lib/mongo/retryable/write_worker.rb

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -372,19 +372,25 @@ def retry_write(original_error, txn_num, context:, failed_server: nil, &block)
372372
end
373373

374374
# Retry loop for overload write errors with exponential backoff.
375+
#
376+
# Per the client-backpressure spec, backoff is applied if and only
377+
# if the error triggering the retry is an overload error. Non-overload
378+
# retryable errors that occur within this loop are retried immediately
379+
# (without backoff) but still count toward MAX_RETRIES.
375380
def overload_write_retry(last_error, session, txn_num, context:, failed_server:, error_count:,
376381
was_starting_transaction: false)
377382
# Track the error to return per the NoWritesPerformed spec rules:
378383
# - first error is always saved
379384
# - only update when a new error does NOT have NoWritesPerformed
380385
error_to_raise = last_error
386+
last_was_overload = true
381387

382388
loop do
383-
delay = retry_policy.backoff_delay(error_count)
389+
delay = last_was_overload ? retry_policy.backoff_delay(error_count) : 0
384390
raise error_to_raise unless retry_policy.should_retry_overload?(error_count, delay, context: context)
385391

386392
log_retry(last_error, message: 'Write retry (overload backoff)')
387-
sleep(delay)
393+
sleep(delay) if last_was_overload
388394

389395
begin
390396
server = select_server(
@@ -423,6 +429,7 @@ def overload_write_retry(last_error, session, txn_num, context:, failed_server:,
423429
unless e.respond_to?(:label?) && e.label?('NoWritesPerformed')
424430
error_to_raise = e
425431
end
432+
last_was_overload = is_overload
426433
context = context.with(overload_only_retry: false) unless is_overload
427434
failed_server = server
428435
last_error = e
Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,88 @@
1+
# frozen_string_literal: true
2+
3+
require 'spec_helper'
4+
5+
# Client Backpressure Prose Tests for backoff behavior.
6+
#
7+
# Spec reference:
8+
# specifications/source/client-backpressure/tests/README.md
9+
describe 'Client Backpressure backoff prose tests' do
10+
require_topology :replica_set
11+
min_server_version '4.4'
12+
13+
let(:client) do
14+
authorized_client.with(retry_reads: true)
15+
end
16+
17+
let(:admin_client) { client.use(:admin) }
18+
19+
let(:collection) { client['backoff-prose-test'] }
20+
21+
let(:subscriber) { Mrss::EventSubscriber.new }
22+
23+
before do
24+
# Inflate BASE_BACKOFF so any accidental backoff is clearly visible
25+
# through timing. Without backoff a retry completes in milliseconds;
26+
# with (jittered) backoff each retry could take up to several seconds.
27+
stub_const('Mongo::Retryable::Backpressure::BASE_BACKOFF', 5.0)
28+
end
29+
30+
after do
31+
admin_client.command(configureFailPoint: 'failCommand', mode: 'off')
32+
rescue Mongo::Error
33+
# Ignore cleanup failures.
34+
end
35+
36+
# -------------------------------------------------------------------------
37+
# Test 4: Backoff is applied if and only if the error is an
38+
# overload error (mixed overload + non-overload in the overload loop)
39+
# -------------------------------------------------------------------------
40+
describe 'Test 4: backoff applied only for overload errors in overload retry loop' do
41+
it 'applies backoff for the overload error but not for subsequent non-overload errors' do
42+
# Configure first fail point: overload error, fires once.
43+
admin_client.command(
44+
configureFailPoint: 'failCommand',
45+
mode: { times: 1 },
46+
data: {
47+
failCommands: %w[find],
48+
errorCode: 91,
49+
errorLabels: %w[RetryableError SystemOverloadedError]
50+
}
51+
)
52+
53+
# Via CommandFailedEvent, switch to a non-overload retryable error.
54+
failpoint_set = false
55+
client.subscribe(Mongo::Monitoring::COMMAND, subscriber)
56+
57+
allow(subscriber).to receive(:failed).and_wrap_original do |m, event|
58+
m.call(event)
59+
if !failpoint_set && event.command_name == 'find'
60+
failpoint_set = true
61+
admin_client.command(
62+
configureFailPoint: 'failCommand',
63+
mode: 'alwaysOn',
64+
data: {
65+
failCommands: %w[find],
66+
errorCode: 91,
67+
errorLabels: %w[RetryableError]
68+
}
69+
)
70+
end
71+
end
72+
73+
subscriber.clear_events!
74+
75+
start_time = Mongo::Utils.monotonic_time
76+
expect do
77+
collection.find.first
78+
end.to raise_error(Mongo::Error::OperationFailure)
79+
elapsed = Mongo::Utils.monotonic_time - start_time
80+
81+
# With BASE_BACKOFF=5s, correct behavior applies one backoff
82+
# (bounded by BASE_BACKOFF) for the overload error, then retries
83+
# non-overload errors immediately. The elapsed time should stay
84+
# under BASE_BACKOFF plus a small margin for network overhead.
85+
expect(elapsed).to be < Mongo::Retryable::Backpressure::BASE_BACKOFF + 2
86+
end
87+
end
88+
end
Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
1+
# frozen_string_literal: true
2+
3+
require 'spec_helper'
4+
5+
# Retryable Reads Prose Test 1: Test that drivers set the maximum number
6+
# of retries for all retryable read errors when an overload error is
7+
# encountered.
8+
#
9+
# Spec reference:
10+
# specifications/source/retryable-reads/tests/README.md
11+
# "1: Test that drivers set the maximum number of retries for all
12+
# retryable read errors when an overload error is encountered"
13+
describe 'Retryable reads prose test 1: overload retry count' do
14+
require_topology :replica_set
15+
min_server_version '6.0'
16+
17+
let(:client) do
18+
authorized_client.with(retry_reads: true)
19+
end
20+
21+
let(:admin_client) { client.use(:admin) }
22+
23+
let(:collection) { client['overload-retry-count-reads-prose-test'] }
24+
25+
let(:subscriber) { Mrss::EventSubscriber.new }
26+
27+
# MAX_ADAPTIVE_RETRIES in spec terminology; the configured max overload
28+
# retries for this client (defaults to DEFAULT_MAX_RETRIES).
29+
let(:max_adaptive_retries) { client.retry_policy.max_retries }
30+
31+
let(:find_started_events) do
32+
subscriber.started_events.select { |e| e.command_name == 'find' }
33+
end
34+
35+
after do
36+
admin_client.command(configureFailPoint: 'failCommand', mode: 'off')
37+
rescue Mongo::Error
38+
# Ignore cleanup failures.
39+
end
40+
41+
it 'makes MAX_ADAPTIVE_RETRIES + 1 total attempts' do
42+
# Step 2: Configure a fail point for find that fires once with an
43+
# overload error (code 91, labels RetryableError + SystemOverloadedError).
44+
admin_client.command(
45+
configureFailPoint: 'failCommand',
46+
mode: { times: 1 },
47+
data: {
48+
failCommands: %w[find],
49+
errorCode: 91,
50+
errorLabels: %w[RetryableError SystemOverloadedError]
51+
}
52+
)
53+
54+
# Step 3: Via CommandFailedEvent, when the first find error fires,
55+
# configure a second fail point for find with a non-overload retryable
56+
# error (code 91, label RetryableError only), set to alwaysOn.
57+
failpoint_set = false
58+
client.subscribe(Mongo::Monitoring::COMMAND, subscriber)
59+
60+
allow(subscriber).to receive(:failed).and_wrap_original do |m, event|
61+
m.call(event)
62+
if !failpoint_set && event.command_name == 'find'
63+
failpoint_set = true
64+
admin_client.command(
65+
configureFailPoint: 'failCommand',
66+
mode: 'alwaysOn',
67+
data: {
68+
failCommands: %w[find],
69+
errorCode: 91,
70+
errorLabels: %w[RetryableError]
71+
}
72+
)
73+
end
74+
end
75+
76+
# Step 4: Attempt a find. Expect it to fail.
77+
subscriber.clear_events!
78+
expect do
79+
collection.find.first
80+
end.to raise_error(Mongo::Error::OperationFailure)
81+
82+
# Step 5: Assert that MAX_ADAPTIVE_RETRIES + 1 total find commands
83+
# were sent (1 initial attempt + MAX_ADAPTIVE_RETRIES retries).
84+
expect(find_started_events.length).to eq(max_adaptive_retries + 1)
85+
end
86+
end
Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
# frozen_string_literal: true
2+
3+
require 'spec_helper'
4+
5+
# Retryable Writes Prose Test Case 4: Test that drivers set the maximum
6+
# number of retries for all retryable write errors when an overload error
7+
# is encountered.
8+
#
9+
# Spec reference:
10+
# specifications/source/retryable-writes/tests/README.md
11+
# "Case 4: Test that drivers set the maximum number of retries for all
12+
# retryable write errors when an overload error is encountered"
13+
describe 'Retryable writes prose test case 4: overload retry count' do
14+
require_topology :replica_set
15+
min_server_version '6.0'
16+
17+
let(:client) do
18+
authorized_client.with(retry_writes: true)
19+
end
20+
21+
let(:admin_client) { client.use(:admin) }
22+
23+
let(:collection) { client['overload-retry-count-writes-prose-test'] }
24+
25+
let(:subscriber) { Mrss::EventSubscriber.new }
26+
27+
# MAX_ADAPTIVE_RETRIES in spec terminology; the configured max overload
28+
# retries for this client (defaults to DEFAULT_MAX_RETRIES).
29+
let(:max_adaptive_retries) { client.retry_policy.max_retries }
30+
31+
let(:insert_started_events) do
32+
subscriber.started_events.select { |e| e.command_name == 'insert' }
33+
end
34+
35+
after do
36+
admin_client.command(configureFailPoint: 'failCommand', mode: 'off')
37+
rescue Mongo::Error
38+
# Ignore cleanup failures.
39+
end
40+
41+
it 'makes MAX_ADAPTIVE_RETRIES + 1 total attempts' do
42+
# Step 2: Configure a fail point for insert that fires once with an
43+
# overload error (code 91, labels RetryableError + SystemOverloadedError).
44+
admin_client.command(
45+
configureFailPoint: 'failCommand',
46+
mode: { times: 1 },
47+
data: {
48+
failCommands: %w[insert],
49+
errorCode: 91,
50+
errorLabels: %w[RetryableError SystemOverloadedError]
51+
}
52+
)
53+
54+
# Step 3: Via CommandFailedEvent, when the first insert error fires,
55+
# configure a second fail point for insert with a non-overload retryable
56+
# write error (code 91, labels RetryableError + RetryableWriteError),
57+
# set to alwaysOn.
58+
failpoint_set = false
59+
client.subscribe(Mongo::Monitoring::COMMAND, subscriber)
60+
61+
allow(subscriber).to receive(:failed).and_wrap_original do |m, event|
62+
m.call(event)
63+
if !failpoint_set && event.command_name == 'insert'
64+
failpoint_set = true
65+
admin_client.command(
66+
configureFailPoint: 'failCommand',
67+
mode: 'alwaysOn',
68+
data: {
69+
failCommands: %w[insert],
70+
errorCode: 91,
71+
errorLabels: %w[RetryableError RetryableWriteError]
72+
}
73+
)
74+
end
75+
end
76+
77+
# Step 4: Attempt an insertOne. Expect it to fail.
78+
subscriber.clear_events!
79+
expect do
80+
collection.insert_one(x: 1)
81+
end.to raise_error(Mongo::Error::OperationFailure)
82+
83+
# Step 5: Assert that MAX_ADAPTIVE_RETRIES + 1 total insert commands
84+
# were sent (1 initial attempt + MAX_ADAPTIVE_RETRIES retries).
85+
expect(insert_started_events.length).to eq(max_adaptive_retries + 1)
86+
end
87+
end

0 commit comments

Comments
 (0)