Skip to content

Commit 4b91931

Browse files
authored
Merge pull request #1930 from tyrielv/tyrielv/oom-circuit-breaker-timeout
Add circuit breaker, connection pool timeout, and contention telemetry
2 parents cddda1c + 6a5d79b commit 4b91931

4 files changed

Lines changed: 244 additions & 3 deletions

File tree

GVFS/GVFS.Common/Http/HttpRequestor.cs

Lines changed: 28 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,9 @@ namespace GVFS.Common.Http
1717
{
1818
public abstract class HttpRequestor : IDisposable
1919
{
20+
private const int ConnectionPoolWaitTimeoutMs = 30_000;
21+
private const int ConnectionPoolContentionThresholdMs = 100;
22+
2023
private static long requestCount = 0;
2124
private static SemaphoreSlim availableConnections;
2225

@@ -126,8 +129,30 @@ protected GitEndPointResponseData SendRequest(
126129
responseMetadata.Add("availableConnections", availableConnections.CurrentCount);
127130

128131
Stopwatch requestStopwatch = Stopwatch.StartNew();
129-
availableConnections.Wait(cancellationToken);
130-
TimeSpan connectionWaitTime = requestStopwatch.Elapsed;
132+
133+
if (!availableConnections.Wait(ConnectionPoolWaitTimeoutMs, cancellationToken))
134+
{
135+
TimeSpan connectionWaitTime = requestStopwatch.Elapsed;
136+
responseMetadata.Add("connectionWaitTimeMS", $"{connectionWaitTime.TotalMilliseconds:F4}");
137+
this.Tracer.RelatedWarning(responseMetadata, "SendRequest: Connection pool exhausted, all connections busy");
138+
139+
return new GitEndPointResponseData(
140+
HttpStatusCode.ServiceUnavailable,
141+
new GitObjectsHttpException(HttpStatusCode.ServiceUnavailable, "Connection pool exhausted - all connections busy"),
142+
shouldRetry: true,
143+
message: null,
144+
onResponseDisposed: null);
145+
}
146+
147+
TimeSpan connectionWaitTimeElapsed = requestStopwatch.Elapsed;
148+
if (connectionWaitTimeElapsed.TotalMilliseconds > ConnectionPoolContentionThresholdMs)
149+
{
150+
EventMetadata contentionMetadata = new EventMetadata();
151+
contentionMetadata.Add("RequestId", requestId);
152+
contentionMetadata.Add("availableConnections", availableConnections.CurrentCount);
153+
contentionMetadata.Add("connectionWaitTimeMS", $"{connectionWaitTimeElapsed.TotalMilliseconds:F4}");
154+
this.Tracer.RelatedWarning(contentionMetadata, "SendRequest: Connection pool contention detected");
155+
}
131156

132157
TimeSpan responseWaitTime = default(TimeSpan);
133158
GitEndPointResponseData gitEndPointResponseData = null;
@@ -248,7 +273,7 @@ protected GitEndPointResponseData SendRequest(
248273
}
249274
finally
250275
{
251-
responseMetadata.Add("connectionWaitTimeMS", $"{connectionWaitTime.TotalMilliseconds:F4}");
276+
responseMetadata.Add("connectionWaitTimeMS", $"{connectionWaitTimeElapsed.TotalMilliseconds:F4}");
252277
responseMetadata.Add("responseWaitTimeMS", $"{responseWaitTime.TotalMilliseconds:F4}");
253278

254279
this.Tracer.RelatedEvent(EventLevel.Informational, "NetworkResponse", responseMetadata);
Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
using System;
using System.Diagnostics;
using System.Threading;

namespace GVFS.Common
{
    /// <summary>
    /// Global circuit breaker for retry operations. When too many consecutive failures
    /// occur (e.g., during system-wide resource exhaustion), the circuit opens and
    /// subsequent retry attempts fail fast instead of consuming connections and adding
    /// backoff delays that worsen the resource pressure.
    /// </summary>
    public static class RetryCircuitBreaker
    {
        public const int DefaultFailureThreshold = 15;
        public const int DefaultCooldownMs = 30_000;

        private static int failureThreshold = DefaultFailureThreshold;
        private static int cooldownMs = DefaultCooldownMs;
        private static int consecutiveFailures = 0;

        // Monotonic Stopwatch timestamp of the moment the circuit last opened.
        // Stopwatch.GetTimestamp() is used instead of DateTime.UtcNow.Ticks so that
        // system clock adjustments (NTP sync, manual changes) cannot lengthen or
        // shorten the cooldown window.
        private static long circuitOpenedAtTimestamp = 0;

        /// <summary>
        /// True while the circuit is open: the consecutive-failure count has reached
        /// the threshold AND the cooldown window since the circuit opened has not
        /// yet elapsed. Once the cooldown expires the circuit reads as closed again,
        /// but the failure count is NOT reset — a single further failure re-opens it.
        /// </summary>
        public static bool IsOpen
        {
            get
            {
                if (Volatile.Read(ref consecutiveFailures) < Volatile.Read(ref failureThreshold))
                {
                    return false;
                }

                long openedAt = Volatile.Read(ref circuitOpenedAtTimestamp);

                // Convert the configured cooldown (ms) into Stopwatch ticks.
                // int * long stays well inside long range for any sane cooldown.
                long cooldownTicks = (Volatile.Read(ref cooldownMs) * Stopwatch.Frequency) / 1000;
                return (Stopwatch.GetTimestamp() - openedAt) < cooldownTicks;
            }
        }

        /// <summary>
        /// Current count of consecutive recorded failures (for telemetry/tests).
        /// </summary>
        public static int ConsecutiveFailures => Volatile.Read(ref consecutiveFailures);

        /// <summary>
        /// Records a successful operation, zeroing the consecutive-failure count.
        /// </summary>
        public static void RecordSuccess()
        {
            Interlocked.Exchange(ref consecutiveFailures, 0);
        }

        /// <summary>
        /// Records a failed operation. Every failure at or above the threshold
        /// re-stamps the open time, so the open window keeps extending while
        /// failures continue.
        /// </summary>
        public static void RecordFailure()
        {
            int failures = Interlocked.Increment(ref consecutiveFailures);
            if (failures >= Volatile.Read(ref failureThreshold))
            {
                Volatile.Write(ref circuitOpenedAtTimestamp, Stopwatch.GetTimestamp());
            }
        }

        /// <summary>
        /// Resets the circuit breaker to its initial state. Intended for testing.
        /// </summary>
        public static void Reset()
        {
            Volatile.Write(ref consecutiveFailures, 0);
            Volatile.Write(ref circuitOpenedAtTimestamp, 0);
            Volatile.Write(ref failureThreshold, DefaultFailureThreshold);
            Volatile.Write(ref cooldownMs, DefaultCooldownMs);
        }

        /// <summary>
        /// Configures the circuit breaker thresholds. Intended for testing.
        /// </summary>
        public static void Configure(int threshold, int cooldownMilliseconds)
        {
            Volatile.Write(ref failureThreshold, threshold);
            Volatile.Write(ref cooldownMs, cooldownMilliseconds);
        }
    }
}

GVFS/GVFS.Common/RetryWrapper.cs

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,24 @@ public static Action<ErrorEventArgs> StandardErrorHandler(ITracer tracer, long r
6060

6161
public InvocationResult Invoke(Func<int, CallbackResult> toInvoke)
6262
{
63+
// NOTE: Cascade risk — connection pool timeouts (HttpRequestor returns
64+
// ServiceUnavailable when the semaphore wait expires) flow through here
65+
// as callback errors with shouldRetry=true and count toward the circuit
66+
// breaker. Under sustained pool exhaustion, 15 timeouts can trip the
67+
// breaker and fail-fast ALL retry operations for 30 seconds — including
68+
// requests that might have succeeded. In practice, request coalescing
69+
// (GVFSGitObjects) and the larger pool size drastically reduce the
70+
// likelihood of sustained pool exhaustion. If telemetry shows this
71+
// cascade occurring, consider excluding local resource pressure (pool
72+
// timeouts) from circuit breaker failure counts.
73+
if (RetryCircuitBreaker.IsOpen)
74+
{
75+
RetryableException circuitOpenError = new RetryableException(
76+
"Circuit breaker is open - too many consecutive failures. Fast-failing to prevent resource exhaustion.");
77+
this.OnFailure(new ErrorEventArgs(circuitOpenError, tryCount: 1, willRetry: false));
78+
return new InvocationResult(1, circuitOpenError);
79+
}
80+
6381
// Use 1-based counting. This makes reporting look a lot nicer and saves a lot of +1s
6482
for (int tryCount = 1; tryCount <= this.maxAttempts; ++tryCount)
6583
{
@@ -70,13 +88,19 @@ public InvocationResult Invoke(Func<int, CallbackResult> toInvoke)
7088
CallbackResult result = toInvoke(tryCount);
7189
if (result.HasErrors)
7290
{
91+
if (result.ShouldRetry)
92+
{
93+
RetryCircuitBreaker.RecordFailure();
94+
}
95+
7396
if (!this.ShouldRetry(tryCount, null, result))
7497
{
7598
return new InvocationResult(tryCount, result.Error, result.Result);
7699
}
77100
}
78101
else
79102
{
103+
RetryCircuitBreaker.RecordSuccess();
80104
return new InvocationResult(tryCount, true, result.Result);
81105
}
82106
}
@@ -92,6 +116,7 @@ e is AggregateException
92116
throw;
93117
}
94118

119+
RetryCircuitBreaker.RecordFailure();
95120
if (!this.ShouldRetry(tryCount, exceptionToReport, null))
96121
{
97122
return new InvocationResult(tryCount, exceptionToReport);

GVFS/GVFS.UnitTests/Common/RetryWrapperTests.cs

Lines changed: 119 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,12 @@ namespace GVFS.UnitTests.Common
1212
[TestFixture]
1313
public class RetryWrapperTests
1414
{
15+
[SetUp]
16+
public void SetUp()
17+
{
18+
RetryCircuitBreaker.Reset();
19+
}
20+
1521
[TestCase]
1622
[Category(CategoryConstants.ExceptionExpected)]
1723
public void WillRetryOnIOException()
@@ -233,5 +239,118 @@ public void WillRetryWhenRequested()
233239
actualTries.ShouldEqual(ExpectedTries);
234240
actualFailures.ShouldEqual(ExpectedFailures);
235241
}
242+
243+
[TestCase]
[Category(CategoryConstants.ExceptionExpected)]
public void CircuitBreakerOpensAfterConsecutiveFailures()
{
    const int Threshold = 5;
    const int CooldownMs = 5000;
    RetryCircuitBreaker.Configure(Threshold, CooldownMs);

    // Trip the breaker: each single-attempt wrapper records one retryable failure.
    int failuresInjected = 0;
    while (failuresInjected < Threshold)
    {
        RetryWrapper<bool> failingWrapper = new RetryWrapper<bool>(1, CancellationToken.None, exponentialBackoffBase: 0);
        failingWrapper.Invoke(tryCount => throw new IOException("simulated failure"));
        failuresInjected++;
    }

    RetryCircuitBreaker.IsOpen.ShouldBeTrue("Circuit breaker should be open after threshold failures");

    // With the circuit open, Invoke must fail fast without ever running the callback.
    int timesCallbackRan = 0;
    RetryWrapper<bool> fastFailWrapper = new RetryWrapper<bool>(5, CancellationToken.None, exponentialBackoffBase: 0);
    RetryWrapper<bool>.InvocationResult outcome = fastFailWrapper.Invoke(
        tryCount =>
        {
            timesCallbackRan++;
            return new RetryWrapper<bool>.CallbackResult(true);
        });

    outcome.Succeeded.ShouldEqual(false);
    timesCallbackRan.ShouldEqual(0);
}
273+
274+
[TestCase]
public void CircuitBreakerResetsOnSuccess()
{
    const int Threshold = 3;
    RetryCircuitBreaker.Configure(Threshold, 30_000);

    // Stay one failure short of tripping the breaker.
    for (int failureCount = 1; failureCount < Threshold; failureCount++)
    {
        RetryCircuitBreaker.RecordFailure();
    }

    RetryCircuitBreaker.IsOpen.ShouldBeFalse("Circuit should still be closed below threshold");

    // One successful Invoke zeroes the consecutive-failure counter.
    RetryWrapper<bool> succeedingWrapper = new RetryWrapper<bool>(1, CancellationToken.None, exponentialBackoffBase: 0);
    succeedingWrapper.Invoke(tryCount => new RetryWrapper<bool>.CallbackResult(true));

    RetryCircuitBreaker.ConsecutiveFailures.ShouldEqual(0);

    // After the reset, another below-threshold burst must not open the circuit.
    for (int failureCount = 1; failureCount < Threshold; failureCount++)
    {
        RetryCircuitBreaker.RecordFailure();
    }

    RetryCircuitBreaker.IsOpen.ShouldBeFalse("Circuit should still be closed after reset");
}
302+
303+
[TestCase]
public void CircuitBreakerIgnoresNonRetryableErrors()
{
    const int Threshold = 3;
    RetryCircuitBreaker.Configure(Threshold, 30_000);

    // Non-retryable failures (client errors such as 404/400) must never count
    // toward the breaker, no matter how many of them occur in a row.
    int remainingAttempts = Threshold + 5;
    while (remainingAttempts-- > 0)
    {
        RetryWrapper<bool> nonRetryableWrapper = new RetryWrapper<bool>(1, CancellationToken.None, exponentialBackoffBase: 0);
        nonRetryableWrapper.Invoke(tryCount => new RetryWrapper<bool>.CallbackResult(new Exception("404 Not Found"), shouldRetry: false));
    }

    RetryCircuitBreaker.IsOpen.ShouldBeFalse("Non-retryable errors should not trip the circuit breaker");
    RetryCircuitBreaker.ConsecutiveFailures.ShouldEqual(0);
}
319+
320+
[TestCase]
[Category(CategoryConstants.ExceptionExpected)]
public void CircuitBreakerClosesAfterCooldown()
{
    const int Threshold = 3;
    const int CooldownMs = 100; // keep the test fast with a very short cooldown

    RetryCircuitBreaker.Configure(Threshold, CooldownMs);

    // Open the circuit with a burst of retryable (IOException) failures.
    for (int failureCount = 0; failureCount < Threshold; failureCount++)
    {
        RetryWrapper<bool> failingWrapper = new RetryWrapper<bool>(1, CancellationToken.None, exponentialBackoffBase: 0);
        failingWrapper.Invoke(tryCount => throw new IOException("simulated failure"));
    }

    RetryCircuitBreaker.IsOpen.ShouldBeTrue("Circuit should be open");

    // Let the cooldown window elapse, with margin for timer granularity.
    Thread.Sleep(CooldownMs + 50);

    RetryCircuitBreaker.IsOpen.ShouldBeFalse("Circuit should be closed after cooldown");

    // A closed circuit lets the callback run and succeed again.
    int timesCallbackRan = 0;
    RetryWrapper<bool> recoveredWrapper = new RetryWrapper<bool>(1, CancellationToken.None, exponentialBackoffBase: 0);
    RetryWrapper<bool>.InvocationResult outcome = recoveredWrapper.Invoke(
        tryCount =>
        {
            timesCallbackRan++;
            return new RetryWrapper<bool>.CallbackResult(true);
        });

    outcome.Succeeded.ShouldEqual(true);
    timesCallbackRan.ShouldEqual(1);
}
236355
}
237356
}

0 commit comments

Comments
 (0)