Skip to content

Commit 796af82

Browse files
authored
Merge pull request #1886 from rumpl/retries
Add retries even for models without fallbacks
2 parents 934a5f6 + 0224351 commit 796af82

2 files changed

Lines changed: 43 additions & 7 deletions

File tree

pkg/runtime/fallback.go

Lines changed: 14 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -90,6 +90,7 @@ func extractHTTPStatusCode(err error) int {
9090
//
9191
// Retryable status codes:
9292
// - 5xx (server errors): 500, 502, 503, 504
93+
// - 529 (Anthropic overloaded)
9394
// - 408 (request timeout)
9495
//
9596
// Non-retryable status codes (skip to next model immediately):
@@ -99,6 +100,8 @@ func isRetryableStatusCode(statusCode int) bool {
99100
switch statusCode {
100101
case 500, 502, 503, 504: // Server errors
101102
return true
103+
case 529: // Anthropic overloaded
104+
return true
102105
case 408: // Request timeout
103106
return true
104107
case 429: // Rate limit - NOT retryable, skip to next model
@@ -119,6 +122,7 @@ func isRetryableStatusCode(statusCode int) bool {
119122
// - Network timeouts
120123
// - Temporary network errors
121124
// - HTTP 5xx errors (server errors)
125+
// - HTTP 529 (Anthropic overloaded)
122126
// - HTTP 408 (request timeout)
123127
//
124128
// Non-retryable errors (skip to next model in chain immediately):
@@ -180,6 +184,10 @@ func isRetryableModelError(err error) bool {
180184
"gateway timeout", // Gateway timeout
181185
"overloaded", // Server overloaded
182186
"overloaded_error", // Server overloaded
187+
"other side closed", // Connection closed by peer
188+
"fetch failed", // Network fetch failure
189+
"reset before headers", // Connection reset before headers received
190+
"upstream connect", // Upstream connection error
183191
}
184192

185193
for _, pattern := range retryablePatterns {
@@ -370,9 +378,10 @@ func getEffectiveCooldown(a *agent.Agent) time.Duration {
370378
}
371379

372380
// getEffectiveRetries returns the number of retries to use for the agent.
373-
// If no retries are explicitly configured (retries == 0) and fallback models
374-
// are configured, returns DefaultFallbackRetries to provide sensible retry
375-
// behavior out of the box.
381+
// If no retries are explicitly configured (retries == 0), returns
382+
// DefaultFallbackRetries to provide sensible retry behavior out of the box.
383+
// This ensures that transient errors (e.g., Anthropic 529 overloaded) are
384+
// retried even when no fallback models are configured.
376385
//
377386
// Note: Users who explicitly want 0 retries can set retries: -1 in their config
378387
// (though this is an edge case - most users want some retries for resilience).
@@ -382,8 +391,8 @@ func getEffectiveRetries(a *agent.Agent) int {
382391
if retries < 0 {
383392
return 0
384393
}
385-
// 0 means "use default" when fallback models are configured
386-
if retries == 0 && len(a.FallbackModels()) > 0 {
394+
// 0 means "use default" - always provide retries for transient error resilience
395+
if retries == 0 {
387396
return DefaultFallbackRetries
388397
}
389398
return retries

pkg/runtime/fallback_test.go

Lines changed: 29 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -176,6 +176,31 @@ func TestIsRetryableModelError(t *testing.T) {
176176
err: errors.New("authentication failed"),
177177
expected: false,
178178
},
179+
{
180+
name: "anthropic overloaded 529",
181+
err: errors.New("529 overloaded"),
182+
expected: true,
183+
},
184+
{
185+
name: "other side closed",
186+
err: errors.New("other side closed the connection"),
187+
expected: true,
188+
},
189+
{
190+
name: "fetch failed",
191+
err: errors.New("fetch failed"),
192+
expected: true,
193+
},
194+
{
195+
name: "reset before headers",
196+
err: errors.New("reset before headers"),
197+
expected: true,
198+
},
199+
{
200+
name: "upstream connect error",
201+
err: errors.New("upstream connect error"),
202+
expected: true,
203+
},
179204
{
180205
name: "unknown error",
181206
err: errors.New("something weird happened"),
@@ -591,6 +616,7 @@ func TestIsRetryableStatusCode(t *testing.T) {
591616
{503, true}, // Service unavailable - retryable
592617
{504, true}, // Gateway timeout - retryable
593618
{408, true}, // Request timeout - retryable
619+
{529, true}, // Anthropic overloaded - retryable
594620
{429, false}, // Rate limit - NOT retryable (skip to next model)
595621
{400, false}, // Bad request - not retryable
596622
{401, false}, // Unauthorized - not retryable
@@ -719,12 +745,13 @@ func TestGetEffectiveRetries(t *testing.T) {
719745
mockModel := &mockProvider{id: "test/model", stream: newStreamBuilder().AddContent("ok").AddStopWithUsage(1, 1).Build()}
720746
mockFallback := &mockProvider{id: "test/fallback", stream: newStreamBuilder().AddContent("ok").AddStopWithUsage(1, 1).Build()}
721747

722-
// Agent with no retries configured and no fallback models should return 0
748+
// Agent with no retries configured and no fallback models should use default
749+
// retries for transient error resilience (e.g., Anthropic 529 overloaded)
723750
agentNoFallback := agent.New("no-fallback", "test",
724751
agent.WithModel(mockModel),
725752
)
726753
retries := getEffectiveRetries(agentNoFallback)
727-
assert.Equal(t, 0, retries, "no fallback models = no retries (nothing to retry to)")
754+
assert.Equal(t, DefaultFallbackRetries, retries, "should use default retries even without fallback models")
728755

729756
// Agent with no retries configured but with fallback models should use default
730757
agentWithFallback := agent.New("with-fallback", "test",

0 commit comments

Comments
 (0)