@@ -90,6 +90,7 @@ func extractHTTPStatusCode(err error) int {
9090//
9191// Retryable status codes:
9292// - 5xx (server errors): 500, 502, 503, 504
93+ // - 529 (Anthropic overloaded)
9394// - 408 (request timeout)
9495//
9596// Non-retryable status codes (skip to next model immediately):
@@ -99,6 +100,8 @@ func isRetryableStatusCode(statusCode int) bool {
99100 switch statusCode {
100101 case 500 , 502 , 503 , 504 : // Server errors
101102 return true
103+ case 529 : // Anthropic overloaded
104+ return true
102105 case 408 : // Request timeout
103106 return true
104107 case 429 : // Rate limit - NOT retryable, skip to next model
@@ -119,6 +122,7 @@ func isRetryableStatusCode(statusCode int) bool {
119122// - Network timeouts
120123// - Temporary network errors
121124// - HTTP 5xx errors (server errors)
125+ // - HTTP 529 (Anthropic overloaded)
122126// - HTTP 408 (request timeout)
123127//
124128// Non-retryable errors (skip to next model in chain immediately):
@@ -180,6 +184,10 @@ func isRetryableModelError(err error) bool {
180184 "gateway timeout" , // Gateway timeout
181185 "overloaded" , // Server overloaded
182186 "overloaded_error" , // Server overloaded
187+ "other side closed" , // Connection closed by peer
188+ "fetch failed" , // Network fetch failure
189+ "reset before headers" , // Connection reset before headers received
190+ "upstream connect" , // Upstream connection error
183191 }
184192
185193 for _ , pattern := range retryablePatterns {
@@ -370,9 +378,10 @@ func getEffectiveCooldown(a *agent.Agent) time.Duration {
370378}
371379
372380// getEffectiveRetries returns the number of retries to use for the agent.
373- // If no retries are explicitly configured (retries == 0) and fallback models
374- // are configured, returns DefaultFallbackRetries to provide sensible retry
375- // behavior out of the box.
381+ // If no retries are explicitly configured (retries == 0), returns
382+ // DefaultFallbackRetries to provide sensible retry behavior out of the box.
383+ // This ensures that transient errors (e.g., Anthropic 529 overloaded) are
384+ // retried even when no fallback models are configured.
376385//
377386// Note: Users who explicitly want 0 retries can set retries: -1 in their config
378387// (though this is an edge case - most users want some retries for resilience).
@@ -382,8 +391,8 @@ func getEffectiveRetries(a *agent.Agent) int {
382391 if retries < 0 {
383392 return 0
384393 }
385- // 0 means "use default" when fallback models are configured
386- if retries == 0 && len ( a . FallbackModels ()) > 0 {
394+ // 0 means "use default" - always provide retries for transient error resilience
395+ if retries == 0 {
387396 return DefaultFallbackRetries
388397 }
389398 return retries
0 commit comments