Skip to content

Commit 66b8cdd

Browse files
author
Mateusz
committed
fix(codex): harden OAuth refresh, auth resilience, 502 stream handling
- Legacy and managed Codex token refresh: retries for transient network errors, longer timeouts, clearer WARNING logs without traceback spam; ManagedOAuthRefreshError.from_transient_network for credential logging. - Auth handler: do not permanently disable scoped instances when instance id contains codex but OAuth heuristics miss. - Streaming: surface 500/502 pre-output BackendError like 503/504 so gateway failures are not misrouted to empty-stream recovery prompts. - Tests for refresh paths, auth handler, and should_surface_pre_output_error. Made-with: Cursor
1 parent a77212c commit 66b8cdd

8 files changed

Lines changed: 410 additions & 60 deletions

File tree

src/connectors/openai_codex/credentials.py

Lines changed: 53 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,11 @@
7171

7272
logger = logging.getLogger(__name__)
7373

74+
# Legacy OAuth token endpoint: transient network issues are common; retry before failing refresh.
75+
_LEGACY_OAUTH_REFRESH_MAX_ATTEMPTS = 3
76+
_LEGACY_OAUTH_REFRESH_BASE_DELAY_S = 0.5
77+
_LEGACY_OAUTH_TOKEN_TIMEOUT_S = 30.0
78+
7479

7580
class OpenAICredentialsFileHandler(FileSystemEventHandler):
7681
"""File watcher handler for OpenAI Codex credentials."""
@@ -882,7 +887,13 @@ async def _refresh_managed_access_token(self) -> bool:
882887
try:
883888
updated = await self._managed_refresh.force_refresh(account)
884889
except ManagedOAuthRefreshError as exc:
885-
if logger.isEnabledFor(logging.WARNING):
890+
if exc.from_transient_network:
891+
logger.warning(
892+
"Managed OAuth token refresh failed for account %s after retries: %s",
893+
exc.account_id,
894+
exc,
895+
)
896+
else:
886897
logger.warning(
887898
"Managed OAuth token refresh failed for account %s: %s",
888899
exc.account_id,
@@ -933,17 +944,47 @@ async def _refresh_legacy_access_token(self) -> bool:
933944
"scope": "openid profile email",
934945
}
935946

936-
try:
937-
response = await self._http_client.post(
938-
OPENAI_OAUTH_TOKEN_URL,
939-
json=payload,
940-
headers={"Content-Type": "application/json"},
941-
timeout=15.0,
942-
)
943-
except httpx.HTTPError as exc:
944-
logger.warning(
945-
"Failed to refresh OpenAI Codex token: %s", exc, exc_info=True
946-
)
947+
response: httpx.Response | None = None
948+
for attempt in range(_LEGACY_OAUTH_REFRESH_MAX_ATTEMPTS):
949+
try:
950+
response = await self._http_client.post(
951+
OPENAI_OAUTH_TOKEN_URL,
952+
json=payload,
953+
headers={"Content-Type": "application/json"},
954+
timeout=_LEGACY_OAUTH_TOKEN_TIMEOUT_S,
955+
)
956+
break
957+
except (
958+
httpx.TimeoutException,
959+
httpx.ConnectError,
960+
httpx.ReadError,
961+
) as exc:
962+
if attempt + 1 < _LEGACY_OAUTH_REFRESH_MAX_ATTEMPTS:
963+
delay = _LEGACY_OAUTH_REFRESH_BASE_DELAY_S * (2**attempt)
964+
logger.info(
965+
"Transient error during OpenAI Codex token refresh (%s, attempt %d/%d); "
966+
"retrying in %.1fs",
967+
exc.__class__.__name__,
968+
attempt + 1,
969+
_LEGACY_OAUTH_REFRESH_MAX_ATTEMPTS,
970+
delay,
971+
)
972+
await asyncio.sleep(delay)
973+
continue
974+
logger.warning(
975+
"Failed to refresh OpenAI Codex token after %d attempts (%s): %s",
976+
_LEGACY_OAUTH_REFRESH_MAX_ATTEMPTS,
977+
exc.__class__.__name__,
978+
exc,
979+
)
980+
return False
981+
except httpx.HTTPError as exc:
982+
logger.warning(
983+
"Failed to refresh OpenAI Codex token: %s", exc, exc_info=True
984+
)
985+
return False
986+
987+
if response is None:
947988
return False
948989

949990
if response.status_code >= 400:

src/connectors/openai_codex/managed_oauth_refresh.py

Lines changed: 28 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,17 @@
2222

2323
logger = logging.getLogger(__name__)
2424

25+
# Match legacy credential refresh: slow token endpoint + transient errors are common.
26+
_MANAGED_OAUTH_TOKEN_TIMEOUT_S = 30.0
27+
28+
29+
def _is_transient_network_error(exc: BaseException) -> bool:
30+
"""True for transport-level failures that are worth retrying (like legacy OAuth POST)."""
31+
return isinstance(
32+
exc,
33+
httpx.TimeoutException | httpx.ConnectError | httpx.ReadError,
34+
)
35+
2536

2637
class ManagedOAuthRefreshError(RuntimeError):
2738
"""Raised when token refresh fails for a managed account."""
@@ -32,10 +43,12 @@ def __init__(
3243
*,
3344
account_id: str,
3445
needs_reauth: bool = False,
46+
from_transient_network: bool = False,
3547
) -> None:
3648
super().__init__(message)
3749
self.account_id = account_id
3850
self.needs_reauth = needs_reauth
51+
self.from_transient_network = from_transient_network
3952

4053

4154
class ManagedOAuthRefreshService:
@@ -89,10 +102,20 @@ async def _refresh_with_retry(
89102
raise
90103
except Exception as exc:
91104
last_error = exc
92-
if attempt >= self._max_retries - 1:
105+
if attempt + 1 >= self._max_retries:
93106
break
94107
delay = self._base_delay_seconds * (2**attempt)
95-
if logger.isEnabledFor(logging.DEBUG):
108+
if _is_transient_network_error(exc):
109+
logger.info(
110+
"Transient error during managed OAuth token refresh (%s, attempt %d/%d) "
111+
"for account %s; retrying in %.1fs",
112+
exc.__class__.__name__,
113+
attempt + 1,
114+
self._max_retries,
115+
account.account_id,
116+
delay,
117+
)
118+
elif logger.isEnabledFor(logging.DEBUG):
96119
logger.debug(
97120
"Managed OAuth refresh attempt %d/%d failed for %s; retrying in %.1fs",
98121
attempt + 1,
@@ -103,9 +126,11 @@ async def _refresh_with_retry(
103126
)
104127
await asyncio.sleep(delay)
105128

129+
assert last_error is not None
106130
raise ManagedOAuthRefreshError(
107131
f"Managed OAuth refresh failed after {self._max_retries} attempts: {last_error}",
108132
account_id=account.account_id,
133+
from_transient_network=_is_transient_network_error(last_error),
109134
)
110135

111136
async def _refresh_once(self, account: ManagedOAuthAccount) -> ManagedOAuthAccount:
@@ -139,7 +164,7 @@ async def _execute_refresh(
139164
OPENAI_OAUTH_TOKEN_URL,
140165
json=payload,
141166
headers={"Content-Type": "application/json"},
142-
timeout=15.0,
167+
timeout=_MANAGED_OAUTH_TOKEN_TIMEOUT_S,
143168
)
144169

145170
if response.status_code == 400:

src/core/services/backend_request_manager/streaming_response_handler.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -573,7 +573,9 @@ def _should_surface_pre_output_error(stream_error: Exception) -> bool:
573573
# Surface all 4xx client errors immediately, retrying them is useless
574574
if 400 <= status_code < 500:
575575
return True
576-
if status_code in {503, 504}:
576+
# Common upstream / gateway failures: must not fall through to
577+
# "empty model output" recovery (misleading user prompt append).
578+
if status_code in {500, 502, 503, 504}:
577579
return True
578580

579581
details = getattr(stream_error, "details", None)
@@ -582,7 +584,7 @@ def _should_surface_pre_output_error(stream_error: Exception) -> bool:
582584
if isinstance(details_status, int):
583585
if 400 <= details_status < 500:
584586
return True
585-
if details_status in {503, 504}:
587+
if details_status in {500, 502, 503, 504}:
586588
return True
587589

588590
return False

src/core/services/resilience/handlers/auth_error_handler.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,7 @@ def _do_handle(self, context: ErrorContext) -> ResilienceAction:
121121
context.extra.get("is_personal_backend") is True
122122
or "oauth" in backend_type
123123
or "oauth" in instance_id_lower
124+
or "codex" in instance_id_lower
124125
):
125126
return ResilienceAction(
126127
type=ActionType.PROCEED,

0 commit comments

Comments
 (0)