From 7791c2e848a08f81a509544ad1eb2abc8e560ca5 Mon Sep 17 00:00:00 2001 From: Randy Concepcion Date: Thu, 11 Jun 2026 19:51:04 -0400 Subject: [PATCH 1/9] add chat completion availability counter --- src/mlpa/core/completions.py | 13 ++ src/mlpa/core/errors.py | 20 ++- src/mlpa/core/metrics.py | 12 ++ src/mlpa/core/prometheus_metrics.py | 56 ++++++++ src/tests/unit/test_availability.py | 96 ++++++++++++++ src/tests/unit/test_completions.py | 195 ++++++++++++++++++++++++---- 6 files changed, 367 insertions(+), 25 deletions(-) create mode 100644 src/tests/unit/test_availability.py diff --git a/src/mlpa/core/completions.py b/src/mlpa/core/completions.py index 7a5a8ec..5f40844 100644 --- a/src/mlpa/core/completions.py +++ b/src/mlpa/core/completions.py @@ -20,6 +20,7 @@ from mlpa.core.logger import logger from mlpa.core.metrics import ( extract_tool_names, + record_chat_availability, record_chat_request_rejection, record_completion_latency, record_completion_success, @@ -27,6 +28,7 @@ record_ttft, ) from mlpa.core.prometheus_metrics import ( + AvailabilityReason, PrometheusRejectionReason, PrometheusResult, ) @@ -80,6 +82,7 @@ async def stream_completion( record_request_with_tools(authorized_chat_request) body = _build_litellm_body(authorized_chat_request, stream=True) result = PrometheusResult.ERROR + availability_reason = AvailabilityReason.UPSTREAM_ERROR is_first_token = True prompt_tokens = 0 completion_tokens = 0 @@ -139,6 +142,7 @@ async def _read_next_chunk( if match.log_message: logger.warning(match.log_message) record_chat_request_rejection(authorized_chat_request, match.reason) + availability_reason = match.availability_reason() yield f'data: {{"error": {match.error_code}}}\n\n'.encode() return @@ -237,6 +241,7 @@ async def _read_next_chunk( return if not streaming_started: + availability_reason = AvailabilityReason.EMPTY_RESPONSE yield raise_and_log( RuntimeError("LiteLLM returned an empty response"), True, @@ -256,6 +261,7 @@ async def _read_next_chunk( snapshot=litellm_routing_snapshot, ) result = PrometheusResult.SUCCESS + availability_reason = AvailabilityReason.CLEAN_COMPLETION except (GeneratorExit, asyncio.CancelledError): # Client went away mid-stream: Starlette tears the generator down by # throwing GeneratorExit (or cancelling the task) at the paused @@ -291,9 +297,12 @@ async def _read_next_chunk( if result == PrometheusResult.ERROR and disconnect_event.is_set(): result = PrometheusResult.ABORT logger.info(_client_disconnected_msg) + if result == PrometheusResult.ABORT: + availability_reason = AvailabilityReason.CLIENT_DISCONNECT record_completion_latency( authorized_chat_request, result, time.perf_counter() - start_time ) + record_chat_availability(authorized_chat_request, availability_reason) async def get_completion(authorized_chat_request: AuthorizedChatRequest): @@ -304,6 +313,7 @@ async def get_completion(authorized_chat_request: AuthorizedChatRequest): record_request_with_tools(authorized_chat_request) body = _build_litellm_body(authorized_chat_request, stream=False) result = PrometheusResult.ERROR + availability_reason = AvailabilityReason.UPSTREAM_ERROR logger.debug( f"Starting a non-stream completion using {authorized_chat_request.model}, for user {authorized_chat_request.user}", ) @@ -326,6 +336,7 @@ async def get_completion(authorized_chat_request: AuthorizedChatRequest): if match.log_message: logger.warning(match.log_message) record_chat_request_rejection(authorized_chat_request, match.reason) + availability_reason = match.availability_reason() headers = ( {"Retry-After": match.retry_after} if match.retry_after else None ) @@ -362,6 +373,7 @@ async def get_completion(authorized_chat_request: AuthorizedChatRequest): snapshot=litellm_routing_snapshot, ) result = PrometheusResult.SUCCESS + availability_reason = AvailabilityReason.CLEAN_COMPLETION return data except HTTPException: raise @@ -371,3 +383,4 @@ async def get_completion(authorized_chat_request: AuthorizedChatRequest): record_completion_latency( authorized_chat_request, result, time.perf_counter() - start_time ) + record_chat_availability(authorized_chat_request, availability_reason) diff --git a/src/mlpa/core/errors.py b/src/mlpa/core/errors.py index 50b8a68..8bfc0dc 100644 --- a/src/mlpa/core/errors.py +++ b/src/mlpa/core/errors.py @@ -9,7 +9,7 @@ ERROR_CODE_REQUEST_TOO_LARGE, ERROR_CODE_UPSTREAM_RATE_LIMIT_EXCEEDED, ) -from mlpa.core.prometheus_metrics import PrometheusRejectionReason +from mlpa.core.prometheus_metrics import AvailabilityReason, PrometheusRejectionReason from mlpa.core.utils import ( is_context_window_error, is_invalid_model_name_error, @@ -18,6 +18,15 @@ is_rate_limit_error, ) +_REJECTION_TO_AVAILABILITY_REASON: dict[ + PrometheusRejectionReason, AvailabilityReason +] = { + PrometheusRejectionReason.BUDGET_EXCEEDED: AvailabilityReason.BUDGET_EXCEEDED, + PrometheusRejectionReason.PAYLOAD_TOO_LARGE: AvailabilityReason.PAYLOAD_TOO_LARGE, + PrometheusRejectionReason.INVALID_MODEL_NAME: AvailabilityReason.INVALID_MODEL_NAME, + PrometheusRejectionReason.INVALID_REQUEST: AvailabilityReason.INVALID_REQUEST, +} + @dataclass(frozen=True) class RejectionMatch: @@ -27,6 +36,15 @@ class RejectionMatch: retry_after: str | None = None log_message: str = "" + def availability_reason(self) -> AvailabilityReason: + # classify_upstream_error only ever produces these completion-stage + # rejection reasons; SIGNUP_CAP_EXCEEDED is recorded pre-completion. + if self.reason == PrometheusRejectionReason.RATE_LIMITED: + if self.error_code == ERROR_CODE_UPSTREAM_RATE_LIMIT_EXCEEDED: + return AvailabilityReason.RATE_LIMITED_UPSTREAM + return AvailabilityReason.RATE_LIMITED_OWN + return _REJECTION_TO_AVAILABILITY_REASON[self.reason] + _RATE_LIMIT_REJECTION: dict[int, tuple[PrometheusRejectionReason, str, str]] = { ERROR_CODE_BUDGET_LIMIT_EXCEEDED: ( diff --git a/src/mlpa/core/metrics.py b/src/mlpa/core/metrics.py index 5c54846..110fb4d 100644 --- a/src/mlpa/core/metrics.py +++ b/src/mlpa/core/metrics.py @@ -6,9 +6,11 @@ LitellmRoutingSnapshot, ) from mlpa.core.prometheus_metrics import ( + AvailabilityReason, PrometheusRejectionReason, PrometheusResult, TokenType, + availability_outcome_for, metrics, ) @@ -41,6 +43,16 @@ def record_search_request_rejection( metrics.search_request_rejections.labels(reason=reason, **_search_labels(req)).inc() +def record_chat_availability( + req: AuthorizedChatRequest, reason: AvailabilityReason +) -> None: + metrics.chat_availability.labels( + outcome=availability_outcome_for(reason), + reason=reason, + **_chat_labels(req), + ).inc() + + def record_completion_latency( req: AuthorizedChatRequest, result: PrometheusResult, diff --git a/src/mlpa/core/prometheus_metrics.py b/src/mlpa/core/prometheus_metrics.py index 745503d..6930c4b 100644 --- a/src/mlpa/core/prometheus_metrics.py +++ b/src/mlpa/core/prometheus_metrics.py @@ -19,6 +19,55 @@ class PrometheusRejectionReason(StrEnum): INVALID_REQUEST = "invalid_request" +class AvailabilityOutcome(StrEnum): + SUCCESS = "success" + FAILURE = "failure" + EXCLUDED = "excluded" + ABORT = "abort" + + +class AvailabilityReason(StrEnum): + # Values overlapping PrometheusRejectionReason are kept identical so the + # availability counter and the rejection counter reconcile on shared strings. + # Keep them in sync if a rejection reason is added. + # success + CLEAN_COMPLETION = "clean_completion" + # failure + UPSTREAM_ERROR = "upstream_error" + EMPTY_RESPONSE = "empty_response" + # excluded (policy rejections) + BUDGET_EXCEEDED = "budget_exceeded" + RATE_LIMITED_OWN = "rate_limited_own" + RATE_LIMITED_UPSTREAM = "rate_limited_upstream" + PAYLOAD_TOO_LARGE = "payload_too_large" + INVALID_MODEL_NAME = "invalid_model_name" + INVALID_REQUEST = "invalid_request" + # abort + CLIENT_DISCONNECT = "client_disconnect" + # TODO: add pre-completion reasons (auth_rejected, auth_system_failure, + # signup_cap_exceeded, blocked, provisioning_failure, invalid_purpose, + # invalid_service_type_for_model) when those paths are instrumented. + + +_AVAILABILITY_OUTCOME_BY_REASON: dict[AvailabilityReason, AvailabilityOutcome] = { + AvailabilityReason.CLEAN_COMPLETION: AvailabilityOutcome.SUCCESS, + AvailabilityReason.UPSTREAM_ERROR: AvailabilityOutcome.FAILURE, + AvailabilityReason.EMPTY_RESPONSE: AvailabilityOutcome.FAILURE, + AvailabilityReason.BUDGET_EXCEEDED: AvailabilityOutcome.EXCLUDED, + AvailabilityReason.RATE_LIMITED_OWN: AvailabilityOutcome.EXCLUDED, + AvailabilityReason.RATE_LIMITED_UPSTREAM: AvailabilityOutcome.EXCLUDED, + AvailabilityReason.PAYLOAD_TOO_LARGE: AvailabilityOutcome.EXCLUDED, + AvailabilityReason.INVALID_MODEL_NAME: AvailabilityOutcome.EXCLUDED, + AvailabilityReason.INVALID_REQUEST: AvailabilityOutcome.EXCLUDED, + AvailabilityReason.CLIENT_DISCONNECT: AvailabilityOutcome.ABORT, +} + + +def availability_outcome_for(reason: AvailabilityReason) -> AvailabilityOutcome: + """Pure classifier: the availability outcome is fully determined by the reason.""" + return _AVAILABILITY_OUTCOME_BY_REASON[reason] + + class TokenType(StrEnum): PROMPT = "prompt" COMPLETION = "completion" @@ -113,6 +162,7 @@ class PrometheusMetrics: chat_tool_calls_per_completion: Histogram chat_requests_with_tools: Counter chat_request_rejections: Counter + chat_availability: Counter # search search_latency: Histogram @@ -274,6 +324,12 @@ def build_metrics(registry: CollectorRegistry = REGISTRY) -> PrometheusMetrics: ["reason", "model", "service_type", "purpose"], registry=registry, ), + chat_availability=Counter( + "mlpa_chat_availability_total", + "Interim availability outcomes for chat completions. outcome is success/failure/excluded/abort; reason is the bounded cause. Availability = success / (success + failure).", + ["outcome", "reason", "model", "service_type", "purpose"], + registry=registry, + ), search_latency=Histogram( "mlpa_search_latency_seconds", "Search latency in seconds.", diff --git a/src/tests/unit/test_availability.py b/src/tests/unit/test_availability.py new file mode 100644 index 0000000..ca9898e --- /dev/null +++ b/src/tests/unit/test_availability.py @@ -0,0 +1,96 @@ +import pytest + +from mlpa.core.config import ( + ERROR_CODE_BUDGET_LIMIT_EXCEEDED, + ERROR_CODE_INVALID_MODEL_NAME, + ERROR_CODE_INVALID_REQUEST, + ERROR_CODE_RATE_LIMIT_EXCEEDED, + ERROR_CODE_REQUEST_TOO_LARGE, + ERROR_CODE_UPSTREAM_RATE_LIMIT_EXCEEDED, +) +from mlpa.core.errors import RejectionMatch +from mlpa.core.prometheus_metrics import ( + AvailabilityOutcome, + AvailabilityReason, + PrometheusRejectionReason, + availability_outcome_for, +) + + +def test_every_availability_reason_maps_to_an_outcome(): + """Guard: a new AvailabilityReason cannot ship without an outcome mapping. + + Future pre-completion reasons added to AvailabilityReason must extend the + map too; this fails loudly (KeyError) if the map is not updated alongside the enum. + """ + for reason in AvailabilityReason: + assert isinstance(availability_outcome_for(reason), AvailabilityOutcome) + + +# SIGNUP_CAP_EXCEEDED is recorded pre-completion, not by classify_upstream_error, +# so it is intentionally outside the completion-stage availability mapping. +_PRE_COMPLETION_REJECTION_REASONS = {PrometheusRejectionReason.SIGNUP_CAP_EXCEEDED} + + +def test_every_completion_stage_rejection_reason_maps_to_excluded(): + """Guard: every rejection reason classify_upstream_error can produce must + resolve through availability_reason() to an excluded outcome. + + Iterating the enum (minus the pre-completion reasons) means a newly added + completion-stage rejection reason fails loudly here until it is mapped, or is + explicitly classified as pre-completion. + """ + for reason in PrometheusRejectionReason: + if reason in _PRE_COMPLETION_REJECTION_REASONS: + continue + match = RejectionMatch(reason=reason, error_code=0, http_status=400) + assert ( + availability_outcome_for(match.availability_reason()) + == AvailabilityOutcome.EXCLUDED + ) + + +# Pins the expected availability reason for each completion-stage rejection, +# including the own-vs-upstream rate-limit split keyed on error_code. This fixes +# the exact mappings; the completeness test above guards that the map covers +# every completion-stage rejection reason. +@pytest.mark.parametrize( + ("reason", "error_code", "expected"), + [ + ( + PrometheusRejectionReason.BUDGET_EXCEEDED, + ERROR_CODE_BUDGET_LIMIT_EXCEEDED, + AvailabilityReason.BUDGET_EXCEEDED, + ), + ( + PrometheusRejectionReason.RATE_LIMITED, + ERROR_CODE_RATE_LIMIT_EXCEEDED, + AvailabilityReason.RATE_LIMITED_OWN, + ), + ( + PrometheusRejectionReason.RATE_LIMITED, + ERROR_CODE_UPSTREAM_RATE_LIMIT_EXCEEDED, + AvailabilityReason.RATE_LIMITED_UPSTREAM, + ), + ( + PrometheusRejectionReason.PAYLOAD_TOO_LARGE, + ERROR_CODE_REQUEST_TOO_LARGE, + AvailabilityReason.PAYLOAD_TOO_LARGE, + ), + ( + PrometheusRejectionReason.INVALID_MODEL_NAME, + ERROR_CODE_INVALID_MODEL_NAME, + AvailabilityReason.INVALID_MODEL_NAME, + ), + ( + PrometheusRejectionReason.INVALID_REQUEST, + ERROR_CODE_INVALID_REQUEST, + AvailabilityReason.INVALID_REQUEST, + ), + ], +) +def test_rejection_match_availability_reason(reason, error_code, expected): + match = RejectionMatch(reason=reason, error_code=error_code, http_status=400) + assert match.availability_reason() == expected + # All policy rejections are excluded from the availability ratio. + assert availability_outcome_for(expected) == AvailabilityOutcome.EXCLUDED diff --git a/src/tests/unit/test_completions.py b/src/tests/unit/test_completions.py index fb87613..afa6a25 100644 --- a/src/tests/unit/test_completions.py +++ b/src/tests/unit/test_completions.py @@ -28,7 +28,12 @@ LITELLM_HEADER_RESPONSE_DURATION_MS, env, ) -from mlpa.core.prometheus_metrics import PrometheusRejectionReason, PrometheusResult +from mlpa.core.prometheus_metrics import ( + AvailabilityOutcome, + AvailabilityReason, + PrometheusRejectionReason, + PrometheusResult, +) from tests.consts import SAMPLE_REQUEST, SUCCESSFUL_CHAT_RESPONSE @@ -54,6 +59,39 @@ def _rejection_count( ) +def _availability_count( + spy, + outcome: AvailabilityOutcome, + reason: AvailabilityReason, + req=SAMPLE_REQUEST, +) -> float: + return spy.value( + "chat_availability", + outcome=outcome, + reason=reason, + model=req.model, + service_type=req.service_type, + purpose=req.purpose, + ) + + +def _availability_total(spy, req=SAMPLE_REQUEST) -> float: + """Sum of all chat_availability samples for the request labels. + + Proves exactly one availability disposition was recorded, regardless of + which (outcome, reason) pair it landed on. Guards the policy-rejection path + against re-introducing a second emission alongside the correct one. + """ + return sum( + s.value + for s in spy.samples("chat_availability") + if s.name.endswith("_total") + and s.labels.get("model") == req.model + and s.labels.get("service_type") == req.service_type + and s.labels.get("purpose") == req.purpose + ) + + def _sample_litellm_response_headers(**overrides: str) -> httpx.Headers: base = { LITELLM_HEADER_MODEL_API_BASE: "https://api.together.xyz/v1", @@ -105,6 +143,7 @@ async def test_get_completion_success(mocker, metrics_spy): metrics_spy.assert_only( { + "chat_availability", "chat_tokens", "chat_tokens_per_request", "chat_completion_latency", @@ -131,6 +170,14 @@ async def test_get_completion_success(mocker, metrics_spy): == SUCCESSFUL_CHAT_RESPONSE["usage"]["completion_tokens"] ) assert _latency_count(metrics_spy, PrometheusResult.SUCCESS) == 1 + assert ( + _availability_count( + metrics_spy, + AvailabilityOutcome.SUCCESS, + AvailabilityReason.CLEAN_COMPLETION, + ) + == 1 + ) routing = _litellm_routing_label_base() assert ( @@ -256,8 +303,17 @@ async def test_get_completion_http_error(mocker, metrics_spy): assert exc_info.value.status_code == 500 assert exc_info.value.detail["error"] == "Upstream service returned an error" + assert ( + _availability_count( + metrics_spy, + AvailabilityOutcome.FAILURE, + AvailabilityReason.UPSTREAM_ERROR, + ) + == 1 + ) + assert _availability_total(metrics_spy) == 1 - metrics_spy.assert_only({"chat_completion_latency"}) + metrics_spy.assert_only({"chat_completion_latency", "chat_availability"}) assert _latency_count(metrics_spy, PrometheusResult.ERROR) == 1 @@ -273,7 +329,7 @@ async def test_get_completion_network_error(mocker, metrics_spy): assert exc_info.value.status_code == 502 assert exc_info.value.detail["error"] == "Connection timed out" - metrics_spy.assert_only({"chat_completion_latency"}) + metrics_spy.assert_only({"chat_completion_latency", "chat_availability"}) assert _latency_count(metrics_spy, PrometheusResult.ERROR) == 1 @@ -308,6 +364,7 @@ async def test_stream_completion_success( metrics_spy.assert_only( { + "chat_availability", "chat_completion_ttft", "chat_tokens", "chat_tokens_per_request", @@ -329,6 +386,14 @@ async def test_stream_completion_success( assert metrics_spy.value("chat_tokens", type="prompt", **chat_label_base) == 10 assert metrics_spy.value("chat_tokens", type="completion", **chat_label_base) == 25 assert _latency_count(metrics_spy, PrometheusResult.SUCCESS) == 1 + assert ( + _availability_count( + metrics_spy, + AvailabilityOutcome.SUCCESS, + AvailabilityReason.CLEAN_COMPLETION, + ) + == 1 + ) assert ( metrics_spy.histogram_count("chat_completion_ttft", model=SAMPLE_REQUEST.model) == 1 @@ -419,9 +484,22 @@ async def test_get_completion_budget_limit_exceeded_429(mocker, metrics_spy): assert exc_info.value.detail == {"error": 1} assert exc_info.value.headers == {"Retry-After": "86400"} - metrics_spy.assert_only({"chat_request_rejections", "chat_completion_latency"}) + metrics_spy.assert_only( + {"chat_request_rejections", "chat_completion_latency", "chat_availability"} + ) assert _rejection_count(metrics_spy, PrometheusRejectionReason.BUDGET_EXCEEDED) == 1 assert _latency_count(metrics_spy, PrometheusResult.ERROR) == 1 + # gap 1: same request, recorded as excluded (not failure) on availability + # even though the latency histogram above still counts it as result=error. + assert ( + _availability_count( + metrics_spy, + AvailabilityOutcome.EXCLUDED, + AvailabilityReason.BUDGET_EXCEEDED, + ) + == 1 + ) + assert _availability_total(metrics_spy) == 1 async def test_get_completion_budget_limit_exceeded_400(mocker, metrics_spy): @@ -453,7 +531,9 @@ async def test_get_completion_budget_limit_exceeded_400(mocker, metrics_spy): assert exc_info.value.detail == {"error": 1} assert exc_info.value.headers == {"Retry-After": "86400"} - metrics_spy.assert_only({"chat_request_rejections", "chat_completion_latency"}) + metrics_spy.assert_only( + {"chat_request_rejections", "chat_completion_latency", "chat_availability"} + ) assert _rejection_count(metrics_spy, PrometheusRejectionReason.BUDGET_EXCEEDED) == 1 @@ -485,8 +565,18 @@ async def test_get_completion_rate_limit_exceeded(mocker, metrics_spy): assert exc_info.value.status_code == 429 assert exc_info.value.detail == {"error": 2} assert exc_info.value.headers == {"Retry-After": "60"} + assert ( + _availability_count( + metrics_spy, + AvailabilityOutcome.EXCLUDED, + AvailabilityReason.RATE_LIMITED_OWN, + ) + == 1 + ) - metrics_spy.assert_only({"chat_request_rejections", "chat_completion_latency"}) + metrics_spy.assert_only( + {"chat_request_rejections", "chat_completion_latency", "chat_availability"} + ) assert _rejection_count(metrics_spy, PrometheusRejectionReason.RATE_LIMITED) == 1 @@ -518,7 +608,7 @@ async def test_get_completion_400_non_rate_limit_error(mocker, metrics_spy): assert exc_info.value.status_code == 400 assert exc_info.value.detail == {"error": "Upstream service returned an error"} - metrics_spy.assert_only({"chat_completion_latency"}) + metrics_spy.assert_only({"chat_completion_latency", "chat_availability"}) async def test_get_completion_429_non_rate_limit_error(mocker, metrics_spy): @@ -543,7 +633,7 @@ async def test_get_completion_429_non_rate_limit_error(mocker, metrics_spy): assert exc_info.value.status_code == 429 assert exc_info.value.detail == {"error": "Upstream service returned an error"} - metrics_spy.assert_only({"chat_completion_latency"}) + metrics_spy.assert_only({"chat_completion_latency", "chat_availability"}) async def test_get_completion_upstream_rate_limit_error(mocker, metrics_spy): @@ -568,7 +658,17 @@ async def test_get_completion_upstream_rate_limit_error(mocker, metrics_spy): assert exc_info.value.status_code == 429 assert exc_info.value.detail == {"error": ERROR_CODE_UPSTREAM_RATE_LIMIT_EXCEEDED} - metrics_spy.assert_only({"chat_request_rejections", "chat_completion_latency"}) + assert ( + _availability_count( + metrics_spy, + AvailabilityOutcome.EXCLUDED, + AvailabilityReason.RATE_LIMITED_UPSTREAM, + ) + == 1 + ) + metrics_spy.assert_only( + {"chat_request_rejections", "chat_completion_latency", "chat_availability"} + ) assert _rejection_count(metrics_spy, PrometheusRejectionReason.RATE_LIMITED) == 1 assert _latency_count(metrics_spy, PrometheusResult.ERROR) == 1 @@ -599,7 +699,9 @@ async def test_get_completion_context_window_exceeded(mocker, metrics_spy): assert exc_info.value.detail == {"error": ERROR_CODE_REQUEST_TOO_LARGE} mock_logger.warning.assert_called_once() assert "Context window exceeded" in str(mock_logger.warning.call_args) - metrics_spy.assert_only({"chat_request_rejections", "chat_completion_latency"}) + metrics_spy.assert_only( + {"chat_request_rejections", "chat_completion_latency", "chat_availability"} + ) assert ( _rejection_count(metrics_spy, PrometheusRejectionReason.PAYLOAD_TOO_LARGE) == 1 ) @@ -634,7 +736,9 @@ async def test_get_completion_invalid_model_name(mocker, metrics_spy): assert exc_info.value.detail == {"error": ERROR_CODE_INVALID_MODEL_NAME} mock_logger.warning.assert_called_once() assert "Invalid model name" in str(mock_logger.warning.call_args) - metrics_spy.assert_only({"chat_request_rejections", "chat_completion_latency"}) + metrics_spy.assert_only( + {"chat_request_rejections", "chat_completion_latency", "chat_availability"} + ) assert ( _rejection_count(metrics_spy, PrometheusRejectionReason.INVALID_MODEL_NAME) == 1 ) @@ -668,7 +772,9 @@ async def test_get_completion_invalid_request_vertex(mocker, metrics_spy): assert exc_info.value.detail == {"error": ERROR_CODE_INVALID_REQUEST} mock_logger.warning.assert_called_once() assert "Invalid request" in str(mock_logger.warning.call_args) - metrics_spy.assert_only({"chat_request_rejections", "chat_completion_latency"}) + metrics_spy.assert_only( + {"chat_request_rejections", "chat_completion_latency", "chat_availability"} + ) assert _rejection_count(metrics_spy, PrometheusRejectionReason.INVALID_REQUEST) == 1 assert _latency_count(metrics_spy, PrometheusResult.ERROR) == 1 @@ -693,7 +799,7 @@ async def test_get_completion_429_invalid_json(mocker, metrics_spy): assert exc_info.value.status_code == 429 assert exc_info.value.detail == {"error": "Upstream service returned an error"} - metrics_spy.assert_only({"chat_completion_latency"}) + metrics_spy.assert_only({"chat_completion_latency", "chat_availability"}) async def test_stream_completion_budget_limit_exceeded_429( @@ -727,9 +833,20 @@ async def test_stream_completion_budget_limit_exceeded_429( ) mock_logger.warning.assert_called_once() assert "Budget limit exceeded" in str(mock_logger.warning.call_args) - metrics_spy.assert_only({"chat_request_rejections", "chat_completion_latency"}) + metrics_spy.assert_only( + {"chat_request_rejections", "chat_completion_latency", "chat_availability"} + ) assert _rejection_count(metrics_spy, PrometheusRejectionReason.BUDGET_EXCEEDED) == 1 assert _latency_count(metrics_spy, PrometheusResult.ERROR) == 1 + assert ( + _availability_count( + metrics_spy, + AvailabilityOutcome.EXCLUDED, + AvailabilityReason.BUDGET_EXCEEDED, + ) + == 1 + ) + assert _availability_total(metrics_spy) == 1 async def test_stream_completion_budget_limit_exceeded_400( @@ -764,7 +881,9 @@ async def test_stream_completion_budget_limit_exceeded_400( ) mock_logger.warning.assert_called_once() assert "Budget limit exceeded" in str(mock_logger.warning.call_args) - metrics_spy.assert_only({"chat_request_rejections", "chat_completion_latency"}) + metrics_spy.assert_only( + {"chat_request_rejections", "chat_completion_latency", "chat_availability"} + ) assert _rejection_count(metrics_spy, PrometheusRejectionReason.BUDGET_EXCEEDED) == 1 assert _latency_count(metrics_spy, PrometheusResult.ERROR) == 1 @@ -801,7 +920,9 @@ async def test_stream_completion_rate_limit_exceeded( ) mock_logger.warning.assert_called_once() assert "Rate limit exceeded" in str(mock_logger.warning.call_args) - metrics_spy.assert_only({"chat_request_rejections", "chat_completion_latency"}) + metrics_spy.assert_only( + {"chat_request_rejections", "chat_completion_latency", "chat_availability"} + ) assert _rejection_count(metrics_spy, PrometheusRejectionReason.RATE_LIMITED) == 1 assert _latency_count(metrics_spy, PrometheusResult.ERROR) == 1 @@ -833,7 +954,9 @@ async def test_stream_completion_context_window_exceeded( ) mock_logger.warning.assert_called_once() assert "Context window exceeded" in str(mock_logger.warning.call_args) - metrics_spy.assert_only({"chat_request_rejections", "chat_completion_latency"}) + metrics_spy.assert_only( + {"chat_request_rejections", "chat_completion_latency", "chat_availability"} + ) assert ( _rejection_count(metrics_spy, PrometheusRejectionReason.PAYLOAD_TOO_LARGE) == 1 ) @@ -869,7 +992,9 @@ async def test_stream_completion_invalid_model_name( ) mock_logger.warning.assert_called_once() assert "Invalid model name" in str(mock_logger.warning.call_args) - metrics_spy.assert_only({"chat_request_rejections", "chat_completion_latency"}) + metrics_spy.assert_only( + {"chat_request_rejections", "chat_completion_latency", "chat_availability"} + ) assert ( _rejection_count(metrics_spy, PrometheusRejectionReason.INVALID_MODEL_NAME) == 1 ) @@ -904,7 +1029,9 @@ async def test_stream_completion_invalid_request_vertex( ) mock_logger.warning.assert_called_once() assert "Invalid request" in str(mock_logger.warning.call_args) - metrics_spy.assert_only({"chat_request_rejections", "chat_completion_latency"}) + metrics_spy.assert_only( + {"chat_request_rejections", "chat_completion_latency", "chat_availability"} + ) assert _rejection_count(metrics_spy, PrometheusRejectionReason.INVALID_REQUEST) == 1 assert _latency_count(metrics_spy, PrometheusResult.ERROR) == 1 @@ -941,7 +1068,7 @@ async def test_stream_completion_400_non_rate_limit_error( == b'data: {"code": 400, "error": "Upstream service returned an error"}\n\n' ) mock_logger.error.assert_called_once() - metrics_spy.assert_only({"chat_completion_latency"}) + metrics_spy.assert_only({"chat_completion_latency", "chat_availability"}) assert _latency_count(metrics_spy, PrometheusResult.ERROR) == 1 @@ -971,7 +1098,7 @@ async def test_stream_completion_429_non_rate_limit_error( == b'data: {"code": 429, "error": "Upstream service returned an error"}\n\n' ) mock_logger.error.assert_called_once() - metrics_spy.assert_only({"chat_completion_latency"}) + metrics_spy.assert_only({"chat_completion_latency", "chat_availability"}) assert _latency_count(metrics_spy, PrometheusResult.ERROR) == 1 @@ -995,7 +1122,9 @@ async def test_stream_completion_upstream_rate_limit_error( assert received_chunks == [ f'data: {{"error": {ERROR_CODE_UPSTREAM_RATE_LIMIT_EXCEEDED}}}\n\n'.encode() ] - metrics_spy.assert_only({"chat_request_rejections", "chat_completion_latency"}) + metrics_spy.assert_only( + {"chat_request_rejections", "chat_completion_latency", "chat_availability"} + ) assert _rejection_count(metrics_spy, PrometheusRejectionReason.RATE_LIMITED) == 1 assert _latency_count(metrics_spy, PrometheusResult.ERROR) == 1 @@ -1023,7 +1152,7 @@ async def test_stream_completion_429_invalid_json( == b'data: {"code": 429, "error": "Upstream service returned an error"}\n\n' ) mock_logger.error.assert_called_once() - metrics_spy.assert_only({"chat_completion_latency"}) + metrics_spy.assert_only({"chat_completion_latency", "chat_availability"}) assert _latency_count(metrics_spy, PrometheusResult.ERROR) == 1 @@ -1043,7 +1172,7 @@ async def test_stream_completion_exception_after_streaming_started( assert len(received_chunks) == 1 assert b"error" in received_chunks[0] - metrics_spy.assert_only({"chat_completion_latency"}) + metrics_spy.assert_only({"chat_completion_latency", "chat_availability"}) assert _latency_count(metrics_spy, PrometheusResult.ERROR) == 1 @@ -1248,6 +1377,15 @@ async def test_stream_sends_error_sse_on_empty_200_response( ) assert b'"error"' in received[0], "Chunk must be an error SSE frame" _assert_error_latency(metrics_spy) + assert ( + _availability_count( + metrics_spy, + AvailabilityOutcome.FAILURE, + AvailabilityReason.EMPTY_RESPONSE, + ) + == 1 + ) + assert _availability_total(metrics_spy) == 1 async def test_stream_completion_client_disconnect_records_abort( @@ -1280,6 +1418,15 @@ async def _aiter_bytes(): assert _latency_count(metrics_spy, PrometheusResult.ABORT) == 1 assert _latency_count(metrics_spy, PrometheusResult.ERROR) == 0 + assert ( + _availability_count( + metrics_spy, + AvailabilityOutcome.ABORT, + AvailabilityReason.CLIENT_DISCONNECT, + ) + == 1 + ) + assert _availability_total(metrics_spy) == 1 async def test_stream_uses_httpx_timeout_object_preserving_pool_timeout( From 8c41794c3e65cf8d605b5ac716639f082763c998 Mon Sep 17 00:00:00 2001 From: Randy Concepcion Date: Thu, 11 Jun 2026 22:52:42 -0400 Subject: [PATCH 2/9] instrument pre-completion chat availability --- src/mlpa/core/auth/authorize.py | 62 ++- src/mlpa/core/completions.py | 31 +- src/mlpa/core/metrics.py | 23 +- src/mlpa/core/prometheus_metrics.py | 23 +- src/mlpa/run.py | 3 + .../unit/test_precompletion_availability.py | 397 ++++++++++++++++++ 6 files changed, 507 insertions(+), 32 deletions(-) create mode 100644 src/tests/unit/test_precompletion_availability.py diff --git a/src/mlpa/core/auth/authorize.py b/src/mlpa/core/auth/authorize.py index 50eae01..e781ff9 100644 --- a/src/mlpa/core/auth/authorize.py +++ b/src/mlpa/core/auth/authorize.py @@ -13,6 +13,8 @@ ServiceType, ) from mlpa.core.config import env +from mlpa.core.metrics import record_chat_availability_for +from mlpa.core.prometheus_metrics import AvailabilityReason from mlpa.core.routers.appattest import app_attest_auth from mlpa.core.utils import extract_user_from_play_integrity_jwt, parse_app_attest_jwt @@ -123,27 +125,57 @@ async def authorize_chat_request( ) if not is_service_type_valid: + record_chat_availability_for( + AvailabilityReason.INVALID_SERVICE_TYPE_FOR_MODEL, + model=chat_request.model, + service_type=service_type.value, + purpose="", + ) raise HTTPException( status_code=400, detail=f"Invalid service-type value for model {chat_request.model}. Should be one of {env.forced_model_service_type_pairs.get(chat_request.model)}", ) - return await _authorize_common_request( - request=request, - build_authorized_request=lambda user, purpose_value: AuthorizedChatRequest( - user=user, + try: + return await _authorize_common_request( + request=request, + build_authorized_request=lambda user, purpose_value: AuthorizedChatRequest( + user=user, + service_type=service_type.value, + purpose=purpose_value, + **chat_request.model_dump(exclude_unset=True), + ), + authorization=authorization, + service_type=service_type, + purpose=purpose, + x_dev_authorization=x_dev_authorization, + use_app_attest=use_app_attest, + use_qa_certificates=use_qa_certificates, + use_play_integrity=use_play_integrity, + ) + except HTTPException as exc: + # Record only the terminal auth dispositions the dependency intentionally + # emits. 401 is an expected or normalized auth rejection. 400 is a + # client-error request to the auth layer (invalid purpose, or malformed App + # Attest base64 decoded in app_attest_auth). Any other status, including App + # Attest's explicit 500, is re-raised unrecorded; auth-system-failure capture + # is left to a follow-on auth backend change. Non-HTTPException errors are not + # caught here and propagate unrecorded. Purpose is unresolved at these + # dispositions, so a stable "" placeholder is used, never the unvalidated + # header value. + if exc.status_code == 401: + reason = AvailabilityReason.AUTH_REJECTED + elif exc.status_code == 400: + reason = AvailabilityReason.INVALID_AUTH_REQUEST + else: + raise + record_chat_availability_for( + reason, + model=chat_request.model, service_type=service_type.value, - purpose=purpose_value, - **chat_request.model_dump(exclude_unset=True), - ), - authorization=authorization, - service_type=service_type, - purpose=purpose, - x_dev_authorization=x_dev_authorization, - use_app_attest=use_app_attest, - use_qa_certificates=use_qa_certificates, - use_play_integrity=use_play_integrity, - ) + purpose="", + ) + raise async def authorize_search_request( diff --git a/src/mlpa/core/completions.py b/src/mlpa/core/completions.py index 5f40844..1e531a8 100644 --- a/src/mlpa/core/completions.py +++ b/src/mlpa/core/completions.py @@ -54,20 +54,29 @@ def _build_litellm_body(req: AuthorizedChatRequest, *, stream: bool) -> dict: async def get_or_create_user_for_completion( user_id: str, req: AuthorizedChatRequest | AuthorizedSearchRequest ): - """Wraps get_or_create_user and records a signup-cap rejection metric if applicable.""" + """Wraps get_or_create_user, recording the signup-cap rejection metric and the + pre-completion availability disposition (signup cap, or a 5xx provisioning + failure) for chat requests.""" try: return await get_or_create_user(user_id) except HTTPException as exc: - if ( - exc.status_code == 403 - and isinstance(exc.detail, dict) - and exc.detail.get("error") == ERROR_CODE_MAX_USERS_REACHED - and isinstance(req, AuthorizedChatRequest) - ): - record_chat_request_rejection( - req, - PrometheusRejectionReason.SIGNUP_CAP_EXCEEDED, - ) + if isinstance(req, AuthorizedChatRequest): + if ( + exc.status_code == 403 + and isinstance(exc.detail, dict) + and exc.detail.get("error") == ERROR_CODE_MAX_USERS_REACHED + ): + record_chat_request_rejection( + req, + PrometheusRejectionReason.SIGNUP_CAP_EXCEEDED, + ) + record_chat_availability(req, AvailabilityReason.SIGNUP_CAP_EXCEEDED) + elif exc.status_code >= 500: + # User-resolution / provisioning system failure for an eligible + # request. Non-signup-cap, non-5xx dispositions are left unrecorded; + # a future client-side 4xx should get its own classification rather + # than counting as an availability failure. + record_chat_availability(req, AvailabilityReason.PROVISIONING_FAILURE) raise diff --git a/src/mlpa/core/metrics.py b/src/mlpa/core/metrics.py index 110fb4d..f81783a 100644 --- a/src/mlpa/core/metrics.py +++ b/src/mlpa/core/metrics.py @@ -43,16 +43,33 @@ def record_search_request_rejection( metrics.search_request_rejections.labels(reason=reason, **_search_labels(req)).inc() -def record_chat_availability( - req: AuthorizedChatRequest, reason: AvailabilityReason +def record_chat_availability_for( + reason: AvailabilityReason, + *, + model: str, + service_type: str, + purpose: str, ) -> None: metrics.chat_availability.labels( outcome=availability_outcome_for(reason), reason=reason, - **_chat_labels(req), + model=model, + service_type=service_type, + purpose=purpose, ).inc() +def record_chat_availability( + req: AuthorizedChatRequest, reason: AvailabilityReason +) -> None: + record_chat_availability_for( + reason, + model=req.model, + service_type=req.service_type, + purpose=req.purpose, + ) + + def record_completion_latency( req: AuthorizedChatRequest, result: PrometheusResult, diff --git a/src/mlpa/core/prometheus_metrics.py b/src/mlpa/core/prometheus_metrics.py index 6930c4b..531816c 100644 --- a/src/mlpa/core/prometheus_metrics.py +++ b/src/mlpa/core/prometheus_metrics.py @@ -44,9 +44,19 @@ class AvailabilityReason(StrEnum): INVALID_REQUEST = "invalid_request" # abort CLIENT_DISCONNECT = "client_disconnect" - # TODO: add pre-completion reasons (auth_rejected, auth_system_failure, - # signup_cap_exceeded, blocked, provisioning_failure, invalid_purpose, - # invalid_service_type_for_model) when those paths are instrumented. + # pre-completion (auth dependency and route body, before the completion path) + # excluded + AUTH_REJECTED = "auth_rejected" + INVALID_AUTH_REQUEST = "invalid_auth_request" + INVALID_SERVICE_TYPE_FOR_MODEL = "invalid_service_type_for_model" + SIGNUP_CAP_EXCEEDED = "signup_cap_exceeded" + BLOCKED = "blocked" + # failure + PROVISIONING_FAILURE = "provisioning_failure" + # Defined but not emitted yet: auth backends normalize system failures to 401 + # (indistinguishable from expected rejections), so capturing this is left to a + # follow-on auth backend change that surfaces a real disposition. + AUTH_SYSTEM_FAILURE = "auth_system_failure" _AVAILABILITY_OUTCOME_BY_REASON: dict[AvailabilityReason, AvailabilityOutcome] = { @@ -60,6 +70,13 @@ class AvailabilityReason(StrEnum): AvailabilityReason.INVALID_MODEL_NAME: AvailabilityOutcome.EXCLUDED, AvailabilityReason.INVALID_REQUEST: AvailabilityOutcome.EXCLUDED, AvailabilityReason.CLIENT_DISCONNECT: AvailabilityOutcome.ABORT, + AvailabilityReason.AUTH_REJECTED: AvailabilityOutcome.EXCLUDED, + AvailabilityReason.INVALID_AUTH_REQUEST: AvailabilityOutcome.EXCLUDED, + AvailabilityReason.INVALID_SERVICE_TYPE_FOR_MODEL: AvailabilityOutcome.EXCLUDED, + AvailabilityReason.SIGNUP_CAP_EXCEEDED: AvailabilityOutcome.EXCLUDED, + AvailabilityReason.BLOCKED: AvailabilityOutcome.EXCLUDED, + AvailabilityReason.PROVISIONING_FAILURE: AvailabilityOutcome.FAILURE, + AvailabilityReason.AUTH_SYSTEM_FAILURE: AvailabilityOutcome.FAILURE, } diff --git a/src/mlpa/run.py b/src/mlpa/run.py index 6a98a06..b7baebf 100644 --- a/src/mlpa/run.py +++ b/src/mlpa/run.py @@ -26,9 +26,11 @@ ) from mlpa.core.http_client import close_http_client, get_http_client from mlpa.core.logger import logger, setup_logger +from mlpa.core.metrics import record_chat_availability from mlpa.core.middleware import register_middleware from mlpa.core.openapi import customize_openapi from mlpa.core.pg_services.services import app_attest_pg, litellm_pg +from mlpa.core.prometheus_metrics import AvailabilityReason from mlpa.core.routers.appattest import appattest_router from mlpa.core.routers.health import health_router from mlpa.core.routers.mock import mock_router @@ -207,6 +209,7 @@ async def chat_completion( ) user, _ = await get_or_create_user_for_completion(user_id, authorized_chat_request) if user.get("blocked"): + record_chat_availability(authorized_chat_request, AvailabilityReason.BLOCKED) raise HTTPException(status_code=403, detail={"error": "User is blocked."}) if authorized_chat_request.stream: diff --git a/src/tests/unit/test_precompletion_availability.py b/src/tests/unit/test_precompletion_availability.py new file mode 100644 index 0000000..b871ebb --- /dev/null +++ b/src/tests/unit/test_precompletion_availability.py @@ -0,0 +1,397 @@ +"""Pre-completion availability instrumentation. + +Covers the dispositions the completion-stage counter cannot see: the chat auth +dependency (service-type / model, auth rejection, client-error auth request, and +the statuses that are intentionally not recorded) and the route-body sites +(signup cap, provisioning failure, blocked). The completion tests bypass the auth +dependency, so the wrapper behavior is only exercised here. +""" + +import pytest +from fastapi import HTTPException, Request + +from mlpa import run as run_module +from mlpa.core.auth import authorize as authorize_module +from mlpa.core.classes import ( + AuthorizedChatRequest, + AuthorizedSearchRequest, + ChatRequest, +) +from mlpa.core.completions import get_or_create_user_for_completion +from mlpa.core.config import ERROR_CODE_MAX_USERS_REACHED +from mlpa.core.prometheus_metrics import ( + AvailabilityOutcome, + AvailabilityReason, + PrometheusRejectionReason, +) +from tests.consts import SAMPLE_REQUEST + +# A model/service-type pair that is valid together, so the wrapper passes its own +# check and reaches the shared auth call. +_VALID_MODEL = "openai/gpt-4o" +_AI = authorize_module.ServiceType.ai + + +def _make_request() -> Request: + async def receive() -> dict: + return {"type": "http.request", "body": b"", "more_body": False} + + return Request( + {"type": "http", "method": "POST", "path": "/", "headers": []}, receive + ) + + +def _chat_request(model: str = _VALID_MODEL) -> ChatRequest: + return ChatRequest(model=model, messages=[{"role": "user", "content": "hi"}]) + + +def _availability( + spy, + outcome: AvailabilityOutcome, + reason: AvailabilityReason, + *, + model: str, + service_type: str, + purpose: str = "", +) -> float: + return spy.value( + "chat_availability", + outcome=outcome, + reason=reason, + model=model, + service_type=service_type, + purpose=purpose, + ) + + +def _availability_total(spy) -> float: + """Sum of every chat_availability sample. Proves exactly one disposition.""" + return sum( + s.value for s in spy.samples("chat_availability") if s.name.endswith("_total") + ) + + +def _rejection_total(spy) -> float: + return sum( + s.value + for s in spy.samples("chat_request_rejections") + if s.name.endswith("_total") + ) + + +# --- auth dependency (authorize_chat_request) --------------------------------- + + +async def test_wrapper_success_records_no_auth_stage_availability(mocker, metrics_spy): + mocker.patch.object( + authorize_module, + "fxa_auth", + mocker.AsyncMock(return_value={"user": "user-123"}), + ) + + result = await authorize_module.authorize_chat_request( + request=_make_request(), + chat_request=_chat_request(), + authorization="Bearer token", + service_type=_AI, + purpose="chat", + ) + + assert isinstance(result, AuthorizedChatRequest) + # Auth success is finalized later at completion, never at the auth stage. + assert "chat_availability" not in metrics_spy.touched() + + +async def test_wrapper_invalid_service_type_records_excluded(metrics_spy): + with pytest.raises(HTTPException) as exc_info: + await authorize_module.authorize_chat_request( + request=_make_request(), + chat_request=ChatRequest(model="exa", messages=[]), + authorization="Bearer token", + service_type=_AI, + purpose="chat", + ) + + assert exc_info.value.status_code == 400 + assert ( + _availability( + metrics_spy, + AvailabilityOutcome.EXCLUDED, + AvailabilityReason.INVALID_SERVICE_TYPE_FOR_MODEL, + model="exa", + service_type="ai", + ) + == 1 + ) + assert _availability_total(metrics_spy) == 1 + + +async def test_wrapper_invalid_purpose_records_invalid_auth_request(metrics_spy): + # A real shared-call 400 from purpose validation maps to the coarse reason. + with pytest.raises(HTTPException) as exc_info: + await authorize_module.authorize_chat_request( + request=_make_request(), + chat_request=_chat_request(), + authorization="Bearer token", + service_type=_AI, + purpose="definitely-not-a-valid-purpose", + ) + + assert exc_info.value.status_code == 400 + assert ( + _availability( + metrics_spy, + AvailabilityOutcome.EXCLUDED, + AvailabilityReason.INVALID_AUTH_REQUEST, + model=_VALID_MODEL, + service_type="ai", + ) + == 1 + ) + assert _availability_total(metrics_spy) == 1 + + +async def test_wrapper_shared_call_401_records_auth_rejected(mocker, metrics_spy): + raised = HTTPException(status_code=401, detail="Invalid FxA auth") + mocker.patch.object( + authorize_module, + "_authorize_common_request", + mocker.AsyncMock(side_effect=raised), + ) + + with pytest.raises(HTTPException) as exc_info: + await authorize_module.authorize_chat_request( + request=_make_request(), + chat_request=_chat_request(), + authorization="Bearer token", + service_type=_AI, + purpose="chat", + ) + + assert exc_info.value is raised # re-raised unchanged + assert ( + _availability( + metrics_spy, + AvailabilityOutcome.EXCLUDED, + AvailabilityReason.AUTH_REJECTED, + model=_VALID_MODEL, + service_type="ai", + ) + == 1 + ) + assert _availability_total(metrics_spy) == 1 + + +async def test_wrapper_shared_call_400_records_invalid_auth_request( + mocker, metrics_spy +): + # Pins that the wrapper maps any shared-call 400 to invalid_auth_request, + # regardless of source. The non-purpose source (malformed App Attest base64 + # decoded in app_attest_auth, which raises before its try) was confirmed by + # code inspection; this test proves the mapping, not that path's reachability. + raised = HTTPException(status_code=400, detail={"challenge_b64": "Invalid Base64"}) + mocker.patch.object( + authorize_module, + "_authorize_common_request", + mocker.AsyncMock(side_effect=raised), + ) + + with pytest.raises(HTTPException) as exc_info: + await authorize_module.authorize_chat_request( + request=_make_request(), + chat_request=_chat_request(), + authorization="Bearer token", + service_type=_AI, + purpose="chat", + ) + + assert exc_info.value is raised # re-raised unchanged + assert ( + _availability( + metrics_spy, + AvailabilityOutcome.EXCLUDED, + AvailabilityReason.INVALID_AUTH_REQUEST, + model=_VALID_MODEL, + service_type="ai", + ) + == 1 + ) + assert _availability_total(metrics_spy) == 1 + + +async def test_wrapper_shared_call_500_records_nothing(mocker, metrics_spy): + # App Attest's explicit 500 is an HTTPException, so it is caught, but the + # wrapper records only 401/400 and re-raises everything else unrecorded. + raised = HTTPException( + status_code=500, detail="Server error during App Attest auth" + ) + mocker.patch.object( + authorize_module, + "_authorize_common_request", + mocker.AsyncMock(side_effect=raised), + ) + + with pytest.raises(HTTPException) as exc_info: + await authorize_module.authorize_chat_request( + request=_make_request(), + chat_request=_chat_request(), + authorization="Bearer token", + service_type=_AI, + purpose="chat", + ) + + assert exc_info.value is raised # re-raised unchanged + assert "chat_availability" not in metrics_spy.touched() + + +async def test_wrapper_shared_call_non_http_exception_records_nothing( + mocker, metrics_spy +): + # Non-HTTPException auth-path errors are not caught and propagate unrecorded. + raised = RuntimeError("bare auth-path failure") + mocker.patch.object( + authorize_module, + "_authorize_common_request", + mocker.AsyncMock(side_effect=raised), + ) + + with pytest.raises(RuntimeError): + await authorize_module.authorize_chat_request( + request=_make_request(), + chat_request=_chat_request(), + authorization="Bearer token", + service_type=_AI, + purpose="chat", + ) + + assert "chat_availability" not in metrics_spy.touched() + + +# --- route body: get_or_create_user_for_completion ---------------------------- + + +async def test_signup_cap_records_excluded_alongside_rejection(mocker, metrics_spy): + mocker.patch( + "mlpa.core.completions.get_or_create_user", + mocker.AsyncMock( + side_effect=HTTPException( + status_code=403, detail={"error": ERROR_CODE_MAX_USERS_REACHED} + ) + ), + ) + + with pytest.raises(HTTPException) as exc_info: + await get_or_create_user_for_completion(SAMPLE_REQUEST.user, SAMPLE_REQUEST) + + assert exc_info.value.status_code == 403 + assert ( + _availability( + metrics_spy, + AvailabilityOutcome.EXCLUDED, + AvailabilityReason.SIGNUP_CAP_EXCEEDED, + model=SAMPLE_REQUEST.model, + service_type=SAMPLE_REQUEST.service_type, + purpose=SAMPLE_REQUEST.purpose, + ) + == 1 + ) + assert _availability_total(metrics_spy) == 1 + # The existing rejection metric is still recorded. + assert _rejection_total(metrics_spy) == 1 + + +async def test_provisioning_failure_records_failure(mocker, metrics_spy): + mocker.patch( + "mlpa.core.completions.get_or_create_user", + mocker.AsyncMock( + side_effect=HTTPException( + status_code=500, detail={"error": "Error fetching user info"} + ) + ), + ) + + with pytest.raises(HTTPException) as exc_info: + await get_or_create_user_for_completion(SAMPLE_REQUEST.user, SAMPLE_REQUEST) + + assert exc_info.value.status_code == 500 + assert ( + _availability( + metrics_spy, + AvailabilityOutcome.FAILURE, + AvailabilityReason.PROVISIONING_FAILURE, + model=SAMPLE_REQUEST.model, + service_type=SAMPLE_REQUEST.service_type, + purpose=SAMPLE_REQUEST.purpose, + ) + == 1 + ) + assert _availability_total(metrics_spy) == 1 + + +async def test_non_signup_non_5xx_records_nothing(mocker, metrics_spy): + # The strict gate leaves a non-signup-cap, non-5xx disposition unrecorded so a + # client-side 4xx is not counted as an availability failure. + mocker.patch( + "mlpa.core.completions.get_or_create_user", + mocker.AsyncMock( + side_effect=HTTPException( + status_code=400, detail={"error": "Invalid user_id format"} + ) + ), + ) + + with pytest.raises(HTTPException) as exc_info: + await get_or_create_user_for_completion(SAMPLE_REQUEST.user, SAMPLE_REQUEST) + + assert exc_info.value.status_code == 400 + assert "chat_availability" not in metrics_spy.touched() + + +async def test_search_request_records_no_chat_availability(mocker, metrics_spy): + search_req = AuthorizedSearchRequest( + user="user-1:search", service_type="search", query="q", max_results=2 + ) + mocker.patch( + "mlpa.core.completions.get_or_create_user", + mocker.AsyncMock( + side_effect=HTTPException( + status_code=500, detail={"error": "Error fetching user info"} + ) + ), + ) + + with pytest.raises(HTTPException): + await get_or_create_user_for_completion(search_req.user, search_req) + + assert "chat_availability" not in metrics_spy.touched() + + +# --- route body: blocked user ------------------------------------------------- + + +async def test_blocked_user_records_blocked(mocker, metrics_spy): + mocker.patch.object( + run_module, + "get_or_create_user_for_completion", + mocker.AsyncMock(return_value=({"blocked": True}, False)), + ) + + with pytest.raises(HTTPException) as exc_info: + await run_module.chat_completion( + request=_make_request(), + authorized_chat_request=SAMPLE_REQUEST, + ) + + assert exc_info.value.status_code == 403 + assert ( + _availability( + metrics_spy, + AvailabilityOutcome.EXCLUDED, + AvailabilityReason.BLOCKED, + model=SAMPLE_REQUEST.model, + service_type=SAMPLE_REQUEST.service_type, + purpose=SAMPLE_REQUEST.purpose, + ) + == 1 + ) + assert _availability_total(metrics_spy) == 1 From 2c0082137fbecdcb27e938a867397bff88dd956f Mon Sep 17 00:00:00 2001 From: Randy Concepcion Date: Fri, 12 Jun 2026 00:32:21 -0400 Subject: [PATCH 3/9] clean up comments and remove unused import --- src/mlpa/core/auth/authorize.py | 17 +++--- src/mlpa/core/completions.py | 16 +++--- src/mlpa/core/errors.py | 4 +- src/mlpa/core/prometheus_metrics.py | 57 +++++++++---------- .../unit/test_precompletion_availability.py | 1 - 5 files changed, 46 insertions(+), 49 deletions(-) diff --git a/src/mlpa/core/auth/authorize.py b/src/mlpa/core/auth/authorize.py index e781ff9..a12b8c1 100644 --- a/src/mlpa/core/auth/authorize.py +++ b/src/mlpa/core/auth/authorize.py @@ -154,15 +154,14 @@ async def authorize_chat_request( use_play_integrity=use_play_integrity, ) except HTTPException as exc: - # Record only the terminal auth dispositions the dependency intentionally - # emits. 401 is an expected or normalized auth rejection. 400 is a - # client-error request to the auth layer (invalid purpose, or malformed App - # Attest base64 decoded in app_attest_auth). Any other status, including App - # Attest's explicit 500, is re-raised unrecorded; auth-system-failure capture - # is left to a follow-on auth backend change. Non-HTTPException errors are not - # caught here and propagate unrecorded. Purpose is unresolved at these - # dispositions, so a stable "" placeholder is used, never the unvalidated - # header value. + # Only record terminal HTTP failures from the shared auth call: + # - 401: expected or normalized auth rejection (bad creds, expired token, etc.) + # - 400: client error to the auth layer (invalid purpose, or malformed App + # Attest base64 decoded in app_attest_auth before its try block) + # - anything else (e.g. App Attest's explicit 500): re-raised unrecorded; + # auth-system-failure capture is left to a follow-on auth backend change + # Non-HTTPException errors are not caught here and propagate unrecorded. + # Purpose is unknown at this point, so "" is always used as a placeholder. if exc.status_code == 401: reason = AvailabilityReason.AUTH_REJECTED elif exc.status_code == 400: diff --git a/src/mlpa/core/completions.py b/src/mlpa/core/completions.py index 1e531a8..0ee102c 100644 --- a/src/mlpa/core/completions.py +++ b/src/mlpa/core/completions.py @@ -54,9 +54,12 @@ def _build_litellm_body(req: AuthorizedChatRequest, *, stream: bool) -> dict: async def get_or_create_user_for_completion( user_id: str, req: AuthorizedChatRequest | AuthorizedSearchRequest ): - """Wraps get_or_create_user, recording the signup-cap rejection metric and the - pre-completion availability disposition (signup cap, or a 5xx provisioning - failure) for chat requests.""" + """ + Wraps get_or_create_user and records availability for chat requests: + - signup cap (403 + MAX_USERS_REACHED): excluded, alongside the existing rejection metric + - user-resolution failure (status >= 500): failure + - search requests and non-signup-cap, non-5xx failures: not recorded + """ try: return await get_or_create_user(user_id) except HTTPException as exc: @@ -72,10 +75,9 @@ async def get_or_create_user_for_completion( ) record_chat_availability(req, AvailabilityReason.SIGNUP_CAP_EXCEEDED) elif exc.status_code >= 500: - # User-resolution / provisioning system failure for an eligible - # request. Non-signup-cap, non-5xx dispositions are left unrecorded; - # a future client-side 4xx should get its own classification rather - # than counting as an availability failure. + # User-resolution system failure. Non-signup-cap 4xx errors are + # not recorded; a client-side 4xx should get its own classification + # rather than counting as an availability failure. record_chat_availability(req, AvailabilityReason.PROVISIONING_FAILURE) raise diff --git a/src/mlpa/core/errors.py b/src/mlpa/core/errors.py index 8bfc0dc..3ea4ca4 100644 --- a/src/mlpa/core/errors.py +++ b/src/mlpa/core/errors.py @@ -37,8 +37,8 @@ class RejectionMatch: log_message: str = "" def availability_reason(self) -> AvailabilityReason: - # classify_upstream_error only ever produces these completion-stage - # rejection reasons; SIGNUP_CAP_EXCEEDED is recorded pre-completion. + # SIGNUP_CAP_EXCEEDED is recorded pre-completion, not via classify_upstream_error, + # so it is not in the mapping below. if self.reason == PrometheusRejectionReason.RATE_LIMITED: if self.error_code == ERROR_CODE_UPSTREAM_RATE_LIMIT_EXCEEDED: return AvailabilityReason.RATE_LIMITED_UPSTREAM diff --git a/src/mlpa/core/prometheus_metrics.py b/src/mlpa/core/prometheus_metrics.py index 531816c..d104020 100644 --- a/src/mlpa/core/prometheus_metrics.py +++ b/src/mlpa/core/prometheus_metrics.py @@ -27,36 +27,33 @@ class AvailabilityOutcome(StrEnum): class AvailabilityReason(StrEnum): - # Values overlapping PrometheusRejectionReason are kept identical so the - # availability counter and the rejection counter reconcile on shared strings. - # Keep them in sync if a rejection reason is added. - # success - CLEAN_COMPLETION = "clean_completion" - # failure - UPSTREAM_ERROR = "upstream_error" - EMPTY_RESPONSE = "empty_response" - # excluded (policy rejections) - BUDGET_EXCEEDED = "budget_exceeded" - RATE_LIMITED_OWN = "rate_limited_own" - RATE_LIMITED_UPSTREAM = "rate_limited_upstream" - PAYLOAD_TOO_LARGE = "payload_too_large" - INVALID_MODEL_NAME = "invalid_model_name" - INVALID_REQUEST = "invalid_request" - # abort - CLIENT_DISCONNECT = "client_disconnect" - # pre-completion (auth dependency and route body, before the completion path) - # excluded - AUTH_REJECTED = "auth_rejected" - INVALID_AUTH_REQUEST = "invalid_auth_request" - INVALID_SERVICE_TYPE_FOR_MODEL = "invalid_service_type_for_model" - SIGNUP_CAP_EXCEEDED = "signup_cap_exceeded" - BLOCKED = "blocked" - # failure - PROVISIONING_FAILURE = "provisioning_failure" - # Defined but not emitted yet: auth backends normalize system failures to 401 - # (indistinguishable from expected rejections), so capturing this is left to a - # follow-on auth backend change that surfaces a real disposition. - AUTH_SYSTEM_FAILURE = "auth_system_failure" + # Strings shared with PrometheusRejectionReason are kept identical so the + # two counters reconcile. Keep them in sync when a rejection reason is added. + + # --- completion-stage reasons (recorded inside stream_completion / get_completion) --- + CLEAN_COMPLETION = "clean_completion" # success + UPSTREAM_ERROR = "upstream_error" # failure + EMPTY_RESPONSE = "empty_response" # failure + BUDGET_EXCEEDED = "budget_exceeded" # excluded + RATE_LIMITED_OWN = "rate_limited_own" # excluded + RATE_LIMITED_UPSTREAM = "rate_limited_upstream" # excluded + PAYLOAD_TOO_LARGE = "payload_too_large" # excluded + INVALID_MODEL_NAME = "invalid_model_name" # excluded + INVALID_REQUEST = "invalid_request" # excluded + CLIENT_DISCONNECT = "client_disconnect" # abort + + # --- pre-completion reasons (recorded in the auth dependency and route body) --- + AUTH_REJECTED = "auth_rejected" # excluded + INVALID_AUTH_REQUEST = "invalid_auth_request" # excluded + INVALID_SERVICE_TYPE_FOR_MODEL = "invalid_service_type_for_model" # excluded + SIGNUP_CAP_EXCEEDED = "signup_cap_exceeded" # excluded + BLOCKED = "blocked" # excluded + PROVISIONING_FAILURE = "provisioning_failure" # failure + + # Defined but not yet emitted: auth backends normalize system failures to 401, + # making them indistinguishable from expected rejections. Capturing this + # properly requires a follow-on change to the auth backends themselves. + AUTH_SYSTEM_FAILURE = "auth_system_failure" # failure _AVAILABILITY_OUTCOME_BY_REASON: dict[AvailabilityReason, AvailabilityOutcome] = { diff --git a/src/tests/unit/test_precompletion_availability.py b/src/tests/unit/test_precompletion_availability.py index b871ebb..12315ce 100644 --- a/src/tests/unit/test_precompletion_availability.py +++ b/src/tests/unit/test_precompletion_availability.py @@ -22,7 +22,6 @@ from mlpa.core.prometheus_metrics import ( AvailabilityOutcome, AvailabilityReason, - PrometheusRejectionReason, ) from tests.consts import SAMPLE_REQUEST From 945205654b9b33046f639afc9ec199b0cdc56aa4 Mon Sep 17 00:00:00 2001 From: Randy Concepcion Date: Tue, 16 Jun 2026 15:51:35 -0400 Subject: [PATCH 4/9] docs: clarify purpose placeholder and 5xx failure wording --- src/mlpa/core/auth/authorize.py | 2 ++ src/mlpa/core/completions.py | 6 +++--- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/src/mlpa/core/auth/authorize.py b/src/mlpa/core/auth/authorize.py index a12b8c1..a35aa00 100644 --- a/src/mlpa/core/auth/authorize.py +++ b/src/mlpa/core/auth/authorize.py @@ -129,6 +129,8 @@ async def authorize_chat_request( AvailabilityReason.INVALID_SERVICE_TYPE_FOR_MODEL, model=chat_request.model, service_type=service_type.value, + # `purpose` is not yet resolved here + # This serves as a placeholder until a validated purpose is available purpose="", ) raise HTTPException( diff --git a/src/mlpa/core/completions.py b/src/mlpa/core/completions.py index 0ee102c..ad09ed3 100644 --- a/src/mlpa/core/completions.py +++ b/src/mlpa/core/completions.py @@ -57,7 +57,7 @@ async def get_or_create_user_for_completion( """ Wraps get_or_create_user and records availability for chat requests: - signup cap (403 + MAX_USERS_REACHED): excluded, alongside the existing rejection metric - - user-resolution failure (status >= 500): failure + - user-resolution server or system failure (status >= 500): failure - search requests and non-signup-cap, non-5xx failures: not recorded """ try: @@ -75,8 +75,8 @@ async def get_or_create_user_for_completion( ) record_chat_availability(req, AvailabilityReason.SIGNUP_CAP_EXCEEDED) elif exc.status_code >= 500: - # User-resolution system failure. Non-signup-cap 4xx errors are - # not recorded; a client-side 4xx should get its own classification + # User-resolution server or system failure. Non-signup-cap 4xx errors + # are not recorded; a client-side 4xx should get its own classification # rather than counting as an availability failure. record_chat_availability(req, AvailabilityReason.PROVISIONING_FAILURE) raise From 370078a74bc03a739fefc79e91977181d11a2bb6 Mon Sep 17 00:00:00 2001 From: Randy Concepcion Date: Tue, 16 Jun 2026 16:02:45 -0400 Subject: [PATCH 5/9] record PAYLOAD_TOO_LARGE availability in request-size middleware --- src/mlpa/core/middleware/request_size.py | 11 +++++++++++ src/tests/integration/test_request_size_limit.py | 13 ++++++++++++- 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/src/mlpa/core/middleware/request_size.py b/src/mlpa/core/middleware/request_size.py index b9603e3..641a144 100644 --- a/src/mlpa/core/middleware/request_size.py +++ b/src/mlpa/core/middleware/request_size.py @@ -3,6 +3,8 @@ from mlpa.core.config import ERROR_CODE_REQUEST_TOO_LARGE, env from mlpa.core.logger import logger +from mlpa.core.metrics import record_chat_availability_for +from mlpa.core.prometheus_metrics import AvailabilityReason async def check_request_size_middleware(request: Request, call_next): @@ -19,6 +21,15 @@ async def check_request_size_middleware(request: Request, call_next): logger.warning( f"Request size {size} bytes exceeds maximum {env.MAX_REQUEST_SIZE_BYTES} bytes" ) + # Placeholders for `model`, `service_type`, and `purpose` + # are used since values are not set until the body is parsed + # and auth has run. + record_chat_availability_for( + AvailabilityReason.PAYLOAD_TOO_LARGE, + model="", + service_type="", + purpose="", + ) return JSONResponse( status_code=413, content={"error": ERROR_CODE_REQUEST_TOO_LARGE}, diff --git a/src/tests/integration/test_request_size_limit.py b/src/tests/integration/test_request_size_limit.py index 63e5c59..36d4263 100644 --- a/src/tests/integration/test_request_size_limit.py +++ b/src/tests/integration/test_request_size_limit.py @@ -23,7 +23,7 @@ def test_request_size_under_limit(mocked_client_integration): assert response.status_code != 413 -def test_request_size_over_limit(mocked_client_integration): +def test_request_size_over_limit(mocked_client_integration, metrics_spy): """Test that requests over the size limit return 413.""" max_size = env.MAX_REQUEST_SIZE_BYTES oversized_size = max_size + 1 @@ -61,6 +61,17 @@ def test_request_size_over_limit(mocked_client_integration): assert response.status_code == 413 assert response.json() == {"error": 3} + assert ( + metrics_spy.value( + "chat_availability", + outcome="excluded", + reason="payload_too_large", + model="", + service_type="", + purpose="", + ) + == 1 + ) def test_request_size_exactly_at_limit(mocked_client_integration): From 8167088d40b2e048115c31a565f43fbd8a8ef581 Mon Sep 17 00:00:00 2001 From: Randy Concepcion Date: Tue, 16 Jun 2026 16:17:24 -0400 Subject: [PATCH 6/9] order availability reasons by request stage --- src/mlpa/core/prometheus_metrics.py | 38 ++++++++++++++--------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/src/mlpa/core/prometheus_metrics.py b/src/mlpa/core/prometheus_metrics.py index d104020..e6ed9c2 100644 --- a/src/mlpa/core/prometheus_metrics.py +++ b/src/mlpa/core/prometheus_metrics.py @@ -30,18 +30,6 @@ class AvailabilityReason(StrEnum): # Strings shared with PrometheusRejectionReason are kept identical so the # two counters reconcile. Keep them in sync when a rejection reason is added. - # --- completion-stage reasons (recorded inside stream_completion / get_completion) --- - CLEAN_COMPLETION = "clean_completion" # success - UPSTREAM_ERROR = "upstream_error" # failure - EMPTY_RESPONSE = "empty_response" # failure - BUDGET_EXCEEDED = "budget_exceeded" # excluded - RATE_LIMITED_OWN = "rate_limited_own" # excluded - RATE_LIMITED_UPSTREAM = "rate_limited_upstream" # excluded - PAYLOAD_TOO_LARGE = "payload_too_large" # excluded - INVALID_MODEL_NAME = "invalid_model_name" # excluded - INVALID_REQUEST = "invalid_request" # excluded - CLIENT_DISCONNECT = "client_disconnect" # abort - # --- pre-completion reasons (recorded in the auth dependency and route body) --- AUTH_REJECTED = "auth_rejected" # excluded INVALID_AUTH_REQUEST = "invalid_auth_request" # excluded @@ -55,8 +43,27 @@ class AvailabilityReason(StrEnum): # properly requires a follow-on change to the auth backends themselves. AUTH_SYSTEM_FAILURE = "auth_system_failure" # failure + # --- completion-stage reasons (recorded inside stream_completion / get_completion) --- + CLEAN_COMPLETION = "clean_completion" # success + UPSTREAM_ERROR = "upstream_error" # failure + EMPTY_RESPONSE = "empty_response" # failure + BUDGET_EXCEEDED = "budget_exceeded" # excluded + RATE_LIMITED_OWN = "rate_limited_own" # excluded + RATE_LIMITED_UPSTREAM = "rate_limited_upstream" # excluded + PAYLOAD_TOO_LARGE = "payload_too_large" # excluded + INVALID_MODEL_NAME = "invalid_model_name" # excluded + INVALID_REQUEST = "invalid_request" # excluded + CLIENT_DISCONNECT = "client_disconnect" # abort + _AVAILABILITY_OUTCOME_BY_REASON: dict[AvailabilityReason, AvailabilityOutcome] = { + AvailabilityReason.AUTH_REJECTED: AvailabilityOutcome.EXCLUDED, + AvailabilityReason.INVALID_AUTH_REQUEST: AvailabilityOutcome.EXCLUDED, + AvailabilityReason.INVALID_SERVICE_TYPE_FOR_MODEL: AvailabilityOutcome.EXCLUDED, + AvailabilityReason.SIGNUP_CAP_EXCEEDED: AvailabilityOutcome.EXCLUDED, + AvailabilityReason.BLOCKED: AvailabilityOutcome.EXCLUDED, + AvailabilityReason.PROVISIONING_FAILURE: AvailabilityOutcome.FAILURE, + AvailabilityReason.AUTH_SYSTEM_FAILURE: AvailabilityOutcome.FAILURE, AvailabilityReason.CLEAN_COMPLETION: AvailabilityOutcome.SUCCESS, AvailabilityReason.UPSTREAM_ERROR: AvailabilityOutcome.FAILURE, AvailabilityReason.EMPTY_RESPONSE: AvailabilityOutcome.FAILURE, @@ -67,13 +74,6 @@ class AvailabilityReason(StrEnum): AvailabilityReason.INVALID_MODEL_NAME: AvailabilityOutcome.EXCLUDED, AvailabilityReason.INVALID_REQUEST: AvailabilityOutcome.EXCLUDED, AvailabilityReason.CLIENT_DISCONNECT: AvailabilityOutcome.ABORT, - AvailabilityReason.AUTH_REJECTED: AvailabilityOutcome.EXCLUDED, - AvailabilityReason.INVALID_AUTH_REQUEST: AvailabilityOutcome.EXCLUDED, - AvailabilityReason.INVALID_SERVICE_TYPE_FOR_MODEL: AvailabilityOutcome.EXCLUDED, - AvailabilityReason.SIGNUP_CAP_EXCEEDED: AvailabilityOutcome.EXCLUDED, - AvailabilityReason.BLOCKED: AvailabilityOutcome.EXCLUDED, - AvailabilityReason.PROVISIONING_FAILURE: AvailabilityOutcome.FAILURE, - AvailabilityReason.AUTH_SYSTEM_FAILURE: AvailabilityOutcome.FAILURE, } From 98b120c690a3c67448c4cada656747a361303735 Mon Sep 17 00:00:00 2001 From: Randy Concepcion Date: Tue, 16 Jun 2026 16:29:52 -0400 Subject: [PATCH 7/9] rename availability reasons for clarity (valid_response, rate_limited_platform) --- src/mlpa/core/completions.py | 4 ++-- src/mlpa/core/errors.py | 2 +- src/mlpa/core/prometheus_metrics.py | 8 ++++---- src/tests/unit/test_availability.py | 2 +- src/tests/unit/test_completions.py | 6 +++--- 5 files changed, 11 insertions(+), 11 deletions(-) diff --git a/src/mlpa/core/completions.py b/src/mlpa/core/completions.py index ad09ed3..971a586 100644 --- a/src/mlpa/core/completions.py +++ b/src/mlpa/core/completions.py @@ -272,7 +272,7 @@ async def _read_next_chunk( snapshot=litellm_routing_snapshot, ) result = PrometheusResult.SUCCESS - availability_reason = AvailabilityReason.CLEAN_COMPLETION + availability_reason = AvailabilityReason.VALID_RESPONSE except (GeneratorExit, asyncio.CancelledError): # Client went away mid-stream: Starlette tears the generator down by # throwing GeneratorExit (or cancelling the task) at the paused @@ -384,7 +384,7 @@ async def get_completion(authorized_chat_request: AuthorizedChatRequest): snapshot=litellm_routing_snapshot, ) result = PrometheusResult.SUCCESS - availability_reason = AvailabilityReason.CLEAN_COMPLETION + availability_reason = AvailabilityReason.VALID_RESPONSE return data except HTTPException: raise diff --git a/src/mlpa/core/errors.py b/src/mlpa/core/errors.py index 3ea4ca4..6ee521d 100644 --- a/src/mlpa/core/errors.py +++ b/src/mlpa/core/errors.py @@ -42,7 +42,7 @@ def availability_reason(self) -> AvailabilityReason: if self.reason == PrometheusRejectionReason.RATE_LIMITED: if self.error_code == ERROR_CODE_UPSTREAM_RATE_LIMIT_EXCEEDED: return AvailabilityReason.RATE_LIMITED_UPSTREAM - return AvailabilityReason.RATE_LIMITED_OWN + return AvailabilityReason.RATE_LIMITED_PLATFORM return _REJECTION_TO_AVAILABILITY_REASON[self.reason] diff --git a/src/mlpa/core/prometheus_metrics.py b/src/mlpa/core/prometheus_metrics.py index e6ed9c2..757634d 100644 --- a/src/mlpa/core/prometheus_metrics.py +++ b/src/mlpa/core/prometheus_metrics.py @@ -44,11 +44,11 @@ class AvailabilityReason(StrEnum): AUTH_SYSTEM_FAILURE = "auth_system_failure" # failure # --- completion-stage reasons (recorded inside stream_completion / get_completion) --- - CLEAN_COMPLETION = "clean_completion" # success + VALID_RESPONSE = "valid_response" # success UPSTREAM_ERROR = "upstream_error" # failure EMPTY_RESPONSE = "empty_response" # failure BUDGET_EXCEEDED = "budget_exceeded" # excluded - RATE_LIMITED_OWN = "rate_limited_own" # excluded + RATE_LIMITED_PLATFORM = "rate_limited_platform" # excluded RATE_LIMITED_UPSTREAM = "rate_limited_upstream" # excluded PAYLOAD_TOO_LARGE = "payload_too_large" # excluded INVALID_MODEL_NAME = "invalid_model_name" # excluded @@ -64,11 +64,11 @@ class AvailabilityReason(StrEnum): AvailabilityReason.BLOCKED: AvailabilityOutcome.EXCLUDED, AvailabilityReason.PROVISIONING_FAILURE: AvailabilityOutcome.FAILURE, AvailabilityReason.AUTH_SYSTEM_FAILURE: AvailabilityOutcome.FAILURE, - AvailabilityReason.CLEAN_COMPLETION: AvailabilityOutcome.SUCCESS, + AvailabilityReason.VALID_RESPONSE: AvailabilityOutcome.SUCCESS, AvailabilityReason.UPSTREAM_ERROR: AvailabilityOutcome.FAILURE, AvailabilityReason.EMPTY_RESPONSE: AvailabilityOutcome.FAILURE, AvailabilityReason.BUDGET_EXCEEDED: AvailabilityOutcome.EXCLUDED, - AvailabilityReason.RATE_LIMITED_OWN: AvailabilityOutcome.EXCLUDED, + AvailabilityReason.RATE_LIMITED_PLATFORM: AvailabilityOutcome.EXCLUDED, AvailabilityReason.RATE_LIMITED_UPSTREAM: AvailabilityOutcome.EXCLUDED, AvailabilityReason.PAYLOAD_TOO_LARGE: AvailabilityOutcome.EXCLUDED, AvailabilityReason.INVALID_MODEL_NAME: AvailabilityOutcome.EXCLUDED, diff --git a/src/tests/unit/test_availability.py b/src/tests/unit/test_availability.py index ca9898e..ef794ab 100644 --- a/src/tests/unit/test_availability.py +++ b/src/tests/unit/test_availability.py @@ -65,7 +65,7 @@ def test_every_completion_stage_rejection_reason_maps_to_excluded(): ( PrometheusRejectionReason.RATE_LIMITED, ERROR_CODE_RATE_LIMIT_EXCEEDED, - AvailabilityReason.RATE_LIMITED_OWN, + AvailabilityReason.RATE_LIMITED_PLATFORM, ), ( PrometheusRejectionReason.RATE_LIMITED, diff --git a/src/tests/unit/test_completions.py b/src/tests/unit/test_completions.py index afa6a25..c4147fe 100644 --- a/src/tests/unit/test_completions.py +++ b/src/tests/unit/test_completions.py @@ -174,7 +174,7 @@ async def test_get_completion_success(mocker, metrics_spy): _availability_count( metrics_spy, AvailabilityOutcome.SUCCESS, - AvailabilityReason.CLEAN_COMPLETION, + AvailabilityReason.VALID_RESPONSE, ) == 1 ) @@ -390,7 +390,7 @@ async def test_stream_completion_success( _availability_count( metrics_spy, AvailabilityOutcome.SUCCESS, - AvailabilityReason.CLEAN_COMPLETION, + AvailabilityReason.VALID_RESPONSE, ) == 1 ) @@ -569,7 +569,7 @@ async def test_get_completion_rate_limit_exceeded(mocker, metrics_spy): _availability_count( metrics_spy, AvailabilityOutcome.EXCLUDED, - AvailabilityReason.RATE_LIMITED_OWN, + AvailabilityReason.RATE_LIMITED_PLATFORM, ) == 1 ) From 890cdb643f307443b8763a138161384bcf234082 Mon Sep 17 00:00:00 2001 From: Randy Concepcion Date: Wed, 17 Jun 2026 15:51:00 -0400 Subject: [PATCH 8/9] populate service_type and purpose on pre-completion availability records --- src/mlpa/core/auth/authorize.py | 4 +--- src/mlpa/core/middleware/request_size.py | 11 ++++++----- src/tests/integration/test_request_size_limit.py | 4 ++-- src/tests/unit/test_precompletion_availability.py | 1 + 4 files changed, 10 insertions(+), 10 deletions(-) diff --git a/src/mlpa/core/auth/authorize.py b/src/mlpa/core/auth/authorize.py index a35aa00..c6fa463 100644 --- a/src/mlpa/core/auth/authorize.py +++ b/src/mlpa/core/auth/authorize.py @@ -129,9 +129,7 @@ async def authorize_chat_request( AvailabilityReason.INVALID_SERVICE_TYPE_FOR_MODEL, model=chat_request.model, service_type=service_type.value, - # `purpose` is not yet resolved here - # This serves as a placeholder until a validated purpose is available - purpose="", + purpose=(purpose or "").strip(), ) raise HTTPException( status_code=400, diff --git a/src/mlpa/core/middleware/request_size.py b/src/mlpa/core/middleware/request_size.py index 641a144..7565b13 100644 --- a/src/mlpa/core/middleware/request_size.py +++ b/src/mlpa/core/middleware/request_size.py @@ -21,14 +21,15 @@ async def check_request_size_middleware(request: Request, call_next): logger.warning( f"Request size {size} bytes exceeds maximum {env.MAX_REQUEST_SIZE_BYTES} bytes" ) - # Placeholders for `model`, `service_type`, and `purpose` - # are used since values are not set until the body is parsed - # and auth has run. + # `model` is in the request body, which we don't read here. + # We reject on the Content-Length header without parsing it. record_chat_availability_for( AvailabilityReason.PAYLOAD_TOO_LARGE, model="", - service_type="", - purpose="", + service_type=( + request.headers.get("service-type") or "" + ).strip(), + purpose=(request.headers.get("purpose") or "").strip(), ) return JSONResponse( status_code=413, diff --git a/src/tests/integration/test_request_size_limit.py b/src/tests/integration/test_request_size_limit.py index 36d4263..ad4bb25 100644 --- a/src/tests/integration/test_request_size_limit.py +++ b/src/tests/integration/test_request_size_limit.py @@ -67,8 +67,8 @@ def test_request_size_over_limit(mocked_client_integration, metrics_spy): outcome="excluded", reason="payload_too_large", model="", - service_type="", - purpose="", + service_type="ai", + purpose="chat", ) == 1 ) diff --git a/src/tests/unit/test_precompletion_availability.py b/src/tests/unit/test_precompletion_availability.py index 12315ce..04e850d 100644 --- a/src/tests/unit/test_precompletion_availability.py +++ b/src/tests/unit/test_precompletion_availability.py @@ -119,6 +119,7 @@ async def test_wrapper_invalid_service_type_records_excluded(metrics_spy): AvailabilityReason.INVALID_SERVICE_TYPE_FOR_MODEL, model="exa", service_type="ai", + purpose="chat", ) == 1 ) From d344857ffbd0d5faa9f7a50abcdb1a14b25464d8 Mon Sep 17 00:00:00 2001 From: Randy Concepcion Date: Thu, 18 Jun 2026 09:22:45 -0400 Subject: [PATCH 9/9] clean up strips and add purpose on auth-failure availability --- src/mlpa/core/auth/authorize.py | 7 +++---- src/mlpa/core/middleware/request_size.py | 6 ++---- src/tests/unit/test_precompletion_availability.py | 3 +++ 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/mlpa/core/auth/authorize.py b/src/mlpa/core/auth/authorize.py index c6fa463..6ecade0 100644 --- a/src/mlpa/core/auth/authorize.py +++ b/src/mlpa/core/auth/authorize.py @@ -27,7 +27,7 @@ def _resolve_purpose(service_type_value: str, purpose_header: str | None) -> str """Validate purpose header and return value; raise HTTPException if invalid.""" valid = env.valid_purposes_for_service_type(service_type_value) requires = env.service_type_requires_purpose(service_type_value) - purpose = (purpose_header or "").strip() + purpose = purpose_header or "" if purpose: if purpose not in valid: @@ -129,7 +129,7 @@ async def authorize_chat_request( AvailabilityReason.INVALID_SERVICE_TYPE_FOR_MODEL, model=chat_request.model, service_type=service_type.value, - purpose=(purpose or "").strip(), + purpose=purpose or "", ) raise HTTPException( status_code=400, @@ -161,7 +161,6 @@ async def authorize_chat_request( # - anything else (e.g. App Attest's explicit 500): re-raised unrecorded; # auth-system-failure capture is left to a follow-on auth backend change # Non-HTTPException errors are not caught here and propagate unrecorded. - # Purpose is unknown at this point, so "" is always used as a placeholder. if exc.status_code == 401: reason = AvailabilityReason.AUTH_REJECTED elif exc.status_code == 400: @@ -172,7 +171,7 @@ async def authorize_chat_request( reason, model=chat_request.model, service_type=service_type.value, - purpose="", + purpose=purpose or "", ) raise diff --git a/src/mlpa/core/middleware/request_size.py b/src/mlpa/core/middleware/request_size.py index 7565b13..8a3d771 100644 --- a/src/mlpa/core/middleware/request_size.py +++ b/src/mlpa/core/middleware/request_size.py @@ -26,10 +26,8 @@ async def check_request_size_middleware(request: Request, call_next): record_chat_availability_for( AvailabilityReason.PAYLOAD_TOO_LARGE, model="", - service_type=( - request.headers.get("service-type") or "" - ).strip(), - purpose=(request.headers.get("purpose") or "").strip(), + service_type=request.headers.get("service-type") or "", + purpose=request.headers.get("purpose") or "", ) return JSONResponse( status_code=413, diff --git a/src/tests/unit/test_precompletion_availability.py b/src/tests/unit/test_precompletion_availability.py index 04e850d..454b08d 100644 --- a/src/tests/unit/test_precompletion_availability.py +++ b/src/tests/unit/test_precompletion_availability.py @@ -145,6 +145,7 @@ async def test_wrapper_invalid_purpose_records_invalid_auth_request(metrics_spy) AvailabilityReason.INVALID_AUTH_REQUEST, model=_VALID_MODEL, service_type="ai", + purpose="definitely-not-a-valid-purpose", ) == 1 ) @@ -176,6 +177,7 @@ async def test_wrapper_shared_call_401_records_auth_rejected(mocker, metrics_spy AvailabilityReason.AUTH_REJECTED, model=_VALID_MODEL, service_type="ai", + purpose="chat", ) == 1 ) @@ -213,6 +215,7 @@ async def test_wrapper_shared_call_400_records_invalid_auth_request( AvailabilityReason.INVALID_AUTH_REQUEST, model=_VALID_MODEL, service_type="ai", + purpose="chat", ) == 1 )