From 7791c2e848a08f81a509544ad1eb2abc8e560ca5 Mon Sep 17 00:00:00 2001
From: Randy Concepcion <rconcepcion@mozilla.com>
Date: Thu, 11 Jun 2026 19:51:04 -0400
Subject: [PATCH 1/9] add chat completion availability counter

---
 src/mlpa/core/completions.py        |  13 ++
 src/mlpa/core/errors.py             |  20 ++-
 src/mlpa/core/metrics.py            |  12 ++
 src/mlpa/core/prometheus_metrics.py |  56 ++++++++
 src/tests/unit/test_availability.py |  96 ++++++++++++++
 src/tests/unit/test_completions.py  | 195 ++++++++++++++++++++++++----
 6 files changed, 367 insertions(+), 25 deletions(-)
 create mode 100644 src/tests/unit/test_availability.py

diff --git a/src/mlpa/core/completions.py b/src/mlpa/core/completions.py
index 7a5a8ec..5f40844 100644
--- a/src/mlpa/core/completions.py
+++ b/src/mlpa/core/completions.py
@@ -20,6 +20,7 @@
 from mlpa.core.logger import logger
 from mlpa.core.metrics import (
     extract_tool_names,
+    record_chat_availability,
     record_chat_request_rejection,
     record_completion_latency,
     record_completion_success,
@@ -27,6 +28,7 @@
     record_ttft,
 )
 from mlpa.core.prometheus_metrics import (
+    AvailabilityReason,
     PrometheusRejectionReason,
     PrometheusResult,
 )
@@ -80,6 +82,7 @@ async def stream_completion(
     record_request_with_tools(authorized_chat_request)
     body = _build_litellm_body(authorized_chat_request, stream=True)
     result = PrometheusResult.ERROR
+    availability_reason = AvailabilityReason.UPSTREAM_ERROR
     is_first_token = True
     prompt_tokens = 0
     completion_tokens = 0
@@ -139,6 +142,7 @@ async def _read_next_chunk(
                     if match.log_message:
                         logger.warning(match.log_message)
                     record_chat_request_rejection(authorized_chat_request, match.reason)
+                    availability_reason = match.availability_reason()
                     yield f'data: {{"error": {match.error_code}}}\n\n'.encode()
                     return
 
@@ -237,6 +241,7 @@ async def _read_next_chunk(
                 return
 
             if not streaming_started:
+                availability_reason = AvailabilityReason.EMPTY_RESPONSE
                 yield raise_and_log(
                     RuntimeError("LiteLLM returned an empty response"),
                     True,
@@ -256,6 +261,7 @@ async def _read_next_chunk(
                 snapshot=litellm_routing_snapshot,
             )
             result = PrometheusResult.SUCCESS
+            availability_reason = AvailabilityReason.CLEAN_COMPLETION
     except (GeneratorExit, asyncio.CancelledError):
         # Client went away mid-stream: Starlette tears the generator down by
         # throwing GeneratorExit (or cancelling the task) at the paused
@@ -291,9 +297,12 @@ async def _read_next_chunk(
         if result == PrometheusResult.ERROR and disconnect_event.is_set():
             result = PrometheusResult.ABORT
             logger.info(_client_disconnected_msg)
+        if result == PrometheusResult.ABORT:
+            availability_reason = AvailabilityReason.CLIENT_DISCONNECT
         record_completion_latency(
             authorized_chat_request, result, time.perf_counter() - start_time
         )
+        record_chat_availability(authorized_chat_request, availability_reason)
 
 
 async def get_completion(authorized_chat_request: AuthorizedChatRequest):
@@ -304,6 +313,7 @@ async def get_completion(authorized_chat_request: AuthorizedChatRequest):
     record_request_with_tools(authorized_chat_request)
     body = _build_litellm_body(authorized_chat_request, stream=False)
     result = PrometheusResult.ERROR
+    availability_reason = AvailabilityReason.UPSTREAM_ERROR
     logger.debug(
         f"Starting a non-stream completion using {authorized_chat_request.model}, for user {authorized_chat_request.user}",
     )
@@ -326,6 +336,7 @@ async def get_completion(authorized_chat_request: AuthorizedChatRequest):
                 if match.log_message:
                     logger.warning(match.log_message)
                 record_chat_request_rejection(authorized_chat_request, match.reason)
+                availability_reason = match.availability_reason()
                 headers = (
                     {"Retry-After": match.retry_after} if match.retry_after else None
                 )
@@ -362,6 +373,7 @@ async def get_completion(authorized_chat_request: AuthorizedChatRequest):
             snapshot=litellm_routing_snapshot,
         )
         result = PrometheusResult.SUCCESS
+        availability_reason = AvailabilityReason.CLEAN_COMPLETION
         return data
     except HTTPException:
         raise
@@ -371,3 +383,4 @@ async def get_completion(authorized_chat_request: AuthorizedChatRequest):
         record_completion_latency(
             authorized_chat_request, result, time.perf_counter() - start_time
         )
+        record_chat_availability(authorized_chat_request, availability_reason)
diff --git a/src/mlpa/core/errors.py b/src/mlpa/core/errors.py
index 50b8a68..8bfc0dc 100644
--- a/src/mlpa/core/errors.py
+++ b/src/mlpa/core/errors.py
@@ -9,7 +9,7 @@
     ERROR_CODE_REQUEST_TOO_LARGE,
     ERROR_CODE_UPSTREAM_RATE_LIMIT_EXCEEDED,
 )
-from mlpa.core.prometheus_metrics import PrometheusRejectionReason
+from mlpa.core.prometheus_metrics import AvailabilityReason, PrometheusRejectionReason
 from mlpa.core.utils import (
     is_context_window_error,
     is_invalid_model_name_error,
@@ -18,6 +18,15 @@
     is_rate_limit_error,
 )
 
+_REJECTION_TO_AVAILABILITY_REASON: dict[
+    PrometheusRejectionReason, AvailabilityReason
+] = {
+    PrometheusRejectionReason.BUDGET_EXCEEDED: AvailabilityReason.BUDGET_EXCEEDED,
+    PrometheusRejectionReason.PAYLOAD_TOO_LARGE: AvailabilityReason.PAYLOAD_TOO_LARGE,
+    PrometheusRejectionReason.INVALID_MODEL_NAME: AvailabilityReason.INVALID_MODEL_NAME,
+    PrometheusRejectionReason.INVALID_REQUEST: AvailabilityReason.INVALID_REQUEST,
+}
+
 
 @dataclass(frozen=True)
 class RejectionMatch:
@@ -27,6 +36,15 @@ class RejectionMatch:
     retry_after: str | None = None
     log_message: str = ""
 
+    def availability_reason(self) -> AvailabilityReason:
+        # classify_upstream_error only ever produces these completion-stage
+        # rejection reasons; SIGNUP_CAP_EXCEEDED is recorded pre-completion.
+        if self.reason == PrometheusRejectionReason.RATE_LIMITED:
+            if self.error_code == ERROR_CODE_UPSTREAM_RATE_LIMIT_EXCEEDED:
+                return AvailabilityReason.RATE_LIMITED_UPSTREAM
+            return AvailabilityReason.RATE_LIMITED_OWN
+        return _REJECTION_TO_AVAILABILITY_REASON[self.reason]
+
 
 _RATE_LIMIT_REJECTION: dict[int, tuple[PrometheusRejectionReason, str, str]] = {
     ERROR_CODE_BUDGET_LIMIT_EXCEEDED: (
diff --git a/src/mlpa/core/metrics.py b/src/mlpa/core/metrics.py
index 5c54846..110fb4d 100644
--- a/src/mlpa/core/metrics.py
+++ b/src/mlpa/core/metrics.py
@@ -6,9 +6,11 @@
     LitellmRoutingSnapshot,
 )
 from mlpa.core.prometheus_metrics import (
+    AvailabilityReason,
     PrometheusRejectionReason,
     PrometheusResult,
     TokenType,
+    availability_outcome_for,
     metrics,
 )
 
@@ -41,6 +43,16 @@ def record_search_request_rejection(
     metrics.search_request_rejections.labels(reason=reason, **_search_labels(req)).inc()
 
 
+def record_chat_availability(
+    req: AuthorizedChatRequest, reason: AvailabilityReason
+) -> None:
+    metrics.chat_availability.labels(
+        outcome=availability_outcome_for(reason),
+        reason=reason,
+        **_chat_labels(req),
+    ).inc()
+
+
 def record_completion_latency(
     req: AuthorizedChatRequest,
     result: PrometheusResult,
diff --git a/src/mlpa/core/prometheus_metrics.py b/src/mlpa/core/prometheus_metrics.py
index 745503d..6930c4b 100644
--- a/src/mlpa/core/prometheus_metrics.py
+++ b/src/mlpa/core/prometheus_metrics.py
@@ -19,6 +19,55 @@ class PrometheusRejectionReason(StrEnum):
     INVALID_REQUEST = "invalid_request"
 
 
+class AvailabilityOutcome(StrEnum):
+    SUCCESS = "success"
+    FAILURE = "failure"
+    EXCLUDED = "excluded"
+    ABORT = "abort"
+
+
+class AvailabilityReason(StrEnum):
+    # Values overlapping PrometheusRejectionReason are kept identical so the
+    # availability counter and the rejection counter reconcile on shared strings.
+    # Keep them in sync if a rejection reason is added.
+    # success
+    CLEAN_COMPLETION = "clean_completion"
+    # failure
+    UPSTREAM_ERROR = "upstream_error"
+    EMPTY_RESPONSE = "empty_response"
+    # excluded (policy rejections)
+    BUDGET_EXCEEDED = "budget_exceeded"
+    RATE_LIMITED_OWN = "rate_limited_own"
+    RATE_LIMITED_UPSTREAM = "rate_limited_upstream"
+    PAYLOAD_TOO_LARGE = "payload_too_large"
+    INVALID_MODEL_NAME = "invalid_model_name"
+    INVALID_REQUEST = "invalid_request"
+    # abort
+    CLIENT_DISCONNECT = "client_disconnect"
+    # TODO: add pre-completion reasons (auth_rejected, auth_system_failure,
+    # signup_cap_exceeded, blocked, provisioning_failure, invalid_purpose,
+    # invalid_service_type_for_model) when those paths are instrumented.
+
+
+_AVAILABILITY_OUTCOME_BY_REASON: dict[AvailabilityReason, AvailabilityOutcome] = {
+    AvailabilityReason.CLEAN_COMPLETION: AvailabilityOutcome.SUCCESS,
+    AvailabilityReason.UPSTREAM_ERROR: AvailabilityOutcome.FAILURE,
+    AvailabilityReason.EMPTY_RESPONSE: AvailabilityOutcome.FAILURE,
+    AvailabilityReason.BUDGET_EXCEEDED: AvailabilityOutcome.EXCLUDED,
+    AvailabilityReason.RATE_LIMITED_OWN: AvailabilityOutcome.EXCLUDED,
+    AvailabilityReason.RATE_LIMITED_UPSTREAM: AvailabilityOutcome.EXCLUDED,
+    AvailabilityReason.PAYLOAD_TOO_LARGE: AvailabilityOutcome.EXCLUDED,
+    AvailabilityReason.INVALID_MODEL_NAME: AvailabilityOutcome.EXCLUDED,
+    AvailabilityReason.INVALID_REQUEST: AvailabilityOutcome.EXCLUDED,
+    AvailabilityReason.CLIENT_DISCONNECT: AvailabilityOutcome.ABORT,
+}
+
+
+def availability_outcome_for(reason: AvailabilityReason) -> AvailabilityOutcome:
+    """Pure classifier: the availability outcome is fully determined by the reason."""
+    return _AVAILABILITY_OUTCOME_BY_REASON[reason]
+
+
 class TokenType(StrEnum):
     PROMPT = "prompt"
     COMPLETION = "completion"
@@ -113,6 +162,7 @@ class PrometheusMetrics:
     chat_tool_calls_per_completion: Histogram
     chat_requests_with_tools: Counter
     chat_request_rejections: Counter
+    chat_availability: Counter
 
     # search
     search_latency: Histogram
@@ -274,6 +324,12 @@ def build_metrics(registry: CollectorRegistry = REGISTRY) -> PrometheusMetrics:
             ["reason", "model", "service_type", "purpose"],
             registry=registry,
         ),
+        chat_availability=Counter(
+            "mlpa_chat_availability_total",
+            "Interim availability outcomes for chat completions. outcome is success/failure/excluded/abort; reason is the bounded cause. Availability = success / (success + failure).",
+            ["outcome", "reason", "model", "service_type", "purpose"],
+            registry=registry,
+        ),
         search_latency=Histogram(
             "mlpa_search_latency_seconds",
             "Search latency in seconds.",
diff --git a/src/tests/unit/test_availability.py b/src/tests/unit/test_availability.py
new file mode 100644
index 0000000..ca9898e
--- /dev/null
+++ b/src/tests/unit/test_availability.py
@@ -0,0 +1,96 @@
+import pytest
+
+from mlpa.core.config import (
+    ERROR_CODE_BUDGET_LIMIT_EXCEEDED,
+    ERROR_CODE_INVALID_MODEL_NAME,
+    ERROR_CODE_INVALID_REQUEST,
+    ERROR_CODE_RATE_LIMIT_EXCEEDED,
+    ERROR_CODE_REQUEST_TOO_LARGE,
+    ERROR_CODE_UPSTREAM_RATE_LIMIT_EXCEEDED,
+)
+from mlpa.core.errors import RejectionMatch
+from mlpa.core.prometheus_metrics import (
+    AvailabilityOutcome,
+    AvailabilityReason,
+    PrometheusRejectionReason,
+    availability_outcome_for,
+)
+
+
+def test_every_availability_reason_maps_to_an_outcome():
+    """Guard: a new AvailabilityReason cannot ship without an outcome mapping.
+
+    Future pre-completion reasons added to AvailabilityReason must extend the
+    map too; this fails loudly (KeyError) if the map is not updated alongside the enum.
+    """
+    for reason in AvailabilityReason:
+        assert isinstance(availability_outcome_for(reason), AvailabilityOutcome)
+
+
+# SIGNUP_CAP_EXCEEDED is recorded pre-completion, not by classify_upstream_error,
+# so it is intentionally outside the completion-stage availability mapping.
+_PRE_COMPLETION_REJECTION_REASONS = {PrometheusRejectionReason.SIGNUP_CAP_EXCEEDED}
+
+
+def test_every_completion_stage_rejection_reason_maps_to_excluded():
+    """Guard: every rejection reason classify_upstream_error can produce must
+    resolve through availability_reason() to an excluded outcome.
+
+    Iterating the enum (minus the pre-completion reasons) means a newly added
+    completion-stage rejection reason fails loudly here until it is mapped, or is
+    explicitly classified as pre-completion.
+    """
+    for reason in PrometheusRejectionReason:
+        if reason in _PRE_COMPLETION_REJECTION_REASONS:
+            continue
+        match = RejectionMatch(reason=reason, error_code=0, http_status=400)
+        assert (
+            availability_outcome_for(match.availability_reason())
+            == AvailabilityOutcome.EXCLUDED
+        )
+
+
+# Pins the expected availability reason for each completion-stage rejection,
+# including the own-vs-upstream rate-limit split keyed on error_code. This fixes
+# the exact mappings; the completeness test above guards that the map covers
+# every completion-stage rejection reason.
+@pytest.mark.parametrize(
+    ("reason", "error_code", "expected"),
+    [
+        (
+            PrometheusRejectionReason.BUDGET_EXCEEDED,
+            ERROR_CODE_BUDGET_LIMIT_EXCEEDED,
+            AvailabilityReason.BUDGET_EXCEEDED,
+        ),
+        (
+            PrometheusRejectionReason.RATE_LIMITED,
+            ERROR_CODE_RATE_LIMIT_EXCEEDED,
+            AvailabilityReason.RATE_LIMITED_OWN,
+        ),
+        (
+            PrometheusRejectionReason.RATE_LIMITED,
+            ERROR_CODE_UPSTREAM_RATE_LIMIT_EXCEEDED,
+            AvailabilityReason.RATE_LIMITED_UPSTREAM,
+        ),
+        (
+            PrometheusRejectionReason.PAYLOAD_TOO_LARGE,
+            ERROR_CODE_REQUEST_TOO_LARGE,
+            AvailabilityReason.PAYLOAD_TOO_LARGE,
+        ),
+        (
+            PrometheusRejectionReason.INVALID_MODEL_NAME,
+            ERROR_CODE_INVALID_MODEL_NAME,
+            AvailabilityReason.INVALID_MODEL_NAME,
+        ),
+        (
+            PrometheusRejectionReason.INVALID_REQUEST,
+            ERROR_CODE_INVALID_REQUEST,
+            AvailabilityReason.INVALID_REQUEST,
+        ),
+    ],
+)
+def test_rejection_match_availability_reason(reason, error_code, expected):
+    match = RejectionMatch(reason=reason, error_code=error_code, http_status=400)
+    assert match.availability_reason() == expected
+    # All policy rejections are excluded from the availability ratio.
+    assert availability_outcome_for(expected) == AvailabilityOutcome.EXCLUDED
diff --git a/src/tests/unit/test_completions.py b/src/tests/unit/test_completions.py
index fb87613..afa6a25 100644
--- a/src/tests/unit/test_completions.py
+++ b/src/tests/unit/test_completions.py
@@ -28,7 +28,12 @@
     LITELLM_HEADER_RESPONSE_DURATION_MS,
     env,
 )
-from mlpa.core.prometheus_metrics import PrometheusRejectionReason, PrometheusResult
+from mlpa.core.prometheus_metrics import (
+    AvailabilityOutcome,
+    AvailabilityReason,
+    PrometheusRejectionReason,
+    PrometheusResult,
+)
 from tests.consts import SAMPLE_REQUEST, SUCCESSFUL_CHAT_RESPONSE
 
 
@@ -54,6 +59,39 @@ def _rejection_count(
     )
 
 
+def _availability_count(
+    spy,
+    outcome: AvailabilityOutcome,
+    reason: AvailabilityReason,
+    req=SAMPLE_REQUEST,
+) -> float:
+    return spy.value(
+        "chat_availability",
+        outcome=outcome,
+        reason=reason,
+        model=req.model,
+        service_type=req.service_type,
+        purpose=req.purpose,
+    )
+
+
+def _availability_total(spy, req=SAMPLE_REQUEST) -> float:
+    """Sum of all chat_availability samples for the request labels.
+
+    Proves exactly one availability disposition was recorded, regardless of
+    which (outcome, reason) pair it landed on. Guards the policy-rejection path
+    against re-introducing a second emission alongside the correct one.
+    """
+    return sum(
+        s.value
+        for s in spy.samples("chat_availability")
+        if s.name.endswith("_total")
+        and s.labels.get("model") == req.model
+        and s.labels.get("service_type") == req.service_type
+        and s.labels.get("purpose") == req.purpose
+    )
+
+
 def _sample_litellm_response_headers(**overrides: str) -> httpx.Headers:
     base = {
         LITELLM_HEADER_MODEL_API_BASE: "https://api.together.xyz/v1",
@@ -105,6 +143,7 @@ async def test_get_completion_success(mocker, metrics_spy):
 
     metrics_spy.assert_only(
         {
+            "chat_availability",
             "chat_tokens",
             "chat_tokens_per_request",
             "chat_completion_latency",
@@ -131,6 +170,14 @@ async def test_get_completion_success(mocker, metrics_spy):
         == SUCCESSFUL_CHAT_RESPONSE["usage"]["completion_tokens"]
     )
     assert _latency_count(metrics_spy, PrometheusResult.SUCCESS) == 1
+    assert (
+        _availability_count(
+            metrics_spy,
+            AvailabilityOutcome.SUCCESS,
+            AvailabilityReason.CLEAN_COMPLETION,
+        )
+        == 1
+    )
 
     routing = _litellm_routing_label_base()
     assert (
@@ -256,8 +303,17 @@ async def test_get_completion_http_error(mocker, metrics_spy):
 
     assert exc_info.value.status_code == 500
     assert exc_info.value.detail["error"] == "Upstream service returned an error"
+    assert (
+        _availability_count(
+            metrics_spy,
+            AvailabilityOutcome.FAILURE,
+            AvailabilityReason.UPSTREAM_ERROR,
+        )
+        == 1
+    )
+    assert _availability_total(metrics_spy) == 1
 
-    metrics_spy.assert_only({"chat_completion_latency"})
+    metrics_spy.assert_only({"chat_completion_latency", "chat_availability"})
     assert _latency_count(metrics_spy, PrometheusResult.ERROR) == 1
 
 
@@ -273,7 +329,7 @@ async def test_get_completion_network_error(mocker, metrics_spy):
     assert exc_info.value.status_code == 502
     assert exc_info.value.detail["error"] == "Connection timed out"
 
-    metrics_spy.assert_only({"chat_completion_latency"})
+    metrics_spy.assert_only({"chat_completion_latency", "chat_availability"})
     assert _latency_count(metrics_spy, PrometheusResult.ERROR) == 1
 
 
@@ -308,6 +364,7 @@ async def test_stream_completion_success(
 
     metrics_spy.assert_only(
         {
+            "chat_availability",
             "chat_completion_ttft",
             "chat_tokens",
             "chat_tokens_per_request",
@@ -329,6 +386,14 @@ async def test_stream_completion_success(
     assert metrics_spy.value("chat_tokens", type="prompt", **chat_label_base) == 10
     assert metrics_spy.value("chat_tokens", type="completion", **chat_label_base) == 25
     assert _latency_count(metrics_spy, PrometheusResult.SUCCESS) == 1
+    assert (
+        _availability_count(
+            metrics_spy,
+            AvailabilityOutcome.SUCCESS,
+            AvailabilityReason.CLEAN_COMPLETION,
+        )
+        == 1
+    )
     assert (
         metrics_spy.histogram_count("chat_completion_ttft", model=SAMPLE_REQUEST.model)
         == 1
@@ -419,9 +484,22 @@ async def test_get_completion_budget_limit_exceeded_429(mocker, metrics_spy):
     assert exc_info.value.detail == {"error": 1}
     assert exc_info.value.headers == {"Retry-After": "86400"}
 
-    metrics_spy.assert_only({"chat_request_rejections", "chat_completion_latency"})
+    metrics_spy.assert_only(
+        {"chat_request_rejections", "chat_completion_latency", "chat_availability"}
+    )
     assert _rejection_count(metrics_spy, PrometheusRejectionReason.BUDGET_EXCEEDED) == 1
     assert _latency_count(metrics_spy, PrometheusResult.ERROR) == 1
+    # gap 1: same request, recorded as excluded (not failure) on availability
+    # even though the latency histogram above still counts it as result=error.
+    assert (
+        _availability_count(
+            metrics_spy,
+            AvailabilityOutcome.EXCLUDED,
+            AvailabilityReason.BUDGET_EXCEEDED,
+        )
+        == 1
+    )
+    assert _availability_total(metrics_spy) == 1
 
 
 async def test_get_completion_budget_limit_exceeded_400(mocker, metrics_spy):
@@ -453,7 +531,9 @@ async def test_get_completion_budget_limit_exceeded_400(mocker, metrics_spy):
     assert exc_info.value.detail == {"error": 1}
     assert exc_info.value.headers == {"Retry-After": "86400"}
 
-    metrics_spy.assert_only({"chat_request_rejections", "chat_completion_latency"})
+    metrics_spy.assert_only(
+        {"chat_request_rejections", "chat_completion_latency", "chat_availability"}
+    )
     assert _rejection_count(metrics_spy, PrometheusRejectionReason.BUDGET_EXCEEDED) == 1
 
 
@@ -485,8 +565,18 @@ async def test_get_completion_rate_limit_exceeded(mocker, metrics_spy):
     assert exc_info.value.status_code == 429
     assert exc_info.value.detail == {"error": 2}
     assert exc_info.value.headers == {"Retry-After": "60"}
+    assert (
+        _availability_count(
+            metrics_spy,
+            AvailabilityOutcome.EXCLUDED,
+            AvailabilityReason.RATE_LIMITED_OWN,
+        )
+        == 1
+    )
 
-    metrics_spy.assert_only({"chat_request_rejections", "chat_completion_latency"})
+    metrics_spy.assert_only(
+        {"chat_request_rejections", "chat_completion_latency", "chat_availability"}
+    )
     assert _rejection_count(metrics_spy, PrometheusRejectionReason.RATE_LIMITED) == 1
 
 
@@ -518,7 +608,7 @@ async def test_get_completion_400_non_rate_limit_error(mocker, metrics_spy):
 
     assert exc_info.value.status_code == 400
     assert exc_info.value.detail == {"error": "Upstream service returned an error"}
-    metrics_spy.assert_only({"chat_completion_latency"})
+    metrics_spy.assert_only({"chat_completion_latency", "chat_availability"})
 
 
 async def test_get_completion_429_non_rate_limit_error(mocker, metrics_spy):
@@ -543,7 +633,7 @@ async def test_get_completion_429_non_rate_limit_error(mocker, metrics_spy):
 
     assert exc_info.value.status_code == 429
     assert exc_info.value.detail == {"error": "Upstream service returned an error"}
-    metrics_spy.assert_only({"chat_completion_latency"})
+    metrics_spy.assert_only({"chat_completion_latency", "chat_availability"})
 
 
 async def test_get_completion_upstream_rate_limit_error(mocker, metrics_spy):
@@ -568,7 +658,17 @@ async def test_get_completion_upstream_rate_limit_error(mocker, metrics_spy):
 
     assert exc_info.value.status_code == 429
     assert exc_info.value.detail == {"error": ERROR_CODE_UPSTREAM_RATE_LIMIT_EXCEEDED}
-    metrics_spy.assert_only({"chat_request_rejections", "chat_completion_latency"})
+    assert (
+        _availability_count(
+            metrics_spy,
+            AvailabilityOutcome.EXCLUDED,
+            AvailabilityReason.RATE_LIMITED_UPSTREAM,
+        )
+        == 1
+    )
+    metrics_spy.assert_only(
+        {"chat_request_rejections", "chat_completion_latency", "chat_availability"}
+    )
     assert _rejection_count(metrics_spy, PrometheusRejectionReason.RATE_LIMITED) == 1
     assert _latency_count(metrics_spy, PrometheusResult.ERROR) == 1
 
@@ -599,7 +699,9 @@ async def test_get_completion_context_window_exceeded(mocker, metrics_spy):
     assert exc_info.value.detail == {"error": ERROR_CODE_REQUEST_TOO_LARGE}
     mock_logger.warning.assert_called_once()
     assert "Context window exceeded" in str(mock_logger.warning.call_args)
-    metrics_spy.assert_only({"chat_request_rejections", "chat_completion_latency"})
+    metrics_spy.assert_only(
+        {"chat_request_rejections", "chat_completion_latency", "chat_availability"}
+    )
     assert (
         _rejection_count(metrics_spy, PrometheusRejectionReason.PAYLOAD_TOO_LARGE) == 1
     )
@@ -634,7 +736,9 @@ async def test_get_completion_invalid_model_name(mocker, metrics_spy):
     assert exc_info.value.detail == {"error": ERROR_CODE_INVALID_MODEL_NAME}
     mock_logger.warning.assert_called_once()
     assert "Invalid model name" in str(mock_logger.warning.call_args)
-    metrics_spy.assert_only({"chat_request_rejections", "chat_completion_latency"})
+    metrics_spy.assert_only(
+        {"chat_request_rejections", "chat_completion_latency", "chat_availability"}
+    )
     assert (
         _rejection_count(metrics_spy, PrometheusRejectionReason.INVALID_MODEL_NAME) == 1
     )
@@ -668,7 +772,9 @@ async def test_get_completion_invalid_request_vertex(mocker, metrics_spy):
     assert exc_info.value.detail == {"error": ERROR_CODE_INVALID_REQUEST}
     mock_logger.warning.assert_called_once()
     assert "Invalid request" in str(mock_logger.warning.call_args)
-    metrics_spy.assert_only({"chat_request_rejections", "chat_completion_latency"})
+    metrics_spy.assert_only(
+        {"chat_request_rejections", "chat_completion_latency", "chat_availability"}
+    )
     assert _rejection_count(metrics_spy, PrometheusRejectionReason.INVALID_REQUEST) == 1
     assert _latency_count(metrics_spy, PrometheusResult.ERROR) == 1
 
@@ -693,7 +799,7 @@ async def test_get_completion_429_invalid_json(mocker, metrics_spy):
 
     assert exc_info.value.status_code == 429
     assert exc_info.value.detail == {"error": "Upstream service returned an error"}
-    metrics_spy.assert_only({"chat_completion_latency"})
+    metrics_spy.assert_only({"chat_completion_latency", "chat_availability"})
 
 
 async def test_stream_completion_budget_limit_exceeded_429(
@@ -727,9 +833,20 @@ async def test_stream_completion_budget_limit_exceeded_429(
     )
     mock_logger.warning.assert_called_once()
     assert "Budget limit exceeded" in str(mock_logger.warning.call_args)
-    metrics_spy.assert_only({"chat_request_rejections", "chat_completion_latency"})
+    metrics_spy.assert_only(
+        {"chat_request_rejections", "chat_completion_latency", "chat_availability"}
+    )
     assert _rejection_count(metrics_spy, PrometheusRejectionReason.BUDGET_EXCEEDED) == 1
     assert _latency_count(metrics_spy, PrometheusResult.ERROR) == 1
+    assert (
+        _availability_count(
+            metrics_spy,
+            AvailabilityOutcome.EXCLUDED,
+            AvailabilityReason.BUDGET_EXCEEDED,
+        )
+        == 1
+    )
+    assert _availability_total(metrics_spy) == 1
 
 
 async def test_stream_completion_budget_limit_exceeded_400(
@@ -764,7 +881,9 @@ async def test_stream_completion_budget_limit_exceeded_400(
     )
     mock_logger.warning.assert_called_once()
     assert "Budget limit exceeded" in str(mock_logger.warning.call_args)
-    metrics_spy.assert_only({"chat_request_rejections", "chat_completion_latency"})
+    metrics_spy.assert_only(
+        {"chat_request_rejections", "chat_completion_latency", "chat_availability"}
+    )
     assert _rejection_count(metrics_spy, PrometheusRejectionReason.BUDGET_EXCEEDED) == 1
     assert _latency_count(metrics_spy, PrometheusResult.ERROR) == 1
 
@@ -801,7 +920,9 @@ async def test_stream_completion_rate_limit_exceeded(
     )
     mock_logger.warning.assert_called_once()
     assert "Rate limit exceeded" in str(mock_logger.warning.call_args)
-    metrics_spy.assert_only({"chat_request_rejections", "chat_completion_latency"})
+    metrics_spy.assert_only(
+        {"chat_request_rejections", "chat_completion_latency", "chat_availability"}
+    )
     assert _rejection_count(metrics_spy, PrometheusRejectionReason.RATE_LIMITED) == 1
     assert _latency_count(metrics_spy, PrometheusResult.ERROR) == 1
 
@@ -833,7 +954,9 @@ async def test_stream_completion_context_window_exceeded(
     )
     mock_logger.warning.assert_called_once()
     assert "Context window exceeded" in str(mock_logger.warning.call_args)
-    metrics_spy.assert_only({"chat_request_rejections", "chat_completion_latency"})
+    metrics_spy.assert_only(
+        {"chat_request_rejections", "chat_completion_latency", "chat_availability"}
+    )
     assert (
         _rejection_count(metrics_spy, PrometheusRejectionReason.PAYLOAD_TOO_LARGE) == 1
     )
@@ -869,7 +992,9 @@ async def test_stream_completion_invalid_model_name(
     )
     mock_logger.warning.assert_called_once()
     assert "Invalid model name" in str(mock_logger.warning.call_args)
-    metrics_spy.assert_only({"chat_request_rejections", "chat_completion_latency"})
+    metrics_spy.assert_only(
+        {"chat_request_rejections", "chat_completion_latency", "chat_availability"}
+    )
     assert (
         _rejection_count(metrics_spy, PrometheusRejectionReason.INVALID_MODEL_NAME) == 1
     )
@@ -904,7 +1029,9 @@ async def test_stream_completion_invalid_request_vertex(
     )
     mock_logger.warning.assert_called_once()
     assert "Invalid request" in str(mock_logger.warning.call_args)
-    metrics_spy.assert_only({"chat_request_rejections", "chat_completion_latency"})
+    metrics_spy.assert_only(
+        {"chat_request_rejections", "chat_completion_latency", "chat_availability"}
+    )
     assert _rejection_count(metrics_spy, PrometheusRejectionReason.INVALID_REQUEST) == 1
     assert _latency_count(metrics_spy, PrometheusResult.ERROR) == 1
 
@@ -941,7 +1068,7 @@ async def test_stream_completion_400_non_rate_limit_error(
         == b'data: {"code": 400, "error": "Upstream service returned an error"}\n\n'
     )
     mock_logger.error.assert_called_once()
-    metrics_spy.assert_only({"chat_completion_latency"})
+    metrics_spy.assert_only({"chat_completion_latency", "chat_availability"})
     assert _latency_count(metrics_spy, PrometheusResult.ERROR) == 1
 
 
@@ -971,7 +1098,7 @@ async def test_stream_completion_429_non_rate_limit_error(
         == b'data: {"code": 429, "error": "Upstream service returned an error"}\n\n'
     )
     mock_logger.error.assert_called_once()
-    metrics_spy.assert_only({"chat_completion_latency"})
+    metrics_spy.assert_only({"chat_completion_latency", "chat_availability"})
     assert _latency_count(metrics_spy, PrometheusResult.ERROR) == 1
 
 
@@ -995,7 +1122,9 @@ async def test_stream_completion_upstream_rate_limit_error(
     assert received_chunks == [
         f'data: {{"error": {ERROR_CODE_UPSTREAM_RATE_LIMIT_EXCEEDED}}}\n\n'.encode()
     ]
-    metrics_spy.assert_only({"chat_request_rejections", "chat_completion_latency"})
+    metrics_spy.assert_only(
+        {"chat_request_rejections", "chat_completion_latency", "chat_availability"}
+    )
     assert _rejection_count(metrics_spy, PrometheusRejectionReason.RATE_LIMITED) == 1
     assert _latency_count(metrics_spy, PrometheusResult.ERROR) == 1
 
@@ -1023,7 +1152,7 @@ async def test_stream_completion_429_invalid_json(
         == b'data: {"code": 429, "error": "Upstream service returned an error"}\n\n'
     )
     mock_logger.error.assert_called_once()
-    metrics_spy.assert_only({"chat_completion_latency"})
+    metrics_spy.assert_only({"chat_completion_latency", "chat_availability"})
     assert _latency_count(metrics_spy, PrometheusResult.ERROR) == 1
 
 
@@ -1043,7 +1172,7 @@ async def test_stream_completion_exception_after_streaming_started(
 
     assert len(received_chunks) == 1
     assert b"error" in received_chunks[0]
-    metrics_spy.assert_only({"chat_completion_latency"})
+    metrics_spy.assert_only({"chat_completion_latency", "chat_availability"})
     assert _latency_count(metrics_spy, PrometheusResult.ERROR) == 1
 
 
@@ -1248,6 +1377,15 @@ async def test_stream_sends_error_sse_on_empty_200_response(
     )
     assert b'"error"' in received[0], "Chunk must be an error SSE frame"
     _assert_error_latency(metrics_spy)
+    assert (
+        _availability_count(
+            metrics_spy,
+            AvailabilityOutcome.FAILURE,
+            AvailabilityReason.EMPTY_RESPONSE,
+        )
+        == 1
+    )
+    assert _availability_total(metrics_spy) == 1
 
 
 async def test_stream_completion_client_disconnect_records_abort(
@@ -1280,6 +1418,15 @@ async def _aiter_bytes():
 
     assert _latency_count(metrics_spy, PrometheusResult.ABORT) == 1
     assert _latency_count(metrics_spy, PrometheusResult.ERROR) == 0
+    assert (
+        _availability_count(
+            metrics_spy,
+            AvailabilityOutcome.ABORT,
+            AvailabilityReason.CLIENT_DISCONNECT,
+        )
+        == 1
+    )
+    assert _availability_total(metrics_spy) == 1
 
 
 async def test_stream_uses_httpx_timeout_object_preserving_pool_timeout(

From 8c41794c3e65cf8d605b5ac716639f082763c998 Mon Sep 17 00:00:00 2001
From: Randy Concepcion <rconcepcion@mozilla.com>
Date: Thu, 11 Jun 2026 22:52:42 -0400
Subject: [PATCH 2/9] instrument pre-completion chat availability

---
 src/mlpa/core/auth/authorize.py               |  62 ++-
 src/mlpa/core/completions.py                  |  31 +-
 src/mlpa/core/metrics.py                      |  23 +-
 src/mlpa/core/prometheus_metrics.py           |  23 +-
 src/mlpa/run.py                               |   3 +
 .../unit/test_precompletion_availability.py   | 397 ++++++++++++++++++
 6 files changed, 507 insertions(+), 32 deletions(-)
 create mode 100644 src/tests/unit/test_precompletion_availability.py

diff --git a/src/mlpa/core/auth/authorize.py b/src/mlpa/core/auth/authorize.py
index 50eae01..e781ff9 100644
--- a/src/mlpa/core/auth/authorize.py
+++ b/src/mlpa/core/auth/authorize.py
@@ -13,6 +13,8 @@
     ServiceType,
 )
 from mlpa.core.config import env
+from mlpa.core.metrics import record_chat_availability_for
+from mlpa.core.prometheus_metrics import AvailabilityReason
 from mlpa.core.routers.appattest import app_attest_auth
 from mlpa.core.utils import extract_user_from_play_integrity_jwt, parse_app_attest_jwt
 
@@ -123,27 +125,57 @@ async def authorize_chat_request(
     )
 
     if not is_service_type_valid:
+        record_chat_availability_for(
+            AvailabilityReason.INVALID_SERVICE_TYPE_FOR_MODEL,
+            model=chat_request.model,
+            service_type=service_type.value,
+            purpose="",
+        )
         raise HTTPException(
             status_code=400,
             detail=f"Invalid service-type value for model {chat_request.model}. Should be one of {env.forced_model_service_type_pairs.get(chat_request.model)}",
         )
 
-    return await _authorize_common_request(
-        request=request,
-        build_authorized_request=lambda user, purpose_value: AuthorizedChatRequest(
-            user=user,
+    try:
+        return await _authorize_common_request(
+            request=request,
+            build_authorized_request=lambda user, purpose_value: AuthorizedChatRequest(
+                user=user,
+                service_type=service_type.value,
+                purpose=purpose_value,
+                **chat_request.model_dump(exclude_unset=True),
+            ),
+            authorization=authorization,
+            service_type=service_type,
+            purpose=purpose,
+            x_dev_authorization=x_dev_authorization,
+            use_app_attest=use_app_attest,
+            use_qa_certificates=use_qa_certificates,
+            use_play_integrity=use_play_integrity,
+        )
+    except HTTPException as exc:
+        # Record only the terminal auth dispositions the dependency intentionally
+        # emits. 401 is an expected or normalized auth rejection. 400 is a
+        # client-error request to the auth layer (invalid purpose, or malformed App
+        # Attest base64 decoded in app_attest_auth). Any other status, including App
+        # Attest's explicit 500, is re-raised unrecorded; auth-system-failure capture
+        # is left to a follow-on auth backend change. Non-HTTPException errors are not
+        # caught here and propagate unrecorded. Purpose is unresolved at these
+        # dispositions, so a stable "" placeholder is used, never the unvalidated
+        # header value.
+        if exc.status_code == 401:
+            reason = AvailabilityReason.AUTH_REJECTED
+        elif exc.status_code == 400:
+            reason = AvailabilityReason.INVALID_AUTH_REQUEST
+        else:
+            raise
+        record_chat_availability_for(
+            reason,
+            model=chat_request.model,
             service_type=service_type.value,
-            purpose=purpose_value,
-            **chat_request.model_dump(exclude_unset=True),
-        ),
-        authorization=authorization,
-        service_type=service_type,
-        purpose=purpose,
-        x_dev_authorization=x_dev_authorization,
-        use_app_attest=use_app_attest,
-        use_qa_certificates=use_qa_certificates,
-        use_play_integrity=use_play_integrity,
-    )
+            purpose="",
+        )
+        raise
 
 
 async def authorize_search_request(
diff --git a/src/mlpa/core/completions.py b/src/mlpa/core/completions.py
index 5f40844..1e531a8 100644
--- a/src/mlpa/core/completions.py
+++ b/src/mlpa/core/completions.py
@@ -54,20 +54,29 @@ def _build_litellm_body(req: AuthorizedChatRequest, *, stream: bool) -> dict:
 async def get_or_create_user_for_completion(
     user_id: str, req: AuthorizedChatRequest | AuthorizedSearchRequest
 ):
-    """Wraps get_or_create_user and records a signup-cap rejection metric if applicable."""
+    """Wraps get_or_create_user, recording the signup-cap rejection metric and the
+    pre-completion availability disposition (signup cap, or a 5xx provisioning
+    failure) for chat requests."""
     try:
         return await get_or_create_user(user_id)
     except HTTPException as exc:
-        if (
-            exc.status_code == 403
-            and isinstance(exc.detail, dict)
-            and exc.detail.get("error") == ERROR_CODE_MAX_USERS_REACHED
-            and isinstance(req, AuthorizedChatRequest)
-        ):
-            record_chat_request_rejection(
-                req,
-                PrometheusRejectionReason.SIGNUP_CAP_EXCEEDED,
-            )
+        if isinstance(req, AuthorizedChatRequest):
+            if (
+                exc.status_code == 403
+                and isinstance(exc.detail, dict)
+                and exc.detail.get("error") == ERROR_CODE_MAX_USERS_REACHED
+            ):
+                record_chat_request_rejection(
+                    req,
+                    PrometheusRejectionReason.SIGNUP_CAP_EXCEEDED,
+                )
+                record_chat_availability(req, AvailabilityReason.SIGNUP_CAP_EXCEEDED)
+            elif exc.status_code >= 500:
+                # User-resolution / provisioning system failure for an eligible
+                # request. Non-signup-cap, non-5xx dispositions are left unrecorded;
+                # a future client-side 4xx should get its own classification rather
+                # than counting as an availability failure.
+                record_chat_availability(req, AvailabilityReason.PROVISIONING_FAILURE)
         raise
 
 
diff --git a/src/mlpa/core/metrics.py b/src/mlpa/core/metrics.py
index 110fb4d..f81783a 100644
--- a/src/mlpa/core/metrics.py
+++ b/src/mlpa/core/metrics.py
@@ -43,16 +43,33 @@ def record_search_request_rejection(
     metrics.search_request_rejections.labels(reason=reason, **_search_labels(req)).inc()
 
 
-def record_chat_availability(
-    req: AuthorizedChatRequest, reason: AvailabilityReason
+def record_chat_availability_for(
+    reason: AvailabilityReason,
+    *,
+    model: str,
+    service_type: str,
+    purpose: str,
 ) -> None:
     metrics.chat_availability.labels(
         outcome=availability_outcome_for(reason),
         reason=reason,
-        **_chat_labels(req),
+        model=model,
+        service_type=service_type,
+        purpose=purpose,
     ).inc()
 
 
+def record_chat_availability(
+    req: AuthorizedChatRequest, reason: AvailabilityReason
+) -> None:
+    record_chat_availability_for(
+        reason,
+        model=req.model,
+        service_type=req.service_type,
+        purpose=req.purpose,
+    )
+
+
 def record_completion_latency(
     req: AuthorizedChatRequest,
     result: PrometheusResult,
diff --git a/src/mlpa/core/prometheus_metrics.py b/src/mlpa/core/prometheus_metrics.py
index 6930c4b..531816c 100644
--- a/src/mlpa/core/prometheus_metrics.py
+++ b/src/mlpa/core/prometheus_metrics.py
@@ -44,9 +44,19 @@ class AvailabilityReason(StrEnum):
     INVALID_REQUEST = "invalid_request"
     # abort
     CLIENT_DISCONNECT = "client_disconnect"
-    # TODO: add pre-completion reasons (auth_rejected, auth_system_failure,
-    # signup_cap_exceeded, blocked, provisioning_failure, invalid_purpose,
-    # invalid_service_type_for_model) when those paths are instrumented.
+    # pre-completion (auth dependency and route body, before the completion path)
+    # excluded
+    AUTH_REJECTED = "auth_rejected"
+    INVALID_AUTH_REQUEST = "invalid_auth_request"
+    INVALID_SERVICE_TYPE_FOR_MODEL = "invalid_service_type_for_model"
+    SIGNUP_CAP_EXCEEDED = "signup_cap_exceeded"
+    BLOCKED = "blocked"
+    # failure
+    PROVISIONING_FAILURE = "provisioning_failure"
+    # Defined but not emitted yet: auth backends normalize system failures to 401
+    # (indistinguishable from expected rejections), so capturing this is left to a
+    # follow-on auth backend change that surfaces a real disposition.
+    AUTH_SYSTEM_FAILURE = "auth_system_failure"
 
 
 _AVAILABILITY_OUTCOME_BY_REASON: dict[AvailabilityReason, AvailabilityOutcome] = {
@@ -60,6 +70,13 @@ class AvailabilityReason(StrEnum):
     AvailabilityReason.INVALID_MODEL_NAME: AvailabilityOutcome.EXCLUDED,
     AvailabilityReason.INVALID_REQUEST: AvailabilityOutcome.EXCLUDED,
     AvailabilityReason.CLIENT_DISCONNECT: AvailabilityOutcome.ABORT,
+    AvailabilityReason.AUTH_REJECTED: AvailabilityOutcome.EXCLUDED,
+    AvailabilityReason.INVALID_AUTH_REQUEST: AvailabilityOutcome.EXCLUDED,
+    AvailabilityReason.INVALID_SERVICE_TYPE_FOR_MODEL: AvailabilityOutcome.EXCLUDED,
+    AvailabilityReason.SIGNUP_CAP_EXCEEDED: AvailabilityOutcome.EXCLUDED,
+    AvailabilityReason.BLOCKED: AvailabilityOutcome.EXCLUDED,
+    AvailabilityReason.PROVISIONING_FAILURE: AvailabilityOutcome.FAILURE,
+    AvailabilityReason.AUTH_SYSTEM_FAILURE: AvailabilityOutcome.FAILURE,
 }
 
 
diff --git a/src/mlpa/run.py b/src/mlpa/run.py
index 6a98a06..b7baebf 100644
--- a/src/mlpa/run.py
+++ b/src/mlpa/run.py
@@ -26,9 +26,11 @@
 )
 from mlpa.core.http_client import close_http_client, get_http_client
 from mlpa.core.logger import logger, setup_logger
+from mlpa.core.metrics import record_chat_availability
 from mlpa.core.middleware import register_middleware
 from mlpa.core.openapi import customize_openapi
 from mlpa.core.pg_services.services import app_attest_pg, litellm_pg
+from mlpa.core.prometheus_metrics import AvailabilityReason
 from mlpa.core.routers.appattest import appattest_router
 from mlpa.core.routers.health import health_router
 from mlpa.core.routers.mock import mock_router
@@ -207,6 +209,7 @@ async def chat_completion(
         )
     user, _ = await get_or_create_user_for_completion(user_id, authorized_chat_request)
     if user.get("blocked"):
+        record_chat_availability(authorized_chat_request, AvailabilityReason.BLOCKED)
         raise HTTPException(status_code=403, detail={"error": "User is blocked."})
 
     if authorized_chat_request.stream:
diff --git a/src/tests/unit/test_precompletion_availability.py b/src/tests/unit/test_precompletion_availability.py
new file mode 100644
index 0000000..b871ebb
--- /dev/null
+++ b/src/tests/unit/test_precompletion_availability.py
@@ -0,0 +1,397 @@
+"""Pre-completion availability instrumentation.
+
+Covers the dispositions the completion-stage counter cannot see: the chat auth
+dependency (service-type / model, auth rejection, client-error auth request, and
+the statuses that are intentionally not recorded) and the route-body sites
+(signup cap, provisioning failure, blocked). The completion tests bypass the auth
+dependency, so the wrapper behavior is only exercised here.
+"""
+
+import pytest
+from fastapi import HTTPException, Request
+
+from mlpa import run as run_module
+from mlpa.core.auth import authorize as authorize_module
+from mlpa.core.classes import (
+    AuthorizedChatRequest,
+    AuthorizedSearchRequest,
+    ChatRequest,
+)
+from mlpa.core.completions import get_or_create_user_for_completion
+from mlpa.core.config import ERROR_CODE_MAX_USERS_REACHED
+from mlpa.core.prometheus_metrics import (
+    AvailabilityOutcome,
+    AvailabilityReason,
+    PrometheusRejectionReason,
+)
+from tests.consts import SAMPLE_REQUEST
+
+# A model/service-type pair that is valid together, so the wrapper passes its own
+# check and reaches the shared auth call.
+_VALID_MODEL = "openai/gpt-4o"
+_AI = authorize_module.ServiceType.ai
+
+
+def _make_request() -> Request:
+    async def receive() -> dict:
+        return {"type": "http.request", "body": b"", "more_body": False}
+
+    return Request(
+        {"type": "http", "method": "POST", "path": "/", "headers": []}, receive
+    )
+
+
+def _chat_request(model: str = _VALID_MODEL) -> ChatRequest:
+    return ChatRequest(model=model, messages=[{"role": "user", "content": "hi"}])
+
+
+def _availability(
+    spy,
+    outcome: AvailabilityOutcome,
+    reason: AvailabilityReason,
+    *,
+    model: str,
+    service_type: str,
+    purpose: str = "",
+) -> float:
+    return spy.value(
+        "chat_availability",
+        outcome=outcome,
+        reason=reason,
+        model=model,
+        service_type=service_type,
+        purpose=purpose,
+    )
+
+
+def _availability_total(spy) -> float:
+    """Sum of every chat_availability sample. Proves exactly one disposition."""
+    return sum(
+        s.value for s in spy.samples("chat_availability") if s.name.endswith("_total")
+    )
+
+
+def _rejection_total(spy) -> float:
+    return sum(
+        s.value
+        for s in spy.samples("chat_request_rejections")
+        if s.name.endswith("_total")
+    )
+
+
+# --- auth dependency (authorize_chat_request) ---------------------------------
+
+
+async def test_wrapper_success_records_no_auth_stage_availability(mocker, metrics_spy):
+    mocker.patch.object(
+        authorize_module,
+        "fxa_auth",
+        mocker.AsyncMock(return_value={"user": "user-123"}),
+    )
+
+    result = await authorize_module.authorize_chat_request(
+        request=_make_request(),
+        chat_request=_chat_request(),
+        authorization="Bearer token",
+        service_type=_AI,
+        purpose="chat",
+    )
+
+    assert isinstance(result, AuthorizedChatRequest)
+    # Auth success is finalized later at completion, never at the auth stage.
+    assert "chat_availability" not in metrics_spy.touched()
+
+
+async def test_wrapper_invalid_service_type_records_excluded(metrics_spy):
+    with pytest.raises(HTTPException) as exc_info:
+        await authorize_module.authorize_chat_request(
+            request=_make_request(),
+            chat_request=ChatRequest(model="exa", messages=[]),
+            authorization="Bearer token",
+            service_type=_AI,
+            purpose="chat",
+        )
+
+    assert exc_info.value.status_code == 400
+    assert (
+        _availability(
+            metrics_spy,
+            AvailabilityOutcome.EXCLUDED,
+            AvailabilityReason.INVALID_SERVICE_TYPE_FOR_MODEL,
+            model="exa",
+            service_type="ai",
+        )
+        == 1
+    )
+    assert _availability_total(metrics_spy) == 1
+
+
+async def test_wrapper_invalid_purpose_records_invalid_auth_request(metrics_spy):
+    # A real shared-call 400 from purpose validation maps to the coarse reason.
+    with pytest.raises(HTTPException) as exc_info:
+        await authorize_module.authorize_chat_request(
+            request=_make_request(),
+            chat_request=_chat_request(),
+            authorization="Bearer token",
+            service_type=_AI,
+            purpose="definitely-not-a-valid-purpose",
+        )
+
+    assert exc_info.value.status_code == 400
+    assert (
+        _availability(
+            metrics_spy,
+            AvailabilityOutcome.EXCLUDED,
+            AvailabilityReason.INVALID_AUTH_REQUEST,
+            model=_VALID_MODEL,
+            service_type="ai",
+        )
+        == 1
+    )
+    assert _availability_total(metrics_spy) == 1
+
+
+async def test_wrapper_shared_call_401_records_auth_rejected(mocker, metrics_spy):
+    raised = HTTPException(status_code=401, detail="Invalid FxA auth")
+    mocker.patch.object(
+        authorize_module,
+        "_authorize_common_request",
+        mocker.AsyncMock(side_effect=raised),
+    )
+
+    with pytest.raises(HTTPException) as exc_info:
+        await authorize_module.authorize_chat_request(
+            request=_make_request(),
+            chat_request=_chat_request(),
+            authorization="Bearer token",
+            service_type=_AI,
+            purpose="chat",
+        )
+
+    assert exc_info.value is raised  # re-raised unchanged
+    assert (
+        _availability(
+            metrics_spy,
+            AvailabilityOutcome.EXCLUDED,
+            AvailabilityReason.AUTH_REJECTED,
+            model=_VALID_MODEL,
+            service_type="ai",
+        )
+        == 1
+    )
+    assert _availability_total(metrics_spy) == 1
+
+
+async def test_wrapper_shared_call_400_records_invalid_auth_request(
+    mocker, metrics_spy
+):
+    # Pins that the wrapper maps any shared-call 400 to invalid_auth_request,
+    # regardless of source. The non-purpose source (malformed App Attest base64
+    # decoded in app_attest_auth, which raises before its try) was confirmed by
+    # code inspection; this test proves the mapping, not that path's reachability.
+    raised = HTTPException(status_code=400, detail={"challenge_b64": "Invalid Base64"})
+    mocker.patch.object(
+        authorize_module,
+        "_authorize_common_request",
+        mocker.AsyncMock(side_effect=raised),
+    )
+
+    with pytest.raises(HTTPException) as exc_info:
+        await authorize_module.authorize_chat_request(
+            request=_make_request(),
+            chat_request=_chat_request(),
+            authorization="Bearer token",
+            service_type=_AI,
+            purpose="chat",
+        )
+
+    assert exc_info.value is raised  # re-raised unchanged
+    assert (
+        _availability(
+            metrics_spy,
+            AvailabilityOutcome.EXCLUDED,
+            AvailabilityReason.INVALID_AUTH_REQUEST,
+            model=_VALID_MODEL,
+            service_type="ai",
+        )
+        == 1
+    )
+    assert _availability_total(metrics_spy) == 1
+
+
+async def test_wrapper_shared_call_500_records_nothing(mocker, metrics_spy):
+    # App Attest's explicit 500 is an HTTPException, so it is caught, but the
+    # wrapper records only 401/400 and re-raises everything else unrecorded.
+    raised = HTTPException(
+        status_code=500, detail="Server error during App Attest auth"
+    )
+    mocker.patch.object(
+        authorize_module,
+        "_authorize_common_request",
+        mocker.AsyncMock(side_effect=raised),
+    )
+
+    with pytest.raises(HTTPException) as exc_info:
+        await authorize_module.authorize_chat_request(
+            request=_make_request(),
+            chat_request=_chat_request(),
+            authorization="Bearer token",
+            service_type=_AI,
+            purpose="chat",
+        )
+
+    assert exc_info.value is raised  # re-raised unchanged
+    assert "chat_availability" not in metrics_spy.touched()
+
+
+async def test_wrapper_shared_call_non_http_exception_records_nothing(
+    mocker, metrics_spy
+):
+    # Non-HTTPException auth-path errors are not caught and propagate unrecorded.
+    raised = RuntimeError("bare auth-path failure")
+    mocker.patch.object(
+        authorize_module,
+        "_authorize_common_request",
+        mocker.AsyncMock(side_effect=raised),
+    )
+
+    with pytest.raises(RuntimeError):
+        await authorize_module.authorize_chat_request(
+            request=_make_request(),
+            chat_request=_chat_request(),
+            authorization="Bearer token",
+            service_type=_AI,
+            purpose="chat",
+        )
+
+    assert "chat_availability" not in metrics_spy.touched()
+
+
+# --- route body: get_or_create_user_for_completion ----------------------------
+
+
+async def test_signup_cap_records_excluded_alongside_rejection(mocker, metrics_spy):
+    mocker.patch(
+        "mlpa.core.completions.get_or_create_user",
+        mocker.AsyncMock(
+            side_effect=HTTPException(
+                status_code=403, detail={"error": ERROR_CODE_MAX_USERS_REACHED}
+            )
+        ),
+    )
+
+    with pytest.raises(HTTPException) as exc_info:
+        await get_or_create_user_for_completion(SAMPLE_REQUEST.user, SAMPLE_REQUEST)
+
+    assert exc_info.value.status_code == 403
+    assert (
+        _availability(
+            metrics_spy,
+            AvailabilityOutcome.EXCLUDED,
+            AvailabilityReason.SIGNUP_CAP_EXCEEDED,
+            model=SAMPLE_REQUEST.model,
+            service_type=SAMPLE_REQUEST.service_type,
+            purpose=SAMPLE_REQUEST.purpose,
+        )
+        == 1
+    )
+    assert _availability_total(metrics_spy) == 1
+    # The existing rejection metric is still recorded.
+    assert _rejection_total(metrics_spy) == 1
+
+
+async def test_provisioning_failure_records_failure(mocker, metrics_spy):
+    mocker.patch(
+        "mlpa.core.completions.get_or_create_user",
+        mocker.AsyncMock(
+            side_effect=HTTPException(
+                status_code=500, detail={"error": "Error fetching user info"}
+            )
+        ),
+    )
+
+    with pytest.raises(HTTPException) as exc_info:
+        await get_or_create_user_for_completion(SAMPLE_REQUEST.user, SAMPLE_REQUEST)
+
+    assert exc_info.value.status_code == 500
+    assert (
+        _availability(
+            metrics_spy,
+            AvailabilityOutcome.FAILURE,
+            AvailabilityReason.PROVISIONING_FAILURE,
+            model=SAMPLE_REQUEST.model,
+            service_type=SAMPLE_REQUEST.service_type,
+            purpose=SAMPLE_REQUEST.purpose,
+        )
+        == 1
+    )
+    assert _availability_total(metrics_spy) == 1
+
+
+async def test_non_signup_non_5xx_records_nothing(mocker, metrics_spy):
+    # The strict gate leaves a non-signup-cap, non-5xx disposition unrecorded so a
+    # client-side 4xx is not counted as an availability failure.
+    mocker.patch(
+        "mlpa.core.completions.get_or_create_user",
+        mocker.AsyncMock(
+            side_effect=HTTPException(
+                status_code=400, detail={"error": "Invalid user_id format"}
+            )
+        ),
+    )
+
+    with pytest.raises(HTTPException) as exc_info:
+        await get_or_create_user_for_completion(SAMPLE_REQUEST.user, SAMPLE_REQUEST)
+
+    assert exc_info.value.status_code == 400
+    assert "chat_availability" not in metrics_spy.touched()
+
+
+async def test_search_request_records_no_chat_availability(mocker, metrics_spy):
+    search_req = AuthorizedSearchRequest(
+        user="user-1:search", service_type="search", query="q", max_results=2
+    )
+    mocker.patch(
+        "mlpa.core.completions.get_or_create_user",
+        mocker.AsyncMock(
+            side_effect=HTTPException(
+                status_code=500, detail={"error": "Error fetching user info"}
+            )
+        ),
+    )
+
+    with pytest.raises(HTTPException):
+        await get_or_create_user_for_completion(search_req.user, search_req)
+
+    assert "chat_availability" not in metrics_spy.touched()
+
+
+# --- route body: blocked user -------------------------------------------------
+
+
+async def test_blocked_user_records_blocked(mocker, metrics_spy):
+    mocker.patch.object(
+        run_module,
+        "get_or_create_user_for_completion",
+        mocker.AsyncMock(return_value=({"blocked": True}, False)),
+    )
+
+    with pytest.raises(HTTPException) as exc_info:
+        await run_module.chat_completion(
+            request=_make_request(),
+            authorized_chat_request=SAMPLE_REQUEST,
+        )
+
+    assert exc_info.value.status_code == 403
+    assert (
+        _availability(
+            metrics_spy,
+            AvailabilityOutcome.EXCLUDED,
+            AvailabilityReason.BLOCKED,
+            model=SAMPLE_REQUEST.model,
+            service_type=SAMPLE_REQUEST.service_type,
+            purpose=SAMPLE_REQUEST.purpose,
+        )
+        == 1
+    )
+    assert _availability_total(metrics_spy) == 1

From 2c0082137fbecdcb27e938a867397bff88dd956f Mon Sep 17 00:00:00 2001
From: Randy Concepcion <rconcepcion@mozilla.com>
Date: Fri, 12 Jun 2026 00:32:21 -0400
Subject: [PATCH 3/9] clean up comments and remove unused import

---
 src/mlpa/core/auth/authorize.py               | 17 +++---
 src/mlpa/core/completions.py                  | 16 +++---
 src/mlpa/core/errors.py                       |  4 +-
 src/mlpa/core/prometheus_metrics.py           | 57 +++++++++----------
 .../unit/test_precompletion_availability.py   |  1 -
 5 files changed, 46 insertions(+), 49 deletions(-)

diff --git a/src/mlpa/core/auth/authorize.py b/src/mlpa/core/auth/authorize.py
index e781ff9..a12b8c1 100644
--- a/src/mlpa/core/auth/authorize.py
+++ b/src/mlpa/core/auth/authorize.py
@@ -154,15 +154,14 @@ async def authorize_chat_request(
             use_play_integrity=use_play_integrity,
         )
     except HTTPException as exc:
-        # Record only the terminal auth dispositions the dependency intentionally
-        # emits. 401 is an expected or normalized auth rejection. 400 is a
-        # client-error request to the auth layer (invalid purpose, or malformed App
-        # Attest base64 decoded in app_attest_auth). Any other status, including App
-        # Attest's explicit 500, is re-raised unrecorded; auth-system-failure capture
-        # is left to a follow-on auth backend change. Non-HTTPException errors are not
-        # caught here and propagate unrecorded. Purpose is unresolved at these
-        # dispositions, so a stable "" placeholder is used, never the unvalidated
-        # header value.
+        # Only record terminal HTTP failures from the shared auth call:
+        # - 401: expected or normalized auth rejection (bad creds, expired token, etc.)
+        # - 400: client error to the auth layer (invalid purpose, or malformed App
+        #        Attest base64 decoded in app_attest_auth before its try block)
+        # - anything else (e.g. App Attest's explicit 500): re-raised unrecorded;
+        #   auth-system-failure capture is left to a follow-on auth backend change
+        # Non-HTTPException errors are not caught here and propagate unrecorded.
+        # Purpose is unknown at this point, so "" is always used as a placeholder.
         if exc.status_code == 401:
             reason = AvailabilityReason.AUTH_REJECTED
         elif exc.status_code == 400:
diff --git a/src/mlpa/core/completions.py b/src/mlpa/core/completions.py
index 1e531a8..0ee102c 100644
--- a/src/mlpa/core/completions.py
+++ b/src/mlpa/core/completions.py
@@ -54,9 +54,12 @@ def _build_litellm_body(req: AuthorizedChatRequest, *, stream: bool) -> dict:
 async def get_or_create_user_for_completion(
     user_id: str, req: AuthorizedChatRequest | AuthorizedSearchRequest
 ):
-    """Wraps get_or_create_user, recording the signup-cap rejection metric and the
-    pre-completion availability disposition (signup cap, or a 5xx provisioning
-    failure) for chat requests."""
+    """
+    Wraps get_or_create_user and records availability for chat requests:
+    - signup cap (403 + MAX_USERS_REACHED): excluded, alongside the existing rejection metric
+    - user-resolution failure (status >= 500): failure
+    - search requests and non-signup-cap, non-5xx failures: not recorded
+    """
     try:
         return await get_or_create_user(user_id)
     except HTTPException as exc:
@@ -72,10 +75,9 @@ async def get_or_create_user_for_completion(
                 )
                 record_chat_availability(req, AvailabilityReason.SIGNUP_CAP_EXCEEDED)
             elif exc.status_code >= 500:
-                # User-resolution / provisioning system failure for an eligible
-                # request. Non-signup-cap, non-5xx dispositions are left unrecorded;
-                # a future client-side 4xx should get its own classification rather
-                # than counting as an availability failure.
+                # User-resolution system failure. Non-signup-cap 4xx errors are
+                # not recorded; a client-side 4xx should get its own classification
+                # rather than counting as an availability failure.
                 record_chat_availability(req, AvailabilityReason.PROVISIONING_FAILURE)
         raise
 
diff --git a/src/mlpa/core/errors.py b/src/mlpa/core/errors.py
index 8bfc0dc..3ea4ca4 100644
--- a/src/mlpa/core/errors.py
+++ b/src/mlpa/core/errors.py
@@ -37,8 +37,8 @@ class RejectionMatch:
     log_message: str = ""
 
     def availability_reason(self) -> AvailabilityReason:
-        # classify_upstream_error only ever produces these completion-stage
-        # rejection reasons; SIGNUP_CAP_EXCEEDED is recorded pre-completion.
+        # SIGNUP_CAP_EXCEEDED is recorded pre-completion, not via classify_upstream_error,
+        # so it is not in the mapping below.
         if self.reason == PrometheusRejectionReason.RATE_LIMITED:
             if self.error_code == ERROR_CODE_UPSTREAM_RATE_LIMIT_EXCEEDED:
                 return AvailabilityReason.RATE_LIMITED_UPSTREAM
diff --git a/src/mlpa/core/prometheus_metrics.py b/src/mlpa/core/prometheus_metrics.py
index 531816c..d104020 100644
--- a/src/mlpa/core/prometheus_metrics.py
+++ b/src/mlpa/core/prometheus_metrics.py
@@ -27,36 +27,33 @@ class AvailabilityOutcome(StrEnum):
 
 
 class AvailabilityReason(StrEnum):
-    # Values overlapping PrometheusRejectionReason are kept identical so the
-    # availability counter and the rejection counter reconcile on shared strings.
-    # Keep them in sync if a rejection reason is added.
-    # success
-    CLEAN_COMPLETION = "clean_completion"
-    # failure
-    UPSTREAM_ERROR = "upstream_error"
-    EMPTY_RESPONSE = "empty_response"
-    # excluded (policy rejections)
-    BUDGET_EXCEEDED = "budget_exceeded"
-    RATE_LIMITED_OWN = "rate_limited_own"
-    RATE_LIMITED_UPSTREAM = "rate_limited_upstream"
-    PAYLOAD_TOO_LARGE = "payload_too_large"
-    INVALID_MODEL_NAME = "invalid_model_name"
-    INVALID_REQUEST = "invalid_request"
-    # abort
-    CLIENT_DISCONNECT = "client_disconnect"
-    # pre-completion (auth dependency and route body, before the completion path)
-    # excluded
-    AUTH_REJECTED = "auth_rejected"
-    INVALID_AUTH_REQUEST = "invalid_auth_request"
-    INVALID_SERVICE_TYPE_FOR_MODEL = "invalid_service_type_for_model"
-    SIGNUP_CAP_EXCEEDED = "signup_cap_exceeded"
-    BLOCKED = "blocked"
-    # failure
-    PROVISIONING_FAILURE = "provisioning_failure"
-    # Defined but not emitted yet: auth backends normalize system failures to 401
-    # (indistinguishable from expected rejections), so capturing this is left to a
-    # follow-on auth backend change that surfaces a real disposition.
-    AUTH_SYSTEM_FAILURE = "auth_system_failure"
+    # Strings shared with PrometheusRejectionReason are kept identical so the
+    # two counters reconcile. Keep them in sync when a rejection reason is added.
+
+    # --- completion-stage reasons (recorded inside stream_completion / get_completion) ---
+    CLEAN_COMPLETION = "clean_completion"  # success
+    UPSTREAM_ERROR = "upstream_error"  # failure
+    EMPTY_RESPONSE = "empty_response"  # failure
+    BUDGET_EXCEEDED = "budget_exceeded"  # excluded
+    RATE_LIMITED_OWN = "rate_limited_own"  # excluded
+    RATE_LIMITED_UPSTREAM = "rate_limited_upstream"  # excluded
+    PAYLOAD_TOO_LARGE = "payload_too_large"  # excluded
+    INVALID_MODEL_NAME = "invalid_model_name"  # excluded
+    INVALID_REQUEST = "invalid_request"  # excluded
+    CLIENT_DISCONNECT = "client_disconnect"  # abort
+
+    # --- pre-completion reasons (recorded in the auth dependency and route body) ---
+    AUTH_REJECTED = "auth_rejected"  # excluded
+    INVALID_AUTH_REQUEST = "invalid_auth_request"  # excluded
+    INVALID_SERVICE_TYPE_FOR_MODEL = "invalid_service_type_for_model"  # excluded
+    SIGNUP_CAP_EXCEEDED = "signup_cap_exceeded"  # excluded
+    BLOCKED = "blocked"  # excluded
+    PROVISIONING_FAILURE = "provisioning_failure"  # failure
+
+    # Defined but not yet emitted: auth backends normalize system failures to 401,
+    # making them indistinguishable from expected rejections. Capturing this
+    # properly requires a follow-on change to the auth backends themselves.
+    AUTH_SYSTEM_FAILURE = "auth_system_failure"  # failure
 
 
 _AVAILABILITY_OUTCOME_BY_REASON: dict[AvailabilityReason, AvailabilityOutcome] = {
diff --git a/src/tests/unit/test_precompletion_availability.py b/src/tests/unit/test_precompletion_availability.py
index b871ebb..12315ce 100644
--- a/src/tests/unit/test_precompletion_availability.py
+++ b/src/tests/unit/test_precompletion_availability.py
@@ -22,7 +22,6 @@
 from mlpa.core.prometheus_metrics import (
     AvailabilityOutcome,
     AvailabilityReason,
-    PrometheusRejectionReason,
 )
 from tests.consts import SAMPLE_REQUEST
 

From 945205654b9b33046f639afc9ec199b0cdc56aa4 Mon Sep 17 00:00:00 2001
From: Randy Concepcion <rconcepcion@mozilla.com>
Date: Tue, 16 Jun 2026 15:51:35 -0400
Subject: [PATCH 4/9] docs: clarify purpose placeholder and 5xx failure wording

---
 src/mlpa/core/auth/authorize.py | 2 ++
 src/mlpa/core/completions.py    | 6 +++---
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/src/mlpa/core/auth/authorize.py b/src/mlpa/core/auth/authorize.py
index a12b8c1..a35aa00 100644
--- a/src/mlpa/core/auth/authorize.py
+++ b/src/mlpa/core/auth/authorize.py
@@ -129,6 +129,8 @@ async def authorize_chat_request(
             AvailabilityReason.INVALID_SERVICE_TYPE_FOR_MODEL,
             model=chat_request.model,
             service_type=service_type.value,
+            # `purpose` is not yet resolved here
+            # This serves as a placeholder until a validated purpose is available
             purpose="",
         )
         raise HTTPException(
diff --git a/src/mlpa/core/completions.py b/src/mlpa/core/completions.py
index 0ee102c..ad09ed3 100644
--- a/src/mlpa/core/completions.py
+++ b/src/mlpa/core/completions.py
@@ -57,7 +57,7 @@ async def get_or_create_user_for_completion(
     """
     Wraps get_or_create_user and records availability for chat requests:
     - signup cap (403 + MAX_USERS_REACHED): excluded, alongside the existing rejection metric
-    - user-resolution failure (status >= 500): failure
+    - user-resolution server or system failure (status >= 500): failure
     - search requests and non-signup-cap, non-5xx failures: not recorded
     """
     try:
@@ -75,8 +75,8 @@ async def get_or_create_user_for_completion(
                 )
                 record_chat_availability(req, AvailabilityReason.SIGNUP_CAP_EXCEEDED)
             elif exc.status_code >= 500:
-                # User-resolution system failure. Non-signup-cap 4xx errors are
-                # not recorded; a client-side 4xx should get its own classification
+                # User-resolution server or system failure. Non-signup-cap 4xx errors
+                # are not recorded; a client-side 4xx should get its own classification
                 # rather than counting as an availability failure.
                 record_chat_availability(req, AvailabilityReason.PROVISIONING_FAILURE)
         raise

From 370078a74bc03a739fefc79e91977181d11a2bb6 Mon Sep 17 00:00:00 2001
From: Randy Concepcion <rconcepcion@mozilla.com>
Date: Tue, 16 Jun 2026 16:02:45 -0400
Subject: [PATCH 5/9] record PAYLOAD_TOO_LARGE availability in request-size
 middleware

---
 src/mlpa/core/middleware/request_size.py         | 11 +++++++++++
 src/tests/integration/test_request_size_limit.py | 13 ++++++++++++-
 2 files changed, 23 insertions(+), 1 deletion(-)

diff --git a/src/mlpa/core/middleware/request_size.py b/src/mlpa/core/middleware/request_size.py
index b9603e3..641a144 100644
--- a/src/mlpa/core/middleware/request_size.py
+++ b/src/mlpa/core/middleware/request_size.py
@@ -3,6 +3,8 @@
 
 from mlpa.core.config import ERROR_CODE_REQUEST_TOO_LARGE, env
 from mlpa.core.logger import logger
+from mlpa.core.metrics import record_chat_availability_for
+from mlpa.core.prometheus_metrics import AvailabilityReason
 
 
 async def check_request_size_middleware(request: Request, call_next):
@@ -19,6 +21,15 @@ async def check_request_size_middleware(request: Request, call_next):
                     logger.warning(
                         f"Request size {size} bytes exceeds maximum {env.MAX_REQUEST_SIZE_BYTES} bytes"
                     )
+                    # Placeholders for `model`, `service_type`, and `purpose`
+                    # are used since values are not set until the body is parsed
+                    # and auth has run.
+                    record_chat_availability_for(
+                        AvailabilityReason.PAYLOAD_TOO_LARGE,
+                        model="",
+                        service_type="",
+                        purpose="",
+                    )
                     return JSONResponse(
                         status_code=413,
                         content={"error": ERROR_CODE_REQUEST_TOO_LARGE},
diff --git a/src/tests/integration/test_request_size_limit.py b/src/tests/integration/test_request_size_limit.py
index 63e5c59..36d4263 100644
--- a/src/tests/integration/test_request_size_limit.py
+++ b/src/tests/integration/test_request_size_limit.py
@@ -23,7 +23,7 @@ def test_request_size_under_limit(mocked_client_integration):
     assert response.status_code != 413
 
 
-def test_request_size_over_limit(mocked_client_integration):
+def test_request_size_over_limit(mocked_client_integration, metrics_spy):
     """Test that requests over the size limit return 413."""
     max_size = env.MAX_REQUEST_SIZE_BYTES
     oversized_size = max_size + 1
@@ -61,6 +61,17 @@ def test_request_size_over_limit(mocked_client_integration):
 
     assert response.status_code == 413
     assert response.json() == {"error": 3}
+    assert (
+        metrics_spy.value(
+            "chat_availability",
+            outcome="excluded",
+            reason="payload_too_large",
+            model="",
+            service_type="",
+            purpose="",
+        )
+        == 1
+    )
 
 
 def test_request_size_exactly_at_limit(mocked_client_integration):

From 8167088d40b2e048115c31a565f43fbd8a8ef581 Mon Sep 17 00:00:00 2001
From: Randy Concepcion <rconcepcion@mozilla.com>
Date: Tue, 16 Jun 2026 16:17:24 -0400
Subject: [PATCH 6/9] order availability reasons by request stage

---
 src/mlpa/core/prometheus_metrics.py | 38 ++++++++++++++---------------
 1 file changed, 19 insertions(+), 19 deletions(-)

diff --git a/src/mlpa/core/prometheus_metrics.py b/src/mlpa/core/prometheus_metrics.py
index d104020..e6ed9c2 100644
--- a/src/mlpa/core/prometheus_metrics.py
+++ b/src/mlpa/core/prometheus_metrics.py
@@ -30,18 +30,6 @@ class AvailabilityReason(StrEnum):
     # Strings shared with PrometheusRejectionReason are kept identical so the
     # two counters reconcile. Keep them in sync when a rejection reason is added.
 
-    # --- completion-stage reasons (recorded inside stream_completion / get_completion) ---
-    CLEAN_COMPLETION = "clean_completion"  # success
-    UPSTREAM_ERROR = "upstream_error"  # failure
-    EMPTY_RESPONSE = "empty_response"  # failure
-    BUDGET_EXCEEDED = "budget_exceeded"  # excluded
-    RATE_LIMITED_OWN = "rate_limited_own"  # excluded
-    RATE_LIMITED_UPSTREAM = "rate_limited_upstream"  # excluded
-    PAYLOAD_TOO_LARGE = "payload_too_large"  # excluded
-    INVALID_MODEL_NAME = "invalid_model_name"  # excluded
-    INVALID_REQUEST = "invalid_request"  # excluded
-    CLIENT_DISCONNECT = "client_disconnect"  # abort
-
     # --- pre-completion reasons (recorded in the auth dependency and route body) ---
     AUTH_REJECTED = "auth_rejected"  # excluded
     INVALID_AUTH_REQUEST = "invalid_auth_request"  # excluded
@@ -55,8 +43,27 @@ class AvailabilityReason(StrEnum):
     # properly requires a follow-on change to the auth backends themselves.
     AUTH_SYSTEM_FAILURE = "auth_system_failure"  # failure
 
+    # --- completion-stage reasons (recorded inside stream_completion / get_completion) ---
+    CLEAN_COMPLETION = "clean_completion"  # success
+    UPSTREAM_ERROR = "upstream_error"  # failure
+    EMPTY_RESPONSE = "empty_response"  # failure
+    BUDGET_EXCEEDED = "budget_exceeded"  # excluded
+    RATE_LIMITED_OWN = "rate_limited_own"  # excluded
+    RATE_LIMITED_UPSTREAM = "rate_limited_upstream"  # excluded
+    PAYLOAD_TOO_LARGE = "payload_too_large"  # excluded
+    INVALID_MODEL_NAME = "invalid_model_name"  # excluded
+    INVALID_REQUEST = "invalid_request"  # excluded
+    CLIENT_DISCONNECT = "client_disconnect"  # abort
+
 
 _AVAILABILITY_OUTCOME_BY_REASON: dict[AvailabilityReason, AvailabilityOutcome] = {
+    AvailabilityReason.AUTH_REJECTED: AvailabilityOutcome.EXCLUDED,
+    AvailabilityReason.INVALID_AUTH_REQUEST: AvailabilityOutcome.EXCLUDED,
+    AvailabilityReason.INVALID_SERVICE_TYPE_FOR_MODEL: AvailabilityOutcome.EXCLUDED,
+    AvailabilityReason.SIGNUP_CAP_EXCEEDED: AvailabilityOutcome.EXCLUDED,
+    AvailabilityReason.BLOCKED: AvailabilityOutcome.EXCLUDED,
+    AvailabilityReason.PROVISIONING_FAILURE: AvailabilityOutcome.FAILURE,
+    AvailabilityReason.AUTH_SYSTEM_FAILURE: AvailabilityOutcome.FAILURE,
     AvailabilityReason.CLEAN_COMPLETION: AvailabilityOutcome.SUCCESS,
     AvailabilityReason.UPSTREAM_ERROR: AvailabilityOutcome.FAILURE,
     AvailabilityReason.EMPTY_RESPONSE: AvailabilityOutcome.FAILURE,
@@ -67,13 +74,6 @@ class AvailabilityReason(StrEnum):
     AvailabilityReason.INVALID_MODEL_NAME: AvailabilityOutcome.EXCLUDED,
     AvailabilityReason.INVALID_REQUEST: AvailabilityOutcome.EXCLUDED,
     AvailabilityReason.CLIENT_DISCONNECT: AvailabilityOutcome.ABORT,
-    AvailabilityReason.AUTH_REJECTED: AvailabilityOutcome.EXCLUDED,
-    AvailabilityReason.INVALID_AUTH_REQUEST: AvailabilityOutcome.EXCLUDED,
-    AvailabilityReason.INVALID_SERVICE_TYPE_FOR_MODEL: AvailabilityOutcome.EXCLUDED,
-    AvailabilityReason.SIGNUP_CAP_EXCEEDED: AvailabilityOutcome.EXCLUDED,
-    AvailabilityReason.BLOCKED: AvailabilityOutcome.EXCLUDED,
-    AvailabilityReason.PROVISIONING_FAILURE: AvailabilityOutcome.FAILURE,
-    AvailabilityReason.AUTH_SYSTEM_FAILURE: AvailabilityOutcome.FAILURE,
 }
 
 

From 98b120c690a3c67448c4cada656747a361303735 Mon Sep 17 00:00:00 2001
From: Randy Concepcion <rconcepcion@mozilla.com>
Date: Tue, 16 Jun 2026 16:29:52 -0400
Subject: [PATCH 7/9] rename availability reasons for clarity (valid_response,
 rate_limited_platform)

---
 src/mlpa/core/completions.py        | 4 ++--
 src/mlpa/core/errors.py             | 2 +-
 src/mlpa/core/prometheus_metrics.py | 8 ++++----
 src/tests/unit/test_availability.py | 2 +-
 src/tests/unit/test_completions.py  | 6 +++---
 5 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/src/mlpa/core/completions.py b/src/mlpa/core/completions.py
index ad09ed3..971a586 100644
--- a/src/mlpa/core/completions.py
+++ b/src/mlpa/core/completions.py
@@ -272,7 +272,7 @@ async def _read_next_chunk(
                 snapshot=litellm_routing_snapshot,
             )
             result = PrometheusResult.SUCCESS
-            availability_reason = AvailabilityReason.CLEAN_COMPLETION
+            availability_reason = AvailabilityReason.VALID_RESPONSE
     except (GeneratorExit, asyncio.CancelledError):
         # Client went away mid-stream: Starlette tears the generator down by
         # throwing GeneratorExit (or cancelling the task) at the paused
@@ -384,7 +384,7 @@ async def get_completion(authorized_chat_request: AuthorizedChatRequest):
             snapshot=litellm_routing_snapshot,
         )
         result = PrometheusResult.SUCCESS
-        availability_reason = AvailabilityReason.CLEAN_COMPLETION
+        availability_reason = AvailabilityReason.VALID_RESPONSE
         return data
     except HTTPException:
         raise
diff --git a/src/mlpa/core/errors.py b/src/mlpa/core/errors.py
index 3ea4ca4..6ee521d 100644
--- a/src/mlpa/core/errors.py
+++ b/src/mlpa/core/errors.py
@@ -42,7 +42,7 @@ def availability_reason(self) -> AvailabilityReason:
         if self.reason == PrometheusRejectionReason.RATE_LIMITED:
             if self.error_code == ERROR_CODE_UPSTREAM_RATE_LIMIT_EXCEEDED:
                 return AvailabilityReason.RATE_LIMITED_UPSTREAM
-            return AvailabilityReason.RATE_LIMITED_OWN
+            return AvailabilityReason.RATE_LIMITED_PLATFORM
         return _REJECTION_TO_AVAILABILITY_REASON[self.reason]
 
 
diff --git a/src/mlpa/core/prometheus_metrics.py b/src/mlpa/core/prometheus_metrics.py
index e6ed9c2..757634d 100644
--- a/src/mlpa/core/prometheus_metrics.py
+++ b/src/mlpa/core/prometheus_metrics.py
@@ -44,11 +44,11 @@ class AvailabilityReason(StrEnum):
     AUTH_SYSTEM_FAILURE = "auth_system_failure"  # failure
 
     # --- completion-stage reasons (recorded inside stream_completion / get_completion) ---
-    CLEAN_COMPLETION = "clean_completion"  # success
+    VALID_RESPONSE = "valid_response"  # success
     UPSTREAM_ERROR = "upstream_error"  # failure
     EMPTY_RESPONSE = "empty_response"  # failure
     BUDGET_EXCEEDED = "budget_exceeded"  # excluded
-    RATE_LIMITED_OWN = "rate_limited_own"  # excluded
+    RATE_LIMITED_PLATFORM = "rate_limited_platform"  # excluded
     RATE_LIMITED_UPSTREAM = "rate_limited_upstream"  # excluded
     PAYLOAD_TOO_LARGE = "payload_too_large"  # excluded
     INVALID_MODEL_NAME = "invalid_model_name"  # excluded
@@ -64,11 +64,11 @@ class AvailabilityReason(StrEnum):
     AvailabilityReason.BLOCKED: AvailabilityOutcome.EXCLUDED,
     AvailabilityReason.PROVISIONING_FAILURE: AvailabilityOutcome.FAILURE,
     AvailabilityReason.AUTH_SYSTEM_FAILURE: AvailabilityOutcome.FAILURE,
-    AvailabilityReason.CLEAN_COMPLETION: AvailabilityOutcome.SUCCESS,
+    AvailabilityReason.VALID_RESPONSE: AvailabilityOutcome.SUCCESS,
     AvailabilityReason.UPSTREAM_ERROR: AvailabilityOutcome.FAILURE,
     AvailabilityReason.EMPTY_RESPONSE: AvailabilityOutcome.FAILURE,
     AvailabilityReason.BUDGET_EXCEEDED: AvailabilityOutcome.EXCLUDED,
-    AvailabilityReason.RATE_LIMITED_OWN: AvailabilityOutcome.EXCLUDED,
+    AvailabilityReason.RATE_LIMITED_PLATFORM: AvailabilityOutcome.EXCLUDED,
     AvailabilityReason.RATE_LIMITED_UPSTREAM: AvailabilityOutcome.EXCLUDED,
     AvailabilityReason.PAYLOAD_TOO_LARGE: AvailabilityOutcome.EXCLUDED,
     AvailabilityReason.INVALID_MODEL_NAME: AvailabilityOutcome.EXCLUDED,
diff --git a/src/tests/unit/test_availability.py b/src/tests/unit/test_availability.py
index ca9898e..ef794ab 100644
--- a/src/tests/unit/test_availability.py
+++ b/src/tests/unit/test_availability.py
@@ -65,7 +65,7 @@ def test_every_completion_stage_rejection_reason_maps_to_excluded():
         (
             PrometheusRejectionReason.RATE_LIMITED,
             ERROR_CODE_RATE_LIMIT_EXCEEDED,
-            AvailabilityReason.RATE_LIMITED_OWN,
+            AvailabilityReason.RATE_LIMITED_PLATFORM,
         ),
         (
             PrometheusRejectionReason.RATE_LIMITED,
diff --git a/src/tests/unit/test_completions.py b/src/tests/unit/test_completions.py
index afa6a25..c4147fe 100644
--- a/src/tests/unit/test_completions.py
+++ b/src/tests/unit/test_completions.py
@@ -174,7 +174,7 @@ async def test_get_completion_success(mocker, metrics_spy):
         _availability_count(
             metrics_spy,
             AvailabilityOutcome.SUCCESS,
-            AvailabilityReason.CLEAN_COMPLETION,
+            AvailabilityReason.VALID_RESPONSE,
         )
         == 1
     )
@@ -390,7 +390,7 @@ async def test_stream_completion_success(
         _availability_count(
             metrics_spy,
             AvailabilityOutcome.SUCCESS,
-            AvailabilityReason.CLEAN_COMPLETION,
+            AvailabilityReason.VALID_RESPONSE,
         )
         == 1
     )
@@ -569,7 +569,7 @@ async def test_get_completion_rate_limit_exceeded(mocker, metrics_spy):
         _availability_count(
             metrics_spy,
             AvailabilityOutcome.EXCLUDED,
-            AvailabilityReason.RATE_LIMITED_OWN,
+            AvailabilityReason.RATE_LIMITED_PLATFORM,
         )
         == 1
     )

From 890cdb643f307443b8763a138161384bcf234082 Mon Sep 17 00:00:00 2001
From: Randy Concepcion <rconcepcion@mozilla.com>
Date: Wed, 17 Jun 2026 15:51:00 -0400
Subject: [PATCH 8/9] populate service_type and purpose on pre-completion
 availability records

---
 src/mlpa/core/auth/authorize.py                   |  4 +---
 src/mlpa/core/middleware/request_size.py          | 11 ++++++-----
 src/tests/integration/test_request_size_limit.py  |  4 ++--
 src/tests/unit/test_precompletion_availability.py |  1 +
 4 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/src/mlpa/core/auth/authorize.py b/src/mlpa/core/auth/authorize.py
index a35aa00..c6fa463 100644
--- a/src/mlpa/core/auth/authorize.py
+++ b/src/mlpa/core/auth/authorize.py
@@ -129,9 +129,7 @@ async def authorize_chat_request(
             AvailabilityReason.INVALID_SERVICE_TYPE_FOR_MODEL,
             model=chat_request.model,
             service_type=service_type.value,
-            # `purpose` is not yet resolved here
-            # This serves as a placeholder until a validated purpose is available
-            purpose="",
+            purpose=(purpose or "").strip(),
         )
         raise HTTPException(
             status_code=400,
diff --git a/src/mlpa/core/middleware/request_size.py b/src/mlpa/core/middleware/request_size.py
index 641a144..7565b13 100644
--- a/src/mlpa/core/middleware/request_size.py
+++ b/src/mlpa/core/middleware/request_size.py
@@ -21,14 +21,15 @@ async def check_request_size_middleware(request: Request, call_next):
                     logger.warning(
                         f"Request size {size} bytes exceeds maximum {env.MAX_REQUEST_SIZE_BYTES} bytes"
                     )
-                    # Placeholders for `model`, `service_type`, and `purpose`
-                    # are used since values are not set until the body is parsed
-                    # and auth has run.
+                    # `model` is in the request body, which we don't read here.
+                    # We reject on the Content-Length header without parsing it.
                     record_chat_availability_for(
                         AvailabilityReason.PAYLOAD_TOO_LARGE,
                         model="",
-                        service_type="",
-                        purpose="",
+                        service_type=(
+                            request.headers.get("service-type") or ""
+                        ).strip(),
+                        purpose=(request.headers.get("purpose") or "").strip(),
                     )
                     return JSONResponse(
                         status_code=413,
diff --git a/src/tests/integration/test_request_size_limit.py b/src/tests/integration/test_request_size_limit.py
index 36d4263..ad4bb25 100644
--- a/src/tests/integration/test_request_size_limit.py
+++ b/src/tests/integration/test_request_size_limit.py
@@ -67,8 +67,8 @@ def test_request_size_over_limit(mocked_client_integration, metrics_spy):
             outcome="excluded",
             reason="payload_too_large",
             model="",
-            service_type="",
-            purpose="",
+            service_type="ai",
+            purpose="chat",
         )
         == 1
     )
diff --git a/src/tests/unit/test_precompletion_availability.py b/src/tests/unit/test_precompletion_availability.py
index 12315ce..04e850d 100644
--- a/src/tests/unit/test_precompletion_availability.py
+++ b/src/tests/unit/test_precompletion_availability.py
@@ -119,6 +119,7 @@ async def test_wrapper_invalid_service_type_records_excluded(metrics_spy):
             AvailabilityReason.INVALID_SERVICE_TYPE_FOR_MODEL,
             model="exa",
             service_type="ai",
+            purpose="chat",
         )
         == 1
     )

From d344857ffbd0d5faa9f7a50abcdb1a14b25464d8 Mon Sep 17 00:00:00 2001
From: Randy Concepcion <rconcepcion@mozilla.com>
Date: Thu, 18 Jun 2026 09:22:45 -0400
Subject: [PATCH 9/9] clean up strips and add purpose on auth-failure
 availability

---
 src/mlpa/core/auth/authorize.py                   | 7 +++----
 src/mlpa/core/middleware/request_size.py          | 6 ++----
 src/tests/unit/test_precompletion_availability.py | 3 +++
 3 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/src/mlpa/core/auth/authorize.py b/src/mlpa/core/auth/authorize.py
index c6fa463..6ecade0 100644
--- a/src/mlpa/core/auth/authorize.py
+++ b/src/mlpa/core/auth/authorize.py
@@ -27,7 +27,7 @@ def _resolve_purpose(service_type_value: str, purpose_header: str | None) -> str
     """Validate purpose header and return value; raise HTTPException if invalid."""
     valid = env.valid_purposes_for_service_type(service_type_value)
     requires = env.service_type_requires_purpose(service_type_value)
-    purpose = (purpose_header or "").strip()
+    purpose = purpose_header or ""
 
     if purpose:
         if purpose not in valid:
@@ -129,7 +129,7 @@ async def authorize_chat_request(
             AvailabilityReason.INVALID_SERVICE_TYPE_FOR_MODEL,
             model=chat_request.model,
             service_type=service_type.value,
-            purpose=(purpose or "").strip(),
+            purpose=purpose or "",
         )
         raise HTTPException(
             status_code=400,
@@ -161,7 +161,6 @@ async def authorize_chat_request(
         # - anything else (e.g. App Attest's explicit 500): re-raised unrecorded;
         #   auth-system-failure capture is left to a follow-on auth backend change
         # Non-HTTPException errors are not caught here and propagate unrecorded.
-        # Purpose is unknown at this point, so "" is always used as a placeholder.
         if exc.status_code == 401:
             reason = AvailabilityReason.AUTH_REJECTED
         elif exc.status_code == 400:
@@ -172,7 +171,7 @@ async def authorize_chat_request(
             reason,
             model=chat_request.model,
             service_type=service_type.value,
-            purpose="",
+            purpose=purpose or "",
         )
         raise
 
diff --git a/src/mlpa/core/middleware/request_size.py b/src/mlpa/core/middleware/request_size.py
index 7565b13..8a3d771 100644
--- a/src/mlpa/core/middleware/request_size.py
+++ b/src/mlpa/core/middleware/request_size.py
@@ -26,10 +26,8 @@ async def check_request_size_middleware(request: Request, call_next):
                     record_chat_availability_for(
                         AvailabilityReason.PAYLOAD_TOO_LARGE,
                         model="",
-                        service_type=(
-                            request.headers.get("service-type") or ""
-                        ).strip(),
-                        purpose=(request.headers.get("purpose") or "").strip(),
+                        service_type=request.headers.get("service-type") or "",
+                        purpose=request.headers.get("purpose") or "",
                     )
                     return JSONResponse(
                         status_code=413,
diff --git a/src/tests/unit/test_precompletion_availability.py b/src/tests/unit/test_precompletion_availability.py
index 04e850d..454b08d 100644
--- a/src/tests/unit/test_precompletion_availability.py
+++ b/src/tests/unit/test_precompletion_availability.py
@@ -145,6 +145,7 @@ async def test_wrapper_invalid_purpose_records_invalid_auth_request(metrics_spy)
             AvailabilityReason.INVALID_AUTH_REQUEST,
             model=_VALID_MODEL,
             service_type="ai",
+            purpose="definitely-not-a-valid-purpose",
         )
         == 1
     )
@@ -176,6 +177,7 @@ async def test_wrapper_shared_call_401_records_auth_rejected(mocker, metrics_spy
             AvailabilityReason.AUTH_REJECTED,
             model=_VALID_MODEL,
             service_type="ai",
+            purpose="chat",
         )
         == 1
     )
@@ -213,6 +215,7 @@ async def test_wrapper_shared_call_400_records_invalid_auth_request(
             AvailabilityReason.INVALID_AUTH_REQUEST,
             model=_VALID_MODEL,
             service_type="ai",
+            purpose="chat",
         )
         == 1
     )