From 54ae6a349ebcb050d10f5e386dca79c0b6ef0374 Mon Sep 17 00:00:00 2001
From: Brenda Solari <brenda@bitmaker.la>
Date: Wed, 3 Jun 2026 16:32:09 -0500
Subject: [PATCH] Add confidence helpers for LLM categorization

---
 README.md                                    |  37 ++
 src/ps_helper/confidence/README.md           | 229 +++++++++++
 src/ps_helper/confidence/__init__.py         |   3 +
 src/ps_helper/confidence/normalized_value.py | 385 +++++++++++++++++++
 tests/test_normalized_value_confidence.py    | 165 ++++++++
 5 files changed, 819 insertions(+)
 create mode 100644 src/ps_helper/confidence/README.md
 create mode 100644 src/ps_helper/confidence/__init__.py
 create mode 100644 src/ps_helper/confidence/normalized_value.py
 create mode 100644 tests/test_normalized_value_confidence.py

diff --git a/README.md b/README.md
index 8e9a608..54d981e 100644
--- a/README.md
+++ b/README.md
@@ -74,6 +74,43 @@ record_curl_transfer_bytes(
 
 With `MetricsExtension`, this is also reflected in the final JSON report under `resources`.
 
+### Normalize Or Score AI Values
+
+Use the reusable confidence helpers to normalize an LLM value against a
+controlled list, or audit an already-normalized value.
+
+Normalize when you do not have a normalized value yet:
+
+```python
+from ps_helper.confidence import normalize_with_confidence
+
+category_audit = normalize_with_confidence(
+    extracted_value="Kitchen tools",
+    allowed_values=["Kitchen Cookware Sets", "Canvas Tools & Accessories"],
+    field_name="category",
+    minimum_label="GOOD",
+)
+
+category_normalized = category_audit["normalized_value"]
+```
+
+Score when another system already chose the normalized value:
+
+```python
+from ps_helper.confidence import score_normalized_value
+
+category_audit = score_normalized_value(
+    extracted_value="Kitchen tools",
+    normalized_value="Canvas Tools & Accessories",
+    allowed_values=["Kitchen Cookware Sets", "Canvas Tools & Accessories"],
+    field_name="category",
+    llm_confidence=0.82,
+)
+```
+
+The output includes `confidence_score`, `confidence_label`, normalization
+status, validation flags, and audit metadata for candidate ranking.
+
 For automatic tracking in every curl request, use `TrackedCurlSession`:
 
 ```python
diff --git a/src/ps_helper/confidence/README.md b/src/ps_helper/confidence/README.md
new file mode 100644
index 0000000..175c701
--- /dev/null
+++ b/src/ps_helper/confidence/README.md
@@ -0,0 +1,229 @@
+# Confidence Helpers
+
+Helpers for AI-assisted normalization workflows where an extracted value needs
+to be compared with a controlled list of allowed values.
+
+This package does not prove ground-truth accuracy. It estimates normalization
+confidence from observable signals such as text similarity, candidate ranking,
+allowed-value membership, and optional LLM confidence.
+
+## Available Helpers
+
+Use `normalize_with_confidence` when you do not have a normalized value yet.
+It chooses the best allowed value and only accepts it when it reaches your
+configured strictness level.
+
+Use `score_normalized_value` when another system already selected a normalized
+value and you only want to audit that mapping.
+
+```python
+from ps_helper.confidence import normalize_with_confidence, score_normalized_value
+```
+
+## Mode 1: Normalize With Confidence
+
+Use this mode for this workflow:
+
+```text
+LLM extracted value + allowed values -> normalized value or suggestion
+```
+
+### Single Input
+
+```python
+audit = normalize_with_confidence(
+    extracted_value="Kitchen tools",
+    allowed_values=[
+        "Kitchen Cookware Sets",
+        "Canvas Tools & Accessories",
+        "Serving Trays",
+    ],
+    field_name="category",
+    minimum_label="GOOD",
+)
+
+category_normalized = audit["normalized_value"]
+```
+
+If the best candidate reaches `minimum_label`, `normalized_value` is populated.
+If it does not, `normalized_value` is `None` and `suggested_value` contains the
+best candidate for review.
+
+### Multiple Weighted Inputs
+
+Use `extracted_values` when you have multiple signals for the same field.
+
+```python
+audit = normalize_with_confidence(
+    extracted_values=[
+        {"value": "Kitchen tools", "weight": 0.7, "source": "llm"},
+        {"value": "Kitchen accessories", "weight": 0.2, "source": "breadcrumb"},
+        {"value": "Citrus squeezer", "weight": 0.1, "source": "title"},
+    ],
+    allowed_values=categories,
+    field_name="category",
+    minimum_label="GOOD",
+)
+```
+
+Weights are normalized internally. For example, weights `7`, `2`, and `1` become
+`0.7`, `0.2`, and `0.1`.
+
+Candidate score formula:
+
+```text
+candidate_score =
+  similarity(input_1, candidate) * weight_1
++ similarity(input_2, candidate) * weight_2
++ similarity(input_3, candidate) * weight_3
+```
+
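+The highest-scoring candidate becomes `suggested_value`. When `llm_confidence` is passed, the final `confidence_score` is `candidate_score * 0.85 + llm_confidence * 0.15`.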
+
+### Normalize Output Shape
+
+```json
+{
+  "field_name": "category",
+  "extracted_values": [
+    {"value": "Kitchen tools", "weight": 1.0, "source": null}
+  ],
+  "normalized_value": null,
+  "suggested_value": "Canvas Tools & Accessories",
+  "accepted": false,
+  "confidence_score": 0.64,
+  "confidence_label": "POSSIBLE",
+  "normalization": {
+    "status": "SUGGESTED",
+    "method": "fuzzy_match",
+    "match_score": 0.64,
+    "minimum_label": "GOOD"
+  },
+  "validation": {
+    "is_valid": false,
+    "flags": ["below_minimum_threshold"],
+    "requires_review": true
+  },
+  "audit": {
+    "allowed_values_count": 120,
+    "top_candidates": [
+      {"value": "Canvas Tools & Accessories", "score": 0.64}
+    ],
+    "input_signals": [],
+    "signals": {
+      "match_score": 0.64,
+      "llm_confidence_score": null
+    },
+    "thresholds": {}
+  }
+}
+```
+
+### Normalize Statuses
+
+- `NORMALIZED`: best candidate passed `minimum_label` and was accepted.
+- `SUGGESTED`: best candidate exists, but did not pass `minimum_label`.
+- `NO_MATCH`: no usable input or no allowed values were provided.
+
+## Mode 2: Score Existing Normalized Value
+
+Use this mode for this workflow:
+
+```text
+LLM extracted value + existing normalized value + allowed values -> audit score
+```
+
+```python
+audit = score_normalized_value(
+    extracted_value="Kitchen tools",
+    normalized_value="Canvas Tools & Accessories",
+    allowed_values=[
+        "Kitchen Cookware Sets",
+        "Canvas Tools & Accessories",
+        "Serving Trays",
+    ],
+    field_name="category",
+    llm_confidence=0.82,
+)
+```
+
+This mode does not choose a normalized value. It evaluates the mapping that was
+already chosen by another system.
+
+### Score Formula
+
+When `llm_confidence` is provided:
+
+```text
+confidence_score =
+  allowed_value_score * 0.30
++ text_similarity_score * 0.35
++ candidate_rank_score * 0.20
++ llm_confidence_score * 0.15
+```
+
+When `llm_confidence` is not provided, the remaining three weights are divided
+by their total (0.85) so the missing LLM score does not automatically penalize
+the result.
+
+### Score Signals
+
+- `allowed_value_score`: `1.0` when `normalized_value` exists in `allowed_values`, otherwise `0.0`.
+- `text_similarity_score`: textual similarity between `extracted_value` and `normalized_value`.
+- `candidate_rank_score`: rank quality of `normalized_value` among candidate matches.
+- `llm_confidence_score`: optional confidence reported by the LLM, clamped to `0.0..1.0`.
+
+## Thresholds
+
+Default thresholds:
+
+```python
+{
+    "HIGH": 0.90,
+    "GOOD": 0.75,
+    "POSSIBLE": 0.60,
+    "LOW": 0.0,
+    "review_below": 0.75,
+    "ambiguity_margin": 0.10,
+    "low_similarity_below": 0.60,
+}
+```
+
+You can override them per call:
+
+```python
+audit = normalize_with_confidence(
+    extracted_value="Kitchen tools",
+    allowed_values=categories,
+    minimum_label="POSSIBLE",
+    thresholds={"GOOD": 0.80, "review_below": 0.70},
+)
+```
+
+## Validation Flags
+
+- `missing_extracted_value`: no usable extracted input was provided.
+- `missing_normalized_value`: `score_normalized_value` received no normalized value.
+- `empty_allowed_values`: `normalize_with_confidence` received no allowed values.
+- `taxonomy_missing_candidate`: normalized value is not in `allowed_values`.
+- `low_similarity_match`: match score is below `low_similarity_below`.
+- `ambiguous_classification`: top candidates are too close based on `ambiguity_margin`.
+- `below_minimum_threshold`: candidate did not reach the requested `minimum_label`.
+
+## Recommended Usage In Spiders
+
+```python
+audit = normalize_with_confidence(
+    extracted_values=[
+        {"value": item.get("category_name"), "weight": 0.7, "source": "llm"},
+        {"value": item.get("breadcrumb_category"), "weight": 0.2, "source": "breadcrumb"},
+        {"value": item.get("title"), "weight": 0.1, "source": "title"},
+    ],
+    allowed_values=self.allowed_categories,
+    field_name="category",
+    minimum_label="GOOD",
+)
+
+item["category_normalized"] = audit["normalized_value"]
+item["category_confidence_audit"] = audit
+```
diff --git a/src/ps_helper/confidence/__init__.py b/src/ps_helper/confidence/__init__.py
new file mode 100644
index 0000000..6ff465d
--- /dev/null
+++ b/src/ps_helper/confidence/__init__.py
@@ -0,0 +1,3 @@
+from .normalized_value import normalize_with_confidence, score_normalized_value
+
+__all__ = ["score_normalized_value", "normalize_with_confidence"]
diff --git a/src/ps_helper/confidence/normalized_value.py b/src/ps_helper/confidence/normalized_value.py
new file mode 100644
index 0000000..b02dff2
--- /dev/null
+++ b/src/ps_helper/confidence/normalized_value.py
@@ -0,0 +1,385 @@
+import re
+from difflib import SequenceMatcher
+from typing import Any, Dict, List, Optional
+
+
+DEFAULT_THRESHOLDS = {  # per-call `thresholds` overrides are merged over these
+    "HIGH": 0.90,  # minimum score labelled HIGH
+    "GOOD": 0.75,  # minimum score labelled GOOD
+    "POSSIBLE": 0.60,  # minimum score labelled POSSIBLE; anything lower is LOW
+    "LOW": 0.0,  # not read by _confidence_label (LOW is its fall-through result)
+    "review_below": 0.75,  # scores under this set requires_review
+    "ambiguity_margin": 0.10,  # top-two score gap at or under this flags ambiguity
+    "low_similarity_below": 0.60,  # scores under this flag low_similarity_match
+}
+
+
+def _normalize_key(value: Any) -> str:
+    if value is None:
+        return ""
+    text = str(value)
+    text = text.replace("\u200b", "").replace("\ufeff", "")
+    text = re.sub(r"[®™©]", "", text)
+    text = re.sub(r"[^a-zA-Z0-9\+\&\-\s]", " ", text)
+    text = re.sub(r"\s+", " ", text).strip().lower()
+    return text
+
+
+def _dedupe_values(values: List[Any]) -> List[str]:
+    seen = set()
+    output = []
+    for value in values or []:
+        text = str(value).strip() if value is not None else ""
+        key = _normalize_key(text)
+        if not key or key in seen:
+            continue
+        seen.add(key)
+        output.append(text)
+    return output
+
+
+def _similarity(left: Any, right: Any) -> float:
+    left_key = _normalize_key(left)
+    right_key = _normalize_key(right)
+    if not left_key or not right_key:
+        return 0.0
+    if left_key == right_key:
+        return 1.0
+    return SequenceMatcher(None, left_key, right_key).ratio()
+
+
+def _confidence_label(score: float, thresholds: Dict[str, float]) -> str:
+    if score >= thresholds["HIGH"]:
+        return "HIGH"
+    if score >= thresholds["GOOD"]:
+        return "GOOD"
+    if score >= thresholds["POSSIBLE"]:
+        return "POSSIBLE"
+    return "LOW"
+
+
+def _rank_candidates(extracted_value: Any, allowed_values: List[str], top_k: int) -> List[Dict[str, Any]]:
+    ranked = [
+        {"value": value, "score": round(_similarity(extracted_value, value), 4)}
+        for value in allowed_values
+    ]
+    ranked.sort(key=lambda item: item["score"], reverse=True)
+    return ranked[:max(0, top_k)]
+
+
+def _label_rank(label: str) -> int:
+    ranks = {"LOW": 0, "POSSIBLE": 1, "GOOD": 2, "HIGH": 3}
+    return ranks.get(str(label or "").upper(), ranks["GOOD"])
+
+
+def _coerce_score(value: Optional[float]) -> Optional[float]:
+    if value is None:
+        return None
+    try:
+        return max(0.0, min(1.0, float(value)))
+    except (TypeError, ValueError):
+        return None
+
+
+def _normalize_extracted_values(
+    extracted_value: Any = None,
+    extracted_values: Optional[List[Any]] = None,
+) -> List[Dict[str, Any]]:
+    raw_values = extracted_values if extracted_values is not None else [extracted_value]
+    signals = []
+
+    for item in raw_values or []:
+        if isinstance(item, dict):
+            value = item.get("value")
+            weight = item.get("weight", 1.0)
+            source = item.get("source")
+        else:
+            value = item
+            weight = 1.0
+            source = None
+
+        key = _normalize_key(value)
+        if not key:
+            continue
+
+        try:
+            numeric_weight = float(weight)
+        except (TypeError, ValueError):
+            numeric_weight = 1.0
+
+        if numeric_weight <= 0:
+            continue
+
+        signals.append(
+            {
+                "value": value,
+                "weight": numeric_weight,
+                "source": source,
+            }
+        )
+
+    total_weight = sum(item["weight"] for item in signals)
+    if total_weight <= 0:
+        return []
+
+    return [
+        {
+            "value": item["value"],
+            "weight": round(item["weight"] / total_weight, 4),
+            "source": item["source"],
+        }
+        for item in signals
+    ]
+
+
+def _weighted_similarity(input_signals: List[Dict[str, Any]], candidate: Any) -> float:
+    return sum(
+        _similarity(signal["value"], candidate) * float(signal["weight"])
+        for signal in input_signals
+    )
+
+
+def _rank_weighted_candidates(
+    input_signals: List[Dict[str, Any]],
+    allowed_values: List[str],
+    top_k: int,
+) -> List[Dict[str, Any]]:
+    ranked = [
+        {"value": value, "score": round(_weighted_similarity(input_signals, value), 4)}
+        for value in allowed_values
+    ]
+    ranked.sort(key=lambda item: item["score"], reverse=True)
+    return ranked[:max(0, top_k)]
+
+
+def normalize_with_confidence(
+    extracted_value: Any = None,
+    allowed_values: Optional[List[Any]] = None,
+    *,
+    extracted_values: Optional[List[Any]] = None,
+    field_name: Optional[str] = None,
+    llm_confidence: Optional[float] = None,
+    minimum_label: str = "GOOD",
+    thresholds: Optional[Dict[str, float]] = None,
+    top_k: int = 3,
+) -> Dict[str, Any]:
+    """Pick the best normalized value from a controlled list and gate it.
+
+    Pass `extracted_value` for the common case, or `extracted_values` with
+    weights/sources when multiple signals should contribute to normalization.
+    """
+    active_thresholds = {**DEFAULT_THRESHOLDS, **(thresholds or {})}
+    allowed = _dedupe_values(allowed_values or [])
+    input_signals = _normalize_extracted_values(extracted_value, extracted_values)
+    llm_score = _coerce_score(llm_confidence)
+    flags = []
+
+    if not input_signals:
+        flags.append("missing_extracted_value")
+    if not allowed:
+        flags.append("empty_allowed_values")
+
+    candidate_ranking = _rank_weighted_candidates(input_signals, allowed, top_k) if input_signals and allowed else []
+    best_candidate = candidate_ranking[0] if candidate_ranking else None
+    match_score = float(best_candidate.get("score") or 0.0) if best_candidate else 0.0
+
+    if llm_score is not None and best_candidate:
+        confidence_score = (match_score * 0.85) + (llm_score * 0.15)
+    else:
+        confidence_score = match_score
+
+    confidence_score = round(max(0.0, min(1.0, confidence_score)), 4)
+    confidence_label = _confidence_label(confidence_score, active_thresholds)
+    minimum_label = str(minimum_label or "GOOD").upper()
+    accepted = bool(best_candidate) and _label_rank(confidence_label) >= _label_rank(minimum_label)
+
+    if best_candidate and confidence_score < active_thresholds["low_similarity_below"]:
+        flags.append("low_similarity_match")
+    if best_candidate and not accepted:
+        flags.append("below_minimum_threshold")
+    if len(candidate_ranking) >= 2:
+        top_score = float(candidate_ranking[0].get("score") or 0.0)
+        second_score = float(candidate_ranking[1].get("score") or 0.0)
+        if top_score - second_score <= active_thresholds["ambiguity_margin"]:
+            flags.append("ambiguous_classification")
+
+    suggested_value = best_candidate.get("value") if best_candidate else None
+    normalized_value = suggested_value if accepted else None
+    if not best_candidate:
+        status = "NO_MATCH"
+    elif accepted:
+        status = "NORMALIZED"
+    else:
+        status = "SUGGESTED"
+
+    requires_review = not accepted or confidence_score < active_thresholds["review_below"]
+    method = "weighted_fuzzy_match" if len(input_signals) > 1 else "fuzzy_match"
+    if best_candidate and len(input_signals) == 1 and _normalize_key(input_signals[0]["value"]) == _normalize_key(suggested_value):
+        method = "exact_match"
+
+    return {
+        "field_name": field_name,
+        "extracted_values": input_signals,
+        "normalized_value": normalized_value,
+        "suggested_value": suggested_value,
+        "accepted": accepted,
+        "confidence_score": confidence_score,
+        "confidence_label": confidence_label,
+        "normalization": {
+            "status": status,
+            "method": method if best_candidate else "no_match",
+            "match_score": round(match_score, 4),
+            "minimum_label": minimum_label,
+        },
+        "validation": {
+            "is_valid": accepted,
+            "flags": sorted(set(flags)),
+            "requires_review": requires_review,
+        },
+        "audit": {
+            "allowed_values_count": len(allowed),
+            "top_candidates": candidate_ranking,
+            "input_signals": input_signals,
+            "signals": {
+                "match_score": round(match_score, 4),
+                "llm_confidence_score": round(llm_score, 4) if llm_score is not None else None,
+            },
+            "thresholds": active_thresholds,
+        },
+    }
+
+
+def score_normalized_value(
+    extracted_value: Any,
+    normalized_value: Any,
+    allowed_values: List[Any],
+    *,
+    field_name: Optional[str] = None,
+    llm_confidence: Optional[float] = None,
+    top_candidates: Optional[List[Dict[str, Any]]] = None,
+    thresholds: Optional[Dict[str, float]] = None,
+    top_k: int = 3,
+) -> Dict[str, Any]:
+    """Score confidence for an extracted value mapped to a normalized value.
+
+    Audits an existing `extracted_value -> normalized_value` mapping against
+    `allowed_values`; it never picks a value and does not claim ground truth.
+    """
+    active_thresholds = {**DEFAULT_THRESHOLDS, **(thresholds or {})}
+    allowed = _dedupe_values(allowed_values)
+    allowed_keys = {_normalize_key(value): value for value in allowed}
+    normalized_key = _normalize_key(normalized_value)
+    extracted_key = _normalize_key(extracted_value)
+    llm_score = _coerce_score(llm_confidence)
+    # Missing inputs are flagged here; those flags force requires_review below.
+    flags = []
+    if not extracted_key:
+        flags.append("missing_extracted_value")
+    if not normalized_key:
+        flags.append("missing_normalized_value")
+    # Membership is tested on normalized keys, not on the raw strings.
+    normalized_in_allowed = bool(normalized_key and normalized_key in allowed_keys)
+    if normalized_key and not normalized_in_allowed:
+        flags.append("taxonomy_missing_candidate")
+    # Caller-supplied candidates are sanitized (value + clamped score), not re-ranked.
+    if top_candidates is None:
+        candidate_ranking = _rank_candidates(extracted_value, allowed, top_k)
+    else:
+        candidate_ranking = [
+            {
+                "value": candidate.get("value"),
+                "score": round(_coerce_score(candidate.get("score")) or 0.0, 4),
+            }
+            for candidate in top_candidates[:max(0, top_k)]
+            if isinstance(candidate, dict)
+        ]
+
+    text_similarity = _similarity(extracted_value, normalized_value)
+    allowed_value_score = 1.0 if normalized_in_allowed else 0.0
+    # "catalog_match": the value is in the taxonomy but has zero text similarity.
+    method = "no_match"
+    if normalized_in_allowed:
+        if extracted_key and extracted_key == normalized_key:
+            method = "exact_match"
+        elif text_similarity > 0:
+            method = "fuzzy_match"
+        else:
+            method = "catalog_match"
+    # Rank score: 1.0 for first place, minus 0.2 per position; 0.0 if not ranked.
+    candidate_rank_score = 0.0
+    normalized_candidate_score = None
+    for index, candidate in enumerate(candidate_ranking):
+        if _normalize_key(candidate.get("value")) == normalized_key:
+            normalized_candidate_score = float(candidate.get("score") or 0.0)
+            candidate_rank_score = max(0.0, 1.0 - (index * 0.2))
+            break
+
+    if normalized_candidate_score is None:
+        normalized_candidate_score = text_similarity if normalized_in_allowed else 0.0
+    # Ambiguity compares the two best-ranked scores, whichever value was chosen.
+    if len(candidate_ranking) >= 2 and normalized_in_allowed:
+        top_score = float(candidate_ranking[0].get("score") or 0.0)
+        second_score = float(candidate_ranking[1].get("score") or 0.0)
+        if top_score - second_score <= active_thresholds["ambiguity_margin"]:
+            flags.append("ambiguous_classification")
+
+    if text_similarity < active_thresholds["low_similarity_below"] and normalized_in_allowed:
+        flags.append("low_similarity_match")
+
+    signals = {
+        "allowed_value_score": allowed_value_score,
+        "text_similarity_score": round(text_similarity, 4),
+        "candidate_rank_score": round(candidate_rank_score, 4),
+        "llm_confidence_score": round(llm_score, 4) if llm_score is not None else None,
+    }
+    # Base weights; without an LLM score the other three are renormalized below.
+    weighted_signals = [
+        (allowed_value_score, 0.30),
+        (text_similarity, 0.35),
+        (candidate_rank_score, 0.20),
+    ]
+    if llm_score is not None:
+        weighted_signals.append((llm_score, 0.15))
+
+    total_weight = sum(weight for _, weight in weighted_signals)
+    confidence_score = (
+        sum(score * weight for score, weight in weighted_signals) / total_weight
+        if total_weight
+        else 0.0
+    )
+    # Invalid mappings are capped at 0.59 (a fixed value; custom thresholds do not move it).
+    if not normalized_in_allowed or not normalized_key or not extracted_key:
+        confidence_score = min(confidence_score, 0.59)
+
+    confidence_score = round(max(0.0, min(1.0, confidence_score)), 4)
+    confidence_label = _confidence_label(confidence_score, active_thresholds)
+    requires_review = confidence_score < active_thresholds["review_below"] or bool(
+        {"taxonomy_missing_candidate", "missing_normalized_value", "missing_extracted_value"} & set(flags)
+    )
+
+    status = "NORMALIZED" if normalized_in_allowed else "NO_MATCH"
+    is_valid = normalized_in_allowed and bool(extracted_key)
+
+    return {
+        "field_name": field_name,
+        "extracted_value": extracted_value,
+        "normalized_value": normalized_value,
+        "confidence_score": confidence_score,
+        "confidence_label": confidence_label,
+        "normalization": {
+            "status": status,
+            "method": method,
+            "match_score": round(normalized_candidate_score, 4),
+        },
+        "validation": {
+            "is_valid": is_valid,
+            "flags": sorted(set(flags)),
+            "requires_review": requires_review,
+        },
+        "audit": {
+            "allowed_values_count": len(allowed),
+            "top_candidates": candidate_ranking,
+            "signals": signals,
+            "thresholds": active_thresholds,
+        },
+    }
diff --git a/tests/test_normalized_value_confidence.py b/tests/test_normalized_value_confidence.py
new file mode 100644
index 0000000..8f813b4
--- /dev/null
+++ b/tests/test_normalized_value_confidence.py
@@ -0,0 +1,165 @@
+import json
+
+from ps_helper.confidence import normalize_with_confidence, score_normalized_value
+
+
+def test_exact_match_returns_high_confidence():
+    result = score_normalized_value(
+        "Serving Trays",
+        "Serving Trays",
+        ["Serving Trays", "Plates"],
+        field_name="category",
+    )
+
+    assert result["field_name"] == "category"
+    assert result["confidence_label"] == "HIGH"
+    assert result["normalization"]["status"] == "NORMALIZED"
+    assert result["normalization"]["method"] == "exact_match"
+    assert result["validation"]["requires_review"] is False
+
+
+def test_fuzzy_match_returns_auditable_score():
+    result = score_normalized_value(
+        "Kitchen tools",
+        "Canvas Tools & Accessories",
+        ["Kitchen Cookware Sets", "Canvas Tools & Accessories", "Serving Trays"],
+    )
+
+    assert result["normalization"]["status"] == "NORMALIZED"
+    assert result["normalization"]["method"] == "fuzzy_match"
+    assert result["confidence_score"] < 0.75
+    assert "low_similarity_match" in result["validation"]["flags"]
+    assert result["audit"]["top_candidates"]
+
+
+def test_normalized_value_outside_allowed_values_returns_no_match():
+    result = score_normalized_value(
+        "Christmas Decorations",
+        "Holiday Decor",
+        ["Serving Trays", "Plates"],
+    )
+
+    assert result["normalization"]["status"] == "NO_MATCH"
+    assert result["validation"]["is_valid"] is False
+    assert "taxonomy_missing_candidate" in result["validation"]["flags"]
+    assert result["validation"]["requires_review"] is True
+
+
+def test_missing_values_are_flagged():
+    result = score_normalized_value(None, None, ["Serving Trays"])
+
+    assert "missing_extracted_value" in result["validation"]["flags"]
+    assert "missing_normalized_value" in result["validation"]["flags"]
+    assert result["normalization"]["status"] == "NO_MATCH"
+    assert result["validation"]["requires_review"] is True
+
+
+def test_llm_confidence_contributes_to_score():
+    without_llm = score_normalized_value(
+        "Serving tray",
+        "Serving Trays",
+        ["Serving Trays", "Plates"],
+    )
+    with_llm = score_normalized_value(
+        "Serving tray",
+        "Serving Trays",
+        ["Serving Trays", "Plates"],
+        llm_confidence=1.0,
+    )
+
+    assert with_llm["confidence_score"] >= without_llm["confidence_score"]
+    assert with_llm["audit"]["signals"]["llm_confidence_score"] == 1.0
+
+
+def test_ambiguous_candidates_are_flagged():
+    result = score_normalized_value(
+        "Panel",
+        "Wall Panel",
+        ["Wall Panel", "Ceiling Panel"],
+        top_candidates=[
+            {"value": "Wall Panel", "score": 0.82},
+            {"value": "Ceiling Panel", "score": 0.78},
+        ],
+    )
+
+    assert "ambiguous_classification" in result["validation"]["flags"]
+
+
+def test_output_is_json_serializable():
+    result = score_normalized_value("Plates", "Plates", ["Plates"])
+
+    json.dumps(result)
+
+
+def test_normalize_with_confidence_accepts_exact_match():
+    result = normalize_with_confidence(
+        extracted_value="Serving Trays",
+        allowed_values=["Serving Trays", "Plates"],
+        minimum_label="GOOD",
+    )
+
+    assert result["accepted"] is True
+    assert result["normalized_value"] == "Serving Trays"
+    assert result["normalization"]["status"] == "NORMALIZED"
+    assert result["normalization"]["method"] == "exact_match"
+
+
+def test_normalize_with_confidence_rejects_below_minimum_label():
+    result = normalize_with_confidence(
+        extracted_value="Kitchen tools",
+        allowed_values=["Kitchen Cookware Sets", "Canvas Tools & Accessories"],
+        minimum_label="GOOD",
+    )
+
+    assert result["accepted"] is False
+    assert result["normalized_value"] is None
+    assert result["suggested_value"] is not None
+    assert result["normalization"]["status"] == "SUGGESTED"
+    assert "below_minimum_threshold" in result["validation"]["flags"]
+
+
+def test_normalize_with_confidence_accepts_possible_when_configured():
+    result = normalize_with_confidence(
+        extracted_value="Kitchen tools",
+        allowed_values=["Kitchen Cookware Sets", "Canvas Tools & Accessories"],
+        minimum_label="POSSIBLE",
+    )
+
+    if result["confidence_label"] == "POSSIBLE":
+        assert result["accepted"] is True
+        assert result["normalized_value"] == result["suggested_value"]
+
+
+def test_normalize_with_confidence_uses_weighted_inputs():
+    result = normalize_with_confidence(
+        extracted_values=[
+            {"value": "Plates", "weight": 1, "source": "llm"},
+            {"value": "Serving Trays", "weight": 9, "source": "breadcrumb"},
+        ],
+        allowed_values=["Plates", "Serving Trays"],
+        minimum_label="GOOD",
+    )
+
+    assert result["normalized_value"] == "Serving Trays"
+    assert result["normalization"]["method"] == "weighted_fuzzy_match"
+    assert result["audit"]["input_signals"][0]["weight"] == 0.1
+    assert result["audit"]["input_signals"][1]["weight"] == 0.9
+
+
+def test_normalize_with_confidence_flags_empty_inputs():
+    result = normalize_with_confidence(extracted_value=None, allowed_values=[])
+
+    assert result["accepted"] is False
+    assert result["normalization"]["status"] == "NO_MATCH"
+    assert "missing_extracted_value" in result["validation"]["flags"]
+    assert "empty_allowed_values" in result["validation"]["flags"]
+
+
+def test_normalize_with_confidence_flags_ambiguous_candidates():
+    result = normalize_with_confidence(
+        extracted_value="A Panel",
+        allowed_values=["B Panel", "C Panel"],
+        minimum_label="LOW",
+    )
+
+    assert "ambiguous_classification" in result["validation"]["flags"]