From 54ae6a349ebcb050d10f5e386dca79c0b6ef0374 Mon Sep 17 00:00:00 2001 From: Brenda Solari Date: Wed, 3 Jun 2026 16:32:09 -0500 Subject: [PATCH] Add confidence helpers for LLM categorization --- README.md | 37 ++ src/ps_helper/confidence/README.md | 229 +++++++++++ src/ps_helper/confidence/__init__.py | 3 + src/ps_helper/confidence/normalized_value.py | 385 +++++++++++++++++++ tests/test_normalized_value_confidence.py | 165 ++++++++ 5 files changed, 819 insertions(+) create mode 100644 src/ps_helper/confidence/README.md create mode 100644 src/ps_helper/confidence/__init__.py create mode 100644 src/ps_helper/confidence/normalized_value.py create mode 100644 tests/test_normalized_value_confidence.py diff --git a/README.md b/README.md index 8e9a608..54d981e 100644 --- a/README.md +++ b/README.md @@ -74,6 +74,43 @@ record_curl_transfer_bytes( With `MetricsExtension`, this is also reflected in the final JSON report under `resources`. +### Normalize Or Score AI Values + +Use the reusable confidence helpers to normalize an LLM value against a +controlled list, or audit an already-normalized value. 
+ +Normalize when you do not have a normalized value yet: + +```python +from ps_helper.confidence import normalize_with_confidence + +category_audit = normalize_with_confidence( + extracted_value="Kitchen tools", + allowed_values=["Kitchen Cookware Sets", "Canvas Tools & Accessories"], + field_name="category", + minimum_label="GOOD", +) + +category_normalized = category_audit["normalized_value"] +``` + +Score when another system already chose the normalized value: + +```python +from ps_helper.confidence import score_normalized_value + +category_audit = score_normalized_value( + extracted_value="Kitchen tools", + normalized_value="Canvas Tools & Accessories", + allowed_values=["Kitchen Cookware Sets", "Canvas Tools & Accessories"], + field_name="category", + llm_confidence=0.82, +) +``` + +The output includes `confidence_score`, `confidence_label`, normalization +status, validation flags, and audit metadata for candidate ranking. + For automatic tracking in every curl request, use `TrackedCurlSession`: ```python diff --git a/src/ps_helper/confidence/README.md b/src/ps_helper/confidence/README.md new file mode 100644 index 0000000..175c701 --- /dev/null +++ b/src/ps_helper/confidence/README.md @@ -0,0 +1,229 @@ +# Confidence Helpers + +Helpers for AI-assisted normalization workflows where an extracted value needs +to be compared with a controlled list of allowed values. + +This package does not prove ground-truth accuracy. It estimates normalization +confidence from observable signals such as text similarity, candidate ranking, +allowed-value membership, and optional LLM confidence. + +## Available Helpers + +Use `normalize_with_confidence` when you do not have a normalized value yet. +It chooses the best allowed value and only accepts it when it reaches your +configured strictness level. + +Use `score_normalized_value` when another system already selected a normalized +value and you only want to audit that mapping. 
+ +```python +from ps_helper.confidence import normalize_with_confidence, score_normalized_value +``` + +## Mode 1: Normalize With Confidence + +Use this mode for this workflow: + +```text +LLM extracted value + allowed values -> normalized value or suggestion +``` + +### Single Input + +```python +audit = normalize_with_confidence( + extracted_value="Kitchen tools", + allowed_values=[ + "Kitchen Cookware Sets", + "Canvas Tools & Accessories", + "Serving Trays", + ], + field_name="category", + minimum_label="GOOD", +) + +category_normalized = audit["normalized_value"] +``` + +If the best candidate reaches `minimum_label`, `normalized_value` is populated. +If it does not, `normalized_value` is `None` and `suggested_value` contains the +best candidate for review. + +### Multiple Weighted Inputs + +Use `extracted_values` when you have multiple signals for the same field. + +```python +audit = normalize_with_confidence( + extracted_values=[ + {"value": "Kitchen tools", "weight": 0.7, "source": "llm"}, + {"value": "Kitchen accessories", "weight": 0.2, "source": "breadcrumb"}, + {"value": "Citrus squeezer", "weight": 0.1, "source": "title"}, + ], + allowed_values=categories, + field_name="category", + minimum_label="GOOD", +) +``` + +Weights are normalized internally. For example, weights `7`, `2`, and `1` become +`0.7`, `0.2`, and `0.1`. + +Candidate score formula: + +```text +candidate_score = + similarity(input_1, candidate) * weight_1 ++ similarity(input_2, candidate) * weight_2 ++ similarity(input_3, candidate) * weight_3 +``` + +The highest-scoring candidate becomes `suggested_value`. 
+ +### Normalize Output Shape + +```json +{ + "field_name": "category", + "extracted_values": [ + {"value": "Kitchen tools", "weight": 1.0, "source": null} + ], + "normalized_value": null, + "suggested_value": "Canvas Tools & Accessories", + "accepted": false, + "confidence_score": 0.64, + "confidence_label": "POSSIBLE", + "normalization": { + "status": "SUGGESTED", + "method": "fuzzy_match", + "match_score": 0.64, + "minimum_label": "GOOD" + }, + "validation": { + "is_valid": false, + "flags": ["below_minimum_threshold"], + "requires_review": true + }, + "audit": { + "allowed_values_count": 120, + "top_candidates": [ + {"value": "Canvas Tools & Accessories", "score": 0.64} + ], + "input_signals": [], + "signals": { + "match_score": 0.64, + "llm_confidence_score": null + }, + "thresholds": {} + } +} +``` + +### Normalize Statuses + +- `NORMALIZED`: best candidate passed `minimum_label` and was accepted. +- `SUGGESTED`: best candidate exists, but did not pass `minimum_label`. +- `NO_MATCH`: no usable input or no allowed values were provided. + +## Mode 2: Score Existing Normalized Value + +Use this mode for this workflow: + +```text +LLM extracted value + existing normalized value + allowed values -> audit score +``` + +```python +audit = score_normalized_value( + extracted_value="Kitchen tools", + normalized_value="Canvas Tools & Accessories", + allowed_values=[ + "Kitchen Cookware Sets", + "Canvas Tools & Accessories", + "Serving Trays", + ], + field_name="category", + llm_confidence=0.82, +) +``` + +This mode does not choose a normalized value. It evaluates the mapping that was +already chosen by another system. 
+ +### Score Formula + +When `llm_confidence` is provided: + +```text +confidence_score = + allowed_value_score * 0.30 ++ text_similarity_score * 0.35 ++ candidate_rank_score * 0.20 ++ llm_confidence_score * 0.15 +``` + +When `llm_confidence` is not provided, the available weights are normalized by +their total weight so the missing LLM score does not automatically penalize the +result. + +### Score Signals + +- `allowed_value_score`: `1.0` when `normalized_value` exists in `allowed_values`, otherwise `0.0`. +- `text_similarity_score`: textual similarity between `extracted_value` and `normalized_value`. +- `candidate_rank_score`: rank quality of `normalized_value` among candidate matches. +- `llm_confidence_score`: optional confidence reported by the LLM, clamped to `0.0..1.0`. + +## Thresholds + +Default thresholds: + +```python +{ + "HIGH": 0.90, + "GOOD": 0.75, + "POSSIBLE": 0.60, + "LOW": 0.0, + "review_below": 0.75, + "ambiguity_margin": 0.10, + "low_similarity_below": 0.60, +} +``` + +You can override them per call: + +```python +audit = normalize_with_confidence( + extracted_value="Kitchen tools", + allowed_values=categories, + minimum_label="POSSIBLE", + thresholds={"GOOD": 0.80, "review_below": 0.70}, +) +``` + +## Validation Flags + +- `missing_extracted_value`: no usable extracted input was provided. +- `missing_normalized_value`: `score_normalized_value` received no normalized value. +- `empty_allowed_values`: `normalize_with_confidence` received no allowed values. +- `taxonomy_missing_candidate`: normalized value is not in `allowed_values`. +- `low_similarity_match`: match score is below `low_similarity_below`. +- `ambiguous_classification`: top candidates are too close based on `ambiguity_margin`. +- `below_minimum_threshold`: candidate did not reach the requested `minimum_label`. 
"""Confidence helpers for normalizing LLM-extracted values against a controlled list.

Two entry points are provided:

* ``normalize_with_confidence`` picks the best allowed value for one or more
  extracted inputs and only accepts it when it reaches ``minimum_label``.
* ``score_normalized_value`` audits a mapping that another system already chose.

The scores are estimates built from observable signals (text similarity,
candidate ranking, allowed-value membership, optional LLM confidence); they do
not prove ground-truth accuracy.

Typical spider usage::

    audit = normalize_with_confidence(
        extracted_values=[
            {"value": item.get("category_name"), "weight": 0.7, "source": "llm"},
            {"value": item.get("breadcrumb_category"), "weight": 0.2, "source": "breadcrumb"},
            {"value": item.get("title"), "weight": 0.1, "source": "title"},
        ],
        allowed_values=self.allowed_categories,
        field_name="category",
        minimum_label="GOOD",
    )
    item["category_normalized"] = audit["normalized_value"]
    item["category_confidence_audit"] = audit
"""
import math
import re
import unicodedata
from difflib import SequenceMatcher
from typing import Any, Dict, List, Optional


DEFAULT_THRESHOLDS = {
    "HIGH": 0.90,
    "GOOD": 0.75,
    "POSSIBLE": 0.60,
    "LOW": 0.0,
    "review_below": 0.75,
    "ambiguity_margin": 0.10,
    "low_similarity_below": 0.60,
}

# Upper bound for the score of a mapping that cannot be valid (value missing or
# outside the allowed list). It is lowered further when a caller configures a
# "POSSIBLE" threshold below it, so such mappings are always labelled "LOW".
_INVALID_MAPPING_SCORE_CAP = 0.59

# Zero-width space, BOM and (R)/(TM)/(C) marks are removed without leaving a gap.
_REMOVED_CHARS_RE = re.compile("[\u200b\ufeff\u00ae\u2122\u00a9]")
# Everything that is not a letter/digit (in any script), "+", "&", "-" or
# whitespace becomes a separator. "_" is matched by \w, so it is listed explicitly.
_SEPARATOR_RE = re.compile(r"[^\w+&\-\s]|_")
_WHITESPACE_RE = re.compile(r"\s+")


def _normalize_key(value: Any) -> str:
    """Return the comparison key for *value* (``""`` for ``None`` or blank text).

    The key is lowercase, whitespace-collapsed text with punctuation replaced
    by spaces. Letters and digits of every script are kept, so non-ASCII values
    such as ``"Décor"`` or CJK category names stay comparable instead of being
    reduced to an empty key.
    """
    if value is None:
        return ""
    text = _REMOVED_CHARS_RE.sub("", str(value))
    # Compose combining sequences so "e" + U+0301 and "é" yield the same key.
    text = unicodedata.normalize("NFC", text)
    text = _SEPARATOR_RE.sub(" ", text)
    return _WHITESPACE_RE.sub(" ", text).strip().lower()


def _dedupe_values(values: List[Any]) -> List[str]:
    """Return *values* as stripped strings, dropping blanks and key-duplicates.

    The first spelling of each normalized key wins and input order is kept.
    """
    seen = set()
    output = []
    for value in values or []:
        text = str(value).strip() if value is not None else ""
        key = _normalize_key(text)
        if not key or key in seen:
            continue
        seen.add(key)
        output.append(text)
    return output


def _similarity(left: Any, right: Any) -> float:
    """Return the 0..1 similarity of two values based on their normalized keys."""
    left_key = _normalize_key(left)
    right_key = _normalize_key(right)
    if not left_key or not right_key:
        return 0.0
    if left_key == right_key:
        return 1.0
    return SequenceMatcher(None, left_key, right_key).ratio()


def _confidence_label(score: float, thresholds: Dict[str, float]) -> str:
    """Map *score* to ``HIGH``/``GOOD``/``POSSIBLE``/``LOW`` using *thresholds*."""
    for label in ("HIGH", "GOOD", "POSSIBLE"):
        if score >= thresholds[label]:
            return label
    return "LOW"


def _rank_candidates(extracted_value: Any, allowed_values: List[str], top_k: int) -> List[Dict[str, Any]]:
    """Return the *top_k* allowed values most similar to *extracted_value*.

    Each entry is ``{"value": ..., "score": ...}`` with the score rounded to
    four decimals; ties keep the order of *allowed_values* (stable sort).
    """
    ranked = [
        {"value": value, "score": round(_similarity(extracted_value, value), 4)}
        for value in allowed_values
    ]
    ranked.sort(key=lambda item: item["score"], reverse=True)
    return ranked[:max(0, top_k)]


def _label_rank(label: str) -> int:
    """Return the ordinal of a confidence label; unknown labels rank as ``GOOD``."""
    ranks = {"LOW": 0, "POSSIBLE": 1, "GOOD": 2, "HIGH": 3}
    return ranks.get(str(label or "").upper(), ranks["GOOD"])


def _coerce_score(value: Optional[float]) -> Optional[float]:
    """Clamp *value* to ``0.0..1.0``; return ``None`` when it is not a usable number.

    NaN is rejected explicitly: ``min(1.0, nan)`` evaluates to ``1.0``, so a
    plain clamp would report a NaN confidence as perfect confidence.
    """
    if value is None:
        return None
    try:
        score = float(value)
    except (TypeError, ValueError, OverflowError):
        return None
    if math.isnan(score):
        return None
    return max(0.0, min(1.0, score))


def _normalize_extracted_values(
    extracted_value: Any = None,
    extracted_values: Optional[List[Any]] = None,
) -> List[Dict[str, Any]]:
    """Build the list of weighted input signals used for candidate ranking.

    Args:
        extracted_value: Single raw value, used only when *extracted_values*
            is ``None``.
        extracted_values: Raw values or ``{"value", "weight", "source"}``
            dicts. A bare string or dict is treated as a one-item list.

    Returns:
        ``{"value", "weight", "source"}`` dicts whose weights are normalized to
        sum to one (rounded to four decimals). Blank values and non-positive
        weights are dropped; unparseable or non-finite weights count as ``1.0``.
    """
    if extracted_values is None:
        raw_values = [extracted_value]
    elif isinstance(extracted_values, (str, dict)):
        # Iterating a bare string would yield one signal per character.
        raw_values = [extracted_values]
    else:
        raw_values = extracted_values

    signals = []
    for item in raw_values or []:
        if isinstance(item, dict):
            value = item.get("value")
            weight = item.get("weight", 1.0)
            source = item.get("source")
        else:
            value, weight, source = item, 1.0, None

        if not _normalize_key(value):
            continue

        try:
            numeric_weight = float(weight)
        except (TypeError, ValueError, OverflowError):
            numeric_weight = 1.0
        # NaN/inf would turn every normalized weight into NaN.
        if not math.isfinite(numeric_weight):
            numeric_weight = 1.0
        if numeric_weight <= 0:
            continue

        signals.append({"value": value, "weight": numeric_weight, "source": source})

    total_weight = sum(item["weight"] for item in signals)
    if total_weight <= 0:
        return []

    return [
        {
            "value": item["value"],
            "weight": round(item["weight"] / total_weight, 4),
            "source": item["source"],
        }
        for item in signals
    ]


def _weighted_similarity(input_signals: List[Dict[str, Any]], candidate: Any) -> float:
    """Return the weight-averaged similarity of all input signals to *candidate*.

    The sum is divided by the total weight because the stored weights are
    rounded (three equal inputs carry ``0.3333`` each); without the division a
    perfect match on every input would score ``0.9999`` instead of ``1.0``.
    """
    total_weight = sum(float(signal["weight"]) for signal in input_signals)
    if total_weight <= 0:
        return 0.0
    weighted = sum(
        _similarity(signal["value"], candidate) * float(signal["weight"])
        for signal in input_signals
    )
    return weighted / total_weight


def _rank_weighted_candidates(
    input_signals: List[Dict[str, Any]],
    allowed_values: List[str],
    top_k: int,
) -> List[Dict[str, Any]]:
    """Return the *top_k* allowed values ranked by weighted similarity."""
    ranked = [
        {"value": value, "score": round(_weighted_similarity(input_signals, value), 4)}
        for value in allowed_values
    ]
    ranked.sort(key=lambda item: item["score"], reverse=True)
    return ranked[:max(0, top_k)]


def normalize_with_confidence(
    extracted_value: Any = None,
    allowed_values: Optional[List[Any]] = None,
    *,
    extracted_values: Optional[List[Any]] = None,
    field_name: Optional[str] = None,
    llm_confidence: Optional[float] = None,
    minimum_label: str = "GOOD",
    thresholds: Optional[Dict[str, float]] = None,
    top_k: int = 3,
) -> Dict[str, Any]:
    """Pick the best normalized value from a controlled list and gate it.

    Pass `extracted_value` for the common case, or `extracted_values` with
    weights/sources when multiple signals should contribute to normalization.

    Args:
        extracted_value: Raw value to normalize (ignored when
            `extracted_values` is given).
        allowed_values: Controlled list to choose from.
        extracted_values: Values or ``{"value", "weight", "source"}`` dicts.
        field_name: Copied into the result for auditing.
        llm_confidence: Optional 0..1 confidence; blended in at 15% weight.
        minimum_label: Lowest label (``LOW``/``POSSIBLE``/``GOOD``/``HIGH``)
            at which the best candidate is accepted.
        thresholds: Overrides merged over ``DEFAULT_THRESHOLDS``.
        top_k: Number of candidates reported under ``audit.top_candidates``.
            It only limits the audit output; selection and ambiguity detection
            always consider every allowed value.

    Returns:
        A JSON-serializable dict with ``normalized_value`` (``None`` unless
        accepted), ``suggested_value``, ``accepted``, ``confidence_score``,
        ``confidence_label`` and ``normalization``/``validation``/``audit``
        sections.
    """
    active_thresholds = {**DEFAULT_THRESHOLDS, **(thresholds or {})}
    allowed = _dedupe_values(allowed_values or [])
    input_signals = _normalize_extracted_values(extracted_value, extracted_values)
    llm_score = _coerce_score(llm_confidence)
    flags = []

    if not input_signals:
        flags.append("missing_extracted_value")
    if not allowed:
        flags.append("empty_allowed_values")

    # Rank the whole list first: truncating to top_k before picking the winner
    # would make top_k=0 report NO_MATCH and top_k=1 hide ambiguous runners-up.
    full_ranking = (
        _rank_weighted_candidates(input_signals, allowed, len(allowed))
        if input_signals and allowed
        else []
    )
    candidate_ranking = full_ranking[:max(0, top_k)]
    best_candidate = full_ranking[0] if full_ranking else None
    match_score = float(best_candidate.get("score") or 0.0) if best_candidate else 0.0

    if llm_score is not None and best_candidate:
        confidence_score = (match_score * 0.85) + (llm_score * 0.15)
    else:
        confidence_score = match_score

    confidence_score = round(max(0.0, min(1.0, confidence_score)), 4)
    confidence_label = _confidence_label(confidence_score, active_thresholds)
    minimum_label = str(minimum_label or "GOOD").upper()
    accepted = bool(best_candidate) and _label_rank(confidence_label) >= _label_rank(minimum_label)

    # The flag describes the textual match itself, so it is based on the match
    # score and cannot be masked by a high LLM confidence.
    if best_candidate and match_score < active_thresholds["low_similarity_below"]:
        flags.append("low_similarity_match")
    if best_candidate and not accepted:
        flags.append("below_minimum_threshold")
    if len(full_ranking) >= 2:
        top_score = float(full_ranking[0].get("score") or 0.0)
        second_score = float(full_ranking[1].get("score") or 0.0)
        if top_score - second_score <= active_thresholds["ambiguity_margin"]:
            flags.append("ambiguous_classification")

    suggested_value = best_candidate.get("value") if best_candidate else None
    normalized_value = suggested_value if accepted else None
    if not best_candidate:
        status = "NO_MATCH"
    elif accepted:
        status = "NORMALIZED"
    else:
        status = "SUGGESTED"

    requires_review = not accepted or confidence_score < active_thresholds["review_below"]
    method = "weighted_fuzzy_match" if len(input_signals) > 1 else "fuzzy_match"
    if (
        best_candidate
        and len(input_signals) == 1
        and _normalize_key(input_signals[0]["value"]) == _normalize_key(suggested_value)
    ):
        method = "exact_match"

    return {
        "field_name": field_name,
        "extracted_values": input_signals,
        "normalized_value": normalized_value,
        "suggested_value": suggested_value,
        "accepted": accepted,
        "confidence_score": confidence_score,
        "confidence_label": confidence_label,
        "normalization": {
            "status": status,
            "method": method if best_candidate else "no_match",
            "match_score": round(match_score, 4),
            "minimum_label": minimum_label,
        },
        "validation": {
            "is_valid": accepted,
            "flags": sorted(set(flags)),
            "requires_review": requires_review,
        },
        "audit": {
            "allowed_values_count": len(allowed),
            "top_candidates": candidate_ranking,
            "input_signals": input_signals,
            "signals": {
                "match_score": round(match_score, 4),
                "llm_confidence_score": round(llm_score, 4) if llm_score is not None else None,
            },
            "thresholds": active_thresholds,
        },
    }


def score_normalized_value(
    extracted_value: Any,
    normalized_value: Any,
    allowed_values: List[Any],
    *,
    field_name: Optional[str] = None,
    llm_confidence: Optional[float] = None,
    top_candidates: Optional[List[Dict[str, Any]]] = None,
    thresholds: Optional[Dict[str, float]] = None,
    top_k: int = 3,
) -> Dict[str, Any]:
    """Score confidence for an extracted value mapped to a normalized value.

    This measures the observable quality of `extracted_value -> normalized_value`
    against a controlled list. It does not claim ground-truth accuracy.

    Args:
        extracted_value: Raw value produced by the extractor.
        normalized_value: Value another system already selected.
        allowed_values: Controlled list the normalized value must belong to.
        field_name: Copied into the result for auditing.
        llm_confidence: Optional 0..1 confidence (15% weight when present;
            otherwise the remaining weights are renormalized).
        top_candidates: Optional precomputed ``{"value", "score"}`` ranking;
            when omitted the ranking is computed from `allowed_values`.
        thresholds: Overrides merged over ``DEFAULT_THRESHOLDS``.
        top_k: Maximum number of candidates considered for the rank signal.

    Returns:
        A JSON-serializable dict with ``confidence_score``,
        ``confidence_label`` and ``normalization``/``validation``/``audit``
        sections.
    """
    active_thresholds = {**DEFAULT_THRESHOLDS, **(thresholds or {})}
    allowed = _dedupe_values(allowed_values)
    allowed_keys = {_normalize_key(value) for value in allowed}
    normalized_key = _normalize_key(normalized_value)
    extracted_key = _normalize_key(extracted_value)
    llm_score = _coerce_score(llm_confidence)

    flags = []
    if not extracted_key:
        flags.append("missing_extracted_value")
    if not normalized_key:
        flags.append("missing_normalized_value")

    normalized_in_allowed = bool(normalized_key and normalized_key in allowed_keys)
    if normalized_key and not normalized_in_allowed:
        flags.append("taxonomy_missing_candidate")

    if top_candidates is None:
        candidate_ranking = _rank_candidates(extracted_value, allowed, top_k)
    else:
        candidate_ranking = [
            {
                "value": candidate.get("value"),
                "score": round(_coerce_score(candidate.get("score")) or 0.0, 4),
            }
            for candidate in top_candidates[:max(0, top_k)]
            if isinstance(candidate, dict)
        ]

    text_similarity = _similarity(extracted_value, normalized_value)
    allowed_value_score = 1.0 if normalized_in_allowed else 0.0

    method = "no_match"
    if normalized_in_allowed:
        if extracted_key and extracted_key == normalized_key:
            method = "exact_match"
        elif text_similarity > 0:
            method = "fuzzy_match"
        else:
            method = "catalog_match"

    # Rank signal: 1.0 for the top candidate, minus 0.2 per position below it;
    # it stays 0.0 when the normalized value is not among the ranked candidates.
    candidate_rank_score = 0.0
    normalized_candidate_score = None
    for index, candidate in enumerate(candidate_ranking):
        if _normalize_key(candidate.get("value")) == normalized_key:
            normalized_candidate_score = float(candidate.get("score") or 0.0)
            candidate_rank_score = max(0.0, 1.0 - (index * 0.2))
            break

    if normalized_candidate_score is None:
        normalized_candidate_score = text_similarity if normalized_in_allowed else 0.0

    if len(candidate_ranking) >= 2 and normalized_in_allowed:
        top_score = float(candidate_ranking[0].get("score") or 0.0)
        second_score = float(candidate_ranking[1].get("score") or 0.0)
        if top_score - second_score <= active_thresholds["ambiguity_margin"]:
            flags.append("ambiguous_classification")

    if text_similarity < active_thresholds["low_similarity_below"] and normalized_in_allowed:
        flags.append("low_similarity_match")

    signals = {
        "allowed_value_score": allowed_value_score,
        "text_similarity_score": round(text_similarity, 4),
        "candidate_rank_score": round(candidate_rank_score, 4),
        "llm_confidence_score": round(llm_score, 4) if llm_score is not None else None,
    }

    weighted_signals = [
        (allowed_value_score, 0.30),
        (text_similarity, 0.35),
        (candidate_rank_score, 0.20),
    ]
    if llm_score is not None:
        weighted_signals.append((llm_score, 0.15))

    # Dividing by the weights actually used keeps a missing LLM score from
    # automatically lowering the result.
    total_weight = sum(weight for _, weight in weighted_signals)
    confidence_score = (
        sum(score * weight for score, weight in weighted_signals) / total_weight
        if total_weight
        else 0.0
    )

    # normalized_in_allowed is already False when the normalized value is missing.
    if not normalized_in_allowed or not extracted_key:
        # Keep invalid mappings below "POSSIBLE" even with custom thresholds.
        score_cap = min(
            _INVALID_MAPPING_SCORE_CAP,
            max(0.0, active_thresholds["POSSIBLE"] - 0.01),
        )
        confidence_score = min(confidence_score, score_cap)

    confidence_score = round(max(0.0, min(1.0, confidence_score)), 4)
    confidence_label = _confidence_label(confidence_score, active_thresholds)
    blocking_flags = {
        "taxonomy_missing_candidate",
        "missing_normalized_value",
        "missing_extracted_value",
    }
    requires_review = (
        confidence_score < active_thresholds["review_below"]
        or bool(blocking_flags & set(flags))
    )

    status = "NORMALIZED" if normalized_in_allowed else "NO_MATCH"
    is_valid = normalized_in_allowed and bool(extracted_key)

    return {
        "field_name": field_name,
        "extracted_value": extracted_value,
        "normalized_value": normalized_value,
        "confidence_score": confidence_score,
        "confidence_label": confidence_label,
        "normalization": {
            "status": status,
            "method": method,
            "match_score": round(normalized_candidate_score, 4),
        },
        "validation": {
            "is_valid": is_valid,
            "flags": sorted(set(flags)),
            "requires_review": requires_review,
        },
        "audit": {
            "allowed_values_count": len(allowed),
            "top_candidates": candidate_ranking,
            "signals": signals,
            "thresholds": active_thresholds,
        },
    }
"""Tests for the ``ps_helper.confidence`` normalization and scoring helpers."""
import json

from ps_helper.confidence import normalize_with_confidence, score_normalized_value


def test_exact_match_returns_high_confidence():
    """An identical extracted/normalized pair scores HIGH without review."""
    result = score_normalized_value(
        "Serving Trays",
        "Serving Trays",
        ["Serving Trays", "Plates"],
        field_name="category",
    )

    assert result["field_name"] == "category"
    assert result["confidence_label"] == "HIGH"
    assert result["normalization"]["status"] == "NORMALIZED"
    assert result["normalization"]["method"] == "exact_match"
    assert result["validation"]["requires_review"] is False


def test_fuzzy_match_returns_auditable_score():
    """A weak textual match is still scored, flagged and given candidates."""
    result = score_normalized_value(
        "Kitchen tools",
        "Canvas Tools & Accessories",
        ["Kitchen Cookware Sets", "Canvas Tools & Accessories", "Serving Trays"],
    )

    assert result["normalization"]["status"] == "NORMALIZED"
    assert result["normalization"]["method"] == "fuzzy_match"
    assert result["confidence_score"] < 0.75
    assert "low_similarity_match" in result["validation"]["flags"]
    assert result["audit"]["top_candidates"]


def test_normalized_value_outside_allowed_values_returns_no_match():
    """A normalized value missing from the taxonomy is invalid."""
    result = score_normalized_value(
        "Christmas Decorations",
        "Holiday Decor",
        ["Serving Trays", "Plates"],
    )

    assert result["normalization"]["status"] == "NO_MATCH"
    assert result["validation"]["is_valid"] is False
    assert "taxonomy_missing_candidate" in result["validation"]["flags"]
    assert result["validation"]["requires_review"] is True


def test_missing_values_are_flagged():
    """Missing extracted and normalized values each raise their own flag."""
    result = score_normalized_value(None, None, ["Serving Trays"])

    assert "missing_extracted_value" in result["validation"]["flags"]
    assert "missing_normalized_value" in result["validation"]["flags"]
    assert result["normalization"]["status"] == "NO_MATCH"
    assert result["validation"]["requires_review"] is True


def test_llm_confidence_contributes_to_score():
    """A perfect LLM confidence raises the score of a near-exact match."""
    without_llm = score_normalized_value(
        "Serving tray",
        "Serving Trays",
        ["Serving Trays", "Plates"],
    )
    with_llm = score_normalized_value(
        "Serving tray",
        "Serving Trays",
        ["Serving Trays", "Plates"],
        llm_confidence=1.0,
    )

    # Strict comparison: ">=" would also pass if the LLM signal were ignored.
    assert with_llm["confidence_score"] > without_llm["confidence_score"]
    assert with_llm["audit"]["signals"]["llm_confidence_score"] == 1.0


def test_ambiguous_candidates_are_flagged():
    """Supplied candidates within the ambiguity margin are flagged."""
    result = score_normalized_value(
        "Panel",
        "Wall Panel",
        ["Wall Panel", "Ceiling Panel"],
        top_candidates=[
            {"value": "Wall Panel", "score": 0.82},
            {"value": "Ceiling Panel", "score": 0.78},
        ],
    )

    assert "ambiguous_classification" in result["validation"]["flags"]


def test_output_is_json_serializable():
    """The audit payload can be dumped to JSON as-is."""
    result = score_normalized_value("Plates", "Plates", ["Plates"])

    json.dumps(result)


def test_normalize_with_confidence_accepts_exact_match():
    """An exact match is accepted and reported as such."""
    result = normalize_with_confidence(
        extracted_value="Serving Trays",
        allowed_values=["Serving Trays", "Plates"],
        minimum_label="GOOD",
    )

    assert result["accepted"] is True
    assert result["normalized_value"] == "Serving Trays"
    assert result["normalization"]["status"] == "NORMALIZED"
    assert result["normalization"]["method"] == "exact_match"


def test_normalize_with_confidence_rejects_below_minimum_label():
    """A candidate below ``minimum_label`` is only suggested."""
    result = normalize_with_confidence(
        extracted_value="Kitchen tools",
        allowed_values=["Kitchen Cookware Sets", "Canvas Tools & Accessories"],
        minimum_label="GOOD",
    )

    assert result["accepted"] is False
    assert result["normalized_value"] is None
    assert result["suggested_value"] is not None
    assert result["normalization"]["status"] == "SUGGESTED"
    assert "below_minimum_threshold" in result["validation"]["flags"]


def test_normalize_with_confidence_accepts_possible_when_configured():
    """Lowering ``minimum_label`` to POSSIBLE accepts the same candidate."""
    result = normalize_with_confidence(
        extracted_value="Kitchen tools",
        allowed_values=["Kitchen Cookware Sets", "Canvas Tools & Accessories"],
        minimum_label="POSSIBLE",
    )

    # Asserted unconditionally: guarding these checks with
    # `if label == "POSSIBLE"` would let the test pass without checking anything.
    assert result["confidence_label"] == "POSSIBLE"
    assert result["accepted"] is True
    assert result["normalized_value"] == result["suggested_value"]
    assert result["normalization"]["status"] == "NORMALIZED"


def test_normalize_with_confidence_uses_weighted_inputs():
    """The heavier input decides the winner and weights are normalized."""
    result = normalize_with_confidence(
        extracted_values=[
            {"value": "Plates", "weight": 1, "source": "llm"},
            {"value": "Serving Trays", "weight": 9, "source": "breadcrumb"},
        ],
        allowed_values=["Plates", "Serving Trays"],
        minimum_label="GOOD",
    )

    assert result["normalized_value"] == "Serving Trays"
    assert result["normalization"]["method"] == "weighted_fuzzy_match"
    assert result["audit"]["input_signals"][0]["weight"] == 0.1
    assert result["audit"]["input_signals"][1]["weight"] == 0.9


def test_normalize_with_confidence_flags_empty_inputs():
    """Missing input and an empty taxonomy are both reported."""
    result = normalize_with_confidence(extracted_value=None, allowed_values=[])

    assert result["accepted"] is False
    assert result["normalization"]["status"] == "NO_MATCH"
    assert "missing_extracted_value" in result["validation"]["flags"]
    assert "empty_allowed_values" in result["validation"]["flags"]


def test_normalize_with_confidence_flags_ambiguous_candidates():
    """Equally similar candidates are flagged as ambiguous."""
    result = normalize_with_confidence(
        extracted_value="A Panel",
        allowed_values=["B Panel", "C Panel"],
        minimum_label="LOW",
    )

    assert "ambiguous_classification" in result["validation"]["flags"]