microsoft · romanlutz · Jul 2, 2026 · Jul 2, 2026 · Jul 3, 2026
diff --git a/pyrit/score/__init__.py b/pyrit/score/__init__.py
@@ -25,6 +25,7 @@
 from pyrit.score.float_scale.self_ask_general_float_scale_scorer import SelfAskGeneralFloatScaleScorer
 from pyrit.score.float_scale.self_ask_likert_scorer import LikertScaleEvalFiles, LikertScalePaths, SelfAskLikertScorer
 from pyrit.score.float_scale.self_ask_scale_scorer import SelfAskScaleScorer
+from pyrit.score.response_handler import JsonSchemaResponseHandler, ResponseHandler
 from pyrit.score.scorer import Scorer
 from pyrit.score.scorer_evaluation.metrics_type import MetricsType, RegistryUpdateBehavior
 from pyrit.score.scorer_evaluation.scorer_metrics import (
@@ -143,6 +144,7 @@ def __getattr__(name: str) -> object:
     "HumanLabeledDataset",
     "HumanLabeledEntry",
     "InsecureCodeScorer",
+    "JsonSchemaResponseHandler",
     "LikertScaleEvalFiles",
     "LikertScalePaths",
     "MarkdownInjectionScorer",
@@ -159,6 +161,7 @@ def __getattr__(name: str) -> object:
     "QuestionAnswerScorer",
     "RegexScorer",
     "RegistryUpdateBehavior",
+    "ResponseHandler",
     "Scorer",
     "ScorerEvalDatasetFiles",
     "ScorerEvaluator",

diff --git a/pyrit/score/float_scale/float_scale_scorer.py b/pyrit/score/float_scale/float_scale_scorer.py
@@ -3,21 +3,15 @@
 
 from __future__ import annotations
 
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, ClassVar
 
-from pyrit.exceptions.exception_classes import InvalidJsonException
 from pyrit.models import (
-    JsonSchemaDefinition,
     Message,
-    PromptDataType,
     Score,
-    UnvalidatedScore,
 )
 from pyrit.score.scorer import Scorer
 
 if TYPE_CHECKING:
-    from uuid import UUID
-
     from pyrit.prompt_target.common.prompt_target import PromptTarget
     from pyrit.score.scorer_evaluation.scorer_metrics import HarmScorerMetrics
     from pyrit.score.scorer_prompt_validator import ScorerPromptValidator
@@ -44,6 +38,10 @@ class FloatScaleScorer(Scorer):
     "blocked = True") should override ``_score_piece_async`` or ``_build_fallback_score``.
     """
 
+    # Marks scores produced by this scorer as numeric so the shared LLM round-trip validates that
+    # the returned score value is parsable as a float. Float-scale scorers require this.
+    _score_value_is_numeric: ClassVar[bool] = True
+
     def __init__(self, *, validator: ScorerPromptValidator, chat_target: PromptTarget | None = None) -> None:
         """
         Initialize the FloatScaleScorer.
@@ -138,50 +136,3 @@ def get_scorer_metrics(self) -> HarmScorerMetrics | None:
             eval_hash=eval_hash,
             harm_category=self.evaluation_file_mapping.harm_category,
         )
-
-    async def _score_value_with_llm_async(
-        self,
-        *,
-        prompt_target: PromptTarget,
-        system_prompt: str,
-        message_value: str,
-        message_data_type: PromptDataType,
-        scored_prompt_id: str | UUID,
-        prepended_text_message_piece: str | None = None,
-        category: str | UUID | None = None,
-        objective: str | None = None,
-        score_value_output_key: str = "score_value",
-        rationale_output_key: str = "rationale",
-        description_output_key: str = "description",
-        metadata_output_key: str = "metadata",
-        category_output_key: str = "category",
-        response_json_schema: JsonSchemaDefinition | None = None,
-    ) -> UnvalidatedScore:
-        score: UnvalidatedScore | None = None
-        try:
-            score = await super()._score_value_with_llm_async(
-                prompt_target=prompt_target,
-                system_prompt=system_prompt,
-                message_value=message_value,
-                message_data_type=message_data_type,
-                scored_prompt_id=scored_prompt_id,
-                prepended_text_message_piece=prepended_text_message_piece,
-                category=category,
-                objective=objective,
-                score_value_output_key=score_value_output_key,
-                rationale_output_key=rationale_output_key,
-                description_output_key=description_output_key,
-                metadata_output_key=metadata_output_key,
-                category_output_key=category_output_key,
-                response_json_schema=response_json_schema,
-            )
-            if score is None:
-                raise ValueError("Score returned None")
-            # raise an exception if it's not parsable as a float
-            float(score.raw_score_value)
-        except ValueError:
-            score_value = score.raw_score_value if score else "None"
-            raise InvalidJsonException(
-                message=(f"Invalid JSON response, score_value should be a float not this: {score_value}")
-            ) from None
-        return score
diff --git a/pyrit/score/llm_scoring.py b/pyrit/score/llm_scoring.py
@@ -0,0 +1,179 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
+from __future__ import annotations
+
+import uuid
+from typing import TYPE_CHECKING, Any
+
+from pyrit.exceptions import InvalidJsonException, pyrit_json_retry
+from pyrit.models import JSON_SCHEMA_METADATA_KEY, Message, MessagePiece
+
+if TYPE_CHECKING:
+    from collections.abc import Sequence
+
+    from pyrit.models import (
+        ComponentIdentifier,
+        JsonSchemaDefinition,
+        PromptDataType,
+        UnvalidatedScore,
+    )
+    from pyrit.prompt_target import PromptTarget
+    from pyrit.score.response_handler import ResponseHandler
+
+
+async def _run_llm_scoring_async(
+    *,
+    chat_target: PromptTarget,
+    system_prompt: str,
+    response_handler: ResponseHandler,
+    value: str,
+    data_type: PromptDataType,
+    scored_prompt_id: str | uuid.UUID,
+    scorer_identifier: ComponentIdentifier,
+    prepended_text: str | None = None,
+    category: Sequence[str] | str | None = None,
+    objective: str | None = None,
+    response_json_schema: JsonSchemaDefinition | None = None,
+    numeric_value: bool = False,
+) -> UnvalidatedScore:
+    """
+    Perform a single scoring round-trip against an LLM target and parse the result.
+
+    This is the shared LLM evaluation mechanism: it sets the system prompt on the target,
+    sends the value to be scored, applies the standard JSON retry behavior, and delegates
+    parsing to ``response_handler``. It is intentionally stateless and independent of any
+    particular ``Scorer`` so that scorers can compose it without inheriting LLM machinery.
+
+    Args:
+        chat_target (PromptTarget): The target LLM to send the message to.
+        system_prompt (str): The system-level prompt that guides the target LLM.
+        response_handler (ResponseHandler): Parser that turns the target's raw text into an
+            ``UnvalidatedScore``.
+        value (str): The content to be scored (e.g. text, image path, audio path).
+        data_type (PromptDataType): The data type of ``value`` (e.g. "text", "image_path").
+        scored_prompt_id (str | uuid.UUID): The ID of the message piece being scored.
+        scorer_identifier (ComponentIdentifier): Identifier of the calling scorer, stored on
+            the resulting score.
+        prepended_text (str | None): Text context to prepend before ``value`` as a separate
+            piece. Useful for adding objective/context when scoring non-text content.
+            Defaults to None.
+        category (Sequence[str] | str | None): The category of the score. May instead be parsed
+            from the response; supplying both is an error. Defaults to None.
+        objective (str | None): The objective associated with the score, used for
+            contextualizing the result. Defaults to None.
+        response_json_schema (JsonSchemaDefinition | None): Optional JSON schema constraining the
+            response. Forwarded to the request metadata; targets that natively support JSON
+            schemas enforce it, others have it omitted by normalization. Defaults to None.
+        numeric_value (bool): When True, the parsed ``raw_score_value`` must be parsable as a
+            float; otherwise an ``InvalidJsonException`` is raised (without retrying). Defaults
+            to False.
+
+    Returns:
+        UnvalidatedScore: The parsed score, whose ``raw_score_value`` still needs to be
+            normalized and validated by the caller.
+
+    Raises:
+        InvalidJsonException: If the response is not valid JSON, is missing required keys, or
+            (when ``numeric_value`` is True) the score value cannot be converted to a float.
+        Exception: For other unexpected errors during scoring.
+    """
+    score = await _send_and_parse_async(
+        chat_target=chat_target,
+        system_prompt=system_prompt,
+        response_handler=response_handler,
+        value=value,
+        data_type=data_type,
+        scored_prompt_id=scored_prompt_id,
+        scorer_identifier=scorer_identifier,
+        prepended_text=prepended_text,
+        category=category,
+        objective=objective,
+        response_json_schema=response_json_schema,
+    )
+
+    if numeric_value:
+        try:
+            # float() raises ValueError for non-numeric text and TypeError for non-string,
+            # non-numeric values (e.g. None); both mean the score is unusable. The check runs
+            # outside the JSON retry, so a well-formed-but-non-numeric response is not retried.
+            float(score.raw_score_value)
+        except (ValueError, TypeError):
+            raise InvalidJsonException(
+                message=f"Invalid JSON response, score_value should be a float not this: {score.raw_score_value}"
+            ) from None
+
+    return score
+
+
+@pyrit_json_retry
+async def _send_and_parse_async(
+    *,
+    chat_target: PromptTarget,
+    system_prompt: str,
+    response_handler: ResponseHandler,
+    value: str,
+    data_type: PromptDataType,
+    scored_prompt_id: str | uuid.UUID,
+    scorer_identifier: ComponentIdentifier,
+    prepended_text: str | None = None,
+    category: Sequence[str] | str | None = None,
+    objective: str | None = None,
+    response_json_schema: JsonSchemaDefinition | None = None,
+) -> UnvalidatedScore:
+    """
+    Send one scoring request to ``chat_target`` and parse the text piece of its reply.
+
+    Raises:
+        InvalidJsonException: If the reply has no text piece, or if ``response_handler`` raises it.
+        Exception: If sending the prompt fails; the original error is chained as the cause.
+    """
+    conversation_id = str(uuid.uuid4())
+    chat_target.set_system_prompt(system_prompt=system_prompt, conversation_id=conversation_id)
+
+    prompt_metadata: dict[str, Any] = {"response_format": "json"}
+    if response_json_schema is not None:
+        # Always forward the schema; normalization omits it for targets that cannot natively enforce one.
+        prompt_metadata[JSON_SCHEMA_METADATA_KEY] = response_json_schema
+
+    # Prepended text context goes first (if provided), then the main message being scored.
+    message_pieces: list[MessagePiece] = []
+    if prepended_text:
+        message_pieces.append(
+            MessagePiece(
+                role="user",
+                original_value=prepended_text,
+                original_value_data_type="text",
+                converted_value_data_type="text",
+                conversation_id=conversation_id,
+                prompt_metadata=prompt_metadata,
+            )
+        )
+    message_pieces.append(
+        MessagePiece(
+            role="user",
+            original_value=value,
+            original_value_data_type=data_type,
+            converted_value_data_type=data_type,
+            conversation_id=conversation_id,
+            prompt_metadata=prompt_metadata,
+        )
+    )
+
+    try:
+        response = await chat_target.send_prompt_async(message=Message(message_pieces=message_pieces))
+    except Exception as ex:
+        raise Exception(f"Error scoring prompt with original prompt ID: {scored_prompt_id}") from ex
+
+    # A bare next() would raise StopIteration here, which a coroutine surfaces as an opaque RuntimeError.
+    text_piece = next((p for p in response[0].message_pieces if p.converted_value_data_type == "text"), None)
+    if text_piece is None:
+        raise InvalidJsonException(message=f"No text piece in scorer response for prompt ID: {scored_prompt_id}")
+
+    return response_handler.parse(
+        response_text=text_piece.converted_value,
+        scorer_identifier=scorer_identifier,
+        scored_prompt_id=scored_prompt_id,
+        category=category,
+        objective=objective,
+    )