Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
def _map_finish_reason(anthropic_reason):
"""Map an Anthropic stop_reason to the OTel GenAI FinishReason enum value."""
if not anthropic_reason:
return anthropic_reason
return ""
return _FINISH_REASON_MAP.get(anthropic_reason, anthropic_reason)


Expand Down Expand Up @@ -256,9 +256,7 @@ def _build_output_messages_from_content(response):
"role": response.get("role", "assistant"),
"parts": [{"type": "text", "content": response.get("completion")}],
}
mapped = _map_finish_reason(response.get("stop_reason"))
if mapped:
msg["finish_reason"] = mapped
msg["finish_reason"] = _map_finish_reason(response.get("stop_reason"))
return [msg]

if not response.get("content"):
Expand Down Expand Up @@ -295,9 +293,7 @@ def _build_output_messages_from_content(response):
"role": response.get("role", "assistant"),
"parts": parts,
}
mapped = _map_finish_reason(response.get("stop_reason"))
if mapped:
msg["finish_reason"] = mapped
msg["finish_reason"] = _map_finish_reason(response.get("stop_reason"))
return [msg]


Expand Down Expand Up @@ -449,8 +445,7 @@ def set_streaming_response_attributes(span, complete_response_events):
"role": "assistant",
"parts": parts,
}
if finish_reasons:
msg["finish_reason"] = finish_reasons[-1]
msg["finish_reason"] = finish_reasons[-1] if finish_reasons else ""
output_messages = [msg]
set_span_attribute(
span,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -375,6 +375,34 @@ def test_streaming_finish_reasons_set_when_content_tracing_disabled():
assert GenAIAttributes.GEN_AI_OUTPUT_MESSAGES not in span.attributes


def test_finish_reason_empty_string_when_none():
    """finish_reason must be '' (not omitted) when stop_reason is None (Bedrock convention)."""
    test_span = make_span()
    set_response_attributes(
        test_span,
        _make_response([_make_text_block("Hello")], stop_reason=None),
    )

    messages = json.loads(test_span.attributes[GenAIAttributes.GEN_AI_OUTPUT_MESSAGES])
    assert len(messages) == 1
    first = messages[0]
    assert "finish_reason" in first, "finish_reason key must always be present"
    assert first["finish_reason"] == "", (
        f"Expected '' for missing stop_reason, got '{first['finish_reason']}'"
    )


def test_streaming_finish_reason_empty_string_when_none():
    """Streaming: finish_reason must be '' when no finish_reason in events."""
    span = make_span()
    events = [{"type": "text", "text": "Hello", "index": 0}]
    set_streaming_response_attributes(span, events)

    raw = span.attributes.get(GenAIAttributes.GEN_AI_OUTPUT_MESSAGES)
    # Assert presence first: a bare `if raw:` would let the test pass
    # silently if GEN_AI_OUTPUT_MESSAGES were never written, hiding
    # regressions in the streaming path.
    assert raw is not None, "GEN_AI_OUTPUT_MESSAGES must be set for streaming text events"
    output = json.loads(raw)
    assert output[0]["finish_reason"] == "", (
        f"Expected '' for missing streaming finish_reason, got '{output[0]['finish_reason']}'"
    )

Comment on lines +398 to +404
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

Make the streaming fallback test fail when output messages are missing.

At Line 399, if raw: makes the test pass even if GEN_AI_OUTPUT_MESSAGES is never written. Assert presence first so regressions are caught.

✅ Suggested test hardening
-    raw = span.attributes.get(GenAIAttributes.GEN_AI_OUTPUT_MESSAGES)
-    if raw:
-        output = json.loads(raw)
-        assert output[0]["finish_reason"] == "", (
-            f"Expected '' for missing streaming finish_reason, got '{output[0]['finish_reason']}'"
-        )
+    raw = span.attributes.get(GenAIAttributes.GEN_AI_OUTPUT_MESSAGES)
+    assert raw is not None, "GEN_AI_OUTPUT_MESSAGES must be set for streaming text events"
+    output = json.loads(raw)
+    assert output[0]["finish_reason"] == "", (
+        f"Expected '' for missing streaming finish_reason, got '{output[0]['finish_reason']}'"
+    )
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
raw = span.attributes.get(GenAIAttributes.GEN_AI_OUTPUT_MESSAGES)
if raw:
output = json.loads(raw)
assert output[0]["finish_reason"] == "", (
f"Expected '' for missing streaming finish_reason, got '{output[0]['finish_reason']}'"
)
raw = span.attributes.get(GenAIAttributes.GEN_AI_OUTPUT_MESSAGES)
assert raw is not None, "GEN_AI_OUTPUT_MESSAGES must be set for streaming text events"
output = json.loads(raw)
assert output[0]["finish_reason"] == "", (
f"Expected '' for missing streaming finish_reason, got '{output[0]['finish_reason']}'"
)
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In
`@packages/opentelemetry-instrumentation-anthropic/tests/test_semconv_span_attrs.py`
around lines 398 - 404, The test currently uses "if raw:" which silently passes
when GenAIAttributes.GEN_AI_OUTPUT_MESSAGES is missing; change it to assert the
attribute exists (e.g., assert raw is not None or assert
GenAIAttributes.GEN_AI_OUTPUT_MESSAGES in span.attributes) before loading JSON
so the test fails on missing output messages, then keep the json.loads(raw) and
the existing finish_reason assertion; refer to the variable raw and the constant
GenAIAttributes.GEN_AI_OUTPUT_MESSAGES in test_semconv_span_attrs.py to locate
where to add the presence assertion.


def test_output_messages_tool_use_response():
"""Tool use in the response should appear as tool_call parts."""
span = make_span()
Expand Down Expand Up @@ -1009,8 +1037,8 @@ def test_image_without_upload_produces_blob_part():


def test_streaming_finish_reason_null_omitted_from_json():
"""When no finish_reason is available, the key must be omitted from
gen_ai.output.messages JSON — NOT serialized as null."""
"""When no finish_reason is available, the key must be present with empty
string value — NOT serialized as null, NOT omitted (Bedrock convention)."""
span = make_span()
# Event with no finish_reason key at all
events = [{"type": "text", "text": "Hello world", "index": 0}]
Expand All @@ -1020,8 +1048,8 @@ def test_streaming_finish_reason_null_omitted_from_json():
assert len(output) == 1
assert output[0]["role"] == "assistant"
assert output[0]["parts"] == [{"type": "text", "content": "Hello world"}]
# finish_reason key must be absent, not null
assert "finish_reason" not in output[0]
# finish_reason key must be present with empty string fallback
assert output[0]["finish_reason"] == ""


def test_streaming_finish_reason_none_does_not_set_span_attr():
Expand Down Expand Up @@ -1536,3 +1564,32 @@ def test_event_attributes_uses_provider_name_not_system():
assert EVENT_ATTRIBUTES[GenAIAttributes.GEN_AI_PROVIDER_NAME] == "anthropic"
assert GenAIAttributes.GEN_AI_SYSTEM not in EVENT_ATTRIBUTES, \
"Deprecated GEN_AI_SYSTEM should not be in EVENT_ATTRIBUTES"


# ---------------------------------------------------------------------------
# _map_finish_reason must return "" for falsy input, mapped value for known
# reasons, and the original string as-is for unknown reasons.
# ---------------------------------------------------------------------------

class TestMapFinishReason:
    """Unit tests for the _map_finish_reason fallback and mapping contract."""

    # Class-body import keeps the helper scoped to this test class;
    # staticmethod() prevents Python from binding it as an instance method
    # when it is later accessed through `self`.
    from opentelemetry.instrumentation.anthropic.span_utils import _map_finish_reason
    _map_finish_reason = staticmethod(_map_finish_reason)

    @pytest.mark.parametrize("falsy_input", [None, "", 0, False])
    def test_returns_empty_string_for_falsy(self, falsy_input):
        # Any falsy stop_reason must normalize to "" so the finish_reason
        # key can always be emitted.
        assert self._map_finish_reason(falsy_input) == ""

    def test_maps_end_turn_to_stop(self):
        assert self._map_finish_reason("end_turn") == "stop"

    def test_maps_tool_use_to_tool_call(self):
        assert self._map_finish_reason("tool_use") == "tool_call"

    def test_maps_max_tokens_to_length(self):
        assert self._map_finish_reason("max_tokens") == "length"

    def test_maps_stop_sequence_to_stop(self):
        assert self._map_finish_reason("stop_sequence") == "stop"

    def test_passes_through_unknown_reason(self):
        # Unknown reasons are forwarded verbatim rather than dropped.
        assert self._map_finish_reason("some_new_reason") == "some_new_reason"
Original file line number Diff line number Diff line change
Expand Up @@ -369,9 +369,7 @@ def set_chat_response(span: Span, response: LLMResult) -> None:
if tool_calls and isinstance(tool_calls, list):
parts.extend(_tool_calls_to_parts(tool_calls))

msg_obj = {"role": role, "parts": parts}
if fr:
msg_obj["finish_reason"] = fr
msg_obj = {"role": role, "parts": parts, "finish_reason": fr if fr else ""}
output_messages.append(msg_obj)

if output_messages:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,7 @@ def test_no_finish_reason_omits_attribute(self, mock_span, monkeypatch):
assert GenAIAttributes.GEN_AI_RESPONSE_FINISH_REASONS not in mock_span.attributes

output = json.loads(mock_span.attributes[GenAIAttributes.GEN_AI_OUTPUT_MESSAGES])
assert "finish_reason" not in output[0]
assert output[0]["finish_reason"] == ""

def test_empty_generation_info_omits_attribute(self, mock_span, monkeypatch):
monkeypatch.setattr(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -556,7 +556,7 @@ def _set_completions(span, choices):

def _map_finish_reason(reason):
if not reason:
return None
return ""
return OPENAI_FINISH_REASON_MAP.get(reason, reason)

def _set_output_messages(span, choices):
Expand Down Expand Up @@ -603,11 +603,12 @@ def _set_output_messages(span, choices):
"name": fc_name,
"arguments": _parse_arguments(fc_args),
})
fr = _map_finish_reason(choice.get("finish_reason")) or "stop"
fr = _map_finish_reason(choice.get("finish_reason"))
entry = {"role": "assistant", "parts": parts, "finish_reason": fr}
if content_filter_results:
entry["content_filter_results"] = content_filter_results
messages.append(entry)

_set_span_attribute(span, GenAIAttributes.GEN_AI_OUTPUT_MESSAGES, json.dumps(messages))

@dont_throw
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -190,13 +190,14 @@ def _set_output_messages(span, choices):

messages = []
for choice in choices:
fr = _map_finish_reason(choice.get("finish_reason")) or "stop"
fr = _map_finish_reason(choice.get("finish_reason"))
entry = {
"role": "assistant",
"parts": [{"content": choice.get("text"), "type": "text"}],
"finish_reason": fr,
}
messages.append(entry)

_set_span_attribute(
span,
GenAIAttributes.GEN_AI_OUTPUT_MESSAGES,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,11 @@ class TracedData(pydantic.BaseModel):
request_service_tier: Optional[str] = pydantic.Field(default=None)
response_service_tier: Optional[str] = pydantic.Field(default=None)

# Response status from Responses API ("completed", "incomplete", "failed", etc.)
response_status: Optional[str] = pydantic.Field(default=None)
# Reason from incomplete_details when status is "incomplete"
incomplete_reason: Optional[str] = pydantic.Field(default=None)

# Trace context - to maintain trace continuity across async operations
trace_context: Any = pydantic.Field(default=None)

Expand All @@ -193,6 +198,41 @@ class Config:
responses: dict[str, TracedData] = {}


def _derive_finish_reason(traced_data: TracedData) -> str:
    """Derive a finish_reason string from the recorded Responses API status.

    The value reflects ``response.status`` instead of being fabricated from
    output block types:

    * falsy / unrecognized status -> ""
    * "failed" / "cancelled"      -> "error"
    * "incomplete"                -> "content_filter" when incomplete_details
      says so, otherwise "length"
    * "completed"                 -> "tool_call" if any output block is a
      tool invocation, else "stop"
    """
    status = traced_data.response_status
    if not status:
        return ""
    if status in ("failed", "cancelled"):
        return "error"
    if status == "incomplete":
        if traced_data.incomplete_reason == "content_filter":
            return "content_filter"
        # Any other incomplete reason (e.g. max_output_tokens) maps to length.
        return "length"
    if status == "completed":
        tool_block_types = (
            "function_call", "file_search_call", "web_search_call",
            "computer_call", "code_interpreter_call",
        )
        for block in (traced_data.output_blocks or {}).values():
            if model_as_dict(block).get("type") in tool_block_types:
                return "tool_call"
        return "stop"
    return ""


def parse_response(response: Union[LegacyAPIResponse, Response]) -> Response:
if isinstance(response, LegacyAPIResponse):
return response.parse()
Expand Down Expand Up @@ -256,7 +296,11 @@ def prepare_kwargs_for_shared_attributes(kwargs):


def _set_responses_json_messages(traced_response: TracedData, span: Span):
"""Set gen_ai.input.messages and gen_ai.output.messages as JSON."""
"""Set gen_ai.input.messages and gen_ai.output.messages as JSON.

finish_reason is derived from response.status via _derive_finish_reason(),
not fabricated from output block types.
"""
# Build input messages
input_messages = []
if traced_response.instructions:
Expand Down Expand Up @@ -352,12 +396,10 @@ def _set_responses_json_messages(traced_response: TracedData, span: Span):
else:
parts.append({"type": "reasoning", "content": summary})
if parts:
has_tool_call = any(p.get("type") == "tool_call" for p in parts)
finish_reason = "tool_call" if has_tool_call else "stop"
output_messages.append({
"role": "assistant",
"parts": parts,
"finish_reason": finish_reason,
"finish_reason": _derive_finish_reason(traced_response),
})

_set_span_attribute(span, GenAIAttributes.GEN_AI_OUTPUT_MESSAGES, json.dumps(output_messages))
Expand Down Expand Up @@ -419,26 +461,13 @@ def set_data_attributes(traced_response: TracedData, span: Span):
traced_response.response_reasoning_effort,
)

# P1-2: Derive finish_reasons from output blocks
if traced_response.output_blocks:
finish_reasons = []
has_tool_call = False
for block in traced_response.output_blocks.values():
block_dict = model_as_dict(block)
block_type = block_dict.get("type")
if block_type == "message":
finish_reasons.append("stop")
elif block_type in ("function_call", "file_search_call", "web_search_call",
"computer_call", "code_interpreter_call"):
has_tool_call = True
if has_tool_call:
finish_reasons.append("tool_call")
if finish_reasons:
_set_span_attribute(
span,
GenAIAttributes.GEN_AI_RESPONSE_FINISH_REASONS,
tuple(finish_reasons),
)
finish_reason = _derive_finish_reason(traced_response)
if finish_reason:
_set_span_attribute(
span,
GenAIAttributes.GEN_AI_RESPONSE_FINISH_REASONS,
(finish_reason,),
)

if should_send_prompts():
_set_responses_json_messages(traced_response, span)
Expand Down Expand Up @@ -585,6 +614,11 @@ def responses_get_or_create_wrapper(tracer: Tracer, wrapped, instance, args, kwa
response_reasoning_effort=non_sentinel_kwargs.get("reasoning", {}).get("effort"),
request_service_tier=existing_data.get("request_service_tier", non_sentinel_kwargs.get("service_tier")),
response_service_tier=existing_data.get("response_service_tier", parsed_response.service_tier),
response_status=parsed_response.status,
incomplete_reason=(
getattr(parsed_response.incomplete_details, "reason", None)
if getattr(parsed_response, "incomplete_details", None) else None
),
# Capture trace context to maintain continuity across async operations
trace_context=existing_data.get("trace_context", context_api.get_current()),
)
Expand Down Expand Up @@ -748,6 +782,11 @@ async def async_responses_get_or_create_wrapper(
response_reasoning_effort=non_sentinel_kwargs.get("reasoning", {}).get("effort"),
request_service_tier=existing_data.get("request_service_tier", non_sentinel_kwargs.get("service_tier")),
response_service_tier=existing_data.get("response_service_tier", parsed_response.service_tier),
response_status=parsed_response.status,
incomplete_reason=(
getattr(parsed_response.incomplete_details, "reason", None)
if getattr(parsed_response, "incomplete_details", None) else None
),
# Capture trace context to maintain continuity across async operations
trace_context=existing_data.get("trace_context", context_api.get_current()),
)
Expand Down Expand Up @@ -1008,6 +1047,11 @@ def _process_complete_response(self):
self._traced_data.response_id = parsed_response.id
self._traced_data.response_model = parsed_response.model
self._traced_data.output_text = self._output_text
self._traced_data.response_status = parsed_response.status
self._traced_data.incomplete_reason = (
getattr(parsed_response.incomplete_details, "reason", None)
if getattr(parsed_response, "incomplete_details", None) else None
)

if parsed_response.usage:
self._traced_data.usage = parsed_response.usage
Expand Down
Loading
Loading