diff --git a/sentry_sdk/integrations/litellm.py b/sentry_sdk/integrations/litellm.py index 3cff0fbc23..ed890ef6fd 100644 --- a/sentry_sdk/integrations/litellm.py +++ b/sentry_sdk/integrations/litellm.py @@ -6,6 +6,7 @@ from sentry_sdk.ai.monitoring import record_token_usage from sentry_sdk.ai.utils import ( get_start_span_function, + normalize_message_roles, set_data_normalized, truncate_and_annotate_messages, transform_openai_content_part, @@ -23,6 +24,11 @@ try: import litellm # type: ignore[import-not-found] from litellm import input_callback, success_callback, failure_callback + from litellm.types.llms.openai import ( # type: ignore[import-not-found] + ResponseAPIUsage, + ResponseCompletedEvent, + ResponsesAPIResponse, + ) except ImportError: raise DidNotEnable("LiteLLM not installed") @@ -66,6 +72,48 @@ def _convert_message_parts(messages: "List[Dict[str, Any]]") -> "List[Dict[str, return messages +def _record_responses_conversation_id( + span: "Any", complete_input: "Dict[str, Any]" +) -> None: + """Set the conversation id on the span when the Responses API request carries one.""" + conversation = complete_input.get("conversation") + if conversation is None: + return + + if isinstance(conversation, str): + conversation_id = conversation + elif isinstance(conversation, dict): + conversation_id = conversation.get("id") + else: + conversation_id = None + + if conversation_id is not None: + set_data_normalized(span, SPANDATA.GEN_AI_CONVERSATION_ID, conversation_id) + + +def _record_responses_input_messages( + span: "Any", scope: "Any", responses_input: "Any" +) -> None: + """Record the request messages for a Responses API call.""" + if not responses_input: + return + + # `input` is either a string or a list of message dicts (same shape as + # the OpenAI Responses API). + if isinstance(responses_input, str): + input_messages = [responses_input] + else: + input_messages = list(responses_input) + normalized = normalize_message_roles(input_messages) # type: ignore[arg-type] + messages_data = truncate_and_annotate_messages(normalized, span, scope) + if messages_data is not None: + span.set_data( + SPANDATA.GEN_AI_REQUEST_MESSAGES, + messages_data, + unpack=False, + ) + + def _input_callback(kwargs: "Dict[str, Any]") -> None: """Handle the start of a request.""" integration = sentry_sdk.get_client().get_integration(LiteLLMIntegration) @@ -84,16 +132,17 @@ def _input_callback(kwargs: "Dict[str, Any]") -> None: call_type = kwargs.get("call_type", None) if call_type == "embedding" or call_type == "aembedding": operation = "embeddings" + op = consts.OP.GEN_AI_EMBEDDINGS + elif call_type == "responses" or call_type == "aresponses": + operation = "responses" + op = consts.OP.GEN_AI_RESPONSES else: operation = "chat" + op = consts.OP.GEN_AI_CHAT # Start a new span/transaction span = get_start_span_function()( - op=( - consts.OP.GEN_AI_CHAT - if operation == "chat" - else consts.OP.GEN_AI_EMBEDDINGS - ), + op=op, name=f"{operation} {model}", origin=LiteLLMIntegration.origin, ) @@ -106,14 +155,15 @@ def _input_callback(kwargs: "Dict[str, Any]") -> None: set_data_normalized(span, SPANDATA.GEN_AI_SYSTEM, provider) set_data_normalized(span, SPANDATA.GEN_AI_OPERATION_NAME, operation) - # Record input/messages if allowed - if should_send_default_pii() and integration.include_prompts: - if operation == "embeddings": - # For embeddings, look for the 'input' parameter + # Per-operation request data. Conversation id (responses) is set + # unconditionally; user-content fields are gated on PII / include_prompts. + record_prompts = should_send_default_pii() and integration.include_prompts + scope = sentry_sdk.get_current_scope() + + if operation == "embeddings": + if record_prompts: embedding_input = kwargs.get("input") if embedding_input: - scope = sentry_sdk.get_current_scope() - # Normalize to list format input_list = ( embedding_input if isinstance(embedding_input, list) @@ -129,11 +179,23 @@ def _input_callback(kwargs: "Dict[str, Any]") -> None: messages_data, unpack=False, ) - else: - # For chat, look for the 'messages' parameter + + elif operation == "responses": + # litellm unpacks `extra_body` into the request body, so the + # `conversation` field shows up in additional_args.complete_input_dict + # rather than as a top-level kwarg. + complete_input = (kwargs.get("additional_args") or {}).get( + "complete_input_dict" + ) or {} + _record_responses_conversation_id(span, complete_input) + if record_prompts: + _record_responses_input_messages(span, scope, kwargs.get("input")) + + else: + # Chat completions. + if record_prompts: messages = kwargs.get("messages", []) if messages: - scope = sentry_sdk.get_current_scope() messages = _convert_message_parts(messages) messages_data = truncate_and_annotate_messages(messages, span, scope) if messages_data is not None: @@ -164,13 +226,122 @@ async def _async_input_callback(kwargs: "Dict[str, Any]") -> None: return _input_callback(kwargs) +def _record_chat_response_messages(span: "Any", response: "Any") -> None: + """Record response.text from a Chat Completions response.""" + response_messages = [] + for choice in response.choices: + message = getattr(choice, "message", None) + if message is None: + continue + if hasattr(message, "model_dump"): + response_messages.append(message.model_dump()) + elif hasattr(message, "dict"): + response_messages.append(message.dict()) + else: + # Fallback for basic message objects + msg = {} + if hasattr(message, "role"): + msg["role"] = message.role + if hasattr(message, "content"): + msg["content"] = message.content + if hasattr(message, "tool_calls"): + msg["tool_calls"] = message.tool_calls + response_messages.append(msg) + + if response_messages: + set_data_normalized(span, SPANDATA.GEN_AI_RESPONSE_TEXT, response_messages) + + +def _record_responses_output(span: "Any", response: "ResponsesAPIResponse") -> None: + """Record response text and tool calls from a Responses API response.""" + output_text = [] # type: List[Any] + tool_calls = [] # type: List[Any] + for output in response.output: + output_type = getattr(output, "type", None) + if output_type == "function_call": + if hasattr(output, "model_dump"): + tool_calls.append(output.model_dump()) + elif hasattr(output, "dict"): + tool_calls.append(output.dict()) + elif output_type == "message": + for content_item in getattr(output, "content", []) or []: + text = getattr(content_item, "text", None) + if text is not None: + output_text.append(text) + elif hasattr(content_item, "model_dump"): + output_text.append(content_item.model_dump()) + elif hasattr(content_item, "dict"): + output_text.append(content_item.dict()) + + if tool_calls: + set_data_normalized( + span, + SPANDATA.GEN_AI_RESPONSE_TOOL_CALLS, + tool_calls, + unpack=False, + ) + if output_text: + set_data_normalized(span, SPANDATA.GEN_AI_RESPONSE_TEXT, output_text) + + +def _record_token_usage_from_response(span: "Any", response: "Any") -> None: + """Record token usage. The shape of ``usage`` depends on the litellm + processing pipeline rather than the API path: + + - ``ResponseAPIUsage``: raw Responses API usage (``input_tokens`` / + ``output_tokens``). Seen when litellm has not yet normalized the value. + - ``dict``: chat-style dict (``prompt_tokens`` / ``completion_tokens``). + litellm assembles streaming Responses API usage as a dict. + - Otherwise: chat-style Pydantic ``Usage`` (``prompt_tokens`` / + ``completion_tokens``). Used for Chat Completions, Embeddings, and + non-streaming Responses API after litellm's post-processing. + """ + usage = getattr(response, "usage", None) + if usage is None: + return + + if isinstance(usage, ResponseAPIUsage): + record_token_usage( + span, + input_tokens=usage.input_tokens, + output_tokens=usage.output_tokens, + total_tokens=usage.total_tokens, + ) + elif isinstance(usage, dict): + record_token_usage( + span, + input_tokens=usage.get("prompt_tokens"), + output_tokens=usage.get("completion_tokens"), + total_tokens=usage.get("total_tokens"), + ) + else: + record_token_usage( + span, + input_tokens=getattr(usage, "prompt_tokens", None), + output_tokens=getattr(usage, "completion_tokens", None), + total_tokens=getattr(usage, "total_tokens", None), + ) + + def _success_callback( kwargs: "Dict[str, Any]", - completion_response: "Any", + response: "Any", start_time: "datetime", end_time: "datetime", ) -> None: - """Handle successful completion.""" + """Handle a successful chat completion, embeddings, or Responses API call. + + The shape of `response` differs between API paths: + - Chat Completions: ModelResponse with ``.choices[].message`` and + ``.usage`` carrying ``prompt_tokens`` / ``completion_tokens``. + - Responses API (non-streaming): ResponsesAPIResponse with ``.output[]`` + items (``message`` / ``function_call``) and ``.usage`` carrying + ``input_tokens`` / ``output_tokens``. + - Responses API (streaming): a ResponseCompletedEvent wrapping a + ``ResponsesAPIResponse``, which we unwrap below. + - Embeddings: CreateEmbeddingResponse with ``.usage`` only (no choices + or output). + """ metadata = _get_metadata_dict(kwargs) span = metadata.get("_sentry_span") @@ -181,48 +352,23 @@ def _success_callback( if integration is None: return + # Streaming Responses API: unwrap the ResponseCompletedEvent so the rest of + # the function sees the assembled ResponsesAPIResponse directly. + if isinstance(response, ResponseCompletedEvent): + response = response.response + try: - # Record model information - if hasattr(completion_response, "model"): - set_data_normalized( - span, SPANDATA.GEN_AI_RESPONSE_MODEL, completion_response.model - ) + # `model` is set by all API shapes (chat / responses / embeddings). + if hasattr(response, "model"): + set_data_normalized(span, SPANDATA.GEN_AI_RESPONSE_MODEL, response.model) - # Record response content if allowed if should_send_default_pii() and integration.include_prompts: - if hasattr(completion_response, "choices"): - response_messages = [] - for choice in completion_response.choices: - if hasattr(choice, "message"): - if hasattr(choice.message, "model_dump"): - response_messages.append(choice.message.model_dump()) - elif hasattr(choice.message, "dict"): - response_messages.append(choice.message.dict()) - else: - # Fallback for basic message objects - msg = {} - if hasattr(choice.message, "role"): - msg["role"] = choice.message.role - if hasattr(choice.message, "content"): - msg["content"] = choice.message.content - if hasattr(choice.message, "tool_calls"): - msg["tool_calls"] = choice.message.tool_calls - response_messages.append(msg) - - if response_messages: - set_data_normalized( - span, SPANDATA.GEN_AI_RESPONSE_TEXT, response_messages - ) + if isinstance(response, ResponsesAPIResponse): + _record_responses_output(span, response) + elif hasattr(response, "choices"): + _record_chat_response_messages(span, response) - # Record token usage - if hasattr(completion_response, "usage"): - usage = completion_response.usage - record_token_usage( - span, - input_tokens=getattr(usage, "prompt_tokens", None), - output_tokens=getattr(usage, "completion_tokens", None), - total_tokens=getattr(usage, "total_tokens", None), - ) + _record_token_usage_from_response(span, response) finally: is_streaming = kwargs.get("stream") diff --git a/tests/integrations/litellm/test_litellm.py b/tests/integrations/litellm/test_litellm.py index 18f8cfaf6e..18f18cbfeb 100644 --- a/tests/integrations/litellm/test_litellm.py +++ b/tests/integrations/litellm/test_litellm.py @@ -2135,3 +2135,319 @@ def test_convert_message_parts_image_url_missing_url(): converted = _convert_message_parts(messages) # Should return item unchanged assert converted[0]["content"][0]["type"] == "image_url" + + +def _make_responses_api_response( + model="gpt-4.1-nano", + text="the model response", + input_tokens=12, + output_tokens=24, + total_tokens=36, +): + """Build a real openai.types.responses.Response for use as a fake HTTP + payload. litellm parses the JSON into a litellm ResponsesAPIResponse.""" + import openai.types.responses as resp_types + + return resp_types.Response( + id="resp-test", + output=[ + resp_types.ResponseOutputMessage( + id="msg_123", + type="message", + status="completed", + content=[ + resp_types.ResponseOutputText( + text=text, + type="output_text", + annotations=[], + ) + ], + role="assistant", + ) + ], + parallel_tool_calls=False, + tool_choice="none", + tools=[], + created_at=10000000, + model=model, + object="response", + usage=resp_types.ResponseUsage( + input_tokens=input_tokens, + input_tokens_details=resp_types.response_usage.InputTokensDetails( + cached_tokens=0, + ), + output_tokens=output_tokens, + output_tokens_details=resp_types.response_usage.OutputTokensDetails( + reasoning_tokens=0, + ), + total_tokens=total_tokens, + ), + ) + + +@pytest.mark.parametrize( + "conversation, expected_id", + [ + pytest.param(None, None, id="no_conversation"), + pytest.param({"id": "conv_abc123"}, "conv_abc123", id="dict"), + pytest.param("conv_str_id", "conv_str_id", id="string"), + ], +) +def test_responses_conversation_id( + reset_litellm_executor, + sentry_init, + capture_events, + get_model_response, + conversation, + expected_id, +): + sentry_init( + integrations=[LiteLLMIntegration()], + traces_sample_rate=1.0, + ) + events = capture_events() + + client = HTTPHandler() + fake_response = get_model_response( + _make_responses_api_response(), + serialize_pydantic=True, + ) + + extra_body = {"conversation": conversation} if conversation is not None else {} + + with mock.patch.object(client, "post", return_value=fake_response): + with start_transaction(name="litellm test"): + litellm.responses( + model="openai/gpt-4.1-nano", + input="What is the capital of France?", + client=client, + extra_body=extra_body, + ) + litellm_utils.executor.shutdown(wait=True) + + (event,) = events + response_spans = [ + span for span in event["spans"] if span["op"] == OP.GEN_AI_RESPONSES + ] + (span,) = response_spans + + assert span["description"] == "responses gpt-4.1-nano" + assert span["data"][SPANDATA.GEN_AI_OPERATION_NAME] == "responses" + + if expected_id is None: + assert SPANDATA.GEN_AI_CONVERSATION_ID not in span["data"] + else: + assert span["data"][SPANDATA.GEN_AI_CONVERSATION_ID] == expected_id + + +def test_responses_records_input_output_and_usage( + reset_litellm_executor, + sentry_init, + capture_events, + get_model_response, +): + sentry_init( + integrations=[LiteLLMIntegration(include_prompts=True)], + traces_sample_rate=1.0, + send_default_pii=True, + ) + events = capture_events() + + client = HTTPHandler() + fake_response = get_model_response( + _make_responses_api_response(text="the model response"), + serialize_pydantic=True, + ) + + with mock.patch.object(client, "post", return_value=fake_response): + with start_transaction(name="litellm test"): + litellm.responses( + model="openai/gpt-4.1-nano", + input="What is the capital of France?", + client=client, + extra_body={"conversation": {"id": "conv_xyz"}}, + ) + litellm_utils.executor.shutdown(wait=True) + + (event,) = events + response_spans = [ + span for span in event["spans"] if span["op"] == OP.GEN_AI_RESPONSES + ] + (span,) = response_spans + + assert span["data"][SPANDATA.GEN_AI_CONVERSATION_ID] == "conv_xyz" + assert SPANDATA.GEN_AI_REQUEST_MESSAGES in span["data"] + assert "the model response" in span["data"][SPANDATA.GEN_AI_RESPONSE_TEXT] + assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 12 + assert span["data"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 24 + assert span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 36 + + +def test_responses_no_pii_omits_messages( + reset_litellm_executor, + sentry_init, + capture_events, + get_model_response, +): + sentry_init( + integrations=[LiteLLMIntegration(include_prompts=True)], + traces_sample_rate=1.0, + # send_default_pii not set -> defaults to False + ) + events = capture_events() + + client = HTTPHandler() + fake_response = get_model_response( + _make_responses_api_response(), + serialize_pydantic=True, + ) + + with mock.patch.object(client, "post", return_value=fake_response): + with start_transaction(name="litellm test"): + litellm.responses( + model="openai/gpt-4.1-nano", + input="What is the capital of France?", + client=client, + extra_body={"conversation": {"id": "conv_xyz"}}, + ) + litellm_utils.executor.shutdown(wait=True) + + (event,) = events + response_spans = [ + span for span in event["spans"] if span["op"] == OP.GEN_AI_RESPONSES + ] + (span,) = response_spans + + # Conversation id is not PII, but request/response content is + assert span["data"][SPANDATA.GEN_AI_CONVERSATION_ID] == "conv_xyz" + assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in span["data"] + assert SPANDATA.GEN_AI_RESPONSE_TEXT not in span["data"] + + +def test_responses_streaming_unwraps_completed_event(sentry_init, capture_events): + """For streaming responses, success_handler receives a ResponseCompletedEvent + wrapping the assembled ResponsesAPIResponse. We must unwrap it to read + usage/output from the inner response.""" + from litellm.types.llms.openai import ResponsesAPIResponse, ResponseCompletedEvent + + sentry_init( + integrations=[LiteLLMIntegration(include_prompts=True)], + traces_sample_rate=1.0, + send_default_pii=True, + ) + events = capture_events() + + inner_response = ResponsesAPIResponse( + **_make_responses_api_response().model_dump(by_alias=True, exclude_none=True), + ) + wrapper = ResponseCompletedEvent(type="response.completed", response=inner_response) + + kwargs = { + "model": "openai/gpt-4.1-nano", + "input": "What is the capital of France?", + "call_type": "responses", + "stream": True, + "complete_streaming_response": inner_response, + "additional_args": {"complete_input_dict": {}}, + } + + with start_transaction(name="litellm test"): + _input_callback(kwargs) + _success_callback(kwargs, wrapper, datetime.now(), datetime.now()) + + (event,) = events + (span,) = event["spans"] + + assert span["op"] == OP.GEN_AI_RESPONSES + assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 12 + assert span["data"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 24 + assert span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 36 + assert "the model response" in span["data"][SPANDATA.GEN_AI_RESPONSE_TEXT] + + +def test_responses_async_streaming_dict_usage(sentry_init, capture_events): + """For async streaming responses, litellm transforms `usage` into a chat-style + dict on the assembled ResponsesAPIResponse (see + Logging._get_assembled_streaming_response). We must read the chat-style keys + when usage is a dict.""" + from litellm.types.llms.openai import ResponsesAPIResponse + + sentry_init( + integrations=[LiteLLMIntegration()], + traces_sample_rate=1.0, + ) + events = capture_events() + + response_data = _make_responses_api_response().model_dump( + by_alias=True, exclude_none=True + ) + response = ResponsesAPIResponse(**response_data) + # litellm replaces ResponseAPIUsage with a chat-style dict during streaming + # assembly; mirror that mutation here. + response.usage = { + "prompt_tokens": 7, + "completion_tokens": 2, + "total_tokens": 9, + } + + kwargs = { + "model": "openai/gpt-4.1-nano", + "input": "What is the capital of France?", + "call_type": "aresponses", + "stream": True, + "async_complete_streaming_response": response, + "additional_args": {"complete_input_dict": {}}, + } + + with start_transaction(name="litellm test"): + _input_callback(kwargs) + _success_callback(kwargs, response, datetime.now(), datetime.now()) + + (event,) = events + (span,) = event["spans"] + + assert span["op"] == OP.GEN_AI_RESPONSES + assert span["data"][SPANDATA.GEN_AI_USAGE_INPUT_TOKENS] == 7 + assert span["data"][SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS] == 2 + assert span["data"][SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS] == 9 + + +@pytest.mark.asyncio(loop_scope="session") +async def test_aresponses_call_type_treated_as_responses( + sentry_init, + capture_events, + get_model_response, +): + """aresponses (async) call_type should produce a responses span.""" + sentry_init( + integrations=[LiteLLMIntegration()], + traces_sample_rate=1.0, + ) + events = capture_events() + + client = AsyncHTTPHandler() + fake_response = get_model_response( + _make_responses_api_response(), + serialize_pydantic=True, + ) + + async def fake_post(*args, **kwargs): + return fake_response + + with mock.patch.object(client, "post", new=fake_post): + with start_transaction(name="litellm test"): + await litellm.aresponses( + model="openai/gpt-4.1-nano", + input="What is the capital of France?", + client=client, + ) + await GLOBAL_LOGGING_WORKER.flush() + await asyncio.sleep(0.5) + + (event,) = events + response_spans = [ + span for span in event["spans"] if span["op"] == OP.GEN_AI_RESPONSES + ] + (span,) = response_spans + + assert span["data"][SPANDATA.GEN_AI_OPERATION_NAME] == "responses"