From 0086b83a00632a1279d4c5b744cc9bd0c15ed8ca Mon Sep 17 00:00:00 2001
From: Alexander Alderman Webb
Date: Thu, 26 Mar 2026 15:00:19 +0100
Subject: [PATCH 1/3] ref(openai): Create functions for streaming token usage

---
 sentry_sdk/integrations/openai.py | 167 ++++++++++++++++++++++++++++--
 1 file changed, 157 insertions(+), 10 deletions(-)

diff --git a/sentry_sdk/integrations/openai.py b/sentry_sdk/integrations/openai.py
index 121823db5f..05c8a728f5 100644
--- a/sentry_sdk/integrations/openai.py
+++ b/sentry_sdk/integrations/openai.py
@@ -148,13 +148,17 @@ def _get_usage(usage: "Any", names: "List[str]") -> int:
     return 0
 
 
-def _calculate_token_usage(
+def _calculate_streaming_completions_token_usage(
     messages: "Optional[Iterable[ChatCompletionMessageParam]]",
     response: "Any",
     span: "Span",
     streaming_message_responses: "Optional[List[str]]",
     count_tokens: "Callable[..., Any]",
 ) -> None:
+    """
+    Sets token usage attributes for streaming Completions calls.
+    Reads token counts from the response object if available; otherwise computes them for textual input and output with tiktoken.
+    """
     input_tokens: "Optional[int]" = 0
     input_tokens_cached: "Optional[int]" = 0
     output_tokens: "Optional[int]" = 0
@@ -219,6 +223,151 @@ def _calculate_token_usage(
     )
 
 
+def _calculate_streaming_responses_token_usage(
+    messages: "Optional[Union[str, ResponseInputParam]]",
+    response: "Any",
+    span: "Span",
+    streaming_message_responses: "Optional[List[str]]",
+    count_tokens: "Callable[..., Any]",
+):
+    """
+    Sets token usage attributes for streaming Responses calls.
+    Reads token counts from the response object if available; otherwise computes them for textual input and output with tiktoken.
+    """
+    input_tokens: "Optional[int]" = 0
+    input_tokens_cached: "Optional[int]" = 0
+    output_tokens: "Optional[int]" = 0
+    output_tokens_reasoning: "Optional[int]" = 0
+    total_tokens: "Optional[int]" = 0
+
+    if hasattr(response, "usage"):
+        input_tokens = _get_usage(response.usage, ["input_tokens", "prompt_tokens"])
+        if hasattr(response.usage, "input_tokens_details"):
+            input_tokens_cached = _get_usage(
+                response.usage.input_tokens_details, ["cached_tokens"]
+            )
+
+        output_tokens = _get_usage(
+            response.usage, ["output_tokens", "completion_tokens"]
+        )
+        if hasattr(response.usage, "output_tokens_details"):
+            output_tokens_reasoning = _get_usage(
+                response.usage.output_tokens_details, ["reasoning_tokens"]
+            )
+
+        total_tokens = _get_usage(response.usage, ["total_tokens"])
+
+    # Manually count tokens
+    if input_tokens == 0:
+        for message in messages or []:
+            if isinstance(message, str):
+                input_tokens += count_tokens(message)
+                continue
+            elif isinstance(message, dict):
+                message_content = message.get("content")
+                if message_content is None:
+                    continue
+                # Deliberate use of the Completions helper for both Completions and Responses input formats.
+                text_items = _get_text_items(message_content)
+                input_tokens += sum(count_tokens(text) for text in text_items)
+                continue
+
+    if output_tokens == 0:
+        if streaming_message_responses is not None:
+            for message in streaming_message_responses:
+                output_tokens += count_tokens(message)
+        elif hasattr(response, "choices"):
+            for choice in response.choices:
+                if hasattr(choice, "message") and hasattr(choice.message, "content"):
+                    output_tokens += count_tokens(choice.message.content)
+
+    # Do not set token data if it is 0
+    input_tokens = input_tokens or None
+    input_tokens_cached = input_tokens_cached or None
+    output_tokens = output_tokens or None
+    output_tokens_reasoning = output_tokens_reasoning or None
+    total_tokens = total_tokens or None
+
+    record_token_usage(
+        span,
+        input_tokens=input_tokens,
+        input_tokens_cached=input_tokens_cached,
+        output_tokens=output_tokens,
+        output_tokens_reasoning=output_tokens_reasoning,
+        total_tokens=total_tokens,
+    )
+
+
+def _calculate_token_usage(
+    messages: "Optional[Iterable[ChatCompletionMessageParam]]",
+    response: "Any",
+    span: "Span",
+    count_tokens: "Callable[..., Any]",
+) -> None:
+    """
+    Sets token usage attributes for non-streaming Completions and Responses API calls.
+    Reads token counts from the response object if available; otherwise computes them for textual input and output with tiktoken.
+    """
+    input_tokens: "Optional[int]" = 0
+    input_tokens_cached: "Optional[int]" = 0
+    output_tokens: "Optional[int]" = 0
+    output_tokens_reasoning: "Optional[int]" = 0
+    total_tokens: "Optional[int]" = 0
+
+    if hasattr(response, "usage"):
+        input_tokens = _get_usage(response.usage, ["input_tokens", "prompt_tokens"])
+        if hasattr(response.usage, "input_tokens_details"):
+            input_tokens_cached = _get_usage(
+                response.usage.input_tokens_details, ["cached_tokens"]
+            )
+
+        output_tokens = _get_usage(
+            response.usage, ["output_tokens", "completion_tokens"]
+        )
+        if hasattr(response.usage, "output_tokens_details"):
+            output_tokens_reasoning = _get_usage(
+                response.usage.output_tokens_details, ["reasoning_tokens"]
+            )
+
+        total_tokens = _get_usage(response.usage, ["total_tokens"])
+
+    # Manually count tokens
+    if input_tokens == 0:
+        for message in messages or []:
+            if isinstance(message, str):
+                input_tokens += count_tokens(message)
+                continue
+            elif isinstance(message, dict):
+                message_content = message.get("content")
+                if message_content is None:
+                    continue
+                # Deliberate use of the Completions helper for both Completions and Responses input formats.
+                text_items = _get_text_items(message_content)
+                input_tokens += sum(count_tokens(text) for text in text_items)
+                continue
+
+    if output_tokens == 0 and hasattr(response, "choices"):
+        for choice in response.choices:
+            if hasattr(choice, "message") and hasattr(choice.message, "content"):
+                output_tokens += count_tokens(choice.message.content)
+
+    # Do not set token data if it is 0
+    input_tokens = input_tokens or None
+    input_tokens_cached = input_tokens_cached or None
+    output_tokens = output_tokens or None
+    output_tokens_reasoning = output_tokens_reasoning or None
+    total_tokens = total_tokens or None
+
+    record_token_usage(
+        span,
+        input_tokens=input_tokens,
+        input_tokens_cached=input_tokens_cached,
+        output_tokens=output_tokens,
+        output_tokens_reasoning=output_tokens_reasoning,
+        total_tokens=total_tokens,
+    )
+
+
 def _set_responses_api_input_data(
     span: "Span",
     kwargs: "dict[str, Any]",
@@ -497,7 +646,7 @@ def _set_common_output_data(
         if len(response_text) > 0:
             set_data_normalized(span, SPANDATA.GEN_AI_RESPONSE_TEXT, response_text)
 
-    _calculate_token_usage(input, response, span, None, integration.count_tokens)
+    _calculate_token_usage(input, response, span, integration.count_tokens)
 
     if finish_span:
         span.__exit__(None, None, None)
@@ -533,12 +682,12 @@ def _set_common_output_data(
                     span, SPANDATA.GEN_AI_RESPONSE_TEXT, output_messages["response"]
                 )
 
-        _calculate_token_usage(input, response, span, None, integration.count_tokens)
+        _calculate_token_usage(input, response, span, integration.count_tokens)
 
         if finish_span:
             span.__exit__(None, None, None)
     else:
-        _calculate_token_usage(input, response, span, None, integration.count_tokens)
+        _calculate_token_usage(input, response, span, integration.count_tokens)
 
         if finish_span:
             span.__exit__(None, None, None)
@@ -662,7 +811,7 @@ def new_iterator() -> "Iterator[ChatCompletionChunk]":
                         set_data_normalized(
                             span, SPANDATA.GEN_AI_RESPONSE_TEXT, all_responses
                         )
-                    _calculate_token_usage(
+                    _calculate_streaming_completions_token_usage(
                         messages,
                         response,
                         span,
@@ -706,7 +855,7 @@ async def new_iterator_async() -> "AsyncIterator[ChatCompletionChunk]":
                         set_data_normalized(
                             span, SPANDATA.GEN_AI_RESPONSE_TEXT, all_responses
                         )
-                    _calculate_token_usage(
+                    _calculate_streaming_completions_token_usage(
                         messages,
                         response,
                         span,
@@ -781,7 +930,6 @@ def new_iterator() -> "Iterator[ChatCompletionChunk]":
                             input,
                             x.response,
                             span,
-                            None,
                             integration.count_tokens,
                         )
                         count_tokens_manually = False
@@ -800,7 +948,7 @@ def new_iterator() -> "Iterator[ChatCompletionChunk]":
                         span, SPANDATA.GEN_AI_RESPONSE_TEXT, all_responses
                     )
                     if count_tokens_manually:
-                        _calculate_token_usage(
+                        _calculate_streaming_responses_token_usage(
                             input,
                             response,
                             span,
@@ -830,7 +978,6 @@ async def new_iterator_async() -> "AsyncIterator[ChatCompletionChunk]":
                             input,
                             x.response,
                             span,
-                            None,
                             integration.count_tokens,
                         )
                         count_tokens_manually = False
@@ -849,7 +996,7 @@ async def new_iterator_async() -> "AsyncIterator[ChatCompletionChunk]":
                         span, SPANDATA.GEN_AI_RESPONSE_TEXT, all_responses
                     )
                     if count_tokens_manually:
-                        _calculate_token_usage(
+                        _calculate_streaming_responses_token_usage(
                             input,
                             response,
                             span,

From 26422abbeeeab43bffad9d962db21061f711f03e Mon Sep 17 00:00:00 2001
From: Alexander Alderman Webb
Date: Thu, 26 Mar 2026 15:18:20 +0100
Subject: [PATCH 2/3] remove tests of private details

---
 tests/integrations/openai/test_openai.py | 157 -----------------------
 1 file changed, 157 deletions(-)

diff --git a/tests/integrations/openai/test_openai.py b/tests/integrations/openai/test_openai.py
index 0fd049e742..a871ff3ea1 100644
--- a/tests/integrations/openai/test_openai.py
+++ b/tests/integrations/openai/test_openai.py
@@ -44,7 +44,6 @@
 from sentry_sdk.consts import SPANDATA, OP
 from sentry_sdk.integrations.openai import (
     OpenAIIntegration,
-    _calculate_token_usage,
 )
 
 from sentry_sdk.utils import safe_serialize
@@ -1780,162 +1779,6 @@ async def test_span_origin_embeddings_async(sentry_init, capture_events):
     assert event["spans"][0]["origin"] == "auto.ai.openai"
 
 
-def test_calculate_token_usage_a():
-    span = mock.MagicMock()
-
-    def count_tokens(msg):
-        return len(str(msg))
-
-    response = mock.MagicMock()
-    response.usage = mock.MagicMock()
-    response.usage.completion_tokens = 10
-    response.usage.prompt_tokens = 20
-    response.usage.total_tokens = 30
-    messages = []
-    streaming_message_responses = []
-
-    with mock.patch(
-        "sentry_sdk.integrations.openai.record_token_usage"
-    ) as mock_record_token_usage:
-        _calculate_token_usage(
-            messages, response, span, streaming_message_responses, count_tokens
-        )
-        mock_record_token_usage.assert_called_once_with(
-            span,
-            input_tokens=20,
-            input_tokens_cached=None,
-            output_tokens=10,
-            output_tokens_reasoning=None,
-            total_tokens=30,
-        )
-
-
-def test_calculate_token_usage_b():
-    span = mock.MagicMock()
-
-    def count_tokens(msg):
-        return len(str(msg))
-
-    response = mock.MagicMock()
-    response.usage = mock.MagicMock()
-    response.usage.completion_tokens = 10
-    response.usage.total_tokens = 10
-    messages = [
-        {"content": "one"},
-        {"content": "two"},
-        {"content": "three"},
-    ]
-    streaming_message_responses = []
-
-    with mock.patch(
-        "sentry_sdk.integrations.openai.record_token_usage"
-    ) as mock_record_token_usage:
-        _calculate_token_usage(
-            messages, response, span, streaming_message_responses, count_tokens
-        )
-        mock_record_token_usage.assert_called_once_with(
-            span,
-            input_tokens=11,
-            input_tokens_cached=None,
-            output_tokens=10,
-            output_tokens_reasoning=None,
-            total_tokens=10,
-        )
-
-
-def test_calculate_token_usage_c():
-    span = mock.MagicMock()
-
-    def count_tokens(msg):
-        return len(str(msg))
-
-    response = mock.MagicMock()
-    response.usage = mock.MagicMock()
-    response.usage.prompt_tokens = 20
-    response.usage.total_tokens = 20
-    messages = []
-    streaming_message_responses = [
-        "one",
-        "two",
-        "three",
-    ]
-
-    with mock.patch(
-        "sentry_sdk.integrations.openai.record_token_usage"
-    ) as mock_record_token_usage:
-        _calculate_token_usage(
-            messages, response, span, streaming_message_responses, count_tokens
-        )
-        mock_record_token_usage.assert_called_once_with(
-            span,
-            input_tokens=20,
-            input_tokens_cached=None,
-            output_tokens=11,
-            output_tokens_reasoning=None,
-            total_tokens=20,
-        )
-
-
-def test_calculate_token_usage_d():
-    span = mock.MagicMock()
-
-    def count_tokens(msg):
-        return len(str(msg))
-
-    response = mock.MagicMock()
-    response.usage = mock.MagicMock()
-    response.usage.prompt_tokens = 20
-    response.usage.total_tokens = 20
-    response.choices = [
-        mock.MagicMock(message="one"),
-        mock.MagicMock(message="two"),
-        mock.MagicMock(message="three"),
-    ]
-    messages = []
-    streaming_message_responses = []
-
-    with mock.patch(
-        "sentry_sdk.integrations.openai.record_token_usage"
-    ) as mock_record_token_usage:
-        _calculate_token_usage(
-            messages, response, span, streaming_message_responses, count_tokens
-        )
-        mock_record_token_usage.assert_called_once_with(
-            span,
-            input_tokens=20,
-            input_tokens_cached=None,
-            output_tokens=None,
-            output_tokens_reasoning=None,
-            total_tokens=20,
-        )
-
-
-def test_calculate_token_usage_e():
-    span = mock.MagicMock()
-
-    def count_tokens(msg):
-        return len(str(msg))
-
-    response = mock.MagicMock()
-    messages = []
-    streaming_message_responses = None
-
-    with mock.patch(
-        "sentry_sdk.integrations.openai.record_token_usage"
-    ) as mock_record_token_usage:
-        _calculate_token_usage(
-            messages, response, span, streaming_message_responses, count_tokens
-        )
-        mock_record_token_usage.assert_called_once_with(
-            span,
-            input_tokens=None,
-            input_tokens_cached=None,
-            output_tokens=None,
-            output_tokens_reasoning=None,
-            total_tokens=None,
-        )
-
-
 @pytest.mark.skipif(SKIP_RESPONSES_TESTS, reason="Responses API not available")
 def test_ai_client_span_responses_api_no_pii(sentry_init, capture_events):
     sentry_init(

From e42f7b179ea5821f6f3fb6e829c678e57d19e06a Mon Sep 17 00:00:00 2001
From: Alexander Alderman Webb
Date: Thu, 26 Mar 2026 15:22:06 +0100
Subject: [PATCH 3/3] add return type

---
 sentry_sdk/integrations/openai.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sentry_sdk/integrations/openai.py b/sentry_sdk/integrations/openai.py
index 7ed161b8db..160773775f 100644
--- a/sentry_sdk/integrations/openai.py
+++ b/sentry_sdk/integrations/openai.py
@@ -229,7 +229,7 @@ def _calculate_streaming_responses_token_usage(
     span: "Span",
     streaming_message_responses: "Optional[List[str]]",
     count_tokens: "Callable[..., Any]",
-):
+) -> None:
     """
     Sets token usage attributes for streaming Responses calls.
     Reads token counts from the response object if available; otherwise computes them for textual input and output with tiktoken.