Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ RUN chmod +x ./perfecto-mcp && \
USER perfecto-mcp

ENV MCP_DOCKER=true
ENV OTEL_EXPORTER_OTLP_ENDPOINT=""

# Command to run the application
ENTRYPOINT ["./perfecto-mcp"]
Expand Down
9 changes: 9 additions & 0 deletions build.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,15 @@ def run_pyinstaller(name: str, icon: str):
f'--icon={icon}',
'--clean',
'--noconfirm',
'--hidden-import=opentelemetry.sdk.trace',
'--hidden-import=opentelemetry.sdk.trace.export',
'--hidden-import=opentelemetry.sdk.resources',
'--hidden-import=opentelemetry.sdk.metrics',
'--hidden-import=opentelemetry.sdk.metrics.export',
'--hidden-import=opentelemetry.exporter.otlp.proto.http.trace_exporter',
'--hidden-import=opentelemetry.exporter.otlp.proto.http.metric_exporter',
'--hidden-import=opentelemetry.propagate',
'--collect-submodules=opentelemetry',
])


Expand Down
2 changes: 2 additions & 0 deletions main.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from config.token import PerfectoToken, PerfectoTokenError
from config.version import __version__, __executable__, __bundle__, __uvx__, get_version
from server import register_tools
from telemetry import init_telemetry

PERFECTO_SECURITY_TOKEN_FILE_NAME = "perfecto-security-token.txt"
PERFECTO_SECURITY_TOKEN_FILE_PATH = os.getenv(SECURITY_TOKEN_FILE_ENV_NAME)
Expand Down Expand Up @@ -63,6 +64,7 @@ def get_token() -> PerfectoToken:


def run(log_level: str = "CRITICAL"):
init_telemetry("perfecto-mcp", __version__)
token = get_token()

instructions = """
Expand Down
13 changes: 12 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,9 @@ requires-python = ">=3.11"
dependencies = [
"httpx[http2]>=0.28.1",
"mcp[cli]>=1.17.0",
"opentelemetry-api>=1.20.0",
"opentelemetry-sdk>=1.20.0",
"opentelemetry-exporter-otlp>=1.20.0",
"pyinstaller>=6.0.0",
"pydantic>=2.11.7",
"pydantic-core>=2.33.2",
Expand All @@ -18,12 +21,20 @@ dependencies = [
perfecto-mcp = "main:main"

[tool.setuptools]
py-modules = ["main", "server"]
py-modules = ["main", "server", "telemetry"]
include-package-data = true

[dependency-groups]
dev = [
"pytest>=9.0.2",
]

[tool.setuptools.packages.find]
where = ["."]
include = ["tools", "config", "models", "formatters", "resources"]

[tool.setuptools.package-data]
"resources" = ["*.png"]

[tool.pytest.ini_options]
pythonpath = ["."]
252 changes: 252 additions & 0 deletions telemetry.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,252 @@
"""
Copyright 2025 Perforce Software, Inc.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
import logging
import os
import platform
import time
from typing import Any, Awaitable, Callable

import httpx

# Module-level logger used for telemetry diagnostics.
logger = logging.getLogger(__name__)

# The OpenTelemetry API is treated as optional: when the import fails, `trace`
# and `metrics` are bound to None and the availability flag is cleared.
try:
    from opentelemetry import metrics, trace  # noqa: F401 — must be module-level for patching
except ImportError:
    metrics = None  # type: ignore[assignment]
    trace = None  # type: ignore[assignment]
    _OTEL_API_AVAILABLE = False
else:
    _OTEL_API_AVAILABLE = True

# Metric instruments; both remain None until init_telemetry() creates them.
_duration_histogram = None
_call_counter = None


def init_telemetry(service_name: str, service_version: str) -> None:
    """Install global OpenTelemetry tracer and meter providers for this process.

    The function is best-effort: every failure is logged at DEBUG level and
    swallowed so that telemetry problems never prevent the caller from
    starting. It returns immediately when the OpenTelemetry API could not be
    imported or when the ``OTEL_SDK_DISABLED`` environment variable equals
    ``"true"`` (case-insensitive).

    OTLP/HTTP exporters are attached only when ``OTEL_EXPORTER_OTLP_ENDPOINT``
    is set to a non-empty value; otherwise the providers are installed without
    any exporter. On success the module-level ``_call_counter`` and
    ``_duration_histogram`` instruments are created.

    Args:
        service_name: Value stored in the resource's service-name attribute.
        service_version: Value stored in the resource's service-version attribute.
    """
    global _call_counter, _duration_histogram

    if not _OTEL_API_AVAILABLE:
        return
    if os.getenv("OTEL_SDK_DISABLED", "").lower() == "true":
        return
    try:
        # Lazy SDK imports: defer heavy setup until init_telemetry() and tolerate a
        # missing SDK (ImportError) without breaking module import or startup.
        from opentelemetry.sdk.resources import SERVICE_NAME, SERVICE_VERSION, Resource
        from opentelemetry.sdk.trace import TracerProvider
        from opentelemetry.sdk.trace.export import BatchSpanProcessor

        resource = Resource.create({
            SERVICE_NAME: service_name,
            SERVICE_VERSION: service_version,
            "os.type": platform.system().lower(),
            "host.arch": platform.machine().lower(),
        })
        provider = TracerProvider(resource=resource)

        endpoint = os.getenv("OTEL_EXPORTER_OTLP_ENDPOINT")
        if endpoint:
            try:
                from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter
                provider.add_span_processor(BatchSpanProcessor(OTLPSpanExporter()))
            except Exception:
                logger.debug("OTLP trace exporter setup failed", exc_info=True)

        trace.set_tracer_provider(provider)
        logger.debug("OTel TracerProvider initialised (service=%s, version=%s)", service_name, service_version)

        try:
            # Metrics SDK is optional relative to tracing; keep imports local so a
            # missing metrics package does not prevent trace provider setup.
            from opentelemetry.sdk.metrics import MeterProvider
            from opentelemetry.sdk.metrics.export import PeriodicExportingMetricReader

            readers = []
            if endpoint:
                try:
                    from opentelemetry.exporter.otlp.proto.http.metric_exporter import OTLPMetricExporter
                    readers.append(PeriodicExportingMetricReader(OTLPMetricExporter()))
                except Exception:
                    logger.debug("OTLP metric exporter setup failed", exc_info=True)

            meter_provider = MeterProvider(resource=resource, metric_readers=readers)
            metrics.set_meter_provider(meter_provider)

            meter = metrics.get_meter("perfecto-mcp")
            _call_counter = meter.create_counter(
                "mcp.tool.calls",
                unit="{call}",
                description="Number of MCP tool calls",
            )
            _duration_histogram = meter.create_histogram(
                "mcp.tool.duration",
                unit="s",
                description="MCP tool call duration in seconds",
            )
            logger.debug("OTel MeterProvider initialised")
        except ImportError:
            # Previously swallowed silently; log it so a missing metrics SDK
            # is diagnosable like every other failure path in this function.
            logger.debug("OTel metrics SDK not available; continuing without metrics", exc_info=True)
        except Exception:
            logger.debug("OTel metrics init failed", exc_info=True)

    except ImportError:
        # Previously swallowed silently; a missing tracing SDK is now visible
        # in debug logs instead of leaving telemetry off without a trace.
        logger.debug("OTel SDK not available; continuing without tracing", exc_info=True)
    except Exception:
        logger.debug("OTel init failed; continuing without tracing", exc_info=True)


def _get_meta(ctx: Any) -> dict:
    """Return the request metadata carried by ``ctx``, or an empty dict.

    Reads ``ctx.request_context.request.params.meta``. A falsy value, or any
    exception raised while walking that attribute chain, yields ``{}``.
    """
    try:
        request_meta = ctx.request_context.request.params.meta
        return request_meta if request_meta else {}
    except Exception:
        return {}


def _extract_trace_context(meta: Any):
    """Build a parent trace context from W3C trace headers found in ``meta``.

    ``meta`` may be a plain dict or an object exposing ``model_dump()`` (for
    example a Pydantic model, where ``traceparent``/``tracestate`` are
    attributes rather than dict keys). Dict-style access on such a model never
    finds the headers, so the value is normalised before the lookup.

    Args:
        meta: Request metadata; a falsy value short-circuits to ``None``.

    Returns:
        The result of ``opentelemetry.propagate.extract`` for the collected
        headers, or ``None`` when no usable header is present, when the
        propagation API cannot be imported, or when any step raises.
    """
    if not meta:
        return None
    try:
        from opentelemetry.propagate import extract

        # Normalise model-like metadata to a mapping; plain dicts pass through.
        fields = meta.model_dump() if hasattr(meta, "model_dump") else meta

        carrier = {}
        for header in ("traceparent", "tracestate"):
            if isinstance(fields, dict):
                value = fields.get(header)
            else:
                # Fallback for objects that expose the headers as attributes.
                value = getattr(fields, header, None)
            # Only non-empty strings are meaningful header values.
            if isinstance(value, str) and value:
                carrier[header] = value
        return extract(carrier) if carrier else None
    except Exception:
        # Propagation is best-effort: never let it break the tool call.
        return None
Comment on lines +111 to +130

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hey @diego-ferrand, I was familiarizing myself with the PR and noticed a significant problem with how we are handling the meta within the context.
Here is what my research has turned up so far:

_extract_trace_context never extracts the parent context, so server spans are never linked to the caller's trace. Local spans/metrics still work, which is why this
fails quietly rather than erroring.

Root cause: request.params.meta is not a dict — it's the SDK's RequestParams.Meta Pydantic model (alias="_meta", extra="allow"). The _meta → meta
mapping is handled correctly by the alias, but traceparent/tracestate arrive as model attributes, not dict keys. The current code does dict-style access on it:

if "traceparent" in meta:                 # always False on a BaseModel
    carrier["traceparent"] = meta["traceparent"]   # 'Meta' object is not subscriptable

So carrier is always empty → extract() is never called → parent_ctx is always None.
Reproduced against the real SDK type:

type(meta): mcp.types.RequestParams.Meta
"traceparent" in meta  => False
meta["traceparent"]    => TypeError: 'Meta' object is not subscriptable
_extract_trace_context => None

Suggested fix — normalize to a dict first (handles both the Pydantic-model and plain-dict shapes, so we don't have to assume the client/transport):

def _extract_trace_context(meta):
    if not meta:
        return None
    if hasattr(meta, "model_dump"):
        meta = meta.model_dump()
    ...

model_dump() surfaces the extras correctly:

{'progressToken': None, 'traceparent': '00-...-01', 'tracestate': 'rojo=...'}

Tests: tests/test_telemetry.py currently has no coverage for traceparent / _get_meta / _extract_trace_context, which is how this regressed without anything
failing. A MagicMock ctx would fake in/[] and mask the bug, so the test should build the actual SDK type.
Non-breaking for everything else, but blocking for the OTEL propagation goal — it defeats the whole point of passing traceparent.



def _get_client_info(ctx: Any):
    """Return the client's ``(name, version)`` pair, or ``(None, None)``.

    Both values are read from
    ``ctx.request_context.session.client_params.clientInfo``; any exception
    raised along the way produces the ``(None, None)`` fallback.
    """
    try:
        client = ctx.request_context.session.client_params.clientInfo
        identity = (client.name, client.version)
    except Exception:
        return None, None
    return identity


def _get_session_id(ctx: Any) -> str | None:
try:
session_id = ctx.session_id
return str(session_id) if session_id is not None else None
except Exception:
return None


def _record_span_error(span: Any, error_type: str) -> None:
    """Tag ``span`` with ``error.type`` and set its status to ERROR.

    The two steps are independent and best-effort: a failure in one (including
    an unavailable ``opentelemetry.trace`` module) is swallowed and does not
    affect the other.
    """
    # Step 1: attach the error classification as a span attribute.
    try:
        span.set_attribute("error.type", error_type)
    except Exception:
        pass

    # Step 2: flip the span status; skipped when the status types are unavailable.
    try:
        from opentelemetry.trace import Status, StatusCode
    except Exception:
        return
    try:
        span.set_status(Status(StatusCode.ERROR))
    except Exception:
        pass


def _http_status_to_error_type(status_code: int) -> str:
if status_code in (401, 403):
return "auth_failed"
if status_code == 404:
return "not_found"
if status_code == 429:
return "rate_limited"
if status_code >= 500:
return "server_error"
return f"http_{status_code}"


def _record_metrics(tool_name: str, action: str, elapsed: float, error_type: str | None) -> None:
attrs: dict[str, str] = {"gen_ai.tool.name": tool_name, "mcp.tool.action": action}
if error_type is not None:
attrs["error.type"] = error_type
try:
if _call_counter is not None:
_call_counter.add(1, attrs)
if _duration_histogram is not None:
_duration_histogram.record(elapsed, attrs)
except Exception:
pass


async def run_tool(
    tool_name: str,
    action: str,
    ctx: Any,
    dispatch: Callable[[], Awaitable[Any]],
) -> Any:
    """Await ``dispatch`` inside a SERVER span and record call metrics.

    Telemetry is best-effort. When the OpenTelemetry API is unavailable, or
    when the span cannot be created or started, ``dispatch`` is awaited
    directly and its result returned untouched.

    Args:
        tool_name: Tool identifier; used in the span name and as the
            ``gen_ai.tool.name`` attribute.
        action: Value stored in the ``mcp.tool.action`` attribute.
        ctx: Request context used to read trace metadata, client info and the
            session id; every lookup tolerates missing attributes.
        dispatch: Zero-argument callable returning the awaitable that performs
            the actual tool work.

    Returns:
        Whatever ``dispatch`` returns. A result with a truthy ``error``
        attribute is additionally marked as ``"api_error"`` on the span and in
        the metrics.

    Raises:
        Any exception raised by ``dispatch`` is re-raised after being
        classified (``"timeout"``, an HTTP-status-derived type, or
        ``"tool_error"``) on the span and in the metrics.
    """
    if trace is None:
        return await dispatch()

    # Function-scope import so this block does not depend on a file-level one.
    import contextlib

    span_scope = contextlib.ExitStack()
    try:
        meta = _get_meta(ctx)
        parent_ctx = _extract_trace_context(meta)
        tracer = trace.get_tracer("perfecto-mcp")
        # start_as_current_span() returns a lazy context manager: the span is
        # only started on __enter__. Entering it here, inside the guard, means
        # a span start-up failure falls back to a plain dispatch instead of
        # propagating and preventing the tool from running.
        span = span_scope.enter_context(
            tracer.start_as_current_span(
                f"tools/call {tool_name}",
                context=parent_ctx,
                kind=trace.SpanKind.SERVER,
                record_exception=False,
                set_status_on_exception=False,
            )
        )
    except Exception:
        logger.debug("OTel span setup failed; running tool without tracing", exc_info=True)
        return await dispatch()

    # Leaving this block ends the span exactly as `with span_cm as span:` did.
    with span_scope:
        try:
            span.set_attribute("mcp.method.name", "tools/call")
            span.set_attribute("gen_ai.tool.name", tool_name)
            span.set_attribute("gen_ai.operation.name", "execute_tool")
            span.set_attribute("mcp.tool.action", action)
            client_name, client_version = _get_client_info(ctx)
            if client_name is not None:
                span.set_attribute("user_agent.name", client_name)
            if client_version is not None:
                span.set_attribute("user_agent.version", client_version)
            session_id = _get_session_id(ctx)
            if session_id is not None:
                span.set_attribute("mcp.session.id", session_id)
        except Exception:
            # Attribute decoration must never block the tool call itself.
            pass

        start = time.perf_counter()
        error_type: str | None = None
        result = None
        try:
            result = await dispatch()
        except httpx.TimeoutException:
            error_type = "timeout"
            _record_span_error(span, error_type)
            raise
        except httpx.HTTPStatusError as e:
            error_type = _http_status_to_error_type(e.response.status_code)
            _record_span_error(span, error_type)
            raise
        except Exception:
            error_type = "tool_error"
            _record_span_error(span, error_type)
            raise
        finally:
            # Metrics are recorded on both the success and the failure path.
            elapsed = time.perf_counter() - start
            metric_error_type = error_type or (
                "api_error" if result is not None and getattr(result, "error", None) else None
            )
            _record_metrics(tool_name, action, elapsed, metric_error_type)

        # A returned (not raised) error is still flagged on the span.
        if result is not None and getattr(result, "error", None):
            _record_span_error(span, "api_error")
        return result
Loading