diff --git a/cmd/thv/app/audit_trifecta.go b/cmd/thv/app/audit_trifecta.go new file mode 100644 index 0000000000..0f87c76389 --- /dev/null +++ b/cmd/thv/app/audit_trifecta.go @@ -0,0 +1,385 @@ +// SPDX-FileCopyrightText: Copyright 2025 Stacklok, Inc. +// SPDX-License-Identifier: Apache-2.0 + +package app + +import ( + "encoding/json" + "fmt" + "os" + "slices" + "strings" + "text/tabwriter" + + "github.com/spf13/cobra" + + "github.com/stacklok/toolhive/pkg/config" + llmclient "github.com/stacklok/toolhive/pkg/llm/client" + "github.com/stacklok/toolhive/pkg/toxicflow" +) + +// auditTrifectaCmd is hidden while the heuristics are being calibrated. It +// audits a group (a set of MCP servers sharing one agent context) for +// "lethal trifecta" risk: the co-location of private-data access, untrusted- +// content exposure, and an exfiltration path. +var auditTrifectaCmd = &cobra.Command{ + Use: "audit-trifecta [group]", + Short: "Audit a group of MCP servers for lethal-trifecta risk (experimental)", + Hidden: true, + Long: `Audit a ToolHive group for "lethal trifecta" risk. + +The lethal trifecta is the co-location, within a single agent context, of: + - access to private data, + - exposure to untrusted content (a prompt-injection vector), and + - the ability to exfiltrate data externally. + +Because every server in a group shares the model's context, a toxic flow exists +whenever the group contains all three. This command classifies each server from +its permission profile and registry metadata and reports the group verdict. + +ToolHive can assess data access and egress with reasonable confidence (they come +from the permission profile) but untrusted-content exposure poorly; expect an +"indeterminate" verdict until you classify or override the relevant servers. + +Examples: + # Audit one group + thv audit-trifecta research + + # Audit every group as JSON + thv audit-trifecta --all --format json + + # Apply overrides for mis-classified servers + thv audit-trifecta research --overrides ./trifecta-overrides.json`, + Args: cobra.MaximumNArgs(1), + RunE: auditTrifectaCmdFunc, + ValidArgsFunction: completeTrifectaGroup, +} + +// completeTrifectaGroup completes the group argument with known group names. +func completeTrifectaGroup(cmd *cobra.Command, args []string, _ string) ([]string, cobra.ShellCompDirective) { + if len(args) != 0 { + return nil, cobra.ShellCompDirectiveNoFileComp + } + names, err := toxicflow.ListGroupNames(cmd.Context()) + if err != nil { + return nil, cobra.ShellCompDirectiveNoFileComp + } + return names, cobra.ShellCompDirectiveNoFileComp +} + +var ( + auditTrifectaAll bool + auditTrifectaFormat string + auditTrifectaExplain bool + auditTrifectaOverrides string + auditTrifectaLive bool + auditTrifectaLLMModel string + auditTrifectaLLMURL string + auditTrifectaLLMKey string + auditTrifectaLLMProxy bool +) + +// trifectaLLMKeyEnv is the fallback source for the LLM API key so it need not +// be passed on the command line. +const trifectaLLMKeyEnv = "THV_AUDIT_LLM_API_KEY" + +func init() { + AddFormatFlag(auditTrifectaCmd, &auditTrifectaFormat, FormatJSON, FormatText) + auditTrifectaCmd.Flags().BoolVar(&auditTrifectaAll, "all", false, "Audit every group") + auditTrifectaCmd.Flags().BoolVar(&auditTrifectaExplain, "explain", false, "Show the evidence behind each finding") + auditTrifectaCmd.Flags().StringVar(&auditTrifectaOverrides, "overrides", "", + "Path to a JSON file of role overrides for mis-classified servers") + auditTrifectaCmd.Flags().BoolVar(&auditTrifectaLive, "live", false, + "Probe running servers for live tool annotations (openWorldHint)") + auditTrifectaCmd.Flags().StringVar(&auditTrifectaLLMModel, "llm-model", "", + "LLM model for untrusted-content inference (falls back to keyword search if unset)") + auditTrifectaCmd.Flags().StringVar(&auditTrifectaLLMURL, "llm-base-url", "", + "OpenAI-compatible base URL for the LLM (e.g. https://api.openai.com/v1)") + auditTrifectaCmd.Flags().StringVar(&auditTrifectaLLMKey, "llm-api-key", "", + "API key for the LLM (or set "+trifectaLLMKeyEnv+")") + auditTrifectaCmd.Flags().BoolVar(&auditTrifectaLLMProxy, "llm-proxy", false, + "Use the running `thv llm` proxy for inference (handles auth; needs --llm-model)") + auditTrifectaCmd.PreRunE = ValidateFormat(&auditTrifectaFormat, FormatJSON, FormatText) +} + +// buildTrifectaInference selects the inference strategy: the `thv llm` proxy +// when --llm-proxy is set, else an LLM when a model and base URL are supplied +// (the API key is optional), else the offline keyword search. A partial LLM +// config falls back to keywords with a warning so a missing setting never +// silently disables inference the user asked for. +func buildTrifectaInference() (toxicflow.SourceInference, error) { + // Prefer the running `thv llm` proxy when asked: it injects auth and + // forwards to the configured gateway, so no key or base URL is needed here. + if auditTrifectaLLMProxy { + return llmProxyInference() + } + + key := auditTrifectaLLMKey + if key == "" { + key = os.Getenv(trifectaLLMKeyEnv) + } + model, baseURL := auditTrifectaLLMModel, auditTrifectaLLMURL + + // An explicit LLM needs a model and base URL; the key is optional (for + // keyless local models). With nothing set, fall back to keyword search. + switch { + case model == "" && baseURL == "" && key == "": + return toxicflow.NewKeywordInference(), nil + case model == "" || baseURL == "": + fmt.Fprintln(os.Stderr, + "warning: incomplete LLM config (need --llm-model and --llm-base-url, or --llm-proxy); using keyword search") + return toxicflow.NewKeywordInference(), nil + } + + c, err := llmclient.New(llmclient.Config{Model: model, BaseURL: baseURL, APIKey: key}) + if err != nil { + return nil, fmt.Errorf("configure LLM: %w", err) + } + return toxicflow.NewLLMInference(c), nil +} + +// llmProxyInference builds an inference backend that talks to the local +// `thv llm` reverse proxy. The proxy must be running (`thv llm proxy`); it +// injects the OIDC token and forwards to the configured gateway. +func llmProxyInference() (toxicflow.SourceInference, error) { + if auditTrifectaLLMModel == "" { + return nil, fmt.Errorf("--llm-model is required with --llm-proxy") + } + if auditTrifectaLLMURL != "" || auditTrifectaLLMKey != "" { + fmt.Fprintln(os.Stderr, "warning: --llm-base-url/--llm-api-key are ignored with --llm-proxy") + } + + llmCfg := config.NewDefaultProvider().GetConfig().LLM + if !llmCfg.IsConfigured() { + return nil, fmt.Errorf("no thv llm gateway configured: run \"thv llm config set\" and start the proxy with \"thv llm proxy\"") + } + + baseURL := llmCfg.ProxyBaseURL() + fmt.Fprintf(os.Stderr, "using thv llm proxy at %s (model %s)\n", baseURL, auditTrifectaLLMModel) + + c, err := llmclient.New(llmclient.Config{Model: auditTrifectaLLMModel, BaseURL: baseURL}) + if err != nil { + return nil, fmt.Errorf("configure LLM proxy: %w", err) + } + return toxicflow.NewLLMInference(c), nil +} + +func auditTrifectaCmdFunc(cmd *cobra.Command, args []string) error { + ctx := cmd.Context() + + if !auditTrifectaAll && len(args) == 0 { + return fmt.Errorf("specify a group name or use --all") + } + + overrides, err := loadTrifectaOverrides(auditTrifectaOverrides) + if err != nil { + return err + } + + inference, err := buildTrifectaInference() + if err != nil { + return err + } + + collector, err := toxicflow.NewCollector(ctx, inference, auditTrifectaLive) + if err != nil { + return fmt.Errorf("failed to set up audit: %w", err) + } + + // Resolve the groups to audit. Validate a named group exists so a typo is + // reported as "not found" rather than silently auditing zero servers and + // reading as a reassuring "no toxic flow". + known, err := toxicflow.ListGroupNames(ctx) + if err != nil { + return err + } + groupNames := known + if !auditTrifectaAll { + if !slices.Contains(known, args[0]) { + return fmt.Errorf("group %q not found (run \"thv group list\")", args[0]) + } + groupNames = []string{args[0]} + } + + results := make([]toxicflow.GroupAssessment, 0, len(groupNames)) + for _, g := range groupNames { + assessment, err := collector.AssessGroup(ctx, g, overrides) + if err != nil { + return fmt.Errorf("failed to audit group %q: %w", g, err) + } + results = append(results, assessment) + } + + warnUnknownOverrideTargets(overrides, results) + + if auditTrifectaFormat == FormatJSON { + return printTrifectaJSON(results) + } + printTrifectaText(results) + return nil +} + +// loadTrifectaOverrides reads a JSON array of overrides from path, or returns +// nil when no path is given. +func loadTrifectaOverrides(path string) ([]toxicflow.Override, error) { + if path == "" { + return nil, nil + } + data, err := os.ReadFile(path) // #nosec G304 -- path is an operator-supplied CLI flag + if err != nil { + return nil, fmt.Errorf("failed to read overrides file: %w", err) + } + var overrides []toxicflow.Override + if err := json.Unmarshal(data, &overrides); err != nil { + return nil, fmt.Errorf("failed to parse overrides file: %w", err) + } + for _, o := range overrides { + if err := toxicflow.ValidateOverride(o); err != nil { + return nil, err + } + } + return overrides, nil +} + +// warnUnknownOverrideTargets prints a warning for any override that names a +// server not present in the audited groups, so a typo does not silently +// fail open. +func warnUnknownOverrideTargets(overrides []toxicflow.Override, results []toxicflow.GroupAssessment) { + seen := map[string]bool{} + for _, a := range results { + for _, s := range a.Servers { + seen[s.Name] = true + } + } + for _, o := range overrides { + if o.Server != "" && !seen[o.Server] { + fmt.Fprintf(os.Stderr, "warning: override targets server %q, which is not in the audited group(s)\n", o.Server) + } + } +} + +func printTrifectaJSON(results []toxicflow.GroupAssessment) error { + if results == nil { + results = []toxicflow.GroupAssessment{} + } + enc := json.NewEncoder(os.Stdout) + enc.SetIndent("", " ") + return enc.Encode(results) +} + +func printTrifectaText(results []toxicflow.GroupAssessment) { + if len(results) == 0 { + fmt.Println("No groups to audit.") + return + } + for i, a := range results { + if i > 0 { + fmt.Println() + } + printGroupAssessment(a) + } +} + +func printGroupAssessment(a toxicflow.GroupAssessment) { + fmt.Printf("Group: %s verdict: %s\n", a.Group, strings.ToUpper(string(a.Verdict))) + + if len(a.Servers) == 0 { + fmt.Println(" (no servers in group)") + return + } + + overridden := false + w := tabwriter.NewWriter(os.Stdout, 0, 2, 2, ' ', 0) + _, _ = fmt.Fprintln(w, " SERVER\tDATA\tSOURCE\tSINK") + for _, s := range a.Servers { + _, _ = fmt.Fprintf(w, " %s\t%s\t%s\t%s\n", + s.Name, + cell(s.Finding(toxicflow.RoleData)), + cell(s.Finding(toxicflow.RoleSource)), + cell(s.Finding(toxicflow.RoleSink))) + overridden = overridden || anyOverridden(s) + } + _ = w.Flush() + + if overridden { + fmt.Println(" * value set by override — re-run with --explain for the reason") + } + + if auditTrifectaExplain { + printTrifectaEvidence(a) + } + + fmt.Printf(" Private data : %s\n", joinOrNone(a.DataHolders)) + fmt.Printf(" Untrusted intake: %s\n", joinOrNone(a.Sources)) + fmt.Printf(" Exfil sink : %s\n", joinOrNone(a.Sinks)) + if len(a.Unclassified) > 0 { + fmt.Printf(" Unclassified : %s (untrusted-content exposure unknown)\n", + strings.Join(a.Unclassified, ", ")) + } + + fmt.Printf("\n %s\n", trifectaVerdictMessage(a)) +} + +func printTrifectaEvidence(a toxicflow.GroupAssessment) { + fmt.Println(" evidence:") + for _, s := range a.Servers { + for _, role := range toxicflow.AllRoles { + f := s.Finding(role) + for _, ev := range f.Evidence { + fmt.Printf(" %s [%s]: %s\n", s.Name, role, ev) + } + } + } +} + +// trifectaVerdictMessage returns an actionable one-liner for the verdict. +func trifectaVerdictMessage(a toxicflow.GroupAssessment) string { + switch a.Verdict { + case toxicflow.VerdictPresent: + if len(a.SelfContainedFlow) > 0 { + return fmt.Sprintf("WARNING: Toxic flow present. %s hold(s) all three roles alone — "+ + "a prompt injection there could read private data and exfiltrate it. "+ + "Cheapest fix: tighten that server's permission profile (restrict egress, "+ + "drop mounts), no regrouping needed.", strings.Join(a.SelfContainedFlow, ", ")) + } + return "WARNING: Toxic flow present. A prompt injection via an untrusted-content " + + "server could read private data and exfiltrate it. Split the group so the " + + "untrusted-content server no longer shares a context with the data/exfil servers." + case toxicflow.VerdictPossible: + return "WARNING: Possible toxic flow. All three roles are present, some at low " + + "confidence. Review the contributing servers above." + case toxicflow.VerdictIndeterminate: + return "REVIEW: Indeterminate. Data and exfil paths exist, but untrusted-content " + + "exposure could not be determined. Classify or override the unclassified " + + "servers to resolve." + case toxicflow.VerdictNone: + return "OK: No toxic flow. At least one leg is confidently absent." + default: + return "" + } +} + +// cell renders a finding's confidence for the table, marking overrides with a *. +func cell(f toxicflow.RoleFinding) string { + if f.Overridden { + return string(f.Confidence) + "*" + } + return string(f.Confidence) +} + +// anyOverridden reports whether any of the server's findings was overridden. +func anyOverridden(s toxicflow.ServerAssessment) bool { + for _, role := range toxicflow.AllRoles { + if s.Finding(role).Overridden { + return true + } + } + return false +} + +func joinOrNone(items []string) string { + if len(items) == 0 { + return "(none)" + } + return strings.Join(items, ", ") +} diff --git a/cmd/thv/app/commands.go b/cmd/thv/app/commands.go index c575c00de0..03e1857f49 100644 --- a/cmd/thv/app/commands.go +++ b/cmd/thv/app/commands.go @@ -76,6 +76,7 @@ func NewRootCmd(enableUpdates bool) *cobra.Command { rootCmd.AddCommand(skillCmd) rootCmd.AddCommand(statusCmd) rootCmd.AddCommand(tuiCmd) + rootCmd.AddCommand(auditTrifectaCmd) // Silence printing the usage on error rootCmd.SilenceUsage = true diff --git a/pkg/llm/client/client.go b/pkg/llm/client/client.go new file mode 100644 index 0000000000..ea6d47a2cf --- /dev/null +++ b/pkg/llm/client/client.go @@ -0,0 +1,129 @@ +// SPDX-FileCopyrightText: Copyright 2026 Stacklok, Inc. +// SPDX-License-Identifier: Apache-2.0 + +// Package client is a minimal, dependency-free client for OpenAI-compatible +// chat-completion endpoints. It targets the widely-implemented +// POST /chat/completions shape, so it works with OpenAI, most LLM gateways, +// and local servers (Ollama, vLLM, llama.cpp) by varying the base URL — which +// also lets callers keep traffic on-host when sending data to a model is a +// concern. +package client + +import ( + "bytes" + "context" + "encoding/json" + "errors" + "fmt" + "io" + "net/http" + "strings" + "time" +) + +// Config configures a Client. Model and BaseURL are required; APIKey is +// optional so the client can target an auth-injecting proxy (e.g. the +// `thv llm` localhost proxy) or a keyless local model. +type Config struct { + // Model is the model identifier (e.g. "gpt-4o-mini"). + Model string + // BaseURL is the API root (e.g. "https://api.openai.com/v1"); the client + // appends "/chat/completions". + BaseURL string + // APIKey, when non-empty, is sent as a Bearer token. Leave empty when the + // endpoint injects auth itself (a proxy) or needs none (a local model). + APIKey string + // HTTPClient is optional; a 30s-timeout client is used when nil. + HTTPClient *http.Client +} + +// Client calls an OpenAI-compatible chat-completions endpoint. +type Client struct { + model string + baseURL string + apiKey string + http *http.Client +} + +// New validates the config and returns a Client. +func New(cfg Config) (*Client, error) { + if cfg.Model == "" { + return nil, errors.New("llm client: model is required") + } + if cfg.BaseURL == "" { + return nil, errors.New("llm client: base URL is required") + } + hc := cfg.HTTPClient + if hc == nil { + hc = &http.Client{Timeout: 30 * time.Second} + } + return &Client{ + model: cfg.Model, + baseURL: strings.TrimRight(cfg.BaseURL, "/"), + apiKey: cfg.APIKey, + http: hc, + }, nil +} + +type chatMessage struct { + Role string `json:"role"` + Content string `json:"content"` +} + +type chatRequest struct { + Model string `json:"model"` + Messages []chatMessage `json:"messages"` + Temperature float64 `json:"temperature"` +} + +type chatResponse struct { + Choices []struct { + Message chatMessage `json:"message"` + } `json:"choices"` +} + +// Complete sends a single-turn (system + user) chat completion and returns the +// assistant message content. Temperature is fixed at 0 for reproducibility. +func (c *Client) Complete(ctx context.Context, system, user string) (string, error) { + payload, err := json.Marshal(chatRequest{ + Model: c.model, + Messages: []chatMessage{ + {Role: "system", Content: system}, + {Role: "user", Content: user}, + }, + Temperature: 0, + }) + if err != nil { + return "", fmt.Errorf("marshal request: %w", err) + } + + req, err := http.NewRequestWithContext(ctx, http.MethodPost, c.baseURL+"/chat/completions", bytes.NewReader(payload)) + if err != nil { + return "", fmt.Errorf("build request: %w", err) + } + req.Header.Set("Content-Type", "application/json") + if c.apiKey != "" { + req.Header.Set("Authorization", "Bearer "+c.apiKey) + } + + resp, err := c.http.Do(req) + if err != nil { + return "", fmt.Errorf("call llm: %w", err) + } + defer func() { _ = resp.Body.Close() }() + + if resp.StatusCode != http.StatusOK { + body, _ := io.ReadAll(io.LimitReader(resp.Body, 2048)) + _, _ = io.Copy(io.Discard, resp.Body) // drain remainder so the connection can be reused + return "", fmt.Errorf("llm returned status %d: %s", resp.StatusCode, strings.TrimSpace(string(body))) + } + + var parsed chatResponse + if err := json.NewDecoder(resp.Body).Decode(&parsed); err != nil { + return "", fmt.Errorf("decode response: %w", err) + } + if len(parsed.Choices) == 0 { + return "", errors.New("llm returned no choices") + } + return parsed.Choices[0].Message.Content, nil +} diff --git a/pkg/llm/client/client_test.go b/pkg/llm/client/client_test.go new file mode 100644 index 0000000000..33759ca5cd --- /dev/null +++ b/pkg/llm/client/client_test.go @@ -0,0 +1,101 @@ +// SPDX-FileCopyrightText: Copyright 2026 Stacklok, Inc. +// SPDX-License-Identifier: Apache-2.0 + +package client + +import ( + "encoding/json" + "io" + "net/http" + "net/http/httptest" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestNewValidation(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + cfg Config + wantErr bool + }{ + {"valid", Config{Model: "m", BaseURL: "https://x/v1", APIKey: "k"}, false}, + {"key is optional (proxy/local model)", Config{Model: "m", BaseURL: "https://x/v1"}, false}, + {"missing model", Config{BaseURL: "https://x/v1", APIKey: "k"}, true}, + {"missing base url", Config{Model: "m", APIKey: "k"}, true}, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + _, err := New(tt.cfg) + if tt.wantErr { + require.Error(t, err) + return + } + require.NoError(t, err) + }) + } +} + +func TestComplete(t *testing.T) { + t.Parallel() + + var gotPath, gotAuth, gotModel string + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + gotPath = r.URL.Path + gotAuth = r.Header.Get("Authorization") + body, _ := io.ReadAll(r.Body) + var req map[string]any + _ = json.Unmarshal(body, &req) + gotModel, _ = req["model"].(string) + _, _ = io.WriteString(w, `{"choices":[{"message":{"role":"assistant","content":"hello world"}}]}`) + })) + t.Cleanup(srv.Close) + + c, err := New(Config{Model: "gpt-test", BaseURL: srv.URL, APIKey: "secret", HTTPClient: srv.Client()}) + require.NoError(t, err) + + out, err := c.Complete(t.Context(), "system", "user") + require.NoError(t, err) + assert.Equal(t, "hello world", out) + assert.Equal(t, "/chat/completions", gotPath) + assert.Equal(t, "Bearer secret", gotAuth) + assert.Equal(t, "gpt-test", gotModel) +} + +func TestCompleteNoKeyOmitsAuthHeader(t *testing.T) { + t.Parallel() + + var hadAuth bool + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + _, hadAuth = r.Header["Authorization"] + _, _ = io.WriteString(w, `{"choices":[{"message":{"content":"ok"}}]}`) + })) + t.Cleanup(srv.Close) + + c, err := New(Config{Model: "m", BaseURL: srv.URL, HTTPClient: srv.Client()}) + require.NoError(t, err) + + _, err = c.Complete(t.Context(), "s", "u") + require.NoError(t, err) + assert.False(t, hadAuth, "no Authorization header should be sent without an API key") +} + +func TestCompleteErrorStatus(t *testing.T) { + t.Parallel() + + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + w.WriteHeader(http.StatusInternalServerError) + _, _ = io.WriteString(w, "boom") + })) + t.Cleanup(srv.Close) + + c, err := New(Config{Model: "m", BaseURL: srv.URL, APIKey: "k", HTTPClient: srv.Client()}) + require.NoError(t, err) + + _, err = c.Complete(t.Context(), "s", "u") + require.Error(t, err) +} diff --git a/pkg/llm/config.go b/pkg/llm/config.go index 9c2a0fb810..2f9f03af06 100644 --- a/pkg/llm/config.go +++ b/pkg/llm/config.go @@ -119,3 +119,10 @@ func (c *Config) EffectiveProxyPort() int { } return DefaultProxyListenPort } + +// ProxyBaseURL returns the OpenAI-compatible base URL of the localhost proxy +// (e.g. "http://localhost:14000/v1"). Callers append the API path. This is the +// single source of truth for the proxy's loopback base URL. +func (c *Config) ProxyBaseURL() string { + return fmt.Sprintf("http://localhost:%d/v1", c.EffectiveProxyPort()) +} diff --git a/pkg/llm/setup.go b/pkg/llm/setup.go index aa2d9db018..b1aa9a2100 100644 --- a/pkg/llm/setup.go +++ b/pkg/llm/setup.go @@ -82,7 +82,7 @@ func Setup( return err } - proxyBaseURL := fmt.Sprintf("http://localhost:%d/v1", llmCfg.EffectiveProxyPort()) + proxyBaseURL := llmCfg.ProxyBaseURL() // Detect tools before login so we skip the interactive browser flow when // there is nothing to configure. Login still runs before any files are diff --git a/pkg/toxicflow/analyze.go b/pkg/toxicflow/analyze.go new file mode 100644 index 0000000000..298ba1d37f --- /dev/null +++ b/pkg/toxicflow/analyze.go @@ -0,0 +1,88 @@ +// SPDX-FileCopyrightText: Copyright 2025 Stacklok, Inc. +// SPDX-License-Identifier: Apache-2.0 + +package toxicflow + +// AnalyzeGroup folds per-server assessments into a group verdict. Because every +// server in a group shares the one agent context, a toxic flow exists whenever +// the group contains all three roles; the verdict reflects the weakest of the +// three (the flow is only as strong as its least-confident leg). It is a pure +// function. +func AnalyzeGroup(group string, assessments []ServerAssessment) GroupAssessment { + ga := GroupAssessment{Group: group, Servers: assessments} + + best := map[Role]Confidence{ + RoleData: ConfNone, + RoleSource: ConfNone, + RoleSink: ConfNone, + } + + for _, s := range assessments { + for _, role := range AllRoles { + f := s.Finding(role) + if f.Confidence.rank() > best[role].rank() { + best[role] = f.Confidence + } + if f.Confidence.atOrAbove(ConfPossible) { + switch role { + case RoleData: + ga.DataHolders = append(ga.DataHolders, s.Name) + case RoleSource: + ga.Sources = append(ga.Sources, s.Name) + case RoleSink: + ga.Sinks = append(ga.Sinks, s.Name) + } + } + } + if hasUnknownRole(s) { + ga.Unclassified = append(ga.Unclassified, s.Name) + } + if holdsAllRoles(s) { + ga.SelfContainedFlow = append(ga.SelfContainedFlow, s.Name) + } + } + + ga.Verdict = verdict(best[RoleData], best[RoleSource], best[RoleSink]) + return ga +} + +// verdict derives the group conclusion from the strongest confidence found for +// each role. The flow is gated by its weakest leg: +// - all legs likely -> present +// - all legs >= possible -> possible +// - weakest leg unknown -> indeterminate (cannot confirm or rule out) +// - any leg confidently none -> none (flow broken) +func verdict(data, source, sink Confidence) Verdict { + weakest := min(data.rank(), source.rank(), sink.rank()) + switch { + case weakest >= ConfLikely.rank(): + return VerdictPresent + case weakest >= ConfPossible.rank(): + return VerdictPossible + case weakest >= ConfUnknown.rank(): + return VerdictIndeterminate + default: + return VerdictNone + } +} + +// hasUnknownRole reports whether any of the server's roles could not be assessed. +func hasUnknownRole(s ServerAssessment) bool { + for _, role := range AllRoles { + if s.Finding(role).Confidence == ConfUnknown { + return true + } + } + return false +} + +// holdsAllRoles reports whether a single server holds all three roles at +// possible-or-likely confidence, forming a toxic flow on its own. +func holdsAllRoles(s ServerAssessment) bool { + for _, role := range AllRoles { + if !s.Finding(role).Confidence.atOrAbove(ConfPossible) { + return false + } + } + return true +} diff --git a/pkg/toxicflow/analyze_test.go b/pkg/toxicflow/analyze_test.go new file mode 100644 index 0000000000..54a7278e8c --- /dev/null +++ b/pkg/toxicflow/analyze_test.go @@ -0,0 +1,127 @@ +// SPDX-FileCopyrightText: Copyright 2025 Stacklok, Inc. +// SPDX-License-Identifier: Apache-2.0 + +package toxicflow + +import ( + "testing" + + "github.com/stretchr/testify/assert" +) + +// sa builds a server assessment with explicit confidences for each role. +func sa(name string, data, source, sink Confidence) ServerAssessment { + return ServerAssessment{Name: name, Findings: map[Role]RoleFinding{ + RoleData: {Role: RoleData, Confidence: data}, + RoleSource: {Role: RoleSource, Confidence: source}, + RoleSink: {Role: RoleSink, Confidence: sink}, + }} +} + +func TestAnalyzeGroupVerdict(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + assessments []ServerAssessment + want Verdict + }{ + { + name: "all roles likely on one server is present", + assessments: []ServerAssessment{sa("all", ConfLikely, ConfLikely, ConfLikely)}, + want: VerdictPresent, + }, + { + name: "roles split across servers still forms present", + assessments: []ServerAssessment{ + sa("data-and-sink", ConfLikely, ConfNone, ConfLikely), + sa("source", ConfNone, ConfLikely, ConfNone), + }, + want: VerdictPresent, + }, + { + name: "weakest leg only possible yields possible", + assessments: []ServerAssessment{ + sa("data-sink", ConfLikely, ConfNone, ConfLikely), + sa("weak-source", ConfNone, ConfPossible, ConfNone), + }, + want: VerdictPossible, + }, + { + name: "unknown source with data and sink yields indeterminate", + assessments: []ServerAssessment{ + sa("github", ConfLikely, ConfUnknown, ConfLikely), + }, + want: VerdictIndeterminate, + }, + { + name: "confident absence of a leg yields none", + assessments: []ServerAssessment{ + sa("data-sink", ConfLikely, ConfNone, ConfLikely), + sa("inert", ConfNone, ConfNone, ConfNone), + }, + want: VerdictNone, + }, + { + name: "empty group is none", + assessments: nil, + want: VerdictNone, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + got := AnalyzeGroup("g", tt.assessments) + assert.Equal(t, tt.want, got.Verdict) + }) + } +} + +func TestAnalyzeGroupContributors(t *testing.T) { + t.Parallel() + + got := AnalyzeGroup("research", []ServerAssessment{ + sa("notes", ConfLikely, ConfNone, ConfNone), // data only + sa("web-fetch", ConfNone, ConfLikely, ConfLikely), // source + sink + sa("calc", ConfNone, ConfNone, ConfNone), // nothing + }) + + assert.Equal(t, []string{"notes"}, got.DataHolders) + assert.Equal(t, []string{"web-fetch"}, got.Sources) + assert.Equal(t, []string{"web-fetch"}, got.Sinks) + assert.Empty(t, got.Unclassified) +} + +func TestAnalyzeGroupSelfContainedFlow(t *testing.T) { + t.Parallel() + + // One server holds all three roles: a self-contained flow. + self := AnalyzeGroup("solo", []ServerAssessment{ + sa("kitchen-sink", ConfLikely, ConfLikely, ConfLikely), + sa("inert", ConfNone, ConfNone, ConfNone), + }) + assert.Equal(t, VerdictPresent, self.Verdict) + assert.Equal(t, []string{"kitchen-sink"}, self.SelfContainedFlow) + + // Flow spread across three single-role servers: no self-contained flow. + spread := AnalyzeGroup("spread", []ServerAssessment{ + sa("d", ConfLikely, ConfNone, ConfNone), + sa("s", ConfNone, ConfLikely, ConfNone), + sa("k", ConfNone, ConfNone, ConfLikely), + }) + assert.Equal(t, VerdictPresent, spread.Verdict) + assert.Empty(t, spread.SelfContainedFlow) +} + +func TestAnalyzeGroupUnclassified(t *testing.T) { + t.Parallel() + + got := AnalyzeGroup("mixed", []ServerAssessment{ + sa("github", ConfLikely, ConfUnknown, ConfLikely), + sa("notes", ConfLikely, ConfNone, ConfNone), + }) + + assert.Equal(t, VerdictIndeterminate, got.Verdict) + assert.Equal(t, []string{"github"}, got.Unclassified) +} diff --git a/pkg/toxicflow/classify.go b/pkg/toxicflow/classify.go new file mode 100644 index 0000000000..8e261dc23e --- /dev/null +++ b/pkg/toxicflow/classify.go @@ -0,0 +1,327 @@ +// SPDX-FileCopyrightText: Copyright 2025 Stacklok, Inc. +// SPDX-License-Identifier: Apache-2.0 + +package toxicflow + +import ( + "fmt" + "maps" + "slices" + "strings" + + registrytypes "github.com/stacklok/toolhive-core/registry/types" + "github.com/stacklok/toolhive/pkg/authz/authorizers" + "github.com/stacklok/toolhive/pkg/runner" +) + +// ClassifyInput carries everything Classify needs for one server. All fields +// are optional: with no inputs the data and sink roles resolve to "unknown". +type ClassifyInput struct { + // Name is the workload name. + Name string + // Config is the saved RunConfig; nil when it could not be loaded. + Config *runner.RunConfig + // Metadata is the server's registry metadata; nil when unavailable. + Metadata registrytypes.ServerMetadata + // Annotations maps tool name to MCP annotations harvested from a live + // tools/list (used by --live); nil for static assessment. + Annotations map[string]*authorizers.ToolAnnotations + // Hints are raise-only suggestions from a SourceInference (keyword or LLM), + // folded after the profile-derived baseline and before overrides. + Hints []Hint + // Overrides force role confidence for this server. + Overrides []Override +} + +// signal is a single piece of evidence supporting a role at some confidence. +type signal struct { + conf Confidence + reason string +} + +// untrustedTagKeywords mark registry tags that suggest a server ingests +// content the caller does not control (a prompt-injection vector). +var untrustedTagKeywords = []string{ + "web", "fetch", "browser", "search", "email", "rss", + "scrape", "crawl", "http", "social", "news", "feed", +} + +// untrustedToolKeywords mark tool-name fragments suggesting the same. +var untrustedToolKeywords = []string{ + "fetch", "browse", "search", "crawl", "scrape", "navigate", + "read_url", "get_url", "read_email", "get_issue", "get_page", + "download", "url", +} + +// untrustedTextKeywords are whole words in a server's registry description or +// overview that suggest untrusted-content ingestion. Matched on word +// boundaries (not substrings) and only ever raise the source role to +// "possible" — free-text is non-authoritative. +var untrustedTextKeywords = map[string]struct{}{ + "web": {}, "internet": {}, "url": {}, "urls": {}, "fetch": {}, + "fetches": {}, "fetching": {}, "browse": {}, "browsing": {}, "browser": {}, + "scrape": {}, "scraping": {}, "crawl": {}, "crawling": {}, "rss": {}, + "email": {}, "emails": {}, "website": {}, "websites": {}, "search": {}, +} + +// Classify assesses a single server's role in a potential toxic flow. It is a +// pure function: given the same inputs it always produces the same assessment. +func Classify(in ClassifyInput) ServerAssessment { + findings := map[Role]RoleFinding{ + RoleData: classifyData(in), + RoleSource: classifySource(in), + RoleSink: classifySink(in), + } + // Inference hints fold raise-only (they can only strengthen a leg); explicit + // overrides are authoritative and applied last so they can also weaken one. + applyHints(findings, in.Hints) + applyOverrides(findings, in.Name, in.Overrides) + return ServerAssessment{Name: in.Name, Findings: findings} +} + +// classifyData looks for access to private or sensitive data. The permission +// profile (filesystem reads, injected secrets) is the authoritative signal, so +// a confident "none" requires the run config: without it the role is "unknown" +// rather than a misleading absence. Remoteness is deliberately NOT a data +// signal — it is egress (a sink signal); treating it as data too would +// double-count remote servers into two legs. +func classifyData(in ClassifyInput) RoleFinding { + var sigs []signal + + if in.Config != nil { + if p := in.Config.PermissionProfile; p != nil && len(p.Read) > 0 { + sigs = append(sigs, signal{ConfLikely, + fmt.Sprintf("filesystem read access (%d mount(s))", len(p.Read))}) + } + if len(in.Config.Secrets) > 0 { + sigs = append(sigs, signal{ConfLikely, + fmt.Sprintf("%d secret(s) injected", len(in.Config.Secrets))}) + } + } + + if in.Metadata != nil { + for _, ev := range in.Metadata.GetEnvVars() { + if ev != nil && ev.Secret { + sigs = append(sigs, signal{ConfLikely, + fmt.Sprintf("requires secret env var %q", ev.Name)}) + } + } + } + + if len(sigs) > 0 { + return maxFinding(RoleData, sigs) + } + // Mounts and injected secrets live on the run config; only with it can we + // assert a confident absence. Metadata alone is not enough. + if in.Config != nil { + return RoleFinding{Role: RoleData, Confidence: ConfNone} + } + return RoleFinding{Role: RoleData, Confidence: ConfUnknown} +} + +// classifySink looks for the ability to communicate externally. ToolHive's +// runtime is open-egress-by-default: a nil permission profile, nil network +// policy, or nil outbound policy all resolve to unrestricted egress at runtime +// (see pkg/container/docker/squid.go and the default network profile). The +// sink role therefore reaches a confident "none" ONLY on positive evidence of +// closed egress; any unspecified policy is treated as open, never as safe. +func classifySink(in ClassifyInput) RoleFinding { + // Without a run config, egress is unknowable from here. + if in.Config == nil { + return RoleFinding{Role: RoleSink, Confidence: ConfUnknown} + } + + var sigs []signal + if in.Config.RemoteURL != "" { + sigs = append(sigs, signal{ConfLikely, + "remote server sends requests to an external endpoint"}) + } + + switch p := in.Config.PermissionProfile; p { + case nil: + sigs = append(sigs, signal{ConfPossible, "no permission profile; runtime egress is open by default"}) + default: + if p.Privileged { + sigs = append(sigs, signal{ConfLikely, "privileged container (network controls bypassable)"}) + } + switch n := p.Network; n { + case nil: + sigs = append(sigs, signal{ConfPossible, "network policy unspecified; runtime egress is open by default"}) + default: + if n.Mode == "host" { + sigs = append(sigs, signal{ConfLikely, "host network mode (egress not controlled)"}) + } + switch o := n.Outbound; { + case o == nil: + sigs = append(sigs, signal{ConfPossible, "outbound policy unspecified; runtime egress is open by default"}) + case o.InsecureAllowAll: + sigs = append(sigs, signal{ConfLikely, "unrestricted outbound network access"}) + case len(o.AllowHost) > 0 || len(o.AllowPort) > 0: + sigs = append(sigs, signal{ConfPossible, + fmt.Sprintf("restricted outbound access (%d host(s), %d port(s))", len(o.AllowHost), len(o.AllowPort))}) + } + // Outbound present with no allow rules and not insecure-allow-all + // is the only positive evidence of closed egress: no signal added. + } + } + + if len(sigs) == 0 { + return RoleFinding{Role: RoleSink, Confidence: ConfNone, + Evidence: []string{"outbound network access denied by permission profile"}} + } + return maxFinding(RoleSink, sigs) +} + +// classifySource looks for exposure to untrusted content from the one signal it +// can trust as evidence of presence: a live tools/list openWorldHint. Tag, +// tool, and description heuristics and LLM judgements are non-authoritative and +// arrive separately as raise-only Hints (see SourceInference / applyHints). +// +// Source classification is raise-only: it never returns a confident "none". +// Untrusted-content exposure cannot be ruled out from a server's own tool list, +// because annotations are advisory and a compromised server can simply +// under-report openWorldHint to hide. Only an explicit operator override may +// assert source "none". +func classifySource(in ClassifyInput) RoleFinding { + var sigs []signal + + // Iterate annotations in sorted key order so evidence is deterministic. + for _, name := range slices.Sorted(maps.Keys(in.Annotations)) { + a := in.Annotations[name] + if a != nil && a.OpenWorldHint != nil && *a.OpenWorldHint { + sigs = append(sigs, signal{ConfLikely, fmt.Sprintf("tool %q has openWorldHint", name)}) + } + } + + if len(sigs) > 0 { + return maxFinding(RoleSource, sigs) + } + return RoleFinding{Role: RoleSource, Confidence: ConfUnknown} +} + +// maxFinding folds a non-empty signal set into a finding, taking the strongest +// confidence and collecting every reason as evidence. Each classifier decides +// its own "no signals" semantics (confident none vs unknown) before calling +// this, because that decision differs per role. +func maxFinding(role Role, sigs []signal) RoleFinding { + best := ConfNone + ev := make([]string, 0, len(sigs)) + for _, s := range sigs { + ev = append(ev, s.reason) + if s.conf.rank() > best.rank() { + best = s.conf + } + } + return RoleFinding{Role: role, Confidence: best, Evidence: ev} +} + +// applyHints folds raise-only inference hints into the findings. A hint takes +// effect only when it would strengthen the role's confidence, so hints can +// never weaken a leg or manufacture a confident "none". The hint reason is +// prepended to the evidence. +func applyHints(findings map[Role]RoleFinding, hints []Hint) { + for _, h := range hints { + if !isRole(h.Role) { + continue + } + // Clamp at the choke point: inference backends (keyword or LLM) are + // non-authoritative, so no hint may push a leg past "possible" — only + // the structured openWorldHint signal reaches "likely". This enforces + // the cap structurally rather than trusting each backend to honor it. + conf := h.Confidence + if conf.rank() > ConfPossible.rank() { + conf = ConfPossible + } + prev := findings[h.Role] + // Raise-only: a hint applies only when it strictly strengthens the leg. + // Hints run before overrides (see Classify), so prev is never overridden. + if conf.rank() <= prev.Confidence.rank() { + continue + } + findings[h.Role] = RoleFinding{ + Role: h.Role, + Confidence: conf, + Evidence: append([]string{h.Reason}, prev.Evidence...), + } + } +} + +// applyOverrides replaces findings for the named server with explicit overrides, +// recording the reason as top-priority evidence. +func applyOverrides(findings map[Role]RoleFinding, server string, overrides []Override) { + for _, ov := range overrides { + if ov.Server != "" && ov.Server != server { + continue + } + if !isRole(ov.Role) { + continue + } + reason := ov.Reason + if reason == "" { + reason = "(no reason given)" + } + prev := findings[ov.Role] + ev := append([]string{fmt.Sprintf("overridden to %s: %s", ov.Confidence, reason)}, prev.Evidence...) + findings[ov.Role] = RoleFinding{ + Role: ov.Role, + Confidence: ov.Confidence, + Evidence: ev, + Overridden: true, + } + } +} + +// matchKeyword returns the first value containing any keyword (case-insensitive), +// or "" if none match. +func matchKeyword(values, keywords []string) string { + for _, v := range values { + lower := strings.ToLower(v) + for _, kw := range keywords { + if strings.Contains(lower, kw) { + return v + } + } + } + return "" +} + +// matchWord returns the first whole word in text that is in the keyword set +// (case-insensitive), or "" if none match. Word-boundary matching avoids +// substring false positives (e.g. "web" inside "cobweb"). +func matchWord(text string, keywords map[string]struct{}) string { + for _, field := range strings.Fields(strings.ToLower(text)) { + word := strings.Trim(field, ".,;:!?()[]{}\"'`") + if _, ok := keywords[word]; ok { + return word + } + } + return "" +} + +// isRole reports whether r is one of the three recognized roles. +func isRole(r Role) bool { + return r == RoleData || r == RoleSource || r == RoleSink +} + +// isConfidence reports whether c is one of the four recognized levels. +func isConfidence(c Confidence) bool { + return c == ConfNone || c == ConfUnknown || c == ConfPossible || c == ConfLikely +} + +// ValidateOverride checks that an override is well-formed. A parse success only +// means valid JSON; an unknown confidence would otherwise map to ConfNone and +// silently zero a leg (the most dangerous direction for a security advisory), +// so reject unknown values loudly. A reason is required for auditability. +func ValidateOverride(o Override) error { + if !isRole(o.Role) { + return fmt.Errorf("override for server %q: invalid role %q (want data, source, or sink)", o.Server, o.Role) + } + if !isConfidence(o.Confidence) { + return fmt.Errorf("override for server %q: invalid confidence %q (want none, unknown, possible, or likely)", + o.Server, o.Confidence) + } + if strings.TrimSpace(o.Reason) == "" { + return fmt.Errorf("override for server %q role %q: a reason is required", o.Server, o.Role) + } + return nil +} diff --git a/pkg/toxicflow/classify_test.go b/pkg/toxicflow/classify_test.go new file mode 100644 index 0000000000..9e415380e4 --- /dev/null +++ b/pkg/toxicflow/classify_test.go @@ -0,0 +1,306 @@ +// SPDX-FileCopyrightText: Copyright 2025 Stacklok, Inc. +// SPDX-License-Identifier: Apache-2.0 + +package toxicflow + +import ( + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "github.com/stacklok/toolhive-core/permissions" + registrytypes "github.com/stacklok/toolhive-core/registry/types" + "github.com/stacklok/toolhive/pkg/authz/authorizers" + "github.com/stacklok/toolhive/pkg/runner" +) + +func boolPtr(b bool) *bool { return &b } + +// noEgressProfile is the equivalent of the built-in "none" profile: a profile +// that was inspected and grants no outbound access. +func noEgressProfile() *permissions.Profile { + return &permissions.Profile{ + Network: &permissions.NetworkPermissions{ + Outbound: &permissions.OutboundNetworkPermissions{}, + }, + } +} + +func imageMeta(tags, tools []string) registrytypes.ServerMetadata { + return ®istrytypes.ImageMetadata{ + BaseServerMetadata: registrytypes.BaseServerMetadata{ + Tags: tags, + Tools: tools, + }, + } +} + +func TestClassify(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + input ClassifyInput + want map[Role]Confidence + wantOvr map[Role]bool // roles expected to be marked Overridden + }{ + { + name: "filesystem read mount marks data likely", + input: ClassifyInput{ + Name: "fs", + Config: &runner.RunConfig{PermissionProfile: &permissions.Profile{ + Read: []permissions.MountDeclaration{"/home/user/notes:/notes"}, + Network: &permissions.NetworkPermissions{Outbound: &permissions.OutboundNetworkPermissions{}}, + }}, + }, + want: map[Role]Confidence{RoleData: ConfLikely, RoleSink: ConfNone, RoleSource: ConfUnknown}, + }, + { + name: "injected secrets mark data likely", + input: ClassifyInput{ + Name: "svc", + Config: &runner.RunConfig{Secrets: []string{"api-key,target=API_KEY"}, PermissionProfile: noEgressProfile()}, + }, + want: map[Role]Confidence{RoleData: ConfLikely, RoleSink: ConfNone, RoleSource: ConfUnknown}, + }, + { + name: "no-egress profile yields confident none for data and sink", + input: ClassifyInput{ + Name: "inert", + Config: &runner.RunConfig{PermissionProfile: noEgressProfile()}, + }, + want: map[Role]Confidence{RoleData: ConfNone, RoleSink: ConfNone, RoleSource: ConfUnknown}, + }, + { + name: "unrestricted egress marks sink likely", + input: ClassifyInput{ + Name: "open", + Config: &runner.RunConfig{PermissionProfile: &permissions.Profile{ + Network: &permissions.NetworkPermissions{Outbound: &permissions.OutboundNetworkPermissions{InsecureAllowAll: true}}, + }}, + }, + want: map[Role]Confidence{RoleData: ConfNone, RoleSink: ConfLikely, RoleSource: ConfUnknown}, + }, + { + name: "host-restricted egress marks sink possible", + input: ClassifyInput{ + Name: "scoped", + Config: &runner.RunConfig{PermissionProfile: &permissions.Profile{ + Network: &permissions.NetworkPermissions{Outbound: &permissions.OutboundNetworkPermissions{AllowHost: []string{"api.example.com"}}}, + }}, + }, + want: map[Role]Confidence{RoleData: ConfNone, RoleSink: ConfPossible, RoleSource: ConfUnknown}, + }, + { + name: "host network mode marks sink likely", + input: ClassifyInput{ + Name: "hostnet", + Config: &runner.RunConfig{PermissionProfile: &permissions.Profile{ + Network: &permissions.NetworkPermissions{Mode: "host", Outbound: &permissions.OutboundNetworkPermissions{}}, + }}, + }, + want: map[Role]Confidence{RoleData: ConfNone, RoleSink: ConfLikely, RoleSource: ConfUnknown}, + }, + { + name: "privileged container marks sink likely", + input: ClassifyInput{ + Name: "priv", + Config: &runner.RunConfig{PermissionProfile: &permissions.Profile{ + Privileged: true, + Network: &permissions.NetworkPermissions{Outbound: &permissions.OutboundNetworkPermissions{}}, + }}, + }, + want: map[Role]Confidence{RoleData: ConfNone, RoleSink: ConfLikely, RoleSource: ConfUnknown}, + }, + { + name: "remote url marks sink likely but not data (no double-count)", + input: ClassifyInput{ + Name: "remote", + Config: &runner.RunConfig{RemoteURL: "https://mcp.example.com/sse", PermissionProfile: noEgressProfile()}, + }, + want: map[Role]Confidence{RoleData: ConfNone, RoleSink: ConfLikely, RoleSource: ConfUnknown}, + }, + { + name: "nil network policy reads sink possible (open egress by default)", + input: ClassifyInput{ + Name: "nonet", + Config: &runner.RunConfig{PermissionProfile: &permissions.Profile{}}, + }, + want: map[Role]Confidence{RoleData: ConfNone, RoleSink: ConfPossible, RoleSource: ConfUnknown}, + }, + { + name: "nil permission profile reads sink possible (open egress by default)", + input: ClassifyInput{ + Name: "noprofile", + Config: &runner.RunConfig{}, + }, + want: map[Role]Confidence{RoleData: ConfNone, RoleSink: ConfPossible, RoleSource: ConfUnknown}, + }, + { + name: "nil outbound policy reads sink possible (open egress by default)", + input: ClassifyInput{ + Name: "nooutbound", + Config: &runner.RunConfig{PermissionProfile: &permissions.Profile{ + Network: &permissions.NetworkPermissions{}, + }}, + }, + want: map[Role]Confidence{RoleData: ConfNone, RoleSink: ConfPossible, RoleSource: ConfUnknown}, + }, + { + name: "metadata without a run config leaves data and sink unknown", + input: ClassifyInput{ + Name: "noconfig", + Metadata: imageMeta(nil, nil), + }, + want: map[Role]Confidence{RoleData: ConfUnknown, RoleSink: ConfUnknown, RoleSource: ConfUnknown}, + }, + { + name: "source hint raises source to possible", + input: ClassifyInput{ + Name: "fetcher", + Config: &runner.RunConfig{PermissionProfile: noEgressProfile()}, + Hints: []Hint{{Role: RoleSource, Confidence: ConfPossible, Reason: "tag \"web\""}}, + }, + want: map[Role]Confidence{RoleData: ConfNone, RoleSink: ConfNone, RoleSource: ConfPossible}, + }, + { + name: "data hint raises remote data from none to possible", + input: ClassifyInput{ + Name: "remote", + Config: &runner.RunConfig{RemoteURL: "https://mcp.example.com/sse", PermissionProfile: noEgressProfile()}, + Hints: []Hint{{Role: RoleData, Confidence: ConfPossible, Reason: "LLM: holds account data"}}, + }, + want: map[Role]Confidence{RoleData: ConfPossible, RoleSink: ConfLikely, RoleSource: ConfUnknown}, + }, + { + name: "hint stronger than possible is clamped to possible (cap enforced at the seam)", + input: ClassifyInput{ + Name: "fetcher", + Config: &runner.RunConfig{PermissionProfile: noEgressProfile()}, + Hints: []Hint{{Role: RoleSource, Confidence: ConfLikely, Reason: "overconfident backend"}}, + }, + want: map[Role]Confidence{RoleData: ConfNone, RoleSink: ConfNone, RoleSource: ConfPossible}, + }, + { + name: "hint never lowers a stronger finding (raise-only)", + input: ClassifyInput{ + Name: "browser", + Config: &runner.RunConfig{PermissionProfile: noEgressProfile()}, + Annotations: map[string]*authorizers.ToolAnnotations{"browse": {OpenWorldHint: boolPtr(true)}}, + Hints: []Hint{{Role: RoleSource, Confidence: ConfPossible, Reason: "weaker hint"}}, + }, + want: map[Role]Confidence{RoleData: ConfNone, RoleSink: ConfNone, RoleSource: ConfLikely}, + }, + { + name: "openWorldHint annotation marks source likely", + input: ClassifyInput{ + Name: "browser", + Config: &runner.RunConfig{PermissionProfile: noEgressProfile()}, + Annotations: map[string]*authorizers.ToolAnnotations{"browse": {OpenWorldHint: boolPtr(true)}}, + }, + want: map[Role]Confidence{RoleData: ConfNone, RoleSink: ConfNone, RoleSource: ConfLikely}, + }, + { + name: "annotations without openWorldHint leave source unknown (a server cannot self-declare safe)", + input: ClassifyInput{ + Name: "calc", + Config: &runner.RunConfig{PermissionProfile: noEgressProfile()}, + Annotations: map[string]*authorizers.ToolAnnotations{"add": {OpenWorldHint: boolPtr(false)}}, + }, + want: map[Role]Confidence{RoleData: ConfNone, RoleSink: ConfNone, RoleSource: ConfUnknown}, + }, + { + name: "no inputs leave every role unknown", + input: ClassifyInput{Name: "ghost"}, + want: map[Role]Confidence{RoleData: ConfUnknown, RoleSink: ConfUnknown, RoleSource: ConfUnknown}, + }, + { + name: "override downgrades source to none even against a raising hint", + input: ClassifyInput{ + Name: "intranet-fetch", + Config: &runner.RunConfig{PermissionProfile: noEgressProfile()}, + Hints: []Hint{{Role: RoleSource, Confidence: ConfPossible, Reason: "tag \"web\""}}, + Overrides: []Override{ + {Server: "intranet-fetch", Role: RoleSource, Confidence: ConfNone, Reason: "first-party intranet only"}, + }, + }, + want: map[Role]Confidence{RoleData: ConfNone, RoleSink: ConfNone, RoleSource: ConfNone}, + wantOvr: map[Role]bool{RoleSource: true}, + }, + { + name: "override targeting a different server is ignored", + input: ClassifyInput{ + Name: "fetcher", + Config: &runner.RunConfig{PermissionProfile: noEgressProfile()}, + Hints: []Hint{{Role: RoleSource, Confidence: ConfPossible, Reason: "tag \"web\""}}, + Overrides: []Override{ + {Server: "other", Role: RoleSource, Confidence: ConfNone, Reason: "not this one"}, + }, + }, + want: map[Role]Confidence{RoleData: ConfNone, RoleSink: ConfNone, RoleSource: ConfPossible}, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + got := Classify(tt.input) + require.Equal(t, tt.input.Name, got.Name) + for role, wantConf := range tt.want { + assert.Equalf(t, wantConf, got.Finding(role).Confidence, + "role %s confidence", role) + } + for role, wantOvr := range tt.wantOvr { + assert.Equalf(t, wantOvr, got.Finding(role).Overridden, + "role %s overridden", role) + } + }) + } +} + +func TestValidateOverride(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + ovr Override + wantErr bool + }{ + { + name: "valid override", + ovr: Override{Server: "fetch", Role: RoleSource, Confidence: ConfNone, Reason: "intranet only"}, + }, + { + name: "wildcard server is allowed", + ovr: Override{Server: "", Role: RoleSink, Confidence: ConfNone, Reason: "all egress audited"}, + }, + { + name: "invalid role is rejected", + ovr: Override{Server: "x", Role: "egress", Confidence: ConfNone, Reason: "r"}, + wantErr: true, + }, + { + name: "invalid confidence is rejected", + ovr: Override{Server: "x", Role: RoleSink, Confidence: "high", Reason: "r"}, + wantErr: true, + }, + { + name: "missing reason is rejected", + ovr: Override{Server: "x", Role: RoleSink, Confidence: ConfNone, Reason: " "}, + wantErr: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + err := ValidateOverride(tt.ovr) + if tt.wantErr { + require.Error(t, err) + return + } + require.NoError(t, err) + }) + } +} diff --git a/pkg/toxicflow/collect.go b/pkg/toxicflow/collect.go new file mode 100644 index 0000000000..7eca8f592c --- /dev/null +++ b/pkg/toxicflow/collect.go @@ -0,0 +1,152 @@ +// SPDX-FileCopyrightText: Copyright 2025 Stacklok, Inc. +// SPDX-License-Identifier: Apache-2.0 + +package toxicflow + +import ( + "context" + "fmt" + "log/slog" + + registrytypes "github.com/stacklok/toolhive-core/registry/types" + "github.com/stacklok/toolhive/pkg/container/runtime" + "github.com/stacklok/toolhive/pkg/core" + "github.com/stacklok/toolhive/pkg/groups" + "github.com/stacklok/toolhive/pkg/registry" + "github.com/stacklok/toolhive/pkg/runner" + "github.com/stacklok/toolhive/pkg/workloads" +) + +// Collector gathers the inputs a toxic-flow assessment needs (run configs and +// registry metadata) for the workloads in a group and runs the classifier over +// them. Registry lookups are best-effort: a missing or unreachable registry +// degrades classification accuracy but does not fail the assessment. +type Collector struct { + workloads workloads.Manager + registry registry.Provider // nil when the registry is unavailable + inference SourceInference + live bool +} + +// NewCollector builds a Collector backed by the local workloads manager and the +// default registry provider. inference selects the untrusted-content strategy +// (keyword or LLM); a nil inference defaults to KeywordInference. When live is +// true, running (actively-proxied) servers are probed for live tool +// annotations. +func NewCollector(ctx context.Context, inference SourceInference, live bool) (*Collector, error) { + wm, err := workloads.NewManager(ctx) + if err != nil { + return nil, fmt.Errorf("create workloads manager: %w", err) + } + // The registry is best-effort: without it, metadata-derived hints are + // skipped but profile-derived data/sink roles are unaffected. + reg, err := registry.GetDefaultProvider() + if err != nil { + slog.Debug("toxicflow: registry provider unavailable, continuing without metadata", "error", err) + reg = nil + } + if inference == nil { + inference = NewKeywordInference() + } + return &Collector{workloads: wm, registry: reg, inference: inference, live: live}, nil +} + +// AssessGroup classifies every workload in the named group and returns the +// group verdict. The overrides apply across the group (each override may target +// a specific server or all servers). +func (c *Collector) AssessGroup(ctx context.Context, group string, overrides []Override) (GroupAssessment, error) { + all, err := c.workloads.ListWorkloads(ctx, true) + if err != nil { + return GroupAssessment{}, fmt.Errorf("list workloads: %w", err) + } + members, err := workloads.FilterByGroup(all, group) + if err != nil { + return GroupAssessment{}, fmt.Errorf("filter workloads by group %q: %w", group, err) + } + + assessments := make([]ServerAssessment, 0, len(members)) + for _, w := range members { + in := ClassifyInput{Name: w.Name, Overrides: overrides} + + if rc, err := runner.LoadState(ctx, w.Name); err == nil { + in.Config = rc + } else { + // Without a run config the profile-derived roles fall back to + // "unknown"; this is surfaced in the verdict, not an error. + slog.Debug("toxicflow: could not load run config", "workload", w.Name, "error", err) + } + + if c.registry != nil { + if meta, err := c.lookupMetadata(in.Config, w); err == nil { + in.Metadata = meta + } else { + slog.Debug("toxicflow: no registry metadata", "workload", w.Name, "error", err) + } + } + + // Untrusted-content (and remote data) hints from the configured strategy. + if hints, err := c.inference.Infer(ctx, profileFor(w, in.Metadata)); err == nil { + in.Hints = hints + } else { + // Inference is best-effort: on failure the source role stays + // "unknown" (the safe direction), never a confident "none". + slog.Debug("toxicflow: inference failed", "workload", w.Name, "error", err) + } + + // When we are actively proxying this server, probe it live for the + // authoritative openWorldHint signal. + if c.live && w.Status == runtime.WorkloadStatusRunning && w.URL != "" { + if ann, err := probeAnnotations(ctx, w.URL, w.ProxyMode); err == nil { + in.Annotations = ann + } else { + slog.Debug("toxicflow: live probe failed", "workload", w.Name, "error", err) + } + } + + assessments = append(assessments, Classify(in)) + } + + return AnalyzeGroup(group, assessments), nil +} + +// profileFor builds the public, non-sensitive profile passed to the inference +// strategy. It deliberately carries no permission profile, secrets, or runtime +// config — only catalog metadata an LLM backend may safely receive. +func profileFor(w core.Workload, meta registrytypes.ServerMetadata) ServerProfile { + p := ServerProfile{Name: w.Name, Remote: w.Remote} + if meta != nil { + p.Description = meta.GetDescription() + p.Overview = meta.GetOverview() + p.Tags = meta.GetTags() + p.Tools = meta.GetTools() + } + return p +} + +// lookupMetadata resolves registry metadata, preferring the registry server +// name recorded in the run config and falling back to the workload name. +func (c *Collector) lookupMetadata(rc *runner.RunConfig, w core.Workload) (registrytypes.ServerMetadata, error) { + name := w.Name + if rc != nil && rc.RegistryServerName != "" { + name = rc.RegistryServerName + } + return c.registry.GetServer(name) +} + +// ListGroupNames returns the names of all known groups, for auditing every +// group at once. +func ListGroupNames(ctx context.Context) ([]string, error) { + gm, err := groups.NewManager() + if err != nil { + return nil, fmt.Errorf("create groups manager: %w", err) + } + gs, err := gm.List(ctx) + if err != nil { + return nil, fmt.Errorf("list groups: %w", err) + } + names := make([]string, 0, len(gs)) + for _, g := range gs { + names = append(names, g.Name) + } + return names, nil +} diff --git a/pkg/toxicflow/collect_test.go b/pkg/toxicflow/collect_test.go new file mode 100644 index 0000000000..fa022591df --- /dev/null +++ b/pkg/toxicflow/collect_test.go @@ -0,0 +1,69 @@ +// SPDX-FileCopyrightText: Copyright 2025 Stacklok, Inc. +// SPDX-License-Identifier: Apache-2.0 + +package toxicflow + +import ( + "testing" + + "github.com/mark3labs/mcp-go/mcp" + "github.com/stretchr/testify/assert" + + registrytypes "github.com/stacklok/toolhive-core/registry/types" + "github.com/stacklok/toolhive/pkg/core" +) + +// TestProfileFor guards the security boundary: the profile sent to an inference +// backend (potentially a remote LLM) must carry only public catalog metadata, +// never anything derived from the run config, permission profile, or secrets. +// ServerProfile having no such field is the structural guarantee; this test +// pins the mapping and documents the intent. +func TestProfileFor(t *testing.T) { + t.Parallel() + + w := core.Workload{Name: "github", Remote: true} + meta := ®istrytypes.ImageMetadata{BaseServerMetadata: registrytypes.BaseServerMetadata{ + Description: "GitHub MCP server", + Overview: "Access repositories and issues", + Tags: []string{"git", "vcs"}, + Tools: []string{"get_issue", "create_pr"}, + }} + + got := profileFor(w, meta) + + assert.Equal(t, ServerProfile{ + Name: "github", + Remote: true, + Description: "GitHub MCP server", + Overview: "Access repositories and issues", + Tags: []string{"git", "vcs"}, + Tools: []string{"get_issue", "create_pr"}, + }, got) +} + +func TestProfileForNilMetadata(t *testing.T) { + t.Parallel() + + got := profileFor(core.Workload{Name: "local", Remote: false}, nil) + assert.Equal(t, ServerProfile{Name: "local"}, got) +} + +func TestMapAnnotations(t *testing.T) { + t.Parallel() + + open := true + ro := true + tools := []mcp.Tool{ + {Name: "fetch", Annotations: mcp.ToolAnnotation{OpenWorldHint: &open}}, + {Name: "read", Annotations: mcp.ToolAnnotation{ReadOnlyHint: &ro}}, + } + + got := mapAnnotations(tools) + + assert.Len(t, got, 2) + assert.NotNil(t, got["fetch"].OpenWorldHint) + assert.True(t, *got["fetch"].OpenWorldHint) + assert.NotNil(t, got["read"].ReadOnlyHint) + assert.True(t, *got["read"].ReadOnlyHint) + assert.Nil(t, got["read"].OpenWorldHint) +} diff --git a/pkg/toxicflow/inference.go b/pkg/toxicflow/inference.go new file mode 100644 index 0000000000..a62cb86a11 --- /dev/null +++ b/pkg/toxicflow/inference.go @@ -0,0 +1,170 @@ +// SPDX-FileCopyrightText: Copyright 2025 Stacklok, Inc. +// SPDX-License-Identifier: Apache-2.0 + +package toxicflow + +import ( + "context" + "encoding/json" + "fmt" + "strings" +) + +// ServerProfile is the public, non-sensitive view of a server passed to a +// SourceInference. It deliberately excludes permission profiles, secrets, and +// runtime config: an inference backend (especially a remote LLM) must never +// receive sensitive data. +type ServerProfile struct { + Name string + Description string + Overview string + Tags []string + Tools []string + Remote bool +} + +// Hint is a non-authoritative, raise-only suggestion from a SourceInference. +// Hints can only raise a role's confidence (never lower it, never force a +// confident "none"), so a wrong hint costs a spurious warning, not a missed +// flow. Inference backends emit at most ConfPossible — the structured +// openWorldHint is the only basis for a stronger source signal. +type Hint struct { + Role Role + Confidence Confidence + Reason string +} + +// SourceInference derives untrusted-content (and, where possible, private-data) +// hints from a server's public metadata. Implementations must be safe to call +// with partial profiles and must not return findings stronger than ConfPossible. +type SourceInference interface { + Infer(ctx context.Context, p ServerProfile) ([]Hint, error) +} + +// KeywordInference is the default, offline, deterministic strategy: it matches +// tags, tool names, and description text against curated keyword lists. +type KeywordInference struct{} + +// NewKeywordInference returns the keyword-based inference strategy. +func NewKeywordInference() KeywordInference { return KeywordInference{} } + +// Infer implements SourceInference using keyword heuristics. +func (KeywordInference) Infer(_ context.Context, p ServerProfile) ([]Hint, error) { + var hints []Hint + if tag := matchKeyword(p.Tags, untrustedTagKeywords); tag != "" { + hints = append(hints, Hint{RoleSource, ConfPossible, + fmt.Sprintf("tag %q suggests untrusted-content ingestion", tag)}) + } + if tool := matchKeyword(p.Tools, untrustedToolKeywords); tool != "" { + hints = append(hints, Hint{RoleSource, ConfPossible, + fmt.Sprintf("tool %q suggests untrusted-content ingestion", tool)}) + } + if word := matchWord(p.Description+" "+p.Overview, untrustedTextKeywords); word != "" { + hints = append(hints, Hint{RoleSource, ConfPossible, + fmt.Sprintf("description mentions %q (possible untrusted-content ingestion)", word)}) + } + return hints, nil +} + +// Completer is the minimal LLM contract LLMInference needs. It is satisfied by +// pkg/llm/client.Client; defining it here keeps toxicflow free of an LLM-client +// dependency and makes LLMInference unit-testable with a stub. +type Completer interface { + Complete(ctx context.Context, system, user string) (string, error) +} + +// LLMInference uses a language model to judge, from the public profile, whether +// a server ingests untrusted content or exposes private data. Its output is +// still capped at ConfPossible and folded raise-only, so a hallucination can +// only over-warn — it can never produce a reassuring "no toxic flow". +type LLMInference struct { + client Completer +} + +// NewLLMInference returns an LLM-backed inference strategy. +func NewLLMInference(c Completer) LLMInference { return LLMInference{client: c} } + +const sourceSystemPrompt = "You classify Model Context Protocol (MCP) servers for security review.\n" + + "Given a server's public metadata, decide two things:\n" + + "- untrusted_content: does the server ingest content an external party could influence " + + "(web pages, search results, emails, issue/PR bodies, arbitrary documents)? " + + "Internal-but-user-generated content (wikis, tickets, mailboxes) counts as untrusted.\n" + + "- private_data: does the server plausibly access private or sensitive data " + + "(a user's accounts, repositories, files, internal APIs)?\n" + + "Respond with ONLY a JSON object: " + + `{"untrusted_content": bool, "private_data": bool, "reason": ""}.` + +type llmVerdict struct { + UntrustedContent bool `json:"untrusted_content"` + PrivateData bool `json:"private_data"` + Reason string `json:"reason"` +} + +// Infer implements SourceInference by prompting the model and parsing its JSON. +func (l LLMInference) Infer(ctx context.Context, p ServerProfile) ([]Hint, error) { + out, err := l.client.Complete(ctx, sourceSystemPrompt, buildProfilePrompt(p)) + if err != nil { + return nil, fmt.Errorf("llm inference: %w", err) + } + verdict, err := parseLLMVerdict(out) + if err != nil { + return nil, err + } + + var hints []Hint + if verdict.UntrustedContent { + hints = append(hints, Hint{RoleSource, ConfPossible, llmReason("untrusted content", verdict.Reason)}) + } + if verdict.PrivateData { + hints = append(hints, Hint{RoleData, ConfPossible, llmReason("private data", verdict.Reason)}) + } + return hints, nil +} + +func buildProfilePrompt(p ServerProfile) string { + var b strings.Builder + fmt.Fprintf(&b, "name: %s\n", p.Name) + fmt.Fprintf(&b, "remote: %t\n", p.Remote) + if p.Description != "" { + fmt.Fprintf(&b, "description: %s\n", p.Description) + } + if p.Overview != "" { + fmt.Fprintf(&b, "overview: %s\n", p.Overview) + } + if len(p.Tags) > 0 { + fmt.Fprintf(&b, "tags: %s\n", strings.Join(p.Tags, ", ")) + } + if len(p.Tools) > 0 { + fmt.Fprintf(&b, "tools: %s\n", strings.Join(p.Tools, ", ")) + } + return b.String() +} + +// parseLLMVerdict extracts the JSON object from the model output, tolerating +// surrounding prose or code fences. +func parseLLMVerdict(out string) (llmVerdict, error) { + start := strings.Index(out, "{") + end := strings.LastIndex(out, "}") + if start < 0 || end < start { + return llmVerdict{}, fmt.Errorf("llm response was not JSON: %q", truncate(out, 120)) + } + var v llmVerdict + if err := json.Unmarshal([]byte(out[start:end+1]), &v); err != nil { + return llmVerdict{}, fmt.Errorf("parse llm response: %w", err) + } + return v, nil +} + +func llmReason(kind, reason string) string { + if strings.TrimSpace(reason) == "" { + return "LLM judged this server to involve " + kind + } + return "LLM: " + reason +} + +func truncate(s string, n int) string { + if len(s) <= n { + return s + } + return s[:n] + "..." +} diff --git a/pkg/toxicflow/inference_test.go b/pkg/toxicflow/inference_test.go new file mode 100644 index 0000000000..4b8f9144ca --- /dev/null +++ b/pkg/toxicflow/inference_test.go @@ -0,0 +1,122 @@ +// SPDX-FileCopyrightText: Copyright 2025 Stacklok, Inc. +// SPDX-License-Identifier: Apache-2.0 + +package toxicflow + +import ( + "context" + "errors" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestKeywordInferenceInfer(t *testing.T) { + t.Parallel() + + ki := NewKeywordInference() + tests := []struct { + name string + profile ServerProfile + wantSource bool + }{ + {"web tag", ServerProfile{Tags: []string{"web", "utility"}}, true}, + {"fetch tool name", ServerProfile{Tools: []string{"fetch_page"}}, true}, + {"description mentions web", ServerProfile{Description: "Fetches web pages and returns their content"}, true}, + {"innocuous description", ServerProfile{Description: "Performs arithmetic calculations locally"}, false}, + {"substring is not a match", ServerProfile{Description: "manages a cobweb of dependencies"}, false}, + {"empty profile", ServerProfile{Name: "x"}, false}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + hints, err := ki.Infer(t.Context(), tt.profile) + require.NoError(t, err) + assert.Equal(t, tt.wantSource, hasHint(hints, RoleSource)) + for _, h := range hints { + assert.Equal(t, ConfPossible, h.Confidence, "keyword hints must be possible") + } + }) + } +} + +type stubCompleter struct { + out string + err error +} + +func (s stubCompleter) Complete(_ context.Context, _, _ string) (string, error) { + return s.out, s.err +} + +func TestLLMInferenceInfer(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + out string + cErr error + wantSource bool + wantData bool + wantErr bool + }{ + { + name: "both legs true", + out: `{"untrusted_content":true,"private_data":true,"reason":"github account"}`, + wantSource: true, wantData: true, + }, + { + name: "only untrusted content", + out: `{"untrusted_content":true,"private_data":false,"reason":"web search"}`, + wantSource: true, + }, + { + name: "neither leg", + out: `{"untrusted_content":false,"private_data":false,"reason":"local calculator"}`, + }, + { + name: "json wrapped in prose and fences", + out: "Sure:\n```json\n{\"untrusted_content\":true,\"private_data\":false}\n```", + wantSource: true, + }, + { + name: "non-json response is an error", + out: "I cannot help with that", + wantErr: true, + }, + { + name: "completer error propagates", + cErr: errors.New("network down"), + wantErr: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + li := NewLLMInference(stubCompleter{out: tt.out, err: tt.cErr}) + hints, err := li.Infer(t.Context(), ServerProfile{Name: "x"}) + if tt.wantErr { + require.Error(t, err) + return + } + require.NoError(t, err) + assert.Equal(t, tt.wantSource, hasHint(hints, RoleSource)) + assert.Equal(t, tt.wantData, hasHint(hints, RoleData)) + for _, h := range hints { + assert.Equal(t, ConfPossible, h.Confidence, "llm hints must be capped at possible") + } + }) + } +} + +func hasHint(hints []Hint, role Role) bool { + for _, h := range hints { + if h.Role == role { + return true + } + } + return false +} diff --git a/pkg/toxicflow/probe.go b/pkg/toxicflow/probe.go new file mode 100644 index 0000000000..b418192c9e --- /dev/null +++ b/pkg/toxicflow/probe.go @@ -0,0 +1,60 @@ +// SPDX-FileCopyrightText: Copyright 2025 Stacklok, Inc. +// SPDX-License-Identifier: Apache-2.0 + +package toxicflow + +import ( + "context" + "fmt" + "time" + + "github.com/mark3labs/mcp-go/mcp" + + "github.com/stacklok/toolhive/pkg/authz/authorizers" + mcpclient "github.com/stacklok/toolhive/pkg/mcp/client" +) + +// probeTimeout bounds a single live tools/list probe so a slow or wedged +// server cannot hang the audit. +const probeTimeout = 10 * time.Second + +// probeAnnotations connects to a running server through the ToolHive proxy and +// reads its live tool annotations (notably openWorldHint), which are the +// strongest available signal for the source role. It is best-effort: callers +// treat any error as "no live data" rather than failing the audit. +func probeAnnotations(ctx context.Context, serverURL, transport string) (map[string]*authorizers.ToolAnnotations, error) { + ctx, cancel := context.WithTimeout(ctx, probeTimeout) + defer cancel() + + if transport == "" { + transport = mcpclient.TransportAuto + } + c, err := mcpclient.Connect(ctx, serverURL, transport, "toolhive-trifecta-audit") + if err != nil { + return nil, fmt.Errorf("connect to %s: %w", serverURL, err) + } + defer func() { _ = c.Close() }() + + res, err := c.ListTools(ctx, mcp.ListToolsRequest{}) + if err != nil { + return nil, fmt.Errorf("list tools: %w", err) + } + return mapAnnotations(res.Tools), nil +} + +// mapAnnotations converts MCP tool annotations into the ToolHive representation +// classifySource consumes. Kept separate so the field-by-field copy is unit +// testable without a live server. +func mapAnnotations(tools []mcp.Tool) map[string]*authorizers.ToolAnnotations { + annotations := make(map[string]*authorizers.ToolAnnotations, len(tools)) + for i := range tools { + a := tools[i].Annotations + annotations[tools[i].Name] = &authorizers.ToolAnnotations{ + ReadOnlyHint: a.ReadOnlyHint, + DestructiveHint: a.DestructiveHint, + IdempotentHint: a.IdempotentHint, + OpenWorldHint: a.OpenWorldHint, + } + } + return annotations +} diff --git a/pkg/toxicflow/types.go b/pkg/toxicflow/types.go new file mode 100644 index 0000000000..9013522ff0 --- /dev/null +++ b/pkg/toxicflow/types.go @@ -0,0 +1,147 @@ +// SPDX-FileCopyrightText: Copyright 2025 Stacklok, Inc. +// SPDX-License-Identifier: Apache-2.0 + +// Package toxicflow assesses a ToolHive group (a set of MCP servers) for +// "lethal trifecta" risk: the co-location, within a single agent context, of +// access to private data, exposure to untrusted content, and the ability to +// exfiltrate. Following information-flow terminology, the hazard is modelled as +// a toxic flow from an untrusted-content source, through a private-data holder, +// to an exfiltration sink. Because every server in a group shares the one model +// context, a flow exists whenever the group contains all three roles. +// +// The package separates a pure classifier (Classify) and a pure analyzer +// (AnalyzeGroup) from the impure data collection (Collector) so the heuristics +// can be unit-tested without a running ToolHive. +// +// ToolHive observes permission profiles, network egress, and registry metadata +// — not the model's reasoning. It therefore classifies the data and sink roles +// with reasonable confidence (they derive from the permission profile) but the +// source role poorly: untrusted-content ingestion has no first-class signal, so +// the source role defaults to "unknown" rather than "none". +package toxicflow + +// Role is the part a server can play in a toxic flow. A single server may hold +// more than one role (e.g. a web-fetch server is both a source and a sink). +type Role string + +const ( + // RoleData marks a server with access to private or sensitive data. + RoleData Role = "data" + // RoleSource marks a server that ingests potentially untrusted content, + // i.e. a prompt-injection delivery vector. + RoleSource Role = "source" + // RoleSink marks a server able to communicate externally (exfiltrate). + RoleSink Role = "sink" +) + +// AllRoles is the canonical role order used for deterministic output. +var AllRoles = []Role{RoleData, RoleSource, RoleSink} + +// Confidence expresses how strongly the evidence supports a role. The ordering +// none < unknown < possible < likely is significant: "unknown" means ToolHive +// lacked the inputs to rule the role in or out, whereas "none" is a confident +// absence (we inspected the relevant inputs and found nothing). +type Confidence string + +const ( + // ConfNone is a confident absence — the inputs were available and clear. + ConfNone Confidence = "none" + // ConfUnknown means the role could not be assessed from available inputs. + ConfUnknown Confidence = "unknown" + // ConfPossible means weak/inferred evidence supports the role. + ConfPossible Confidence = "possible" + // ConfLikely means strong evidence supports the role. + ConfLikely Confidence = "likely" +) + +// rank returns the ordinal of a confidence level for comparison. +func (c Confidence) rank() int { + switch c { + case ConfLikely: + return 3 + case ConfPossible: + return 2 + case ConfUnknown: + return 1 + case ConfNone: + return 0 + default: + return 0 + } +} + +// atOrAbove reports whether c is as strong as, or stronger than, other. +func (c Confidence) atOrAbove(other Confidence) bool { + return c.rank() >= other.rank() +} + +// RoleFinding is the assessment of a single role for a single server. +type RoleFinding struct { + Role Role `json:"role"` + Confidence Confidence `json:"confidence"` + // Evidence holds human-readable reasons supporting the confidence level. + Evidence []string `json:"evidence,omitempty"` + // Overridden is true when an explicit override set this finding. + Overridden bool `json:"overridden,omitempty"` +} + +// ServerAssessment is the per-server result of classification. +type ServerAssessment struct { + Name string `json:"name"` + Findings map[Role]RoleFinding `json:"findings"` +} + +// Finding returns the finding for a role, defaulting to a ConfNone finding when +// absent so callers never have to nil-check. +func (s ServerAssessment) Finding(role Role) RoleFinding { + if f, ok := s.Findings[role]; ok { + return f + } + return RoleFinding{Role: role, Confidence: ConfNone} +} + +// Verdict is the group-level conclusion. +type Verdict string + +const ( + // VerdictNone means at least one role is confidently absent, so no toxic + // flow can form in this group. + VerdictNone Verdict = "none" + // VerdictPossible means all three roles are present at possible-or-likely + // confidence, but not all reach "likely". + VerdictPossible Verdict = "possible" + // VerdictPresent means all three roles are present at "likely" confidence. + VerdictPresent Verdict = "present" + // VerdictIndeterminate means a role cannot be confirmed or ruled out + // (stuck at "unknown"), so the group may or may not form a toxic flow. + VerdictIndeterminate Verdict = "indeterminate" +) + +// GroupAssessment is the result of analyzing a whole group. +type GroupAssessment struct { + Group string `json:"group"` + Verdict Verdict `json:"verdict"` + Servers []ServerAssessment `json:"servers"` + // Sources, DataHolders and Sinks list the servers contributing each role + // at possible-or-likely confidence — the attack path at group granularity. + Sources []string `json:"sources,omitempty"` + DataHolders []string `json:"data_holders,omitempty"` + Sinks []string `json:"sinks,omitempty"` + // Unclassified lists servers whose source role could not be assessed; these + // are why a verdict may be indeterminate. + Unclassified []string `json:"unclassified,omitempty"` + // SelfContainedFlow names servers that hold all three roles at + // possible-or-likely confidence on their own. Such a server forms a toxic + // flow by itself; the cheapest fix is to tighten its permission profile, + // not to split the group. + SelfContainedFlow []string `json:"self_contained_flow,omitempty"` +} + +// Override forces a role's confidence for a named server, e.g. to correct a +// mis-classified common server. A reason is required for auditability. +type Override struct { + Server string `json:"server"` + Role Role `json:"role"` + Confidence Confidence `json:"confidence"` + Reason string `json:"reason"` +} diff --git a/skills/audit-trifecta/SKILL.md b/skills/audit-trifecta/SKILL.md new file mode 100644 index 0000000000..80ec9fce4c --- /dev/null +++ b/skills/audit-trifecta/SKILL.md @@ -0,0 +1,177 @@ +--- +name: audit-trifecta +description: >- + Audit a ToolHive group of MCP servers for "lethal trifecta" / toxic-flow risk + — the co-location of private-data access, untrusted-content exposure, and an + exfiltration path in one agent context. Use when checking whether a group is + safe from prompt-injection data exfiltration, reviewing MCP server combinations + for security, interpreting `thv audit-trifecta` output, or writing overrides for + mis-classified servers. NOT for running/managing servers (use toolhive-cli-user) + or Kubernetes operator audits. +license: Apache-2.0 +metadata: + version: 0.1.0 +--- + +# Audit a ToolHive Group for Lethal-Trifecta Risk + +## What this checks + +The **lethal trifecta** is the co-location, in a single agent context, of three +capabilities. ToolHive models them as roles in a **toxic flow**: + +- **data** — access to private/sensitive data (filesystem reads, secrets, authed backends) +- **source** — exposure to untrusted content, a prompt-injection vector (web fetch, email, issues) +- **sink** — the ability to exfiltrate (network egress, remote endpoints) + +A ToolHive **group** is a set of MCP servers that share one agent's context, so a +toxic flow exists whenever the group contains all three roles: an injection via a +*source* server can read a *data* server's secrets and send them out a *sink*. + +ToolHive assesses **data** and **sink** confidently (they come from the permission +profile) but **source** poorly — untrusted-content exposure has no first-class +signal. Expect `indeterminate` verdicts until source servers are classified or +overridden. + +## Prerequisites + +- `thv` built with the `audit-trifecta` subcommand (currently hidden/experimental). +- At least one group with workloads. List groups with `thv group list`. + +## Instructions + +1. **Run the audit** for the group in question: + ```bash + thv audit-trifecta + thv audit-trifecta --all # every group + thv audit-trifecta --explain # show the evidence behind each finding + ``` + + Improve accuracy two ways (both optional, both safe — they can only *raise* + suspicion, never produce a false "no toxic flow"): + ```bash + # Probe running (actively-proxied) servers for live tool annotations + thv audit-trifecta --live + + # Use an LLM to judge untrusted-content/private-data from descriptions + # (falls back to keyword search if model/base-url are unset) + thv audit-trifecta \ + --llm-model gpt-4o-mini --llm-base-url https://api.openai.com/v1 + # API key via --llm-api-key or the THV_AUDIT_LLM_API_KEY env var + # (key is optional — omit it for keyless local models) + + # Or reuse a running `thv llm` proxy (it injects auth; no key needed) + thv audit-trifecta --llm-proxy --llm-model + ``` + The `--llm-base-url` must speak the OpenAI-compatible `/v1/chat/completions` + API. `--llm-proxy` points at the local `thv llm` reverse proxy (must be + running via `thv llm proxy`), which handles gateway auth for you. + The LLM only ever receives **public** metadata (name, description, tags, tool + names) — never permission profiles, secrets, or config. Point `--llm-base-url` + at a local model to keep traffic on-host. + +2. **Read the verdict** (see [Verdicts](#verdicts)). The text output shows a + per-server role table, then the contributing servers per role and a one-line + recommendation. + +3. **For `present` or `possible`** — there is a real or likely toxic flow. + Recommend remediation in this priority order (see [Remediation](#remediation)): + split the group → restrict egress on data-holders → drop the source. + +4. **For `indeterminate`** — data and sink exist but source is unknown for the + listed `Unclassified` servers. Investigate each: does it ingest content the + user does not control (web pages, emails, issue/PR bodies, arbitrary files)? + - If **yes**, it is a source → the verdict becomes `present`/`possible`; remediate. + - If **no**, write an override setting its `source` to `none` (see + [Overrides](#overrides)). The test for "no" is strict: the server ingests + **no content any external party can influence**. "First-party" is not + sufficient — an internal wiki, issue tracker, or mailbox is first-party + *infrastructure* but routinely carries attacker-influenced *content* + (a comment, a PR body, a forwarded email). Those are still sources. + +5. **For `none`** — at least one leg is confidently absent; the group is safe from + this class of attack. State that plainly. + +6. **Re-run with `--overrides`** after writing any overrides to confirm the new + verdict: + ```bash + thv audit-trifecta --overrides ./trifecta-overrides.json + ``` + +## Verdicts + +| Verdict | Meaning | Action | +|---|---|---| +| `present` | All three roles at high confidence | Remediate now | +| `possible` | All three present, some at low confidence | Review contributors, likely remediate | +| `indeterminate` | data + sink present, source unknown | Classify the unclassified servers or override | +| `none` | A leg is confidently absent | Safe — no action | + +## Remediation + +Lead with the cheapest effective fix: + +1. **Split the group** — move the source server (or the data server) into its own + group so they no longer share one agent context. This breaks the flow + structurally and is usually the right answer. +2. **Restrict egress** — tighten the sink server's permission profile to an + allowlist (`network.outbound.allow_host`) or `isolate_network`, so even a + successful injection cannot reach an exfiltration destination. +3. **Drop the source** — remove the untrusted-content server from the group if it + is not essential. + +Do not recommend "just be careful" — the trifecta is an architecture problem; the +fix is to remove one leg from the shared context. + +## Overrides + +Overrides correct mis-classified servers. They are top-priority evidence, so use +them deliberately and always with a real `reason` (it is the audit trail). + +Write a JSON array to a file (e.g. `trifecta-overrides.json`): + +```json +[ + { + "server": "intranet-docs", + "role": "source", + "confidence": "none", + "reason": "reads only the first-party internal wiki, not untrusted content" + }, + { + "server": "weather", + "role": "sink", + "confidence": "none", + "reason": "egress restricted to api.weather.gov, not an exfiltration channel" + } +] +``` + +Fields: +- `server` — workload name. An empty string applies the override to **every** + server in the group — a blunt instrument; prefer naming a specific server. +- `role` — `data`, `source`, or `sink`. +- `confidence` — `none`, `unknown`, `possible`, or `likely`. +- `reason` — required justification; surfaced in `--explain` and the audit trail. + +Overrides are validated on load: an unknown `role`/`confidence` or a missing +`reason` fails with an error rather than silently degrading a finding. Overridden +values are marked with `*` in the default output. + +**Only override what you have verified.** Setting `source: none` on a server that +*does* ingest untrusted content silently re-opens the trifecta. When in doubt, +investigate rather than override. + +## Error Handling + +- **"specify a group name or use --all"** — pass a group name or `--all`. +- **Empty / `(no servers in group)`** — the group has no workloads; check + `thv group list` and `thv list --group `. +- **Everything `indeterminate`** — expected when servers are not in the registry + and have no tool annotations; classify sources via overrides, or wait for live + annotation probing. + +## See Also + +- `toolhive-cli-user` skill — running, grouping, and configuring MCP servers. +- `thv audit-trifecta --help` — full flag reference.