Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
385 changes: 385 additions & 0 deletions cmd/thv/app/audit_trifecta.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,385 @@
// SPDX-FileCopyrightText: Copyright 2025 Stacklok, Inc.
// SPDX-License-Identifier: Apache-2.0

package app

import (
"encoding/json"
"fmt"
"os"
"slices"
"strings"
"text/tabwriter"

"github.com/spf13/cobra"

"github.com/stacklok/toolhive/pkg/config"
llmclient "github.com/stacklok/toolhive/pkg/llm/client"
"github.com/stacklok/toolhive/pkg/toxicflow"
)

// auditTrifectaCmd is hidden while the heuristics are being calibrated. It
// audits a group (a set of MCP servers sharing one agent context) for
// "lethal trifecta" risk: the co-location of private-data access, untrusted-
// content exposure, and an exfiltration path.
var auditTrifectaCmd = &cobra.Command{
Use: "audit-trifecta [group]",
Short: "Audit a group of MCP servers for lethal-trifecta risk (experimental)",
Hidden: true,
Long: `Audit a ToolHive group for "lethal trifecta" risk.

The lethal trifecta is the co-location, within a single agent context, of:
- access to private data,
- exposure to untrusted content (a prompt-injection vector), and
- the ability to exfiltrate data externally.

Because every server in a group shares the model's context, a toxic flow exists
whenever the group contains all three. This command classifies each server from
its permission profile and registry metadata and reports the group verdict.

ToolHive can assess data access and egress with reasonable confidence (they come
from the permission profile) but untrusted-content exposure poorly; expect an
"indeterminate" verdict until you classify or override the relevant servers.

Examples:
# Audit one group
thv audit-trifecta research

# Audit every group as JSON
thv audit-trifecta --all --format json

# Apply overrides for mis-classified servers
thv audit-trifecta research --overrides ./trifecta-overrides.json`,
Args: cobra.MaximumNArgs(1),
RunE: auditTrifectaCmdFunc,
ValidArgsFunction: completeTrifectaGroup,
}

// completeTrifectaGroup completes the group argument with known group names.
func completeTrifectaGroup(cmd *cobra.Command, args []string, _ string) ([]string, cobra.ShellCompDirective) {
if len(args) != 0 {
return nil, cobra.ShellCompDirectiveNoFileComp
}
names, err := toxicflow.ListGroupNames(cmd.Context())
if err != nil {
return nil, cobra.ShellCompDirectiveNoFileComp
}
return names, cobra.ShellCompDirectiveNoFileComp
}

var (
auditTrifectaAll bool
auditTrifectaFormat string
auditTrifectaExplain bool
auditTrifectaOverrides string
auditTrifectaLive bool
auditTrifectaLLMModel string
auditTrifectaLLMURL string
auditTrifectaLLMKey string
auditTrifectaLLMProxy bool
)

// trifectaLLMKeyEnv is the fallback source for the LLM API key so it need not
// be passed on the command line.
const trifectaLLMKeyEnv = "THV_AUDIT_LLM_API_KEY"

func init() {
AddFormatFlag(auditTrifectaCmd, &auditTrifectaFormat, FormatJSON, FormatText)
auditTrifectaCmd.Flags().BoolVar(&auditTrifectaAll, "all", false, "Audit every group")
auditTrifectaCmd.Flags().BoolVar(&auditTrifectaExplain, "explain", false, "Show the evidence behind each finding")
auditTrifectaCmd.Flags().StringVar(&auditTrifectaOverrides, "overrides", "",
"Path to a JSON file of role overrides for mis-classified servers")
auditTrifectaCmd.Flags().BoolVar(&auditTrifectaLive, "live", false,
"Probe running servers for live tool annotations (openWorldHint)")
auditTrifectaCmd.Flags().StringVar(&auditTrifectaLLMModel, "llm-model", "",
"LLM model for untrusted-content inference (falls back to keyword search if unset)")
auditTrifectaCmd.Flags().StringVar(&auditTrifectaLLMURL, "llm-base-url", "",
"OpenAI-compatible base URL for the LLM (e.g. https://api.openai.com/v1)")
auditTrifectaCmd.Flags().StringVar(&auditTrifectaLLMKey, "llm-api-key", "",
"API key for the LLM (or set "+trifectaLLMKeyEnv+")")
auditTrifectaCmd.Flags().BoolVar(&auditTrifectaLLMProxy, "llm-proxy", false,
"Use the running `thv llm` proxy for inference (handles auth; needs --llm-model)")
auditTrifectaCmd.PreRunE = ValidateFormat(&auditTrifectaFormat, FormatJSON, FormatText)
}

// buildTrifectaInference selects the inference strategy: the `thv llm` proxy
// when --llm-proxy is set, else an LLM when a model and base URL are supplied
// (the API key is optional), else the offline keyword search. A partial LLM
// config falls back to keywords with a warning so a missing setting never
// silently disables inference the user asked for.
func buildTrifectaInference() (toxicflow.SourceInference, error) {
// Prefer the running `thv llm` proxy when asked: it injects auth and
// forwards to the configured gateway, so no key or base URL is needed here.
if auditTrifectaLLMProxy {
return llmProxyInference()
}

key := auditTrifectaLLMKey
if key == "" {
key = os.Getenv(trifectaLLMKeyEnv)
}
model, baseURL := auditTrifectaLLMModel, auditTrifectaLLMURL

// An explicit LLM needs a model and base URL; the key is optional (for
// keyless local models). With nothing set, fall back to keyword search.
switch {
case model == "" && baseURL == "" && key == "":
return toxicflow.NewKeywordInference(), nil
case model == "" || baseURL == "":
fmt.Fprintln(os.Stderr,
"warning: incomplete LLM config (need --llm-model and --llm-base-url, or --llm-proxy); using keyword search")
return toxicflow.NewKeywordInference(), nil
}

c, err := llmclient.New(llmclient.Config{Model: model, BaseURL: baseURL, APIKey: key})
if err != nil {
return nil, fmt.Errorf("configure LLM: %w", err)
}
return toxicflow.NewLLMInference(c), nil
}

// llmProxyInference builds an inference backend that talks to the local
// `thv llm` reverse proxy. The proxy must be running (`thv llm proxy`); it
// injects the OIDC token and forwards to the configured gateway.
func llmProxyInference() (toxicflow.SourceInference, error) {
if auditTrifectaLLMModel == "" {
return nil, fmt.Errorf("--llm-model is required with --llm-proxy")
}
if auditTrifectaLLMURL != "" || auditTrifectaLLMKey != "" {
fmt.Fprintln(os.Stderr, "warning: --llm-base-url/--llm-api-key are ignored with --llm-proxy")
}

llmCfg := config.NewDefaultProvider().GetConfig().LLM
if !llmCfg.IsConfigured() {
return nil, fmt.Errorf("no thv llm gateway configured: run \"thv llm config set\" and start the proxy with \"thv llm proxy\"")
}

baseURL := llmCfg.ProxyBaseURL()
fmt.Fprintf(os.Stderr, "using thv llm proxy at %s (model %s)\n", baseURL, auditTrifectaLLMModel)

c, err := llmclient.New(llmclient.Config{Model: auditTrifectaLLMModel, BaseURL: baseURL})
if err != nil {
return nil, fmt.Errorf("configure LLM proxy: %w", err)
}
return toxicflow.NewLLMInference(c), nil
}

func auditTrifectaCmdFunc(cmd *cobra.Command, args []string) error {
ctx := cmd.Context()

if !auditTrifectaAll && len(args) == 0 {
return fmt.Errorf("specify a group name or use --all")
}

overrides, err := loadTrifectaOverrides(auditTrifectaOverrides)
if err != nil {
return err
}

inference, err := buildTrifectaInference()
if err != nil {
return err
}

collector, err := toxicflow.NewCollector(ctx, inference, auditTrifectaLive)
if err != nil {
return fmt.Errorf("failed to set up audit: %w", err)
}

// Resolve the groups to audit. Validate a named group exists so a typo is
// reported as "not found" rather than silently auditing zero servers and
// reading as a reassuring "no toxic flow".
known, err := toxicflow.ListGroupNames(ctx)
if err != nil {
return err
}
groupNames := known
if !auditTrifectaAll {
if !slices.Contains(known, args[0]) {
return fmt.Errorf("group %q not found (run \"thv group list\")", args[0])
}
groupNames = []string{args[0]}
}

results := make([]toxicflow.GroupAssessment, 0, len(groupNames))
for _, g := range groupNames {
assessment, err := collector.AssessGroup(ctx, g, overrides)
if err != nil {
return fmt.Errorf("failed to audit group %q: %w", g, err)
}
results = append(results, assessment)
}

warnUnknownOverrideTargets(overrides, results)

if auditTrifectaFormat == FormatJSON {
return printTrifectaJSON(results)
}
printTrifectaText(results)
return nil
}

// loadTrifectaOverrides reads a JSON array of overrides from path, or returns
// nil when no path is given.
func loadTrifectaOverrides(path string) ([]toxicflow.Override, error) {
if path == "" {
return nil, nil
}
data, err := os.ReadFile(path) // #nosec G304 -- path is an operator-supplied CLI flag
if err != nil {
return nil, fmt.Errorf("failed to read overrides file: %w", err)
}
var overrides []toxicflow.Override
if err := json.Unmarshal(data, &overrides); err != nil {
return nil, fmt.Errorf("failed to parse overrides file: %w", err)
}
for _, o := range overrides {
if err := toxicflow.ValidateOverride(o); err != nil {
return nil, err
}
}
return overrides, nil
}

// warnUnknownOverrideTargets prints a warning for any override that names a
// server not present in the audited groups, so a typo does not silently
// fail open.
func warnUnknownOverrideTargets(overrides []toxicflow.Override, results []toxicflow.GroupAssessment) {
seen := map[string]bool{}
for _, a := range results {
for _, s := range a.Servers {
seen[s.Name] = true
}
}
for _, o := range overrides {
if o.Server != "" && !seen[o.Server] {
fmt.Fprintf(os.Stderr, "warning: override targets server %q, which is not in the audited group(s)\n", o.Server)
}
}
}

func printTrifectaJSON(results []toxicflow.GroupAssessment) error {
if results == nil {
results = []toxicflow.GroupAssessment{}
}
enc := json.NewEncoder(os.Stdout)
enc.SetIndent("", " ")
return enc.Encode(results)
}

func printTrifectaText(results []toxicflow.GroupAssessment) {
if len(results) == 0 {
fmt.Println("No groups to audit.")
return
}
for i, a := range results {
if i > 0 {
fmt.Println()
}
printGroupAssessment(a)
}
}

func printGroupAssessment(a toxicflow.GroupAssessment) {
fmt.Printf("Group: %s verdict: %s\n", a.Group, strings.ToUpper(string(a.Verdict)))

if len(a.Servers) == 0 {
fmt.Println(" (no servers in group)")
return
}

overridden := false
w := tabwriter.NewWriter(os.Stdout, 0, 2, 2, ' ', 0)
_, _ = fmt.Fprintln(w, " SERVER\tDATA\tSOURCE\tSINK")
for _, s := range a.Servers {
_, _ = fmt.Fprintf(w, " %s\t%s\t%s\t%s\n",
s.Name,
cell(s.Finding(toxicflow.RoleData)),
cell(s.Finding(toxicflow.RoleSource)),
cell(s.Finding(toxicflow.RoleSink)))
overridden = overridden || anyOverridden(s)
}
_ = w.Flush()

if overridden {
fmt.Println(" * value set by override — re-run with --explain for the reason")
}

if auditTrifectaExplain {
printTrifectaEvidence(a)
}

fmt.Printf(" Private data : %s\n", joinOrNone(a.DataHolders))
fmt.Printf(" Untrusted intake: %s\n", joinOrNone(a.Sources))
fmt.Printf(" Exfil sink : %s\n", joinOrNone(a.Sinks))
if len(a.Unclassified) > 0 {
fmt.Printf(" Unclassified : %s (untrusted-content exposure unknown)\n",
strings.Join(a.Unclassified, ", "))
}

fmt.Printf("\n %s\n", trifectaVerdictMessage(a))
}

func printTrifectaEvidence(a toxicflow.GroupAssessment) {
fmt.Println(" evidence:")
for _, s := range a.Servers {
for _, role := range toxicflow.AllRoles {
f := s.Finding(role)
for _, ev := range f.Evidence {
fmt.Printf(" %s [%s]: %s\n", s.Name, role, ev)
}
}
}
}

// trifectaVerdictMessage returns an actionable one-liner for the verdict.
func trifectaVerdictMessage(a toxicflow.GroupAssessment) string {
switch a.Verdict {
case toxicflow.VerdictPresent:
if len(a.SelfContainedFlow) > 0 {
return fmt.Sprintf("WARNING: Toxic flow present. %s hold(s) all three roles alone — "+
"a prompt injection there could read private data and exfiltrate it. "+
"Cheapest fix: tighten that server's permission profile (restrict egress, "+
"drop mounts), no regrouping needed.", strings.Join(a.SelfContainedFlow, ", "))
}
return "WARNING: Toxic flow present. A prompt injection via an untrusted-content " +
"server could read private data and exfiltrate it. Split the group so the " +
"untrusted-content server no longer shares a context with the data/exfil servers."
case toxicflow.VerdictPossible:
return "WARNING: Possible toxic flow. All three roles are present, some at low " +
"confidence. Review the contributing servers above."
case toxicflow.VerdictIndeterminate:
return "REVIEW: Indeterminate. Data and exfil paths exist, but untrusted-content " +
"exposure could not be determined. Classify or override the unclassified " +
"servers to resolve."
case toxicflow.VerdictNone:
return "OK: No toxic flow. At least one leg is confidently absent."
default:
return ""
}
}

// cell renders a finding's confidence for the table, marking overrides with a *.
func cell(f toxicflow.RoleFinding) string {
if f.Overridden {
return string(f.Confidence) + "*"
}
return string(f.Confidence)
}

// anyOverridden reports whether any of the server's findings was overridden.
func anyOverridden(s toxicflow.ServerAssessment) bool {
for _, role := range toxicflow.AllRoles {
if s.Finding(role).Overridden {
return true
}
}
return false
}

func joinOrNone(items []string) string {
if len(items) == 0 {
return "(none)"
}
return strings.Join(items, ", ")
}
1 change: 1 addition & 0 deletions cmd/thv/app/commands.go
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@ func NewRootCmd(enableUpdates bool) *cobra.Command {
rootCmd.AddCommand(skillCmd)
rootCmd.AddCommand(statusCmd)
rootCmd.AddCommand(tuiCmd)
rootCmd.AddCommand(auditTrifectaCmd)

// Silence printing the usage on error
rootCmd.SilenceUsage = true
Expand Down
Loading
Loading