|
| 1 | +"""Classify agent actions by matching content against known threat signatures. |
| 2 | +
|
| 3 | +When an agent is about to run ``rm -rf /``, you want to catch it. When |
| 4 | +the agent merely *thinks about* ``rm -rf /`` while running ``ls /tmp``, |
| 5 | +you do not. This module solves that with two scanning corpora: |
| 6 | +
|
| 7 | +- **Executable corpus** (tool_name, tool_call arguments): scanned for |
| 8 | + shell-destructive, code-execution, and network-to-exec patterns. |
| 9 | +- **All-field corpus** (executable + thought/reasoning/summary): scanned |
| 10 | + for injection and social-engineering patterns that are dangerous |
| 11 | + wherever they appear. |
| 12 | +
|
| 13 | +Each pattern carries a stable detector ID for telemetry readiness. |
| 14 | +""" |
| 15 | + |
| 16 | +from __future__ import annotations |
| 17 | + |
| 18 | +import re |
| 19 | +from typing import Any |
| 20 | + |
| 21 | +from pydantic import Field, PrivateAttr |
| 22 | + |
| 23 | +from openhands.sdk.event import ActionEvent |
| 24 | +from openhands.sdk.logger import get_logger |
| 25 | +from openhands.sdk.security.analyzer import SecurityAnalyzerBase |
| 26 | +from openhands.sdk.security.defense_in_depth.utils import ( |
| 27 | + _extract_content, |
| 28 | + _extract_exec_content, |
| 29 | + _normalize, |
| 30 | +) |
| 31 | +from openhands.sdk.security.risk import SecurityRisk |
| 32 | + |
| 33 | + |
# Module-level logger; security_risk() uses it to report the detector ID of
# the first matching pattern at DEBUG level.
logger = get_logger(__name__)
| 35 | + |
# ---------------------------------------------------------------------------
# Stable detector IDs -- do not change between releases without documentation.
#
# Constant names follow DET_{CORPUS}_{FAMILY}_{SPECIFIC}; the injection IDs
# carry no family segment (DET_INJECT_{SPECIFIC}).
# The string values are the dotted identifiers attached to each pattern tuple
# and written to the debug log when that pattern matches.
# ---------------------------------------------------------------------------

# Destructive filesystem operations
DET_EXEC_DESTRUCT_RM_RF = "exec.destruct.rm_rf"
DET_EXEC_DESTRUCT_SUDO_RM = "exec.destruct.sudo_rm"
DET_EXEC_DESTRUCT_MKFS = "exec.destruct.mkfs"
DET_EXEC_DESTRUCT_DD = "exec.destruct.dd_raw_disk"
# Dynamic code / command invocation
DET_EXEC_CODE_EVAL = "exec.code.eval_call"
DET_EXEC_CODE_EXEC = "exec.code.exec_call"
DET_EXEC_CODE_OS_SYSTEM = "exec.code.os_system"
DET_EXEC_CODE_SUBPROCESS = "exec.code.subprocess"
# Network access: piped into a shell (*_EXEC) or plain HTTP(S) request
DET_EXEC_NET_CURL_EXEC = "exec.net.curl_pipe_exec"
DET_EXEC_NET_WGET_EXEC = "exec.net.wget_pipe_exec"
DET_EXEC_NET_CURL = "exec.net.curl"
DET_EXEC_NET_WGET = "exec.net.wget"
# Prompt-injection / social-engineering text
DET_INJECT_OVERRIDE = "inject.override"
DET_INJECT_MODE_SWITCH = "inject.mode_switch"
DET_INJECT_IDENTITY = "inject.identity"
| 56 | + |
# ---------------------------------------------------------------------------
# Pattern definitions
#
# Format: (regex_pattern, description, detector_id)
#
# Pattern design constraints:
# - No unbounded .* or .+ around alternations (catastrophic backtracking)
# - Risky spans are bounded ({0,N}) to prevent ReDoS
# - \s* and \w+ are acceptable in non-alternation positions
# - \b-anchored to avoid substring matches
# - Case-insensitivity is not written into the patterns themselves;
#   PatternSecurityAnalyzer compiles every pattern with re.IGNORECASE
# ---------------------------------------------------------------------------

# HIGH-risk signatures; PatternSecurityAnalyzer searches these in the
# executable corpus only.
DEFAULT_HIGH_PATTERNS: list[tuple[str, str, str]] = [
    # Destructive filesystem operations
    (
        # Alternatives, in order: one flag group with two or more of f/r/R
        # (-rf, -fr, -Rf, ...), split short flags (-r -f / -f -r), and the
        # two long-option orderings.
        r"\brm\s+(?:-[frR]{2,}|-[rR]\s+-f|-f\s+-[rR]"
        r"|--recursive\s+--force|--force\s+--recursive)\b",
        "Recursive force-delete (rm -rf variants)",
        DET_EXEC_DESTRUCT_RM_RF,
    ),
    (r"\bsudo\s+rm\b", "Privileged file deletion", DET_EXEC_DESTRUCT_SUDO_RM),
    # Requires the dotted form (mkfs.<type>); a bare "mkfs" does not match.
    (r"\bmkfs\.\w+", "Filesystem format command", DET_EXEC_DESTRUCT_MKFS),
    # At most 100 characters may separate "dd" from "of=/dev/".
    (r"\bdd\b.{0,100}of=/dev/", "Raw disk write", DET_EXEC_DESTRUCT_DD),
    # Code invocation via dynamic interpreters
    (r"\beval\s*\(", "Dynamic code evaluation", DET_EXEC_CODE_EVAL),
    (r"\bexec\s*\(", "Dynamic code execution", DET_EXEC_CODE_EXEC),
    (r"\bos\.system\s*\(", "OS-level command execution", DET_EXEC_CODE_OS_SYSTEM),
    (
        r"\bsubprocess\.(?:call|run|Popen|check_output|check_call)\s*\(",
        "Subprocess invocation",
        DET_EXEC_CODE_SUBPROCESS,
    ),
    # Download-and-run
    # [^|]{0,200} bounds the gap and stops at the first pipe, so only a
    # command whose first pipe feeds sh/bash matches.
    (
        r"\bcurl\b[^|]{0,200}\|\s*(?:ba)?sh\b",
        "Download and run (curl | sh)",
        DET_EXEC_NET_CURL_EXEC,
    ),
    (
        r"\bwget\b[^|]{0,200}\|\s*(?:ba)?sh\b",
        "Download and run (wget | sh)",
        DET_EXEC_NET_WGET_EXEC,
    ),
]
| 102 | + |
# MEDIUM-risk signatures; PatternSecurityAnalyzer searches these in the
# executable corpus only, and only after every HIGH pattern has failed to
# match, so a command that also hits a HIGH pattern is reported as HIGH.
DEFAULT_MEDIUM_PATTERNS: list[tuple[str, str, str]] = [
    # Network access without invocation pipe
    # An http:// or https:// URL must appear within 100 characters of the
    # command name.
    (r"\bcurl\b.{0,100}https?://", "HTTP request via curl", DET_EXEC_NET_CURL),
    (r"\bwget\b.{0,100}https?://", "Download via wget", DET_EXEC_NET_WGET),
]
| 108 | + |
# Injection patterns: scanned against ALL fields (invocation + reasoning).
# These are textual attacks targeting instruction-following, not the OS.

# HIGH-risk injection signatures.
DEFAULT_INJECTION_HIGH_PATTERNS: list[tuple[str, str, str]] = [
    (
        # Shape: <verb> [all] <previous|prior|above> <instruction-like noun>,
        # e.g. "ignore all previous instructions". Only whitespace may
        # separate the words.
        r"\b(?:ignore|disregard|forget|override|bypass)\s+(?:all\s+)?"
        r"(?:previous|prior|above)\s+(?:instructions?|prompts?|rules?|directives?)\b",
        "Instruction override attempt",
        DET_INJECT_OVERRIDE,
    ),
]
| 120 | + |
# MEDIUM-risk injection signatures; PatternSecurityAnalyzer searches these in
# the all-field corpus as the last stage before falling back to LOW.
DEFAULT_INJECTION_MEDIUM_PATTERNS: list[tuple[str, str, str]] = [
    (
        # "you are now [in] [<one word>] mode", e.g. "you are now in debug mode".
        r"\byou\s+are\s+now\s+(?:in\s+)?(?:\w+\s+)?mode\b",
        "Mode switching attempt",
        DET_INJECT_MODE_SWITCH,
    ),
    (
        # "pretend you are / to be [a] different ..."
        r"\bpretend\s+(?:you\s+are|to\s+be)\s+(?:a\s+)?different\b",
        "Identity manipulation",
        DET_INJECT_IDENTITY,
    ),
]
| 133 | + |
| 134 | + |
| 135 | +# --------------------------------------------------------------------------- |
| 136 | +# PatternSecurityAnalyzer |
| 137 | +# --------------------------------------------------------------------------- |
| 138 | + |
| 139 | + |
class PatternSecurityAnalyzer(SecurityAnalyzerBase):
    """Rate an agent action by searching its text for known threat signatures.

    The analyzer is deterministic and purely local: it runs compiled regular
    expressions over two text corpora derived from the action and returns
    ``SecurityRisk.HIGH``, ``SecurityRisk.MEDIUM`` or ``SecurityRisk.LOW``.
    Combine it with ``ConfirmRisky`` to decide which results need
    confirmation.

    Two corpora are scanned:

    * the *executable* corpus (``_extract_exec_content``) -- searched with
      ``high_patterns`` and ``medium_patterns``, so destructive-command
      signatures only fire on what the agent is about to run;
    * the *all-field* corpus (``_extract_content``) -- searched with
      ``injection_high_patterns`` and ``injection_medium_patterns``, because
      text such as "ignore all previous instructions" is treated as dangerous
      wherever it shows up.

    Both corpora always go through ``_normalize`` before matching; the module
    describes that step as collapsing invisible characters and fullwidth
    substitutions.

    Each pattern list holds ``(regex, description, detector_id)`` tuples. The
    regexes are compiled once, case-insensitively, when the model is
    initialised.

    Example::

        from openhands.sdk.security import PatternSecurityAnalyzer, ConfirmRisky
        from openhands.sdk.security.risk import SecurityRisk

        analyzer = PatternSecurityAnalyzer()
        policy = ConfirmRisky(threshold=SecurityRisk.MEDIUM)
    """

    # --- configurable pattern tables: (regex, description, detector_id) ---
    high_patterns: list[tuple[str, str, str]] = Field(
        default_factory=lambda: list(DEFAULT_HIGH_PATTERNS),
        description="HIGH patterns scanned against executable fields only",
    )
    medium_patterns: list[tuple[str, str, str]] = Field(
        default_factory=lambda: list(DEFAULT_MEDIUM_PATTERNS),
        description="MEDIUM patterns scanned against executable fields only",
    )
    injection_high_patterns: list[tuple[str, str, str]] = Field(
        default_factory=lambda: list(DEFAULT_INJECTION_HIGH_PATTERNS),
        description="HIGH patterns scanned against all fields",
    )
    injection_medium_patterns: list[tuple[str, str, str]] = Field(
        default_factory=lambda: list(DEFAULT_INJECTION_MEDIUM_PATTERNS),
        description="MEDIUM patterns scanned against all fields",
    )

    # --- compiled counterparts, filled in by model_post_init ---
    _compiled_high: list[tuple[re.Pattern[str], str, str]] = PrivateAttr(
        default_factory=list,
    )
    _compiled_medium: list[tuple[re.Pattern[str], str, str]] = PrivateAttr(
        default_factory=list,
    )
    _compiled_injection_high: list[tuple[re.Pattern[str], str, str]] = PrivateAttr(
        default_factory=list,
    )
    _compiled_injection_medium: list[tuple[re.Pattern[str], str, str]] = PrivateAttr(
        default_factory=list,
    )

    def model_post_init(self, __context: Any) -> None:
        """Turn the four configured pattern lists into compiled regex tables.

        Every regex is compiled with ``re.IGNORECASE``; the description and
        detector ID of each entry are carried over unchanged.
        """

        def compile_table(
            specs: list[tuple[str, str, str]],
        ) -> list[tuple[re.Pattern[str], str, str]]:
            table = []
            for regex, description, detector_id in specs:
                compiled = re.compile(regex, re.IGNORECASE)
                table.append((compiled, description, detector_id))
            return table

        self._compiled_high = compile_table(self.high_patterns)
        self._compiled_medium = compile_table(self.medium_patterns)
        self._compiled_injection_high = compile_table(self.injection_high_patterns)
        self._compiled_injection_medium = compile_table(
            self.injection_medium_patterns
        )

    def security_risk(self, action: ActionEvent) -> SecurityRisk:
        """Return the risk level of ``action`` from the first matching pattern.

        Stages run in a fixed order and the first hit wins: HIGH patterns on
        the executable corpus, HIGH injection patterns on the all-field
        corpus, MEDIUM patterns on the executable corpus, MEDIUM injection
        patterns on the all-field corpus. The detector ID of the hit is
        logged at DEBUG level. With no hit, or when both corpora are empty,
        the result is ``SecurityRisk.LOW``.
        """
        executable_text = _normalize(_extract_exec_content(action))
        full_text = _normalize(_extract_content(action))

        # Nothing to scan at all.
        if not (executable_text or full_text):
            return SecurityRisk.LOW

        # (compiled table, corpus to search, verdict, log format) in priority
        # order; the order determines which detector is reported.
        stages = (
            (
                self._compiled_high,
                executable_text,
                SecurityRisk.HIGH,
                "Pattern matched: %s -> HIGH",
            ),
            (
                self._compiled_injection_high,
                full_text,
                SecurityRisk.HIGH,
                "Pattern matched: %s -> HIGH",
            ),
            (
                self._compiled_medium,
                executable_text,
                SecurityRisk.MEDIUM,
                "Pattern matched: %s -> MEDIUM",
            ),
            (
                self._compiled_injection_medium,
                full_text,
                SecurityRisk.MEDIUM,
                "Pattern matched: %s -> MEDIUM",
            ),
        )

        for table, corpus, verdict, log_format in stages:
            for regex, _description, detector_id in table:
                if regex.search(corpus) is not None:
                    logger.debug(log_format, detector_id)
                    return verdict

        return SecurityRisk.LOW
0 commit comments