-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathllm_interaction_handler.py
More file actions
191 lines (151 loc) · 7.1 KB
/
llm_interaction_handler.py
File metadata and controls
191 lines (151 loc) · 7.1 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
"""
LLM Interaction Handler for Sentience Agent.
Handles all LLM-related operations: context building, querying, and response parsing.
This separates LLM interaction concerns from action execution.
"""
import re
from .llm_provider import LLMProvider, LLMResponse
from .models import Snapshot
class LLMInteractionHandler:
    """
    Handles LLM queries and response parsing for Sentience Agent.

    This class encapsulates all LLM interaction logic, making it easier to:
    - Test LLM interactions independently
    - Swap LLM providers without changing agent code
    - Modify prompt templates in one place
    """

    # Element text longer than this many characters is cut and suffixed with "...".
    _MAX_TEXT_PREVIEW = 50

    # Markdown code fence, optionally followed by a language tag and a newline.
    # Only applied on the fallback path of extract_action(): on an inline fence
    # such as ```CLICK(42)``` the "language tag" part would swallow "CLICK".
    _CODE_FENCE_RE = re.compile(r"```[\w]*\n?")

    # Matches: CLICK(123), TYPE(123, "text"), PRESS("key"), FINISH()
    # (case-insensitive, tolerant of whitespace around the punctuation).
    _ACTION_RE = re.compile(
        r'(CLICK\s*\(\s*\d+\s*\)|TYPE\s*\(\s*\d+\s*,\s*["\'].*?["\']\s*\)|PRESS\s*\(\s*["\'].*?["\']\s*\)|FINISH\s*\(\s*\))',
        re.IGNORECASE,
    )

    def __init__(self, llm: LLMProvider):
        """
        Initialize LLM interaction handler.

        Args:
            llm: LLM provider instance (OpenAIProvider, AnthropicProvider, etc.).
                Only its ``generate(system_prompt, user_prompt, temperature=...)``
                method is used by this class.
        """
        self.llm = llm

    def build_context(self, snap: Snapshot, goal: str | None = None) -> str:
        """
        Convert snapshot elements to token-efficient prompt string.

        Format: [ID] <role> "text" {cues} @ (x,y) size:WxH importance:score [status]

        Elements are emitted in the order they appear in ``snap.elements``;
        no sorting is done here. Elements whose ``diff_status`` is "REMOVED"
        are skipped because they are not actionable.

        Args:
            snap: Snapshot object
            goal: Optional user goal (currently unused, kept for API consistency)

        Returns:
            Formatted element context string, one element per line
            (empty string when there are no elements to show).
        """
        return "\n".join(
            self._format_element(el)
            for el in snap.elements
            if el.diff_status != "REMOVED"
        )

    @classmethod
    def _format_element(cls, el) -> str:
        """Render one snapshot element as a single context line."""
        # Visual cues, rendered as " {A,B,...}" or omitted entirely when there are none.
        visual = el.visual_cues
        cues: list[str] = []
        if visual.is_primary:
            cues.append("PRIMARY")
        if visual.is_clickable:
            cues.append("CLICKABLE")
        if visual.background_color_name:
            cues.append(f"color:{visual.background_color_name}")
        cues_str = f" {{{','.join(cues)}}}" if cues else ""

        # Quoted text preview; long text is truncated with a visible "..." marker.
        text_preview = ""
        if el.text:
            if len(el.text) > cls._MAX_TEXT_PREVIEW:
                text_preview = f'"{el.text[:cls._MAX_TEXT_PREVIEW]}..."'
            else:
                text_preview = f'"{el.text}"'

        # Position and size are truncated to whole pixels.
        box = el.bbox
        position_str = f"@ ({int(box.x)},{int(box.y)})"
        size_str = f"size:{int(box.width)}x{int(box.height)}"

        # Status flags are only included when relevant.
        status_parts = []
        if not el.in_viewport:
            status_parts.append("not_in_viewport")
        if el.is_occluded:
            status_parts.append("occluded")
        if el.diff_status:
            status_parts.append(f"diff:{el.diff_status}")
        status_str = f" [{','.join(status_parts)}]" if status_parts else ""

        return (
            f'[{el.id}] <{el.role}> {text_preview}{cues_str} '
            f"{position_str} {size_str} importance:{el.importance}{status_str}"
        )

    def query_llm(self, dom_context: str, goal: str) -> LLMResponse:
        """
        Query LLM with standardized prompt template.

        The goal and element context are embedded in the system prompt; the
        user prompt is a fixed one-line request. The provider is called with
        ``temperature=0.0``.

        NOTE(review): the prompt tells the model the elements are "sorted by
        importance", but build_context() keeps the snapshot's own order -
        presumably the snapshot arrives pre-sorted; confirm with the producer.

        Args:
            dom_context: Formatted element context from build_context()
            goal: User goal

        Returns:
            LLMResponse from LLM provider
        """
        # The prompt lines are intentionally flush-left so that no source
        # indentation leaks into the text sent to the model.
        system_prompt = f"""You are an AI web automation agent.
GOAL: {goal}
VISIBLE ELEMENTS (sorted by importance):
{dom_context}
VISUAL CUES EXPLAINED:
After the text, you may see visual cues in curly braces like {{CLICKABLE}} or {{PRIMARY,CLICKABLE,color:white}}:
- PRIMARY: Main call-to-action element on the page
- CLICKABLE: Element is clickable/interactive
- color:X: Background color name (e.g., color:white, color:blue)
Multiple cues are comma-separated inside the braces: {{CLICKABLE,color:white}}
ELEMENT FORMAT EXPLAINED:
Each element line follows this format:
[ID] <role> "text" {{cues}} @ (x,y) size:WxH importance:score [status]
Example: [346] <button> "Computer Accessories" {{CLICKABLE,color:white}} @ (664,100) size:150x40 importance:811
Breaking down each part:
- [ID]: The number in brackets is the element ID - use this EXACT number in CLICK/TYPE commands
Example: If you see [346], use CLICK(346) or TYPE(346, "text")
- <role>: Element type (button, link, textbox, etc.)
- "text": Visible text content (truncated with "..." if long)
- {{cues}}: Optional visual cues in curly braces (e.g., {{CLICKABLE}}, {{PRIMARY,CLICKABLE}}, {{CLICKABLE,color:white}})
If no cues, this part is omitted entirely
- @ (x,y): Element position in pixels from top-left corner
- size:WxH: Element dimensions (width x height in pixels)
- importance: Score indicating element relevance (higher = more important)
- [status]: Optional status flags in brackets (not_in_viewport, occluded, diff:ADDED/MODIFIED/etc)
CRITICAL RESPONSE FORMAT:
You MUST respond with ONLY ONE of these exact action formats:
- CLICK(id) - Click element by ID (use the number from [ID] brackets)
- TYPE(id, "text") - Type text into element (use the number from [ID] brackets)
- PRESS("key") - Press keyboard key (Enter, Escape, Tab, ArrowDown, etc)
- FINISH() - Task complete
DO NOT include any explanation, reasoning, or natural language.
DO NOT use markdown formatting or code blocks.
DO NOT say "The next step is..." or anything similar.
CORRECT Examples (matching element IDs from the list above):
If element is [346] <button> "Click me" → respond: CLICK(346)
If element is [15] <textbox> "Search" → respond: TYPE(15, "magic mouse")
PRESS("Enter")
FINISH()
INCORRECT Examples (DO NOT DO THIS):
"The next step is to click..."
"I will type..."
```CLICK(42)```
"""
        user_prompt = "Return the single action command:"
        return self.llm.generate(system_prompt, user_prompt, temperature=0.0)

    def extract_action(self, response: str) -> str:
        """
        Extract action command from LLM response.

        Handles cases where the LLM adds extra explanation or markdown code
        fences despite instructions.

        Args:
            response: Raw LLM response text

        Returns:
            The first action command found, exactly as the model wrote it
            (e.g., 'CLICK(42)', 'TYPE(15, "text")'). If no action pattern is
            present, the response with code fences removed and surrounding
            whitespace stripped (which will likely fail downstream parsing).
        """
        # Search the raw text first. Stripping fences before searching is
        # unsafe: for an inline fence like ```CLICK(42)``` the fence regex
        # treats "CLICK" as a language tag and deletes it, leaving "(42)".
        match = self._ACTION_RE.search(response)
        if match:
            return match.group(1)

        # No recognizable action: hand back a cleaned-up version of the text.
        return self._CODE_FENCE_RE.sub("", response).strip()