@@ -237,7 +237,7 @@ def act( # noqa: C901
237237 self ._track_tokens (goal , llm_response )
238238
239239 # Parse action from LLM response
240- action_str = llm_response .content . strip ( )
240+ action_str = self . _extract_action_from_response ( llm_response .content )
241241
242242 # 4. EXECUTE: Parse and run action
243243 result_dict = self ._execute_action (action_str , filtered_snap )
@@ -395,6 +395,34 @@ def _build_context(self, snap: Snapshot, goal: str) -> str:
395395
396396 return "\n " .join (lines )
397397
398+ def _extract_action_from_response (self , response : str ) -> str :
399+ """
400+ Extract action command from LLM response, handling cases where
401+ the LLM adds extra explanation despite instructions.
402+
403+ Args:
404+ response: Raw LLM response text
405+
406+ Returns:
407+ Cleaned action command string
408+ """
409+ import re
410+
411+ # Remove markdown code blocks if present
412+ response = re .sub (r"```[\w]*\n?" , "" , response )
413+ response = response .strip ()
414+
415+ # Try to find action patterns in the response
416+ # Pattern matches: CLICK(123), TYPE(123, "text"), PRESS("key"), FINISH()
417+ action_pattern = r'(CLICK\s*\(\s*\d+\s*\)|TYPE\s*\(\s*\d+\s*,\s*["\'].*?["\']\s*\)|PRESS\s*\(\s*["\'].*?["\']\s*\)|FINISH\s*\(\s*\))'
418+
419+ match = re .search (action_pattern , response , re .IGNORECASE )
420+ if match :
421+ return match .group (1 )
422+
423+ # If no pattern match, return the original response (will likely fail parsing)
424+ return response
425+
398426 def _query_llm (self , dom_context : str , goal : str ) -> LLMResponse :
399427 """
400428 Query LLM with standardized prompt template
@@ -418,23 +446,30 @@ def _query_llm(self, dom_context: str, goal: str) -> LLMResponse:
418446- {{CLICKABLE}}: Element is clickable
419447- {{color:X}}: Background color name
420448
421- RESPONSE FORMAT:
422- Return ONLY the function call, no explanation or markdown.
423-
424- Available actions:
449+ CRITICAL RESPONSE FORMAT:
450+ You MUST respond with ONLY ONE of these exact action formats:
425451- CLICK(id) - Click element by ID
426452- TYPE(id, "text") - Type text into element
427453- PRESS("key") - Press keyboard key (Enter, Escape, Tab, ArrowDown, etc)
428454- FINISH() - Task complete
429455
430- Examples:
431- - CLICK(42)
432- - TYPE(15, "magic mouse")
433- - PRESS("Enter")
434- - FINISH()
456+ DO NOT include any explanation, reasoning, or natural language.
457+ DO NOT use markdown formatting or code blocks.
458+ DO NOT say "The next step is..." or anything similar.
459+
460+ CORRECT Examples:
461+ CLICK(42)
462+ TYPE(15, "magic mouse")
463+ PRESS("Enter")
464+ FINISH()
465+
466+ INCORRECT Examples (DO NOT DO THIS):
467+ "The next step is to click..."
468+ "I will type..."
469+ ```CLICK(42)```
435470"""
436471
437- user_prompt = "What is the next step to achieve the goal? "
472+ user_prompt = "Return the single action command: "
438473
439474 return self .llm .generate (system_prompt , user_prompt , temperature = 0.0 )
440475
0 commit comments