stagehand-python/stagehand/llm/prompts.py at 0b6817c2fe9586c12fe29277d2947f55c6720330 · browserbase/stagehand-python · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
from typing import Optional

from stagehand.handlers.act_handler_utils import method_handler_map
from stagehand.types.llm import ChatMessage


def build_user_instructions_string(
    user_provided_instructions: Optional[str] = None,
) -> str:
    """
    Render the optional user-supplied instructions as a prompt section.

    Args:
        user_provided_instructions: Free-form instructions from the user, if any

    Returns:
        An empty string when no instructions were given (None or empty).
        Otherwise a section that begins with two newlines, followed by a
        heading, a short note to the model, and the instructions themselves
    """
    if user_provided_instructions:
        # The two leading empty entries produce the "\n\n" that separates this
        # section from whatever prompt text precedes it.
        section_lines = (
            "",
            "",
            "# Custom Instructions Provided by the User",
            "",
            "Please keep the user's instructions in mind when performing actions. If the user's instructions are not relevant to the current task, ignore them.",
            "",
            "User Instructions:",
            f"{user_provided_instructions}",
        )
        return "\n".join(section_lines)

    return ""


# extract
def build_extract_system_prompt(
    is_using_text_extract: bool = False,
    user_provided_instructions: Optional[str] = None,
) -> ChatMessage:
    """
    Build the system prompt for extraction.

    Args:
        is_using_text_extract: When True the prompt describes a text-rendered
            webpage as the input; otherwise it describes DOM elements
        user_provided_instructions: Optional custom instructions appended as
            the last section of the prompt

    Returns:
        System prompt for extract, with all whitespace runs (including
        newlines) collapsed to single spaces
    """
    preamble = """You are extracting content on behalf of a user.
If a user asks you to extract a 'list' of information, or 'all' information,
YOU MUST EXTRACT ALL OF THE INFORMATION THAT THE USER REQUESTS.

You will be given:
1. An instruction
2. """

    # Choose every mode-dependent fragment in one place.
    if is_using_text_extract:
        second_input = "A text representation of a webpage to extract information from."
        source_name = "text-rendered webpage"
        extra_guidance = """Once you are given the text-rendered webpage,
you must thoroughly and meticulously analyze it. Be very careful to ensure that you
do not miss any important information."""
    else:
        second_input = "A list of DOM elements to extract from."
        source_name = "DOM+accessibility tree elements"
        extra_guidance = (
            "If a user is attempting to extract links or URLs, you MUST respond with ONLY the IDs of the link elements.\n"
            "Do not attempt to extract links directly from the text unless absolutely necessary. "
        )

    output_rules = (
        f"Print the exact text from the {source_name} "
        "with all symbols, characters, and endlines as is.\n"
        "Print null or an empty string if no new information is found."
    )

    sections = [
        preamble + second_input,
        output_rules,
        extra_guidance,
        build_user_instructions_string(user_provided_instructions),
    ]

    # Drop empty sections (e.g. no user instructions), join the rest, then
    # collapse every run of whitespace -- newlines included -- to one space.
    joined = "\n\n".join(section for section in sections if section)
    normalized = " ".join(joined.split())

    return ChatMessage(role="system", content=normalized)


def build_extract_user_prompt(instruction: str, tree_elements: str) -> ChatMessage:
    """
    Build the user prompt for extraction.

    Args:
        instruction: The instruction for what to extract
        tree_elements: The DOM+accessibility tree representation

    Returns:
        User prompt for extract: the instruction on the first line and the
        tree on the line(s) after it
    """
    prompt_lines = (
        f"Instruction: {instruction}",
        f"DOM+accessibility tree: {tree_elements}",
    )
    return ChatMessage(role="user", content="\n".join(prompt_lines))


def build_metadata_system_prompt() -> ChatMessage:
    """
    Build the system prompt for metadata extraction.

    The prompt asks the model to judge whether an extraction is complete and
    to answer with a "completed" boolean and a "progress" summary.

    Returns:
        System prompt for metadata
    """
    prompt_lines = (
        "You are an AI assistant that evaluates the completeness of information extraction.",
        "",
        "Given:",
        "1. An extraction instruction",
        "2. The extracted data",
        "",
        "Your task is to:",
        "1. Determine if the extraction is complete based on the instruction",
        "2. Provide a brief progress summary",
        "",
        "Please respond with:",
        '- "completed": A boolean indicating if the extraction is complete (true) or not (false)',
        '- "progress": A brief summary of the extraction progress',
        # Trailing empty entry keeps the final newline of the prompt text.
        "",
    )

    return ChatMessage(role="system", content="\n".join(prompt_lines))


def build_metadata_prompt(
    instruction: str, extracted_data: dict, chunks_seen: int, chunks_total: int
) -> ChatMessage:
    """
    Build the user prompt for metadata extraction.

    Args:
        instruction: The original extraction instruction
        extracted_data: The data that was extracted; it is interpolated into
            the prompt using its default string formatting
        chunks_seen: Number of chunks processed
        chunks_total: Total number of chunks

    Returns:
        User-role ChatMessage for metadata (not a bare string)
    """
    prompt = f"""Extraction instruction: {instruction}

Extracted data: {extracted_data}

Chunks processed: {chunks_seen} of {chunks_total}

Evaluate if this extraction is complete according to the instruction.
"""

    return ChatMessage(role="user", content=prompt)


# observe
def build_observe_system_prompt(
    user_provided_instructions: Optional[str] = None,
) -> ChatMessage:
    """
    Build the system prompt for observe.

    Args:
        user_provided_instructions: Optional custom instructions appended
            after the base prompt

    Returns:
        System prompt for observe. The base prompt has its whitespace
        collapsed to single spaces; the user-instructions section, when
        present, is appended afterwards and keeps its own newlines
    """
    tree_description = "a hierarchical accessibility tree showing the semantic structure of the page. The tree is a hybrid of the DOM and the accessibility tree."
    supported_methods = ", ".join(method_handler_map.keys())

    template = f"""
You are helping the user automate the browser by finding elements based on what the user wants to observe in the page.

You will be given:
1. an instruction of elements to observe
2. {tree_description}

Return an array of elements that match the instruction if they exist, otherwise return an empty array. Whenever suggesting actions, use supported playwright locator methods or preferably one of the following supported actions:
{supported_methods}"""

    # Flatten the base prompt to a single line before anything is appended.
    system_text = " ".join(template.split())

    custom_section = build_user_instructions_string(user_provided_instructions)
    if custom_section:
        system_text = f"{system_text}\n\n{custom_section}"

    return ChatMessage(role="system", content=system_text)


def build_observe_user_message(
    instruction: str,
    tree_elements: str,
    variables,
) -> ChatMessage:
    """
    Build the user message for observe.

    Args:
        instruction: The instruction describing what to observe
        tree_elements: The tree representation, emitted under the
            "Accessibility Tree" label
        variables: Variables referenced in the instruction; interpolated into
            the prompt as-is using their default string formatting

    Returns:
        User message for observe
    """
    tree_label = "Accessibility Tree"
    message_text = f"""instruction: {instruction}
Below are the variables that are accessible in jinja style in the instruction.
For the 'fill' and 'type' instructions, don't replace the variables in the response. For the rest of the actions please do. In the response in the arguments try and use the same jinja style variables that are in the instruction, if it is suitable.
variables: {variables}
{tree_label}: {tree_elements}
"""

    return ChatMessage(role="user", content=message_text)


def build_act_observe_prompt(
    action: str,
    supported_actions: list[str],
    variables: Optional[dict[str, str]] = None,
) -> str:
    """
    Build the instruction used to find the single most relevant element for an action.

    Args:
        action: The action the user wants to perform
        supported_actions: Method names offered to the model as examples
        variables: Optional mapping of variable names to values; only the
            names are listed in the prompt

    Returns:
        The instruction text, with a trailing sentence listing the variable
        names when at least one variable is provided
    """
    action_examples = ", ".join(supported_actions)

    base_instruction = f"""Find the most relevant element to perform an action on given the following action: {action}.
Provide an action for this element such as {action_examples}, or any other playwright locator method. Remember that to users, buttons and links look the same in most cases.
If the action is completely unrelated to a potential action to be taken on the page, return an empty array.
ONLY return one action. If multiple actions are relevant, return the most relevant one.
If the user is asking to scroll to a position on the page, e.g., 'halfway' or 0.75, etc, you must return the argument formatted as the correct percentage, e.g., '50%' or '75%', etc.
If the user is asking to scroll to the next chunk/previous chunk, choose the nextChunk/prevChunk method. No arguments are required here.
If the action implies a key press, e.g., 'press enter', 'press a', 'press space', etc., always choose the press method with the appropriate key as argument — e.g. 'a', 'Enter', 'Space'. Do not choose a click action on an on-screen keyboard. Capitalize the first character like 'Enter', 'Tab', 'Escape' only for special keys.
If the action implies choosing an option from a dropdown, AND the corresponding element is a 'select' element, choose the selectOptionFromDropdown method. The argument should be the text of the option to select.
If the action implies choosing an option from a dropdown, and the corresponding element is NOT a 'select' element, choose the click method."""

    # None and an empty mapping are both treated as "no variables".
    if not variables:
        return base_instruction

    variable_names = ", ".join(variables.keys())
    return (
        f"{base_instruction} "
        f"The following variables are available to use in the action: {variable_names}. Fill the argument variables with the variable name."
    )


def build_operator_system_prompt(goal: str) -> ChatMessage:
    """
    Build the system prompt for the operator agent.

    Args:
        goal: The user's goal, inserted under the "# Your current goal" heading

    Returns:
        System prompt describing the agent's role, the goal, and guidelines
        for issuing one atomic action per step
    """
    system_text = f"""You are a general-purpose agent whose job is to accomplish the user's goal across multiple model calls by running actions on the page.

You will be given a goal and a list of steps that have been taken so far. Your job is to determine if either the user's goal has been completed or if there are still steps that need to be taken.

# Your current goal
{goal}

# Important guidelines
1. Break down complex actions into individual atomic steps
2. For `act` commands, use only one action at a time, such as:
   - Single click on a specific element
   - Type into a single input field
   - Select a single option
3. Avoid combining multiple actions in one instruction
4. If multiple actions are needed, they should be separate steps"""

    return ChatMessage(role="system", content=system_text)