predicate-runtime-python/tests/test_agent.py at d350fe29484b83b750f102a3f39cb09bef001f48 · PredicateSystems/predicate-runtime-python · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
"""
Unit tests for Sentience Agent Layer (Phase 1)
Tests LLM providers and SentienceAgent without requiring browser
"""

from unittest.mock import Mock, patch

import pytest

from sentience.agent import SentienceAgent
from sentience.llm_provider import AnthropicProvider, LLMProvider, LLMResponse, OpenAIProvider
from sentience.models import BBox, Element, Snapshot, Viewport, VisualCues


class MockLLMProvider(LLMProvider):
    """Mock LLM provider for testing"""

    def __init__(self, responses=None):
        self.responses = responses or []
        self.call_count = 0
        self.calls = []

    def generate(self, system_prompt: str, user_prompt: str, **kwargs):
        self.calls.append({"system": system_prompt, "user": user_prompt, "kwargs": kwargs})

        if self.responses:
            response = self.responses[self.call_count % len(self.responses)]
        else:
            response = "CLICK(1)"

        self.call_count += 1

        return LLMResponse(
            content=response,
            prompt_tokens=100,
            completion_tokens=20,
            total_tokens=120,
            model_name="mock-model",
        )

    def supports_json_mode(self) -> bool:
        return True

    @property
    def model_name(self) -> str:
        return "mock-model"


# ========== LLM Provider Tests ==========


def test_llm_response_dataclass():
    """Test LLMResponse dataclass creation"""
    response = LLMResponse(
        content="CLICK(42)",
        prompt_tokens=100,
        completion_tokens=20,
        total_tokens=120,
        model_name="gpt-4o",
    )

    assert response.content == "CLICK(42)"
    assert response.prompt_tokens == 100
    assert response.completion_tokens == 20
    assert response.total_tokens == 120
    assert response.model_name == "gpt-4o"


def test_mock_llm_provider():
    """Test mock LLM provider"""
    provider = MockLLMProvider(responses=["CLICK(1)", 'TYPE(2, "test")'])

    # First call
    response1 = provider.generate("system", "user")
    assert response1.content == "CLICK(1)"
    assert provider.call_count == 1

    # Second call
    response2 = provider.generate("system", "user")
    assert response2.content == 'TYPE(2, "test")'
    assert provider.call_count == 2

    # Check calls were recorded
    assert len(provider.calls) == 2
    assert provider.calls[0]["system"] == "system"


@pytest.mark.skipif(True, reason="Requires openai package and API key")
def test_openai_provider_init():
    """Test OpenAI provider initialization (skipped without API key)"""
    provider = OpenAIProvider(api_key="test-key", model="gpt-4o")
    assert provider.model_name == "gpt-4o"
    assert provider.supports_json_mode() is True


@pytest.mark.skipif(True, reason="Requires anthropic package and API key")
def test_anthropic_provider_init():
    """Test Anthropic provider initialization (skipped without API key)"""
    provider = AnthropicProvider(api_key="test-key", model="claude-3-sonnet")
    assert provider.model_name == "claude-3-sonnet"
    assert provider.supports_json_mode() is False


# ========== SentienceAgent Tests ==========


def create_mock_browser():
    """Create mock browser for testing"""
    browser = Mock()
    browser.page = Mock()
    browser.page.url = "https://example.com"
    return browser


def create_mock_snapshot():
    """Create mock snapshot with test elements"""
    elements = [
        Element(
            id=1,
            role="button",
            text="Click Me",
            importance=900,
            bbox=BBox(x=100, y=200, width=80, height=30),
            visual_cues=VisualCues(
                is_primary=True, is_clickable=True, background_color_name="blue"
            ),
            in_viewport=True,
            is_occluded=False,
            z_index=10,
        ),
        Element(
            id=2,
            role="textbox",
            text="",
            importance=850,
            bbox=BBox(x=100, y=100, width=200, height=40),
            visual_cues=VisualCues(is_primary=False, is_clickable=True, background_color_name=None),
            in_viewport=True,
            is_occluded=False,
            z_index=5,
        ),
    ]

    return Snapshot(
        status="success",
        timestamp="2024-12-24T10:00:00Z",
        url="https://example.com",
        viewport=Viewport(width=1920, height=1080),
        elements=elements,
    )


def test_agent_initialization():
    """Test SentienceAgent initialization"""
    browser = create_mock_browser()
    llm = MockLLMProvider()

    agent = SentienceAgent(browser, llm, default_snapshot_limit=50, verbose=False)

    assert agent.browser == browser
    assert agent.llm == llm
    assert agent.default_snapshot_limit == 50
    assert agent.verbose is False
    assert len(agent.history) == 0
    # Test new get_token_stats() method
    stats = agent.get_token_stats()
    assert stats.total_tokens == 0


def test_agent_build_context():
    """Test context building from snapshot"""
    browser = create_mock_browser()
    llm = MockLLMProvider()
    agent = SentienceAgent(browser, llm, verbose=False)

    snap = create_mock_snapshot()
    context = agent.llm_handler.build_context(snap, "test goal")

    # Should contain both elements
    assert "[1]" in context
    assert "[2]" in context
    assert "button" in context
    assert "textbox" in context
    assert "Click Me" in context
    assert "PRIMARY" in context
    assert "CLICKABLE" in context
    assert "color:blue" in context
    assert "importance:900" in context


def test_agent_execute_click_action():
    """Test parsing and executing CLICK action"""
    browser = create_mock_browser()
    llm = MockLLMProvider()
    agent = SentienceAgent(browser, llm, verbose=False)

    snap = create_mock_snapshot()

    # Mock click function via ActionExecutor
    with patch("sentience.action_executor.click") as mock_click:
        from sentience.models import ActionResult

        mock_click.return_value = ActionResult(
            success=True, duration_ms=150, outcome="dom_updated", url_changed=False
        )

        result = agent.action_executor.execute("CLICK(1)", snap)

        assert result["success"] is True
        assert result["action"] == "click"
        assert result["element_id"] == 1
        mock_click.assert_called_once_with(browser, 1)


def test_agent_execute_type_action():
    """Test parsing and executing TYPE action"""
    browser = create_mock_browser()
    llm = MockLLMProvider()
    agent = SentienceAgent(browser, llm, verbose=False)

    snap = create_mock_snapshot()

    # Mock type_text function via ActionExecutor
    with patch("sentience.action_executor.type_text") as mock_type:
        from sentience.models import ActionResult

        mock_type.return_value = ActionResult(success=True, duration_ms=200, outcome="dom_updated")

        result = agent.action_executor.execute('TYPE(2, "hello world")', snap)

        assert result["success"] is True
        assert result["action"] == "type"
        assert result["element_id"] == 2
        assert result["text"] == "hello world"
        mock_type.assert_called_once_with(browser, 2, "hello world")


def test_agent_execute_press_action():
    """Test parsing and executing PRESS action"""
    browser = create_mock_browser()
    llm = MockLLMProvider()
    agent = SentienceAgent(browser, llm, verbose=False)

    snap = create_mock_snapshot()

    # Mock press function via ActionExecutor
    with patch("sentience.action_executor.press") as mock_press:
        from sentience.models import ActionResult

        mock_press.return_value = ActionResult(success=True, duration_ms=50, outcome="dom_updated")

        result = agent.action_executor.execute('PRESS("Enter")', snap)

        assert result["success"] is True
        assert result["action"] == "press"
        assert result["key"] == "Enter"
        mock_press.assert_called_once_with(browser, "Enter")


def test_agent_execute_finish_action():
    """Test parsing FINISH action"""
    browser = create_mock_browser()
    llm = MockLLMProvider()
    agent = SentienceAgent(browser, llm, verbose=False)

    snap = create_mock_snapshot()
    result = agent.action_executor.execute("FINISH()", snap)

    assert result["success"] is True
    assert result["action"] == "finish"


def test_agent_execute_invalid_action():
    """Test handling of invalid action format"""
    browser = create_mock_browser()
    llm = MockLLMProvider()
    agent = SentienceAgent(browser, llm, verbose=False)

    snap = create_mock_snapshot()

    with pytest.raises(ValueError, match="Unknown action format"):
        agent.action_executor.execute("INVALID_ACTION", snap)


def test_agent_act_full_cycle():
    """Test full act() cycle with mocked dependencies"""
    browser = create_mock_browser()
    llm = MockLLMProvider(responses=["CLICK(1)"])
    agent = SentienceAgent(browser, llm, verbose=False)

    # Mock snapshot and click
    with (
        patch("sentience.agent.snapshot") as mock_snapshot,
        patch("sentience.action_executor.click") as mock_click,
    ):
        from sentience.models import ActionResult

        mock_snapshot.return_value = create_mock_snapshot()
        mock_click.return_value = ActionResult(success=True, duration_ms=150, outcome="dom_updated")

        result = agent.act("Click the button", max_retries=0)

        # Test new dataclass return type (with backward compatible dict access)
        assert result.success is True
        assert result.action == "click"
        assert result.element_id == 1
        assert result.goal == "Click the button"

        # Also test backward compatible dict-style access (shows deprecation warning)
        assert result["success"] is True
        assert result["action"] == "click"

        # Check history was recorded
        assert len(agent.history) == 1
        assert agent.history[0]["goal"] == "Click the button"

        # Check tokens were tracked using new method
        stats = agent.get_token_stats()
        assert stats.total_tokens > 0


def test_agent_token_tracking():
    """Test token usage tracking"""
    browser = create_mock_browser()
    llm = MockLLMProvider()
    agent = SentienceAgent(browser, llm, verbose=False)

    # Simulate multiple actions
    response1 = LLMResponse(
        content="CLICK(1)",
        prompt_tokens=100,
        completion_tokens=20,
        total_tokens=120,
        model_name="mock-model",
    )
    response2 = LLMResponse(
        content='TYPE(2, "test")',
        prompt_tokens=150,
        completion_tokens=30,
        total_tokens=180,
        model_name="mock-model",
    )

    agent._track_tokens("goal 1", response1)
    agent._track_tokens("goal 2", response2)

    # Test new TokenStats dataclass return type
    stats = agent.get_token_stats()
    assert stats.total_prompt_tokens == 250
    assert stats.total_completion_tokens == 50
    assert stats.total_tokens == 300
    assert len(stats.by_action) == 2
    assert stats.by_action[0].goal == "goal 1"
    assert stats.by_action[0].model == "mock-model"


def test_agent_clear_history():
    """Test clearing history and token stats"""
    browser = create_mock_browser()
    llm = MockLLMProvider()
    agent = SentienceAgent(browser, llm, verbose=False)

    # Add some history
    agent.history.append(
        {
            "goal": "test",
            "action": "test",
            "result": {},
            "success": True,
            "attempt": 0,
            "duration_ms": 0,
        }
    )
    agent._token_usage_raw["total_tokens"] = 100

    agent.clear_history()

    assert len(agent.history) == 0
    stats = agent.get_token_stats()
    assert stats.total_tokens == 0


def test_agent_retry_on_failure():
    """Test retry logic on action failure"""
    browser = create_mock_browser()
    llm = MockLLMProvider(responses=["CLICK(999)"])  # Invalid element ID
    agent = SentienceAgent(browser, llm, verbose=False)

    # Mock snapshot and click (click will fail)
    with (
        patch("sentience.agent.snapshot") as mock_snapshot,
        patch("sentience.action_executor.click") as mock_click,
    ):
        mock_snapshot.return_value = create_mock_snapshot()
        # Simulate click failure
        mock_click.side_effect = RuntimeError("Element not found")

        with pytest.raises(RuntimeError, match="Failed after 2 retries"):
            agent.act("Click invalid element", max_retries=2)

        # Should have attempted 3 times (initial + 2 retries)
        assert mock_click.call_count == 3


def test_agent_action_parsing_variations():
    """Test various action string format variations"""
    browser = create_mock_browser()
    llm = MockLLMProvider()
    agent = SentienceAgent(browser, llm, verbose=False)

    snap = create_mock_snapshot()

    with (
        patch("sentience.action_executor.click") as mock_click,
        patch("sentience.action_executor.type_text") as mock_type,
        patch("sentience.action_executor.press") as mock_press,
    ):
        from sentience.models import ActionResult

        mock_result = ActionResult(success=True, duration_ms=100, outcome="dom_updated")
        mock_click.return_value = mock_result
        mock_type.return_value = mock_result
        mock_press.return_value = mock_result

        # Test variations
        agent.action_executor.execute("click(1)", snap)  # lowercase
        agent.action_executor.execute("CLICK( 1 )", snap)  # extra spaces
        agent.action_executor.execute("TYPE(2, 'single quotes')", snap)  # single quotes
        agent.action_executor.execute("PRESS('Enter')", snap)  # single quotes
        agent.action_executor.execute("finish()", snap)  # lowercase finish

        assert mock_click.call_count == 2
        assert mock_type.call_count == 1
        assert mock_press.call_count == 1


def test_agent_extract_action_from_llm_response():
    """Test extraction of action commands from LLM responses with extra text"""
    browser = create_mock_browser()
    llm = MockLLMProvider()
    agent = SentienceAgent(browser, llm, verbose=False)

    # Test clean action (should pass through)
    assert agent.llm_handler.extract_action("CLICK(42)") == "CLICK(42)"
    assert agent.llm_handler.extract_action('TYPE(15, "test")') == 'TYPE(15, "test")'
    assert agent.llm_handler.extract_action('PRESS("Enter")') == 'PRESS("Enter")'
    assert agent.llm_handler.extract_action("FINISH()") == "FINISH()"

    # Test with natural language prefix (the bug case)
    assert (
        agent.llm_handler.extract_action("The next step is to click the button. CLICK(42)")
        == "CLICK(42)"
    )
    assert (
        agent.llm_handler.extract_action(
            'The next step is to type "Sentience AI agent SDK" into the search field. TYPE(15, "Sentience AI agent SDK")'
        )
        == 'TYPE(15, "Sentience AI agent SDK")'
    )

    # Test with markdown code blocks
    assert agent.llm_handler.extract_action("```\nCLICK(42)\n```") == "CLICK(42)"
    assert (
        agent.llm_handler.extract_action('```python\nTYPE(15, "test")\n```') == 'TYPE(15, "test")'
    )

    # Test with explanation after action
    assert agent.llm_handler.extract_action("CLICK(42) to submit the form") == "CLICK(42)"