Skip to content

Commit e253dcd

Browse files
committed
fixed tool calling
1 parent b74bb0d commit e253dcd

3 files changed

Lines changed: 241 additions & 2 deletions

File tree

effectful/handlers/llm/persistence.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -286,8 +286,15 @@ def _compact(self, agent_id: str, history: OrderedDict[str, Any]) -> None:
286286
if len(items) <= keep_recent:
287287
return
288288

289-
old_items = items[:-keep_recent]
290-
recent_items = items[-keep_recent:]
289+
split = len(items) - keep_recent
290+
# Never split between a tool_use and its tool_result(s).
291+
while split > 0 and items[split][1].get("role") == "tool":
292+
split -= 1
293+
if split <= 0:
294+
return
295+
296+
old_items = items[:split]
297+
recent_items = items[split:]
291298

292299
old_text_parts: list[str] = []
293300
for _, msg in old_items:

tests/test_handlers_llm_provider.py

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2414,3 +2414,68 @@ def ask(self, q: str) -> str:
24142414
data = json.loads((tmp_path / "compact-bot.json").read_text())
24152415
first_msg = data["history"][0]
24162416
assert "CONTEXT SUMMARY" in first_msg["content"]
2417+
2418+
2419+
class _ToolAgent(Agent):
    """You are a concise assistant. Answer in at most 10 words."""

    # NOTE(review): the class docstring above appears to serve as the agent's
    # system prompt (runtime behavior, not documentation) — confirm against
    # Agent before editing it.

    @Tool.define
    def lookup(self, query: str) -> str:
        """Look up factual information about a topic."""
        # Deterministic canned result so tests don't depend on external data;
        # the f-string below is part of the tool's observable output.
        return f"Result: The answer to '{query}' is 42."

    @Template.define
    def ask(self, question: str) -> str:
        """Answer: {question}. You MUST use the lookup tool first."""
        # The docstring is the prompt template ({question} is substituted by
        # Template); the body is never executed when the template is handled.
        raise NotHandled
2431+
2432+
2433+
@pytest.mark.parametrize(
    "model",
    [
        pytest.param("gpt-4o-mini", marks=requires_openai),
        pytest.param("claude-haiku-4-5-20251001", marks=requires_anthropic),
    ],
)
def test_compaction_with_tool_calls_does_not_break_api(model):
    """After compaction of history containing tool pairs, subsequent calls succeed.

    Each tool-using call generates ~4 messages (user, assistant/tool_use,
    tool/result, assistant/final). With max_history_len=6, compaction fires
    after the 2nd call. The 3rd call must succeed — if compaction orphaned
    a tool_result the API would reject the conversation.
    """
    # Local import keeps the module importable when persistence extras are absent.
    from effectful.handlers.llm.persistence import CompactionHandler

    bot = _ToolAgent()
    provider = LiteLLMProvider(model=model, max_tokens=60)

    with (
        handler(provider),
        handler(LimitLLMCallsHandler(max_calls=15)),
        handler(CompactionHandler(max_history_len=6)),
    ):
        bot.ask("What is the meaning of life?")
        bot.ask("What is 6 times 7?")
        # Compaction should have fired. This call must not fail.
        result = bot.ask("Summarize what you told me.")

    assert isinstance(result, str)
    assert len(result) > 0

    # Verify no orphaned tool_result messages in final history.
    # NOTE(review): reaches into the provider's private `_histories` store —
    # acceptable in a test, but verify the attribute name against LiteLLMProvider.
    history = provider._histories.get(bot.__agent_id__, {})

    # First pass: collect every tool_use id announced by assistant messages.
    tool_use_ids: set[str] = set()
    for msg in history.values():
        content = msg.get("content", "")
        if isinstance(content, list):
            for block in content:
                if isinstance(block, dict) and block.get("type") == "tool_use":
                    tool_use_ids.add(block["id"])

    # Second pass: every tool message must reference a surviving tool_use.
    for msg in history.values():
        if msg.get("role") == "tool":
            tc_id = msg.get("tool_call_id", "")
            assert tc_id in tool_use_ids, (
                f"Orphaned tool_result with tool_call_id={tc_id!r}"
            )

tests/test_persistent_agent.py

Lines changed: 167 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -883,6 +883,173 @@ def send(self, msg: str) -> str:
883883
first_msg = next(iter(history.values()))
884884
assert "CONTEXT SUMMARY" in first_msg["content"]
885885

886+
def test_compaction_does_not_split_tool_use_tool_result_pairs(self):
    """Compaction must not split tool_use/tool_result message pairs.

    If the cut point falls between an assistant message carrying tool_use
    blocks and the corresponding tool_result message, the Anthropic API
    rejects the conversation. Construct a 12-message history where the
    naive positional split — ``old = items[:len - keep_recent]`` — lands
    exactly between the tool_use (index 7) and its tool_result (index 8),
    then assert that compaction leaves no orphaned tool_result behind.
    """
    history: OrderedDict[str, Any] = OrderedDict()

    # msg0..msg6: plain alternating user/assistant filler.
    for i in range(7):
        history[f"msg{i}"] = {
            "id": f"msg{i}",
            "role": "user" if i % 2 == 0 else "assistant",
            "content": f"Message {i}",
        }

    # msg7: assistant with a tool_use block — the LAST item of the old slice
    # under the naive split, so it would be summarized away.
    history["msg7"] = {
        "id": "msg7",
        "role": "assistant",
        "content": [
            {
                "type": "tool_use",
                "id": "tool_call_xyz",
                "name": "check_tool",
                "input": {"payload": "test"},
            }
        ],
    }

    # msg8: the matching tool_result — the FIRST item of the recent slice,
    # which would survive and reference a discarded tool_use.
    history["msg8"] = {
        "id": "msg8",
        "role": "tool",
        "tool_call_id": "tool_call_xyz",
        "content": "tool result here",
    }

    # msg9..msg11: padding so the recent window holds exactly 4 items.
    history["msg9"] = {
        "id": "msg9",
        "role": "assistant",
        "content": "Response after tool",
    }
    history["msg10"] = {
        "id": "msg10",
        "role": "user",
        "content": "Follow up question",
    }
    history["msg11"] = {
        "id": "msg11",
        "role": "assistant",
        "content": "Final answer",
    }

    # 12 items total; max_history_len=8 -> keep_recent = max(8 // 2, 4) = 4.
    # Naive split: old = items[:8] (msg0..msg7, including the tool_use),
    # recent = items[8:] (msg8..msg11, including the orphaned tool_result).
    compaction = CompactionHandler(max_history_len=8)
    mock = MockCompletionHandler(
        [make_text_response("Summary of prior conversation.")]
    )
    provider = LiteLLMProvider()
    with handler(provider), handler(mock):
        stored = get_agent_history("ToolPairBot")
        stored.update(history)
        compaction._compact("ToolPairBot", stored)

    result = provider._histories["ToolPairBot"]
    result_items = list(result.values())

    # After compaction, there must be no orphaned tool_result messages:
    # every tool_result must still be preceded by an assistant message
    # carrying a matching tool_use block.
    tool_use_ids: set[str] = set()
    for msg in result_items:
        content = msg.get("content", "")
        if isinstance(content, list):
            for block in content:
                if isinstance(block, dict) and block.get("type") == "tool_use":
                    tool_use_ids.add(block["id"])

    for msg in result_items:
        if msg.get("role") == "tool":
            tc_id = msg.get("tool_call_id", "")
            assert tc_id in tool_use_ids, (
                f"Orphaned tool_result with tool_call_id={tc_id!r} after "
                f"compaction — the matching tool_use was discarded. "
                f"Remaining messages: {[m.get('id') for m in result_items]}"
            )
1052+
8861053
def test_compaction_on_plain_agent_preserves_functionality(self):
8871054
"""After compaction, the plain Agent still works for subsequent calls."""
8881055

0 commit comments

Comments
 (0)