@@ -883,6 +883,173 @@ def send(self, msg: str) -> str:
883883 first_msg = next (iter (history .values ()))
884884 assert "CONTEXT SUMMARY" in first_msg ["content" ]
885885
def test_compaction_does_not_split_tool_use_tool_result_pairs(self):
    """Compaction must not split tool_use/tool_result message pairs.

    If the cut point falls between an assistant message with tool_use
    blocks and the corresponding tool_result message, the Anthropic API
    rejects the conversation. This test constructs a history where the
    naive positional split would do exactly that and asserts that no
    tool_result survives compaction without its tool_use before it.
    """
    # Layout (12 messages, insertion order is conversation order):
    #   msg0..msg6   plain alternating user/assistant text
    #   msg7         assistant message carrying a tool_use block
    #   msg8         tool_result answering msg7
    #   msg9..msg11  plain padding
    #
    # The handler is expected to keep the last
    # max(max_history_len // 2, 4) messages; with max_history_len=8 that is
    # 4, so a purely positional cut yields
    #   old    = msg0..msg7   (ends with the tool_use)
    #   recent = msg8..msg11  (starts with the tool_result)
    # i.e. the pair straddles the cut. keep_recent is never below 4, so the
    # pair has to sit at indices 7/8 of a 12-item history to hit this case.
    history: OrderedDict[str, Any] = OrderedDict()

    for i in range(7):
        history[f"msg{i}"] = {
            "id": f"msg{i}",
            "role": "user" if i % 2 == 0 else "assistant",
            "content": f"Message {i}",
        }

    # Last item of the naive "old" slice.
    history["msg7"] = {
        "id": "msg7",
        "role": "assistant",
        "content": [
            {
                "type": "tool_use",
                "id": "tool_call_xyz",
                "name": "check_tool",
                "input": {"payload": "test"},
            }
        ],
    }

    # First item of the naive "recent" slice.
    history["msg8"] = {
        "id": "msg8",
        "role": "tool",
        "tool_call_id": "tool_call_xyz",
        "content": "tool result here",
    }

    # Padding so that the recent window holds exactly 4 items.
    history["msg9"] = {
        "id": "msg9",
        "role": "assistant",
        "content": "Response after tool",
    }
    history["msg10"] = {
        "id": "msg10",
        "role": "user",
        "content": "Follow up question",
    }
    history["msg11"] = {
        "id": "msg11",
        "role": "assistant",
        "content": "Final answer",
    }

    compaction = CompactionHandler(max_history_len=8)
    mock = MockCompletionHandler(
        [make_text_response("Summary of prior conversation.")]
    )
    provider = LiteLLMProvider()
    with handler(provider), handler(mock):
        stored = get_agent_history("ToolPairBot")
        stored.update(history)
        compaction._compact("ToolPairBot", stored)

    result = provider._histories["ToolPairBot"]
    result_items = list(result.values())

    # Walk the compacted history in order: every tool_result must be
    # preceded by an assistant message holding the matching tool_use block.
    # A single ordered pass (rather than collecting all ids up front) also
    # catches a tool_use that only shows up after its tool_result.
    seen_tool_use_ids: set[str] = set()
    for msg in result_items:
        content = msg.get("content", "")
        if isinstance(content, list):
            seen_tool_use_ids.update(
                block["id"]
                for block in content
                if isinstance(block, dict) and block.get("type") == "tool_use"
            )

        if msg.get("role") == "tool":
            tc_id = msg.get("tool_call_id", "")
            assert tc_id in seen_tool_use_ids, (
                f"Orphaned tool_result with tool_call_id={tc_id!r} after "
                f"compaction — the matching tool_use was discarded. "
                f"Remaining messages: {[m.get('id') for m in result_items]}"
            )
1052+
8861053 def test_compaction_on_plain_agent_preserves_functionality (self ):
8871054 """After compaction, the plain Agent still works for subsequent calls."""
8881055
0 commit comments