Skip to content

Commit f422bdc

Browse files
committed
Add ability to compact threads
1 parent 18ec7c2 commit f422bdc

9 files changed

Lines changed: 478 additions & 9 deletions

File tree

llms/db.py

Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import json
22
import os
3+
import re
34
import sqlite3
45
import threading
56
from queue import Empty, Queue
@@ -116,6 +117,92 @@ def order_by(all_columns, sort):
116117
return f"ORDER BY {', '.join(cols)} " if len(cols) > 0 else ""
117118

118119

120+
def count_tokens_approx(messages: list[dict]) -> int:
    """
    Approximate token count for chat completion messages without external libraries.

    Handles various message formats:
    - Simple string content: {"role": "user", "content": "hello"}
    - Content arrays: {"role": "user", "content": [{"type": "text", "text": "hello"}]}
    - Tool calls, thinking/reasoning fields, etc.

    Returns an integer token estimate (message overhead + reply priming included).
    """

    def count_text_tokens(text: str) -> int:
        # Heuristic tokenizer: split into digit runs, letter runs, punctuation,
        # underscore runs, and whitespace, then cost each chunk.
        if not text:
            return 0

        tokens = 0
        # [^\W\d_]+ matches letters in ANY script (not just a-z), so non-English
        # text is counted instead of being silently dropped; _+ keeps underscores
        # (common in identifiers/code) from being dropped as well.
        chunks = re.findall(r"\d+|[^\W\d_]+|[^\s\w]|_+|\s+", text)
        for chunk in chunks:
            if not chunk.strip():
                # Whitespace is mostly absorbed into adjacent tokens.
                tokens += len(chunk) // 4
            elif chunk.isdigit():
                # Digit runs group roughly three digits per token.
                tokens += (len(chunk) + 2) // 3
            elif chunk.isalpha():
                # Short words are one token; longer words ~4 chars per token.
                tokens += 1 if len(chunk) <= 4 else (len(chunk) + 3) // 4
            else:
                # Punctuation/symbols/underscores: one token per character (worst case).
                tokens += len(chunk)
        return tokens

    def extract_text_content(value) -> str:
        """Recursively extract text from various content structures."""
        if value is None:
            return ""
        if isinstance(value, str):
            return value
        if isinstance(value, list):
            texts = []
            for item in value:
                if isinstance(item, dict):
                    # Handle content blocks: {"type": "text", "text": "..."}
                    if item.get("type") == "text" and "text" in item:
                        texts.append(item["text"])
                    # Handle tool use, tool results, etc.
                    elif "content" in item:
                        texts.append(extract_text_content(item["content"]))
                    elif "text" in item:
                        texts.append(item["text"])
                elif isinstance(item, str):
                    texts.append(item)
            return " ".join(texts)
        if isinstance(value, dict):
            if "text" in value:
                return value["text"]
            if "content" in value:
                return extract_text_content(value["content"])
        return ""

    total = 0
    for message in messages:
        # Per-message structural overhead (role/content framing).
        total += 4

        role = message.get("role", "")
        total += count_text_tokens(role)

        content = message.get("content")
        text = extract_text_content(content)
        total += count_text_tokens(text)

        # Handle thinking/reasoning content
        for key in ["thinking", "reasoning", "reasoning_content"]:
            if key in message:
                text = extract_text_content(message[key])
                total += count_text_tokens(text)

        # Handle tool calls if present
        if "tool_calls" in message:
            for tool_call in message.get("tool_calls") or []:
                if isinstance(tool_call, dict):
                    fn = tool_call.get("function", {})
                    total += count_text_tokens(fn.get("name", ""))
                    total += count_text_tokens(fn.get("arguments", ""))

    # Reply priming: every completion is primed with an assistant header.
    total += 3

    return total
204+
205+
119206
class DbManager:
120207
def __init__(self, ctx, db_path, clone=None):
121208
if db_path is None:

llms/extensions/app/__init__.py

Lines changed: 110 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,15 @@
11
import asyncio
22
import json
33
import os
4+
import re
45
import time
56
from datetime import datetime
67
from typing import Any
78

89
from aiohttp import web
910

11+
from llms.db import count_tokens_approx
12+
1013
from .db import AppDB
1114

1215
g_db = None
@@ -52,6 +55,7 @@ def get_db():
5255
"metadata",
5356
"error",
5457
"ref",
58+
"contextTokens",
5559
]
5660

5761
def thread_dto(row):
@@ -321,6 +325,112 @@ async def daily_requests_summary(request):
321325

322326
ctx.add_get("requests/summary/{day}", daily_requests_summary)
323327

328+
async def sync_thread(request):
    """Backfill contextTokens for threads that are missing it.

    The "take" query parameter caps how many threads are processed in one
    call (default 200, hard max 1000). Responds with {"updated": <count>}.
    """
    user = ctx.get_username(request)
    batch_size = min(int(request.query.get("take", "200")), 1000)

    # Only threads whose contextTokens column is still NULL.
    pending = g_db.query_threads({"null": "contextTokens", "take": batch_size}, user=user)
    count = 0
    for row in pending:
        tokens = count_tokens_approx(json.loads(row["messages"]))
        await g_db.update_thread_async(row["id"], {"contextTokens": tokens}, user=user)
        count += 1

    return web.json_response({"updated": count})

ctx.add_get("threads/sync", sync_thread)
344+
345+
async def compact_thread(request):
    """Compact a thread's conversation via an LLM summarization pass.

    Loads the thread, renders the "compact" prompt template from llms.json
    defaults with the thread's messages, asks the model for a condensed
    message array (target ~30% of the original token count), and stores the
    result on the thread produced by the completion call, linking back to
    the source thread via parentId. Responds with {"id": <threadId>}.
    """
    id = request.match_info["id"]
    user = ctx.get_username(request)
    thread = g_db.get_thread(id, user=user)
    if not thread:
        raise Exception("Thread not found")

    messages_json = thread["messages"]
    thread_messages = json.loads(messages_json)
    message_count = len(thread_messages)
    token_count = count_tokens_approx(thread_messages)
    target_tokens = int(token_count * 0.3)  # 30% of original

    compact_template = ctx.config["defaults"]["compact"] if "compact" in ctx.config.get("defaults", {}) else None
    if not compact_template:
        raise Exception("'compact' template not found in llms.json defaults")

    # Work on copies so the shared config template is never mutated.
    compact_template = compact_template.copy()
    compact_template_messages = compact_template["messages"].copy()
    user_message = compact_template_messages[-1].copy()
    user_content = user_message.get("content", "")
    # The template's final message must be a non-empty string prompt.
    # Fixed: was `and`, which let non-string content (e.g. a content-block
    # list) slip past this guard and fail confusingly at the `in` check below.
    if not user_content or not isinstance(user_content, str):
        raise Exception("'compact' template has no user message")
    if "{messages_json}" not in user_content:
        raise Exception("'compact' template has no {messages_json} placeholder")
    user_content = user_content.replace("{message_count}", str(message_count), 1)
    user_content = user_content.replace("{token_count}", str(token_count), 1)
    user_content = user_content.replace("{target_tokens}", str(target_tokens), 1)
    user_content = user_content.replace("{messages_json}", messages_json, 1)
    user_message["content"] = user_content
    compact_template_messages[-1] = user_message
    compact_template["messages"] = compact_template_messages

    ctx.dbg(f"compact_thread: {id} / {message_count} / {token_count} / {target_tokens}\n{user_content}\n")
    context = {"chat": compact_template, "tools": "none", "user": user}
    # NOTE(review): chat_completion is expected to create a thread and set
    # context["threadId"] as a side effect — confirm against ctx.chat_completion.
    response = await ctx.chat_completion(compact_template, context=context)

    answer = response.get("choices", [{}])[0].get("message", {}).get("content", "")
    if not answer:
        raise Exception("No answer in compact response")

    ctx.dbg(answer)
    # Accept either {"messages": [...]} or a bare message array.
    compact_messages_response = ctx.parse_json_response(answer)
    if "messages" in compact_messages_response:
        compact_messages = compact_messages_response["messages"]
    elif (
        isinstance(compact_messages_response, list)
        and len(compact_messages_response) > 0
        and compact_messages_response[0].get("role")
    ):
        compact_messages = compact_messages_response
    else:
        raise Exception("Invalid compact messages response")

    threadId = context.get("threadId")
    if not threadId:
        raise Exception("Thread not found")
    compact_tokens = count_tokens_approx(compact_messages)

    # Carry over the original thread's fields; the compacted messages replace
    # the originals and parentId links back to the source thread.
    update_thread = {
        "user": user,
        "title": thread.get("title"),
        "systemPrompt": thread.get("systemPrompt"),
        "model": thread.get("model"),
        "modelInfo": thread.get("modelInfo"),
        "modalities": thread.get("modalities"),
        "messages": compact_messages,
        "toolHistory": thread.get("toolHistory"),
        "args": thread.get("args"),
        "tools": thread.get("tools"),
        "provider": thread.get("provider"),
        "providerModel": thread.get("providerModel"),
        "completedAt": datetime.now(),
        "metadata": thread.get("metadata"),
        "ref": thread.get("ref"),
        "providerResponse": response,
        "contextTokens": compact_tokens,
        "parentId": thread.get("id"),
    }
    await g_db.update_thread_async(threadId, update_thread, user=user)

    return web.json_response(
        {
            "id": threadId,
        }
    )

ctx.add_post("threads/{id}/compact", compact_thread)
433+
324434
async def chat_request(openai_request, context):
325435
chat = openai_request
326436
user = context.get("user", None)

llms/extensions/app/db.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
from datetime import datetime, timedelta
55
from typing import Any, Dict
66

7-
from llms.db import DbManager, order_by, select_columns, to_dto, valid_columns
7+
from llms.db import DbManager, count_tokens_approx, order_by, select_columns, to_dto, valid_columns
88

99

1010
def with_user(data, user):
@@ -58,6 +58,8 @@ def __init__(self, ctx, db_path):
5858
"error": "TEXT",
5959
"ref": "TEXT",
6060
"providerResponse": "JSON",
61+
"contextTokens": "INTEGER",
62+
"parentId": "INTEGER",
6163
},
6264
"request": {
6365
"id": "INTEGER",
@@ -351,6 +353,7 @@ def prepare_thread(self, thread, id=None, user=None):
351353
self.ctx.cache_message_inline_data(m)
352354
if "timestamp" not in m:
353355
m["timestamp"] = initial_timestamp + idx
356+
thread["contextTokens"] = count_tokens_approx(thread["messages"])
354357
return with_user(thread, user=user)
355358

356359
def create_thread(self, thread: Dict[str, Any], user=None):

llms/llms.json

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,60 @@
121121
],
122122
"max_completion_tokens": 16,
123123
"stream": false
124+
},
125+
"compact": {
126+
"model": "Gemini 2.5 Flash Lite",
127+
"messages": [
128+
{
129+
"role": "system",
130+
"content": "You are a context compaction assistant. Your task is to condense chat conversation histories while preserving all information necessary for the AI to continue the conversation coherently.\n\n## Compaction Rules\n\n1. **Preserve**: User intent, decisions made, key facts established, code/data shared, current task state, any constraints or preferences expressed\n2. **Remove**: Pleasantries, redundant confirmations, superseded information, verbose explanations that led to simple conclusions, failed attempts that were abandoned\n3. **Summarize**: Multi-turn explorations into their conclusions, lengthy discussions into key points, iterative refinements into final versions\n4. **Keep verbatim**: Code snippets still relevant, specific names/values/URLs, exact user requirements, any content the user explicitly asked to remember\n\n## Output Format\n\nReturn a compacted message array in the same OpenAI chat format:\n- Merge assistant knowledge into a single \"context summary\" system or assistant message where appropriate\n- Preserve the most recent 2-3 exchanges verbatim to maintain conversational flow\n- Use concise prose, not bullet points, for summaries\n\n## Critical\n\nNever lose information that would cause the AI to:\n- Repeat questions already answered\n- Forget constraints or preferences stated\n- Lose track of the current task or goal\n- Miss context needed to understand recent messages"
131+
},
132+
{
133+
"role": "user",
134+
"content": "Compact the following conversation while preserving all context needed to continue it coherently. The conversation has {message_count} messages totaling approximately {token_count} tokens. Target approximately {target_tokens} tokens.\n\n<conversation>\n{messages_json}\n</conversation>\n\nReturn your response as a JSON object with a single \"messages\" key containing the compacted array. Do not include any text before or after the JSON. Do not wrap in markdown code fences.\n\nExample output structure:\n{\"messages\":[{\"role\":\"system\",\"content\":\"...\"},{\"role\":\"user\",\"content\":\"...\"}]}"
135+
}
136+
],
137+
"response_format": {
138+
"type": "json_schema",
139+
"json_schema": {
140+
"name": "compacted_conversation",
141+
"strict": true,
142+
"schema": {
143+
"type": "object",
144+
"properties": {
145+
"messages": {
146+
"type": "array",
147+
"items": {
148+
"type": "object",
149+
"properties": {
150+
"role": {
151+
"type": "string",
152+
"enum": [
153+
"system",
154+
"user",
155+
"assistant"
156+
]
157+
},
158+
"content": {
159+
"type": "string"
160+
}
161+
},
162+
"required": [
163+
"role",
164+
"content"
165+
],
166+
"additionalProperties": false
167+
}
168+
}
169+
},
170+
"required": [
171+
"messages"
172+
],
173+
"additionalProperties": false
174+
}
175+
}
176+
},
177+
"stream": false
124178
}
125179
},
126180
"limits": {

llms/main.py

Lines changed: 31 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1763,6 +1763,7 @@ def convert_tool_args(function_name, function_args):
17631763

17641764
return function_args
17651765

1766+
17661767
def get_tool_property(function_name, prop_name):
17671768
tool_def = g_app.get_tool_definition(function_name)
17681769
if not tool_def:
@@ -1773,6 +1774,7 @@ def get_tool_property(function_name, prop_name):
17731774
return properties.get(prop_name)
17741775
return None
17751776

1777+
17761778
async def g_exec_tool(function_name, function_args):
17771779
_log(f"g_exec_tool: {function_name}")
17781780
if function_name in g_app.tools:
@@ -2370,6 +2372,30 @@ def disable_provider(provider):
23702372
init_llms(g_config, g_providers)
23712373

23722374

2375+
def parse_json_response(text):
    """Parse a JSON payload out of an LLM response string.

    Tries, in order: the raw text, the text with surrounding markdown code
    fences stripped, and finally the first {...} or [...] span embedded in
    the text. Raises ValueError when no strategy yields valid JSON.
    """

    def defenced(s):
        # Remove a leading ``` / ```json fence and a trailing ``` fence.
        s = re.sub(r"^```(?:json)?\s*", "", s.strip())
        return re.sub(r"\s*```$", "", s)

    for candidate in (text, defenced(text)):
        try:
            return json.loads(candidate)
        except json.JSONDecodeError:
            continue

    # Last resort: pull out an embedded JSON object or array.
    embedded = re.search(r"(\{[\s\S]*\}|\[[\s\S]*\])", text)
    if embedded:
        return json.loads(embedded.group(1))

    raise ValueError("Could not parse JSON from response")
2397+
2398+
23732399
def resolve_root():
23742400
# Try to find the resource root directory
23752401
# When installed as a package, static files may be in different locations
@@ -3031,8 +3057,8 @@ async def on_chat_error(self, e: Exception, context: Dict[str, Any]):
30313057
task = filter_func(e, context)
30323058
if inspect.iscoroutine(task):
30333059
await task
3034-
except Exception as e:
3035-
_err("chat error filter failed", e)
3060+
except Exception as ex:
3061+
_err("chat error filter failed", ex)
30363062

30373063
async def on_chat_tool(self, chat: Dict[str, Any], context: Dict[str, Any]):
30383064
m_len = len(chat.get("messages", []))
@@ -3455,6 +3481,9 @@ def create_chat_with_tools(self, chat: Dict[str, Any], use_tools: str = "all") -
34553481
def chat_to_aspect_ratio(self, chat: Dict[str, Any]) -> str:
34563482
return chat_to_aspect_ratio(chat)
34573483

3484+
def parse_json_response(self, text: str) -> Dict[str, Any]:
3485+
return parse_json_response(text)
3486+
34583487

34593488
def get_extensions_path():
34603489
return os.getenv("LLMS_EXTENSIONS_DIR", home_llms_path("extensions"))

0 commit comments

Comments
 (0)