From f6a247bc9358d4390b02b424fac39d458f48e76d Mon Sep 17 00:00:00 2001 From: Matteo Merola Date: Thu, 25 Jun 2026 10:09:33 +0200 Subject: [PATCH 1/7] feat(llm): add thinking-level control for reasoning models Add a 'thinking level' (low/medium/high) selector to the LLM tab, shown only when the selected model supports reasoning. The value maps to output_config.effort + adaptive thinking for Anthropic, and reasoning_effort for OpenAI-compatible providers. Off by default; never sent for models that don't support thinking. Closes #9 --- api/admin.py | 2 ++ api/templates/base.html | 11 +++++++- api/templates/partials/llm.html | 17 ++++++++++-- core/config.py | 1 + core/llm.py | 30 ++++++++++++++++++--- tests/test_llm.py | 46 +++++++++++++++++++++++++++++++++ 6 files changed, 101 insertions(+), 6 deletions(-) create mode 100644 tests/test_llm.py diff --git a/api/admin.py b/api/admin.py index e69d43e..27ce7dc 100644 --- a/api/admin.py +++ b/api/admin.py @@ -792,6 +792,7 @@ async def partial_llm() -> HTMLResponse: deepseek_api_key = await config_store.get("agent.deepseek_api_key") or "" deepseek_base_url = await config_store.get("agent.deepseek_base_url") or "" model = await config_store.get("agent.model") or "claude-4-6-sonnet" + thinking_level = await config_store.get("agent.thinking_level") or "" extraction_provider = await config_store.get("memory.extraction_provider") or "anthropic" extraction_model = await config_store.get("memory.extraction_model") or "claude-haiku-4-5" consolidation_provider = ( @@ -829,6 +830,7 @@ async def partial_llm() -> HTMLResponse: deepseek_api_key=deepseek_api_key, deepseek_base_url=deepseek_base_url, model=model, + thinking_level=thinking_level, extraction_provider=extraction_provider, extraction_model=extraction_model, consolidation_provider=consolidation_provider, diff --git a/api/templates/base.html b/api/templates/base.html index 587183c..e5cf454 100644 --- a/api/templates/base.html +++ b/api/templates/base.html @@ -42,7 +42,7 @@ } window.showToast = showToast; - function llmTab(provider, apiKey, model, openaiKey, openaiBaseUrl, googleKey, googleBaseUrl, grokKey, grokBaseUrl, deepseekKey, deepseekBaseUrl, extractionProvider, extractionModel, consolidationProvider, consolidationModel, gdEnabled, gdProvider, gdModel, trEnabled, trProvider, trModel, promptToolUsageOverride, promptHistoryOverride, defaultToolUsage, defaultHistoryHandling, promptCaptureEnabled, compactionProvider, compactionModel) { + function llmTab(provider, apiKey, model, openaiKey, openaiBaseUrl, googleKey, googleBaseUrl, grokKey, grokBaseUrl, deepseekKey, deepseekBaseUrl, extractionProvider, extractionModel, consolidationProvider, consolidationModel, gdEnabled, gdProvider, gdModel, trEnabled, trProvider, trModel, promptToolUsageOverride, promptHistoryOverride, defaultToolUsage, defaultHistoryHandling, promptCaptureEnabled, compactionProvider, compactionModel, thinkingLevel) { const currentProvider = provider || 'anthropic'; return { providerOptions: [ @@ -55,6 +55,7 @@ provider: currentProvider, apiKey: apiKey || '', model: model || '', + thinkingLevel: thinkingLevel || '', openaiKey: openaiKey || '', openaiBaseUrl: openaiBaseUrl || '', googleKey: googleKey || '', @@ -161,6 +162,14 @@ } return this.models[prov] || []; }, + // ponytail: substring heuristic for reasoning-capable models — extend the + // list as model ids change; the thinking-level control shows only when true. + modelSupportsThinking(modelId) { + const id = (modelId || '').toLowerCase(); + if (!id) return false; + return ['opus', 'sonnet', 'thinking', 'pro', 'deep-think', 'reasoner', 'grok-4'] + .some(pat => id.includes(pat)); + }, fetchModels(service) { const apiKey = this.keyFor(service); if (!apiKey) { diff --git a/api/templates/partials/llm.html b/api/templates/partials/llm.html index 27d33fa..56b98df 100644 --- a/api/templates/partials/llm.html +++ b/api/templates/partials/llm.html @@ -27,7 +27,8 @@ {{ default_history_handling|default('', true)|tojson|forceescape }}, {{ prompt_capture_enabled|default(false, true)|tojson|forceescape }}, {{ compaction_provider|default('anthropic', true)|tojson|forceescape }}, - {{ compaction_model|default('claude-haiku-4-5', true)|tojson|forceescape }} + {{ compaction_model|default('claude-haiku-4-5', true)|tojson|forceescape }}, + {{ thinking_level|default('', true)|tojson|forceescape }} )">

System Prompt Controls

@@ -175,6 +176,17 @@

Active Inference Provider

Type any model id, or pick from the list. Use “Fetch models” in the provider card below to load the live list from the API.

+ +
+ + +

Reasoning effort for models that support thinking. Higher = more reasoning, more tokens.

+
@@ -197,7 +209,8 @@

Active Inference Provider

'agent.grok_base_url': grokBaseUrl, 'agent.deepseek_api_key': deepseekKey, 'agent.deepseek_base_url': deepseekBaseUrl, - 'agent.model': model + 'agent.model': model, + 'agent.thinking_level': modelSupportsThinking(model) ? thinkingLevel : '' }}) }) .then(r => { resultOk = r.ok; return r.json(); }) diff --git a/core/config.py b/core/config.py index 1fc1a3b..efa7742 100644 --- a/core/config.py +++ b/core/config.py @@ -48,6 +48,7 @@ class AgentConfig(BaseModel): deepseek_api_key: str = "" deepseek_base_url: str = "" model: str = "claude-4-6-sonnet" + thinking_level: str = "" # "" (off) | "low" | "medium" | "high" — only for reasoning models timezone: str = "Europe/Zurich" skills_dir: str = "skills/" skills_db_path: str = "data/skills.db" diff --git a/core/llm.py b/core/llm.py index 43ae6d4..bf3a054 100644 --- a/core/llm.py +++ b/core/llm.py @@ -101,8 +101,16 @@ def _normalize_model(provider: str, model: str) -> str: class LLMClient: - def __init__(self, provider: str, api_key: str, base_url: str | None = None): + def __init__( + self, + provider: str, + api_key: str, + base_url: str | None = None, + thinking_level: str = "", + ): self.provider = _normalize_provider(provider) + # "" (off) | "low" | "medium" | "high" — applied only to the main generate() call + self.thinking_level = (thinking_level or "").strip().lower() self._client: Any if self.provider == "anthropic": self._client = AsyncAnthropic(api_key=api_key) @@ -122,33 +130,38 @@ def __init__(self, provider: str, api_key: str, base_url: str | None = None): @classmethod def from_agent_config(cls, config) -> LLMClient: provider = _normalize_provider(getattr(config, "llm_provider", "anthropic")) + thinking = getattr(config, "thinking_level", "") if provider == "anthropic": - return cls(provider, getattr(config, "anthropic_api_key", "")) + return cls(provider, getattr(config, "anthropic_api_key", ""), thinking_level=thinking) if provider == "openai": return cls( provider, getattr(config, "openai_api_key", ""), getattr(config, "openai_base_url", ""), + thinking_level=thinking, ) if provider == "google": return cls( provider, getattr(config, "google_api_key", ""), getattr(config, "google_base_url", ""), + thinking_level=thinking, ) if provider == "grok": return cls( provider, getattr(config, "grok_api_key", ""), getattr(config, "grok_base_url", ""), + thinking_level=thinking, ) if provider == "deepseek": return cls( provider, getattr(config, "deepseek_api_key", ""), getattr(config, "deepseek_base_url", ""), + thinking_level=thinking, ) - return cls("anthropic", getattr(config, "anthropic_api_key", "")) + return cls("anthropic", getattr(config, "anthropic_api_key", ""), thinking_level=thinking) async def generate( self, @@ -174,12 +187,19 @@ async def generate( "cache_control": {"type": "ephemeral"}, } ] + extra: dict[str, Any] = {} + if self.thinking_level in ("low", "medium", "high"): + # Adaptive thinking at the requested effort. Only sent when a level is + # set — the UI exposes the control for reasoning models only. + extra["thinking"] = {"type": "adaptive"} + extra["output_config"] = {"effort": self.thinking_level} response = await messages_client.create( model=resolved_model, max_tokens=max_tokens, system=cast(Any, system_param), messages=cast(Any, messages), tools=cast(Any, tools), + **extra, ) tool_calls = [] text_parts = [] @@ -205,11 +225,15 @@ async def generate( openai_tools = _openai_tools(tools) client_any = cast(Any, self._client) full_messages = [{"role": "system", "content": system}, *messages] + extra = {} + if self.thinking_level in ("low", "medium", "high"): + extra["reasoning_effort"] = self.thinking_level response = await client_any.chat.completions.create( model=resolved_model, max_tokens=max_tokens, messages=cast(Any, full_messages), tools=cast(Any, openai_tools), + **extra, ) message = response.choices[0].message tool_calls = [] diff --git a/tests/test_llm.py b/tests/test_llm.py new file mode 100644 index 0000000..06d4f5c --- /dev/null +++ b/tests/test_llm.py @@ -0,0 +1,46 @@ +"""Tests for thinking-level plumbing in LLMClient.""" + +from __future__ import annotations + +from unittest.mock import AsyncMock + +import pytest + +from core.config import AgentConfig +from core.llm import LLMClient + + +def test_thinking_level_normalized_and_defaulted() -> None: + assert LLMClient("anthropic", "x", thinking_level=" HIGH ").thinking_level == "high" + assert LLMClient("anthropic", "x").thinking_level == "" + + +def test_from_agent_config_carries_thinking_level() -> None: + cfg = AgentConfig(llm_provider="openai", openai_api_key="x", thinking_level="low") + assert LLMClient.from_agent_config(cfg).thinking_level == "low" + + +@pytest.mark.asyncio +async def test_anthropic_generate_sends_effort_when_set() -> None: + client = LLMClient("anthropic", "x", thinking_level="medium") + create = AsyncMock(return_value=type("R", (), {"content": [], "usage": None})()) + client._client = type("C", (), {"messages": type("M", (), {"create": create})()})() + + await client.generate(model="claude-4-6-opus", system="s", messages=[], tools=[]) + + kwargs = create.await_args.kwargs + assert kwargs["thinking"] == {"type": "adaptive"} + assert kwargs["output_config"] == {"effort": "medium"} + + +@pytest.mark.asyncio +async def test_anthropic_generate_omits_effort_when_off() -> None: + client = LLMClient("anthropic", "x") + create = AsyncMock(return_value=type("R", (), {"content": [], "usage": None})()) + client._client = type("C", (), {"messages": type("M", (), {"create": create})()})() + + await client.generate(model="claude-4-6-opus", system="s", messages=[], tools=[]) + + kwargs = create.await_args.kwargs + assert "thinking" not in kwargs + assert "output_config" not in kwargs From 2a5f6215138a646fa4dcac8cbb82112a7861a25f Mon Sep 17 00:00:00 2001 From: Matteo Merola Date: Thu, 25 Jun 2026 10:31:09 +0200 Subject: [PATCH 2/7] feat(llm): per-kind thinking levels for all inference paths MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add independent thinking-level config for memory extraction/consolidation, goal decomposition, task reflection, and compaction. generate_text() now honors the client's level via a shared _reasoning_kwargs() helper, and _background_llm() clones the main client (sharing the SDK connection) to override only the level — so background tasks get their own effort setting. --- core/agent.py | 30 +++++++++++++++++++----------- core/config.py | 5 +++++ core/llm.py | 27 ++++++++++++++++----------- core/scheduler.py | 5 ++++- tests/test_compaction.py | 4 ++-- tests/test_scheduler.py | 3 ++- 6 files changed, 48 insertions(+), 26 deletions(-) diff --git a/core/agent.py b/core/agent.py index 449c7ae..ced0ec7 100644 --- a/core/agent.py +++ b/core/agent.py @@ -3,6 +3,7 @@ from __future__ import annotations import asyncio +import copy import hashlib import json import logging @@ -410,7 +411,7 @@ async def _maybe_compact( session = await self.history.get_session(channel, user_id, chat_id) try: - llm = self._background_llm(cfg.provider) + llm = self._background_llm(cfg.provider, cfg.thinking_level) result = await compact_messages(llm, cfg.model, session, cfg.keep_recent_turns) except Exception: log.exception("Conversation compaction failed") @@ -1136,7 +1137,10 @@ async def _extract_memories(self, user_msg: str, agent_msg: str) -> None: main agent loop. """ try: - llm = self._memory_llm(self.config.memory.extraction_provider) + llm = self._memory_llm( + self.config.memory.extraction_provider, + self.config.memory.extraction_thinking_level, + ) stored = await self.memory.extract_memories( llm=llm, model=self.config.memory.extraction_model, @@ -1149,29 +1153,33 @@ async def _extract_memories(self, user_msg: str, agent_msg: str) -> None: except Exception: log.exception("Background memory extraction failed") - def _memory_llm(self, provider: str) -> LLMClient: + def _memory_llm(self, provider: str, thinking_level: str = "") -> LLMClient: """Return an LLM client for memory operations. If the requested provider matches the main inference provider the existing client is reused; otherwise a new one is created using the API key / base-URL already stored in the agent config. """ - return self._background_llm(provider) + return self._background_llm(provider, thinking_level) - def _background_llm(self, provider: str) -> LLMClient: + def _background_llm(self, provider: str, thinking_level: str = "") -> LLMClient: """Return an LLM client for background tasks (memory, reflection, etc.). - If the requested provider matches the main inference provider the - existing client is reused; otherwise a new one is created using the - API key / base-URL already stored in the agent config. + Background tasks carry their own thinking level, independent of the + main inference one. When the provider matches the main client we clone + it (sharing the underlying SDK connection) and override only the level; + otherwise a fresh client is built from the stored credentials. """ if provider == self.llm.provider: - return self.llm + clone = copy.copy(self.llm) + clone.thinking_level = (thinking_level or "").strip().lower() + return clone cfg = self.config.agent return LLMClient( provider=provider, api_key=getattr(cfg, f"{provider}_api_key", ""), base_url=getattr(cfg, f"{provider}_base_url", None), + thinking_level=thinking_level, ) def _build_embedder(self): @@ -1218,7 +1226,7 @@ async def _maybe_decompose(self, message: str) -> DecomposedGoal | None: Returns None if the message is simple or decomposition fails/is disabled. """ gd_cfg = self.config.goal_decomposition - llm = self._background_llm(gd_cfg.provider) + llm = self._background_llm(gd_cfg.provider, gd_cfg.thinking_level) try: is_complex = await classify_complexity(llm, gd_cfg.model, message) @@ -1246,7 +1254,7 @@ async def _reflect_on_task(self, user_msg: str, agent_msg: str, tool_log: list[d """ try: tr_cfg = self.config.task_reflection - llm = self._background_llm(tr_cfg.provider) + llm = self._background_llm(tr_cfg.provider, tr_cfg.thinking_level) stored = await self.reflections.reflect_on_task( llm=llm, model=tr_cfg.model, diff --git a/core/config.py b/core/config.py index efa7742..8d9b90a 100644 --- a/core/config.py +++ b/core/config.py @@ -159,6 +159,8 @@ class MemoryConfig(BaseModel): extraction_model: str = "claude-haiku-4-5" consolidation_provider: str = "anthropic" consolidation_model: str = "claude-haiku-4-5" + extraction_thinking_level: str = "" # "" (off) | "low" | "medium" | "high" + consolidation_thinking_level: str = "" # "" (off) | "low" | "medium" | "high" extraction_cooldown_seconds: int = 120 # minimum seconds between extractions embedding: EmbeddingConfig = EmbeddingConfig() @@ -178,12 +180,14 @@ class GoalDecompositionConfig(BaseModel): enabled: bool = True provider: str = "anthropic" model: str = "claude-haiku-4-5" + thinking_level: str = "" # "" (off) | "low" | "medium" | "high" class TaskReflectionConfig(BaseModel): enabled: bool = True provider: str = "anthropic" model: str = "claude-haiku-4-5" + thinking_level: str = "" # "" (off) | "low" | "medium" | "high" db_path: str = "data/reflections.db" max_reflections: int = 50 # max reflections to keep for prompt injection @@ -198,6 +202,7 @@ class CompactionConfig(BaseModel): enabled: bool = True provider: str = "anthropic" model: str = "claude-haiku-4-5" + thinking_level: str = "" # "" (off) | "low" | "medium" | "high" threshold_type: str = "percent" # "percent" (of context window) or "tokens" (absolute) threshold_percent: int = 80 # trigger at this % of the model's context window threshold_tokens: int = 150000 # absolute trigger when threshold_type == "tokens" diff --git a/core/llm.py b/core/llm.py index bf3a054..5b5318f 100644 --- a/core/llm.py +++ b/core/llm.py @@ -127,6 +127,18 @@ def __init__( } self._client = cast(Any, client_class)(**client_kwargs) # type: ignore[call-arg] + def _reasoning_kwargs(self) -> dict[str, Any]: + """Provider-specific request kwargs for the configured thinking level. + + Empty when no level is set, so non-reasoning calls are untouched. + """ + level = self.thinking_level + if level not in ("low", "medium", "high"): + return {} + if self.provider == "anthropic": + return {"thinking": {"type": "adaptive"}, "output_config": {"effort": level}} + return {"reasoning_effort": level} + @classmethod def from_agent_config(cls, config) -> LLMClient: provider = _normalize_provider(getattr(config, "llm_provider", "anthropic")) @@ -187,19 +199,13 @@ async def generate( "cache_control": {"type": "ephemeral"}, } ] - extra: dict[str, Any] = {} - if self.thinking_level in ("low", "medium", "high"): - # Adaptive thinking at the requested effort. Only sent when a level is - # set — the UI exposes the control for reasoning models only. - extra["thinking"] = {"type": "adaptive"} - extra["output_config"] = {"effort": self.thinking_level} response = await messages_client.create( model=resolved_model, max_tokens=max_tokens, system=cast(Any, system_param), messages=cast(Any, messages), tools=cast(Any, tools), - **extra, + **self._reasoning_kwargs(), ) tool_calls = [] text_parts = [] @@ -225,15 +231,12 @@ async def generate( openai_tools = _openai_tools(tools) client_any = cast(Any, self._client) full_messages = [{"role": "system", "content": system}, *messages] - extra = {} - if self.thinking_level in ("low", "medium", "high"): - extra["reasoning_effort"] = self.thinking_level response = await client_any.chat.completions.create( model=resolved_model, max_tokens=max_tokens, messages=cast(Any, full_messages), tools=cast(Any, openai_tools), - **extra, + **self._reasoning_kwargs(), ) message = response.choices[0].message tool_calls = [] @@ -291,6 +294,7 @@ async def generate_text(self, *, model: str, prompt: str, max_tokens: int = 1024 model=resolved_model, max_tokens=max_tokens, messages=cast(Any, [{"role": "user", "content": prompt}]), + **self._reasoning_kwargs(), ) for block in response.content: block_any = cast(Any, block) @@ -303,5 +307,6 @@ async def generate_text(self, *, model: str, prompt: str, max_tokens: int = 1024 model=resolved_model, max_tokens=max_tokens, messages=cast(Any, [{"role": "user", "content": prompt}]), + **self._reasoning_kwargs(), ) return (response.choices[0].message.content or "").strip() diff --git a/core/scheduler.py b/core/scheduler.py index 35503d5..6bb840e 100644 --- a/core/scheduler.py +++ b/core/scheduler.py @@ -127,7 +127,10 @@ async def run_memory_consolidation() -> None: log.info("Scheduler running memory consolidation") try: - llm = agent._memory_llm(agent.config.memory.consolidation_provider) + llm = agent._memory_llm( + agent.config.memory.consolidation_provider, + agent.config.memory.consolidation_thinking_level, + ) result = await agent.memory.consolidate_and_cleanup( llm=llm, model=agent.config.memory.consolidation_model, diff --git a/tests/test_compaction.py b/tests/test_compaction.py index 6107e4c..1d1cbc4 100644 --- a/tests/test_compaction.py +++ b/tests/test_compaction.py @@ -189,7 +189,7 @@ async def test_maybe_compact_replaces_session_and_notifies(agent, monkeypatch) - for m in _session_with_tool_pair(): await agent.history.append_session_message("telegram", "u", m, "") - monkeypatch.setattr(agent, "_background_llm", lambda provider: FakeLLM("S")) + monkeypatch.setattr(agent, "_background_llm", lambda provider, thinking_level="": FakeLLM("S")) response = SimpleNamespace(usage={"context_tokens": 999}) notice = await agent._maybe_compact("telegram", "u", "", response) @@ -209,7 +209,7 @@ async def test_maybe_compact_below_threshold_noop(agent, monkeypatch) -> None: agent.config.compaction.threshold_tokens = 100000 for m in _session_with_tool_pair(): await agent.history.append_session_message("telegram", "u", m, "") - monkeypatch.setattr(agent, "_background_llm", lambda provider: FakeLLM("S")) + monkeypatch.setattr(agent, "_background_llm", lambda provider, thinking_level="": FakeLLM("S")) response = SimpleNamespace(usage={"context_tokens": 50}) assert await agent._maybe_compact("telegram", "u", "", response) is None diff --git a/tests/test_scheduler.py b/tests/test_scheduler.py index f0e9e28..46a188f 100644 --- a/tests/test_scheduler.py +++ b/tests/test_scheduler.py @@ -62,9 +62,10 @@ async def test_run_memory_consolidation_calls_store(monkeypatch) -> None: memory=SimpleNamespace( consolidation_model="model", consolidation_provider="anthropic", + consolidation_thinking_level="", ), ), - _memory_llm=lambda self_provider: llm_sentinel, + _memory_llm=lambda self_provider, thinking_level="": llm_sentinel, ) set_agent_context(agent) From 9567e8bbac37c3a54e7bc6f878d736117c06fd65 Mon Sep 17 00:00:00 2001 From: Matteo Merola Date: Thu, 25 Jun 2026 10:32:33 +0200 Subject: [PATCH 3/7] feat(admin): thinking-level autodiscovery endpoint + per-kind context Add POST /setup/thinking-levels (Anthropic capability lookup via Models API) and pass per-kind thinking levels to the LLM tab template. --- api/admin.py | 58 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 58 insertions(+) diff --git a/api/admin.py b/api/admin.py index 27ce7dc..dee923c 100644 --- a/api/admin.py +++ b/api/admin.py @@ -801,6 +801,10 @@ async def partial_llm() -> HTMLResponse: consolidation_model = ( await config_store.get("memory.consolidation_model") or "claude-haiku-4-5" ) + extraction_thinking_level = await config_store.get("memory.extraction_thinking_level") or "" + consolidation_thinking_level = ( + await config_store.get("memory.consolidation_thinking_level") or "" + ) gd_enabled = await config_store.get("goal_decomposition.enabled") gd_enabled = gd_enabled if gd_enabled is not None else "true" gd_provider = await config_store.get("goal_decomposition.provider") or "anthropic" @@ -809,8 +813,11 @@ async def partial_llm() -> HTMLResponse: tr_enabled = tr_enabled if tr_enabled is not None else "true" tr_provider = await config_store.get("task_reflection.provider") or "anthropic" tr_model = await config_store.get("task_reflection.model") or "claude-haiku-4-5" + gd_thinking_level = await config_store.get("goal_decomposition.thinking_level") or "" + tr_thinking_level = await config_store.get("task_reflection.thinking_level") or "" compaction_provider = await config_store.get("compaction.provider") or "anthropic" compaction_model = await config_store.get("compaction.model") or "claude-haiku-4-5" + compaction_thinking_level = await config_store.get("compaction.thinking_level") or "" prompt_tool_usage_override = await config_store.get("prompt.tool_usage_override") or "" prompt_history_override = await config_store.get("prompt.history_handling_override") or "" prompt_capture_enabled = await config_store.get("admin.capture_prompts") @@ -833,16 +840,21 @@ async def partial_llm() -> HTMLResponse: thinking_level=thinking_level, extraction_provider=extraction_provider, extraction_model=extraction_model, + extraction_thinking_level=extraction_thinking_level, consolidation_provider=consolidation_provider, consolidation_model=consolidation_model, + consolidation_thinking_level=consolidation_thinking_level, gd_enabled=gd_enabled, gd_provider=gd_provider, gd_model=gd_model, + gd_thinking_level=gd_thinking_level, tr_enabled=tr_enabled, tr_provider=tr_provider, tr_model=tr_model, + tr_thinking_level=tr_thinking_level, compaction_provider=compaction_provider, compaction_model=compaction_model, + compaction_thinking_level=compaction_thinking_level, prompt_tool_usage_override=prompt_tool_usage_override, prompt_history_override=prompt_history_override, default_tool_usage=DEFAULT_TOOL_USAGE_BLOCK, @@ -2400,6 +2412,25 @@ async def list_models(request: Request) -> dict: return await _list_models_openai(api_key, base_url) return {"ok": False, "error": f"Unknown service: {service}"} + @app.post("/setup/thinking-levels") + async def thinking_levels(request: Request) -> dict: + """Autodiscover supported reasoning-effort levels for a model. + + Only Anthropic exposes this via the Models API; other providers must be + configured by typing the effort value (see the docs link in the UI). + """ + payload = await request.json() + service = payload.get("service", "") + api_key = payload.get("api_key", "") + model = payload.get("model", "") + if service == "anthropic": + return await _thinking_levels_anthropic(api_key, model) + return { + "ok": False, + "error": "Autodiscovery is only available for Anthropic — " + "enter the effort value manually for this provider.", + } + return app, auth @@ -2458,6 +2489,33 @@ async def _test_openai(api_key: str, base_url: str | None, model: str = "gpt-4o- return {"ok": False, "error": str(exc)} +async def _thinking_levels_anthropic(api_key: str, model: str) -> dict: + if not api_key: + return {"ok": False, "error": "API key is empty"} + if not model: + return {"ok": False, "error": "Enter a model id first"} + try: + from anthropic import AsyncAnthropic + + client = AsyncAnthropic(api_key=api_key) + m = await client.models.retrieve(model) + caps = getattr(m, "capabilities", None) + if caps is not None and not isinstance(caps, dict): + caps = getattr(caps, "model_dump", lambda: {})() or {} + caps = caps or {} + effort = caps.get("effort") or {} + levels = [ + lvl + for lvl in ("low", "medium", "high", "xhigh", "max") + if isinstance(effort.get(lvl), dict) and effort[lvl].get("supported") + ] + thinking = caps.get("thinking") or {} + supported = bool(thinking.get("supported")) or bool(levels) + return {"ok": True, "supported": supported, "levels": levels} + except Exception as exc: + return {"ok": False, "error": str(exc)} + + async def _list_models_anthropic(api_key: str) -> dict: if not api_key: return {"ok": False, "error": "API key is empty"} From 910091db77d4d08cfa2b8073c064bfee6ef3df49 Mon Sep 17 00:00:00 2001 From: Matteo Merola Date: Thu, 25 Jun 2026 12:26:17 +0200 Subject: [PATCH 4/7] feat(ui): Alpine state + helpers for per-kind thinking levels Add levelOptions/fetchThinkingLevels (Anthropic autodiscovery), providerDocsUrl (info links for other providers), and sameAsMain (shared-config badge). --- api/templates/base.html | 60 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 59 insertions(+), 1 deletion(-) diff --git a/api/templates/base.html b/api/templates/base.html index e5cf454..59632c3 100644 --- a/api/templates/base.html +++ b/api/templates/base.html @@ -42,7 +42,7 @@ } window.showToast = showToast; - function llmTab(provider, apiKey, model, openaiKey, openaiBaseUrl, googleKey, googleBaseUrl, grokKey, grokBaseUrl, deepseekKey, deepseekBaseUrl, extractionProvider, extractionModel, consolidationProvider, consolidationModel, gdEnabled, gdProvider, gdModel, trEnabled, trProvider, trModel, promptToolUsageOverride, promptHistoryOverride, defaultToolUsage, defaultHistoryHandling, promptCaptureEnabled, compactionProvider, compactionModel, thinkingLevel) { + function llmTab(provider, apiKey, model, openaiKey, openaiBaseUrl, googleKey, googleBaseUrl, grokKey, grokBaseUrl, deepseekKey, deepseekBaseUrl, extractionProvider, extractionModel, consolidationProvider, consolidationModel, gdEnabled, gdProvider, gdModel, trEnabled, trProvider, trModel, promptToolUsageOverride, promptHistoryOverride, defaultToolUsage, defaultHistoryHandling, promptCaptureEnabled, compactionProvider, compactionModel, thinkingLevel, extractionThinkingLevel, consolidationThinkingLevel, gdThinkingLevel, trThinkingLevel, compactionThinkingLevel) { const currentProvider = provider || 'anthropic'; return { providerOptions: [ @@ -56,6 +56,14 @@ apiKey: apiKey || '', model: model || '', thinkingLevel: thinkingLevel || '', + extractionThinkingLevel: extractionThinkingLevel || '', + consolidationThinkingLevel: consolidationThinkingLevel || '', + gdThinkingLevel: gdThinkingLevel || '', + trThinkingLevel: trThinkingLevel || '', + compactionThinkingLevel: compactionThinkingLevel || '', + fetchedLevels: {anthropic: [], openai: [], google: [], grok: [], deepseek: []}, + fetchingLevels: {anthropic: false, openai: false, google: false, grok: false, deepseek: false}, + fetchLevelsResult: {anthropic: '', openai: '', google: '', grok: '', deepseek: ''}, openaiKey: openaiKey || '', openaiBaseUrl: openaiBaseUrl || '', googleKey: googleKey || '', @@ -170,6 +178,56 @@ return ['opus', 'sonnet', 'thinking', 'pro', 'deep-think', 'reasoner', 'grok-4'] .some(pat => id.includes(pat)); }, + // Effort values offered in the thinking-level datalist. Anthropic can be + // enriched live via "Fetch levels"; others fall back to the common set. + levelOptions(prov) { + const fetched = this.fetchedLevels[prov]; + if (fetched && fetched.length) return fetched; + return ['low', 'medium', 'high']; + }, + fetchThinkingLevels(service, model) { + const apiKey = this.keyFor(service); + if (!apiKey) { this.fetchLevelsResult[service] = 'Missing API key — save/enter it first'; return; } + if (!model) { this.fetchLevelsResult[service] = 'Enter a model id first'; return; } + this.fetchingLevels[service] = true; + this.fetchLevelsResult[service] = ''; + fetch('/setup/thinking-levels', { + method: 'POST', + headers: { + 'Content-Type': 'application/json', + 'Authorization': 'Bearer ' + (localStorage.getItem('admin_api_key') || '') + }, + body: JSON.stringify({service: service, api_key: apiKey, model: model}) + }) + .then(r => r.json()) + .then(d => { + if (d.ok && Array.isArray(d.levels) && d.levels.length) { + this.fetchedLevels[service] = d.levels; + this.fetchLevelsResult[service] = 'Supported: ' + d.levels.join(', '); + } else if (d.ok) { + this.fetchLevelsResult[service] = d.supported ? 'Thinking supported (no effort levels reported)' : 'Model does not support thinking'; + } else { + this.fetchLevelsResult[service] = 'Failed: ' + (d.error || 'unknown'); + } + }) + .catch(e => { this.fetchLevelsResult[service] = 'Error: ' + e.message; }) + .finally(() => { this.fetchingLevels[service] = false; }); + }, + // Docs for the reasoning-effort value to type, per provider. + providerDocsUrl(prov) { + return ({ + anthropic: 'https://platform.claude.com/docs/en/build-with-claude/effort', + openai: 'https://platform.openai.com/docs/guides/reasoning', + google: 'https://ai.google.dev/gemini-api/docs/thinking', + grok: 'https://docs.x.ai/docs/guides/reasoning', + deepseek: 'https://api-docs.deepseek.com/guides/reasoning_model' + })[prov] || ''; + }, + // True when a background kind points at the exact same provider+model as + // main inference — surfaced in the UI so shared config is obvious. + sameAsMain(prov, model) { + return !!model && prov === this.provider && model === this.model; + }, fetchModels(service) { const apiKey = this.keyFor(service); if (!apiKey) { From 839c3477157e3ef6a9bf38ab9dedbad65e634ab7 Mon Sep 17 00:00:00 2001 From: Matteo Merola Date: Thu, 25 Jun 2026 12:29:16 +0200 Subject: [PATCH 5/7] feat(ui): per-kind thinking-level controls in LLM tab Reusable Jinja macro renders a thinking-level field next to every inference kind's model (main, extraction, consolidation, goal decomposition, task reflection, compaction). Each is a type-or-pick datalist; Anthropic gets a 'Fetch levels' button (capability autodiscovery), other providers get an 'effort docs' link to enter the value manually. Background kinds show a 'same config as Main inference' badge when they target the main provider+model. Each level is saved to its own config key, cleared when the model can't think. --- api/templates/partials/llm.html | 62 +++++++++++++++++++++++++-------- 1 file changed, 47 insertions(+), 15 deletions(-) diff --git a/api/templates/partials/llm.html b/api/templates/partials/llm.html index 56b98df..1e8d4fb 100644 --- a/api/templates/partials/llm.html +++ b/api/templates/partials/llm.html @@ -1,4 +1,30 @@ {# LLM tab partial #} +{#- Reusable thinking-level control. `bg` adds the "shared config" badge for + background inference kinds that point at the same provider+model as main. -#} +{% macro think(level, prov, model, lid, bg=false) -%} +
+ +
+ + + + + + + effort docs ↗ +
+

+

Reasoning effort — higher = more reasoning, more tokens. Anthropic: “Fetch levels” autodiscovers supported values; other providers: see the docs link and type the value.

+
+{%- endmacro %}

System Prompt Controls

@@ -177,16 +208,7 @@

Active Inference Provider

Type any model id, or pick from the list. Use “Fetch models” in the provider card below to load the live list from the API.

-
- - -

Reasoning effort for models that support thinking. Higher = more reasoning, more tokens.

-
+{{ think('thinkingLevel', 'provider', 'model', 'dl-think-main') }}
@@ -253,6 +275,7 @@

Memory Models

Model used to extract memories from conversations.

+{{ think('extractionThinkingLevel', 'extractionProvider', 'extractionModel', 'dl-think-ext', true) }}
@@ -275,6 +298,7 @@

Memory Models

Model used to consolidate and merge duplicate memories.

+{{ think('consolidationThinkingLevel', 'consolidationProvider', 'consolidationModel', 'dl-think-con', true) }}
@@ -289,8 +313,10 @@

Memory Models

body: JSON.stringify({values: { 'memory.extraction_provider': extractionProvider, 'memory.extraction_model': extractionModel, + 'memory.extraction_thinking_level': modelSupportsThinking(extractionModel) ? extractionThinkingLevel : '', 'memory.consolidation_provider': consolidationProvider, - 'memory.consolidation_model': consolidationModel + 'memory.consolidation_model': consolidationModel, + 'memory.consolidation_thinking_level': modelSupportsThinking(consolidationModel) ? consolidationThinkingLevel : '' }}) }) .then(r => { memoryResultOk = r.ok; return r.json(); }) @@ -337,6 +363,7 @@

Goal Decomposition

Model used to classify message complexity and decompose goals.

+{{ think('gdThinkingLevel', 'gdProvider', 'gdModel', 'dl-think-gd', true) }}
@@ -351,7 +378,8 @@

Goal Decomposition

body: JSON.stringify({values: { 'goal_decomposition.enabled': gdEnabled ? 'true' : 'false', 'goal_decomposition.provider': gdProvider, - 'goal_decomposition.model': gdModel + 'goal_decomposition.model': gdModel, + 'goal_decomposition.thinking_level': modelSupportsThinking(gdModel) ? gdThinkingLevel : '' }}) }) .then(r => { gdResultOk = r.ok; return r.json(); }) @@ -398,6 +426,7 @@

Task Reflection

Model used to reflect on completed tasks and extract lessons.

+{{ think('trThinkingLevel', 'trProvider', 'trModel', 'dl-think-tr', true) }}
@@ -412,7 +441,8 @@

Task Reflection

body: JSON.stringify({values: { 'task_reflection.enabled': trEnabled ? 'true' : 'false', 'task_reflection.provider': trProvider, - 'task_reflection.model': trModel + 'task_reflection.model': trModel, + 'task_reflection.thinking_level': modelSupportsThinking(trModel) ? trThinkingLevel : '' }}) }) .then(r => { trResultOk = r.ok; return r.json(); }) @@ -454,6 +484,7 @@

History Compaction

A small, fast model is recommended (summarization is cheap).

+{{ think('compactionThinkingLevel', 'compactionProvider', 'compactionModel', 'dl-think-cp', true) }}
@@ -467,7 +498,8 @@

History Compaction

}, body: JSON.stringify({values: { 'compaction.provider': compactionProvider, - 'compaction.model': compactionModel + 'compaction.model': compactionModel, + 'compaction.thinking_level': modelSupportsThinking(compactionModel) ? compactionThinkingLevel : '' }}) }) .then(r => { compactionResultOk = r.ok; return r.json(); }) From f36aaea372faa89686bbbb941b0c3adf132e09e6 Mon Sep 17 00:00:00 2001 From: Matteo Merola Date: Thu, 25 Jun 2026 12:29:55 +0200 Subject: [PATCH 6/7] test(llm): cover generate_text reasoning + per-provider kwargs --- tests/test_llm.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/tests/test_llm.py b/tests/test_llm.py index 06d4f5c..62ac2fd 100644 --- a/tests/test_llm.py +++ b/tests/test_llm.py @@ -44,3 +44,26 @@ async def test_anthropic_generate_omits_effort_when_off() -> None: kwargs = create.await_args.kwargs assert "thinking" not in kwargs assert "output_config" not in kwargs + + +@pytest.mark.asyncio +async def test_anthropic_generate_text_sends_effort_when_set() -> None: + """Background tasks (memory/reflection/etc.) honor the client's level too.""" + client = LLMClient("anthropic", "x", thinking_level="low") + create = AsyncMock(return_value=type("R", (), {"content": []})()) + client._client = type("C", (), {"messages": type("M", (), {"create": create})()})() + + await client.generate_text(model="claude-4-6-opus", prompt="hi") + + kwargs = create.await_args.kwargs + assert kwargs["thinking"] == {"type": "adaptive"} + assert kwargs["output_config"] == {"effort": "low"} + + +def test_reasoning_kwargs_per_provider() -> None: + assert LLMClient("openai", "x", thinking_level="high")._reasoning_kwargs() == { + "reasoning_effort": "high" + } + assert LLMClient("anthropic", "x")._reasoning_kwargs() == {} + # unknown level value is ignored (off) + assert LLMClient("anthropic", "x", thinking_level="bogus")._reasoning_kwargs() == {} From d320eb451422c5041db764a1415249fef3f9da26 Mon Sep 17 00:00:00 2001 From: Matteo Merola Date: Thu, 25 Jun 2026 13:49:49 +0200 Subject: [PATCH 7/7] fix(ui): always show thinking-level control, save typed value MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The modelSupportsThinking() substring heuristic hid every control for real model ids (deepseek-v4-flash, claude-haiku-4-5, …). Drop the visibility gate so the field shows for all providers/models — matching the 'enter effort manually for other providers' requirement — and save the typed value as-is. The heuristic is demoted to a non-blocking amber hint shown only when a level is set on an unrecognized model. --- api/templates/partials/llm.html | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/api/templates/partials/llm.html b/api/templates/partials/llm.html index 1e8d4fb..df6613d 100644 --- a/api/templates/partials/llm.html +++ b/api/templates/partials/llm.html @@ -2,7 +2,7 @@ {#- Reusable thinking-level control. `bg` adds the "shared config" badge for background inference kinds that point at the same provider+model as main. -#} {% macro think(level, prov, model, lid, bg=false) -%} -
+
effort docs ↗

-

Reasoning effort — higher = more reasoning, more tokens. Anthropic: “Fetch levels” autodiscovers supported values; other providers: see the docs link and type the value.

+

Leave Off for non-reasoning models. Higher = more reasoning, more tokens. Anthropic: “Fetch levels” autodiscovers supported values; other providers: see the docs link and type the value.

+

Heads up: “” isn't a recognized reasoning model — only set a level if you know it supports thinking, or the inference call may error.

{%- endmacro %}
Active Inference Provider 'agent.deepseek_api_key': deepseekKey, 'agent.deepseek_base_url': deepseekBaseUrl, 'agent.model': model, - 'agent.thinking_level': modelSupportsThinking(model) ? thinkingLevel : '' + 'agent.thinking_level': thinkingLevel }}) }) .then(r => { resultOk = r.ok; return r.json(); }) @@ -313,10 +314,10 @@

Memory Models

body: JSON.stringify({values: { 'memory.extraction_provider': extractionProvider, 'memory.extraction_model': extractionModel, - 'memory.extraction_thinking_level': modelSupportsThinking(extractionModel) ? extractionThinkingLevel : '', + 'memory.extraction_thinking_level': extractionThinkingLevel, 'memory.consolidation_provider': consolidationProvider, 'memory.consolidation_model': consolidationModel, - 'memory.consolidation_thinking_level': modelSupportsThinking(consolidationModel) ? consolidationThinkingLevel : '' + 'memory.consolidation_thinking_level': consolidationThinkingLevel }}) }) .then(r => { memoryResultOk = r.ok; return r.json(); }) @@ -379,7 +380,7 @@

Goal Decomposition

'goal_decomposition.enabled': gdEnabled ? 'true' : 'false', 'goal_decomposition.provider': gdProvider, 'goal_decomposition.model': gdModel, - 'goal_decomposition.thinking_level': modelSupportsThinking(gdModel) ? gdThinkingLevel : '' + 'goal_decomposition.thinking_level': gdThinkingLevel }}) }) .then(r => { gdResultOk = r.ok; return r.json(); }) @@ -442,7 +443,7 @@

Task Reflection

'task_reflection.enabled': trEnabled ? 'true' : 'false', 'task_reflection.provider': trProvider, 'task_reflection.model': trModel, - 'task_reflection.thinking_level': modelSupportsThinking(trModel) ? trThinkingLevel : '' + 'task_reflection.thinking_level': trThinkingLevel }}) }) .then(r => { trResultOk = r.ok; return r.json(); }) @@ -499,7 +500,7 @@

History Compaction

body: JSON.stringify({values: { 'compaction.provider': compactionProvider, 'compaction.model': compactionModel, - 'compaction.thinking_level': modelSupportsThinking(compactionModel) ? compactionThinkingLevel : '' + 'compaction.thinking_level': compactionThinkingLevel }}) }) .then(r => { compactionResultOk = r.ok; return r.json(); })