diff --git a/CHANGELOG.md b/CHANGELOG.md
index c90f4deb..194b420a 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,59 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 Copyright 2026 Firefly Software Foundation. Licensed under the Apache License 2.0.
 
+## [26.06.11] - 2026-06-22
+
+SP-3: human-in-the-loop tool approval re-based onto pydantic-ai native deferred-tools.
+
+### Added
+
+- **Native tool approval / HITL.** Tools can declare `requires_approval=True`
+  (`firefly_tool(...)`, `BaseTool`, and threaded through `ToolKit.as_pydantic_tools()`
+  / `as_toolset()`). When the model calls such a tool, the agent run **pauses before
+  executing it** and returns a `DeferredToolRequests` as `result.output`. Detect with
+  the new `is_deferred(result)` helper; **resume** via
+  `agent.run(message_history=..., deferred_tool_results=DeferredToolResults(approvals={call_id: True | ToolApproved(override_args=...) | ToolDenied(message=...)}))`.
+  `FireflyAgent` auto-detects HITL (any approval-requiring tool/`ToolKit`/`as_toolset()`,
+  or an `ApprovalRequiredToolset` in `toolsets`) and widens its output union to allow the
+  pause **only then** — non-HITL agents are unchanged. Force with `hitl=True`.
+- **Inline (non-pausing) approval.** `FireflyAgent(approval_handler=...)` resolves
+  approvals *inside* the run via a native `HandleDeferredToolCalls` capability — for
+  programmatic / policy-based auto-approval.
+- **Native re-exports** from `fireflyframework_agentic.tools`: `DeferredToolRequests`,
+  `DeferredToolResults`, `ToolApproved`, `ToolDenied`, `ApprovalRequired` (plus the
+  already-exported `ApprovalRequiredToolset`). `is_deferred` and the `ApprovalHandler`
+  type are exported from `fireflyframework_agentic.agents`.
+
+### Changed
+
+- Post-run cross-cutting code now treats a paused run as a control object, not a final
+  answer: `_persist_memory`, the output-guard, validation, cache, logging, and
+  explainability middleware all **skip** a `DeferredToolRequests` output (preventing
+  corrupted memory turns, spurious `OutputGuardError`/`OutputReviewError`, and caching a
+  pause).
+- Tool **guard denials** (validation / rate-limit / sandbox) now raise `ToolGuardError`
+  instead of a plain `ToolError`. `ToolGuardError` subclasses `ToolError`, so existing
+  `except ToolError` handlers are unaffected.
+- `BaseTool._guarded_execute` now lets pydantic-ai's `ApprovalRequired` / `CallDeferred`
+  control signals propagate untouched (like `ModelRetry`), instead of wrapping them as
+  `ToolError`. This makes **dynamic** approval work — a tool body (with `takes_ctx=True`)
+  may `raise ApprovalRequired(metadata=...)` to defer that specific call; pair with
+  `FireflyAgent(hitl=True)` so the output union allows the pause.
+
+### Removed (breaking)
+
+- **`ApprovalGuard`** (and the `ApprovalCallback` alias). The bespoke guard-chain approval
+  (sync bool callback → `ToolError` on denial, no pause/resume/metadata) is replaced by the
+  native protocol above. Migration: `docs/migration.md` §6.
+
+### Notes
+
+- HITL stays three distinct layers by design: tool approval (native deferred-tools, agent
+  layer), workflow `human()` / `WorkflowInterrupt` (journal-replay), and pipeline `Pause`
+  / `approve_pause` (checkpoint). They are not collapsed.
+- Validated against a live Anthropic model: a `requires_approval` tool pauses the real run
+  (tool body does not execute), and resuming with approval runs it exactly once.
+
 ## [26.06.10] - 2026-06-22
 
 SP-5: native structured-output modes for reasoning patterns.
diff --git a/README.md b/README.md
index 8a3e6084..1e547c8b 100644
--- a/README.md
+++ b/README.md
@@ -156,9 +156,14 @@ create your own components; the framework discovers them via duck typing.
 
 - **Tools** — `ToolProtocol` (duck-typed) and `BaseTool` (inheritance) let you choose
   your extensibility style. `ToolBuilder` provides a fluent API for building tools
-  without subclassing. Five guard types (`ValidationGuard`, `RateLimitGuard`,
-  `ApprovalGuard`, `SandboxGuard`, `CompositeGuard`) intercept calls before execution.
-  Three composition patterns (`SequentialComposer`, `FallbackComposer`,
+  without subclassing. Four guard types (`ValidationGuard`, `RateLimitGuard`,
+  `SandboxGuard`, `CompositeGuard`) intercept calls before execution (a rejected guard
+  raises `ToolGuardError`). For **human-in-the-loop**, mark a tool `requires_approval=True`:
+  the agent run **pauses** before executing it and returns a `DeferredToolRequests`
+  (detected via `is_deferred(result)`), which you resume with `deferred_tool_results=` —
+  approving (`ToolApproved`), denying (`ToolDenied`), or auto-deciding inline via an
+  `approval_handler=`. The native deferred-tools types are re-exported from
+  `fireflyframework_agentic.tools`. Three composition patterns (`SequentialComposer`, `FallbackComposer`,
   `ConditionalComposer`) build higher-order tools. `ToolKit` groups tools for
   bulk registration. Nine built-in tools (calculator, datetime, filesystem, HTTP,
   JSON, search, shell, text, database) are ready to attach to any agent.
@@ -177,7 +182,11 @@ create your own components; the framework discovers them via duck typing.
   **Reflexion** (execute → critique → retry), **Tree of Thoughts** (branch →
   evaluate → select), and **Goal Decomposition** (goal → phases → tasks).
   All produce structured `ReasoningResult` with `ReasoningTrace`. Prompts are
-  slot-overridable. `OutputReviewer` can validate final outputs. `ReasoningPipeline`
+  slot-overridable. Each pattern's structured output is wrapped in a pydantic-ai
+  output mode — selected per-pattern via `output_mode=` or framework-wide via the
+  `reasoning_output_mode` config — `"tool"` (`ToolOutput`), `"native"` (provider
+  structured output), or `"prompted"` (`PromptedOutput`, portable to any model).
+  `OutputReviewer` can validate final outputs. `ReasoningPipeline`
   chains patterns sequentially.
 
 <p align="center">
@@ -536,6 +545,12 @@ async def lookup(query: str) -> str:
     return f"Result for {query}"
 ```
 
+> **Human-in-the-loop:** mark a tool `@firefly_tool(name=..., requires_approval=True)` and the
+> agent run **pauses** before executing it — `run()` returns a `DeferredToolRequests`
+> (detect with `is_deferred(result)`). Resume with
+> `agent.run(message_history=paused.all_messages(), deferred_tool_results=DeferredToolResults(approvals={call_id: True}))`.
+> Full detail in [docs/tools.md](docs/tools.md#human-in-the-loop-tool-approval).
+
 ### 4. Add Memory for Multi-Turn Conversations
 
 ```python
@@ -713,11 +728,11 @@ content processing, validation, explainability, and pipelines.
 Detailed guides for each module:
 
 - [Architecture](docs/architecture.md) — Design principles and layer diagram
-- [Agents](docs/agents.md) — Lifecycle, registry, delegation, decorators
+- [Agents](docs/agents.md) — Lifecycle, registry, delegation, decorators, human-in-the-loop approval
 - [Template Agents](docs/templates.md) — Summarizer, classifier, extractor, conversational, router
-- [Tools](docs/tools.md) — Protocol, builder, guards, composition, built-ins
+- [Tools](docs/tools.md) — Protocol, builder, guards, composition, built-ins, native HITL approval (`requires_approval`, deferred resume)
 - [Prompts](docs/prompts.md) — Templates, versioning, composition, validation
-- [Reasoning Patterns](docs/reasoning.md) — 6 patterns, structured outputs, custom patterns
+- [Reasoning Patterns](docs/reasoning.md) — 6 patterns, structured outputs, output modes (`output_mode`/`reasoning_output_mode`), custom patterns
 - [Content](docs/content.md) — Chunking, compression, batch processing
 - [Memory](docs/memory.md) — Conversation history, working memory, storage backends
 - [Validation](docs/validation.md) — Rules, QoS guards, output reviewer
diff --git a/docs/README.md b/docs/README.md
index e46a00d8..c2a45470 100644
--- a/docs/README.md
+++ b/docs/README.md
@@ -43,7 +43,7 @@ below it, keeping the dependency graph acyclic and each module independently tes
 |---|---|
 | **[Agents](agents.md)** | `FireflyAgent`, `AgentRegistry`, `AgentLifecycle`, `@firefly_agent` decorator, middleware stack (`AgentMiddleware`, `MiddlewareChain`, `Logging`/`PromptGuard`/`CostGuard`/`Observability`/`Explainability`/`Cache`/`OutputGuard`/`Validation`/`Retry`/`PromptCache` middleware), 7 delegation strategies (round-robin, capability, content-based, cost-aware, chain, fallback, weighted), `FallbackModelWrapper` / `run_with_fallback`, `ResultCache` |
 | **[Template Agents](templates.md)** | Five factory functions: summarizer, classifier, extractor, conversational, router |
-| **[Tools](tools.md)** | `ToolProtocol`, `BaseTool`, `ToolBuilder`, guards, composition, caching, 9 built-in tools; full-fidelity schemas via `ParameterSpec(python_type=…)`, `RunContext` opt-in (`takes_ctx`), `ToolKit.as_toolset()` + re-exported native combinators (`FilteredToolset`, `WrapperToolset`, `ApprovalRequiredToolset`, …) |
+| **[Tools](tools.md)** | `ToolProtocol`, `BaseTool`, `ToolBuilder`, guards, composition, caching, 9 built-in tools; full-fidelity schemas via `ParameterSpec(python_type=…)`, `RunContext` opt-in (`takes_ctx`), `ToolKit.as_toolset()` + re-exported native combinators (`FilteredToolset`, `WrapperToolset`, `ApprovalRequiredToolset`, …); human-in-the-loop tool approval (`requires_approval` / `is_deferred` / `deferred_tool_results` / `approval_handler`) |
 | **[Prompts](prompts.md)** | `PromptTemplate`, `PromptRegistry`, composers, validation, loaders |
 | **[Content](content.md)** | `TextChunker`, `MarkdownChunker`, `DocumentSplitter`, `ImageTiler`, `BatchProcessor`, compression; binary normalization (`content.binary`, `[binary]` extra: `BinaryNormalizer`, office/PDF/image/archive/email converters) |
 | **[Memory](memory.md)** | `ConversationMemory`, `WorkingMemory`, `MemoryManager`, `InMemoryStore` / `FileStore` / `SQLiteStore` backends, `MemoryScope`, LLM summarisation |
diff --git a/docs/agents.md b/docs/agents.md
index 8f8427d8..ef3aa5df 100644
--- a/docs/agents.md
+++ b/docs/agents.md
@@ -137,6 +137,34 @@ agent = FireflyAgent(
 
 ---
 
+## Human-in-the-Loop Tool Approval
+
+When a tool declares `requires_approval=True` (or an `ApprovalRequiredToolset` gates the
+toolset), `run()` / `run_sync()` **pause before the tool executes** and return a
+`DeferredToolRequests` as `result.output`. Detect this with `is_deferred(result)`, then
+resume by calling the agent again with the paused messages and the human's decision:
+
+```python
+from fireflyframework_agentic.agents import FireflyAgent, is_deferred
+from fireflyframework_agentic.tools import DeferredToolResults
+
+result = await agent.run("Delete record 42.")
+if is_deferred(result):
+    approvals = {c.tool_call_id: True for c in result.output.approvals}  # True / ToolApproved / ToolDenied
+    result = await agent.run(
+        message_history=result.all_messages(),
+        deferred_tool_results=DeferredToolResults(approvals=approvals),
+    )
+```
+
+`FireflyAgent` auto-detects HITL from approval-requiring tools/toolsets (widening its output
+union only then); force it with `hitl=True`, or resolve approvals inline without pausing via
+`approval_handler=`. Post-run middleware (output guard, validation, cache) and memory all skip
+a paused result — it is a control object, not a final answer. See
+[Human-in-the-Loop Tool Approval](tools.md#human-in-the-loop-tool-approval) for the full guide.
+
+---
+
 ## Agent Registry
 
 The `AgentRegistry` is a singleton that maps agent names to `FireflyAgent` instances.
diff --git a/docs/architecture.md b/docs/architecture.md
index 5b6f2973..c4d7541e 100644
--- a/docs/architecture.md
+++ b/docs/architecture.md
@@ -61,7 +61,7 @@ graph TD
 
     subgraph Agent Layer
         AGT["Agents<br/><small>FireflyAgent · AgentRegistry<br/>DelegationRouter · AgentLifecycle<br/>@firefly_agent · 5 templates · 11 middleware<br/>7 delegation strategies · FallbackModelWrapper<br/>ResultCache · run timeout</small>"]
-        TOOLS["Tools<br/><small>BaseTool · ToolBuilder · ToolKit · CachedTool<br/>5 guards · 3 composers · tool timeout<br/>ToolRegistry · 9 built-ins</small>"]
+        TOOLS["Tools<br/><small>BaseTool · ToolBuilder · ToolKit · CachedTool<br/>4 guards · 3 composers · tool timeout · HITL approval<br/>ToolRegistry · 9 built-ins</small>"]
         PROMPTS["Prompts<br/><small>PromptTemplate · PromptRegistry<br/>3 composers · PromptValidator<br/>PromptLoader</small>"]
         CONTENT["Content<br/><small>TextChunker · DocumentSplitter · MarkdownChunker<br/>ImageTiler · BatchProcessor<br/>ContextCompressor · SlidingWindowManager<br/>content.binary (BinaryNormalizer · office converters)</small>"]
         MEM["Memory<br/><small>MemoryManager · ConversationMemory<br/>WorkingMemory · TokenEstimator<br/>InMemoryStore · FileStore · SQLiteStore<br/>summarization · create_llm_summarizer<br/>export/import · async wrappers</small>"]
@@ -166,7 +166,6 @@ classDiagram
     ToolProtocol <|.. ConditionalComposer
     GuardProtocol <|.. ValidationGuard
     GuardProtocol <|.. RateLimitGuard
-    GuardProtocol <|.. ApprovalGuard
     GuardProtocol <|.. SandboxGuard
     GuardProtocol <|.. CompositeGuard
     ReasoningPattern <|.. AbstractReasoningPattern
@@ -215,7 +214,7 @@ system. Every other module depends on at least one Core component.
   from environment variables and `.env` files. It actively rejects removed serving/exposure
   config fields (e.g. `otlp_endpoint`, `rbac_enabled`, `cors_allowed_origins`,
   `cost_calculator`) with a `ValueError`.
-- **exceptions.py** -- A structured exception hierarchy of 34 classes rooted at
+- **exceptions.py** -- A structured exception hierarchy of 42 classes rooted at
   `FireflyAgenticError`.
 - **plugin.py** -- `PluginDiscovery` discovers and loads entry-point plugins at startup.
 - **resilience/circuit_breaker.py** -- `CircuitBreaker` (with `CircuitState` and
diff --git a/docs/migration.md b/docs/migration.md
index e0cff968..13361292 100644
--- a/docs/migration.md
+++ b/docs/migration.md
@@ -134,6 +134,50 @@ await triage(args, runner=FireflyAgentRunner())   # both resolved from the regis
 
 ---
 
+## 6. `ApprovalGuard` removed — human-in-the-loop is now native (breaking)
+
+**Why.** Tool approval was a bespoke guard (`ApprovalGuard(callback)`) that ran inside
+Firefly's guard chain and raised `ToolError` on a denied call — a synchronous, all-or-nothing
+gate with no pause/resume, metadata, or per-call granularity, parallel to pydantic-ai's own
+deferred-tools protocol. It has been **removed** in favour of the native protocol
+(`requires_approval`, `DeferredToolRequests`/`DeferredToolResults`, `ApprovalRequired`,
+`ApprovalRequiredToolset`).
+
+```python
+# Before — guard that blocks the run on denial
+from fireflyframework_agentic.tools.guards import ApprovalGuard
+
+async def approve(tool_name, kwargs) -> bool:
+    return await ask_admin(tool_name, kwargs)
+
+@guarded(ApprovalGuard(callback=approve))
+@firefly_tool("delete_record", description="Delete a record")
+async def delete_record(record_id: str) -> str: ...
+
+# After — native: the run PAUSES for sign-off, then resumes
+from fireflyframework_agentic.agents import is_deferred
+from fireflyframework_agentic.tools import DeferredToolResults
+
+@firefly_tool("delete_record", description="Delete a record", requires_approval=True)
+async def delete_record(record_id: str) -> str: ...
+
+result = await agent.run("delete record 42")
+if is_deferred(result):
+    approvals = {c.tool_call_id: await ask_admin(c) for c in result.output.approvals}  # bool / ToolApproved / ToolDenied
+    result = await agent.run(message_history=result.all_messages(),
+                             deferred_tool_results=DeferredToolResults(approvals=approvals))
+```
+
+For the old **inline, non-pausing** behaviour (a callback decides programmatically), pass
+`FireflyAgent(approval_handler=...)` — wired as a native `HandleDeferredToolCalls` capability.
+See [Human-in-the-Loop Tool Approval](tools.md#human-in-the-loop-tool-approval).
+
+Also: guard denials (validation, rate-limit, sandbox) now raise `ToolGuardError` instead of a
+plain `ToolError`. `ToolGuardError` **subclasses** `ToolError`, so existing `except ToolError`
+handlers keep working.
+
+---
+
 ## Checklist
 
 - [ ] Replace every `type_annotation="..."` with `python_type=<real type>` in
@@ -144,3 +188,7 @@ await triage(args, runner=FireflyAgentRunner())   # both resolved from the regis
 - [ ] Review workflows for global cost/budget effects now that sub-agents run through
       `FireflyAgent`; pass `runner=DefaultAgentRunner()` if you want the old path.
 - [ ] Import toolset combinators / `RunContext` from `fireflyframework_agentic.tools`.
+- [ ] Replace `ApprovalGuard` with `requires_approval=True` + the native pause/resume flow
+      (`is_deferred()` + `deferred_tool_results=`), or an inline `approval_handler=`.
+- [ ] If you matched on `ToolError` from guard denials specifically, note it is now the
+      `ToolGuardError` subclass (still caught by `except ToolError`).
diff --git a/docs/tools.md b/docs/tools.md
index 545d3995..38c072ea 100644
--- a/docs/tools.md
+++ b/docs/tools.md
@@ -188,7 +188,12 @@ flowchart LR
 
 Each guard implements `GuardProtocol.check(tool_name, kwargs) -> GuardResult`. When a
 guard returns `GuardResult(passed=False, reason=...)`, `BaseTool.execute()` raises a
-`ToolError` before the handler runs.
+`ToolGuardError` (a `ToolError` subclass) before the handler runs.
+
+> Guards are for **hard, synchronous policy** (validation, rate-limiting, sandboxing).
+> For **human-in-the-loop approval** — pausing a run until a person signs off — use the
+> native deferred-tools path described in
+> [Human-in-the-Loop Tool Approval](#human-in-the-loop-tool-approval), not a guard.
 
 ### Built-in Guards
 
@@ -196,8 +201,6 @@ guard returns `GuardResult(passed=False, reason=...)`, `BaseTool.execute()` rais
   required keyword argument is missing.
 - **RateLimitGuard** -- `RateLimitGuard(max_calls, period_seconds=60.0)` — sliding-window
   rate limiter that caps invocations per time window.
-- **ApprovalGuard** -- `ApprovalGuard(callback)` — human-in-the-loop gate; `callback` is
-  an async callable `(tool_name, kwargs) -> bool` that must return `True` to approve.
 - **SandboxGuard** -- `SandboxGuard(*, allowed_patterns=(), denied_patterns=())` — converts
   each kwarg value to a string and rejects it if it matches a `denied_patterns` regex
   (unless it also matches an `allowed_patterns` regex, which takes precedence).
@@ -253,6 +256,139 @@ async def fetch(url: str) -> str:
 
 ---
 
+## Human-in-the-Loop Tool Approval
+
+Destructive or sensitive tools can require a human to sign off **before** they run.
+Firefly delegates this to pydantic-ai's native **deferred-tools** protocol — no bespoke
+mechanism — so approval is per-tool-call, carries metadata, and survives durable
+execution.
+
+### Declaring a tool that needs approval
+
+Set `requires_approval=True` on the tool. It works on the decorator, `BaseTool`
+subclasses, and tools added to a `ToolKit` (via either `as_pydantic_tools()` or
+`as_toolset()`):
+
+```python
+from fireflyframework_agentic.tools import firefly_tool
+
+@firefly_tool("delete_record", description="Delete a database record", requires_approval=True)
+async def delete_record(record_id: str) -> str:
+    ...
+```
+
+### Pause → approve → resume
+
+When the model calls an approval-required tool, the run **pauses before executing it**
+and returns a `DeferredToolRequests` as `result.output`. Check this with `is_deferred()`,
+collect the human decision, then **resume** by calling `run()` again with the prior
+messages and a `DeferredToolResults`:
+
+```python
+from fireflyframework_agentic.agents import FireflyAgent, is_deferred
+from fireflyframework_agentic.tools import DeferredToolResults, ToolApproved, ToolDenied
+
+agent = FireflyAgent("ops", model="anthropic:claude-haiku-4-5", tools=[delete_record])
+
+result = await agent.run("Delete record 42.")
+if is_deferred(result):
+    requests = result.output                      # DeferredToolRequests
+    decisions = {}
+    for call in requests.approvals:               # each is a ToolCallPart
+        # call.tool_name, call.args, requests.metadata.get(call.tool_call_id)
+        decisions[call.tool_call_id] = True        # True / ToolApproved(override_args=...) / ToolDenied(message=...)
+    result = await agent.run(
+        message_history=result.all_messages(),     # required: thread the paused run's messages
+        deferred_tool_results=DeferredToolResults(approvals=decisions),
+    )
+# result.output is now the model's final answer
+```
+
+The full pause → approve → resume flow:
+
+```mermaid
+sequenceDiagram
+    actor Caller
+    participant Agent as FireflyAgent
+    participant Model as LLM
+    actor Human
+
+    Caller->>Agent: run("Delete record 42.")
+    Agent->>Model: prompt + tool schemas
+    Model-->>Agent: call delete_record(record_id="42")
+    Note over Agent: requires_approval → run PAUSES<br/>before executing the tool
+    Agent-->>Caller: result.output = DeferredToolRequests<br/>(is_deferred(result) is True)
+
+    Caller->>Human: present requests.approvals (+ metadata)
+    Human-->>Caller: decision = True / ToolApproved(override_args=…) / ToolDenied(message=…)
+
+    Caller->>Agent: run(message_history=result.all_messages(),<br/>deferred_tool_results=DeferredToolResults(approvals={call_id: decision}))
+    alt approved
+        Agent->>Agent: execute delete_record<br/>(RunContext.tool_call_approved = True)
+    else denied
+        Note over Agent: ToolDenied message returned to the model<br/>(not a crash — the run continues)
+    end
+    Agent->>Model: continue with tool result / denial
+    Model-->>Agent: final answer
+    Agent-->>Caller: result.output = final answer
+```
+
+> With an `approval_handler=` (the inline, non-pausing path), the handler resolves the
+> `DeferredToolRequests` **inside** the run via a native `HandleDeferredToolCalls` capability,
+> so the run never returns to the caller paused.
+
+Decision values: `True` approves (equivalent to `ToolApproved()`); `ToolApproved(override_args={...})`
+approves but replaces the call arguments; `ToolDenied(message="...")` denies — the message is
+returned to the model, which continues (it is **not** a crash). On resume the tool's
+`RunContext.tool_call_approved` is `True` and `RunContext.tool_call_metadata` carries any
+`DeferredToolResults.metadata` you passed.
+
+> Do not set `conversation_id` on the resume call — an explicit `message_history` and
+> memory injection are mutually exclusive.
+
+### Auto-detection and forcing HITL
+
+`FireflyAgent` widens its output type to allow the `DeferredToolRequests` pause exactly when
+HITL is in play. It auto-detects this from any `requires_approval` tool (directly, inside a
+`ToolKit`, or inside a `ToolKit.as_toolset()`), or an `ApprovalRequiredToolset` in `toolsets`.
+If your tools defer **dynamically** (raising `pydantic_ai.exceptions.ApprovalRequired`) so
+detection can't see it statically, pass `hitl=True`.
+
+### Dynamic, predicate-based approval
+
+To gate **existing** tools by a runtime predicate (e.g. approve small amounts, hold large
+ones), wrap a toolset in the native `ApprovalRequiredToolset`:
+
+```python
+from fireflyframework_agentic.tools import ApprovalRequiredToolset
+
+gated = ApprovalRequiredToolset(
+    my_toolkit.as_toolset(),
+    approval_required_func=lambda ctx, tool_def, args: args.get("amount", 0) > 1000,
+)
+agent = FireflyAgent("payments", model="...", toolsets=[gated])
+```
+
+### Inline (non-pausing) approval
+
+For programmatic / policy-based auto-approval that resolves **inside** the run instead of
+pausing, pass an `approval_handler` — wired as a native `HandleDeferredToolCalls` capability:
+
+```python
+def auto_approve(ctx, requests):
+    return requests.build_results(approvals={c.tool_call_id: True for c in requests.approvals})
+
+agent = FireflyAgent("ops", model="...", tools=[delete_record], approval_handler=auto_approve)
+```
+
+> **Three distinct HITL layers, by design.** Tool approval (this section) is native
+> deferred-tools at the **agent** layer. A **workflow** pause uses `human()` /
+> `WorkflowInterrupt` (journal-replay). A **pipeline** node pauses by returning `Pause(...)`
+> and resumes via `approve_pause` (checkpoint). They solve different problems and are not
+> collapsed into one mechanism.
+
+---
+
 ## Composition
 
 Tools can be composed into higher-level operations. Each composer implements
diff --git a/docs/tutorial.md b/docs/tutorial.md
index a5b6e5db..89006ad5 100644
--- a/docs/tutorial.md
+++ b/docs/tutorial.md
@@ -626,8 +626,9 @@ during a conversation to fetch data, trigger side-effects, or run computations.
 
 Pydantic AI already supports tool functions, but fireflyframework-agentic wraps them with
 a richer layer: a **protocol-based type system** (`ToolProtocol` → `BaseTool`),
-**guards** that enforce validation, rate-limiting, sandboxing, and approval policies
-before a tool executes, **composition** primitives (sequential, fallback, conditional),
+**guards** that enforce validation, rate-limiting, and sandboxing
+before a tool executes (human-in-the-loop *approval* is separate — it pauses the run
+rather than rejecting it), **composition** primitives (sequential, fallback, conditional),
 a **global registry** for discovery, and a **ToolKit** that can convert framework tools
 into Pydantic AI tools for injection into any agent.
 
@@ -654,7 +655,6 @@ graph TB
         VG["ValidationGuard"]
         RG["RateLimitGuard"]
         SG["SandboxGuard"]
-        AG["ApprovalGuard"]
     end
 
     subgraph "Composition"
@@ -681,7 +681,6 @@ graph TB
     CG --> VG
     CG --> RG
     CG --> SG
-    CG --> AG
     BT -.->|"compose"| SEQ
     BT -.->|"compose"| FB
     BT -.->|"compose"| COND
@@ -733,9 +732,10 @@ exchange_tool = (
 ### Tool Guards
 
 In production, you rarely want a tool to run unconditionally. Guards are decorators
-that wrap a tool's execution with policy checks — input validation, rate-limiting,
-filesystem sandboxing, or human-in-the-loop approval. They run **before** the handler
-(and optionally after), and they stack via `CompositeGuard`.
+that wrap a tool's execution with **hard, synchronous policy checks** — input validation,
+rate-limiting, or filesystem sandboxing. They run **before** the handler (and optionally
+after), and they stack via `CompositeGuard`. (Human-in-the-loop *approval*, which pauses
+the run rather than rejecting it, is handled separately — see below.)
 
 #### Validation Guard
 
@@ -785,24 +785,26 @@ async def read_file(path: str) -> str:
     ...
 ```
 
-#### Approval Guard
+#### Human-in-the-loop approval
 
-Requires explicit approval before execution — useful for destructive operations.
-You provide an async callback that receives the tool name and kwargs and returns
-`True` to approve:
+Human approval is **not** a guard — it pauses the run rather than failing it. Mark a tool
+with `requires_approval=True`; the agent run then pauses before the tool executes and the
+caller resumes with an approval decision. See
+[Human-in-the-Loop Tool Approval](tools.md#human-in-the-loop-tool-approval) for the full flow.
 
 ```python
-from fireflyframework_agentic.tools.guards import ApprovalGuard
+from fireflyframework_agentic.agents import is_deferred
+from fireflyframework_agentic.tools import DeferredToolResults
 
-async def require_admin_approval(tool_name: str, kwargs: dict) -> bool:
-    """In production, this would check a queue, Slack webhook, or admin UI."""
-    print(f"Approve {tool_name} with {kwargs}? (auto-approved in dev)")
-    return True
-
-@guarded(ApprovalGuard(callback=require_admin_approval))
-@firefly_tool(name="delete_record", description="Delete a database record")
+@firefly_tool(name="delete_record", description="Delete a database record", requires_approval=True)
 async def delete_record(record_id: str) -> str:
     ...
+
+result = await agent.run("Delete record 42.")
+if is_deferred(result):  # paused for sign-off
+    approvals = {c.tool_call_id: True for c in result.output.approvals}  # True / ToolApproved / ToolDenied
+    result = await agent.run(message_history=result.all_messages(),
+                             deferred_tool_results=DeferredToolResults(approvals=approvals))
 ```
 
 ### Composing Guards
diff --git a/fireflyframework_agentic/agents/__init__.py b/fireflyframework_agentic/agents/__init__.py
index c8a491fe..8d708ea9 100644
--- a/fireflyframework_agentic/agents/__init__.py
+++ b/fireflyframework_agentic/agents/__init__.py
@@ -14,7 +14,7 @@
 
 """Agents subpackage -- creation, registration, lifecycle, and delegation."""
 
-from fireflyframework_agentic.agents.base import FireflyAgent
+from fireflyframework_agentic.agents.base import ApprovalHandler, FireflyAgent, is_deferred
 from fireflyframework_agentic.agents.builtin_middleware import (
     BudgetExceededError,
     CacheMiddleware,
@@ -69,6 +69,7 @@
     "agent_lifecycle",
     "AgentMiddleware",
     "AgentRegistry",
+    "ApprovalHandler",
     "BudgetExceededError",
     "CacheMiddleware",
     "CacheStatistics",
@@ -84,6 +85,7 @@
     "FallbackModelWrapper",
     "FallbackStrategy",
     "FireflyAgent",
+    "is_deferred",
     "LoggingMiddleware",
     "MiddlewareChain",
     "MiddlewareContext",
diff --git a/fireflyframework_agentic/agents/base.py b/fireflyframework_agentic/agents/base.py
index 2708f6d2..38324283 100644
--- a/fireflyframework_agentic/agents/base.py
+++ b/fireflyframework_agentic/agents/base.py
@@ -29,14 +29,16 @@
 import logging
 import re
 import time
-from collections.abc import Sequence
+from collections.abc import Awaitable, Callable, Sequence
 from typing import TYPE_CHECKING, Any, Generic, cast
 
-from pydantic_ai import Agent
+from pydantic_ai import Agent, DeferredToolRequests, DeferredToolResults, RunContext
 from pydantic_ai import Tool as PydanticTool
+from pydantic_ai.capabilities import HandleDeferredToolCalls
 from pydantic_ai.exceptions import UserError
 from pydantic_ai.models import Model
 from pydantic_ai.settings import ModelSettings
+from pydantic_ai.toolsets import ApprovalRequiredToolset
 
 from fireflyframework_agentic.agents.builtin_middleware import (
     LoggingMiddleware,
@@ -114,6 +116,32 @@ def _suggested_retry_delay(exc: BaseException) -> float | None:
     return float(match.group(1)) if match else None
 
 
+#: An inline human-in-the-loop approval handler. Receives the run context and the
+#: pending :class:`~pydantic_ai.DeferredToolRequests`, and returns a
+#: :class:`~pydantic_ai.DeferredToolResults` resolving some/all approvals so the
+#: run continues *without* pausing (return ``None`` to decline and let the run
+#: pause normally). Wired into the inner agent as a ``HandleDeferredToolCalls``
+#: capability. Use ``requests.build_results(...)`` to construct the result.
+ApprovalHandler = Callable[
+    [RunContext[Any], DeferredToolRequests],
+    "DeferredToolResults | None | Awaitable[DeferredToolResults | None]",
+]
+
+
+def is_deferred(result: Any) -> bool:
+    """Whether an agent run *paused* instead of completing.
+
+    Returns ``True`` when ``result.output`` is a pydantic-ai
+    :class:`~pydantic_ai.DeferredToolRequests` — i.e. the run hit a tool that
+    requires human-in-the-loop approval (``.approvals``) or external execution
+    (``.calls``) and is awaiting a :class:`~pydantic_ai.DeferredToolResults`
+    before it can continue. Resume by calling :meth:`FireflyAgent.run` (or
+    ``run_sync``) again with ``message_history=result.all_messages()`` and
+    ``deferred_tool_results=...``.
+    """
+    return isinstance(getattr(result, "output", None), DeferredToolRequests)
+
+
 class FireflyAgent(Generic[AgentDepsT, OutputT]):
     """A managed agent that wraps a Pydantic AI :class:`Agent`.
 
@@ -154,6 +182,14 @@ class FireflyAgent(Generic[AgentDepsT, OutputT]):
             messages back after each invocation.
         auto_register: When *True* (the default), the agent is automatically
             added to the global :class:`AgentRegistry`.
+        hitl: Force human-in-the-loop mode on. Normally HITL is auto-detected
+            from approval-requiring tools/toolsets; set this when tools defer
+            dynamically (raising ``ApprovalRequired``) so the output union is
+            widened to allow the ``DeferredToolRequests`` pause.
+        approval_handler: Optional inline approval handler (see
+            :data:`ApprovalHandler`). When supplied, approvals are resolved
+            *inside* the run via a ``HandleDeferredToolCalls`` capability instead
+            of pausing — useful for programmatic / policy-based auto-approval.
     """
 
     def __init__(
@@ -176,6 +212,8 @@ def __init__(
         middleware: list[AgentMiddleware] | None = None,
         default_middleware: bool = True,
         auto_register: bool = True,
+        hitl: bool = False,
+        approval_handler: ApprovalHandler | None = None,
     ) -> None:
         cfg = get_config()
 
@@ -206,15 +244,33 @@ def __init__(
             merged_settings["temperature"] = cfg.default_temperature
         resolved_settings: ModelSettings | None = cast("ModelSettings", merged_settings) if merged_settings else None
 
+        # Human-in-the-loop: when any tool requires approval (declared on a
+        # BaseTool/ToolKit, or via an ApprovalRequiredToolset in ``toolsets``),
+        # or HITL is forced, or an inline approval handler is supplied, the run
+        # may pause and surface a ``DeferredToolRequests``. pydantic-ai only
+        # allows that terminal when it is in ``output_type`` — otherwise it
+        # raises ``UserError`` — so widen the output union exactly once here,
+        # gated strictly on HITL so non-HITL agents keep their declared shape.
+        self._hitl_enabled = hitl or approval_handler is not None or self._detect_hitl(tools, toolsets)
+        effective_output_type: Any = output_type
+        if self._hitl_enabled:
+            existing = list(output_type) if isinstance(output_type, list) else [output_type]
+            if DeferredToolRequests not in existing:
+                existing.append(DeferredToolRequests)
+            effective_output_type = existing
+
+        capabilities = [HandleDeferredToolCalls(approval_handler)] if approval_handler is not None else []
+
         self._agent: Agent[AgentDepsT, OutputT] = Agent(
             resolved_model,
             instructions=instructions,
-            output_type=output_type,
+            output_type=effective_output_type,
             deps_type=deps_type,
             tools=self._resolve_tools(tools),
             toolsets=list(toolsets),
             retries=resolved_retries,
             model_settings=resolved_settings,
+            capabilities=capabilities,
             name=name,
         )
 
@@ -278,6 +334,7 @@ async def run(
         conversation_id: str | None = None,
         timeout: float | None = None,
         context: AgentContext | None = None,
+        deferred_tool_results: DeferredToolResults | None = None,
         **kwargs: Any,
     ) -> Any:
         """Run the agent asynchronously.
@@ -290,8 +347,16 @@ async def run(
         is provided, the agent automatically loads conversation history
         as ``message_history`` and stores new messages after the run.
 
+        To **resume** a run that paused for human-in-the-loop approval (see
+        :func:`is_deferred`), pass ``deferred_tool_results`` together with
+        ``message_history=<paused result>.all_messages()``; do not also set
+        *conversation_id* on the resume call (memory injection and an explicit
+        ``message_history`` are mutually exclusive).
+
         Delegates to ``pydantic_ai.Agent.run``.
         """
+        if deferred_tool_results is not None:
+            kwargs["deferred_tool_results"] = deferred_tool_results
         if context is None:
             context = _AgentContext()
 
@@ -339,9 +404,15 @@ def run_sync(
         conversation_id: str | None = None,
         timeout: float | None = None,
         context: AgentContext | None = None,
+        deferred_tool_results: DeferredToolResults | None = None,
         **kwargs: Any,
     ) -> Any:
-        """Run the agent synchronously.  Delegates to ``pydantic_ai.Agent.run_sync``."""
+        """Run the agent synchronously.  Delegates to ``pydantic_ai.Agent.run_sync``.
+
+        Supports the same ``deferred_tool_results`` resume path as :meth:`run`.
+        """
+        if deferred_tool_results is not None:
+            kwargs["deferred_tool_results"] = deferred_tool_results
         if context is None:
             context = _AgentContext()
 
@@ -473,6 +544,11 @@ def _persist_memory(
         """After a run, store new messages in conversation memory."""
         if self._memory is None or conversation_id is None:
             return
+        if is_deferred(result):
+            # The run paused for approval; its output is a control object, not a
+            # final answer. Persisting it would corrupt the conversation turn —
+            # the resumed run records the completed turn instead.
+            return
         if not hasattr(result, "new_messages"):
             return
         new_msgs = result.new_messages()
@@ -785,12 +861,41 @@ def _resolve_tools(tools: Sequence[Any]) -> list[Any]:
                         item.pydantic_handler(),
                         name=item.name,
                         description=item.description,
+                        requires_approval=item.requires_approval,
                     )
                 )
             else:
                 resolved.append(item)
         return resolved
 
+    @staticmethod
+    def _detect_hitl(tools: Sequence[Any], toolsets: Sequence[Any]) -> bool:
+        """Whether any tool/toolset can pause the run for approval.
+
+        True if a :class:`BaseTool` (directly or inside a ``ToolKit``) declares
+        ``requires_approval``, a toolset is a native
+        :class:`~pydantic_ai.toolsets.ApprovalRequiredToolset`, or a toolset
+        (e.g. ``ToolKit.as_toolset()``) exposes a tool that declares
+        ``requires_approval``. Raw ``pydantic_ai.Tool`` objects with
+        ``requires_approval`` are also honoured; if introspection misses a case,
+        pass ``hitl=True`` explicitly.
+        """
+        for item in tools:
+            if isinstance(item, ToolKit):
+                if any(getattr(t, "requires_approval", False) for t in item.tools):
+                    return True
+            elif getattr(item, "requires_approval", False):
+                return True
+        for ts in toolsets:
+            if isinstance(ts, ApprovalRequiredToolset):
+                return True
+            # A FunctionToolset exposes its tools as a name -> Tool mapping; honour
+            # any that declare requires_approval (so as_toolset() HITL kits work).
+            ts_tools = getattr(ts, "tools", None)
+            if isinstance(ts_tools, dict) and any(getattr(t, "requires_approval", False) for t in ts_tools.values()):
+                return True
+        return False
+
     # -- Dunder --------------------------------------------------------------
 
     def __repr__(self) -> str:
diff --git a/fireflyframework_agentic/agents/builtin_middleware.py b/fireflyframework_agentic/agents/builtin_middleware.py
index ce407499..8017d9cc 100644
--- a/fireflyframework_agentic/agents/builtin_middleware.py
+++ b/fireflyframework_agentic/agents/builtin_middleware.py
@@ -47,6 +47,7 @@
 from typing import Any
 
 from opentelemetry import trace as _trace
+from pydantic_ai import DeferredToolRequests
 
 from fireflyframework_agentic.agents.middleware import MiddlewareContext
 from fireflyframework_agentic.exceptions import BudgetExceededError, OutputReviewError
@@ -72,6 +73,14 @@
 logger = logging.getLogger(__name__)
 
 
+def _is_deferred_output(result: Any) -> bool:
+    """Whether a run paused (output is a ``DeferredToolRequests``) rather than
+    completing. Output-consuming ``after_run`` hooks must skip such a result:
+    it is a HITL control object, not a final answer — scanning, validating, or
+    caching it is wrong (and would raise on a type mismatch)."""
+    return isinstance(getattr(result, "output", None), DeferredToolRequests)
+
+
 # -- Errors ------------------------------------------------------------------
 
 
@@ -141,6 +150,16 @@ async def after_run(self, context: MiddlewareContext, result: Any) -> Any:
         elapsed_ms = (time.monotonic() - t0) * 1000 if t0 is not None else 0.0
         method = context.method or "run"
 
+        if _is_deferred_output(result):
+            logger.log(
+                self._level,
+                "⏸ %s.%s paused after %.1fms awaiting tool approval",
+                context.agent_name,
+                method,
+                elapsed_ms,
+            )
+            return result
+
         suffix = self._usage_suffix(result) if self._include_usage else ""
         reasoning = self._reasoning_suffix(result)
 
@@ -420,7 +439,9 @@ async def before_run(self, context: MiddlewareContext) -> None:
     async def after_run(self, context: MiddlewareContext, result: Any) -> Any:
         """Record a decision with output information."""
         output_text = ""
-        if hasattr(result, "output"):
+        if _is_deferred_output(result):
+            output_text = "<paused: awaiting tool approval>"
+        elif hasattr(result, "output"):
             output_text = str(result.output)[:200]
         self._get_recorder().record(
             "llm_call",
@@ -458,6 +479,10 @@ async def before_run(self, context: MiddlewareContext) -> None:
 
     async def after_run(self, context: MiddlewareContext, result: Any) -> Any:
         """Store the result in the cache on miss."""
+        if _is_deferred_output(result):
+            # Never cache a paused run — a later identical prompt would be served
+            # a stale ``DeferredToolRequests`` instead of running for real.
+            return result
         if "_cache_result" not in context.metadata:
             prompt_str = str(context.prompt) if context.prompt is not None else ""
             self._cache.put(context.agent_name, prompt_str, result)
@@ -529,6 +554,8 @@ async def before_run(self, context: MiddlewareContext) -> None:
 
     async def after_run(self, context: MiddlewareContext, result: Any) -> Any:
         """Scan the output; reject or sanitise if unsafe."""
+        if _is_deferred_output(result):
+            return result
         output_text = ""
         if hasattr(result, "output"):
             output_text = str(result.output)
@@ -586,6 +613,8 @@ async def before_run(self, context: MiddlewareContext) -> None:
 
     async def after_run(self, context: MiddlewareContext, result: Any) -> Any:
         """Validate the output; raise on failure."""
+        if _is_deferred_output(result):
+            return result
         raw = result.output if hasattr(result, "output") else result
 
         parsed, parse_errors = self._reviewer._parse_output(raw)
diff --git a/fireflyframework_agentic/tools/__init__.py b/fireflyframework_agentic/tools/__init__.py
index 9453b34e..bde32cde 100644
--- a/fireflyframework_agentic/tools/__init__.py
+++ b/fireflyframework_agentic/tools/__init__.py
@@ -22,8 +22,16 @@
 ``pydantic_ai`` directly.
 """
 
-# Native pydantic-ai surface, re-exported so callers compose toolsets in one import.
-from pydantic_ai import RunContext
+# Native pydantic-ai surface, re-exported so callers compose toolsets, request
+# human approval, and resume deferred runs in one import.
+from pydantic_ai import (
+    DeferredToolRequests,
+    DeferredToolResults,
+    RunContext,
+    ToolApproved,
+    ToolDenied,
+)
+from pydantic_ai.exceptions import ApprovalRequired
 from pydantic_ai.toolsets import (
     ApprovalRequiredToolset,
     CombinedToolset,
@@ -52,7 +60,6 @@
 )
 from fireflyframework_agentic.tools.decorators import firefly_tool, guarded, retryable
 from fireflyframework_agentic.tools.guards import (
-    ApprovalGuard,
     CompositeGuard,
     RateLimitGuard,
     SandboxGuard,
@@ -62,13 +69,15 @@
 from fireflyframework_agentic.tools.toolkit import ToolKit, to_pydantic_handler
 
 __all__ = [
-    "ApprovalGuard",
+    "ApprovalRequired",
     "ApprovalRequiredToolset",
     "BaseTool",
     "CachedTool",
     "CombinedToolset",
     "CompositeGuard",
     "ConditionalComposer",
+    "DeferredToolRequests",
+    "DeferredToolResults",
     "FallbackComposer",
     "FilteredToolset",
     "FunctionToolset",
@@ -82,7 +91,9 @@
     "RunContext",
     "SandboxGuard",
     "SequentialComposer",
+    "ToolApproved",
     "ToolBuilder",
+    "ToolDenied",
     "ToolInfo",
     "ToolKit",
     "ToolProtocol",
diff --git a/fireflyframework_agentic/tools/base.py b/fireflyframework_agentic/tools/base.py
index 9c73e635..9d965656 100644
--- a/fireflyframework_agentic/tools/base.py
+++ b/fireflyframework_agentic/tools/base.py
@@ -38,8 +38,9 @@
 
 from pydantic import BaseModel, Field
 from pydantic_ai import ModelRetry, RunContext
+from pydantic_ai.exceptions import ApprovalRequired, CallDeferred
 
-from fireflyframework_agentic.exceptions import ToolError, ToolTimeoutError
+from fireflyframework_agentic.exceptions import ToolError, ToolGuardError, ToolTimeoutError
 
 logger = logging.getLogger(__name__)
 
@@ -151,6 +152,12 @@ class BaseTool(ABC):
             ``RunContext`` (agent deps, usage, retry count, messages). The
             generated handler injects it; ``_execute`` then receives it as the
             keyword-only ``_ctx``. Guards and caching never see ``ctx``.
+        requires_approval: When ``True``, the tool is exposed to pydantic-ai as
+            an approval-required tool. The agent run pauses before this tool
+            executes and surfaces a ``DeferredToolRequests`` for human
+            sign-off (resolved natively via ``DeferredToolResults``); the tool
+            body runs only once the call is approved. See the agent's
+            human-in-the-loop docs.
     """
 
     def __init__(
@@ -163,6 +170,7 @@ def __init__(
         parameters: Sequence[ParameterSpec] = (),
         timeout: float | None = None,
         takes_ctx: bool = False,
+        requires_approval: bool = False,
     ) -> None:
         self._name = name
         self._description = description
@@ -171,6 +179,7 @@ def __init__(
         self._parameters = list(parameters)
         self._timeout = timeout
         self._takes_ctx = takes_ctx
+        self._requires_approval = requires_approval
 
     # -- Properties ----------------------------------------------------------
 
@@ -204,12 +213,18 @@ def takes_ctx(self) -> bool:
         """Whether this tool opts into a pydantic-ai ``RunContext``."""
         return self._takes_ctx
 
+    @property
+    def requires_approval(self) -> bool:
+        """Whether calls to this tool require human-in-the-loop approval."""
+        return self._requires_approval
+
     # -- Execution -----------------------------------------------------------
 
     async def execute(self, **kwargs: Any) -> Any:
         """Run the tool after evaluating all guards.
 
-        If any guard rejects, a :class:`ToolError` is raised immediately.
+        If any guard rejects, a :class:`ToolGuardError` (a ``ToolError``
+        subclass) is raised immediately.
         When a *timeout* is configured, wraps the execution in
         :func:`asyncio.wait_for` and raises :class:`ToolTimeoutError`
         on expiry.
@@ -230,7 +245,7 @@ async def _guarded_execute(self, kwargs: dict[str, Any], *, ctx: Any) -> Any:
         for guard in self._guards:
             result = await guard.check(self._name, kwargs)
             if not result.passed:
-                raise ToolError(f"Guard rejected execution of tool '{self._name}': {result.reason}")
+                raise ToolGuardError(f"Guard rejected execution of tool '{self._name}': {result.reason}")
 
         logger.debug("Executing tool '%s' with kwargs=%s", self._name, list(kwargs.keys()))
         # ``_ctx`` reaches ``_execute`` only for ctx-aware tools; it is never in
@@ -243,6 +258,12 @@ async def _guarded_execute(self, kwargs: dict[str, Any], *, ctx: Any) -> Any:
             return await coro
         except TimeoutError:
             raise ToolTimeoutError(f"Tool '{self._name}' timed out after {self._timeout}s") from None
+        except (ApprovalRequired, CallDeferred):
+            # pydantic-ai human-in-the-loop / deferral control signals. Like
+            # ``ModelRetry`` these are NOT errors: they must reach the agent graph
+            # untouched so the run pauses and surfaces a ``DeferredToolRequests``.
+            # Wrapping them as ``ToolError`` would break dynamic tool approval.
+            raise
         except ToolError:
             raise
         except Exception as exc:
diff --git a/fireflyframework_agentic/tools/decorators.py b/fireflyframework_agentic/tools/decorators.py
index 3a51ff72..98484c11 100644
--- a/fireflyframework_agentic/tools/decorators.py
+++ b/fireflyframework_agentic/tools/decorators.py
@@ -54,8 +54,9 @@ def __init__(
         description: str = "",
         tags: Sequence[str] = (),
         guards: Sequence[GuardProtocol] = (),
+        requires_approval: bool = False,
     ) -> None:
-        super().__init__(name, description=description, tags=tags, guards=guards)
+        super().__init__(name, description=description, tags=tags, guards=guards, requires_approval=requires_approval)
         self._handler = handler
 
     async def _execute(self, **kwargs: Any) -> Any:
@@ -86,6 +87,7 @@ def firefly_tool(
     description: str = "",
     tags: Sequence[str] = (),
     guards: Sequence[GuardProtocol] = (),
+    requires_approval: bool = False,
     auto_register: bool = True,
 ) -> Callable[[AsyncHandler], BaseTool]:
     """Create a :class:`BaseTool` from an async function and optionally register it.
@@ -95,6 +97,9 @@ def firefly_tool(
         description: Description (falls back to the function's docstring).
         tags: Tags for capability-based discovery.
         guards: Guards evaluated before execution.
+        requires_approval: When *True*, calls to this tool pause the agent run
+            for human-in-the-loop approval via pydantic-ai's native
+            deferred-tools protocol (see the agent HITL docs).
         auto_register: When *True*, the tool is added to the global registry.
 
     Returns:
@@ -109,6 +114,7 @@ def decorator(func: AsyncHandler) -> BaseTool:
             description=description or func.__doc__ or "",
             tags=tags,
             guards=guards,
+            requires_approval=requires_approval,
         )
         if auto_register:
             tool_registry.register(tool)
diff --git a/fireflyframework_agentic/tools/guards.py b/fireflyframework_agentic/tools/guards.py
index db763ca6..6a8eb9f8 100644
--- a/fireflyframework_agentic/tools/guards.py
+++ b/fireflyframework_agentic/tools/guards.py
@@ -25,14 +25,11 @@
 import re
 import threading
 import time
-from collections.abc import Awaitable, Callable, Sequence
+from collections.abc import Sequence
 from typing import Any
 
 from fireflyframework_agentic.tools.base import GuardProtocol, GuardResult
 
-# Type alias for the human-in-the-loop approval callback
-ApprovalCallback = Callable[[str, dict[str, Any]], Awaitable[bool]]
-
 
 class ValidationGuard:
     """Validates that all required parameters declared in a tool's spec are present.
@@ -84,26 +81,6 @@ async def check(self, tool_name: str, kwargs: dict[str, Any]) -> GuardResult:
         return GuardResult(passed=True)
 
 
-class ApprovalGuard:
-    """Human-in-the-loop guard that delegates the decision to an async callback.
-
-    The callback receives the tool name and kwargs and must return ``True``
-    to approve execution.
-
-    Parameters:
-        callback: An async callable ``(tool_name, kwargs) -> bool``.
-    """
-
-    def __init__(self, callback: ApprovalCallback) -> None:
-        self._callback = callback
-
-    async def check(self, tool_name: str, kwargs: dict[str, Any]) -> GuardResult:
-        approved = await self._callback(tool_name, kwargs)
-        if not approved:
-            return GuardResult(passed=False, reason="Execution not approved")
-        return GuardResult(passed=True)
-
-
 class SandboxGuard:
     """Restricts tool arguments by matching against allow / deny regex patterns.
 
diff --git a/fireflyframework_agentic/tools/toolkit.py b/fireflyframework_agentic/tools/toolkit.py
index aa95c18c..029499d3 100644
--- a/fireflyframework_agentic/tools/toolkit.py
+++ b/fireflyframework_agentic/tools/toolkit.py
@@ -113,6 +113,7 @@ def as_pydantic_tools(self) -> list[PydanticTool[Any]]:
                     to_pydantic_handler(tool),
                     name=tool.name,
                     description=tool.description,
+                    requires_approval=getattr(tool, "requires_approval", False),
                 )
             )
         return pydantic_tools
@@ -130,7 +131,14 @@ def as_toolset(self) -> FunctionToolset[Any]:
         for tool in self._tools:
             # Forward the description too — add_function drops it otherwise, so the
             # LLM would see a toolset tool with no description (a real fidelity loss).
-            toolset.add_function(to_pydantic_handler(tool), name=tool.name, description=tool.description)
+            # ``requires_approval`` is forwarded so a kit's HITL tools keep pausing
+            # the run for approval even when the kit is added as a toolset.
+            toolset.add_function(
+                to_pydantic_handler(tool),
+                name=tool.name,
+                description=tool.description,
+                requires_approval=getattr(tool, "requires_approval", False),
+            )
         return toolset
 
     def __len__(self) -> int:
diff --git a/pyproject.toml b/pyproject.toml
index ed069a28..3bb5eca6 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "fireflyframework-agentic"
-version = "26.06.10"
+version = "26.06.11"
 description = "A GenAI metaframework built on Pydantic AI for building production-grade GenAI applications with agents, reasoning patterns, prompt engineering, observability, and more."
 readme = "README.md"
 license = { text = "Apache-2.0" }
diff --git a/tests/integration/test_real_anthropic_e2e.py b/tests/integration/test_real_anthropic_e2e.py
index cb867e77..a86fa1f4 100644
--- a/tests/integration/test_real_anthropic_e2e.py
+++ b/tests/integration/test_real_anthropic_e2e.py
@@ -35,8 +35,9 @@
 
 import pytest
 from pydantic import BaseModel
+from pydantic_ai import DeferredToolResults
 
-from fireflyframework_agentic.agents.base import FireflyAgent
+from fireflyframework_agentic.agents.base import FireflyAgent, is_deferred
 from fireflyframework_agentic.memory.manager import MemoryManager
 from fireflyframework_agentic.observability.usage import UsageTracker
 from fireflyframework_agentic.pipeline.builder import PipelineBuilder
@@ -144,6 +145,48 @@ async def test_reasoning_prompted_output_mode(self):
         assert result.steps_taken >= 1
         assert "60" in str(result.output)
 
+    async def test_hitl_tool_approval_pause_resume(self):
+        """A requires_approval tool pauses the real run before executing; resuming
+        with approval runs the tool and the model uses its result."""
+        calls: list[str] = []
+
+        class DeleteTool(BaseTool):
+            def __init__(self) -> None:
+                super().__init__(
+                    "delete_record",
+                    description="Delete a record by its id. This is destructive.",
+                    parameters=[ParameterSpec(name="record_id", python_type=str)],
+                    requires_approval=True,
+                )
+
+            async def _execute(self, *, record_id: str) -> str:
+                calls.append(record_id)
+                return f"record {record_id} deleted"
+
+        ag = FireflyAgent(
+            "e2e-hitl",
+            model=HAIKU,
+            tools=[DeleteTool()],
+            instructions="To delete, call delete_record with the id. Then confirm to the user.",
+            auto_register=False,
+        )
+
+        paused = await ag.run("Please delete record 42.")
+        assert is_deferred(paused), "the approval-required tool should pause the run"
+        assert calls == [], "the tool body must NOT run before approval"
+        approvals = paused.output.approvals
+        assert [c.tool_name for c in approvals] == ["delete_record"]
+        call_id = approvals[0].tool_call_id
+
+        resumed = await ag.run(
+            None,
+            message_history=paused.all_messages(),
+            deferred_tool_results=DeferredToolResults(approvals={call_id: True}),
+        )
+        assert not is_deferred(resumed)
+        assert calls == ["42"], "the tool body must run exactly once, after approval"
+        assert "42" in str(resumed.output)
+
     async def test_pipeline_agent_step(self):
         """A DAG pipeline runs a real agent step end to end."""
         ag = FireflyAgent("e2e-pipe", model=HAIKU, instructions="Answer with a single word.", auto_register=False)
diff --git a/tests/unit/agents/test_hitl_deferred.py b/tests/unit/agents/test_hitl_deferred.py
new file mode 100644
index 00000000..a1faa585
--- /dev/null
+++ b/tests/unit/agents/test_hitl_deferred.py
@@ -0,0 +1,529 @@
+# Copyright 2026 Firefly Software Foundation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""SP-3: human-in-the-loop tool approval via pydantic-ai native deferred-tools.
+
+An approval-required tool pauses the run and surfaces a ``DeferredToolRequests``
+(``is_deferred`` is True). The caller resumes by approving/denying via
+``DeferredToolResults``. Post-run cross-cutting code (memory, OutputGuard,
+Validation, Cache, Logging) must treat the paused result as a control object,
+not a final answer.
+"""
+
+from __future__ import annotations
+
+import logging
+from types import SimpleNamespace
+from typing import Any
+
+import pytest
+from pydantic_ai import DeferredToolRequests, DeferredToolResults, Tool, ToolApproved, ToolDenied
+from pydantic_ai.exceptions import UserError
+from pydantic_ai.messages import ModelResponse, TextPart, ToolCallPart
+from pydantic_ai.models.function import FunctionModel
+from pydantic_ai.toolsets import ApprovalRequiredToolset, FunctionToolset
+
+from fireflyframework_agentic.agents.base import FireflyAgent, is_deferred
+from fireflyframework_agentic.agents.builtin_middleware import (
+    CacheMiddleware,
+    ExplainabilityMiddleware,
+    OutputGuardError,
+    OutputGuardMiddleware,
+    ValidationMiddleware,
+)
+from fireflyframework_agentic.agents.cache import ResultCache
+from fireflyframework_agentic.exceptions import OutputReviewError
+from fireflyframework_agentic.memory.manager import MemoryManager
+from fireflyframework_agentic.tools import ApprovalRequired
+from fireflyframework_agentic.tools.base import BaseTool, ParameterSpec
+from fireflyframework_agentic.tools.decorators import firefly_tool
+from fireflyframework_agentic.tools.toolkit import ToolKit
+
+
+def _model_calling(*calls: tuple[str, dict[str, Any], str]) -> FunctionModel:
+    """A model that emits the given tool calls in its first response, then text.
+
+    Each call is ``(tool_name, args, tool_call_id)``. Emitting several in one
+    response exercises a single pause carrying multiple pending approvals.
+    """
+    state = {"n": 0}
+
+    def fn(messages: Any, info: Any) -> ModelResponse:
+        state["n"] += 1
+        if state["n"] == 1:
+            return ModelResponse(parts=[ToolCallPart(tool_name=t, args=a, tool_call_id=c) for (t, a, c) in calls])
+        return ModelResponse(parts=[TextPart("done")])
+
+    return FunctionModel(fn)
+
+
+class _RecordingTool(BaseTool):
+    """An approval-required tool that records the args it actually executed with."""
+
+    def __init__(self, name: str = "rec_delete") -> None:
+        super().__init__(
+            name,
+            description="delete by name",
+            parameters=[ParameterSpec(name="name", python_type=str)],
+            requires_approval=True,
+        )
+        self.ran_with: list[str] = []
+
+    async def _execute(self, *, name: str) -> str:
+        self.ran_with.append(name)
+        return f"DELETED {name}"
+
+
+class _DynamicTool(BaseTool):
+    """A ctx-aware tool that defers *dynamically* by raising ApprovalRequired until approved."""
+
+    def __init__(self) -> None:
+        super().__init__(
+            "dyn_delete",
+            description="delete a record (needs approval)",
+            parameters=[ParameterSpec(name="record_id", python_type=str)],
+            takes_ctx=True,
+        )
+        self.seen: dict[str, Any] = {}
+
+    async def _execute(self, *, record_id: str, _ctx: Any) -> str:
+        if not getattr(_ctx, "tool_call_approved", False):
+            raise ApprovalRequired(metadata={"reason": "destructive", "record_id": record_id})
+        self.seen["approved"] = _ctx.tool_call_approved
+        self.seen["metadata"] = dict(_ctx.tool_call_metadata or {})
+        return f"deleted {record_id}"
+
+
+def _approval_model() -> FunctionModel:
+    """A model that calls ``delete_db`` first, then answers with text."""
+    state = {"n": 0}
+
+    def fn(messages: Any, info: Any) -> ModelResponse:
+        state["n"] += 1
+        if state["n"] == 1:
+            return ModelResponse(
+                parts=[ToolCallPart(tool_name="delete_db", args={"name": "prod"}, tool_call_id="call_1")]
+            )
+        return ModelResponse(parts=[TextPart("done")])
+
+    return FunctionModel(fn)
+
+
+@firefly_tool("delete_db", description="Delete a database", requires_approval=True, auto_register=False)
+async def delete_db(name: str) -> str:
+    return f"DELETED {name}"
+
+
+@pytest.mark.asyncio
+async def test_requires_approval_pauses_then_resumes_approve():
+    agent = FireflyAgent("hitl", model=_approval_model(), tools=[delete_db], auto_register=False)
+
+    paused = await agent.run("delete prod")
+    assert is_deferred(paused)
+    reqs = paused.output
+    assert isinstance(reqs, DeferredToolRequests)
+    assert [c.tool_name for c in reqs.approvals] == ["delete_db"]
+    assert not reqs.calls
+    call_id = reqs.approvals[0].tool_call_id
+
+    resumed = await agent.run(
+        None,
+        message_history=paused.all_messages(),
+        deferred_tool_results=DeferredToolResults(approvals={call_id: True}),
+    )
+    assert not is_deferred(resumed)
+    assert resumed.output == "done"
+
+
+@pytest.mark.asyncio
+async def test_resume_deny_surfaces_to_model_and_completes():
+    agent = FireflyAgent("hitl-deny", model=_approval_model(), tools=[delete_db], auto_register=False)
+    paused = await agent.run("delete prod")
+    call_id = paused.output.approvals[0].tool_call_id
+
+    resumed = await agent.run(
+        None,
+        message_history=paused.all_messages(),
+        deferred_tool_results=DeferredToolResults(approvals={call_id: ToolDenied(message="not allowed")}),
+    )
+    # A denial is not a crash: the model is told and the run completes.
+    assert not is_deferred(resumed)
+    assert resumed.output == "done"
+
+
+@pytest.mark.asyncio
+async def test_inline_approval_handler_resolves_without_pausing():
+    def handler(ctx: Any, requests: DeferredToolRequests) -> DeferredToolResults:
+        return requests.build_results(approvals={c.tool_call_id: True for c in requests.approvals})
+
+    agent = FireflyAgent(
+        "hitl-inline", model=_approval_model(), tools=[delete_db], approval_handler=handler, auto_register=False
+    )
+    result = await agent.run("delete prod")
+    assert not is_deferred(result)
+    assert result.output == "done"
+
+
+def test_non_hitl_agent_output_type_is_not_widened():
+    agent = FireflyAgent("plain", model=_approval_model(), auto_register=False)
+    assert agent._hitl_enabled is False
+
+
+def test_hitl_detected_from_requires_approval_tool():
+    agent = FireflyAgent("d1", model=_approval_model(), tools=[delete_db], auto_register=False)
+    assert agent._hitl_enabled is True
+
+
+def test_hitl_detected_from_toolkit():
+    kit = ToolKit("danger", [delete_db])
+    agent = FireflyAgent("d2", model=_approval_model(), tools=[kit], auto_register=False)
+    assert agent._hitl_enabled is True
+
+
+def test_hitl_detected_from_as_toolset():
+    kit = ToolKit("danger", [delete_db])
+    agent = FireflyAgent("d3", model=_approval_model(), toolsets=[kit.as_toolset()], auto_register=False)
+    assert agent._hitl_enabled is True
+
+
+def test_hitl_detected_from_approval_required_toolset():
+    wrapped = ApprovalRequiredToolset(FunctionToolset(), approval_required_func=lambda *a: True)
+    agent = FireflyAgent("d4", model=_approval_model(), toolsets=[wrapped], auto_register=False)
+    assert agent._hitl_enabled is True
+
+
+def test_hitl_forced_via_flag():
+    agent = FireflyAgent("d5", model=_approval_model(), hitl=True, auto_register=False)
+    assert agent._hitl_enabled is True
+
+
+@pytest.mark.asyncio
+async def test_paused_result_not_persisted_to_memory():
+    from fireflyframework_agentic.memory.manager import MemoryManager
+
+    mem = MemoryManager()
+    agent = FireflyAgent("hitl-mem", model=_approval_model(), tools=[delete_db], memory=mem, auto_register=False)
+    paused = await agent.run("delete prod", conversation_id="c1")
+    assert is_deferred(paused)
+    # The pause must not be written as a conversation turn.
+    assert mem.get_message_history("c1") == []
+
+
+@pytest.mark.asyncio
+async def test_validation_middleware_skips_deferred_output():
+    class _AlwaysFailReviewer:
+        def _parse_output(self, raw: Any) -> tuple[Any, list[str]]:
+            return None, ["reviewer always fails"]
+
+        def _validate_output(self, validated: Any) -> Any:
+            return None
+
+    agent = FireflyAgent(
+        "hitl-val",
+        model=_approval_model(),
+        tools=[delete_db],
+        middleware=[ValidationMiddleware(reviewer=_AlwaysFailReviewer())],
+        default_middleware=False,
+        auto_register=False,
+    )
+    # Would raise OutputReviewError if the deferred output were validated.
+    paused = await agent.run("delete prod")
+    assert is_deferred(paused)
+
+    # Sanity: the same reviewer DOES fire on a real completed output.
+    plain = FireflyAgent(
+        "plain-val",
+        model=FunctionModel(lambda m, i: ModelResponse(parts=[TextPart("hello")])),
+        middleware=[ValidationMiddleware(reviewer=_AlwaysFailReviewer())],
+        default_middleware=False,
+        auto_register=False,
+    )
+    with pytest.raises(OutputReviewError):
+        await plain.run("hi")
+
+
+@pytest.mark.asyncio
+async def test_cache_middleware_does_not_cache_a_pause():
+    cache = ResultCache()
+    agent = FireflyAgent(
+        "hitl-cache",
+        model=_approval_model(),
+        tools=[delete_db],
+        middleware=[CacheMiddleware(cache=cache)],
+        default_middleware=False,
+        auto_register=False,
+    )
+    paused = await agent.run("delete prod")
+    assert is_deferred(paused)
+    # The paused run must not have been cached.
+    assert cache.get("hitl-cache", "delete prod") is None
+
+
+def test_is_deferred_helper():
+    class _R:
+        output = DeferredToolRequests(approvals=[], calls=[], metadata={})
+
+    class _Done:
+        output = "final answer"
+
+    assert is_deferred(_R()) is True
+    assert is_deferred(_Done()) is False
+    assert is_deferred(None) is False
+
+
+@pytest.mark.asyncio
+async def test_run_sync_requires_approval_pauses_then_resumes_approve():
+    agent = FireflyAgent("hitl-sync", model=_approval_model(), tools=[delete_db], auto_register=False)
+    paused = agent.run_sync("delete prod")
+    assert is_deferred(paused)
+    call_id = paused.output.approvals[0].tool_call_id
+    resumed = agent.run_sync(
+        None,
+        message_history=paused.all_messages(),
+        deferred_tool_results=DeferredToolResults(approvals={call_id: True}),
+    )
+    assert not is_deferred(resumed)
+    assert resumed.output == "done"
+
+
+@pytest.mark.asyncio
+async def test_resume_approve_with_override_args_replaces_tool_args():
+    tool = _RecordingTool()
+    agent = FireflyAgent(
+        "override", model=_model_calling(("rec_delete", {"name": "prod"}, "c1")), tools=[tool], auto_register=False
+    )
+    paused = await agent.run("delete prod")
+    call_id = paused.output.approvals[0].tool_call_id
+    resumed = await agent.run(
+        None,
+        message_history=paused.all_messages(),
+        deferred_tool_results=DeferredToolResults(approvals={call_id: ToolApproved(override_args={"name": "staging"})}),
+    )
+    assert not is_deferred(resumed)
+    assert tool.ran_with == ["staging"]  # override_args replaced the model's "prod"
+
+
+@pytest.mark.asyncio
+async def test_dynamic_approval_required_from_tool_body_with_hitl_flag():
+    tool = _DynamicTool()
+    agent = FireflyAgent(
+        "dyn",
+        model=_model_calling(("dyn_delete", {"record_id": "7"}, "c1")),
+        tools=[tool],
+        hitl=True,
+        auto_register=False,
+    )
+    paused = await agent.run("delete 7")
+    assert is_deferred(paused)
+    call_id = paused.output.approvals[0].tool_call_id
+    resumed = await agent.run(
+        None,
+        message_history=paused.all_messages(),
+        deferred_tool_results=DeferredToolResults(approvals={call_id: True}),
+    )
+    assert not is_deferred(resumed)
+    assert tool.seen.get("approved") is True
+
+    # Without hitl=True the dynamic deferral has nowhere to surface -> UserError.
+    bad = FireflyAgent(
+        "dyn-bad",
+        model=_model_calling(("dyn_delete", {"record_id": "7"}, "c1")),
+        tools=[_DynamicTool()],
+        auto_register=False,
+    )
+    with pytest.raises(UserError):
+        await bad.run("delete 7")
+
+
+@pytest.mark.asyncio
+async def test_approval_metadata_round_trips_to_run_context():
+    tool = _DynamicTool()
+    agent = FireflyAgent(
+        "meta",
+        model=_model_calling(("dyn_delete", {"record_id": "9"}, "c1")),
+        tools=[tool],
+        hitl=True,
+        auto_register=False,
+    )
+    paused = await agent.run("delete 9")
+    call_id = paused.output.approvals[0].tool_call_id
+    # Outbound: ApprovalRequired(metadata=...) surfaces in DeferredToolRequests.metadata.
+    assert paused.output.metadata.get(call_id) == {"reason": "destructive", "record_id": "9"}
+    # Inbound: DeferredToolResults.metadata reaches the tool's RunContext on resume.
+    resumed = await agent.run(
+        None,
+        message_history=paused.all_messages(),
+        deferred_tool_results=DeferredToolResults(approvals={call_id: True}, metadata={call_id: {"reviewer": "alice"}}),
+    )
+    assert not is_deferred(resumed)
+    assert tool.seen.get("approved") is True
+    assert tool.seen.get("metadata") == {"reviewer": "alice"}
+
+
+@pytest.mark.asyncio
+async def test_approval_required_toolset_pauses_and_resumes_end_to_end():
+    ran: list[str] = []
+
+    async def do_thing(x: str) -> str:
+        ran.append(x)
+        return f"did {x}"
+
+    fts = FunctionToolset()
+    fts.add_function(do_thing, name="do_thing")
+    gated = ApprovalRequiredToolset(fts, approval_required_func=lambda *a: True)
+    agent = FireflyAgent(
+        "ars", model=_model_calling(("do_thing", {"x": "go"}, "c1")), toolsets=[gated], auto_register=False
+    )
+
+    paused = await agent.run("do it")
+    assert is_deferred(paused)
+    assert ran == []  # gated tool did not run before approval
+    call_id = paused.output.approvals[0].tool_call_id
+    resumed = await agent.run(
+        None,
+        message_history=paused.all_messages(),
+        deferred_tool_results=DeferredToolResults(approvals={call_id: True}),
+    )
+    assert not is_deferred(resumed)
+    assert ran == ["go"]
+
+
+@pytest.mark.asyncio
+async def test_output_guard_middleware_skips_deferred_output():
+    class _BlockAll:
+        def scan(self, text: str) -> Any:
+            return SimpleNamespace(
+                safe=False, matched_categories=["x"], matched_patterns=["p"], reason="blocked", sanitised_output=None
+            )
+
+    agent = FireflyAgent(
+        "og",
+        model=_approval_model(),
+        tools=[delete_db],
+        middleware=[OutputGuardMiddleware(guard=_BlockAll())],
+        default_middleware=False,
+        auto_register=False,
+    )
+    paused = await agent.run("delete prod")
+    assert is_deferred(paused)  # the block-all guard did NOT fire on the paused output
+
+    plain = FireflyAgent(
+        "og-plain",
+        model=FunctionModel(lambda m, i: ModelResponse(parts=[TextPart("hi")])),
+        middleware=[OutputGuardMiddleware(guard=_BlockAll())],
+        default_middleware=False,
+        auto_register=False,
+    )
+    with pytest.raises(OutputGuardError):
+        await plain.run("x")
+
+
+@pytest.mark.asyncio
+async def test_explainability_middleware_records_paused_marker():
+    records: list[tuple[str, dict[str, Any]]] = []
+
+    class _Recorder:
+        def record(self, kind: str, **kw: Any) -> None:
+            records.append((kind, kw))
+
+    agent = FireflyAgent(
+        "expl",
+        model=_approval_model(),
+        tools=[delete_db],
+        middleware=[ExplainabilityMiddleware(recorder=_Recorder())],
+        default_middleware=False,
+        auto_register=False,
+    )
+    paused = await agent.run("delete prod")
+    assert is_deferred(paused)
+    summaries = [kw.get("output_summary") for _, kw in records]
+    assert "<paused: awaiting tool approval>" in summaries
+
+
+@pytest.mark.asyncio
+async def test_logging_middleware_logs_paused_branch(caplog: pytest.LogCaptureFixture):
+    agent = FireflyAgent("logp", model=_approval_model(), tools=[delete_db], auto_register=False)
+    with caplog.at_level(logging.INFO, logger="fireflyframework_agentic.agents.builtin_middleware"):
+        paused = await agent.run("delete prod")
+    assert is_deferred(paused)
+    assert "paused" in caplog.text and "awaiting tool approval" in caplog.text
+    assert "completed in" not in caplog.text  # the normal completion line is NOT emitted
+
+
+@pytest.mark.asyncio
+async def test_multiple_pending_approvals_in_one_pause():
+    a = _RecordingTool("del_a")
+    b = _RecordingTool("del_b")
+    model = _model_calling(("del_a", {"name": "A"}, "ca"), ("del_b", {"name": "B"}, "cb"))
+    agent = FireflyAgent("multi", model=model, tools=[a, b], auto_register=False)
+
+    paused = await agent.run("delete both")
+    assert is_deferred(paused)
+    ids = {c.tool_name: c.tool_call_id for c in paused.output.approvals}
+    assert set(ids) == {"del_a", "del_b"}
+
+    resumed = await agent.run(
+        None,
+        message_history=paused.all_messages(),
+        deferred_tool_results=DeferredToolResults(
+            approvals={ids["del_a"]: True, ids["del_b"]: ToolDenied(message="no")}
+        ),
+    )
+    assert not is_deferred(resumed)
+    assert a.ran_with == ["A"]  # approved ran
+    assert b.ran_with == []  # denied did not run
+
+
+@pytest.mark.asyncio
+async def test_approval_handler_returning_none_still_pauses():
+    def decline(ctx: Any, requests: DeferredToolRequests) -> None:
+        return None  # decline to resolve inline -> the run pauses normally
+
+    agent = FireflyAgent(
+        "handler-none", model=_approval_model(), tools=[delete_db], approval_handler=decline, auto_register=False
+    )
+    paused = await agent.run("delete prod")
+    assert is_deferred(paused)
+    call_id = paused.output.approvals[0].tool_call_id
+    resumed = await agent.run(
+        None,
+        message_history=paused.all_messages(),
+        deferred_tool_results=DeferredToolResults(approvals={call_id: True}),
+    )
+    assert not is_deferred(resumed)
+
+
+def test_raw_pydantic_tool_requires_approval_detected():
+    async def fn(x: str) -> str:
+        return x
+
+    agent = FireflyAgent("raw", model=_approval_model(), tools=[Tool(fn, requires_approval=True)], auto_register=False)
+    assert agent._hitl_enabled is True
+
+
+@pytest.mark.asyncio
+async def test_resume_with_memory_does_not_double_inject():
+    mem = MemoryManager()
+    agent = FireflyAgent("memr", model=_approval_model(), tools=[delete_db], memory=mem, auto_register=False)
+    paused = await agent.run("delete prod", conversation_id="c1")
+    assert is_deferred(paused)
+    # Resume with an explicit message_history and NO conversation_id: memory injection is
+    # skipped (mutually exclusive), so prior messages are not double-injected and it completes.
+    resumed = await agent.run(
+        None,
+        message_history=paused.all_messages(),
+        deferred_tool_results=DeferredToolResults(approvals={paused.output.approvals[0].tool_call_id: True}),
+    )
+    assert not is_deferred(resumed)
+    assert resumed.output == "done"
diff --git a/tests/unit/tools/test_requires_approval.py b/tests/unit/tools/test_requires_approval.py
new file mode 100644
index 00000000..ce971049
--- /dev/null
+++ b/tests/unit/tools/test_requires_approval.py
@@ -0,0 +1,112 @@
+# Copyright 2026 Firefly Software Foundation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""SP-3 tool layer: ``requires_approval`` threading + guard-denial taxonomy.
+
+A tool's ``requires_approval`` flag must reach the native ``pydantic_ai.Tool``
+on every conversion path (direct ``tools=``, ``ToolKit.as_pydantic_tools``,
+``ToolKit.as_toolset``). Guard denials now raise ``ToolGuardError``. The legacy
+``ApprovalGuard`` (bespoke approval via ``ToolError``) is removed in favour of
+native deferred-tools.
+"""
+
+from __future__ import annotations
+
+from typing import Any
+
+import pytest
+
+from fireflyframework_agentic.exceptions import ToolError, ToolGuardError
+from fireflyframework_agentic.tools.base import BaseTool, GuardResult, ParameterSpec
+from fireflyframework_agentic.tools.decorators import firefly_tool
+from fireflyframework_agentic.tools.toolkit import ToolKit
+
+
+class _Tool(BaseTool):
+    def __init__(self, *, requires_approval: bool = False, guards: Any = ()) -> None:
+        super().__init__(
+            "danger",
+            description="a dangerous tool",
+            parameters=[ParameterSpec(name="q", python_type=str)],
+            guards=guards,
+            requires_approval=requires_approval,
+        )
+
+    async def _execute(self, **kwargs: Any) -> Any:
+        return "ran"
+
+
+def test_base_tool_requires_approval_property():
+    assert _Tool().requires_approval is False
+    assert _Tool(requires_approval=True).requires_approval is True
+
+
+def test_firefly_tool_requires_approval_flag():
+    @firefly_tool("t", requires_approval=True, auto_register=False)
+    async def t(x: str) -> str:
+        return x
+
+    assert t.requires_approval is True
+
+    @firefly_tool("t2", auto_register=False)
+    async def t2(x: str) -> str:
+        return x
+
+    assert t2.requires_approval is False
+
+
+def test_toolkit_as_pydantic_tools_threads_requires_approval():
+    # Distinguish the two tools by giving them different names.
+    a = _Tool(requires_approval=True)
+    a._name = "a"
+    b = _Tool(requires_approval=False)
+    b._name = "b"
+    kit = ToolKit("k", [a, b])
+
+    pyd = {t.name: t for t in kit.as_pydantic_tools()}
+    assert pyd["a"].requires_approval is True
+    assert pyd["b"].requires_approval is False
+
+
+def test_toolkit_as_toolset_threads_requires_approval():
+    a = _Tool(requires_approval=True)
+    a._name = "a"
+    b = _Tool(requires_approval=False)
+    b._name = "b"
+    toolset = ToolKit("k", [a, b]).as_toolset()
+
+    assert toolset.tools["a"].requires_approval is True
+    assert toolset.tools["b"].requires_approval is False
+
+
+@pytest.mark.asyncio
+async def test_guard_denial_raises_tool_guard_error():
+    class _Deny:
+        async def check(self, tool_name: str, kwargs: dict[str, Any]) -> GuardResult:
+            return GuardResult(passed=False, reason="nope")
+
+    tool = _Tool(guards=[_Deny()])
+    with pytest.raises(ToolGuardError) as exc:
+        await tool.execute(q="x")
+    # ToolGuardError remains a ToolError subclass — existing `except ToolError` still catches it.
+    assert isinstance(exc.value, ToolError)
+
+
+def test_legacy_approval_guard_is_removed():
+    import fireflyframework_agentic.tools as tools_pkg
+
+    assert not hasattr(tools_pkg, "ApprovalGuard")
+    # Native replacements are exported instead.
+    for name in ("ApprovalRequired", "ApprovalRequiredToolset", "DeferredToolRequests", "DeferredToolResults"):
+        assert hasattr(tools_pkg, name)
diff --git a/uv.lock b/uv.lock
index c77e3b2b..44d67b0a 100644
--- a/uv.lock
+++ b/uv.lock
@@ -1148,7 +1148,7 @@ wheels = [
 
 [[package]]
 name = "fireflyframework-agentic"
-version = "26.6.10"
+version = "26.6.11"
 source = { editable = "." }
 dependencies = [
     { name = "genai-prices" },