diff --git a/README.md b/README.md
index 8c8ca19..6e9d540 100644
--- a/README.md
+++ b/README.md
@@ -12,6 +12,7 @@
 - 🌐 **Safari** - Control tabs, navigate, execute JavaScript
 - 🌍 **Chrome (CDP)** - Open sessions, navigate, click/type, extract data, screenshots
 - 📸 **Screen Capture** - Capture the active display and share image output with the model
+- 🔎 **Screen Text** - Extract visible text from the active display with Vision OCR, with optional macOS 27 visual summaries
 - 🖥️ **System** - Open apps, adjust brightness/volume, visual effects
 
 ## Available Skills
@@ -32,6 +33,7 @@ This repo currently includes one shareable skill:
 - Safari: open/close/switch/navigate/reload/history/page-info scripts
 - System: `open-application.applescript`, brightness + volume scripts
 - Screenshot: `capture-screenshot.applescript`
+- Screen text MCP: `extract_screen_text`
 - Files/Finder MCP: `find_files`, `list_directory`, `get_file_info`, `copy_file`, `copy_directory`, `move_file`, `rename_file`, `trash_file`, `reveal_in_finder`, `get_finder_selection`
 - Clipboard MCP: `get_clipboard_text`, `set_clipboard_text`, `clear_clipboard`, `get_clipboard_files`, `set_clipboard_files`, `save_clipboard_image`, `set_clipboard_image`
 - Window/Workspace MCP: `get_frontmost_app`, `list_windows`, `focus_window`, `move_window`, `resize_window`, `center_window`, `tile_windows`, `minimize`, `hide_app`, `quit_app`
@@ -115,7 +117,8 @@ Replace `/FULL/PATH/TO/altic-mcp` with your actual path (e.g., `/Users/johndoe/D
 - ✅ **Automation** - Allow Claude to control apps (Messages, Notes, Safari)
 - ✅ **Finder Automation** - For Finder selection, reveal, and Trash file tools
 - ✅ **Accessibility** - Required for screen glow, system controls, and window management tools such as focus_window, move_window, resize_window, center_window, tile_windows, minimize, hide_app, and quit_app
-- ✅ **Screen Recording** - Required for screenshot capture tools and improves window title/id discovery for list_windows on recent macOS versions
+- ✅ **Screen Recording** - Required for the screenshot and screen text extraction tools; also improves window title/id discovery for list_windows on recent macOS versions
+- ✅ **macOS 27 Apple Intelligence / Foundation Models availability** - Required only for `extract_screen_text` visual summary mode; OCR-only mode works without it
 
 Clipboard text operations normally do not require extra permissions. Clipboard
 file and image operations use macOS pasteboard APIs and may prompt for security
@@ -163,6 +166,13 @@ echo "hello" > /tmp/altic-file-smoke/source/example.txt
 - Copy an image or screenshot, then call `save_clipboard_image`
 - Use `set_clipboard_image` with an existing PNG or JPEG file, then paste into an app that accepts images
 
+## Manual Smoke Tests For Screen Text Tools
+
+- Open a window with visible text, then call `extract_screen_text` with `include_visual_summary=false`.
+- Confirm the returned JSON includes visible text, `line_count`, `average_confidence`, and a valid `screenshot_path`.
+- On macOS 27 with Foundation Models available, call `extract_screen_text` with `include_visual_summary=true` and confirm `visual_summary` is populated.
+- On systems without visual summary support, confirm OCR text is still returned and `visual_error` explains the missing macOS 27/Foundation Models capability.
+
 ## Manual Smoke Tests For Window Tools
 
 - Call `get_frontmost_app` while Finder or Safari is active.
diff --git a/server.py b/server.py
index 2d98536..d673303 100644
--- a/server.py
+++ b/server.py
@@ -15,6 +15,7 @@
     notes,
     reminders,
     safari,
+    screen_text,
     screenshot,
     system,
     window,
@@ -1056,6 +1057,36 @@ async def capture_active_screen(
     return screenshot.capture_active_screen(output_path)
 
 
+@mcp.tool()
+async def extract_screen_text(
+    output_path: str = Field(default=""),
+    max_chars: int = Field(default=20000, ge=1, le=200000),
+    include_visual_summary: bool = Field(default=False),
+    visual_prompt: str = Field(
+        default=screen_text.DEFAULT_VISUAL_PROMPT,
+    ),
+) -> str:
+    """
+    Capture the active display and extract visible text with macOS Vision OCR.
+    Optionally request a macOS 27 Foundation Models visual summary.
+
+    Args:
+        output_path: Optional file path for the captured PNG
+        max_chars: Maximum OCR text characters to return
+        include_visual_summary: Ask macOS 27 Foundation Models to summarize the image
+        visual_prompt: Prompt used when visual summary is enabled
+
+    Returns:
+        JSON string with OCR text, screenshot path, confidence metadata, and optional visual summary.
+    """
+    return screen_text.extract_screen_text(
+        output_path,
+        max_chars,
+        include_visual_summary,
+        visual_prompt,
+    )
+
+
 @mcp.tool()
 async def add_screen_glow() -> str:
     """
diff --git a/skills/altic-studio/SKILL.md b/skills/altic-studio/SKILL.md
index 36d8fd7..7d2a35d 100644
--- a/skills/altic-studio/SKILL.md
+++ b/skills/altic-studio/SKILL.md
@@ -13,9 +13,10 @@ license: Apache-2.0
 3. MCP file mode for safe Finder and filesystem operations
 4. MCP clipboard mode for text, file, and image pasteboard operations
 5. MCP window/workspace mode for arranging macOS apps and windows
+6. MCP screen text mode for reading visible text from the active display
 
 It also includes Swift utility scripts for active-display screenshots, clipboard
-file/image operations, and window/workspace management on macOS.
+file/image operations, screen OCR, and window/workspace management on macOS.
 
 ## Mode A: AppleScript (macOS apps)
 
@@ -61,6 +62,7 @@ The full Altic automation surface is exposed as scripts under `skills/altic-stud
 - `turn-down-volume.applescript` - args: `[amount_0_to_100]`
 - `capture-screenshot.applescript` - args: `[output_path] [full|interactive|window]`
 - `capture-active-screen.swift` - args: `<output_path>` (captures full display containing frontmost app)
+- `extract-screen-text.swift` - args: `<output_path> [include_visual_summary] [visual_prompt]` (captures active display and extracts OCR text)
 - `clipboard.swift` - subcommands: `get-files`, `set-files <paths...>`, `save-image <output_path>`, `set-image <image_path>`
 - `window-manager.swift` - subcommands: `get_frontmost_app`, `list_windows`, `focus_window`, `move_window`, `resize_window`, `center_window`, `tile_windows`, `minimize`, `hide_app`, `quit_app`
 
@@ -70,6 +72,12 @@ Swift command template (for active-display screenshots):
 swift "skills/altic-studio/scripts/capture-active-screen.swift" "/tmp/active-screen.png"
 ```
 
+Swift command template (for screen OCR):
+
+```bash
+swift "skills/altic-studio/scripts/extract-screen-text.swift" "/tmp/screen-text.png" false
+```
+
 Swift command template (for window management):
 
 ```bash
@@ -90,6 +98,7 @@ Use MCP tools for deterministic Chrome automation:
 - `chrome_close_session`
 - `chrome_list_sessions`
 - `capture_active_screen`
+- `extract_screen_text`
 
 Execution pattern:
 
@@ -100,6 +109,29 @@ Execution pattern:
 5. Capture screenshots on checkpoints or failures.
 6. Close session.
 
+## Mode B2: Screen Text and Visual Understanding (MCP)
+
+Use `extract_screen_text` when the user asks to read, transcribe, copy, or inspect
+visible text on the active screen. Default to OCR-only mode because it is faster,
+deterministic, and does not require Foundation Models availability.
+
+Available tool:
+
+- `extract_screen_text` - args: `[output_path] [max_chars] [include_visual_summary] [visual_prompt]`
+
+Screen text workflow rules:
+
+- Use `extract_screen_text` with `include_visual_summary=false` for requests like
+  "read the screen", "what text is visible", or "extract the error message".
+- Use `include_visual_summary=true` only when the user asks what is shown, what
+  to click, how to interpret the visible UI, or asks for visual understanding
+  beyond raw text.
+- Visual summary mode requires macOS 27 plus Apple Foundation Models
+  availability. If unavailable, use the OCR result and report the returned
+  `visual_error`.
+- Use `capture_active_screen` instead when the user needs image inspection or a
+  screenshot artifact rather than extracted text.
+
 ## Mode C: File Finder and File Operations (MCP)
 
 Use MCP file tools instead of shell commands when the user asks to find, inspect,
@@ -211,6 +243,8 @@ Window workflow rules:
   confirmation.
 - For window mutations, verify with `list_windows` when the user needs
   confirmation.
+- For screen text extraction, use the returned OCR JSON as the source of truth.
+  If visual summary mode fails, continue with OCR text when it is sufficient.
 
 ## Permissions Checklist
 
@@ -220,6 +254,8 @@ Window workflow rules:
 - Automation permission for app control
 - Accessibility permission for system controls and window management
 - Screen Recording permission for screenshots and improved window discovery
+- Screen Recording permission for screen text extraction
+- macOS 27 and Apple Foundation Models availability for screen visual summary mode
 - Safari setting: Allow JavaScript from Apple Events
 - Google Chrome installed for CDP tools
 - Full Disk Access for reading Messages database
diff --git a/skills/altic-studio/scripts/README.md b/skills/altic-studio/scripts/README.md
index bf42305..872af84 100644
--- a/skills/altic-studio/scripts/README.md
+++ b/skills/altic-studio/scripts/README.md
@@ -19,6 +19,7 @@ osascript "skills/altic-studio/scripts/create-calendar-event.applescript" "Team
 osascript "skills/altic-studio/scripts/navigate-safari.applescript" "https://example.com"
 osascript "skills/altic-studio/scripts/capture-screenshot.applescript" "/tmp/screen.png" "full"
 swift "skills/altic-studio/scripts/capture-active-screen.swift" "/tmp/active-screen.png"
+swift "skills/altic-studio/scripts/extract-screen-text.swift" "/tmp/screen-text.png" false
 swift "skills/altic-studio/scripts/clipboard.swift" get-files
 swift "skills/altic-studio/scripts/clipboard.swift" set-files "/Users/example/Desktop/report.pdf"
 swift "skills/altic-studio/scripts/clipboard.swift" save-image "/tmp/clipboard.png"
diff --git a/skills/altic-studio/scripts/extract-screen-text.swift b/skills/altic-studio/scripts/extract-screen-text.swift
new file mode 100644
index 0000000..6f77766
--- /dev/null
+++ b/skills/altic-studio/scripts/extract-screen-text.swift
@@ -0,0 +1,234 @@
+#!/usr/bin/env swift
+
+import AppKit
+import Foundation
+import ScreenCaptureKit
+import Vision
+
+#if canImport(FoundationModels)
+import FoundationModels
+#endif
+
+#if canImport(_Vision_FoundationModels)
+import _Vision_FoundationModels
+#endif
+
+struct OCRLine: Encodable {
+    let text: String
+    let confidence: Float
+}
+
+struct VisualSummaryResult {
+    let summary: String
+    let available: Bool
+    let source: String
+    let prompt: String
+    let error: String
+}
+
+struct ScreenTextOutput: Encodable {
+    let action: String
+    let screenshot_path: String
+    let text: String
+    let line_count: Int
+    let average_confidence: Float
+    let truncated: Bool
+    let visual_summary: String
+    let visual_model_available: Bool
+    let visual_model_source: String
+    let visual_prompt: String
+    let visual_error: String
+}
+
+func area(_ rect: CGRect) -> CGFloat {
+    max(0, rect.width) * max(0, rect.height)
+}
+
+func displayForFrontmostApp(content: SCShareableContent) -> SCDisplay? {
+    guard let app = NSWorkspace.shared.frontmostApplication else {
+        return nil
+    }
+
+    let targetPID = app.processIdentifier
+    let appWindows = content.windows.filter { window in
+        window.owningApplication?.processID == targetPID
+    }
+
+    guard
+        let frontWindow = appWindows.max(by: { lhs, rhs in
+            area(lhs.frame) < area(rhs.frame)
+        })
+    else {
+        return nil
+    }
+
+    let targetRect = frontWindow.frame
+    return content.displays.max(by: { lhs, rhs in
+        area(lhs.frame.intersection(targetRect)) < area(rhs.frame.intersection(targetRect))
+    })
+}
+
+/// Captures `display` at its native pixel size, writes the PNG to `outputPath`, and returns the image.
+/// - Throws: Errors from the capture call, PNG encoding (code 2), directory creation, or the file write.
+func captureDisplay(to outputPath: String, display: SCDisplay) async throws -> CGImage {
+    let filter = SCContentFilter(display: display, excludingWindows: [])
+    let config = SCStreamConfiguration()
+    // A default SCStreamConfiguration does not follow the display's pixel size, so
+    // size the output explicitly; downscaled captures cost OCR accuracy on small text.
+    let scale = CGFloat(filter.pointPixelScale)
+    config.width = Int(filter.contentRect.width * scale)
+    config.height = Int(filter.contentRect.height * scale)
+
+    let image = try await SCScreenshotManager.captureImage(contentFilter: filter, configuration: config)
+    let bitmap = NSBitmapImageRep(cgImage: image)
+    guard let pngData = bitmap.representation(using: .png, properties: [:]) else {
+        throw NSError(
+            domain: "altic-mcp.extract-screen-text",
+            code: 2,
+            userInfo: [NSLocalizedDescriptionKey: "Could not encode screenshot as PNG."]
+        )
+    }
+
+    let outputURL = URL(fileURLWithPath: outputPath)
+    try FileManager.default.createDirectory(at: outputURL.deletingLastPathComponent(), withIntermediateDirectories: true)
+    try pngData.write(to: outputURL)
+
+    return image
+}
+
+func recognizeText(in image: CGImage) throws -> [OCRLine] {
+    let request = VNRecognizeTextRequest()
+    request.recognitionLevel = .accurate
+    request.usesLanguageCorrection = true
+
+    let handler = VNImageRequestHandler(cgImage: image, options: [:])
+    try handler.perform([request])
+
+    let observations = request.results ?? []
+    return observations.compactMap { observation in
+        guard let candidate = observation.topCandidates(1).first else {
+            return nil
+        }
+        return OCRLine(text: candidate.string, confidence: candidate.confidence)
+    }
+}
+
+func unavailableVisualSummary(prompt: String, reason: String) -> VisualSummaryResult {
+    VisualSummaryResult(
+        summary: "",
+        available: false,
+        source: "",
+        prompt: prompt,
+        error: reason
+    )
+}
+
+@available(macOS 27.0, *)
+func generateFoundationVisualSummary(imageURL: URL, prompt: String) async -> VisualSummaryResult {
+    #if canImport(FoundationModels) && canImport(_Vision_FoundationModels)
+    do {
+        let model = SystemLanguageModel()
+        let session = LanguageModelSession(model: model)
+        let response = try await session.respond {
+            prompt
+            Attachment(imageURL: imageURL)
+                .label("screen")
+        }
+
+        return VisualSummaryResult(
+            summary: response.content,
+            available: true,
+            source: "FoundationModels.SystemLanguageModel",
+            prompt: prompt,
+            error: ""
+        )
+    } catch {
+        return unavailableVisualSummary(
+            prompt: prompt,
+            reason: "FoundationModels visual summary failed: \(error.localizedDescription)"
+        )
+    }
+    #else
+    return unavailableVisualSummary(
+        prompt: prompt,
+        reason: "FoundationModels visual image tools are not available in this Swift toolchain"
+    )
+    #endif
+}
+
+let args = CommandLine.arguments
+guard args.count >= 2 else {
+    fputs("Usage: extract-screen-text.swift <output_path> [include_visual_summary] [visual_prompt]\n", stderr)
+    exit(1)
+}
+
+let outputPath = args[1]
+let includeVisualSummary = args.count >= 3 && ["1", "true", "yes"].contains(args[2].lowercased())
+let visualPrompt = args.count >= 4
+    ? args.dropFirst(3).joined(separator: " ")
+    : "Summarize the visible screen content and call out actionable UI text."
+
+do {
+    let content = try await SCShareableContent.excludingDesktopWindows(false, onScreenWindowsOnly: true)
+    let display = displayForFrontmostApp(content: content) ?? content.displays.first
+    guard let display else {
+        fputs("Could not determine a display to capture.\n", stderr)
+        exit(1)
+    }
+
+    let image = try await captureDisplay(to: outputPath, display: display)
+    let lines = try recognizeText(in: image)
+    let text = lines.map(\.text).joined(separator: "\n")
+    let averageConfidence = lines.isEmpty
+        ? Float(0)
+        : lines.map(\.confidence).reduce(Float(0), +) / Float(lines.count)
+
+    let visualSummary: VisualSummaryResult
+    if includeVisualSummary {
+        if #available(macOS 27.0, *) {
+            visualSummary = await generateFoundationVisualSummary(
+                imageURL: URL(fileURLWithPath: outputPath),
+                prompt: visualPrompt
+            )
+        } else {
+            visualSummary = unavailableVisualSummary(
+                prompt: visualPrompt,
+                reason: "FoundationModels visual understanding requires macOS 27 or later"
+            )
+        }
+    } else {
+        visualSummary = VisualSummaryResult(
+            summary: "",
+            available: false,
+            source: "",
+            prompt: "",
+            error: ""
+        )
+    }
+
+    let output = ScreenTextOutput(
+        action: "extract_screen_text",
+        screenshot_path: outputPath,
+        text: text,
+        line_count: lines.count,
+        average_confidence: averageConfidence,
+        truncated: false,
+        visual_summary: visualSummary.summary,
+        visual_model_available: visualSummary.available,
+        visual_model_source: visualSummary.source,
+        visual_prompt: visualSummary.prompt,
+        visual_error: visualSummary.error
+    )
+
+    let encoder = JSONEncoder()
+    encoder.outputFormatting = [.withoutEscapingSlashes]
+    let data = try encoder.encode(output)
+    guard let json = String(data: data, encoding: .utf8) else {
+        fputs("Could not encode OCR response as JSON.\n", stderr)
+        exit(1)
+    }
+    print(json)
+} catch {
+    fputs("\(error.localizedDescription)\n", stderr)
+    exit(1)
+}
diff --git a/tests/test_screen_text.py b/tests/test_screen_text.py
new file mode 100644
index 0000000..070735c
--- /dev/null
+++ b/tests/test_screen_text.py
@@ -0,0 +1,197 @@
+import json
+import subprocess
+from pathlib import Path
+
+from tools import screen_text
+
+
+def read_json(value: str):
+    assert not value.startswith("Error:"), value
+    return json.loads(value)
+
+
+def completed(args, stdout="", stderr="", returncode=0):
+    return subprocess.CompletedProcess(
+        args=args,
+        returncode=returncode,
+        stdout=stdout,
+        stderr=stderr,
+    )
+
+
+def test_extract_screen_text_invokes_swift_helper_with_defaults(monkeypatch, tmp_path):
+    output = tmp_path / "screen.png"
+    seen = {}
+
+    swift_payload = {
+        "action": "extract_screen_text",
+        "screenshot_path": str(output.resolve()),
+        "text": "Visible title\nPrimary button",
+        "line_count": 2,
+        "average_confidence": 0.91,
+        "visual_summary": "",
+        "visual_model_available": False,
+        "visual_error": "",
+    }
+
+    def fake_run(args, **kwargs):
+        seen["args"] = args
+        seen["kwargs"] = kwargs
+        return completed(args, stdout=json.dumps(swift_payload))
+
+    monkeypatch.setattr(screen_text.subprocess, "run", fake_run)
+
+    payload = read_json(screen_text.extract_screen_text(str(output)))
+
+    assert payload["action"] == "extract_screen_text"
+    assert payload["screenshot_path"] == str(output.resolve())
+    assert payload["text"] == "Visible title\nPrimary button"
+    assert payload["length_chars"] == 28
+    assert payload["truncated"] is False
+    assert payload["line_count"] == 2
+    assert payload["average_confidence"] == 0.91
+    assert payload["visual_summary"] == ""
+    assert payload["visual_model_available"] is False
+    assert payload["visual_error"] == ""
+    assert seen["args"][0] == "swift"
+    assert Path(seen["args"][1]).name == "extract-screen-text.swift"
+    assert seen["args"][2:] == [
+        str(output.resolve()),
+        "false",
+        "Summarize the visible screen content and call out actionable UI text.",
+    ]
+    assert seen["kwargs"]["timeout"] == 90
+
+
+def test_extract_screen_text_passes_visual_summary_options(monkeypatch, tmp_path):
+    output = tmp_path / "screen.png"
+    seen = {}
+
+    def fake_run(args, **kwargs):
+        seen["args"] = args
+        return completed(
+            args,
+            stdout=json.dumps(
+                {
+                    "action": "extract_screen_text",
+                    "screenshot_path": str(output.resolve()),
+                    "text": "Settings",
+                    "line_count": 1,
+                    "average_confidence": 0.88,
+                    "visual_summary": "A settings page is open.",
+                    "visual_model_available": True,
+                    "visual_model_source": "FoundationModels.SystemLanguageModel",
+                    "visual_prompt": "What is shown?",
+                }
+            ),
+        )
+
+    monkeypatch.setattr(screen_text.subprocess, "run", fake_run)
+
+    payload = read_json(
+        screen_text.extract_screen_text(
+            str(output),
+            include_visual_summary=True,
+            visual_prompt="What is shown?",
+        )
+    )
+
+    assert payload["visual_model_available"] is True
+    assert payload["visual_model_source"] == "FoundationModels.SystemLanguageModel"
+    assert payload["visual_summary"] == "A settings page is open."
+    assert payload["visual_prompt"] == "What is shown?"
+    assert seen["args"][2:] == [str(output.resolve()), "true", "What is shown?"]
+
+
+def test_extract_screen_text_truncates_text_in_python(monkeypatch, tmp_path):
+    output = tmp_path / "screen.png"
+
+    def fake_run(args, **kwargs):
+        return completed(
+            args,
+            stdout=json.dumps(
+                {
+                    "action": "extract_screen_text",
+                    "screenshot_path": str(output.resolve()),
+                    "text": "abcdefghijklmnopqrstuvwxyz",
+                    "line_count": 1,
+                    "average_confidence": 0.7,
+                }
+            ),
+        )
+
+    monkeypatch.setattr(screen_text.subprocess, "run", fake_run)
+
+    payload = read_json(screen_text.extract_screen_text(str(output), max_chars=10))
+
+    assert payload["text"] == "abcdefghij"
+    assert payload["length_chars"] == 26
+    assert payload["returned_length_chars"] == 10
+    assert payload["truncated"] is True
+
+
+def test_extract_screen_text_reports_visual_unavailable(monkeypatch, tmp_path):
+    output = tmp_path / "screen.png"
+
+    def fake_run(args, **kwargs):
+        return completed(
+            args,
+            stdout=json.dumps(
+                {
+                    "action": "extract_screen_text",
+                    "screenshot_path": str(output.resolve()),
+                    "text": "Read me",
+                    "line_count": 1,
+                    "average_confidence": 0.81,
+                    "visual_summary": "",
+                    "visual_model_available": False,
+                    "visual_error": (
+                        "FoundationModels visual understanding requires macOS 27 or later"
+                    ),
+                }
+            ),
+        )
+
+    monkeypatch.setattr(screen_text.subprocess, "run", fake_run)
+
+    payload = read_json(
+        screen_text.extract_screen_text(str(output), include_visual_summary=True)
+    )
+
+    assert payload["text"] == "Read me"
+    assert payload["visual_model_available"] is False
+    assert "macOS 27" in payload["visual_error"]
+
+
+def test_extract_screen_text_returns_subprocess_error(monkeypatch, tmp_path):
+    output = tmp_path / "screen.png"
+
+    def fake_run(args, **kwargs):
+        return completed(args, stderr="Screen Recording permission denied", returncode=1)
+
+    monkeypatch.setattr(screen_text.subprocess, "run", fake_run)
+
+    result = screen_text.extract_screen_text(str(output))
+
+    assert result == "Error: Unable to extract screen text: Screen Recording permission denied"
+
+
+def test_extract_screen_text_reports_invalid_json(monkeypatch, tmp_path):
+    output = tmp_path / "screen.png"
+
+    def fake_run(args, **kwargs):
+        return completed(args, stdout="not-json")
+
+    monkeypatch.setattr(screen_text.subprocess, "run", fake_run)
+
+    result = screen_text.extract_screen_text(str(output))
+
+    assert result.startswith("Error: invalid screen text response:")
+
+
+def test_server_exposes_extract_screen_text():
+    import server
+
+    tool_names = set(server.mcp._tool_manager._tools)
+
+    assert "extract_screen_text" in tool_names
diff --git a/tools/screen_text.py b/tools/screen_text.py
new file mode 100644
index 0000000..417b974
--- /dev/null
+++ b/tools/screen_text.py
@@ -0,0 +1,104 @@
+import json
+import subprocess
+import tempfile
+import time
+from pathlib import Path
+
+from .constants import SCRIPTS_PREFIX
+
+
+DEFAULT_VISUAL_PROMPT = (
+    "Summarize the visible screen content and call out actionable UI text."
+)
+
+
+def _error(message: str) -> str:
+    return f"Error: {message}"
+
+
+def _json(payload: dict[str, object]) -> str:
+    return json.dumps(payload)
+
+
+def _script_path() -> str:
+    return str(SCRIPTS_PREFIX / "extract-screen-text.swift")
+
+
+def _default_output_path() -> str:
+    timestamp = int(time.time())
+    shots_dir = Path(tempfile.gettempdir()) / "altic-mcp-screen-text"
+    shots_dir.mkdir(parents=True, exist_ok=True)
+    return str(shots_dir / f"active-screen-{timestamp}.png")
+
+
+def extract_screen_text(
+    output_path: str = "",
+    max_chars: int = 20000,
+    include_visual_summary: bool = False,
+    visual_prompt: str = DEFAULT_VISUAL_PROMPT,
+) -> str:
+    """
+    Capture the active display and extract visible screen text with macOS Vision.
+    Optionally ask macOS 27 Foundation Models for a visual summary.
+
+    Returns a JSON string on success or an "Error: ..." string on failure. OCR
+    text is cut to max_chars here; length_chars reports the untruncated length.
+    """
+    try:
+        if max_chars < 1:
+            return _error("max_chars must be at least 1")
+
+        target_path = output_path.strip() or _default_output_path()
+        target = Path(target_path).expanduser().resolve()
+        target.parent.mkdir(parents=True, exist_ok=True)
+
+        prompt = visual_prompt if visual_prompt else DEFAULT_VISUAL_PROMPT
+        include_flag = "true" if include_visual_summary else "false"
+        command = ["swift", _script_path(), str(target), include_flag, prompt]
+        result = subprocess.run(command, capture_output=True, text=True, timeout=90)
+
+        if result.returncode != 0:
+            # Whitespace-only stderr would otherwise yield an empty error detail.
+            error_msg = (result.stderr or "").strip() or "Unknown error"
+            return _error(f"Unable to extract screen text: {error_msg}")
+
+        try:
+            payload = json.loads(result.stdout)
+        except json.JSONDecodeError:
+            return _error(f"invalid screen text response: {result.stdout.strip()}")
+        # Valid JSON that is not an object (null, list, number) has no .get();
+        # report it as a malformed helper response rather than an AttributeError.
+        if not isinstance(payload, dict):
+            return _error(f"invalid screen text response: {result.stdout.strip()}")
+
+        text = str(payload.get("text") or "")
+        returned_text = text[:max_chars]
+        truncated = len(returned_text) < len(text) or bool(payload.get("truncated", False))
+
+        normalized: dict[str, object] = {
+            "action": "extract_screen_text",
+            "screenshot_path": str(payload.get("screenshot_path", str(target))),
+            "text": returned_text,
+            "length_chars": len(text),
+            "returned_length_chars": len(returned_text),
+            "truncated": truncated,
+            "line_count": int(payload.get("line_count", 0) or 0),
+            "average_confidence": float(payload.get("average_confidence", 0) or 0),
+        }
+
+        # Visual fields are optional: forward only the ones the helper emitted.
+        for key in (
+            "visual_summary",
+            "visual_model_available",
+            "visual_model_source",
+            "visual_prompt",
+            "visual_error",
+        ):
+            if key in payload:
+                normalized[key] = payload[key]
+
+        return _json(normalized)
+    except subprocess.TimeoutExpired:
+        return _error("extract screen text timed out")
+    except Exception as exc:
+        return _error(f"Failed to extract screen text: {exc}")
diff --git a/tools/scripts/extract-screen-text.swift b/tools/scripts/extract-screen-text.swift
new file mode 100644
index 0000000..6f77766
--- /dev/null
+++ b/tools/scripts/extract-screen-text.swift
@@ -0,0 +1,234 @@
+#!/usr/bin/env swift
+
+import AppKit
+import Foundation
+import ScreenCaptureKit
+import Vision
+
+#if canImport(FoundationModels)
+import FoundationModels
+#endif
+
+#if canImport(_Vision_FoundationModels)
+import _Vision_FoundationModels
+#endif
+
+struct OCRLine: Encodable {
+    let text: String
+    let confidence: Float
+}
+
+struct VisualSummaryResult {
+    let summary: String
+    let available: Bool
+    let source: String
+    let prompt: String
+    let error: String
+}
+
+struct ScreenTextOutput: Encodable {
+    let action: String
+    let screenshot_path: String
+    let text: String
+    let line_count: Int
+    let average_confidence: Float
+    let truncated: Bool
+    let visual_summary: String
+    let visual_model_available: Bool
+    let visual_model_source: String
+    let visual_prompt: String
+    let visual_error: String
+}
+
+func area(_ rect: CGRect) -> CGFloat {
+    max(0, rect.width) * max(0, rect.height)
+}
+
+func displayForFrontmostApp(content: SCShareableContent) -> SCDisplay? {
+    guard let app = NSWorkspace.shared.frontmostApplication else {
+        return nil
+    }
+
+    let targetPID = app.processIdentifier
+    let appWindows = content.windows.filter { window in
+        window.owningApplication?.processID == targetPID
+    }
+
+    guard
+        let frontWindow = appWindows.max(by: { lhs, rhs in
+            area(lhs.frame) < area(rhs.frame)
+        })
+    else {
+        return nil
+    }
+
+    let targetRect = frontWindow.frame
+    return content.displays.max(by: { lhs, rhs in
+        area(lhs.frame.intersection(targetRect)) < area(rhs.frame.intersection(targetRect))
+    })
+}
+
+/// Captures `display` at its native pixel size, writes the PNG to `outputPath`, and returns the image.
+/// - Throws: Errors from the capture call, PNG encoding (code 2), directory creation, or the file write.
+func captureDisplay(to outputPath: String, display: SCDisplay) async throws -> CGImage {
+    let filter = SCContentFilter(display: display, excludingWindows: [])
+    let config = SCStreamConfiguration()
+    // A default SCStreamConfiguration does not follow the display's pixel size, so
+    // size the output explicitly; downscaled captures cost OCR accuracy on small text.
+    let scale = CGFloat(filter.pointPixelScale)
+    config.width = Int(filter.contentRect.width * scale)
+    config.height = Int(filter.contentRect.height * scale)
+
+    let image = try await SCScreenshotManager.captureImage(contentFilter: filter, configuration: config)
+    let bitmap = NSBitmapImageRep(cgImage: image)
+    guard let pngData = bitmap.representation(using: .png, properties: [:]) else {
+        throw NSError(
+            domain: "altic-mcp.extract-screen-text",
+            code: 2,
+            userInfo: [NSLocalizedDescriptionKey: "Could not encode screenshot as PNG."]
+        )
+    }
+
+    let outputURL = URL(fileURLWithPath: outputPath)
+    try FileManager.default.createDirectory(at: outputURL.deletingLastPathComponent(), withIntermediateDirectories: true)
+    try pngData.write(to: outputURL)
+
+    return image
+}
+
+func recognizeText(in image: CGImage) throws -> [OCRLine] {
+    let request = VNRecognizeTextRequest()
+    request.recognitionLevel = .accurate
+    request.usesLanguageCorrection = true
+
+    let handler = VNImageRequestHandler(cgImage: image, options: [:])
+    try handler.perform([request])
+
+    let observations = request.results ?? []
+    return observations.compactMap { observation in
+        guard let candidate = observation.topCandidates(1).first else {
+            return nil
+        }
+        return OCRLine(text: candidate.string, confidence: candidate.confidence)
+    }
+}
+
+func unavailableVisualSummary(prompt: String, reason: String) -> VisualSummaryResult {
+    VisualSummaryResult(
+        summary: "",
+        available: false,
+        source: "",
+        prompt: prompt,
+        error: reason
+    )
+}
+
+@available(macOS 27.0, *)
+func generateFoundationVisualSummary(imageURL: URL, prompt: String) async -> VisualSummaryResult {
+    #if canImport(FoundationModels) && canImport(_Vision_FoundationModels)
+    do {
+        let model = SystemLanguageModel()
+        let session = LanguageModelSession(model: model)
+        let response = try await session.respond {
+            prompt
+            Attachment(imageURL: imageURL)
+                .label("screen")
+        }
+
+        return VisualSummaryResult(
+            summary: response.content,
+            available: true,
+            source: "FoundationModels.SystemLanguageModel",
+            prompt: prompt,
+            error: ""
+        )
+    } catch {
+        return unavailableVisualSummary(
+            prompt: prompt,
+            reason: "FoundationModels visual summary failed: \(error.localizedDescription)"
+        )
+    }
+    #else
+    return unavailableVisualSummary(
+        prompt: prompt,
+        reason: "FoundationModels visual image tools are not available in this Swift toolchain"
+    )
+    #endif
+}
+
+let args = CommandLine.arguments
+guard args.count >= 2 else {
+    fputs("Usage: extract-screen-text.swift <output_path> [include_visual_summary] [visual_prompt]\n", stderr)
+    exit(1)
+}
+
+let outputPath = args[1]
+let includeVisualSummary = args.count >= 3 && ["1", "true", "yes"].contains(args[2].lowercased())
+let visualPrompt = args.count >= 4
+    ? args.dropFirst(3).joined(separator: " ")
+    : "Summarize the visible screen content and call out actionable UI text."
+
+do {
+    let content = try await SCShareableContent.excludingDesktopWindows(false, onScreenWindowsOnly: true)
+    let display = displayForFrontmostApp(content: content) ?? content.displays.first
+    guard let display else {
+        fputs("Could not determine a display to capture.\n", stderr)
+        exit(1)
+    }
+
+    let image = try await captureDisplay(to: outputPath, display: display)
+    let lines = try recognizeText(in: image)
+    let text = lines.map(\.text).joined(separator: "\n")
+    let averageConfidence = lines.isEmpty
+        ? Float(0)
+        : lines.map(\.confidence).reduce(Float(0), +) / Float(lines.count)
+
+    let visualSummary: VisualSummaryResult
+    if includeVisualSummary {
+        if #available(macOS 27.0, *) {
+            visualSummary = await generateFoundationVisualSummary(
+                imageURL: URL(fileURLWithPath: outputPath),
+                prompt: visualPrompt
+            )
+        } else {
+            visualSummary = unavailableVisualSummary(
+                prompt: visualPrompt,
+                reason: "FoundationModels visual understanding requires macOS 27 or later"
+            )
+        }
+    } else {
+        visualSummary = VisualSummaryResult(
+            summary: "",
+            available: false,
+            source: "",
+            prompt: "",
+            error: ""
+        )
+    }
+
+    let output = ScreenTextOutput(
+        action: "extract_screen_text",
+        screenshot_path: outputPath,
+        text: text,
+        line_count: lines.count,
+        average_confidence: averageConfidence,
+        truncated: false,
+        visual_summary: visualSummary.summary,
+        visual_model_available: visualSummary.available,
+        visual_model_source: visualSummary.source,
+        visual_prompt: visualSummary.prompt,
+        visual_error: visualSummary.error
+    )
+
+    let encoder = JSONEncoder()
+    encoder.outputFormatting = [.withoutEscapingSlashes]
+    let data = try encoder.encode(output)
+    guard let json = String(data: data, encoding: .utf8) else {
+        fputs("Could not encode OCR response as JSON.\n", stderr)
+        exit(1)
+    }
+    print(json)
+} catch {
+    fputs("\(error.localizedDescription)\n", stderr)
+    exit(1)
+}