diff --git a/Cargo.lock b/Cargo.lock index 6c5a3bcc1f..61ce5e657e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3405,6 +3405,7 @@ dependencies = [ "bytes", "chrono", "clap", + "core-graphics", "crossterm", "dirs", "flate2", @@ -3627,6 +3628,7 @@ name = "jcode-config-types" version = "0.1.0" dependencies = [ "serde", + "serde_json", ] [[package]] diff --git a/OAUTH.md b/OAUTH.md index d29cb62c67..f0a9a52dee 100644 --- a/OAUTH.md +++ b/OAUTH.md @@ -133,6 +133,19 @@ with headers: Otherwise it uses: - `https://api.openai.com/v1/responses` +For **API-key** usage (no ChatGPT/Codex OAuth), the Responses API base URL is +overridable so you can target a local or proxied Responses-API endpoint. Set one +of (checked in this order) to an absolute `http(s)://` base that ends in the API +version, e.g. `http://127.0.0.1:8317/v1`: +- `JCODE_OPENAI_API_BASE` +- `OPENAI_BASE_URL` +- `OPENAI_API_BASE` + +jcode appends `/responses` itself, derives the WebSocket and `/compact` +endpoints from the same base, and also points the `/models` catalog probe at it. +The override is ignored in ChatGPT/Codex OAuth mode (that backend is fixed), and +a malformed value is logged and ignored rather than breaking requests. + ### Troubleshooting - Claude 401/auth errors: run `jcode login --provider claude`. - 401/403: re-run `jcode login --provider openai`. diff --git a/README.md b/README.md index 34bb4fc6f4..9402542061 100644 --- a/README.md +++ b/README.md @@ -368,6 +368,7 @@ Useful environment overrides for these endpoints: - `JCODE_STREAM_IDLE_TIMEOUT_SECS` — raise the streaming idle timeout (default 180s) for slow reasoning models that think silently before emitting tokens. Also settable as `[provider] stream_idle_timeout_secs` in `config.toml`. - Per-model `context_window` (alias `context_limit`) in a `[[providers.<name>.models]]` entry — set the context window when the endpoint has no usable `/v1/models` response, so jcode does not fall back to the generic 200k default. 
+- `extra_body` — inject non-standard top-level fields into every chat/completions request body for backends that require them. See [Extra request-body fields](#extra-request-body-fields-extra_body) below. For details on self-hosting, local runtimes, and the exact config file shape, see below. @@ -444,6 +445,32 @@ id = "my-model-id" context_window = 128000 ``` +##### Extra request-body fields (`extra_body`) + +Some OpenAI-compatible backends require non-standard top-level request fields. For example, NVIDIA NIM DeepSeek-V4 reasoning models (`deepseek-ai/deepseek-v4-flash`, `deepseek-ai/deepseek-v4-pro`) only enable thinking when the request includes `chat_template_kwargs`; without it they reply without reasoning (or, for some deployments, hang). jcode lets you inject arbitrary top-level fields two ways. + +1. Per named profile, via `extra_body` in `config.toml` (a TOML table merged verbatim into the JSON body): + + ```toml + [providers.my-nim] + type = "openai-compatible" + base_url = "https://integrate.api.nvidia.com/v1" + api_key_env = "NVIDIA_API_KEY" + default_model = "deepseek-ai/deepseek-v4-flash" + + [providers.my-nim.extra_body.chat_template_kwargs] + thinking = true + reasoning_effort = "high" + ``` + +2. For built-in profiles (e.g. `nvidia-nim`) or any endpoint, via the `JCODE_OPENAI_EXTRA_BODY` environment variable (a JSON object string). It can live in the provider's env file (`~/.config/jcode/nvidia-nim.env`) next to the API key: + + ```bash + JCODE_OPENAI_EXTRA_BODY={"chat_template_kwargs":{"thinking":true,"reasoning_effort":"high"}} + ``` + +Keys from `extra_body` are merged last and override any jcode-generated body field with the same name (`JCODE_OPENAI_EXTRA_BODY` wins over the config `extra_body` on key collisions). Invalid values are logged and ignored rather than failing the request. + The custom OpenAI-compatible provider reads overrides from environment variables or from an env file in jcode's app config directory. 
On Linux this is usually `~/.config/jcode/`, so the default file is usually:
 ```text
diff --git a/crates/jcode-app-core/Cargo.toml b/crates/jcode-app-core/Cargo.toml
index 5a157f4e11..d796d92a74 100644
--- a/crates/jcode-app-core/Cargo.toml
+++ b/crates/jcode-app-core/Cargo.toml
@@ -160,3 +160,7 @@ windows-sys = { version = "0.59", features = ["Win32_Foundation", "Win32_System_
 [target.'cfg(target_os = "macos")'.dependencies]
 global-hotkey = "0.7"
+# Native desktop control for the `macos_computer_use` tool (synthetic mouse/keyboard
+# events, cursor positioning, display bounds). `highsierra` enables the
+# CGEventCreateScrollWheelEvent2 binding used for scroll.
+core-graphics = { version = "0.23", features = ["highsierra"] }
diff --git a/crates/jcode-app-core/src/tool/computer/ax.rs b/crates/jcode-app-core/src/tool/computer/ax.rs
new file mode 100644
index 0000000000..d1ae8c2e1b
--- /dev/null
+++ b/crates/jcode-app-core/src/tool/computer/ax.rs
@@ -0,0 +1,348 @@
+//! Tier 1: Accessibility (AX) read + action.
+//!
+//! This is the *background* control path. It drives other apps' UI elements by
+//! reference through `System Events`, so it can press buttons and set field
+//! values without moving the cursor and (for many actions) without bringing the
+//! target app to the front.
+//!
+//! Elements are addressed by a structural path: an app (by name) plus a chain of
+//! 1-based child indices from the front window. `find_element` / `ui` return
+//! these paths; the action verbs accept them.
+
+use super::osa;
+use anyhow::{Result, bail};
+use jcode_tool_types::ToolOutput;
+use serde::Deserialize;
+use serde_json::json;
+use std::time::Duration;
+
+/// A structural handle to an AX element.
+#[derive(Debug, Clone, Deserialize)]
+pub struct ElementHandle {
+    /// Application process name (e.g. "Safari").
+    pub app: String,
+    /// 1-based child index chain from the app's front window to the element.
+    /// Empty path == the front window itself.
+    // NOTE(review): the element type was lost in extraction; `u32` matches the
+    // `depth: u32` parameters used in this file — confirm against the original.
+    #[serde(default)]
+    pub path: Vec<u32>,
+}
+
+impl ElementHandle {
+    /// Build the AppleScript expression that resolves to this element.
+    ///
+    /// The expression is returned as text (it is not bound to a variable). It
+    /// assumes a `tell application "System Events"` context in which `frontApp`
+    /// is already set to the target process, as `tell` arranges.
+    fn resolve_script(&self) -> String {
+        // Each index wraps the expression built so far, so the last path entry
+        // ends up outermost: `UI element 2 of (UI element 1 of (front window ...))`.
+        let mut expr = String::from("front window of frontApp");
+        for idx in &self.path {
+            expr = format!("UI element {idx} of ({expr})");
+        }
+        expr
+    }
+}
+
+/// Wrap `body` in a `System Events` tell block with `frontApp` bound to the
+/// process named `app`. The app name is quoted via `osa::as_quote`.
+fn tell(app: &str, body: &str) -> String {
+    format!(
+        "tell application \"System Events\"\n\
+         set frontApp to first application process whose name is {app}\n\
+         {body}\n\
+         end tell",
+        app = osa::as_quote(app),
+        body = body
+    )
+}
+
+/// Dump the AX tree of an app (or the frontmost app) to a given depth.
+///
+/// Each line is `#<path> <role> "<title or value>" [<description>] @(x,y WxH)`,
+/// where `<path>` is the dot-joined chain of 1-based child indices from the
+/// front window (`#` alone is the front window, `#1.2` is child 2 of child 1).
+/// Attributes that cannot be read are skipped rather than failing the dump.
+pub fn ui_tree(app: Option<&str>, depth: u32) -> Result<ToolOutput> {
+    let target = match app {
+        Some(a) => format!("first application process whose name is {}", osa::as_quote(a)),
+        None => "first application process whose frontmost is true".to_string(),
+    };
+    let script = format!(
+        r##"
+using terms from application "System Events"
+    on dumpEl(el, lvl, maxlvl, idxPath)
+        set out to ""
+        if lvl > maxlvl then return out
+        set r to "?"
+        try
+            set r to (role of el as text)
+        end try
+        set t to ""
+        try
+            set t to (title of el as text)
+        end try
+        if t is "" then
+            try
+                set t to (value of el as text)
+            end try
+        end if
+        set d to ""
+        try
+            set d to (description of el as text)
+        end try
+        set pos to ""
+        try
+            set p to position of el
+            set sz to size of el
+            set pos to " @(" & (item 1 of p) & "," & (item 2 of p) & " " & (item 1 of sz) & "x" & (item 2 of sz) & ")"
+        end try
+        set indent to ""
+        repeat lvl times
+            set indent to indent & " "
+        end repeat
+        set ln to indent & "#" & idxPath & " " & r
+        if t is not "" then set ln to ln & " \"" & t & "\""
+        if d is not "" then set ln to ln & " [" & d & "]"
+        set ln to ln & pos & linefeed
+        set out to out & ln
+        try
+            set i to 0
+            repeat with child in (UI elements of el)
+                set i to i + 1
+                -- Join with "." only below the root so paths read "1.2", not ".1.2".
+                if idxPath is "" then
+                    set childPath to (i as text)
+                else
+                    set childPath to idxPath & "." & i
+                end if
+                set out to out & (my dumpEl(child, lvl + 1, maxlvl, childPath))
+            end repeat
+        end try
+        return out
+    end dumpEl
+end using terms from
+
+tell application "System Events"
+    set frontApp to {target}
+    set appName to name of frontApp
+    set out to "App: " & appName & " (element paths shown as #a.b.c == child indices from front window)" & linefeed
+    try
+        set win to front window of frontApp
+        set out to out & (my dumpEl(win, 0, {depth}, ""))
+    on error errMsg
+        set out to out & "(no window / " & errMsg & ")"
+    end try
+    return out
+end tell
+"##,
+        target = target,
+        depth = depth
+    );
+    let tree = osa::run_applescript(&script)?;
+    // Never hand back a blank result; say explicitly that nothing was found.
+    let tree = if tree.trim().is_empty() {
+        "(empty Accessibility tree)".to_string()
+    } else {
+        tree
+    };
+    Ok(ToolOutput::new(tree).with_title("ui tree"))
+}
+
+/// Find elements matching role/title/value within an app, returning their paths.
+///
+/// `role` must match exactly; `title` and `value` match as substrings. Filters
+/// left as `None` are not applied. The front window itself (empty path) is never
+/// reported. Each match is printed as `<path> <role> "<title>" @(x,y WxH)`,
+/// where `<path>` is the dot-joined chain of 1-based child indices from the
+/// front window (e.g. `1.2` corresponds to `path: [1, 2]`).
+pub fn find_element(
+    app: &str,
+    role: Option<&str>,
+    title: Option<&str>,
+    value: Option<&str>,
+    max_depth: u32,
+) -> Result<ToolOutput> {
+    // An absent filter is passed as the empty AppleScript string, which the
+    // handler below treats as "do not filter on this attribute".
+    let role_m = role.map(osa::as_quote).unwrap_or_else(|| "\"\"".into());
+    let title_m = title.map(osa::as_quote).unwrap_or_else(|| "\"\"".into());
+    let value_m = value.map(osa::as_quote).unwrap_or_else(|| "\"\"".into());
+    let script = format!(
+        r#"
+using terms from application "System Events"
+    on findEl(el, lvl, maxlvl, idxPath, roleM, titleM, valueM)
+        set out to ""
+        if lvl > maxlvl then return out
+        set r to ""
+        try
+            set r to (role of el as text)
+        end try
+        set t to ""
+        try
+            set t to (title of el as text)
+        end try
+        set v to ""
+        try
+            set v to (value of el as text)
+        end try
+        set ok to true
+        if roleM is not "" and r is not roleM then set ok to false
+        if titleM is not "" and t does not contain titleM then set ok to false
+        if valueM is not "" and v does not contain valueM then set ok to false
+        if ok and idxPath is not "" then
+            set pos to ""
+            try
+                set p to position of el
+                set sz to size of el
+                set pos to " @(" & (item 1 of p) & "," & (item 2 of p) & " " & (item 1 of sz) & "x" & (item 2 of sz) & ")"
+            end try
+            set out to out & idxPath & " " & r & " \"" & t & "\"" & pos & linefeed
+        end if
+        try
+            set i to 0
+            repeat with child in (UI elements of el)
+                set i to i + 1
+                -- Join with "." only below the root so paths read "1.2", not ".1.2".
+                if idxPath is "" then
+                    set childPath to (i as text)
+                else
+                    set childPath to idxPath & "." & i
+                end if
+                set out to out & (my findEl(child, lvl + 1, maxlvl, childPath, roleM, titleM, valueM))
+            end repeat
+        end try
+        return out
+    end findEl
+end using terms from
+
+tell application "System Events"
+    set frontApp to first application process whose name is {app}
+    try
+        set win to front window of frontApp
+        set out to (my findEl(win, 0, {depth}, "", {role_m}, {title_m}, {value_m}))
+    on error errMsg
+        set out to "(error: " & errMsg & ")"
+    end try
+    if out is "" then set out to "(no matching elements)"
+    return out
+end tell
+"#,
+        app = osa::as_quote(app),
+        depth = max_depth,
+        role_m = role_m,
+        title_m = title_m,
+        value_m = value_m,
+    );
+    let res = osa::run_applescript(&script)?;
+    Ok(ToolOutput::new(format!(
+        "Matches in {app} (path role title @pos). Use the path with press/set_value/get_value:\n{res}",
+    ))
+    .with_title("find_element"))
+}
+
+/// Perform AXPress on an element (background click).
+///
+/// Runs through `System Events` with a 10 second timeout.
+pub fn press(handle: &ElementHandle) -> Result<ToolOutput> {
+    let body = format!("perform action \"AXPress\" of ({})", handle.resolve_script());
+    osa::run_applescript_timeout(&tell(&handle.app, &body), Duration::from_secs(10))?;
+    Ok(ToolOutput::new(format!(
+        "pressed element {:?} in {} (no cursor movement)",
+        handle.path, handle.app
+    )))
+}
+
+/// Perform an arbitrary AX action on an element.
+///
+/// `ax_action` is the action name (e.g. `AXShowMenu`); it is quoted via
+/// `osa::as_quote` before being placed in the script.
+pub fn perform_action(handle: &ElementHandle, ax_action: &str) -> Result<ToolOutput> {
+    let body = format!(
+        "perform action {} of ({})",
+        osa::as_quote(ax_action),
+        handle.resolve_script()
+    );
+    osa::run_applescript_timeout(&tell(&handle.app, &body), Duration::from_secs(10))?;
+    Ok(ToolOutput::new(format!(
+        "performed {ax_action} on element {:?} in {}",
+        handle.path, handle.app
+    )))
+}
+
+/// Set the value of an element (background typing into a field).
+///
+/// The reported message contains only the character count of `value`, not the
+/// value itself.
+pub fn set_value(handle: &ElementHandle, value: &str) -> Result<ToolOutput> {
+    let body = format!(
+        "set value of ({}) to {}",
+        handle.resolve_script(),
+        osa::as_quote(value)
+    );
+    osa::run_applescript_timeout(&tell(&handle.app, &body), Duration::from_secs(10))?;
+    Ok(ToolOutput::new(format!(
+        "set value of element {:?} in {} to {} chars",
+        handle.path,
+        handle.app,
+        value.chars().count()
+    )))
+}
+
+/// Read the value of an element, coerced to text by AppleScript.
+pub fn get_value(handle: &ElementHandle) -> Result<ToolOutput> {
+    let body = format!("return value of ({}) as text", handle.resolve_script());
+    let v = osa::run_applescript_timeout(&tell(&handle.app, &body), Duration::from_secs(10))?;
+    Ok(ToolOutput::new(v).with_title("get_value"))
+}
+
+/// Select a menu-bar item by path, e.g. ["File", "Export…"].
+///
+/// `path[0]` is the top-level menu; every later entry descends one menu level.
+/// Fails if `path` has fewer than two entries. Unlike the other AX verbs this
+/// one makes the app frontmost before clicking.
+pub fn select_menu(app: &str, path: &[String]) -> Result<ToolOutput> {
+    if path.len() < 2 {
+        bail!("select_menu needs at least a top menu and one item, e.g. [\"File\",\"Save\"]");
+    }
+    // The menu bar belongs to the process; menu access requires the app be
+    // frontmost, so activate it first.
+    let top = &path[0];
+    let mut expr = format!(
+        "menu bar item {top} of menu bar 1 of frontApp",
+        top = osa::as_quote(top)
+    );
+    // Each further entry nests inside `menu 1` of the expression built so far.
+    for item in path.iter().skip(1) {
+        expr = format!(
+            "menu item {item} of menu 1 of ({expr})",
+            item = osa::as_quote(item),
+            expr = expr
+        );
+    }
+    let body = format!(
+        "set frontmost of frontApp to true\n\
+         delay 0.2\n\
+         click ({expr})"
+    );
+    osa::run_applescript_timeout(&tell(app, &body), Duration::from_secs(10))?;
+    Ok(ToolOutput::new(format!(
+        "selected menu {} in {app}",
+        path.join(" > ")
+    )))
+}
+
+/// Return the element at a screen point (role/title), useful to confirm targets.
+///
+/// Walks the front window's tree and keeps the last visited element whose
+/// position/size rectangle contains the point; descendants are visited after
+/// their parent, so a containing descendant replaces it. Reports `(none)` when
+/// nothing contains the point. Paths are dot-joined 1-based child indices from
+/// the front window.
+pub fn element_at(app: &str, x: f64, y: f64) -> Result<ToolOutput> {
+    // Hit-testing through System Events' "UI element N" addressing is awkward,
+    // so walk the tree and match on each element's position/size instead.
+    let script = format!(
+        r#"
+using terms from application "System Events"
+    on hit(el, px, py, idxPath, best)
+        -- Use a dedicated accumulator: AppleScript's predefined `result` is
+        -- overwritten by every statement that yields a value.
+        set bestHit to best
+        try
+            set p to position of el
+            set sz to size of el
+            set x1 to item 1 of p
+            set y1 to item 2 of p
+            set x2 to x1 + (item 1 of sz)
+            set y2 to y1 + (item 2 of sz)
+            if px >= x1 and px <= x2 and py >= y1 and py <= y2 then
+                set r to ""
+                try
+                    set r to (role of el as text)
+                end try
+                set t to ""
+                try
+                    set t to (title of el as text)
+                end try
+                set bestHit to idxPath & " " & r & " \"" & t & "\" @(" & x1 & "," & y1 & " " & (item 1 of sz) & "x" & (item 2 of sz) & ")"
+            end if
+        end try
+        try
+            set i to 0
+            repeat with child in (UI elements of el)
+                set i to i + 1
+                -- Join with "." only below the root so paths read "1.2", not ".1.2".
+                if idxPath is "" then
+                    set childPath to (i as text)
+                else
+                    set childPath to idxPath & "." & i
+                end if
+                set bestHit to my hit(child, px, py, childPath, bestHit)
+            end repeat
+        end try
+        return bestHit
+    end hit
+end using terms from
+
+tell application "System Events"
+    set frontApp to first application process whose name is {app}
+    try
+        set win to front window of frontApp
+        set out to my hit(win, {x}, {y}, "", "(none)")
+    on error errMsg
+        set out to "(error: " & errMsg & ")"
+    end try
+    return out
+end tell
+"#,
+        app = osa::as_quote(app),
+        x = x,
+        y = y
+    );
+    let res = osa::run_applescript(&script)?;
+    Ok(ToolOutput::new(format!("Deepest element at ({x:.0},{y:.0}) in {app}:\n{res}"))
+        .with_title("element_at")
+        .with_metadata(json!({"app": app, "x": x, "y": y})))
+}
diff --git a/crates/jcode-app-core/src/tool/computer/coverage_tests.rs b/crates/jcode-app-core/src/tool/computer/coverage_tests.rs
new file mode 100644
index 0000000000..87428d6d5b
--- /dev/null
+++ b/crates/jcode-app-core/src/tool/computer/coverage_tests.rs
@@ -0,0 +1,187 @@
+//! Exhaustive live coverage of EVERY computer action. These mutate the desktop
+//! (open TextEdit, move windows, clipboard, etc.) so they are `#[ignore]`d and
+//! run explicitly:
+//! cargo test -p jcode-app-core tool::computer::coverage -- --ignored --nocapture --test-threads=1
+//!
+//! 
Each test asserts the action returns Ok and (where checkable) the expected
+//! effect. The goal is to prove no action panics or silently misbehaves.
+
+use super::*;
+use jcode_tool_core::{ToolContext, ToolExecutionMode};
+
+/// Minimal tool context shared by every coverage test.
+fn ctx() -> ToolContext {
+    ToolContext {
+        session_id: "cov".into(),
+        message_id: "cov".into(),
+        tool_call_id: "cov".into(),
+        working_dir: None,
+        stdin_request_tx: None,
+        graceful_shutdown_signal: None,
+        execution_mode: ToolExecutionMode::Direct,
+    }
+}
+
+/// Run one action and hand back its raw result, for steps allowed to fail.
+async fn act(v: Value) -> Result<ToolOutput> {
+    ComputerTool::new().execute(v, ctx()).await
+}
+
+/// Run one action that must succeed: logs the first output line on success and
+/// panics (failing the test) on error.
+async fn ok(v: Value) -> ToolOutput {
+    let label = v.to_string();
+    match act(v).await {
+        Ok(o) => {
+            eprintln!("PASS {label} -> {}", o.output.lines().next().unwrap_or(""));
+            o
+        }
+        Err(e) => panic!("FAIL {label} -> {e}"),
+    }
+}
+
+/// Activate TextEdit and open a fresh document to act on.
+async fn textedit_new() {
+    ok(json!({"action":"run_applescript","script":
+        "tell application \"TextEdit\" to activate\ndelay 0.3\ntell application \"TextEdit\" to make new document\ndelay 0.3"})).await;
+}
+/// Best-effort cleanup: discard all TextEdit documents and quit; errors are ignored.
+async fn textedit_quit() {
+    let _ = act(json!({"action":"run_applescript","script":
+        "tell application \"TextEdit\" to close every document saving no\ntell application \"TextEdit\" to quit"})).await;
+}
+
+#[tokio::test]
+#[ignore = "live"]
+async fn coverage_observe() {
+    ok(json!({"action":"check_permissions"})).await;
+    ok(json!({"action":"setup"})).await; // already granted -> reports ready quickly
+    ok(json!({"action":"screenshot"})).await;
+    ok(json!({"action":"cursor"})).await;
+    ok(json!({"action":"system_state"})).await;
+    ok(json!({"action":"discover","category":"all"})).await;
+    // OCR may require swift; tolerate absence but it should not panic.
+    match act(json!({"action":"ocr"})).await {
+        Ok(o) => eprintln!("PASS ocr -> {}", o.output.lines().next().unwrap_or("")),
+        Err(e) => eprintln!("SKIP ocr -> {e}"),
+    }
+}
+
+#[tokio::test]
+#[ignore = "live"]
+async fn coverage_input() {
+    ok(json!({"action":"move","x":300,"y":300})).await;
+    ok(json!({"action":"click","x":300,"y":300})).await;
+    ok(json!({"action":"double_click","x":300,"y":300})).await;
+    ok(json!({"action":"right_click","x":300,"y":300})).await;
+    // dismiss any context menu
+    ok(json!({"action":"key","keys":"esc"})).await;
+    ok(json!({"action":"drag","x":300,"y":300,"to_x":320,"to_y":320})).await;
+    ok(json!({"action":"scroll","x":400,"y":400,"dy":-3})).await;
+    ok(json!({"action":"key_down","keys":"shift"})).await;
+    ok(json!({"action":"key_up","keys":"shift"})).await;
+}
+
+#[tokio::test]
+#[ignore = "live"]
+async fn coverage_keyboard_into_textedit() {
+    textedit_new().await;
+    // type goes to focused app (TextEdit just activated)
+    ok(json!({"action":"type","text":"hello "})).await;
+    ok(json!({"action":"key","keys":"cmd+a"})).await; // select all
+    ok(json!({"action":"key","keys":"delete"})).await;
+    textedit_quit().await;
+}
+
+#[tokio::test]
+#[ignore = "live"]
+async fn coverage_ax() {
+    textedit_new().await;
+    ok(json!({"action":"ui","app":"TextEdit","depth":3})).await;
+    ok(json!({"action":"find_element","app":"TextEdit","role":"AXTextArea"})).await;
+    ok(json!({"action":"element_at","app":"TextEdit","x":700,"y":400})).await;
+    // background set/get on the text area (path 1.1)
+    let el = json!({"app":"TextEdit","path":[1,1]});
+    ok(json!({"action":"set_value","element":el,"value":"ax-coverage"})).await;
+    let g = ok(json!({"action":"get_value","element":el})).await;
+    assert!(g.output.contains("ax-coverage"), "get_value got: {}", g.output);
+    textedit_quit().await;
+}
+
+#[tokio::test]
+#[ignore = "live"]
+async fn coverage_select_menu() {
+    textedit_new().await;
+    // Format menu exists in TextEdit; "Make Plain Text" or "Wrap to Page" toggles.
+    // Use a stable, reversible item: Edit > Select All.
+    let r = act(json!({"action":"select_menu","app":"TextEdit","menu_path":["Edit","Select All"]})).await;
+    match r {
+        Ok(o) => eprintln!("PASS select_menu -> {}", o.output),
+        Err(e) => panic!("FAIL select_menu -> {e}"),
+    }
+    textedit_quit().await;
+}
+
+#[tokio::test]
+#[ignore = "live"]
+async fn coverage_windows_apps() {
+    textedit_new().await;
+    ok(json!({"action":"list_apps"})).await;
+    ok(json!({"action":"list_windows"})).await;
+    ok(json!({"action":"activate_app","app":"TextEdit"})).await;
+    ok(json!({"action":"move_window","app":"TextEdit","x":120,"y":120})).await;
+    ok(json!({"action":"resize_window","app":"TextEdit","w":700,"h":500})).await;
+    ok(json!({"action":"focus_window","app":"TextEdit"})).await;
+    // window_screenshot needs an id from list_windows; find TextEdit's.
+    let lw = ok(json!({"action":"list_windows"})).await;
+    if let Some(id) = first_window_id_for(&lw.output, "TextEdit") {
+        ok(json!({"action":"window_screenshot","window_id":id})).await;
+    } else {
+        eprintln!("SKIP window_screenshot (no TextEdit window id parsed)");
+    }
+    ok(json!({"action":"minimize_window","app":"TextEdit"})).await;
+    // restore + close
+    ok(json!({"action":"activate_app","app":"TextEdit"})).await;
+    textedit_quit().await;
+}
+
+#[tokio::test]
+#[ignore = "live"]
+async fn coverage_clipboard_scripting_system() {
+    ok(json!({"action":"set_clipboard","text":"cov-clip"})).await;
+    let c = ok(json!({"action":"get_clipboard"})).await;
+    assert!(c.output.contains("cov-clip"));
+    ok(json!({"action":"run_applescript","script":"return 7 * 6"})).await;
+    ok(json!({"action":"run_jxa","script":"2 + 3"})).await;
+    ok(json!({"action":"notify","text":"jcode coverage test","title":"jcode"})).await;
+    // wait_for against a freshly opened TextEdit document with a short timeout;
+    // the outcome is deliberately ignored, this only checks it does not panic.
+    textedit_new().await;
+    let _ = act(json!({"action":"wait_for","app":"TextEdit","contains":"","timeout_ms":1500})).await;
+    textedit_quit().await;
+    // set_brightness may be unavailable; tolerate.
+    match act(json!({"action":"set_brightness","level":0.8})).await {
+        Ok(o) => eprintln!("PASS set_brightness -> {}", o.output),
+        Err(e) => eprintln!("SKIP set_brightness -> {e}"),
+    }
+}
+
+#[tokio::test]
+#[ignore = "live"]
+async fn coverage_destructive_quit_close() {
+    textedit_new().await;
+    ok(json!({"action":"close_window","app":"TextEdit"})).await;
+    // A new empty doc closes without a sheet. Discard anything then quit.
+    let _ = act(json!({"action":"run_applescript","script":
+        "tell application \"TextEdit\" to close every document saving no"})).await;
+    ok(json!({"action":"quit_app","app":"TextEdit"})).await;
+}
+
+/// Parse the first CG window id whose owner matches `owner` from list_windows
+/// output. Lines are tab-separated with the id first and the owner second,
+/// e.g. "<id>\t<owner>\t<title>\t<bounds>" (the third field's meaning is
+/// presumed; only the first two fields are read here).
+///
+/// Lines whose owner differs, or whose id is not an integer, are skipped.
+fn first_window_id_for(text: &str, owner: &str) -> Option<i64> {
+    text.lines().find_map(|line| {
+        let mut fields = line.splitn(4, '\t');
+        let id = fields.next()?.trim();
+        // A line with no tab has no owner field; treat it as an empty owner.
+        let own = fields.next().unwrap_or("").trim();
+        if own == owner {
+            id.parse::<i64>().ok()
+        } else {
+            None
+        }
+    })
+}
diff --git a/crates/jcode-app-core/src/tool/computer/discover.rs b/crates/jcode-app-core/src/tool/computer/discover.rs
new file mode 100644
index 0000000000..b2f1b186b6
--- /dev/null
+++ b/crates/jcode-app-core/src/tool/computer/discover.rs
@@ -0,0 +1,125 @@
+//! Progressive disclosure: return full specs for advanced actions on demand,
+//! so the always-on tool schema stays small.
+
+use anyhow::Result;
+use jcode_tool_types::ToolOutput;
+
+/// Spec text for the `mouse` category.
+const MOUSE: &str = "\
+mouse actions (visible coordinate input; moves the real cursor):
+- move {x, y}
+- click {x?, y?} click at point (or current cursor position)
+- double_click{x?, y?}
+- right_click {x?, y?}
+- drag {x, y, to_x, to_y}
+- scroll {x?, y?, dx, dy} dy>0 scrolls up
+- cursor {} report current cursor position";
+
+/// Spec text for the `keyboard` category.
+const KEYBOARD: &str = "\
+keyboard actions (go to the focused app):
+- type {text} type a UTF-8 string
+- key {keys} chord, e.g. 'cmd+space', 'return', 'ctrl+shift+t'
+- key_down {keys} hold a key/chord down
+- key_up {keys} release a key/chord";
+
+/// Spec text for the `observe` category.
+const OBSERVE: &str = "\
+observe actions (see the screen):
+- screenshot {} full main display; reports point/pixel scale
+- window_screenshot {window_id} capture one window (even if occluded)
+- ocr {region?:[x,y,w,h]} recognize on-screen text + bounding boxes (Vision)
+- ui {app?, depth?} dump the Accessibility tree; element paths shown as #a.b.c";
+
+/// Spec text for the `ax` category.
+const AX: &str = "\
+accessibility actions (BACKGROUND control; no cursor movement, app need not be frontmost).
+Element handle = {app:\"AppName\", path:[child indices from the front window]} from find_element/ui:
+- find_element {app, role?, title?, value?, depth?} -> matching elements with paths
+- element_at {app, x, y} -> deepest element at a point
+- press {element} -> AXPress (background click)
+- set_value {element, value} -> set a field's value (background type)
+- get_value {element}
+- perform_action {element, ax_action} -> any AX action, e.g. 'AXShowMenu'
+- select_menu {app, menu_path:[\"File\",\"Save\"]} -> drive the menu bar";
+
+/// Spec text for the `windows` category.
+const WINDOWS: &str = "\
+window actions (act on an app's front window; AX-based, can target background windows):
+- list_windows {} all on-screen windows with ids/owners/bounds
+- focus_window {app} raise + activate the app's front window
+- move_window {app, x, y}
+- resize_window {app, w, h}
+- minimize_window{app}
+- close_window {app}";
+
+/// Spec text for the `apps` category.
+const APPS: &str = "\
+app actions:
+- list_apps {} running (non-background) apps
+- activate_app {app} bring an app to the front
+- hide_app {app} hide an app (no quit)
+- quit_app {app} quit an app";
+
+/// Spec text for the `clipboard` category.
+const CLIPBOARD: &str = "\
+clipboard actions:
+- get_clipboard {}
+- set_clipboard {text}";
+
+/// Spec text for the `scripting` category.
+const SCRIPTING: &str = "\
+scripting actions (headless control of scriptable apps; no UI, no cursor):
+- run_applescript {script} run AppleScript, returns its result
+- run_jxa {script} run JavaScript-for-Automation
+- wait_for {app, contains, timeout_ms?} poll an app's AX tree until text appears";
+
+/// Spec text for the `system` category.
+const SYSTEM: &str = "\
+system actions:
+- notify {text, title?} post a Notification Center banner
+- system_state {} battery / date / power summary
+- set_brightness {level} 0..1 (needs the `brightness` cli)";
+
+/// Spec text for the `setup` category.
+const SETUP: &str = "\
+setup actions:
+- check_permissions {} report Accessibility / Screen Recording / Swift status
+- setup {} request permissions, deep-link to the right Settings panes, poll until ready
+
+Note: the Accessibility toggle itself cannot be enabled programmatically (macOS security);
+setup gets you one click away.";
+
+/// Look up the spec text for a single category name; `None` if unrecognized.
+fn section(cat: &str) -> Option<&'static str> {
+    match cat {
+        "mouse" => Some(MOUSE),
+        "keyboard" => Some(KEYBOARD),
+        "observe" => Some(OBSERVE),
+        "ax" => Some(AX),
+        "windows" => Some(WINDOWS),
+        "apps" => Some(APPS),
+        "clipboard" => Some(CLIPBOARD),
+        "scripting" => Some(SCRIPTING),
+        "system" => Some(SYSTEM),
+        "setup" => Some(SETUP),
+        _ => None,
+    }
+}
+
+/// Return the action specs for `category` (default `"all"`), followed by the
+/// default usage policy.
+///
+/// An unrecognized category is not an error: the output lists the valid names.
+pub fn discover(category: Option<&str>) -> Result<ToolOutput> {
+    let cat = category.unwrap_or("all");
+    let body = match cat {
+        // "all" concatenates every section, separated by blank lines.
+        "all" => [
+            OBSERVE, MOUSE, KEYBOARD, AX, WINDOWS, APPS, CLIPBOARD, SCRIPTING, SYSTEM, SETUP,
+        ]
+        .join("\n\n"),
+        single => match section(single) {
+            Some(spec) => spec.to_string(),
+            None => format!(
+                "Unknown category '{cat}'. Valid: mouse, keyboard, observe, ax, windows, apps, \
+                 clipboard, scripting, system, setup, all."
+            ),
+        },
+    };
+    let text = format!(
+        "macos_computer_use actions — category '{cat}'. All actions are fields on the same `macos_computer_use` tool.\n\n{body}\n\n\
+         Default policy: this is the user's own live machine, so stay out of their way. \
+         Prefer background AX/scripting actions over visible coordinate input when the target \
+         element is resolvable; only fall back to click/type (which move your cursor and steal \
+         focus) when AX can't reach it. Do not move the cursor, change the frontmost app, or \
+         move/resize/activate windows unless the user asked or the task strictly requires it. \
+         Act only on the task you were given — never take proactive control of the desktop \
+         upfront. When a visible action is unavoidable, do the minimum and restore focus."
+    );
+    Ok(ToolOutput::new(text).with_title("discover"))
+}
diff --git a/crates/jcode-app-core/src/tool/computer/input.rs b/crates/jcode-app-core/src/tool/computer/input.rs
new file mode 100644
index 0000000000..48b6739189
--- /dev/null
+++ b/crates/jcode-app-core/src/tool/computer/input.rs
@@ -0,0 +1,312 @@
+//! Synthetic mouse + keyboard input via Core Graphics CGEvents.
+//!
+//! This is the *visible* control path: events go to the shared HID stream, so
+//! they move the real cursor and type into the focused app. Background control
+//! lives in `ax.rs` instead.
+
+use super::keys;
+use anyhow::{Context, Result, bail};
+use core_graphics::event::{
+    CGEvent, CGEventFlags, CGEventTapLocation, CGEventType, CGMouseButton, EventField,
+    ScrollEventUnit,
+};
+use core_graphics::event_source::{CGEventSource, CGEventSourceStateID};
+use core_graphics::geometry::CGPoint;
+use std::thread::sleep;
+use std::time::Duration;
+
+/// Mouse button used by `click`.
+#[derive(Clone, Copy)]
+pub enum Button {
+    Left,
+    Right,
+}
+
+/// Create a fresh HID-state event source; each event constructor consumes one.
+fn source() -> Result<CGEventSource> {
+    CGEventSource::new(CGEventSourceStateID::HIDSystemState).map_err(|_| {
+        anyhow::anyhow!(
+            "failed to create CGEventSource. Grant Accessibility permission (run the `setup` action)."
+        )
+    })
+}
+
+/// Post an event at the HID tap location.
+fn post(event: CGEvent) {
+    event.post(CGEventTapLocation::HID);
+}
+
+/// Current cursor position in global (top-left origin) screen points.
+pub fn current_cursor() -> Result<CGPoint> {
+    let src = source()?;
+    // A blank event reports the cursor location at the time it was created.
+    let evt = CGEvent::new(src).map_err(|_| anyhow::anyhow!("failed to read cursor position"))?;
+    Ok(evt.location())
+}
+
+/// Move the cursor to `(x, y)` by posting a mouse-moved event.
+pub fn move_to(x: f64, y: f64) -> Result<()> {
+    let src = source()?;
+    let evt = CGEvent::new_mouse_event(
+        src,
+        CGEventType::MouseMoved,
+        CGPoint::new(x, y),
+        CGMouseButton::Left,
+    )
+    .map_err(|_| anyhow::anyhow!("failed to create mouse-move event"))?;
+    post(evt);
+    Ok(())
+}
+
+/// Click `count` times with `button` and return the point that was clicked.
+///
+/// If either coordinate is missing the current cursor position is used. For
+/// multi-clicks each down/up pair carries its 1-based click-state so the target
+/// sees a double/triple click rather than separate single clicks.
+pub fn click(x: Option<f64>, y: Option<f64>, button: Button, count: u32) -> Result<CGPoint> {
+    let point = match (x, y) {
+        (Some(x), Some(y)) => CGPoint::new(x, y),
+        _ => current_cursor()?,
+    };
+    let (down, up, cg_button) = match button {
+        Button::Left => (
+            CGEventType::LeftMouseDown,
+            CGEventType::LeftMouseUp,
+            CGMouseButton::Left,
+        ),
+        Button::Right => (
+            CGEventType::RightMouseDown,
+            CGEventType::RightMouseUp,
+            CGMouseButton::Right,
+        ),
+    };
+
+    // Move first and pause briefly before pressing.
+    let src = source()?;
+    let mv = CGEvent::new_mouse_event(src, CGEventType::MouseMoved, point, cg_button)
+        .map_err(|_| anyhow::anyhow!("failed to create move event"))?;
+    post(mv);
+    sleep(Duration::from_millis(10));
+
+    for i in 1..=count {
+        let click_state = i64::from(i);
+
+        let src_d = source()?;
+        let down_evt = CGEvent::new_mouse_event(src_d, down, point, cg_button)
+            .map_err(|_| anyhow::anyhow!("failed to create mouse-down event"))?;
+        if count > 1 {
+            down_evt.set_integer_value_field(EventField::MOUSE_EVENT_CLICK_STATE, click_state);
+        }
+        post(down_evt);
+
+        let src_u = source()?;
+        let up_evt = CGEvent::new_mouse_event(src_u, up, point, cg_button)
+            .map_err(|_| anyhow::anyhow!("failed to create mouse-up event"))?;
+        if count > 1 {
+            up_evt.set_integer_value_field(EventField::MOUSE_EVENT_CLICK_STATE, click_state);
+        }
+        post(up_evt);
+        sleep(Duration::from_millis(20));
+    }
+    Ok(point)
+}
+
+/// Left-button drag from `(from_x, from_y)` to `(to_x, to_y)`.
+///
+/// The path is interpolated in 10 linear steps with short pauses between them.
+pub fn drag(from_x: f64, from_y: f64, to_x: f64, to_y: f64) -> Result<()> {
+    let from = CGPoint::new(from_x, from_y);
+    let to = CGPoint::new(to_x, to_y);
+
+    let src = source()?;
+    let down = CGEvent::new_mouse_event(src, CGEventType::LeftMouseDown, from, CGMouseButton::Left)
+        .map_err(|_| anyhow::anyhow!("failed to create drag-down event"))?;
+    post(down);
+    sleep(Duration::from_millis(30));
+
+    let steps = 10;
+    for i in 1..=steps {
+        let t = i as f64 / steps as f64;
+        let p = CGPoint::new(from_x + (to_x - from_x) * t, from_y + (to_y - from_y) * t);
+        let src_m = source()?;
+        let mv =
+            CGEvent::new_mouse_event(src_m, CGEventType::LeftMouseDragged, p, CGMouseButton::Left)
+                .map_err(|_| anyhow::anyhow!("failed to create drag-move event"))?;
+        post(mv);
+        sleep(Duration::from_millis(15));
+    }
+
+    let src_u = source()?;
+    let up = CGEvent::new_mouse_event(src_u, CGEventType::LeftMouseUp, to, CGMouseButton::Left)
+        .map_err(|_| anyhow::anyhow!("failed to create drag-up event"))?;
+    post(up);
+    Ok(())
+}
+
+/// Scroll by `(dx, dy)` pixels, first moving the cursor to `(x, y)` when both
+/// coordinates are given.
+pub fn scroll(x: Option<f64>, y: Option<f64>, dx: i32, dy: i32) -> Result<()> {
+    if let (Some(x), Some(y)) = (x, y) {
+        move_to(x, y)?;
+        sleep(Duration::from_millis(10));
+    }
+    let src = source()?;
+    // Two wheels: `dy` is passed as wheel 1 and `dx` as wheel 2.
+    let evt = CGEvent::new_scroll_event(src, ScrollEventUnit::PIXEL, 2, dy, dx, 0)
+        .map_err(|_| anyhow::anyhow!("failed to create scroll event"))?;
+    post(evt);
+    Ok(())
+}
+
+/// Upper bound on UTF-16 code units attached to one synthesized key event.
+/// Longer Unicode payloads are reported to be truncated by Core Graphics.
+const MAX_UTF16_UNITS_PER_KEY_EVENT: usize = 20;
+
+/// Split `text` into consecutive slices of at most `max_units` UTF-16 code
+/// units each, never splitting a `char`. Empty input yields no slices.
+fn utf16_chunks(text: &str, max_units: usize) -> Vec<&str> {
+    let mut chunks = Vec::new();
+    let mut start = 0;
+    let mut units = 0;
+    for (idx, ch) in text.char_indices() {
+        let len = ch.len_utf16();
+        // Close the current chunk before the char that would overflow it.
+        if units + len > max_units && idx > start {
+            chunks.push(&text[start..idx]);
+            start = idx;
+            units = 0;
+        }
+        units += len;
+    }
+    if start < text.len() {
+        chunks.push(&text[start..]);
+    }
+    chunks
+}
+
+/// Type a UTF-8 string via synthesized keyboard events carrying a Unicode
+/// payload, layout-independent. Goes to the focused app.
+///
+/// The text is sent in chunks of at most 20 UTF-16 code units (one key-down /
+/// key-up pair per chunk) so long strings are not cut off. Empty text posts no
+/// events at all.
+pub fn type_text(text: &str) -> Result<()> {
+    for chunk in utf16_chunks(text, MAX_UTF16_UNITS_PER_KEY_EVENT) {
+        let src = source()?;
+        let down = CGEvent::new_keyboard_event(src, 0, true)
+            .map_err(|_| anyhow::anyhow!("failed to create keyboard event"))?;
+        down.set_string(chunk);
+        post(down);
+
+        let src_up = source()?;
+        let up = CGEvent::new_keyboard_event(src_up, 0, false)
+            .map_err(|_| anyhow::anyhow!("failed to create keyboard event"))?;
+        up.set_string(chunk);
+        post(up);
+        // Brief pause between consecutive chunks.
+        sleep(Duration::from_millis(2));
+    }
+    Ok(())
+}
+
+/// Shared chord parser: accumulate modifier flags and at most one main key.
+///
+/// Parts are split on `+`, trimmed and lower-cased; empty parts are skipped.
+/// Fails on an unknown key name or on a second non-modifier key. Emptiness is
+/// left for the callers to judge.
+fn parse_chord_parts(chord: &str) -> Result<(CGEventFlags, Option<u16>)> {
+    let mut flags = CGEventFlags::CGEventFlagNull;
+    let mut keycode: Option<u16> = None;
+    for raw in chord.split('+') {
+        let part = raw.trim().to_lowercase();
+        if part.is_empty() {
+            continue;
+        }
+        match part.as_str() {
+            "cmd" | "command" | "meta" | "super" => flags |= CGEventFlags::CGEventFlagCommand,
+            "ctrl" | "control" => flags |= CGEventFlags::CGEventFlagControl,
+            "alt" | "opt" | "option" => flags |= CGEventFlags::CGEventFlagAlternate,
+            "shift" => flags |= CGEventFlags::CGEventFlagShift,
+            "fn" => flags |= CGEventFlags::CGEventFlagSecondaryFn,
+            other => {
+                if keycode.is_some() {
+                    bail!("key chord '{chord}' has more than one non-modifier key");
+                }
+                keycode = Some(
+                    keys::keycode_for(other)
+                        .with_context(|| format!("unknown key '{other}' in chord '{chord}'"))?,
+                );
+            }
+        }
+    }
+    Ok((flags, keycode))
+}
+
+/// Parse a chord like "cmd+shift+t" into (modifier flags, main keycode).
+///
+/// Fails when the chord has no non-modifier key.
+pub fn parse_chord(chord: &str) -> Result<(CGEventFlags, u16)> {
+    let (flags, keycode) = parse_chord_parts(chord)?;
+    let code = keycode.with_context(|| format!("chord '{chord}' has no main key"))?;
+    Ok((flags, code))
+}
+
+/// Press and release a chord: key-down with the modifier flags set, a short
+/// pause, then key-up with the same flags.
+pub fn key_chord(chord: &str) -> Result<()> {
+    let (flags, code) = parse_chord(chord)?;
+    let src = source()?;
+    let down = CGEvent::new_keyboard_event(src, code, true)
+        .map_err(|_| anyhow::anyhow!("failed to create key-down event"))?;
+    down.set_flags(flags);
+    post(down);
+    sleep(Duration::from_millis(15));
+
+    let src_up = source()?;
+    let up = CGEvent::new_keyboard_event(src_up, code, false)
+        .map_err(|_| anyhow::anyhow!("failed to create key-up event"))?;
+    up.set_flags(flags);
+    post(up);
+    Ok(())
+}
+
+/// Parse a chord into (modifier flags, optional main keycode). Unlike
+/// `parse_chord`, a modifier-only chord (e.g. "shift") is allowed and returns
+/// `None` for the keycode. Used for key_down/key_up holds.
+///
+/// Fails when the chord names neither a modifier nor a key.
+pub fn parse_chord_opt(chord: &str) -> Result<(CGEventFlags, Option<u16>)> {
+    let (flags, keycode) = parse_chord_parts(chord)?;
+    if keycode.is_none() && flags == CGEventFlags::CGEventFlagNull {
+        bail!("chord '{chord}' is empty");
+    }
+    Ok((flags, keycode))
+}
+
+/// Keycode for a single modifier name, for modifier-only holds.
+///
+/// Returns `None` for anything that is not exactly one known modifier name.
+fn modifier_keycode(chord: &str) -> Option<u16> {
+    match chord.trim().to_lowercase().as_str() {
+        "cmd" | "command" | "meta" | "super" => Some(0x37),
+        "shift" => Some(0x38),
+        "alt" | "opt" | "option" => Some(0x3A),
+        "ctrl" | "control" => Some(0x3B),
+        "fn" => Some(0x3F),
+        _ => None,
+    }
+}
+
+/// Hold a key or chord down (down_state=true) or release it (false).
+/// Supports modifier-only holds (e.g. "shift") via a FlagsChanged event.
+pub fn key_hold(chord: &str, down_state: bool) -> Result<()> { + let (flags, keycode) = parse_chord_opt(chord)?; + let src = source()?; + match keycode { + Some(code) => { + let evt = CGEvent::new_keyboard_event(src, code, down_state) + .map_err(|_| anyhow::anyhow!("failed to create key event"))?; + evt.set_flags(flags); + post(evt); + } + None => { + // Modifier-only: emit a FlagsChanged event carrying the modifier + // keycode, with the flags set while held and cleared on release. + let code = modifier_keycode(chord) + .with_context(|| format!("unsupported modifier-only hold '{chord}'"))?; + let evt = CGEvent::new_keyboard_event(src, code, down_state) + .map_err(|_| anyhow::anyhow!("failed to create modifier event"))?; + evt.set_type(CGEventType::FlagsChanged); + evt.set_flags(if down_state { + flags + } else { + CGEventFlags::CGEventFlagNull + }); + post(evt); + } + } + Ok(()) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn parses_modifier_only_chord() { + let (flags, code) = parse_chord_opt("shift").unwrap(); + assert!(flags.contains(CGEventFlags::CGEventFlagShift)); + assert!(code.is_none()); + } + + #[test] + fn parses_chord_with_key() { + let (flags, code) = parse_chord_opt("cmd+a").unwrap(); + assert!(flags.contains(CGEventFlags::CGEventFlagCommand)); + assert_eq!(code, Some(0x00)); + } + + #[test] + fn rejects_empty_chord() { + assert!(parse_chord_opt("").is_err()); + } + + #[test] + fn modifier_keycodes_known() { + assert_eq!(modifier_keycode("cmd"), Some(0x37)); + assert_eq!(modifier_keycode("shift"), Some(0x38)); + assert_eq!(modifier_keycode("nope"), None); + } +} diff --git a/crates/jcode-app-core/src/tool/computer/keys.rs b/crates/jcode-app-core/src/tool/computer/keys.rs new file mode 100644 index 0000000000..eb39d47ecf --- /dev/null +++ b/crates/jcode-app-core/src/tool/computer/keys.rs @@ -0,0 +1,129 @@ +//! Layout-independent virtual keycode mapping for key chords. 
+ +use core_graphics::event::CGKeyCode; + +/// Map a key name (already lowercased, single token) to a US virtual keycode. +/// Returns None for unknown keys. +pub fn keycode_for(key: &str) -> Option<CGKeyCode> { + use core_graphics::event::KeyCode; + let code = match key { + "return" | "enter" => KeyCode::RETURN, + "tab" => KeyCode::TAB, + "space" => KeyCode::SPACE, + "delete" | "backspace" => KeyCode::DELETE, + "esc" | "escape" => KeyCode::ESCAPE, + "left" => KeyCode::LEFT_ARROW, + "right" => KeyCode::RIGHT_ARROW, + "down" => KeyCode::DOWN_ARROW, + "up" => KeyCode::UP_ARROW, + "home" => KeyCode::HOME, + "end" => KeyCode::END, + "pageup" => KeyCode::PAGE_UP, + "pagedown" => KeyCode::PAGE_DOWN, + "forwarddelete" => KeyCode::FORWARD_DELETE, + "f1" => 0x7A, + "f2" => 0x78, + "f3" => 0x63, + "f4" => 0x76, + "f5" => 0x60, + "f6" => 0x61, + "f7" => 0x62, + "f8" => 0x64, + "f9" => 0x65, + "f10" => 0x6D, + "f11" => 0x67, + "f12" => 0x6F, + other => return ansi_keycode(other), + }; + Some(code) +} + +/// US ANSI virtual keycodes for single letters, digits, and common punctuation. +/// Layout-independent hardware positions. 
+pub fn ansi_keycode(key: &str) -> Option<CGKeyCode> { + let mut chars = key.chars(); + let first = chars.next()?; + if chars.next().is_some() { + return None; + } + let code: CGKeyCode = match first { + 'a' => 0x00, + 'b' => 0x0B, + 'c' => 0x08, + 'd' => 0x02, + 'e' => 0x0E, + 'f' => 0x03, + 'g' => 0x05, + 'h' => 0x04, + 'i' => 0x22, + 'j' => 0x26, + 'k' => 0x28, + 'l' => 0x25, + 'm' => 0x2E, + 'n' => 0x2D, + 'o' => 0x1F, + 'p' => 0x23, + 'q' => 0x0C, + 'r' => 0x0F, + 's' => 0x01, + 't' => 0x11, + 'u' => 0x20, + 'v' => 0x09, + 'w' => 0x0D, + 'x' => 0x07, + 'y' => 0x10, + 'z' => 0x06, + '0' => 0x1D, + '1' => 0x12, + '2' => 0x13, + '3' => 0x14, + '4' => 0x15, + '5' => 0x17, + '6' => 0x16, + '7' => 0x1A, + '8' => 0x1C, + '9' => 0x19, + '-' => 0x1B, + '=' => 0x18, + '[' => 0x21, + ']' => 0x1E, + '\\' => 0x2A, + ';' => 0x29, + '\'' => 0x27, + ',' => 0x2B, + '.' => 0x2F, + '/' => 0x2C, + '`' => 0x32, + _ => return None, + }; + Some(code) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn maps_named_keys() { + assert_eq!(keycode_for("return"), Some(0x24)); + assert_eq!(keycode_for("space"), Some(0x31)); + assert_eq!(keycode_for("esc"), Some(0x35)); + assert_eq!(keycode_for("escape"), Some(0x35)); + assert_eq!(keycode_for("left"), Some(0x7B)); + assert_eq!(keycode_for("f5"), Some(0x60)); + } + + #[test] + fn maps_letters_and_digits() { + assert_eq!(keycode_for("a"), Some(0x00)); + assert_eq!(keycode_for("z"), Some(0x06)); + assert_eq!(keycode_for("0"), Some(0x1D)); + assert_eq!(keycode_for(","), Some(0x2B)); + } + + #[test] + fn rejects_unknown_keys() { + assert_eq!(ansi_keycode("nope"), None); + assert_eq!(keycode_for("nope"), None); + } +} diff --git a/crates/jcode-app-core/src/tool/computer/mod.rs b/crates/jcode-app-core/src/tool/computer/mod.rs new file mode 100644 index 0000000000..70c2e63303 --- /dev/null +++ b/crates/jcode-app-core/src/tool/computer/mod.rs @@ -0,0 +1,474 @@ +//! Native macOS "computer use" tool. +//! +//! 
The desktop analog of the `browser` tool: a single `action`-dispatched tool +//! that lets the agent see the screen and control the macOS GUI. +//! +//! ## Mechanisms and visibility +//! +//! - **Coordinate input** (`click`/`type`/`key`/`scroll`/`drag`) uses Core +//! Graphics CGEvents on the shared HID stream, so it is *visible*: it moves the +//! real cursor and types into the focused app. +//! - **Accessibility actions** (`press`/`set_value`/`select_menu`/...) and +//! **scripting** (`run_applescript`) act on apps *by reference*, so they can +//! work in the **background** without moving the cursor. +//! +//! By default the tool prefers non-disruptive mechanisms and restraint: because +//! this runs on the user's own live machine, the agent should act only on the +//! requested task, prefer background AX/scripting over moving the cursor or +//! stealing focus, and never take proactive control of the desktop. This policy +//! is conveyed to the model via the tool description and the `discover` output. +//! +//! ## Progressive disclosure +//! +//! Only a small set of common actions is described in the always-on schema to +//! keep prompt cost low. The full action set is fetched on demand via +//! `action="discover"` with a `category`. +//! +//! Everything is gated behind `cfg(target_os = "macos")`; other platforms return +//! a clear "unsupported" error. 
+ +use super::{Tool, ToolContext, ToolOutput}; +use anyhow::{Context, Result, bail}; +use async_trait::async_trait; +use serde::Deserialize; +use serde_json::{Value, json}; + +#[cfg(target_os = "macos")] +mod ax; +#[cfg(target_os = "macos")] +mod discover; +#[cfg(target_os = "macos")] +mod input; +#[cfg(target_os = "macos")] +mod keys; +#[cfg(target_os = "macos")] +mod osa; +#[cfg(target_os = "macos")] +mod screen; +#[cfg(target_os = "macos")] +mod setup; +#[cfg(target_os = "macos")] +mod sys; +#[cfg(target_os = "macos")] +mod win; + +pub struct ComputerTool; + +impl ComputerTool { + pub fn new() -> Self { + Self + } +} + +#[derive(Debug, Deserialize)] +struct ComputerInput { + action: String, + // discovery + #[serde(default)] + category: Option<String>, + // coordinates + #[serde(default)] + x: Option<f64>, + #[serde(default)] + y: Option<f64>, + #[serde(default)] + to_x: Option<f64>, + #[serde(default)] + to_y: Option<f64>, + #[serde(default)] + w: Option<f64>, + #[serde(default)] + h: Option<f64>, + // text / keys + #[serde(default)] + text: Option<String>, + #[serde(default)] + keys: Option<String>, + #[serde(default)] + dx: Option<i32>, + #[serde(default)] + dy: Option<i32>, + #[serde(default)] + depth: Option<u32>, + // AX / scoping + #[serde(default)] + app: Option<String>, + #[serde(default)] + role: Option<String>, + #[serde(default)] + title: Option<String>, + #[serde(default)] + value: Option<String>, + #[serde(default)] + element: Option<Value>, + #[serde(default)] + ax_action: Option<String>, + #[serde(default)] + menu_path: Option<Vec<String>>, + // windows + #[serde(default)] + window_id: Option<i64>, + // scripting / wait / system + #[serde(default)] + script: Option<String>, + #[serde(default)] + contains: Option<String>, + #[serde(default)] + timeout_ms: Option<u64>, + #[serde(default)] + region: Option<[f64; 4]>, + #[serde(default)] + level: Option<f64>, + /// For mutating actions: resolve and report the target without acting. 
+ #[serde(default)] + dry_run: Option<bool>, +} + +/// Cap a tool output's text so a huge AX tree / clipboard / OCR dump cannot +/// blow up the context. Keeps the head and notes how much was dropped. +#[cfg(target_os = "macos")] +fn cap_output(mut out: ToolOutput, max_chars: usize) -> ToolOutput { + if out.output.len() > max_chars { + let mut cut = max_chars; + while cut > 0 && !out.output.is_char_boundary(cut) { + cut -= 1; + } + let dropped = out.output.len() - cut; + let head = out.output[..cut].to_string(); + out.output = format!("{head}\n… [truncated {dropped} chars]"); + } + out +} + +/// Actions that change desktop/app state. Used for dry_run gating. +#[cfg(target_os = "macos")] +fn is_mutating(action: &str) -> bool { + matches!( + action, + "move" + | "click" + | "double_click" + | "right_click" + | "drag" + | "scroll" + | "type" + | "key" + | "key_down" + | "key_up" + | "press" + | "set_value" + | "perform_action" + | "select_menu" + | "activate_app" + | "hide_app" + | "quit_app" + | "focus_window" + | "move_window" + | "resize_window" + | "minimize_window" + | "close_window" + | "set_clipboard" + | "run_applescript" + | "run_jxa" + | "notify" + | "set_brightness" + ) +} + +#[async_trait] +impl Tool for ComputerTool { + fn name(&self) -> &str { + "macos_computer_use" + } + + fn description(&self) -> &str { + "Control the macOS desktop: see the screen (screenshot/ocr/ui tree), click and type \ + (visible coordinate input), act on UI elements in the BACKGROUND via Accessibility \ + (press/set_value, no cursor movement), manage apps and windows, use the clipboard, and \ + run AppleScript. Coordinates are in points (top-left origin). This is the user's live \ + machine: act only on the requested task (not proactively) and prefer BACKGROUND \ + AX/scripting over moving the cursor or stealing focus; click/type only when AX can't \ + reach the target. Call action='discover' with a category for the full action set. 
Run \ + action='setup' first if permissions are missing." + } + + fn parameters_schema(&self) -> Value { + // Progressive disclosure: only the common actions + discover are spelled + // out here to keep always-on prompt cost low (~370 tokens). Advanced + // actions and their params are returned by action="discover". + json!({ + "type": "object", + "required": ["action"], + "properties": { + "intent": super::intent_schema_property(), + "action": { + "type": "string", + "description": "Common: screenshot, ocr, ui (see); click, type, key (visible input); \ + press, set_value (BACKGROUND AX action on an `element` handle); find_element; \ + run_applescript; setup, check_permissions; discover (load full action set). \ + Many more actions (move, drag, scroll, window/app management, clipboard, \ + select_menu, notify, ...) take the same fields; call discover for their params." + }, + "category": { + "type": "string", + "enum": ["mouse","keyboard","observe","ax","windows","apps","clipboard","scripting","system","setup","all"], + "description": "For action='discover': which group to return full action specs for." + }, + "x": { "type": "number", "description": "Screen X in points (top-left origin)." }, + "y": { "type": "number", "description": "Screen Y in points." }, + "text": { "type": "string", "description": "Text for type / set_clipboard / notify." }, + "keys": { "type": "string", "description": "Key chord, e.g. cmd+space, return, esc, ctrl+shift+t." }, + "app": { "type": "string", "description": "Target app/process name (AX, windows, scripting scope)." }, + "role": { "type": "string", "description": "AX role filter for find_element, e.g. AXButton." }, + "title": { "type": "string", "description": "AX title/label substring for find_element." }, + "value": { "type": "string", "description": "Value to match (find_element) or set (set_value)." }, + "element": { + "type": "object", + "description": "Element handle from find_element/ui: {app, path:[child indices]}. 
Used by press/set_value/get_value/perform_action.", + "properties": { + "app": { "type": "string" }, + "path": { "type": "array", "items": { "type": "integer" } } + } + }, + "script": { "type": "string", "description": "AppleScript (run_applescript) or JS (run_jxa) source." }, + "depth": { "type": "integer", "description": "Max AX tree depth for ui/find_element (default 12)." }, + "dry_run": { "type": "boolean", "description": "For mutating actions: report the intended action without performing it." } + } + }) + } + + async fn execute(&self, input: Value, _ctx: ToolContext) -> Result<ToolOutput> { + let parsed: ComputerInput = + serde_json::from_value(input).context("invalid `macos_computer_use` tool input")?; + tokio::task::spawn_blocking(move || run(parsed)) + .await + .context("macos_computer_use tool task panicked")? + } +} + +#[cfg(not(target_os = "macos"))] +fn run(_input: ComputerInput) -> Result<ToolOutput> { + bail!("The `macos_computer_use` tool is only supported on macOS.") +} + +#[cfg(target_os = "macos")] +fn run(input: ComputerInput) -> Result<ToolOutput> { + let action = input.action.as_str(); + + // dry_run: for mutating actions, report the intended target and stop. + if input.dry_run == Some(true) && is_mutating(action) { + return Ok(ToolOutput::new(format!( + "[dry_run] would perform '{action}' (no action taken). \ + Re-issue without dry_run to execute." + ))); + } + + let result = dispatch(action, &input); + // Cap large textual outputs to protect context (images are unaffected). 
+ result.map(|o| cap_output(o, 16_000)) +} + +#[cfg(target_os = "macos")] +fn dispatch(action: &str, input: &ComputerInput) -> Result<ToolOutput> { + match action { + // ---- discovery & setup ---- + "discover" => discover::discover(input.category.as_deref()), + "setup" => setup::setup(), + "check_permissions" => setup::check_permissions(), + + // ---- observe ---- + "screenshot" => screen::screenshot(), + "ocr" => screen::ocr(input.region), + "window_screenshot" => { + let id = input.window_id.context("window_screenshot requires `window_id`")?; + screen::window_screenshot(id) + } + "ui" => ax::ui_tree(input.app.as_deref(), input.depth.unwrap_or(12)), + "cursor" => { + let p = input::current_cursor()?; + Ok(ToolOutput::new(format!("cursor at ({:.0}, {:.0})", p.x, p.y)) + .with_metadata(json!({ "x": p.x, "y": p.y }))) + } + + // ---- coordinate input (visible) ---- + "move" => { + let (x, y) = require_xy(input)?; + input::move_to(x, y)?; + Ok(ToolOutput::new(format!("moved cursor to ({x:.0}, {y:.0})"))) + } + "click" => { + let p = input::click(input.x, input.y, input::Button::Left, 1)?; + Ok(ToolOutput::new(format!("clicked at ({:.0}, {:.0})", p.x, p.y))) + } + "double_click" => { + let p = input::click(input.x, input.y, input::Button::Left, 2)?; + Ok(ToolOutput::new(format!("double-clicked at ({:.0}, {:.0})", p.x, p.y))) + } + "right_click" => { + let p = input::click(input.x, input.y, input::Button::Right, 1)?; + Ok(ToolOutput::new(format!("right-clicked at ({:.0}, {:.0})", p.x, p.y))) + } + "drag" => { + let (x, y) = require_xy(input)?; + match (input.to_x, input.to_y) { + (Some(tx), Some(ty)) => { + input::drag(x, y, tx, ty)?; + Ok(ToolOutput::new(format!( + "dragged from ({x:.0},{y:.0}) to ({tx:.0},{ty:.0})" + ))) + } + _ => bail!("action='drag' requires `to_x` and `to_y`"), + } + } + "scroll" => { + let dx = input.dx.unwrap_or(0); + let dy = input.dy.unwrap_or(0); + if dx == 0 && dy == 0 { + bail!("action='scroll' requires non-zero `dx` and/or `dy`"); + } + 
input::scroll(input.x, input.y, dx, dy)?; + Ok(ToolOutput::new(format!("scrolled dx={dx} dy={dy}"))) + } + "type" => { + let text = input + .text + .as_deref() + .filter(|s| !s.is_empty()) + .context("action='type' requires non-empty `text`")?; + input::type_text(text)?; + Ok(ToolOutput::new(format!("typed {} characters", text.chars().count()))) + } + "key" => { + let keys = input + .keys + .as_deref() + .filter(|s| !s.is_empty()) + .context("action='key' requires a `keys` chord, e.g. 'cmd+space'")?; + input::key_chord(keys)?; + Ok(ToolOutput::new(format!("pressed {keys}"))) + } + "key_down" | "key_up" => { + let keys = input + .keys + .as_deref() + .filter(|s| !s.is_empty()) + .context("requires a `keys` value")?; + input::key_hold(keys, action == "key_down")?; + Ok(ToolOutput::new(format!("{action} {keys}"))) + } + + // ---- AX background actions (Tier 1) ---- + "find_element" => { + let app = input.app.as_deref().context("find_element requires `app`")?; + ax::find_element( + app, + input.role.as_deref(), + input.title.as_deref(), + input.value.as_deref(), + input.depth.unwrap_or(20), + ) + } + "element_at" => { + let app = input.app.as_deref().context("element_at requires `app`")?; + let (x, y) = require_xy(input)?; + ax::element_at(app, x, y) + } + "press" => ax::press(&parse_element(input)?), + "get_value" => ax::get_value(&parse_element(input)?), + "set_value" => { + let v = input.value.as_deref().context("set_value requires `value`")?; + ax::set_value(&parse_element(input)?, v) + } + "perform_action" => { + let a = input.ax_action.as_deref().context("perform_action requires `ax_action`")?; + ax::perform_action(&parse_element(input)?, a) + } + "select_menu" => { + let app = input.app.as_deref().context("select_menu requires `app`")?; + let path = input.menu_path.as_ref().context("select_menu requires `menu_path`")?; + ax::select_menu(app, path) + } + + // ---- windows / apps (Tier 2) ---- + "list_apps" => win::list_apps(), + "list_windows" => 
win::list_windows(), + "activate_app" => win::activate_app(req_app(input)?), + "hide_app" => win::hide_app(req_app(input)?), + "quit_app" => win::quit_app(req_app(input)?), + "focus_window" => win::focus_window(req_app(input)?), + "move_window" => { + let (x, y) = require_xy(input)?; + win::move_window(req_app(input)?, x, y) + } + "resize_window" => { + let w = input.w.context("resize_window requires `w`")?; + let h = input.h.context("resize_window requires `h`")?; + win::resize_window(req_app(input)?, w, h) + } + "minimize_window" => win::minimize_window(req_app(input)?), + "close_window" => win::close_window(req_app(input)?), + + // ---- clipboard / scripting / system (Tier 3/4) ---- + "get_clipboard" => sys::get_clipboard(), + "set_clipboard" => { + let t = input.text.as_deref().context("set_clipboard requires `text`")?; + sys::set_clipboard(t) + } + "run_applescript" => { + let s = input.script.as_deref().context("run_applescript requires `script`")?; + sys::run_applescript(s) + } + "run_jxa" => { + let s = input.script.as_deref().context("run_jxa requires `script`")?; + sys::run_jxa(s) + } + "wait_for" => { + let app = input.app.as_deref().context("wait_for requires `app`")?; + let c = input.contains.as_deref().context("wait_for requires `contains`")?; + sys::wait_for(app, c, input.timeout_ms.unwrap_or(10_000)) + } + "notify" => { + let t = input.text.as_deref().context("notify requires `text`")?; + sys::notify(t, input.title.as_deref()) + } + "system_state" => sys::system_state(), + "set_brightness" => { + let l = input.level.context("set_brightness requires `level` (0..1)")?; + sys::set_brightness(l) + } + + other => bail!( + "Unknown macos_computer_use action: {other}. Call action='discover' (category='all') to list every action." 
+ ), + } +} + +#[cfg(target_os = "macos")] +fn require_xy(input: &ComputerInput) -> Result<(f64, f64)> { + match (input.x, input.y) { + (Some(x), Some(y)) => Ok((x, y)), + _ => bail!("action='{}' requires both `x` and `y`", input.action), + } +} + +#[cfg(target_os = "macos")] +fn req_app<'a>(input: &'a ComputerInput) -> Result<&'a str> { + input + .app + .as_deref() + .with_context(|| format!("action='{}' requires `app`", input.action)) +} + +#[cfg(target_os = "macos")] +fn parse_element(input: &ComputerInput) -> Result<ax::ElementHandle> { + let raw = input + .element + .clone() + .context("this action requires an `element` handle {app, path:[...]} from find_element/ui")?; + serde_json::from_value(raw).context("invalid `element` handle") +} + +#[cfg(all(test, target_os = "macos"))] +mod tests; +#[cfg(all(test, target_os = "macos"))] +mod coverage_tests; diff --git a/crates/jcode-app-core/src/tool/computer/osa.rs b/crates/jcode-app-core/src/tool/computer/osa.rs new file mode 100644 index 0000000000..296da5221c --- /dev/null +++ b/crates/jcode-app-core/src/tool/computer/osa.rs @@ -0,0 +1,157 @@ +//! Centralized `osascript` / JXA execution for the `macos_computer_use` tool. +//! +//! Many macOS capabilities (Accessibility actions, window/app management, system +//! state) are reachable through AppleScript / JavaScript-for-Automation without +//! extra native bindings. This module funnels all of that through one place so +//! escaping, error mapping (especially the TCC permission errors), and timeouts +//! are handled consistently. +//! +//! Every external command runs under a wall-clock timeout: a hung target app +//! must never freeze the agent. AppleScript also gets an internal +//! `with timeout` guard so System Events stops waiting on an unresponsive app. + +use anyhow::{Result, bail}; +use std::io::Read; +use std::process::{Command, Stdio}; +use std::time::{Duration, Instant}; + +/// Default wall-clock limit for a scripting call. 
+pub const DEFAULT_TIMEOUT: Duration = Duration::from_secs(20); + +/// Run an AppleScript and return stdout (trimmed). Maps the common macOS +/// permission / automation errors to actionable messages. +pub fn run_applescript(script: &str) -> Result<String> { + run(&["-e", script], "AppleScript", DEFAULT_TIMEOUT) +} + +/// Run AppleScript with an explicit timeout. +pub fn run_applescript_timeout(script: &str, timeout: Duration) -> Result<String> { + run(&["-e", script], "AppleScript", timeout) +} + +/// Run a JavaScript-for-Automation (JXA) script. +pub fn run_jxa(script: &str) -> Result<String> { + run(&["-l", "JavaScript", "-e", script], "JXA", DEFAULT_TIMEOUT) +} + +fn run(args: &[&str], lang: &str, timeout: Duration) -> Result<String> { + let (status, stdout, stderr) = run_command_timed("/usr/bin/osascript", args, timeout)?; + + if status { + return Ok(stdout.trim_end().to_string()); + } + + let trimmed = stderr.trim(); + let lower = trimmed.to_lowercase(); + + if lower.contains("assistive") + || lower.contains("not allowed") + || lower.contains("-1719") + || lower.contains("1002") + { + bail!( + "Accessibility permission required. Run the `setup` action, or grant it in \ + System Settings > Privacy & Security > Accessibility for your terminal/jcode. \ + ({trimmed})" + ); + } + if lower.contains("-1743") || lower.contains("not authorized to send apple events") { + bail!( + "Automation permission required for the target app. Approve the prompt, or grant it \ + in System Settings > Privacy & Security > Automation. ({trimmed})" + ); + } + if trimmed.is_empty() { + bail!("{lang} failed (no error output)"); + } + bail!("{lang} failed: {trimmed}"); +} + +/// Run a command with a wall-clock timeout. Returns (success, stdout, stderr). +/// On timeout the child is killed and an error is returned. 
+pub fn run_command_timed( + program: &str, + args: &[&str], + timeout: Duration, +) -> Result<(bool, String, String)> { + let mut child = Command::new(program) + .args(args) + .stdin(Stdio::null()) + .stdout(Stdio::piped()) + .stderr(Stdio::piped()) + .spawn() + .map_err(|e| anyhow::anyhow!("failed to spawn {program}: {e}"))?; + + let deadline = Instant::now() + timeout; + loop { + match child.try_wait() { + Ok(Some(status)) => { + let mut out = String::new(); + let mut err = String::new(); + if let Some(mut s) = child.stdout.take() { + let _ = s.read_to_string(&mut out); + } + if let Some(mut s) = child.stderr.take() { + let _ = s.read_to_string(&mut err); + } + return Ok((status.success(), out, err)); + } + Ok(None) => { + if Instant::now() >= deadline { + let _ = child.kill(); + let _ = child.wait(); + bail!( + "command timed out after {}s (a target app may be unresponsive): {program}", + timeout.as_secs() + ); + } + std::thread::sleep(Duration::from_millis(25)); + } + Err(e) => bail!("error waiting on {program}: {e}"), + } + } +} + +/// Quote a string as an AppleScript string literal (wraps in quotes, escapes +/// backslash and double-quote). Use for interpolating untrusted text into +/// generated AppleScript. 
+pub fn as_quote(s: &str) -> String { + let mut out = String::with_capacity(s.len() + 2); + out.push('"'); + for ch in s.chars() { + match ch { + '\\' => out.push_str("\\\\"), + '"' => out.push_str("\\\""), + _ => out.push(ch), + } + } + out.push('"'); + out +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn quotes_and_escapes() { + assert_eq!(as_quote("hi"), "\"hi\""); + assert_eq!(as_quote("a\"b"), "\"a\\\"b\""); + assert_eq!(as_quote("a\\b"), "\"a\\\\b\""); + } + + #[test] + fn timed_command_succeeds_fast() { + let (ok, out, _) = run_command_timed("/bin/echo", &["hi"], Duration::from_secs(5)).unwrap(); + assert!(ok); + assert_eq!(out.trim(), "hi"); + } + + #[test] + fn timed_command_times_out() { + let err = run_command_timed("/bin/sleep", &["5"], Duration::from_millis(200)) + .unwrap_err() + .to_string(); + assert!(err.contains("timed out"), "got: {err}"); + } +} diff --git a/crates/jcode-app-core/src/tool/computer/screen.rs b/crates/jcode-app-core/src/tool/computer/screen.rs new file mode 100644 index 0000000000..0cc9f015c4 --- /dev/null +++ b/crates/jcode-app-core/src/tool/computer/screen.rs @@ -0,0 +1,222 @@ +//! Screen observation: full-screen + per-window screenshots and OCR. + +use super::osa; +use anyhow::{Context, Result, bail}; +use base64::{Engine as _, engine::general_purpose::STANDARD}; +use core_graphics::display::CGDisplay; +use jcode_tool_types::ToolOutput; +use serde_json::json; +use std::process::Command; +use std::time::Duration; + +/// Read width/height from a PNG IHDR chunk. Returns None if not a PNG. 
+pub fn png_dimensions(bytes: &[u8]) -> Option<(u32, u32)> { + const PNG_SIG: [u8; 8] = [0x89, b'P', b'N', b'G', 0x0D, 0x0A, 0x1A, 0x0A]; + if bytes.len() < 24 || bytes[..8] != PNG_SIG { + return None; + } + let w = u32::from_be_bytes([bytes[16], bytes[17], bytes[18], bytes[19]]); + let h = u32::from_be_bytes([bytes[20], bytes[21], bytes[22], bytes[23]]); + Some((w, h)) +} + +fn capture_to_temp(extra_args: &[&str]) -> Result<Vec<u8>> { + let tmp = std::env::temp_dir().join(format!( + "jcode_computer_{}_{}.png", + std::process::id(), + std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .map(|d| d.as_millis()) + .unwrap_or(0) + )); + let tmp_str = tmp.to_string_lossy().to_string(); + let mut args: Vec<&str> = vec!["-x"]; + args.extend_from_slice(extra_args); + args.push(&tmp_str); + let (ok, _out, err) = + osa::run_command_timed("/usr/sbin/screencapture", &args, Duration::from_secs(15))?; + if !ok { + let _ = std::fs::remove_file(&tmp); + bail!("screencapture failed: {}", err.trim()); + } + let bytes = std::fs::read(&tmp).context("failed to read screenshot file")?; + let _ = std::fs::remove_file(&tmp); + if bytes.is_empty() { + bail!( + "screenshot was empty. Grant Screen Recording permission (run the `setup` action), or \ + in System Settings > Privacy & Security > Screen Recording." + ); + } + Ok(bytes) +} + +pub fn screenshot() -> Result<ToolOutput> { + let bytes = capture_to_temp(&[])?; + let bounds = CGDisplay::main().bounds(); + let point_w = bounds.size.width; + let point_h = bounds.size.height; + let (pixel_w, pixel_h) = png_dimensions(&bytes).unwrap_or((point_w as u32, point_h as u32)); + let scale = if point_w > 0.0 { + pixel_w as f64 / point_w + } else { + 1.0 + }; + let summary = format!( + "Captured main display: {pixel_w}x{pixel_h} pixels = {point_w:.0}x{point_h:.0} points \ + (scale {scale:.2}x). 
Click/move coordinates are in POINTS: for a feature at image pixel \ + (px, py), use x = px / {scale:.2}, y = py / {scale:.2}.", + ); + Ok(ToolOutput::new(summary) + .with_title("screenshot") + .with_labeled_image("image/png", STANDARD.encode(&bytes), "screen") + .with_metadata(json!({ + "width_points": point_w, "height_points": point_h, + "width_pixels": pixel_w, "height_pixels": pixel_h, "scale": scale, + }))) +} + +/// Screenshot a single window by its CoreGraphics window id, even if occluded. +pub fn window_screenshot(window_id: i64) -> Result<ToolOutput> { + let bytes = capture_to_temp(&["-o", "-l", &window_id.to_string()])?; + let (pixel_w, pixel_h) = png_dimensions(&bytes).unwrap_or((0, 0)); + Ok(ToolOutput::new(format!( + "Captured window {window_id}: {pixel_w}x{pixel_h} pixels." + )) + .with_title("window screenshot") + .with_labeled_image("image/png", STANDARD.encode(&bytes), "window") + .with_metadata(json!({ "window_id": window_id, "width_pixels": pixel_w, "height_pixels": pixel_h }))) +} + +/// OCR a region (or the whole screen) using the macOS Vision framework via a +/// small inline Swift/JXA bridge. Returns recognized strings with bounding +/// boxes so the model can click located text in apps with no Accessibility. +pub fn ocr(region: Option<[f64; 4]>) -> Result<ToolOutput> { + // Capture (optionally a region) to a temp file, then OCR it. 
+    let region_args: Vec<String> = if let Some([x, y, w, h]) = region {
+        vec!["-R".to_string(), format!("{x},{y},{w},{h}")]
+    } else {
+        vec![]
+    };
+    let tmp = std::env::temp_dir().join(format!("jcode_ocr_{}.png", std::process::id()));
+    let tmp_str = tmp.to_string_lossy().to_string();
+    let mut args: Vec<&str> = vec!["-x"];
+    for a in &region_args {
+        args.push(a);
+    }
+    args.push(&tmp_str);
+    let (ok, _o, err) =
+        osa::run_command_timed("/usr/sbin/screencapture", &args, Duration::from_secs(15))?;
+    if !ok {
+        let _ = std::fs::remove_file(&tmp);
+        bail!("screencapture failed for OCR: {}", err.trim());
+    }
+
+    let img_path = tmp.to_string_lossy().to_string();
+    let result = run_vision_ocr(&img_path);
+    let _ = std::fs::remove_file(&tmp);
+    let text = result?;
+    let summary = if text.trim().is_empty() {
+        "OCR found no text.".to_string()
+    } else {
+        text
+    };
+    Ok(ToolOutput::new(summary).with_title("ocr"))
+}
+
+/// Run Vision OCR on the image at `image_path`. An `osascript` one-liner is not
+/// viable (Vision has no AppleScript binding), so a tiny Swift program is run
+/// through `swift` when it is installed; otherwise a clear error is returned.
+fn run_vision_ocr(image_path: &str) -> Result<String> {
+    // There is no compiled helper: the Swift source below is piped to `swift -`.
+    let swift_src = format!(
+        r#"
+import Foundation
+import Vision
+import AppKit
+
+let url = URL(fileURLWithPath: "{path}")
+guard let img = NSImage(contentsOf: url), let cg = img.cgImage(forProposedRect: nil, context: nil, hints: nil) else {{
+    FileHandle.standardError.write("could not load image\n".data(using: .utf8)!)
+    exit(2)
+}}
+let req = VNRecognizeTextRequest {{ request, error in
+    guard let obs = request.results as? 
[VNRecognizedTextObservation] else {{ return }}
+    for o in obs {{
+        guard let top = o.topCandidates(1).first else {{ continue }}
+        let b = o.boundingBox // normalized, origin bottom-left
+        print("\(b.origin.x),\(b.origin.y),\(b.size.width),\(b.size.height)\t\(top.string)")
+    }}
+}}
+req.recognitionLevel = .accurate
+req.usesLanguageCorrection = true
+let handler = VNImageRequestHandler(cgImage: cg, options: [:])
+try? handler.perform([req])
+"#,
+        path = image_path.replace('\\', "\\\\").replace('"', "\\\""), // Swift-literal escaping
+    );
+
+    let swift = which_swift();
+    let Some(swift) = swift else {
+        bail!(
+            "OCR needs the Swift toolchain (Vision framework has no scripting bridge). \
+             Install Xcode command line tools: xcode-select --install"
+        );
+    };
+
+    let out = Command::new(&swift)
+        .arg("-")
+        .arg(image_path)
+        .env("JCODE_OCR_IMG", image_path)
+        .stdin(std::process::Stdio::piped())
+        .stdout(std::process::Stdio::piped())
+        .stderr(std::process::Stdio::piped())
+        .spawn()
+        .and_then(|mut child| {
+            use std::io::Write;
+            if let Some(mut stdin) = child.stdin.take() {
+                stdin.write_all(swift_src.as_bytes())?;
+            }
+            child.wait_with_output()
+        })
+        .context("failed to run swift for OCR")?;
+
+    if !out.status.success() {
+        let err = String::from_utf8_lossy(&out.stderr);
+        bail!("Vision OCR failed: {}", err.trim());
+    }
+    let raw = String::from_utf8_lossy(&out.stdout);
+    // Each line: x,y,w,h\ttext (bbox normalized, origin bottom-left).
+    if raw.trim().is_empty() {
+        // Nothing recognized: return "" so the caller can report "OCR found no text."
+        return Ok(String::new());
+    }
+    Ok(format!(
+        "Recognized text (bbox normalized 0..1, origin bottom-left; multiply by image size):\n{}",
+        raw.lines().collect::<Vec<_>>().join("\n")
+    ))
+}
+
+fn which_swift() -> Option<String> {
+    for p in ["/usr/bin/swift", "/usr/local/bin/swift"] {
+        if std::path::Path::new(p).exists() {
+            return Some(p.to_string());
+        }
+    }
+    // try PATH
+    let out = Command::new("/usr/bin/which").arg("swift").output().ok()?;
+    if out.status.success() {
+        let s = String::from_utf8_lossy(&out.stdout).trim().to_string();
+        if !s.is_empty() {
+            return Some(s);
+        }
+    }
+    None
+}
+
+/// Used by `osa`-based callers that just need a quick AX-free description.
+#[allow(dead_code)]
+pub fn frontmost_app() -> Result<String> {
+    osa::run_applescript(
+        "tell application \"System Events\" to get name of first application process whose frontmost is true",
+    )
+}
diff --git a/crates/jcode-app-core/src/tool/computer/setup.rs b/crates/jcode-app-core/src/tool/computer/setup.rs
new file mode 100644
index 0000000000..7a9db94227
--- /dev/null
+++ b/crates/jcode-app-core/src/tool/computer/setup.rs
@@ -0,0 +1,123 @@
+//! Permission setup: check, request, deep-link, and poll the macOS TCC grants
+//! needed for desktop control.
+
+use super::osa;
+use anyhow::Result;
+use jcode_tool_types::ToolOutput;
+use serde_json::json;
+use std::process::Command;
+use std::thread::sleep;
+use std::time::{Duration, Instant};
+
+fn accessibility_ok() -> bool {
+    // System Events reports whether assistive access is enabled for us. 
+ osa::run_applescript("tell application \"System Events\" to return UI elements enabled") + .map(|s| s.trim() == "true") + .unwrap_or(false) +} + +fn screen_recording_ok() -> bool { + let tmp = std::env::temp_dir().join(format!("jcode_setup_{}.png", std::process::id())); + let ok = Command::new("/usr/sbin/screencapture") + .arg("-x") + .arg(&tmp) + .status() + .map(|s| s.success()) + .unwrap_or(false) + && std::fs::metadata(&tmp).map(|m| m.len() > 0).unwrap_or(false); + let _ = std::fs::remove_file(&tmp); + ok +} + +fn yes_no(b: bool) -> &'static str { + if b { "granted" } else { "NOT granted" } +} + +/// Report status only. +pub fn check_permissions() -> Result<ToolOutput> { + let ax = accessibility_ok(); + let screen = screen_recording_ok(); + let swift = std::path::Path::new("/usr/bin/swift").exists() + || Command::new("/usr/bin/which") + .arg("swift") + .status() + .map(|s| s.success()) + .unwrap_or(false); + + let mut lines = vec![ + format!("Accessibility (input + AX control): {}", yes_no(ax)), + format!("Screen Recording (screenshots/OCR): {}", yes_no(screen)), + format!("Swift toolchain (for OCR): {}", if swift { "present" } else { "missing" }), + ]; + if !ax || !screen { + lines.push("Run action='setup' to request these and open the right settings pane.".into()); + } + Ok(ToolOutput::new(lines.join("\n")).with_metadata(json!({ + "accessibility": ax, "screen_recording": screen, "swift": swift, + }))) +} + +/// Request permissions: prompt, deep-link, and poll Accessibility until granted. +pub fn setup() -> Result<ToolOutput> { + let mut log = Vec::new(); + + let ax0 = accessibility_ok(); + let screen0 = screen_recording_ok(); + log.push(format!("Initial: accessibility={}, screen_recording={}", ax0, screen0)); + + // Trigger the Screen Recording prompt by attempting a capture (already done + // in screen_recording_ok). 
For Accessibility, prompt + pre-add jcode by + // opening the pane; the trust prompt itself is shown by the host process the + // first time it calls an AX API. + if !ax0 { + // Deep-link to the exact Accessibility pane. + let _ = Command::new("/usr/bin/open") + .arg("x-apple.systempreferences:com.apple.preference.security?Privacy_Accessibility") + .status(); + log.push( + "Opened Privacy & Security > Accessibility. Add and enable your terminal/jcode there." + .into(), + ); + } + if !screen0 { + let _ = Command::new("/usr/bin/open") + .arg("x-apple.systempreferences:com.apple.preference.security?Privacy_ScreenCapture") + .status(); + log.push( + "Opened Privacy & Security > Screen Recording. Add and enable your terminal/jcode there." + .into(), + ); + } + + // Poll Accessibility for up to ~30s so the agent can report "ready". + if !ax0 { + let deadline = Instant::now() + Duration::from_secs(30); + let mut granted = false; + while Instant::now() < deadline { + if accessibility_ok() { + granted = true; + break; + } + sleep(Duration::from_millis(1000)); + } + log.push(format!( + "Accessibility after wait: {}", + if granted { "granted" } else { "still not granted (toggle it, then re-run check_permissions)" } + )); + } + + let ax = accessibility_ok(); + let screen = screen_recording_ok(); + log.push(format!("Final: accessibility={}, screen_recording={}", ax, screen)); + if !ax { + log.push( + "NOTE: the Accessibility toggle cannot be enabled programmatically (macOS security). \ + It is the one switch you must flip by hand." + .into(), + ); + } + + Ok(ToolOutput::new(log.join("\n")).with_metadata(json!({ + "accessibility": ax, "screen_recording": screen, + }))) +} diff --git a/crates/jcode-app-core/src/tool/computer/sys.rs b/crates/jcode-app-core/src/tool/computer/sys.rs new file mode 100644 index 0000000000..d1f091b962 --- /dev/null +++ b/crates/jcode-app-core/src/tool/computer/sys.rs @@ -0,0 +1,154 @@ +//! 
Tier 3/4: clipboard, scripting bridge, waits, notifications, system state. + +use super::osa; +use anyhow::{Result, bail}; +use jcode_tool_types::ToolOutput; +use serde_json::json; +use std::process::Command; +use std::thread::sleep; +use std::time::{Duration, Instant}; + +pub fn get_clipboard() -> Result<ToolOutput> { + // pbpaste is the most reliable text read. + let out = Command::new("/usr/bin/pbpaste") + .output() + .map_err(|e| anyhow::anyhow!("failed to run pbpaste: {e}"))?; + let text = String::from_utf8_lossy(&out.stdout).to_string(); + Ok(ToolOutput::new(text).with_title("clipboard")) +} + +pub fn set_clipboard(text: &str) -> Result<ToolOutput> { + use std::io::Write; + let mut child = Command::new("/usr/bin/pbcopy") + .stdin(std::process::Stdio::piped()) + .spawn() + .map_err(|e| anyhow::anyhow!("failed to run pbcopy: {e}"))?; + if let Some(mut stdin) = child.stdin.take() { + stdin + .write_all(text.as_bytes()) + .map_err(|e| anyhow::anyhow!("failed to write to pbcopy: {e}"))?; + } + child + .wait() + .map_err(|e| anyhow::anyhow!("pbcopy failed: {e}"))?; + Ok(ToolOutput::new(format!("copied {} chars to clipboard", text.chars().count()))) +} + +pub fn run_applescript(script: &str) -> Result<ToolOutput> { + let out = osa::run_applescript(script)?; + Ok(ToolOutput::new(if out.is_empty() { + "(AppleScript ran, no output)".to_string() + } else { + out + }) + .with_title("applescript")) +} + +pub fn run_jxa(script: &str) -> Result<ToolOutput> { + let out = osa::run_jxa(script)?; + Ok(ToolOutput::new(if out.is_empty() { + "(JXA ran, no output)".to_string() + } else { + out + }) + .with_title("jxa")) +} + +pub fn notify(text: &str, title: Option<&str>) -> Result<ToolOutput> { + let title = title.unwrap_or("jcode"); + osa::run_applescript(&format!( + "display notification {} with title {}", + osa::as_quote(text), + osa::as_quote(title) + ))?; + Ok(ToolOutput::new(format!("posted notification: {text}"))) +} + +/// Poll an app's AX tree until a substring appears 
(element_appears) or a
+/// timeout elapses (capped at 60s). Cheap structural wait instead of fixed sleeps.
+pub fn wait_for(app: &str, contains: &str, timeout_ms: u64) -> Result<ToolOutput> {
+    let timeout_ms = timeout_ms.min(60_000); // shadowed so the error reports the real wait
+    let deadline = Instant::now() + Duration::from_millis(timeout_ms);
+    let script = format!(
+        r#"
+using terms from application "System Events"
+    on dumpEl(el, lvl, maxlvl)
+        set out to ""
+        if lvl > maxlvl then return out
+        try
+            set out to out & (title of el as text) & " "
+        end try
+        try
+            set out to out & (value of el as text) & " "
+        end try
+        try
+            repeat with child in (UI elements of el)
+                set out to out & (my dumpEl(child, lvl + 1, maxlvl))
+            end repeat
+        end try
+        return out
+    end dumpEl
+end using terms from
+tell application "System Events"
+    set frontApp to first application process whose name is {app}
+    try
+        return my dumpEl(front window of frontApp, 0, 10)
+    on error
+        return ""
+    end try
+end tell
+"#,
+        app = osa::as_quote(app)
+    );
+    loop {
+        if osa::run_applescript(&script).unwrap_or_default().contains(contains) {
+            return Ok(ToolOutput::new(format!("matched '{contains}' in {app}")));
+        }
+        if Instant::now() >= deadline {
+            bail!("wait_for timed out after {timeout_ms}ms (no '{contains}' in {app})");
+        }
+        sleep(Duration::from_millis(250));
+    }
+}
+
+/// Read common system state (battery, display brightness, focus, etc.). 
+pub fn system_state() -> Result<ToolOutput> { + let battery = Command::new("/usr/bin/pmset") + .args(["-g", "batt"]) + .output() + .ok() + .map(|o| String::from_utf8_lossy(&o.stdout).trim().to_string()) + .unwrap_or_default(); + let date = Command::new("/bin/date") + .output() + .ok() + .map(|o| String::from_utf8_lossy(&o.stdout).trim().to_string()) + .unwrap_or_default(); + Ok(ToolOutput::new(format!("date: {date}\n{battery}")) + .with_title("system_state") + .with_metadata(json!({"battery_raw": battery}))) +} + +/// Set display brightness 0.0..1.0 using the `brightness` cli if present, else +/// fall back to AppleScript key events. Brightness has no stable public API, so +/// this is best-effort. +pub fn set_brightness(level: f64) -> Result<ToolOutput> { + let level = level.clamp(0.0, 1.0); + // Try the `brightness` homebrew tool first. + for path in ["/opt/homebrew/bin/brightness", "/usr/local/bin/brightness"] { + if std::path::Path::new(path).exists() { + let ok = Command::new(path) + .arg(format!("{level}")) + .status() + .map(|s| s.success()) + .unwrap_or(false); + if ok { + return Ok(ToolOutput::new(format!("set brightness to {level:.2}"))); + } + } + } + bail!( + "no brightness control available. Install with `brew install brightness`, or adjust via the \ + brightness keys." + ) +} diff --git a/crates/jcode-app-core/src/tool/computer/tests.rs b/crates/jcode-app-core/src/tool/computer/tests.rs new file mode 100644 index 0000000000..9380c4a22b --- /dev/null +++ b/crates/jcode-app-core/src/tool/computer/tests.rs @@ -0,0 +1,233 @@ +//! Tests for the macos_computer_use tool. Pure-logic tests run anywhere on macOS; live +//! tests that synthesize events / capture the screen are `#[ignore]`d. 
+ +use super::*; +use jcode_tool_core::{ToolContext, ToolExecutionMode}; + +fn ctx() -> ToolContext { + ToolContext { + session_id: "test".into(), + message_id: "test".into(), + tool_call_id: "test".into(), + working_dir: None, + stdin_request_tx: None, + graceful_shutdown_signal: None, + execution_mode: ToolExecutionMode::Direct, + } +} + +async fn run_action(v: Value) -> Result<ToolOutput> { + ComputerTool::new().execute(v, ctx()).await +} + +// ---- pure logic ---- + +#[tokio::test] +async fn rejects_bad_action() { + let err = run_action(json!({ "action": "frobnicate" })) + .await + .unwrap_err(); + assert!(err.to_string().contains("Unknown macos_computer_use action")); +} + +#[tokio::test] +async fn move_requires_coords() { + let err = run_action(json!({ "action": "move" })).await.unwrap_err(); + assert!(err.to_string().contains("requires")); +} + +#[tokio::test] +async fn discover_all_lists_actions() { + let out = run_action(json!({ "action": "discover", "category": "all" })) + .await + .unwrap(); + // Spot-check that several categories are present. 
+ for needle in ["press", "set_value", "run_applescript", "list_windows", "screenshot"] { + assert!(out.output.contains(needle), "missing {needle}"); + } +} + +#[tokio::test] +async fn discover_category_scopes() { + let out = run_action(json!({ "action": "discover", "category": "ax" })) + .await + .unwrap(); + assert!(out.output.contains("find_element")); + assert!(!out.output.contains("set_brightness")); +} + +#[tokio::test] +async fn press_requires_element() { + let err = run_action(json!({ "action": "press" })).await.unwrap_err(); + assert!(err.to_string().contains("element")); +} + +#[tokio::test] +async fn dry_run_skips_mutation() { + let out = run_action(json!({ "action": "click", "x": 10, "y": 10, "dry_run": true })) + .await + .unwrap(); + assert!(out.output.contains("dry_run")); + assert!(out.output.contains("click")); +} + +#[tokio::test] +async fn dry_run_ignored_for_readonly() { + let out = run_action(json!({ "action": "discover", "category": "ax", "dry_run": true })) + .await + .unwrap(); + assert!(out.output.contains("find_element")); +} + +#[test] +fn cap_output_truncates() { + let big = "x".repeat(20_000); + let capped = super::cap_output(ToolOutput::new(big), 16_000); + assert!(capped.output.len() < 16_200); + assert!(capped.output.contains("truncated")); +} + +#[test] +fn is_mutating_classifies() { + assert!(super::is_mutating("click")); + assert!(super::is_mutating("quit_app")); + assert!(super::is_mutating("set_value")); + assert!(!super::is_mutating("screenshot")); + assert!(!super::is_mutating("ui")); + assert!(!super::is_mutating("discover")); +} + +#[test] +fn schema_is_compact() { + // Guard against context bloat: the always-on schema + description must stay + // small. Measured well under this bound; alert if it balloons. 
+ let tool = ComputerTool::new(); + let schema = serde_json::to_string(&tool.parameters_schema()).unwrap(); + let total = tool.description().len() + schema.len(); + // ~4 chars/token; keep always-on cost roughly under ~750 tokens. The bound + // leaves room for the short safety/restraint guidance in the description + // (act only on the requested task; prefer background mechanisms) while still + // flagging any real ballooning. + assert!( + total < 3000, + "macos_computer_use tool always-on size grew to {total} chars (~{} tokens)", + total / 4 + ); +} + +// ---- live (need GUI + permissions); run with --ignored ---- + +#[tokio::test] +#[ignore = "requires GUI + permissions"] +async fn live_check_permissions() { + let out = run_action(json!({ "action": "check_permissions" })) + .await + .unwrap(); + eprintln!("{}", out.output); + assert!(out.metadata.is_some()); +} + +#[tokio::test] +#[ignore = "requires GUI + permissions"] +async fn live_cursor_and_move() { + run_action(json!({ "action": "move", "x": 400, "y": 300 })) + .await + .unwrap(); + let after = run_action(json!({ "action": "cursor" })).await.unwrap(); + let meta = after.metadata.unwrap(); + assert!((meta["x"].as_f64().unwrap() - 400.0).abs() < 5.0); + assert!((meta["y"].as_f64().unwrap() - 300.0).abs() < 5.0); +} + +#[tokio::test] +#[ignore = "requires GUI + permissions"] +async fn live_screenshot() { + let out = run_action(json!({ "action": "screenshot" })).await.unwrap(); + assert_eq!(out.images.len(), 1); + assert_eq!(out.images[0].media_type, "image/png"); + eprintln!("{}", out.output); +} + +#[tokio::test] +#[ignore = "requires GUI + permissions"] +async fn live_ui_tree() { + let out = run_action(json!({ "action": "ui", "depth": 3 })) + .await + .unwrap(); + eprintln!("{}", out.output); + assert!(out.output.contains("App:")); +} + +#[tokio::test] +#[ignore = "requires GUI + permissions"] +async fn live_list_windows() { + let out = run_action(json!({ "action": "list_windows" })).await.unwrap(); + 
eprintln!("{}", out.output); +} + +#[tokio::test] +#[ignore = "requires GUI + permissions"] +async fn live_clipboard_roundtrip() { + run_action(json!({ "action": "set_clipboard", "text": "jcode-clip-test" })) + .await + .unwrap(); + let out = run_action(json!({ "action": "get_clipboard" })).await.unwrap(); + assert!(out.output.contains("jcode-clip-test")); +} + +#[tokio::test] +#[ignore = "requires GUI + permissions"] +async fn live_applescript() { + let out = run_action(json!({ "action": "run_applescript", "script": "return 2 + 2" })) + .await + .unwrap(); + assert!(out.output.contains("4")); +} + +/// Headline capability: set a TextEdit field's value via AX while TextEdit is +/// NOT frontmost, proving background control with no cursor movement. +#[tokio::test] +#[ignore = "requires GUI + permissions"] +async fn live_background_set_value() { + // Open a fresh TextEdit document. + run_action(json!({ + "action": "run_applescript", + "script": "tell application \"TextEdit\" to activate\ndelay 0.4\ntell application \"TextEdit\" to make new document\ndelay 0.4" + })) + .await + .unwrap(); + + // Move focus away so TextEdit is in the background. + run_action(json!({ + "action": "run_applescript", + "script": "tell application \"System Events\" to set frontmost of (first process whose name is \"System Events\") to true" + })) + .await + .ok(); + + let marker = "background-ax-marker-42"; + // Set the text area value by AX path (AXScrollArea[1] -> AXTextArea[1]). + run_action(json!({ + "action": "set_value", + "element": { "app": "TextEdit", "path": [1, 1] }, + "value": marker + })) + .await + .unwrap(); + + let content = run_action(json!({ + "action": "run_applescript", + "script": "tell application \"TextEdit\" to get text of document 1" + })) + .await + .unwrap(); + assert!(content.output.contains(marker), "got: {}", content.output); + + // Cleanup. 
+ run_action(json!({ + "action": "run_applescript", + "script": "tell application \"TextEdit\" to close every document saving no\ntell application \"TextEdit\" to quit" + })) + .await + .ok(); +} diff --git a/crates/jcode-app-core/src/tool/computer/win.rs b/crates/jcode-app-core/src/tool/computer/win.rs new file mode 100644 index 0000000000..336eef3d27 --- /dev/null +++ b/crates/jcode-app-core/src/tool/computer/win.rs @@ -0,0 +1,119 @@ +//! Tier 2: application and window management via System Events / NSWorkspace. + +use super::osa; +use anyhow::Result; +use jcode_tool_types::ToolOutput; + +pub fn list_apps() -> Result<ToolOutput> { + let script = "tell application \"System Events\" to get name of every application process whose background only is false"; + let res = osa::run_applescript(script)?; + let apps: Vec<String> = res.split(", ").map(|s| s.trim().to_string()).collect(); + Ok(ToolOutput::new(format!("Running apps ({}):\n{}", apps.len(), apps.join("\n"))) + .with_title("list_apps")) +} + +pub fn activate_app(app: &str) -> Result<ToolOutput> { + osa::run_applescript(&format!("tell application {} to activate", osa::as_quote(app)))?; + Ok(ToolOutput::new(format!("activated {app}"))) +} + +pub fn hide_app(app: &str) -> Result<ToolOutput> { + osa::run_applescript(&format!( + "tell application \"System Events\" to set visible of (first process whose name is {}) to false", + osa::as_quote(app) + ))?; + Ok(ToolOutput::new(format!("hid {app}"))) +} + +pub fn quit_app(app: &str) -> Result<ToolOutput> { + // `quit` blocks if the app shows a modal (e.g. an unsaved-changes sheet), so + // use a short timeout and report that case instead of hanging the agent. 
+ match osa::run_applescript_timeout( + &format!("tell application {} to quit", osa::as_quote(app)), + std::time::Duration::from_secs(8), + ) { + Ok(_) => Ok(ToolOutput::new(format!("quit {app}"))), + Err(e) if e.to_string().contains("timed out") => Ok(ToolOutput::new(format!( + "{app} did not quit within 8s — it is likely showing a dialog (e.g. unsaved changes). \ + Handle the dialog (screenshot + click, or press a key), then quit again." + ))), + Err(e) => Err(e), + } +} + +/// List on-screen windows with CG window ids, owners, titles, and bounds. +pub fn list_windows() -> Result<ToolOutput> { + // JXA can read the CG window list with ids reliably. + let script = r#" +ObjC.import('CoreGraphics'); +ObjC.import('Foundation'); +var opts = $.kCGWindowListOptionOnScreenOnly | $.kCGWindowListExcludeDesktopElements; +var arr = $.CGWindowListCopyWindowInfo(opts, $.kCGNullWindowID); +var n = $.CFArrayGetCount(arr); +var out = []; +for (var i = 0; i < n; i++) { + var d = $.CFArrayGetValueAtIndex(arr, i); + var dict = ObjC.castRefToObject(d); + var id = dict.objectForKey($('kCGWindowNumber')); + var owner = dict.objectForKey($('kCGWindowOwnerName')); + var name = dict.objectForKey($('kCGWindowName')); + var b = dict.objectForKey($('kCGWindowBounds')); + var bx = ObjC.deepUnwrap(b) || {}; + var ownerS = owner ? ObjC.unwrap(owner) : ''; + var nameS = name ? ObjC.unwrap(name) : ''; + var idN = id ? ObjC.unwrap(id) : ''; + out.push(idN + '\t' + ownerS + '\t' + (nameS||'') + '\t@(' + (bx.X|0) + ',' + (bx.Y|0) + ' ' + (bx.Width|0) + 'x' + (bx.Height|0) + ')'); +} +out.join('\n'); +"#; + let res = osa::run_jxa(script)?; + Ok(ToolOutput::new(format!( + "Windows (id owner title bounds):\n{}", + if res.trim().is_empty() { "(none)" } else { &res } + )) + .with_title("list_windows")) +} + +/// Window ops that target a window of an app by its (1-based) index or title. +/// We address via System Events AX windows of the owning process. 
+pub fn focus_window(app: &str) -> Result<ToolOutput> { + osa::run_applescript(&format!( + "tell application \"System Events\" to perform action \"AXRaise\" of (front window of (first process whose name is {}))", + osa::as_quote(app) + ))?; + // also bring app forward + let _ = activate_app(app); + Ok(ToolOutput::new(format!("focused front window of {app}"))) +} + +pub fn move_window(app: &str, x: f64, y: f64) -> Result<ToolOutput> { + osa::run_applescript(&format!( + "tell application \"System Events\" to set position of front window of (first process whose name is {}) to {{{x}, {y}}}", + osa::as_quote(app), x = x as i64, y = y as i64 + ))?; + Ok(ToolOutput::new(format!("moved {app} front window to ({x:.0},{y:.0})"))) +} + +pub fn resize_window(app: &str, w: f64, h: f64) -> Result<ToolOutput> { + osa::run_applescript(&format!( + "tell application \"System Events\" to set size of front window of (first process whose name is {}) to {{{w}, {h}}}", + osa::as_quote(app), w = w as i64, h = h as i64 + ))?; + Ok(ToolOutput::new(format!("resized {app} front window to {w:.0}x{h:.0}"))) +} + +pub fn minimize_window(app: &str) -> Result<ToolOutput> { + osa::run_applescript(&format!( + "tell application \"System Events\" to set value of attribute \"AXMinimized\" of front window of (first process whose name is {}) to true", + osa::as_quote(app) + ))?; + Ok(ToolOutput::new(format!("minimized {app} front window"))) +} + +pub fn close_window(app: &str) -> Result<ToolOutput> { + osa::run_applescript(&format!( + "tell application \"System Events\" to perform action \"AXPress\" of (button 1 of front window of (first process whose name is {}))", + osa::as_quote(app) + ))?; + Ok(ToolOutput::new(format!("closed {app} front window"))) +} diff --git a/crates/jcode-app-core/src/tool/mod.rs b/crates/jcode-app-core/src/tool/mod.rs index a25ed1b0fb..16f2299ee9 100644 --- a/crates/jcode-app-core/src/tool/mod.rs +++ b/crates/jcode-app-core/src/tool/mod.rs @@ -7,6 +7,8 @@ mod bg; mod browser; 
mod codesearch; mod communicate; +#[cfg(target_os = "macos")] +mod computer; mod conversation_search; mod debug_socket; mod edit; @@ -191,6 +193,13 @@ impl Registry { Self::insert_tool_timed(&mut m, &mut timings, "bash", bash::BashTool::new); Self::insert_tool_timed(&mut m, &mut timings, "browser", browser::BrowserTool::new); Self::insert_tool_timed(&mut m, &mut timings, "open", open::OpenTool::new); + #[cfg(target_os = "macos")] + Self::insert_tool_timed( + &mut m, + &mut timings, + "macos_computer_use", + computer::ComputerTool::new, + ); Self::insert_tool_timed( &mut m, &mut timings, diff --git a/crates/jcode-base/src/config/default_file.rs b/crates/jcode-base/src/config/default_file.rs index 40aa995cde..2604c6b563 100644 --- a/crates/jcode-base/src/config/default_file.rs +++ b/crates/jcode-base/src/config/default_file.rs @@ -231,6 +231,7 @@ tool_profile = "acp" # OpenAI reasoning effort (none|low|medium|high|xhigh) openai_reasoning_effort = "low" # Anthropic reasoning effort for Claude reasoning models (none|low|medium|high; xhigh on Opus 4.7; max aliases to the strongest supported level) +# Defaults to the strongest supported level for Claude Opus models (xhigh on Opus 4.7/4.8, high on older Opus) when unset; other models keep their own default. 
# anthropic_reasoning_effort = "medium" # OpenAI transport mode (auto|websocket|https) # openai_transport = "auto" diff --git a/crates/jcode-base/src/provider/anthropic.rs b/crates/jcode-base/src/provider/anthropic.rs index f6d865b029..0c59e084db 100644 --- a/crates/jcode-base/src/provider/anthropic.rs +++ b/crates/jcode-base/src/provider/anthropic.rs @@ -386,6 +386,7 @@ const DEFAULT_MAX_TOKENS: u32 = 32_768; /// Available models pub const AVAILABLE_MODELS: &[&str] = &[ "claude-opus-4-8", + "claude-fable-5", "claude-opus-4-6", "claude-opus-4-6[1m]", "claude-sonnet-4-6", @@ -581,6 +582,7 @@ impl AnthropicProvider { fn model_supports_output_effort(model: &str) -> bool { let model = Self::normalized_model_key(model); model.contains("claude-mythos") + || model.contains("claude-fable-5") || model.contains("claude-opus-4-8") || model.contains("claude-opus-4-7") || model.contains("claude-opus-4-6") @@ -591,6 +593,7 @@ impl AnthropicProvider { fn model_supports_adaptive_thinking(model: &str) -> bool { let model = Self::normalized_model_key(model); model.contains("claude-mythos") + || model.contains("claude-fable-5") || model.contains("claude-opus-4-8") || model.contains("claude-opus-4-7") || model.contains("claude-opus-4-6") @@ -606,7 +609,9 @@ impl AnthropicProvider { fn model_supports_xhigh_effort(model: &str) -> bool { let model = Self::normalized_model_key(model); - model.contains("claude-opus-4-8") || model.contains("claude-opus-4-7") + model.contains("claude-fable-5") + || model.contains("claude-opus-4-8") + || model.contains("claude-opus-4-7") } fn model_supports_reasoning_effort(model: &str) -> bool { @@ -645,6 +650,42 @@ impl AnthropicProvider { } } + /// Default reasoning effort to apply when the user has *not* explicitly + /// configured one. Claude Opus models are reasoning-heavy flagships, so we + /// default them to their strongest supported thinking level (`xhigh` on + /// Opus 4.7/4.8, clamped to `high` on older Opus). 
Every other model keeps + /// the model's own default (no forced effort) so cheaper models stay cheap. + fn default_reasoning_effort_for_model(model: &str) -> Option<String> { + if Self::normalized_model_key(model).contains("claude-opus") { + Some(Self::actual_effort_for_model(model, "max")) + } else { + None + } + } + + /// The raw, user-configured reasoning effort for this provider, if any. + /// `None` means "use the model default" (see + /// [`Self::default_reasoning_effort_for_model`]). + fn stored_reasoning_effort(&self) -> Option<String> { + self.reasoning_effort + .read() + .map(|guard| guard.clone()) + .unwrap_or_else(|poisoned| poisoned.into_inner().clone()) + } + + /// Effective reasoning effort for `model`, resolving the model default when + /// the user has not configured an explicit effort. + fn effort_for_model(&self, model: &str) -> Option<String> { + if !Self::model_supports_reasoning_effort(model) { + return None; + } + Some( + self.stored_reasoning_effort() + .or_else(|| Self::default_reasoning_effort_for_model(model)) + .unwrap_or_else(|| "none".to_string()), + ) + } + fn model_supports_priority_service_tier(model: &str) -> bool { Self::normalized_model_key(model).contains("claude-opus-4-8") } @@ -703,7 +744,7 @@ impl AnthropicProvider { is_oauth: bool, show_thinking: bool, ) -> (Option<ApiThinking>, Option<ApiOutputConfig>, Option<f32>) { - let effort = self.reasoning_effort(); + let effort = self.effort_for_model(model); let effort = effort.as_deref().filter(|effort| *effort != "none"); let output_config = effort @@ -1119,15 +1160,13 @@ impl Provider for AnthropicProvider { } fn reasoning_effort(&self) -> Option<String> { - if !Self::model_supports_reasoning_effort(&self.model()) { + let model = self.model(); + if !Self::model_supports_reasoning_effort(&model) { return None; } - let effort = self - .reasoning_effort - .read() - .map(|guard| guard.clone()) - .unwrap_or_else(|poisoned| poisoned.into_inner().clone()); - 
Some(effort.unwrap_or_else(|| "none".to_string())) + // Surface the *effective* effort so the UI/status reflects the Opus + // default (e.g. `xhigh`) when the user has not picked one explicitly. + self.effort_for_model(&model) } fn set_reasoning_effort(&self, effort: &str) -> Result<()> { @@ -1245,7 +1284,7 @@ impl Provider for AnthropicProvider { .unwrap_or_else(|poisoned| poisoned.into_inner()) .clone(), )), - reasoning_effort: Arc::new(std::sync::RwLock::new(self.reasoning_effort())), + reasoning_effort: Arc::new(std::sync::RwLock::new(self.stored_reasoning_effort())), service_tier: Arc::new(std::sync::RwLock::new(self.service_tier())), credentials: Arc::new(RwLock::new(None)), credential_mode: Arc::clone(&self.credential_mode), diff --git a/crates/jcode-base/src/provider/anthropic_tests.rs b/crates/jcode-base/src/provider/anthropic_tests.rs index bdef16873b..472078ef8b 100644 --- a/crates/jcode-base/src/provider/anthropic_tests.rs +++ b/crates/jcode-base/src/provider/anthropic_tests.rs @@ -135,6 +135,10 @@ fn test_anthropic_show_thinking_enables_adaptive_thinking_without_effort() { // Crucially, `output_config` must stay None so we do not force a stronger // (more expensive) reasoning level than the model's default. // + // We use a non-Opus model here because Opus now carries an implicit `xhigh` + // default (see `test_anthropic_opus_defaults_to_xhigh_effort`); Sonnet keeps + // the model's own default so this invariant stays meaningful. + // // `build_reasoning_request_parts_inner` takes the model directly, so we do // not depend on `set_model` accepting a particular catalog entry. With no // effort configured, `self.reasoning_effort()` resolves to None regardless @@ -146,7 +150,7 @@ fn test_anthropic_show_thinking_enables_adaptive_thinking_without_effort() { // show_thinking = false: nothing requested. 
let (thinking, output_config, _temp) = - provider.build_reasoning_request_parts_inner("claude-opus-4-8", true, false); + provider.build_reasoning_request_parts_inner("claude-sonnet-4-6", true, false); assert!( thinking.is_none(), "no thinking should be requested when both effort and show_thinking are off" @@ -155,10 +159,10 @@ fn test_anthropic_show_thinking_enables_adaptive_thinking_without_effort() { // show_thinking = true: adaptive thinking requested, no output_config. let (thinking, output_config, temperature) = - provider.build_reasoning_request_parts_inner("claude-opus-4-8", true, true); + provider.build_reasoning_request_parts_inner("claude-sonnet-4-6", true, true); match thinking.expect("show_thinking should enable adaptive thinking") { ApiThinking::Adaptive { display } => assert_eq!(display, Some("summarized")), - ApiThinking::Enabled { .. } => panic!("Opus 4.8 should use adaptive thinking"), + ApiThinking::Enabled { .. } => panic!("Sonnet 4.6 should use adaptive thinking"), } assert!( output_config.is_none(), @@ -170,24 +174,81 @@ fn test_anthropic_show_thinking_enables_adaptive_thinking_without_effort() { ); } +#[test] +fn test_anthropic_opus_defaults_to_xhigh_effort() { + // Opus is a reasoning-heavy flagship, so when the user has *not* configured + // an explicit effort it should default to its strongest supported level + // (`xhigh` on Opus 4.7/4.8). This drives both the request `output_config` + // and the surfaced `reasoning_effort()` status. + let provider = AnthropicProvider::new(); + // Clear any ambient config-provided effort so we exercise the model default. + *provider.reasoning_effort.write().unwrap() = None; + + assert_eq!( + AnthropicProvider::default_reasoning_effort_for_model("claude-opus-4-8").as_deref(), + Some("xhigh"), + ); + assert_eq!( + AnthropicProvider::default_reasoning_effort_for_model("claude-opus-4-7").as_deref(), + Some("xhigh"), + ); + // Older Opus does not support xhigh, so it clamps to high. 
+ assert_eq!( + AnthropicProvider::default_reasoning_effort_for_model("claude-opus-4-5").as_deref(), + Some("high"), + ); + // Non-Opus models keep the model's own default (no forced effort). + assert_eq!( + AnthropicProvider::default_reasoning_effort_for_model("claude-sonnet-4-6"), + None, + ); + + // Even without show_thinking, Opus forces its strongest output effort. + let (thinking, output_config, _temp) = + provider.build_reasoning_request_parts_inner("claude-opus-4-8", true, false); + assert_eq!( + output_config + .expect("Opus should default to a forced output effort") + .effort, + "xhigh", + ); + match thinking.expect("Opus default effort should enable adaptive thinking") { + ApiThinking::Adaptive { display } => assert_eq!(display, Some("summarized")), + ApiThinking::Enabled { .. } => panic!("Opus 4.8 should use adaptive thinking"), + } + + // The surfaced status mirrors the effective default for the active model. + *provider + .model + .write() + .unwrap_or_else(|poisoned| poisoned.into_inner()) = "claude-opus-4-8".to_string(); + assert_eq!(provider.reasoning_effort().as_deref(), Some("xhigh")); + + // An explicit user override still wins over the Opus default. + provider.set_reasoning_effort("low").unwrap(); + assert_eq!(provider.reasoning_effort().as_deref(), Some("low")); +} + #[test] fn test_anthropic_show_thinking_enables_manual_thinking_without_effort() { - // Manual-thinking models (e.g. Opus 4.5) need a concrete budget; with only - // the display toggle on we fall back to the minimal budget. The model is + // Manual-thinking models (e.g. Claude 3.7 Sonnet) need a concrete budget; + // with only the display toggle on we fall back to the minimal budget. We use + // a non-Opus model here because Opus now carries an implicit strongest-effort + // default (see `test_anthropic_opus_defaults_to_xhigh_effort`). The model is // passed directly so this does not depend on `set_model` validation. 
let provider = AnthropicProvider::new(); // Independent of ambient config: clear any configured effort. *provider.reasoning_effort.write().unwrap() = None; let (thinking, _output_config, _temp) = - provider.build_reasoning_request_parts_inner("claude-opus-4-5", false, false); + provider.build_reasoning_request_parts_inner("claude-3-7-sonnet", false, false); assert!(thinking.is_none()); let (thinking, _output_config, _temperature) = - provider.build_reasoning_request_parts_inner("claude-opus-4-5", false, true); + provider.build_reasoning_request_parts_inner("claude-3-7-sonnet", false, true); match thinking.expect("show_thinking should enable manual thinking") { ApiThinking::Enabled { budget_tokens } => assert_eq!(budget_tokens, 1_024), - ApiThinking::Adaptive { .. } => panic!("Opus 4.5 should use manual thinking"), + ApiThinking::Adaptive { .. } => panic!("Claude 3.7 Sonnet should use manual thinking"), } } diff --git a/crates/jcode-base/src/provider/gemini_tests.rs b/crates/jcode-base/src/provider/gemini_tests.rs index df7b60bdea..4393e7d158 100644 --- a/crates/jcode-base/src/provider/gemini_tests.rs +++ b/crates/jcode-base/src/provider/gemini_tests.rs @@ -193,7 +193,8 @@ fn build_contents_replays_thought_signature_on_function_call() { // Gemini 3 (Antigravity Cloud Code backend) rejects function calls that // omit the original thoughtSignature on later turns. Verify the signature // captured on the ToolUse block is replayed verbatim on the functionCall - // part, and that an absent/empty signature stays absent. + // part. A later unsigned call inherits the most recent real signature so the + // backend (which 400s a fully-unsigned turn) accepts it (issue #339). 
let messages = vec![ Message { role: Role::Assistant, @@ -226,8 +227,10 @@ fn build_contents_replays_thought_signature_on_function_call() { "signature must be replayed on the matching function call part" ); assert_eq!( - contents[1].parts[0].thought_signature, None, - "missing signature must not be fabricated" + contents[1].parts[0].thought_signature.as_deref(), + Some("SIGNATURE_ABC"), + "an unsigned later call must inherit the most recent real signature so \ + the backend does not reject a fully-unsigned turn" ); } @@ -280,6 +283,134 @@ fn build_contents_replays_every_signature_across_multi_tool_history() { ); } +#[test] +fn build_contents_carries_first_signature_onto_unsigned_same_turn_siblings() { + // Issue #339: when Gemini-3 emits MULTIPLE function calls in ONE turn it + // signs only the first; the siblings persist without a signature. The + // Antigravity/Cloud Code backend then rejects the unsigned siblings with + // "Function call is missing a thought_signature ... position N". Verify the + // first call's signature is carried forward onto same-turn siblings that + // lack one (the backend accepts a replayed signature on sibling calls). 
+ let messages = vec![Message { + role: Role::Assistant, + content: vec![ + ContentBlock::ToolUse { + id: "call_todo".to_string(), + name: "todo".to_string(), + input: json!({ "items": ["a", "b"] }), + thought_signature: Some("SIG_TURN_1".to_string()), + }, + ContentBlock::ToolUse { + id: "call_bash".to_string(), + name: "bash".to_string(), + input: json!({ "command": "ls" }), + thought_signature: None, + }, + ContentBlock::ToolUse { + id: "call_write".to_string(), + name: "write".to_string(), + input: json!({ "path": "a.txt", "content": "hi" }), + thought_signature: None, + }, + ], + timestamp: None, + tool_duration_ms: None, + }]; + + let contents = build_contents(&messages); + let replayed: Vec<Option<&str>> = contents + .iter() + .flat_map(|content| content.parts.iter()) + .filter(|part| part.function_call.is_some()) + .map(|part| part.thought_signature.as_deref()) + .collect(); + assert_eq!( + replayed, + vec![Some("SIG_TURN_1"), Some("SIG_TURN_1"), Some("SIG_TURN_1")], + "every functionCall in a multi-call turn must carry a signature so the \ + backend does not reject unsigned siblings" + ); +} + +#[test] +fn build_contents_carries_signature_forward_across_turns_for_unsigned_calls() { + // Issue #339: the Antigravity/Cloud Code backend 400s an assistant turn + // whose function calls are ALL unsigned. A later turn made entirely of + // locally synthesized / unsigned tool calls (auto-poke continuation, batch, + // manual tool use, or an imported pre-signature session) must inherit the + // most recent real signature from earlier in the conversation so at least + // one call carries a signature and the backend accepts the turn. 
+ let messages = vec![ + Message { + role: Role::Assistant, + content: vec![ContentBlock::ToolUse { + id: "turn1".to_string(), + name: "read".to_string(), + input: json!({ "path": "README.md" }), + thought_signature: Some("SIG_TURN_1".to_string()), + }], + timestamp: None, + tool_duration_ms: None, + }, + Message { + role: Role::User, + content: vec![ContentBlock::ToolResult { + tool_use_id: "turn1".to_string(), + content: "ok".to_string(), + is_error: Some(false), + }], + timestamp: None, + tool_duration_ms: None, + }, + Message { + role: Role::Assistant, + content: vec![ContentBlock::ToolUse { + id: "turn2".to_string(), + name: "bash".to_string(), + input: json!({ "command": "ls" }), + thought_signature: None, + }], + timestamp: None, + tool_duration_ms: None, + }, + ]; + + let contents = build_contents(&messages); + let last_turn_sig = contents + .last() + .and_then(|content| content.parts.first()) + .and_then(|part| part.thought_signature.as_deref()); + assert_eq!( + last_turn_sig, + Some("SIG_TURN_1"), + "a fully-unsigned later turn must inherit the most recent real signature \ + so the backend does not reject it" + ); +} + +#[test] +fn build_contents_leaves_unsigned_calls_unsigned_when_no_prior_signature_exists() { + // If the conversation has never produced a real signature there is nothing + // to carry; we must not fabricate one out of thin air. 
+ let messages = vec![Message { + role: Role::Assistant, + content: vec![ContentBlock::ToolUse { + id: "call".to_string(), + name: "bash".to_string(), + input: json!({ "command": "ls" }), + thought_signature: None, + }], + timestamp: None, + tool_duration_ms: None, + }]; + + let contents = build_contents(&messages); + assert_eq!( + contents[0].parts[0].thought_signature, None, + "with no prior signature in the conversation, an unsigned call stays unsigned" + ); +} + #[test] fn build_contents_preserves_tool_calls_and_results() { let messages = vec![ diff --git a/crates/jcode-base/src/provider/models_catalog.rs b/crates/jcode-base/src/provider/models_catalog.rs index 4a9824efd8..476682f738 100644 --- a/crates/jcode-base/src/provider/models_catalog.rs +++ b/crates/jcode-base/src/provider/models_catalog.rs @@ -226,8 +226,15 @@ pub async fn fetch_openai_api_key_model_catalog(api_key: &str) -> Result<OpenAIM note_openai_model_catalog_refresh_attempt(); let client = shared_http_client(); + // Honor the same API-base override as the Responses request path so a + // custom/proxied endpoint is probed for models instead of the real + // api.openai.com (issue #343). + let models_url = format!( + "{}/models", + crate::provider::openai::OpenAIProvider::resolve_api_base().trim_end_matches('/') + ); let resp = client - .get("https://api.openai.com/v1/models") + .get(&models_url) .header("Authorization", format!("Bearer {}", api_key)) .send() .await?; diff --git a/crates/jcode-base/src/provider/openai.rs b/crates/jcode-base/src/provider/openai.rs index 3becd8fe87..be1a228059 100644 --- a/crates/jcode-base/src/provider/openai.rs +++ b/crates/jcode-base/src/provider/openai.rs @@ -838,13 +838,64 @@ impl OpenAIProvider { fn responses_url(credentials: &CodexCredentials) -> String { let base = if Self::is_chatgpt_mode(credentials) { - CHATGPT_API_BASE + // ChatGPT/Codex OAuth backend is fixed; a custom base only applies + // to API-key usage of the native Responses API. 
+ CHATGPT_API_BASE.to_string() } else { - OPENAI_API_BASE + Self::resolve_api_base() }; format!("{}/{}", base.trim_end_matches('/'), RESPONSES_PATH) } + /// Resolve the OpenAI Responses API base URL for **API-key** mode. + /// + /// Defaults to `https://api.openai.com/v1`, but honors a user override so + /// the native `openai-api` provider can target a local/proxied Responses + /// API endpoint (issue #343). Checked in order: + /// `JCODE_OPENAI_API_BASE`, `OPENAI_BASE_URL`, `OPENAI_API_BASE`. + /// + /// The override must be an absolute `http(s)://` URL; anything else is + /// logged and ignored so a malformed value never silently breaks requests. + /// A `/responses` suffix is not expected here (it is appended by callers), + /// so a trailing `/responses` is trimmed to avoid `.../responses/responses`. + pub(crate) fn resolve_api_base() -> String { + const OVERRIDE_VARS: [&str; 3] = + ["JCODE_OPENAI_API_BASE", "OPENAI_BASE_URL", "OPENAI_API_BASE"]; + for var in OVERRIDE_VARS { + let Ok(raw) = std::env::var(var) else { + continue; + }; + let trimmed = raw.trim(); + if trimmed.is_empty() { + continue; + } + if !(trimmed.starts_with("http://") || trimmed.starts_with("https://")) { + crate::logging::warn(&format!( + "Ignoring invalid {} '{}'; expected an absolute http(s):// URL", + var, trimmed + )); + continue; + } + let normalized = trimmed + .trim_end_matches('/') + .trim_end_matches("/responses") + .trim_end_matches('/'); + if normalized.is_empty() { + crate::logging::warn(&format!( + "Ignoring invalid {} '{}'; URL has no host/path", + var, trimmed + )); + continue; + } + crate::logging::info(&format!( + "OpenAI Responses API base overridden to '{}' via {}", + normalized, var + )); + return normalized.to_string(); + } + OPENAI_API_BASE.to_string() + } + fn responses_ws_url(credentials: &CodexCredentials) -> String { let base = Self::responses_url(credentials); base.replace("https://", "wss://") diff --git a/crates/jcode-base/src/provider/openai_stream_runtime.rs 
b/crates/jcode-base/src/provider/openai_stream_runtime.rs index b5817813bf..18e7119d8b 100644 --- a/crates/jcode-base/src/provider/openai_stream_runtime.rs +++ b/crates/jcode-base/src/provider/openai_stream_runtime.rs @@ -1445,18 +1445,16 @@ fn classify_unavailable_model_error(status: StatusCode, body: &str) -> Option<St /// Check if an error is transient and should be retried pub(super) fn is_retryable_error(error_str: &str) -> bool { - // Network/connection errors - error_str.contains("connection reset") - || error_str.contains("connection closed") - || error_str.contains("connection refused") - || error_str.contains("broken pipe") - || error_str.contains("timed out") - || error_str.contains("timeout") + // Shared transport-layer classifier used by every other provider. This + // covers transient TLS/network faults (connection reset/closed/refused/ + // aborted, broken pipe, timeouts, unexpected EOF, error decoding/reading, + // TLS BadRecordMac / fatal-alert, TLS handshake EOF, DNS/route failures, + // and HTTP/2 stream/protocol faults). Keeping the OpenAI path delegated + // here ensures retry behavior is unified across providers (issue #338). + crate::provider::is_transient_transport_error(error_str) + // OpenAI-specific transport wrapper. || error_str.contains("failed to send request to openai api") - // Stream/decode errors - || error_str.contains("error decoding") - || error_str.contains("error reading") - || error_str.contains("unexpected eof") + // Stream/decode errors specific to the OpenAI streaming runtime. || error_str.contains("incomplete message") || error_str.contains("stream disconnected before completion") || error_str.contains("ended before message completion marker") @@ -1467,6 +1465,11 @@ pub(super) fn is_retryable_error(error_str: &str) -> bool { || error_str.contains("503 service unavailable") || error_str.contains("504 gateway timeout") || error_str.contains("overloaded") + // Rate limiting (429): transient, recovers on retry. 
Unified with the + // other providers (Anthropic/Copilot) which already retry these. + || error_str.contains("429 too many requests") + || error_str.contains("rate limit") + || error_str.contains("rate_limit") // API-level server errors || error_str.contains("api_error") || error_str.contains("server_error") @@ -1517,4 +1520,36 @@ mod stream_runtime_tests { "openai token refresh failed; run /login to re-authenticate: network error" )); } + + #[test] + fn tls_transient_errors_are_retryable() { + // Regression for issue #338: transient TLS faults must be retried on + // the OpenAI path, matching every other provider. Callers pass the + // error string already lowercased. + assert!(is_retryable_error( + "stream error: io error: received fatal alert: badrecordmac" + )); + assert!(is_retryable_error("received fatal alert: badrecordmac")); + assert!(is_retryable_error("decryption failed or bad record mac")); + assert!(is_retryable_error("tls handshake eof")); + assert!(is_retryable_error("connection aborted")); + assert!(is_retryable_error("temporary failure in name resolution")); + assert!(is_retryable_error("no route to host")); + assert!(is_retryable_error("network is unreachable")); + } + + #[test] + fn rate_limit_is_retryable() { + // Regression for issue #338 (gap #2): 429s should be retried, unifying + // behavior with Anthropic/Copilot. 
+ assert!(is_retryable_error("429 too many requests")); + assert!(is_retryable_error("rate limit exceeded")); + assert!(is_retryable_error("rate_limit_exceeded")); + } + + #[test] + fn auth_errors_remain_non_retryable() { + assert!(!is_retryable_error("401 unauthorized")); + assert!(!is_retryable_error("invalid api key")); + } } diff --git a/crates/jcode-base/src/provider/openai_tests.rs b/crates/jcode-base/src/provider/openai_tests.rs index 8d9e3f42bb..bddee6f7ae 100644 --- a/crates/jcode-base/src/provider/openai_tests.rs +++ b/crates/jcode-base/src/provider/openai_tests.rs @@ -32,6 +32,12 @@ impl EnvVarGuard { crate::env::set_var(key, value); Self { key, previous } } + + fn remove(key: &'static str) -> Self { + let previous = std::env::var_os(key); + crate::env::remove_var(key); + Self { key, previous } + } } impl Drop for EnvVarGuard { diff --git a/crates/jcode-base/src/provider/openai_tests/models_state.rs b/crates/jcode-base/src/provider/openai_tests/models_state.rs index 479460a609..363619dff3 100644 --- a/crates/jcode-base/src/provider/openai_tests/models_state.rs +++ b/crates/jcode-base/src/provider/openai_tests/models_state.rs @@ -263,3 +263,89 @@ fn openai_catalog_and_chat_endpoints_agree_on_credential_shape() { "credential with an id token must be treated as ChatGPT/Codex mode" ); } + +/// Issue #343: the native `openai-api` (Responses API) base URL must be +/// overridable for API-key usage so local/proxied Responses endpoints work, +/// while ChatGPT/Codex OAuth mode stays pinned to the Codex backend. 
+#[test] +fn responses_url_honors_api_base_override_in_api_key_mode() { + let _guard = crate::storage::lock_test_env(); + let _b = EnvVarGuard::remove("JCODE_OPENAI_API_BASE"); + let _c = EnvVarGuard::remove("OPENAI_BASE_URL"); + let _d = EnvVarGuard::remove("OPENAI_API_BASE"); + + let api_key_creds = CodexCredentials { + access_token: "sk-platform-key".to_string(), + refresh_token: String::new(), + id_token: None, + account_id: None, + expires_at: None, + }; + + // Default base when unset. + assert_eq!( + OpenAIProvider::responses_url(&api_key_creds), + format!("{}/responses", OPENAI_API_BASE), + ); + + // Override is applied (and a trailing slash is tolerated). + let _override = EnvVarGuard::set("JCODE_OPENAI_API_BASE", "http://127.0.0.1:8317/v1/"); + assert_eq!( + OpenAIProvider::responses_url(&api_key_creds), + "http://127.0.0.1:8317/v1/responses", + ); + // WS URL derives from the same base. + assert_eq!( + OpenAIProvider::responses_ws_url(&api_key_creds), + "ws://127.0.0.1:8317/v1/responses", + ); + // Compact endpoint too. + assert_eq!( + OpenAIProvider::responses_compact_url(&api_key_creds), + "http://127.0.0.1:8317/v1/responses/compact", + ); +} + +#[test] +fn responses_url_ignores_override_in_chatgpt_mode() { + let _guard = crate::storage::lock_test_env(); + let _override = EnvVarGuard::set("JCODE_OPENAI_API_BASE", "http://127.0.0.1:8317/v1"); + + let oauth_creds = CodexCredentials { + access_token: "oauth-access".to_string(), + refresh_token: "oauth-refresh".to_string(), + id_token: None, + account_id: None, + expires_at: None, + }; + // ChatGPT/Codex OAuth backend must stay fixed regardless of the override. 
+ assert!( + OpenAIProvider::responses_url(&oauth_creds).starts_with(CHATGPT_API_BASE), + "ChatGPT/Codex mode must ignore the API base override" + ); +} + +#[test] +fn resolve_api_base_precedence_and_validation() { + let _guard = crate::storage::lock_test_env(); + let _a = EnvVarGuard::remove("JCODE_OPENAI_API_BASE"); + let _b = EnvVarGuard::remove("OPENAI_BASE_URL"); + let _c = EnvVarGuard::remove("OPENAI_API_BASE"); + + // Default. + assert_eq!(OpenAIProvider::resolve_api_base(), OPENAI_API_BASE); + + // JCODE_OPENAI_API_BASE wins over OPENAI_BASE_URL / OPENAI_API_BASE. + let _p1 = EnvVarGuard::set("OPENAI_API_BASE", "https://c.example/v1"); + let _p2 = EnvVarGuard::set("OPENAI_BASE_URL", "https://b.example/v1"); + let _p3 = EnvVarGuard::set("JCODE_OPENAI_API_BASE", "https://a.example/v1"); + assert_eq!(OpenAIProvider::resolve_api_base(), "https://a.example/v1"); + + // A trailing /responses is trimmed so callers don't double it. + let _p4 = EnvVarGuard::set("JCODE_OPENAI_API_BASE", "https://a.example/v1/responses"); + assert_eq!(OpenAIProvider::resolve_api_base(), "https://a.example/v1"); + + // Non-URL values are ignored, falling through to the next candidate. 
+ let _p5 = EnvVarGuard::set("JCODE_OPENAI_API_BASE", "not-a-url"); + assert_eq!(OpenAIProvider::resolve_api_base(), "https://b.example/v1"); +} diff --git a/crates/jcode-base/src/provider/openrouter.rs b/crates/jcode-base/src/provider/openrouter.rs index 2d149f9ff7..0679b69ce0 100644 --- a/crates/jcode-base/src/provider/openrouter.rs +++ b/crates/jcode-base/src/provider/openrouter.rs @@ -13,10 +13,10 @@ use super::{EventStream, Provider}; use crate::message::{CacheControl, ContentBlock, Message, Role, StreamEvent, ToolDefinition}; use crate::provider_catalog::{ OPENAI_COMPAT_PROFILE, is_safe_env_file_name, is_safe_env_key_name, - load_api_key_from_env_or_config, normalize_api_base, openai_compatible_profile_by_id, - openai_compatible_profile_id_for_api_base, openai_compatible_profile_static_context_limits, - openai_compatible_profile_static_models, openai_compatible_profiles, - resolve_openai_compatible_profile, + load_api_key_from_env_or_config, load_env_value_from_env_or_config, normalize_api_base, + openai_compatible_profile_by_id, openai_compatible_profile_id_for_api_base, + openai_compatible_profile_static_context_limits, openai_compatible_profile_static_models, + openai_compatible_profiles, resolve_openai_compatible_profile, }; use anyhow::{Context, Result}; use async_trait::async_trait; @@ -928,6 +928,11 @@ pub struct OpenRouterProvider { supports_model_catalog: bool, profile_id: Option<String>, max_tokens: Option<u32>, + /// Extra top-level JSON object fields merged into every chat/completions + /// request body (e.g. NVIDIA NIM DeepSeek-V4 `chat_template_kwargs`). + /// Resolved once at construction from named-profile config or the + /// `JCODE_OPENAI_EXTRA_BODY` env/env-file value. 
+ extra_body: Option<serde_json::Map<String, Value>>, static_models: Vec<String>, static_context_limits: HashMap<String, usize>, send_openrouter_headers: bool, @@ -1035,6 +1040,65 @@ impl OpenRouterProvider { None } + /// Resolve extra request-body fields for an OpenAI-compatible/OpenRouter + /// provider. + /// + /// Sources, in precedence order (later overrides earlier): + /// 1. An optional named-profile `extra_body` config object. + /// 2. The `JCODE_OPENAI_EXTRA_BODY` env var (or the same key inside the + /// profile's `.env` file), parsed as a JSON object string. + /// + /// This lets users inject non-standard parameters that some backends + /// require, e.g. NVIDIA NIM DeepSeek-V4 reasoning models need + /// `chat_template_kwargs = { "thinking": true, "reasoning_effort": "high" }` + /// or they silently hang instead of responding (issue #341). + /// + /// Returns `None` when nothing is configured. Invalid input is logged and + /// ignored rather than failing provider construction. + fn resolve_extra_body( + config: Option<&serde_json::Value>, + env_file: &str, + ) -> Option<serde_json::Map<String, Value>> { + let mut merged = serde_json::Map::new(); + + if let Some(value) = config { + match value.as_object() { + Some(object) => { + for (key, val) in object { + merged.insert(key.clone(), val.clone()); + } + } + None => crate::logging::warn( + "Ignoring provider `extra_body`: expected a table/object of top-level request fields", + ), + } + } + + if let Some(raw) = + load_env_value_from_env_or_config("JCODE_OPENAI_EXTRA_BODY", env_file) + { + match serde_json::from_str::<Value>(&raw) { + Ok(Value::Object(object)) => { + for (key, val) in object { + merged.insert(key, val); + } + } + Ok(_) => crate::logging::warn( + "Ignoring JCODE_OPENAI_EXTRA_BODY: expected a JSON object string, e.g. 
{\"chat_template_kwargs\":{\"thinking\":true}}", + ), + Err(err) => crate::logging::warn(&format!( + "Ignoring invalid JCODE_OPENAI_EXTRA_BODY JSON: {err}" + )), + } + } + + if merged.is_empty() { + None + } else { + Some(merged) + } + } + pub(crate) fn supports_provider_routing_features(&self) -> bool { self.supports_provider_features } @@ -1187,6 +1251,14 @@ impl OpenRouterProvider { ), profile_id: Some(profile_name.to_string()), max_tokens: Self::configured_max_tokens(Some(profile_name)), + extra_body: Self::resolve_extra_body( + profile.extra_body.as_ref(), + profile + .env_file + .as_deref() + .filter(|name| is_safe_env_file_name(name)) + .unwrap_or(DEFAULT_ENV_FILE), + ), static_models, static_context_limits, send_openrouter_headers: false, @@ -1317,6 +1389,7 @@ impl OpenRouterProvider { ProviderRouting::default() }; let max_tokens = Self::configured_max_tokens(profile_id.as_deref()); + let extra_body = Self::resolve_extra_body(None, &configured_env_file_name()); Ok(Self { client: crate::provider::shared_http_client(), @@ -1328,6 +1401,7 @@ impl OpenRouterProvider { supports_model_catalog, profile_id, max_tokens, + extra_body, static_models, static_context_limits, send_openrouter_headers, @@ -1366,6 +1440,7 @@ impl OpenRouterProvider { supports_model_catalog: true, profile_id: None, max_tokens: Self::configured_max_tokens(None), + extra_body: Self::resolve_extra_body(None, DEFAULT_ENV_FILE), static_models: Vec::new(), static_context_limits: HashMap::new(), send_openrouter_headers: true, @@ -1429,6 +1504,7 @@ impl OpenRouterProvider { supports_model_catalog: true, profile_id: Some(resolved.id.clone()), max_tokens: Self::configured_max_tokens(Some(&resolved.id)), + extra_body: Self::resolve_extra_body(None, &resolved.env_file), static_models, static_context_limits, send_openrouter_headers: false, @@ -1650,6 +1726,7 @@ impl OpenRouterProvider { supports_model_catalog: true, profile_id: None, max_tokens: None, + extra_body: None, static_models: Vec::new(), 
static_context_limits: HashMap::new(), send_openrouter_headers: true, diff --git a/crates/jcode-base/src/provider/openrouter_provider_impl.rs b/crates/jcode-base/src/provider/openrouter_provider_impl.rs index 95abb8ab1c..0f85bfc49b 100644 --- a/crates/jcode-base/src/provider/openrouter_provider_impl.rs +++ b/crates/jcode-base/src/provider/openrouter_provider_impl.rs @@ -170,6 +170,18 @@ impl Provider for OpenRouterProvider { request["provider"] = obj; } + // Merge user-configured extra request-body fields last so they can + // satisfy non-standard backend requirements (e.g. NVIDIA NIM + // DeepSeek-V4 `chat_template_kwargs`) and intentionally override any + // jcode-generated field with the same key (issue #341). + if let Some(extra) = self.extra_body.as_ref() { + if let Some(request_obj) = request.as_object_mut() { + for (key, value) in extra { + request_obj.insert(key.clone(), value.clone()); + } + } + } + let message_items = request .get("messages") .and_then(|value| value.as_array()) @@ -633,6 +645,7 @@ impl Provider for OpenRouterProvider { supports_model_catalog: self.supports_model_catalog, profile_id: self.profile_id.clone(), max_tokens: self.max_tokens, + extra_body: self.extra_body.clone(), static_models: self.static_models.clone(), static_context_limits: self.static_context_limits.clone(), send_openrouter_headers: self.send_openrouter_headers, diff --git a/crates/jcode-base/src/provider/openrouter_tests.rs b/crates/jcode-base/src/provider/openrouter_tests.rs index b3a64e06b5..5f6f32f3b1 100644 --- a/crates/jcode-base/src/provider/openrouter_tests.rs +++ b/crates/jcode-base/src/provider/openrouter_tests.rs @@ -1228,6 +1228,7 @@ fn make_provider() -> OpenRouterProvider { supports_model_catalog: true, profile_id: None, max_tokens: None, + extra_body: None, static_models: Vec::new(), static_context_limits: HashMap::new(), send_openrouter_headers: true, @@ -1254,6 +1255,7 @@ fn make_custom_compatible_provider() -> OpenRouterProvider { supports_model_catalog: 
true, profile_id: None, max_tokens: None, + extra_body: None, static_models: Vec::new(), static_context_limits: HashMap::new(), send_openrouter_headers: false, @@ -2266,3 +2268,151 @@ fn runtime_display_name_for_profile_runtime_instance() { assert_eq!(nim.runtime_display_name(), "NVIDIA NIM"); assert_eq!(Provider::name(&nim), "openrouter"); } + +#[test] +fn resolve_extra_body_returns_none_when_unset() { + let _lock = ENV_LOCK.lock(); + let _guard = EnvVarGuard::remove("JCODE_OPENAI_EXTRA_BODY"); + assert!(OpenRouterProvider::resolve_extra_body(None, "nonexistent.env").is_none()); +} + +#[test] +fn resolve_extra_body_parses_env_json_object() { + let _lock = ENV_LOCK.lock(); + let _guard = EnvVarGuard::set( + "JCODE_OPENAI_EXTRA_BODY", + r#"{"chat_template_kwargs":{"thinking":true,"reasoning_effort":"high"}}"#, + ); + let extra = + OpenRouterProvider::resolve_extra_body(None, "nonexistent.env").expect("extra body"); + let kwargs = extra + .get("chat_template_kwargs") + .and_then(|v| v.as_object()) + .expect("chat_template_kwargs object"); + assert_eq!(kwargs.get("thinking"), Some(&serde_json::json!(true))); + assert_eq!( + kwargs.get("reasoning_effort"), + Some(&serde_json::json!("high")) + ); +} + +#[test] +fn resolve_extra_body_ignores_invalid_env_json() { + let _lock = ENV_LOCK.lock(); + let _guard = EnvVarGuard::set("JCODE_OPENAI_EXTRA_BODY", "not-json"); + assert!(OpenRouterProvider::resolve_extra_body(None, "nonexistent.env").is_none()); +} + +#[test] +fn resolve_extra_body_ignores_non_object_env_json() { + let _lock = ENV_LOCK.lock(); + let _guard = EnvVarGuard::set("JCODE_OPENAI_EXTRA_BODY", "[1,2,3]"); + assert!(OpenRouterProvider::resolve_extra_body(None, "nonexistent.env").is_none()); +} + +#[test] +fn resolve_extra_body_merges_config_and_env_with_env_override() { + let _lock = ENV_LOCK.lock(); + let config = serde_json::json!({ + "chat_template_kwargs": {"thinking": false}, + "config_only": 1, + }); + let _guard = EnvVarGuard::set( + 
"JCODE_OPENAI_EXTRA_BODY", + r#"{"chat_template_kwargs":{"thinking":true},"env_only":2}"#, + ); + let extra = OpenRouterProvider::resolve_extra_body(Some(&config), "nonexistent.env") + .expect("merged extra body"); + // Env overrides the colliding key. + assert_eq!( + extra + .get("chat_template_kwargs") + .and_then(|v| v.get("thinking")), + Some(&serde_json::json!(true)) + ); + // Non-colliding keys from both sources survive. + assert_eq!(extra.get("config_only"), Some(&serde_json::json!(1))); + assert_eq!(extra.get("env_only"), Some(&serde_json::json!(2))); +} + +#[test] +fn resolve_extra_body_ignores_non_object_config() { + let _lock = ENV_LOCK.lock(); + let _guard = EnvVarGuard::remove("JCODE_OPENAI_EXTRA_BODY"); + let config = serde_json::json!("not an object"); + assert!(OpenRouterProvider::resolve_extra_body(Some(&config), "nonexistent.env").is_none()); +} + +#[test] +fn named_profile_extra_body_threads_into_provider() { + let _lock = ENV_LOCK.lock(); + let temp = TempDir::new().expect("create temp home"); + let jcode_home = temp.path().join("jcode-home"); + let _jcode_home = EnvVarGuard::set("JCODE_HOME", &jcode_home); + let _home = EnvVarGuard::set("HOME", temp.path()); + let _appdata = EnvVarGuard::set("APPDATA", temp.path().join("AppData").join("Roaming")); + let _env = isolate_openrouter_autodetect_env(); + let _extra_guard = EnvVarGuard::remove("JCODE_OPENAI_EXTRA_BODY"); + + let mut profile = crate::config::NamedProviderConfig { + base_url: "https://integrate.api.nvidia.com/v1".to_string(), + auth: crate::config::NamedProviderAuth::None, + requires_api_key: Some(false), + ..Default::default() + }; + profile.extra_body = Some(serde_json::json!({ + "chat_template_kwargs": {"thinking": true, "reasoning_effort": "high"} + })); + + let provider = OpenRouterProvider::new_named_openai_compatible("my-nim", &profile) + .expect("build named provider"); + let extra = provider.extra_body.as_ref().expect("extra body present"); + assert_eq!( + extra + 
.get("chat_template_kwargs") + .and_then(|v| v.get("reasoning_effort")), + Some(&serde_json::json!("high")) + ); +} + +#[test] +fn named_provider_config_deserializes_nested_extra_body_toml() { + // Verifies the exact `config.toml` shape documented in the README: + // a nested `[providers.<name>.extra_body.chat_template_kwargs]` table + // round-trips into the `serde_json::Value` field correctly. + let toml_str = r#" +type = "openai-compatible" +base_url = "https://integrate.api.nvidia.com/v1" +api_key_env = "NVIDIA_API_KEY" +default_model = "deepseek-ai/deepseek-v4-flash" + +[extra_body.chat_template_kwargs] +thinking = true +reasoning_effort = "high" +"#; + let profile: crate::config::NamedProviderConfig = + toml::from_str(toml_str).expect("parse named provider toml"); + let extra = profile.extra_body.as_ref().expect("extra_body present"); + let kwargs = extra + .get("chat_template_kwargs") + .and_then(|v| v.as_object()) + .expect("chat_template_kwargs object"); + assert_eq!(kwargs.get("thinking"), Some(&serde_json::json!(true))); + assert_eq!( + kwargs.get("reasoning_effort"), + Some(&serde_json::json!("high")) + ); + + // And the resolver hands it back unchanged when no env override is set. 
+ let _lock = ENV_LOCK.lock(); + let _guard = EnvVarGuard::remove("JCODE_OPENAI_EXTRA_BODY"); + let resolved = + OpenRouterProvider::resolve_extra_body(profile.extra_body.as_ref(), "nonexistent.env") + .expect("resolved extra body"); + assert_eq!( + resolved + .get("chat_template_kwargs") + .and_then(|v| v.get("reasoning_effort")), + Some(&serde_json::json!("high")) + ); +} diff --git a/crates/jcode-config-types/Cargo.toml b/crates/jcode-config-types/Cargo.toml index a3c17c3566..f71508e309 100644 --- a/crates/jcode-config-types/Cargo.toml +++ b/crates/jcode-config-types/Cargo.toml @@ -6,3 +6,4 @@ publish = false [dependencies] serde = { version = "1", features = ["derive"] } +serde_json = "1" diff --git a/crates/jcode-config-types/src/lib.rs b/crates/jcode-config-types/src/lib.rs index 0c7e94bb80..c4c712bc7f 100644 --- a/crates/jcode-config-types/src/lib.rs +++ b/crates/jcode-config-types/src/lib.rs @@ -336,7 +336,7 @@ pub struct NamedProviderModelConfig { pub input: Vec<String>, } -#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] #[serde(default)] pub struct NamedProviderConfig { #[serde(rename = "type")] @@ -358,6 +358,17 @@ pub struct NamedProviderConfig { pub allow_provider_pinning: bool, #[serde(default, skip_serializing_if = "Vec::is_empty")] pub models: Vec<NamedProviderModelConfig>, + /// Extra top-level JSON fields merged into every chat/completions request + /// body sent to this provider. Lets users inject non-standard parameters + /// some OpenAI-compatible backends require (e.g. NVIDIA NIM DeepSeek-V4 + /// needs `chat_template_kwargs = { thinking = true, reasoning_effort = "high" }`). + /// Must be a JSON object; keys here override jcode-generated body fields. 
+ #[serde( + default, + alias = "extra-body", + skip_serializing_if = "Option::is_none" + )] + pub extra_body: Option<serde_json::Value>, } impl Default for NamedProviderConfig { @@ -377,6 +388,7 @@ impl Default for NamedProviderConfig { model_catalog: false, allow_provider_pinning: false, models: Vec::new(), + extra_body: None, } } } diff --git a/crates/jcode-provider-core/src/anthropic.rs b/crates/jcode-provider-core/src/anthropic.rs index 1ec030b100..1a2a617593 100644 --- a/crates/jcode-provider-core/src/anthropic.rs +++ b/crates/jcode-provider-core/src/anthropic.rs @@ -65,11 +65,12 @@ impl AnthropicContextMode { pub fn anthropic_context_mode(model: &str) -> AnthropicContextMode { let base = anthropic_strip_1m_suffix(model.trim()).to_ascii_lowercase(); - // Native 1M (default, no opt-in): Opus 4.8 and 4.7. + // Native 1M (default, no opt-in): Opus 4.8 and 4.7, Fable 5. if base.starts_with("claude-opus-4-8") || base.starts_with("claude-opus-4.8") || base.starts_with("claude-opus-4-7") || base.starts_with("claude-opus-4.7") + || base.starts_with("claude-fable-5") { return AnthropicContextMode::Native1M; } diff --git a/crates/jcode-provider-core/src/models.rs b/crates/jcode-provider-core/src/models.rs index 040121178e..a6a1ba43ad 100644 --- a/crates/jcode-provider-core/src/models.rs +++ b/crates/jcode-provider-core/src/models.rs @@ -1,6 +1,7 @@ /// Available Claude models used by model lists and provider routing. pub const ALL_CLAUDE_MODELS: &[&str] = &[ "claude-opus-4-8", + "claude-fable-5", "claude-opus-4-6", "claude-opus-4-6[1m]", "claude-sonnet-4-6", @@ -145,6 +146,7 @@ pub fn provider_for_model(model: &str) -> Option<&'static str> { /// covered, while unknown/future Claude ids fall through to the dynamic cache. 
fn base_is_known_claude_model(base: &str) -> bool { const KNOWN_CLAUDE_PREFIXES: &[&str] = &[ + "claude-fable-5", "claude-opus-4-8", "claude-opus-4.8", "claude-opus-4-7", @@ -289,6 +291,11 @@ mod tests { context_limit_for_model_with_provider("claude-opus-4-7", Some("claude")), Some(1_000_000) ); + // Fable 5 is native-1M as well. + assert_eq!( + context_limit_for_model_with_provider("claude-fable-5", Some("claude")), + Some(1_000_000) + ); // Opt-in 1M: 200K by default, 1M only via the [1m] suffix. assert_eq!( context_limit_for_model_with_provider("claude-opus-4-6", Some("claude")), diff --git a/crates/jcode-provider-core/src/pricing.rs b/crates/jcode-provider-core/src/pricing.rs index 2521568efe..2e506179f5 100644 --- a/crates/jcode-provider-core/src/pricing.rs +++ b/crates/jcode-provider-core/src/pricing.rs @@ -15,6 +15,14 @@ pub fn anthropic_api_pricing(model: &str) -> Option<RouteCheapnessEstimate> { let base = model.strip_suffix("[1m]").unwrap_or(model); let long_context = model.ends_with("[1m]"); match base { + "claude-fable-5" => Some(RouteCheapnessEstimate::metered( + RouteCostSource::Heuristic, + RouteCostConfidence::Low, + usd_to_micros(if long_context { 6.0 } else { 3.0 }), + usd_to_micros(if long_context { 22.5 } else { 15.0 }), + Some(usd_to_micros(if long_context { 0.6 } else { 0.3 })), + Some("Estimated from Sonnet 4.6 API pricing".to_string()), + )), "claude-opus-4-8" => Some(RouteCheapnessEstimate::metered( RouteCostSource::PublicApiPricing, RouteCostConfidence::Exact, diff --git a/crates/jcode-provider-gemini/src/lib.rs b/crates/jcode-provider-gemini/src/lib.rs index 568e6ebc2a..894b31c37a 100644 --- a/crates/jcode-provider-gemini/src/lib.rs +++ b/crates/jcode-provider-gemini/src/lib.rs @@ -252,6 +252,25 @@ pub fn build_system_instruction_with_tool_guard( } pub fn build_contents(messages: &[Message]) -> Vec<GeminiContent> { + // Gemini-3 attaches an opaque `thoughtSignature` to function-call parts, and + // the Cloud Code / Antigravity backend 
rejects an assistant turn whose + // function calls are ALL unsigned with `Function call is missing a + // thought_signature in functionCall parts` (HTTP 400, issue #339). This + // happens because: + // * a parallel multi-call turn only signs its FIRST call (siblings persist + // unsigned), and + // * locally synthesized tool calls (batch sub-calls, manual tool use, + // auto-poke continuations, recovery) and pre-signature/imported sessions + // carry no signature at all. + // + // Live-verified backend rule: a turn is accepted as long as *at least one* + // of its function calls carries a (valid) signature; a fully-unsigned turn + // 400s. All calls in a session share the same opaque reasoning channel and + // the backend accepts a previously-emitted signature replayed on later + // calls, so we carry the most recent real signature forward across the whole + // conversation onto any function call that lacks one. This keeps multi-call + // turns and synthesized/imported histories replayable instead of hard-failing. 
+ let mut last_signature: Option<String> = None; messages .iter() .filter_map(|message| { @@ -278,16 +297,21 @@ pub fn build_contents(messages: &[Message]) -> Vec<GeminiContent> { input, thought_signature, } => { + let own_signature = thought_signature + .as_ref() + .filter(|sig| !sig.is_empty()) + .cloned(); + if own_signature.is_some() { + last_signature = own_signature.clone(); + } + let signature = own_signature.or_else(|| last_signature.clone()); parts.push(GeminiPart { function_call: Some(GeminiFunctionCall { name: name.clone(), args: ToolCall::input_as_object(input), id: Some(id.clone()), }), - thought_signature: thought_signature - .as_ref() - .filter(|sig| !sig.is_empty()) - .cloned(), + thought_signature: signature, ..Default::default() }); } diff --git a/crates/jcode-setup-hints/src/lib.rs b/crates/jcode-setup-hints/src/lib.rs index c7f2c319e1..eaea8de6dc 100644 --- a/crates/jcode-setup-hints/src/lib.rs +++ b/crates/jcode-setup-hints/src/lib.rs @@ -39,6 +39,7 @@ use windows_setup::{ }; #[derive(Debug, Clone, Serialize, Deserialize, Default)] +#[serde(default)] pub struct SetupHintsState { pub launch_count: u64, pub hotkey_configured: bool, @@ -120,10 +121,24 @@ impl SetupHintsState { } pub fn load() -> Self { - Self::path() - .ok() - .and_then(|p| storage::read_json(&p).ok()) - .unwrap_or_default() + let Ok(path) = Self::path() else { + return Self::default(); + }; + Self::load_from(&path) + } + + /// Load state from `path`, falling back to its `.bak` sibling. + /// + /// The atomic writer keeps the previous version at `.bak`. If the primary + /// file is missing or unreadable (deleted, interrupted swap), fall back to + /// it instead of silently resetting state like `launch_count`, which + /// downstream heuristics (e.g. first-run onboarding) rely on. 
+ fn load_from(path: &std::path::Path) -> Self { + if let Ok(state) = storage::read_json(path) { + return state; + } + let bak = path.with_extension("bak"); + storage::read_json(&bak).unwrap_or_default() } pub fn save(&self) -> Result<()> { diff --git a/crates/jcode-setup-hints/src/setup_hints_tests.rs b/crates/jcode-setup-hints/src/setup_hints_tests.rs index 62af9ad5fa..25300ec65f 100644 --- a/crates/jcode-setup-hints/src/setup_hints_tests.rs +++ b/crates/jcode-setup-hints/src/setup_hints_tests.rs @@ -213,3 +213,38 @@ fn nudge_budget_caps_at_max_and_persists() { assert_eq!(state.terminal_nudge_count, MAX_TERMINAL_NUDGES); assert!(!state.nudge_budget_remaining()); } + +#[test] +fn load_from_falls_back_to_bak_when_primary_missing() { + let dir = tempfile::tempdir().expect("tempdir"); + let path = dir.path().join("setup_hints.json"); + let bak = dir.path().join("setup_hints.bak"); + + std::fs::write(&bak, r#"{"launch_count":42}"#).unwrap(); + + // Primary file missing: must recover launch_count from the .bak instead of + // resetting to default (which would re-trigger first-run onboarding). 
+ let loaded = SetupHintsState::load_from(&path); + assert_eq!(loaded.launch_count, 42); +} + +#[test] +fn load_from_falls_back_to_bak_when_primary_corrupt_without_inline_recovery() { + let dir = tempfile::tempdir().expect("tempdir"); + let path = dir.path().join("setup_hints.json"); + let bak = dir.path().join("setup_hints.bak"); + + std::fs::write(&path, b"{not json").unwrap(); + std::fs::write(&bak, r#"{"launch_count":7}"#).unwrap(); + + let loaded = SetupHintsState::load_from(&path); + assert_eq!(loaded.launch_count, 7); +} + +#[test] +fn load_from_defaults_when_both_missing() { + let dir = tempfile::tempdir().expect("tempdir"); + let path = dir.path().join("setup_hints.json"); + let loaded = SetupHintsState::load_from(&path); + assert_eq!(loaded.launch_count, 0); +} diff --git a/crates/jcode-tui-mermaid/src/lib.rs b/crates/jcode-tui-mermaid/src/lib.rs index f1d5c6a731..c07bd725a8 100644 --- a/crates/jcode-tui-mermaid/src/lib.rs +++ b/crates/jcode-tui-mermaid/src/lib.rs @@ -218,6 +218,8 @@ pub use model::{ mod cache_render; #[path = "mermaid_content.rs"] mod content_render; +#[path = "mermaid_inline.rs"] +mod inline_image; #[path = "mermaid_runtime.rs"] mod runtime; #[path = "mermaid_viewport.rs"] @@ -242,6 +244,9 @@ pub use runtime::{ is_video_export_mode, protocol_type, register_external_image, register_inline_image, set_video_export_mode, }; +pub use inline_image::{ + inline_image_dims, inline_image_id, materialize_inline_image, +}; pub use viewport_render::{ invalidate_render_state, render_image_widget_viewport, render_image_widget_viewport_precise, }; diff --git a/crates/jcode-tui-mermaid/src/mermaid_inline.rs b/crates/jcode-tui-mermaid/src/mermaid_inline.rs new file mode 100644 index 0000000000..bfd323ee8d --- /dev/null +++ b/crates/jcode-tui-mermaid/src/mermaid_inline.rs @@ -0,0 +1,294 @@ +//! Inline raster image support, decoupled from the Mermaid diagram pipeline. +//! +//! 
Real images (pasted screenshots, `read` of an image file, generated images) +//! are fundamentally different from Mermaid diagrams: they arrive as base64 +//! payloads, never need SVG layout/aspect buckets, and should be rendered +//! "fit to width" rather than cropped to an estimated height. +//! +//! This module provides a small, lazy API: +//! +//! * [`inline_image_dims`] - cheap, header-only dimensions (cached by id). Used +//! at *prepare* time to compute placeholder height without decoding the whole +//! image. +//! * [`materialize_inline_image`] - full decode + PNG-to-disk + insert into the +//! shared render cache. Used at *draw* time, only for images currently on +//! screen, so a session with many images only ever decodes the ones you look +//! at. +//! +//! Both share a stable content id so the placeholder computed at prepare time +//! lines up with the bytes rendered at draw time. Rendering itself reuses the +//! existing [`crate::render_image_widget_fit`] path via the shared +//! `RENDER_CACHE` (keyed by id -> on-disk PNG path). + +use super::*; + +/// Cap on the dimension cache. Header parsing is cheap, but a long session can +/// accumulate many distinct images; bound the metadata map so it cannot grow +/// without limit. +const INLINE_DIMS_MAX: usize = 256; + +#[derive(Clone, Copy)] +struct InlineDims { + width: u32, + height: u32, +} + +/// Cache of `id -> (width, height)` so repeated prepare passes never re-parse +/// the same image header. Bounded by insertion order. +static INLINE_DIMS_CACHE: LazyLock<Mutex<(HashMap<u64, InlineDims>, VecDeque<u64>)>> = + LazyLock::new(|| Mutex::new((HashMap::new(), VecDeque::new()))); + +/// Stable content id for an inline image, derived from its media type and +/// base64 payload. No decoding is performed. 
+pub fn inline_image_id(media_type: &str, data_b64: &str) -> u64 { + let mut hasher = std::hash::DefaultHasher::new(); + media_type.hash(&mut hasher); + data_b64.as_bytes().hash(&mut hasher); + hasher.finish() +} + +fn dims_cache_get(id: u64) -> Option<InlineDims> { + INLINE_DIMS_CACHE.lock().ok()?.0.get(&id).copied() +} + +fn dims_cache_put(id: u64, dims: InlineDims) { + if let Ok(mut guard) = INLINE_DIMS_CACHE.lock() { + let (map, order) = &mut *guard; + if map.insert(id, dims).is_none() { + order.push_back(id); + while order.len() > INLINE_DIMS_MAX { + if let Some(old) = order.pop_front() { + map.remove(&old); + } + } + } + } +} + +/// Cheap dimensions for an inline image: `(id, width, height)`. +/// +/// Tries a header-only parse of a decoded prefix first (so a multi-megabyte +/// screenshot only touches its first few KB), and falls back to a full decode +/// only if the header could not be understood. Results are cached by id. +pub fn inline_image_dims(media_type: &str, data_b64: &str) -> Option<(u64, u32, u32)> { + let id = inline_image_id(media_type, data_b64); + if let Some(dims) = dims_cache_get(id) { + return Some((id, dims.width, dims.height)); + } + + // Header-only fast path: decode just a prefix of the base64 payload. + if let Some((w, h)) = dims_from_b64_prefix(data_b64) { + let dims = InlineDims { + width: w, + height: h, + }; + dims_cache_put(id, dims); + return Some((id, w, h)); + } + + // Fallback: full decode (only happens once per image, then cached). 
+ let bytes = base64::engine::general_purpose::STANDARD + .decode(data_b64) + .ok()?; + let image = image::load_from_memory(&bytes).ok()?; + let (w, h) = image.dimensions(); + let dims = InlineDims { + width: w, + height: h, + }; + dims_cache_put(id, dims); + Some((id, w, h)) +} + +/// Materialize an inline image for rendering: decode it, write a PNG-equivalent +/// file to the shared cache directory, and register it in `RENDER_CACHE` so the +/// existing `render_image_widget_*` paths can draw it by id. +/// +/// Idempotent and cheap on repeat (returns the cached entry without re-decoding +/// once the file exists). Returns `(id, width, height)` on success. +pub fn materialize_inline_image(media_type: &str, data_b64: &str) -> Option<(u64, u32, u32)> { + let id = inline_image_id(media_type, data_b64); + + if let Ok(mut cache) = RENDER_CACHE.lock() + && let Some(existing) = cache.get(id, None, Some(RenderProfile::default())) + { + return Some((id, existing.width, existing.height)); + } + + let bytes = base64::engine::general_purpose::STANDARD + .decode(data_b64) + .ok()?; + let image = image::load_from_memory(&bytes).ok()?; + let (width, height) = image.dimensions(); + dims_cache_put( + id, + InlineDims { + width, + height, + }, + ); + + let ext = inline_image_extension(media_type); + if let Ok(mut cache) = RENDER_CACHE.lock() { + let path = cache.cache_dir.join(format!("{:016x}_inline.{}", id, ext)); + if !path.exists() && fs::write(&path, &bytes).is_err() { + return None; + } + cache.insert( + id, + RenderProfile::default(), + CachedDiagram { + path, + width, + height, + }, + ); + return Some((id, width, height)); + } + + None +} + +fn inline_image_extension(media_type: &str) -> &'static str { + match media_type { + "image/png" => "png", + "image/jpeg" => "jpg", + "image/gif" => "gif", + "image/webp" => "webp", + "image/bmp" => "bmp", + "image/x-icon" | "image/vnd.microsoft.icon" => "ico", + _ => "img", + } +} + +/// Decode a bounded prefix of the base64 payload 
and try to read image +/// dimensions straight from the container header (PNG/JPEG/GIF/BMP/WEBP). +fn dims_from_b64_prefix(data_b64: &str) -> Option<(u32, u32)> { + // 16 KB of base64 -> 12 KB of header bytes, plenty for every supported + // container's dimension fields while staying far cheaper than a full decode. + const PREFIX_B64_CHARS: usize = 16 * 1024; + let take = data_b64.len().min(PREFIX_B64_CHARS); + // base64 decodes in 4-char groups; trim to a group boundary so the prefix + // is self-consistent without trailing padding. + let take = take - (take % 4); + if take == 0 { + return None; + } + let prefix = &data_b64.as_bytes()[..take]; + let bytes = base64::engine::general_purpose::STANDARD + .decode(prefix) + .ok()?; + dimensions_from_header(&bytes) +} + +/// Parse image dimensions directly from container header bytes. Mirrors the +/// lightweight parser used by the `read` tool, but lives here so the inline +/// renderer has no cross-crate dependency for header sniffing. +pub(crate) fn dimensions_from_header(data: &[u8]) -> Option<(u32, u32)> { + // PNG: 8-byte signature then IHDR (width/height as big-endian u32). + if data.len() > 24 && &data[0..8] == b"\x89PNG\r\n\x1a\n" { + let width = u32::from_be_bytes([data[16], data[17], data[18], data[19]]); + let height = u32::from_be_bytes([data[20], data[21], data[22], data[23]]); + if width > 0 && height > 0 { + return Some((width, height)); + } + } + + // JPEG: scan for a Start-Of-Frame marker. + if data.len() > 2 && data[0] == 0xFF && data[1] == 0xD8 { + let mut i = 2; + while i + 9 < data.len() { + if data[i] != 0xFF { + i += 1; + continue; + } + let marker = data[i + 1]; + // SOF0 (baseline) / SOF1 / SOF2 (progressive) etc. 
+ if (0xC0..=0xCF).contains(&marker) + && marker != 0xC4 + && marker != 0xC8 + && marker != 0xCC + { + let height = u16::from_be_bytes([data[i + 5], data[i + 6]]) as u32; + let width = u16::from_be_bytes([data[i + 7], data[i + 8]]) as u32; + if width > 0 && height > 0 { + return Some((width, height)); + } + return None; + } + if i + 3 < data.len() { + let len = u16::from_be_bytes([data[i + 2], data[i + 3]]) as usize; + i += 2 + len; + } else { + break; + } + } + } + + // GIF: logical screen descriptor (little-endian u16). + if data.len() > 10 && (&data[0..6] == b"GIF87a" || &data[0..6] == b"GIF89a") { + let width = u16::from_le_bytes([data[6], data[7]]) as u32; + let height = u16::from_le_bytes([data[8], data[9]]) as u32; + if width > 0 && height > 0 { + return Some((width, height)); + } + } + + // BMP: DIB header (BITMAPINFOHEADER) width/height at offset 18/22. + if data.len() > 26 && &data[0..2] == b"BM" { + let width = i32::from_le_bytes([data[18], data[19], data[20], data[21]]); + let height = i32::from_le_bytes([data[22], data[23], data[24], data[25]]); + if width > 0 && height != 0 { + return Some((width as u32, height.unsigned_abs())); + } + } + + // WEBP (VP8X/VP8L/VP8): "RIFF"...."WEBP". + if data.len() > 30 && &data[0..4] == b"RIFF" && &data[8..12] == b"WEBP" { + let fourcc = &data[12..16]; + if fourcc == b"VP8X" && data.len() > 30 { + let w = 1 + (u32::from(data[24]) | (u32::from(data[25]) << 8) | (u32::from(data[26]) << 16)); + let h = 1 + (u32::from(data[27]) | (u32::from(data[28]) << 8) | (u32::from(data[29]) << 16)); + return Some((w, h)); + } + if fourcc == b"VP8 " && data.len() > 30 { + // Lossy: dimensions live in the key-frame header. + let w = (u16::from_le_bytes([data[26], data[27]]) & 0x3FFF) as u32; + let h = (u16::from_le_bytes([data[28], data[29]]) & 0x3FFF) as u32; + if w > 0 && h > 0 { + return Some((w, h)); + } + } + } + + None +} + +#[cfg(test)] +mod tests { + use super::*; + + fn tiny_png_b64() -> String { + // 1x1 transparent PNG. 
+ "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAAC0lEQVR42mNk+M9QDwADhgGAWjR9awAAAABJRU5ErkJggg==".to_string() + } + + #[test] + fn inline_dims_reads_png_header_without_full_decode() { + let b64 = tiny_png_b64(); + let (id, w, h) = inline_image_dims("image/png", &b64).expect("dims"); + assert_eq!((w, h), (1, 1)); + // Cached and id-stable. + let again = inline_image_dims("image/png", &b64).expect("dims again"); + assert_eq!(again.0, id); + } + + #[test] + fn inline_id_is_stable_and_distinct() { + let a = inline_image_id("image/png", "AAAA"); + let b = inline_image_id("image/png", "AAAA"); + let c = inline_image_id("image/png", "BBBB"); + assert_eq!(a, b); + assert_ne!(a, c); + } +} diff --git a/crates/jcode-tui-messages/src/lib.rs b/crates/jcode-tui-messages/src/lib.rs index 33c7b877e3..b112ec5017 100644 --- a/crates/jcode-tui-messages/src/lib.rs +++ b/crates/jcode-tui-messages/src/lib.rs @@ -13,7 +13,7 @@ pub use message::{ transcript_preview_lines, truncate_transcript_preview, }; pub use prepared::{ - CopyTarget, EditToolRange, ImageRegion, PreparedChatFrame, PreparedMessages, PreparedSection, - PreparedSectionKind, + CopyTarget, EditToolRange, ImageRegion, ImageRegionRender, PreparedChatFrame, PreparedMessages, + PreparedSection, PreparedSectionKind, }; pub use wrapped_line_map::WrappedLineMap; diff --git a/crates/jcode-tui-messages/src/prepared.rs b/crates/jcode-tui-messages/src/prepared.rs index 75c522c12d..e5146830ad 100644 --- a/crates/jcode-tui-messages/src/prepared.rs +++ b/crates/jcode-tui-messages/src/prepared.rs @@ -14,6 +14,20 @@ pub struct ImageRegion { pub hash: u64, /// Total height of the image placeholder in lines. pub height: u16, + /// How the image should be fit into its region when drawn. + pub render: ImageRegionRender, +} + +/// Strategy for fitting an image into its placeholder region at draw time. +#[derive(Clone, Copy, PartialEq, Eq, Debug, Default)] +pub enum ImageRegionRender { + /// Crop into the pre-estimated region height. 
Used for Mermaid diagrams, + /// whose placeholder height already matches their rendered aspect ratio. + #[default] + Crop, + /// Scale-to-fit (preserve aspect, fit width and height). Used for inline + /// raster images so resizes and font-metric mismatches never slice them. + Fit, } #[derive(Clone, Debug)] @@ -74,6 +88,8 @@ pub enum PreparedSectionKind { /// `current` reasoning-display mode. Reasoning, Streaming, + /// Inline images rendered in the transcript flow (below the body). + InlineImages, } #[derive(Clone)] @@ -141,6 +157,7 @@ impl PreparedChatFrame { end_line: region.end_line + line_start, hash: region.hash, height: region.height, + render: region.render, })); edit_tool_ranges.extend(prepared.edit_tool_ranges.iter().map(|range| EditToolRange { edit_index: range.edit_index, diff --git a/crates/jcode-tui/src/tui/app.rs b/crates/jcode-tui/src/tui/app.rs index 926db0991c..7c600cf581 100644 --- a/crates/jcode-tui/src/tui/app.rs +++ b/crates/jcode-tui/src/tui/app.rs @@ -526,6 +526,23 @@ pub(super) struct CompactedHistoryLazyState { pub pending_request_visible: Option<usize>, } +/// Pending viewport anchor used to keep the chat stable when older compacted +/// history is loaded in. Older messages are prepended above the current view, +/// which would otherwise teleport the reader to the new absolute top. We instead +/// remember the reader's distance from the bottom (which is invariant under a +/// top-side prepend) and let the next render resolve it into an absolute offset. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub(super) struct HistoryScrollAnchor { + /// Wrapped lines between the top of the viewport and the bottom of the + /// transcript at the moment the load was requested. Invariant across the + /// prepend, so `new_total - lines_from_bottom` reproduces the same view. + pub lines_from_bottom: usize, + /// Total wrapped line count of the frame this anchor was captured from. 
Used + /// to detect when a frame with the newly-loaded content has rendered (its + /// total differs), so the anchor can be reconciled into `scroll_offset`. + pub base_total: usize, +} + #[derive(Debug, Clone, PartialEq, Eq)] pub(super) struct OvernightAutoPokeFingerprint { pub run_id: String, @@ -666,6 +683,10 @@ pub struct App { display_user_message_count: usize, display_edit_tool_message_count: usize, compacted_history_lazy: CompactedHistoryLazyState, + /// When older compacted history has just been loaded, this anchors the + /// viewport to the content the reader was looking at so the prepend does not + /// visibly jump. Resolved into `scroll_offset` by the next render frame. + pending_history_anchor: Option<HistoryScrollAnchor>, input: String, command_candidates_cache: RefCell<Option<CommandCandidatesCache>>, cursor_pos: usize, diff --git a/crates/jcode-tui/src/tui/app/commands.rs b/crates/jcode-tui/src/tui/app/commands.rs index 8e2c6e27b5..34dd8ac608 100644 --- a/crates/jcode-tui/src/tui/app/commands.rs +++ b/crates/jcode-tui/src/tui/app/commands.rs @@ -1626,6 +1626,11 @@ pub(super) fn handle_session_command(app: &mut App, trimmed: &str) -> bool { return true; } + if trimmed == "/commit-push" || trimmed == "/commit-and-push" { + handle_commit_push_command_local(app); + return true; + } + if trimmed == "/resume" || trimmed == "/sessions" || trimmed == "/session" { app.open_session_picker(); return true; @@ -2029,6 +2034,15 @@ pub(super) fn build_commit_prompt() -> String { "Make interactive, logical commits for the current uncommitted work. Inspect the git state first, including unstaged and staged changes. Group related changes into small coherent commits, staging only the files or hunks that belong together. Preserve unrelated user or agent work, do not discard changes, and do not amend existing commits unless clearly necessary. For each commit, use a concise conventional-style message when possible. 
Validate as appropriate for the changed files before committing, and report the commits created plus any remaining uncommitted changes.".to_string() } +pub(super) fn build_commit_push_prompt() -> String { + let mut prompt = build_commit_prompt(); + prompt.push(' '); + prompt.push_str( + "After creating the commits, push them to the remote tracking branch with git push (set the upstream with git push -u if the branch has no upstream yet). If the push fails, report the error instead of force-pushing, and never force-push or rewrite already-pushed history. Finally, report the commits created and the push result.", + ); + prompt +} + pub(super) fn commit_launch_notice(interrupted: bool) -> String { if interrupted { "👉 Interrupting and starting logical commits...".to_string() @@ -2037,6 +2051,14 @@ pub(super) fn commit_launch_notice(interrupted: bool) -> String { } } +pub(super) fn commit_push_launch_notice(interrupted: bool) -> String { + if interrupted { + "👉 Interrupting and starting logical commits + push...".to_string() + } else { + "🚀 Starting logical commits + push...".to_string() + } +} + fn handle_commit_command_local(app: &mut App) { let prompt = build_commit_prompt(); if app.is_processing { @@ -2052,6 +2074,21 @@ fn handle_commit_command_local(app: &mut App) { } } +fn handle_commit_push_command_local(app: &mut App) { + let prompt = build_commit_push_prompt(); + if app.is_processing { + super::commands_improve::interrupt_and_queue_synthetic_message( + app, + prompt, + "Interrupting for /commit-push...", + commit_push_launch_notice(true), + ); + } else { + app.push_display_message(DisplayMessage::system(commit_push_launch_notice(false))); + super::commands_improve::start_synthetic_user_turn(app, prompt); + } +} + fn handle_selfdev_command(app: &mut App, trimmed: &str) -> bool { if !trimmed.starts_with("/selfdev") { return false; diff --git a/crates/jcode-tui/src/tui/app/debug_bench.rs b/crates/jcode-tui/src/tui/app/debug_bench.rs index 4a40b971bf..cdfd1e7888 
100644 --- a/crates/jcode-tui/src/tui/app/debug_bench.rs +++ b/crates/jcode-tui/src/tui/app/debug_bench.rs @@ -1130,7 +1130,7 @@ impl App { /// scroll position one content line at a time, captures the resulting widget /// placements, and runs the shared stability analyzer. pub(in crate::tui::app) fn run_widget_stability(&mut self, raw: Option<&str>) -> String { - use crate::tui::info_widget_stability::{PlacedRect, analyze_frames, intern_kind}; + use crate::tui::info_widget_stability::{PlacedRect, intern_kind}; use ratatui::Terminal; use ratatui::backend::TestBackend; @@ -1186,6 +1186,10 @@ impl App { // Walk from top to bottom one (or `step`) content lines at a time, recording // the widget placements at each scroll position. let mut frames: Vec<Vec<PlacedRect>> = Vec::new(); + // Absolute transcript line shown on the first visible row of each frame, so + // the analyzer can subtract the scroll-ride and report content-relative + // travel (how much widgets move *relative to the text* they sit beside). 
+ let mut scroll_tops_abs: Vec<i64> = Vec::new(); let mut frame_payloads: Vec<serde_json::Value> = Vec::new(); self.auto_scroll_paused = true; @@ -1197,6 +1201,7 @@ impl App { errors.push(format!("draw error at scroll_top {}: {}", scroll_top, e)); break; } + scroll_tops_abs.push(crate::tui::ui::last_resolved_chat_scroll() as i64); let placed: Vec<PlacedRect> = match crate::tui::visual_debug::latest_frame() { Some(frame) => frame .info_widgets @@ -1236,7 +1241,10 @@ impl App { scroll_top = (scroll_top + step).min(max_scroll); } - let report = analyze_frames(&frames); + let report = crate::tui::info_widget_stability::analyze_frames_with_scroll( + &frames, + &scroll_tops_abs, + ); saved_state.restore(self); if !was_visual_debug { @@ -1262,6 +1270,7 @@ impl App { "notes": [ "Scrolls the current transcript one content line at a time over an offscreen backend.", "travel_per_100_lines = total widget x+y movement per 100 scroll lines (lower is calmer).", + "content_travel_per_100_lines = movement RELATIVE TO THE TRANSCRIPT (scroll-ride subtracted); ~0 means widgets stick to one negative-space spot and just scroll along.", "flicker_per_100_lines = widget appear/disappear transitions per 100 scroll lines.", "distraction_per_100_lines = travel + weighted flicker; the single headline number." 
], diff --git a/crates/jcode-tui/src/tui/app/helpers.rs b/crates/jcode-tui/src/tui/app/helpers.rs index 838a297364..7497a957ab 100644 --- a/crates/jcode-tui/src/tui/app/helpers.rs +++ b/crates/jcode-tui/src/tui/app/helpers.rs @@ -491,6 +491,7 @@ pub(super) fn inferred_reasoning_efforts( || model.starts_with("claude-"); if is_anthropic { let supports_effort = model.contains("claude-mythos") + || model.contains("claude-fable-5") || model.contains("claude-opus-4-8") || model.contains("claude-opus-4-7") || model.contains("claude-opus-4-6") @@ -501,7 +502,10 @@ pub(super) fn inferred_reasoning_efforts( if !supports_effort { return Vec::new(); } - if model.contains("claude-opus-4-8") || model.contains("claude-opus-4-7") { + if model.contains("claude-fable-5") + || model.contains("claude-opus-4-8") + || model.contains("claude-opus-4-7") + { return vec!["none", "low", "medium", "high", "xhigh"]; } return vec!["none", "low", "medium", "high"]; diff --git a/crates/jcode-tui/src/tui/app/input_help.rs b/crates/jcode-tui/src/tui/app/input_help.rs index 771661622a..471a6cda69 100644 --- a/crates/jcode-tui/src/tui/app/input_help.rs +++ b/crates/jcode-tui/src/tui/app/input_help.rs @@ -58,6 +58,9 @@ impl App { "commit" => { "/commit\nAsk the agent to inspect current uncommitted changes and create interactive, logical commits.\n\nThe agent should group related files or hunks, preserve unrelated work, validate as appropriate, and report the commits created plus anything left uncommitted." } + "commit-push" | "commit-and-push" => { + "/commit-push\nSame as /commit, then push the new commits to the remote tracking branch.\n\nThe agent groups related changes into logical commits, preserves unrelated work, then runs git push (using git push -u if the branch has no upstream). It will not force-push or rewrite already-pushed history, and reports the commits created plus the push result." 
+ } "catchup" => { "/catchup\nOpen the Catch Up picker for finished sessions that need attention.\n\n/catchup next\nTeleport to the next session needing attention and open a Catch Up brief in the side panel.\n\n/catchup list\nAlias for opening the picker." } diff --git a/crates/jcode-tui/src/tui/app/local.rs b/crates/jcode-tui/src/tui/app/local.rs index b696bbb9c8..13eff2900f 100644 --- a/crates/jcode-tui/src/tui/app/local.rs +++ b/crates/jcode-tui/src/tui/app/local.rs @@ -68,6 +68,9 @@ pub(super) fn handle_tick(app: &mut App) -> bool { needs_redraw |= app.update_chat_overscroll(); needs_redraw |= app.tick_reasoning_collapse(); needs_redraw |= app.update_pinned_images_auto_hide(); + // Adopt the resolved scroll position once a frame containing newly loaded + // older history has rendered, so manual scrolling resumes seamlessly. + needs_redraw |= app.reconcile_history_anchor(); if app.submit_input_on_startup && !app.is_processing { app.submit_input_on_startup = false; app.submit_input(); diff --git a/crates/jcode-tui/src/tui/app/navigation.rs b/crates/jcode-tui/src/tui/app/navigation.rs index 96b1fca988..c8293d192d 100644 --- a/crates/jcode-tui/src/tui/app/navigation.rs +++ b/crates/jcode-tui/src/tui/app/navigation.rs @@ -1,9 +1,6 @@ use super::*; use crate::tui::ui::input_ui; use ratatui::layout::Rect; -use std::time::Duration; - -const PINNED_IMAGES_AUTO_HIDE_AFTER: Duration = Duration::from_secs(20); #[derive(Clone, Debug, PartialEq, Eq)] struct MouseScrollTraceState { @@ -490,69 +487,17 @@ impl App { } fn side_pane_has_visual_images_ignoring_user_hidden(&self) -> bool { - if !self.pin_images || self.side_panel.focused_page().is_some() || self.diff_mode.is_file() - { - return false; - } - - if self.is_remote { - !self.remote_side_pane_images.is_empty() - } else { - crate::session::has_rendered_images(&self.session) - } + // Images now render inline in the transcript flow, not in the side + // panel, so they no longer drive the side-panel visibility heuristics. 
+ false } pub(super) fn update_pinned_images_auto_hide(&mut self) -> bool { - if !self.pin_images || self.side_panel.focused_page().is_some() || self.diff_mode.is_file() - { - self.pinned_images_auto_hide_deadline = None; - self.pinned_images_seen_count = 0; - return false; - } - - let image_count = if self.is_remote { - self.remote_side_pane_images.len() - } else { - crate::session::render_images(&self.session).len() - }; - if image_count == 0 { - self.pinned_images_auto_hide_deadline = None; - self.pinned_images_seen_count = 0; - return false; - } - - let now = Instant::now(); - let mut needs_redraw = false; - if image_count > self.pinned_images_seen_count { - self.pinned_images_seen_count = image_count; - // Don't re-reveal a panel the user explicitly hid (Alt+M). This also - // keeps the hide sticky across server reloads/reconnects, where the - // seen count resets to 0 while images are repopulated from the - // history snapshot (which would otherwise look like "new images"). - if !self.side_panel_explicit_hidden { - self.side_panel_user_hidden = false; - self.pinned_images_auto_hide_deadline = Some(now + PINNED_IMAGES_AUTO_HIDE_AFTER); - needs_redraw = true; - } - } - - if let Some(deadline) = self.pinned_images_auto_hide_deadline - && now >= deadline - { - self.pinned_images_auto_hide_deadline = None; - if !self.side_panel_user_hidden && self.side_pane_has_visual_images() { - self.side_panel_user_hidden = true; - self.set_diff_pane_focus(false); - self.sync_diagram_fit_context(); - self.push_display_message(DisplayMessage::system(format!( - "Pinned image side panel hidden automatically. Press {} to show it again.", - crate::tui::keybind::side_panel_toggle_key_label() - ))); - needs_redraw = true; - } - } - - needs_redraw + // Images render inline in the transcript now, so there is no longer a + // pinned-image side panel to auto-reveal or auto-hide. 
+ self.pinned_images_auto_hide_deadline = None; + self.pinned_images_seen_count = 0; + false } fn side_pane_line_scroll_amount(&self) -> usize { @@ -1459,6 +1404,21 @@ impl App { pub(super) fn scroll_up(&mut self, amount: usize) -> bool { // Scrolling up cancels any pending overscroll rebound line immediately. self.chat_overscroll_last = None; + // While older compacted history is still settling on screen, the renderer + // is anchored to a distance-from-bottom rather than `scroll_offset`. Keep + // scrolling continuous by moving the anchor itself instead of a stale + // offset the renderer is currently ignoring. + if let Some(mut anchor) = self.pending_history_anchor { + let total = super::super::ui::last_total_wrapped_lines(); + anchor.lines_from_bottom = anchor + .lines_from_bottom + .saturating_add(amount) + .min(total.max(anchor.lines_from_bottom)); + self.pending_history_anchor = Some(anchor); + self.auto_scroll_paused = true; + self.maybe_queue_compacted_history_load(); + return true; + } let before = (self.scroll_offset, self.auto_scroll_paused); let max = self.scroll_max_estimate(); if !self.auto_scroll_paused { @@ -1472,7 +1432,15 @@ impl App { self.scroll_offset = self.scroll_offset.saturating_sub(amount); } self.auto_scroll_paused = true; - self.maybe_queue_compacted_history_load(); + // If the upward scroll bottomed out against the top of the currently + // loaded content, fold the unsatisfied intent into the prefetch as + // overshoot so the newly loaded history scrolls into view smoothly. + let overshoot = if self.scroll_offset == 0 { + amount + } else { + 0 + }; + self.maybe_queue_compacted_history_load_with_overshoot(overshoot); before != (self.scroll_offset, self.auto_scroll_paused) } @@ -1494,6 +1462,18 @@ impl App { /// `false`, so the mouse-wheel queue does not accumulate phantom scroll /// that would later have to be undone before scrolling up moves the view. 
pub(super) fn scroll_down(&mut self, amount: usize) -> bool { + // Mirror `scroll_up`: while an older-history prepend is still settling, + // the renderer is anchored to distance-from-bottom, so move the anchor + // toward the bottom instead of a stale `scroll_offset`. + if let Some(mut anchor) = self.pending_history_anchor { + if anchor.lines_from_bottom == 0 { + self.register_chat_overscroll(); + return false; + } + anchor.lines_from_bottom = anchor.lines_from_bottom.saturating_sub(amount); + self.pending_history_anchor = Some(anchor); + return true; + } if !self.auto_scroll_paused { // Already pinned to the bottom: a further downward scroll is an // "overscroll". Reveal the elastic status line and keep it dwelling. @@ -1529,6 +1509,7 @@ impl App { } pub(super) fn follow_chat_bottom(&mut self) { + self.pending_history_anchor = None; self.scroll_offset = 0; self.auto_scroll_paused = false; } diff --git a/crates/jcode-tui/src/tui/app/onboarding_flow_control.rs b/crates/jcode-tui/src/tui/app/onboarding_flow_control.rs index c601893f39..a6e0ca8617 100644 --- a/crates/jcode-tui/src/tui/app/onboarding_flow_control.rs +++ b/crates/jcode-tui/src/tui/app/onboarding_flow_control.rs @@ -110,15 +110,55 @@ impl App { } } - /// Whether this install looks like a brand-new user (few launches). + /// Whether this install looks like a brand-new user. + /// + /// Primary signal is `launch_count` in `setup_hints.json`, but that file + /// only counts interactive `jcode` launches (TTY-gated) and can be reset + /// or lag far behind reality. So before concluding "new user" we also look + /// for independent evidence of an established install: a meaningful number + /// of persisted native sessions. A user with a long session history must + /// never be dragged through first-run onboarding just because their + /// launch counter looks low. 
fn is_new_user_for_onboarding(&self) -> bool { - crate::storage::jcode_dir() - .ok() - .and_then(|dir| std::fs::read_to_string(dir.join("setup_hints.json")).ok()) - .and_then(|content| serde_json::from_str::<serde_json::Value>(&content).ok()) - .and_then(|v| v.get("launch_count")?.as_u64()) - .map(|count| count <= 5) - .unwrap_or(true) + Self::is_new_user_install() + } + + /// Shared "does this install look brand-new?" check (see + /// [`Self::is_new_user_for_onboarding`] for the rationale). Also used by + /// the welcome-screen suggestion prompts. + /// + /// Loads via [`crate::setup_hints::SetupHintsState`] so the `.bak` + /// fallback applies when `setup_hints.json` is missing or corrupt. + pub(super) fn is_new_user_install() -> bool { + let Ok(dir) = crate::storage::jcode_dir() else { + return true; + }; + if crate::setup_hints::SetupHintsState::load().launch_count > 5 { + return false; + } + !Self::has_established_native_session_history(&dir) + } + + /// Independent "experienced user" evidence: enough persisted native + /// sessions on disk. Imported transcripts (`imported_*.json`) don't count; + /// they exist on fresh installs that imported Codex/Claude history. + fn has_established_native_session_history(jcode_dir: &std::path::Path) -> bool { + const ESTABLISHED_SESSION_THRESHOLD: usize = 10; + let Ok(entries) = std::fs::read_dir(jcode_dir.join("sessions")) else { + return false; + }; + let mut native_sessions = 0usize; + for entry in entries.flatten() { + let name = entry.file_name(); + let Some(name) = name.to_str() else { continue }; + if name.starts_with("session_") && name.ends_with(".json") { + native_sessions += 1; + if native_sessions >= ESTABLISHED_SESSION_THRESHOLD { + return true; + } + } + } + false } /// Whether this is a self-dev / canary session. 
diff --git a/crates/jcode-tui/src/tui/app/remote.rs b/crates/jcode-tui/src/tui/app/remote.rs index 5f69f0bd3e..9c21a8f7d2 100644 --- a/crates/jcode-tui/src/tui/app/remote.rs +++ b/crates/jcode-tui/src/tui/app/remote.rs @@ -82,6 +82,9 @@ pub(super) async fn handle_tick(app: &mut App, remote: &mut RemoteConnection) -> needs_redraw |= app.tick_reasoning_collapse(); needs_redraw |= app.update_pinned_images_auto_hide(); needs_redraw |= dispatch_compacted_history_load(app, remote).await; + // Adopt the resolved scroll position once a frame containing newly loaded + // older history has rendered, so manual scrolling resumes seamlessly. + needs_redraw |= app.reconcile_history_anchor(); // Reveal buffered streaming text at the smooth paced rate on each tick, the // same as the local turn loop. Finalization paths still call flush(). if let Some(chunk) = app.stream_buffer.flush_smooth_frame() { diff --git a/crates/jcode-tui/src/tui/app/remote/key_handling.rs b/crates/jcode-tui/src/tui/app/remote/key_handling.rs index 77b3b19c77..61c539106f 100644 --- a/crates/jcode-tui/src/tui/app/remote/key_handling.rs +++ b/crates/jcode-tui/src/tui/app/remote/key_handling.rs @@ -1761,29 +1761,39 @@ async fn handle_remote_key_internal( return Ok(()); } - if trimmed == "/commit" { - let prompt = app_mod::commands::build_commit_prompt(); + if trimmed == "/commit" || trimmed == "/commit-push" || trimmed == "/commit-and-push" + { + let is_push = trimmed != "/commit"; + let prompt = if is_push { + app_mod::commands::build_commit_push_prompt() + } else { + app_mod::commands::build_commit_prompt() + }; + let launch_notice = |interrupted: bool| { + if is_push { + app_mod::commands::commit_push_launch_notice(interrupted) + } else { + app_mod::commands::commit_launch_notice(interrupted) + } + }; + let cmd_label = if is_push { "/commit-push" } else { "/commit" }; if app.is_processing { - app.push_display_message(DisplayMessage::system( - app_mod::commands::commit_launch_notice(true), - )); + 
app.push_display_message(DisplayMessage::system(launch_notice(true))); match remote.soft_interrupt(prompt.clone(), false).await { Ok(request_id) => { app.track_pending_soft_interrupt(request_id, prompt); - app.set_status_notice("Interrupting for /commit..."); + app.set_status_notice(format!("Interrupting for {}...", cmd_label)); } Err(error) => { app.push_display_message(DisplayMessage::error(format!( - "Failed to start /commit: {}", - error + "Failed to start {}: {}", + cmd_label, error ))); - app.set_status_notice("/commit failed"); + app.set_status_notice(format!("{} failed", cmd_label)); } } } else { - app.push_display_message(DisplayMessage::system( - app_mod::commands::commit_launch_notice(false), - )); + app.push_display_message(DisplayMessage::system(launch_notice(false))); input_dispatch::begin_remote_send( app, remote, diff --git a/crates/jcode-tui/src/tui/app/state_ui_input_helpers.rs b/crates/jcode-tui/src/tui/app/state_ui_input_helpers.rs index 0b4e159e44..cef718ae6c 100644 --- a/crates/jcode-tui/src/tui/app/state_ui_input_helpers.rs +++ b/crates/jcode-tui/src/tui/app/state_ui_input_helpers.rs @@ -61,6 +61,10 @@ const REGISTERED_COMMANDS: &[RegisteredCommand] = &[ RegisteredCommand::public("/ssh", "Connect to a remote machine using system SSH"), RegisteredCommand::public("/git", "Show git status for the session working directory"), RegisteredCommand::public("/commit", "Make logical commits from current changes"), + RegisteredCommand::public( + "/commit-push", + "Make logical commits from current changes, then push", + ), RegisteredCommand::public("/transcript", "Open the current session transcript file"), RegisteredCommand::public("/subagent-model", "Show/change subagent model policy"), RegisteredCommand::public("/autoreview", "Show/toggle automatic end-of-turn review"), @@ -1225,16 +1229,7 @@ impl App { let is_new_user = if preview_mode { true } else { - crate::storage::jcode_dir() - .ok() - .and_then(|dir| { - let path = dir.join("setup_hints.json"); 
- std::fs::read_to_string(&path).ok() - }) - .and_then(|content| serde_json::from_str::<serde_json::Value>(&content).ok()) - .and_then(|v| v.get("launch_count")?.as_u64()) - .map(|count| count <= 5) - .unwrap_or(true) + Self::is_new_user_install() }; if !is_new_user { diff --git a/crates/jcode-tui/src/tui/app/state_ui_messages.rs b/crates/jcode-tui/src/tui/app/state_ui_messages.rs index 45824603ee..1500e9bc25 100644 --- a/crates/jcode-tui/src/tui/app/state_ui_messages.rs +++ b/crates/jcode-tui/src/tui/app/state_ui_messages.rs @@ -371,7 +371,14 @@ impl App { pending_request_visible: None, }; self.auto_scroll_paused = true; - self.scroll_offset = 0; + // Older messages are prepended above the current view. If the reader had + // an anchor captured (they scrolled up to trigger this load), leave the + // scroll position for the next render to resolve so the content under + // them stays put instead of teleporting to the new absolute top. Only + // fall back to the top when there is no anchor to honor. + if self.pending_history_anchor.is_none() { + self.scroll_offset = 0; + } self.bump_display_messages_version(); self.note_runtime_memory_event_force( "compacted_history_loaded", @@ -387,11 +394,77 @@ impl App { } } + /// Number of wrapped lines from the top of the chat viewport that should be + /// treated as the "near the top" zone that proactively loads older history. + /// Prefetching roughly one viewport ahead means scrolling up keeps flowing + /// instead of stalling at a hard wall and then jumping. + fn compacted_history_prefetch_threshold(&self) -> usize { + let viewport = crate::tui::ui::last_layout_snapshot() + .map(|layout| layout.messages_area.height as usize) + .unwrap_or(0); + // Trigger when within ~one viewport of the top, with a small floor so the + // behavior is sensible even before the first layout snapshot exists. 
+ viewport.max(COMPACTED_HISTORY_LOAD_SCROLL_THRESHOLD) + } + + /// Capture a viewport anchor describing the reader's current distance from + /// the bottom of the transcript, plus any leftover upward scroll intent that + /// could not be satisfied because the view was already at the top of the + /// currently-loaded content. The next render that includes the newly loaded + /// (prepended) history resolves this back into an absolute `scroll_offset`, + /// keeping the content under the reader stable across the load. + pub(super) fn capture_history_anchor(&mut self, overshoot: usize) { + // Don't clobber an anchor that is still waiting to be resolved; the + // original distance-from-bottom remains correct across further prepends. + if self.pending_history_anchor.is_some() { + return; + } + let total = crate::tui::ui::last_total_wrapped_lines(); + if total == 0 { + return; + } + // The top of the viewport currently sits at absolute line `scroll_offset` + // within the pre-prepend transcript (length `total`). Its distance from + // the bottom is invariant when older lines are prepended, so capture it + // (plus any unsatisfied upward intent as `overshoot`) and let the next + // render map it back to an absolute offset against the larger total. + let scroll = self.scroll_offset.min(total); + let lines_from_bottom = total.saturating_sub(scroll).saturating_add(overshoot); + self.pending_history_anchor = Some(super::HistoryScrollAnchor { + lines_from_bottom, + base_total: total, + }); + } + + /// Adopt a resolved history anchor once a frame containing the newly loaded + /// content has rendered. Returns true when the scroll position changed. + pub(super) fn reconcile_history_anchor(&mut self) -> bool { + let Some(anchor) = self.pending_history_anchor else { + return false; + }; + let total = crate::tui::ui::last_total_wrapped_lines(); + // Wait until a frame with the prepended content has actually rendered + // (its total wrapped-line count differs from the captured base). 
+ if total == 0 || total == anchor.base_total { + return false; + } + let resolved = crate::tui::ui::last_resolved_chat_scroll(); + self.pending_history_anchor = None; + let changed = self.scroll_offset != resolved || !self.auto_scroll_paused; + self.scroll_offset = resolved; + self.auto_scroll_paused = true; + changed + } + pub(super) fn maybe_queue_compacted_history_load(&mut self) { + self.maybe_queue_compacted_history_load_with_overshoot(0); + } + + pub(super) fn maybe_queue_compacted_history_load_with_overshoot(&mut self, overshoot: usize) { if !self.auto_scroll_paused { return; } - if self.scroll_offset > COMPACTED_HISTORY_LOAD_SCROLL_THRESHOLD { + if self.scroll_offset > self.compacted_history_prefetch_threshold() { return; } if self.compacted_history_lazy.remaining_messages == 0 { @@ -404,6 +477,12 @@ impl App { { return; } + // Throttle to one chunk per settled frame: while an anchor is still + // waiting to resolve on screen, hold off so prepends never compound into + // a visible jump. + if self.pending_history_anchor.is_some() { + return; + } let next_visible = self .compacted_history_lazy @@ -414,6 +493,9 @@ impl App { return; } + // Anchor the viewport before mutating so the prepend stays seamless. + self.capture_history_anchor(overshoot); + if self.is_remote { self.compacted_history_lazy.pending_request_visible = Some(next_visible); self.set_status_notice(format!( @@ -429,6 +511,12 @@ impl App { self.compacted_history_lazy.pending_request_visible.take() } + /// Whether there are older compacted-history messages not yet loaded into the + /// display transcript. 
+ pub(super) fn compacted_history_has_remaining(&self) -> bool { + self.compacted_history_lazy.remaining_messages > 0 + } + pub(super) fn restore_pending_compacted_history_load(&mut self, visible_messages: usize) { self.compacted_history_lazy.pending_request_visible = Some(visible_messages); } diff --git a/crates/jcode-tui/src/tui/app/state_ui_runtime.rs b/crates/jcode-tui/src/tui/app/state_ui_runtime.rs index 699adac141..5da22b04d0 100644 --- a/crates/jcode-tui/src/tui/app/state_ui_runtime.rs +++ b/crates/jcode-tui/src/tui/app/state_ui_runtime.rs @@ -296,6 +296,8 @@ impl App { if positions.is_empty() { return; } + // An explicit jump should win over a still-settling history prepend. + self.pending_history_anchor = None; let current = self.scroll_offset; @@ -321,8 +323,16 @@ impl App { if let Some(pos) = target { self.scroll_offset = pos; + } else { + // No earlier prompt is loaded. If older compacted history exists, + // pull it in (anchored) and jump to the very top so the next press + // continues into the freshly loaded prompts instead of stalling. 
+ if self.compacted_history_has_remaining() { + self.scroll_offset = 0; + self.auto_scroll_paused = true; + self.maybe_queue_compacted_history_load(); + } } - // If no prompt above, stay where we are } /// Scroll to the next user prompt (scroll down - later in conversation) @@ -331,6 +341,7 @@ impl App { if positions.is_empty() || !self.auto_scroll_paused { return; } + self.pending_history_anchor = None; let current = self.scroll_offset; @@ -357,6 +368,7 @@ impl App { if positions.is_empty() { return; } + self.pending_history_anchor = None; // positions are in document order (top to bottom), we want most-recent first let target_idx = positions.len().saturating_sub(rank); diff --git a/crates/jcode-tui/src/tui/app/tests/commands_accounts_01/part_01.rs b/crates/jcode-tui/src/tui/app/tests/commands_accounts_01/part_01.rs index b6a556e0d1..5e7117c96c 100644 --- a/crates/jcode-tui/src/tui/app/tests/commands_accounts_01/part_01.rs +++ b/crates/jcode-tui/src/tui/app/tests/commands_accounts_01/part_01.rs @@ -482,6 +482,37 @@ fn test_commit_command_starts_synthetic_user_turn() { assert!(notice.content.contains("Starting logical commits")); } +#[test] +fn test_commit_push_command_starts_synthetic_user_turn() { + let mut app = create_test_app(); + app.input = "/commit-push".to_string(); + app.submit_input(); + + assert!(app.is_processing); + assert!(app.pending_turn); + let notice = app + .display_messages() + .last() + .expect("missing launch notice"); + assert_eq!(notice.role, "system"); + assert!(notice.content.contains("Starting logical commits + push")); +} + +#[test] +fn test_help_topic_shows_commit_push_command_details() { + let mut app = create_test_app(); + app.input = "/help commit-push".to_string(); + app.submit_input(); + + let msg = app + .display_messages() + .last() + .expect("missing help response"); + assert_eq!(msg.role, "system"); + assert!(msg.content.contains("/commit-push")); + assert!(msg.content.contains("push")); +} + #[test] fn 
test_help_topic_shows_catchup_command_details() { let mut app = create_test_app(); diff --git a/crates/jcode-tui/src/tui/app/tests/onboarding_flow.rs b/crates/jcode-tui/src/tui/app/tests/onboarding_flow.rs index 467e67acfe..849d9a24e4 100644 --- a/crates/jcode-tui/src/tui/app/tests/onboarding_flow.rs +++ b/crates/jcode-tui/src/tui/app/tests/onboarding_flow.rs @@ -706,3 +706,67 @@ fn remote_post_login_validation_waits_for_catalog_refresh() { assert!(app.onboarding_pending_validation_ready_to_fire()); }); } + +#[test] +fn startup_check_skips_user_with_established_session_history() { + with_temp_jcode_home(|| { + // A low/missing launch_count alone must NOT classify someone as a new + // user when their jcode home has a substantial native session history + // (e.g. setup_hints.json was reset or lost). Seed >=10 native session + // files in the temp home. + let sessions_dir = crate::storage::jcode_dir() + .expect("jcode dir") + .join("sessions"); + std::fs::create_dir_all(&sessions_dir).expect("create sessions dir"); + for i in 0..10 { + std::fs::write( + sessions_dir.join(format!("session_test_{i:02}.json")), + "{}", + ) + .expect("write session file"); + } + + let mut app = create_test_app(); + app.onboarding_flow = None; + app.onboarding_startup_checked = false; + + app.maybe_begin_onboarding_flow_on_startup(); + + assert!(app.onboarding_startup_checked); + assert!( + app.onboarding_flow.is_none(), + "established users (many native sessions) must never re-onboard" + ); + }); +} + +#[test] +fn startup_check_imported_transcripts_do_not_count_as_history() { + with_temp_jcode_home(|| { + // Imported Codex/Claude transcripts exist on genuinely fresh installs + // that chose to import history; they must not suppress onboarding. 
+ let sessions_dir = crate::storage::jcode_dir() + .expect("jcode dir") + .join("sessions"); + std::fs::create_dir_all(&sessions_dir).expect("create sessions dir"); + for i in 0..20 { + std::fs::write( + sessions_dir.join(format!("imported_codex_{i:02}.json")), + "{}", + ) + .expect("write imported file"); + } + + let mut app = create_test_app(); + app.onboarding_flow = None; + app.onboarding_startup_checked = false; + + app.maybe_begin_onboarding_flow_on_startup(); + + assert!(app.onboarding_startup_checked); + assert!( + app.onboarding_flow.is_some(), + "imported transcripts alone should still onboard a fresh install" + ); + }); +} diff --git a/crates/jcode-tui/src/tui/app/tests/remote_events_reload_01/part_01.rs b/crates/jcode-tui/src/tui/app/tests/remote_events_reload_01/part_01.rs index c0c4d80e7e..41661bbe0c 100644 --- a/crates/jcode-tui/src/tui/app/tests/remote_events_reload_01/part_01.rs +++ b/crates/jcode-tui/src/tui/app/tests/remote_events_reload_01/part_01.rs @@ -1080,10 +1080,11 @@ fn test_handle_server_event_side_pane_images_populates_pane_live() { assert!(needs_redraw, "live side-pane image should request a redraw"); assert_eq!(app.remote_side_pane_images.len(), 1); - // The pane should reveal (not user-hidden) and arm its auto-hide timer. + // Images render inline in the transcript now, so a live image must not flip + // the side panel or arm the old auto-hide timer. 
assert!(!app.side_panel_user_hidden); assert!(<App as crate::tui::TuiState>::pin_images(&app)); - assert!(app.pinned_images_auto_hide_deadline.is_some()); + assert!(app.pinned_images_auto_hide_deadline.is_none()); } #[test] diff --git a/crates/jcode-tui/src/tui/app/tests/scroll_copy_01/part_01.rs b/crates/jcode-tui/src/tui/app/tests/scroll_copy_01/part_01.rs index b6f1a5ecab..369ad0dd2f 100644 --- a/crates/jcode-tui/src/tui/app/tests/scroll_copy_01/part_01.rs +++ b/crates/jcode-tui/src/tui/app/tests/scroll_copy_01/part_01.rs @@ -702,7 +702,10 @@ fn test_local_alt_m_falls_back_to_diagram_pane_when_side_panel_is_empty() { } #[test] -fn test_local_alt_m_toggles_image_side_panel_visibility() { +fn test_images_do_not_drive_side_panel_visibility() { + // Images now render inline in the transcript flow, so they must not flip the + // side panel on, arm an auto-hide timer, or otherwise behave like the old + // pinned-image side pane. let mut app = create_test_app(); app.is_remote = true; app.side_panel = crate::side_panel::SidePanelSnapshot::default(); @@ -713,93 +716,10 @@ fn test_local_alt_m_toggles_image_side_panel_visibility() { source: crate::session::RenderedImageSource::UserInput, }); - app.handle_key(KeyCode::Char('m'), KeyModifiers::ALT) - .unwrap(); - assert!(app.side_panel_user_hidden); - assert_eq!(app.status_notice(), Some("Image side panel: OFF".to_string())); - - app.handle_key(KeyCode::Char('m'), KeyModifiers::ALT) - .unwrap(); - assert!(!app.side_panel_user_hidden); - assert_eq!(app.status_notice(), Some("Image side panel: ON".to_string())); -} - -#[test] -fn test_explicitly_hidden_image_side_panel_stays_hidden_after_server_reload() { - // Reproduces an Alt+M hide being undone by a server reload/reconnect: the - // history snapshot repopulates remote_side_pane_images while - // pinned_images_seen_count resets to 0, which previously looked like new - // images and re-revealed the panel. 
- let mut app = create_test_app(); - app.is_remote = true; - app.side_panel = crate::side_panel::SidePanelSnapshot::default(); - app.remote_side_pane_images.push(crate::session::RenderedImage { - media_type: "image/png".to_string(), - data: "image-data".to_string(), - label: Some("preview.png".to_string()), - source: crate::session::RenderedImageSource::UserInput, - }); - - // User explicitly hides the image side panel. - app.handle_key(KeyCode::Char('m'), KeyModifiers::ALT) - .unwrap(); - assert!(app.side_panel_user_hidden); - assert!(app.side_panel_explicit_hidden); - - // Simulate a server reload/reconnect: the seen count is reset while the - // image snapshot is re-applied with the same images. - app.pinned_images_seen_count = 0; - app.remote_side_pane_images = vec![crate::session::RenderedImage { - media_type: "image/png".to_string(), - data: "image-data".to_string(), - label: Some("preview.png".to_string()), - source: crate::session::RenderedImageSource::UserInput, - }]; - - app.update_pinned_images_auto_hide(); - - // The panel must remain hidden because the user explicitly closed it. - assert!(app.side_panel_user_hidden); - assert!(app.side_panel_explicit_hidden); + // Auto-hide bookkeeping is now a no-op for images. + assert!(!app.update_pinned_images_auto_hide()); assert!(app.pinned_images_auto_hide_deadline.is_none()); - - // Alt+M still toggles it back on. 
- app.handle_key(KeyCode::Char('m'), KeyModifiers::ALT) - .unwrap(); - assert!(!app.side_panel_user_hidden); - assert!(!app.side_panel_explicit_hidden); - assert_eq!(app.status_notice(), Some("Image side panel: ON".to_string())); -} - -#[test] -fn test_pinned_image_side_panel_auto_hides_and_mentions_alt_m() { - let mut app = create_test_app(); - app.is_remote = true; - app.side_panel = crate::side_panel::SidePanelSnapshot::default(); - app.remote_side_pane_images.push(crate::session::RenderedImage { - media_type: "image/png".to_string(), - data: "image-data".to_string(), - label: Some("preview.png".to_string()), - source: crate::session::RenderedImageSource::UserInput, - }); - - assert!(app.update_pinned_images_auto_hide()); assert!(!app.side_panel_user_hidden); - assert!(app.pinned_images_auto_hide_deadline.is_some()); - - app.pinned_images_auto_hide_deadline = - Some(std::time::Instant::now() - std::time::Duration::from_secs(1)); - assert!(app.update_pinned_images_auto_hide()); - - assert!(app.side_panel_user_hidden); - assert!(app.pinned_images_auto_hide_deadline.is_none()); - let notice = app - .display_messages - .last() - .map(|message| message.content.clone()) - .unwrap_or_default(); - assert!(notice.contains("Pinned image side panel hidden automatically")); - assert!(notice.contains(crate::tui::keybind::side_panel_toggle_key_label())); } #[test] diff --git a/crates/jcode-tui/src/tui/app/tests/scroll_copy_03.rs b/crates/jcode-tui/src/tui/app/tests/scroll_copy_03.rs index 970ca41213..9110728845 100644 --- a/crates/jcode-tui/src/tui/app/tests/scroll_copy_03.rs +++ b/crates/jcode-tui/src/tui/app/tests/scroll_copy_03.rs @@ -589,6 +589,231 @@ fn repro_ctrl_shift_jk_scroll_with_text_in_input() { ); } +/// Build a long single-message app and seed the render statics so the +/// history-anchor logic (which reads `last_total_wrapped_lines` / +/// `last_resolved_chat_scroll`) has a populated frame to work against. 
+fn anchor_test_app() -> (App, ratatui::Terminal<ratatui::backend::TestBackend>) { + create_scroll_test_app(80, 25, 0, 60) +} + +#[test] +fn test_history_anchor_keeps_distance_from_bottom_after_prepend() { + let _render_lock = scroll_render_test_lock(); + let (mut app, mut terminal) = anchor_test_app(); + + // Render at a scrolled-up position so the statics reflect a real frame. + render_and_snap(&app, &mut terminal); + let total_before = crate::tui::ui::last_total_wrapped_lines(); + assert!(total_before > 0, "expected a rendered transcript"); + + app.scroll_offset = 4; + app.auto_scroll_paused = true; + render_and_snap(&app, &mut terminal); + let total_before = crate::tui::ui::last_total_wrapped_lines(); + + // Simulate the reader sitting 4 lines from the top: capture an anchor as if a + // load were triggered, then "prepend" by growing the transcript. + app.capture_history_anchor(0); + let anchor = app + .pending_history_anchor + .expect("anchor should be captured"); + let expected_from_bottom = total_before.saturating_sub(4); + assert_eq!( + anchor.lines_from_bottom, expected_from_bottom, + "anchor should record distance from the bottom" + ); + + // Grow the transcript (older content prepended) and re-render. The resolved + // scroll must keep the same distance from the bottom, not snap to the top. 
+ app.display_messages.insert( + 0, + DisplayMessage { + role: "assistant".to_string(), + content: App::build_scroll_test_content(0, 40, None), + tool_calls: vec![], + duration_secs: None, + title: None, + tool_data: None, + }, + ); + app.bump_display_messages_version(); + render_and_snap(&app, &mut terminal); + + let total_after = crate::tui::ui::last_total_wrapped_lines(); + assert!( + total_after > total_before, + "prepend should grow the transcript ({} -> {})", + total_before, + total_after + ); + let resolved = crate::tui::ui::last_resolved_chat_scroll(); + let distance_after = total_after.saturating_sub(resolved); + assert_eq!( + distance_after, expected_from_bottom, + "viewport must stay the same distance from the bottom across the prepend" + ); + assert_ne!( + resolved, 0, + "anchored viewport must not snap to the new absolute top" + ); +} + +#[test] +fn test_history_anchor_reconciles_into_scroll_offset_after_render() { + let _render_lock = scroll_render_test_lock(); + let (mut app, mut terminal) = anchor_test_app(); + + app.scroll_offset = 3; + app.auto_scroll_paused = true; + render_and_snap(&app, &mut terminal); + + app.capture_history_anchor(0); + assert!(app.pending_history_anchor.is_some()); + + // Before any new frame, reconcile must wait (total unchanged). + assert!( + !app.reconcile_history_anchor(), + "reconcile should wait until a frame with new content has rendered" + ); + assert!(app.pending_history_anchor.is_some()); + + // Prepend + render so the resolved scroll is published, then reconcile. 
+ app.display_messages.insert( + 0, + DisplayMessage { + role: "assistant".to_string(), + content: App::build_scroll_test_content(0, 30, None), + tool_calls: vec![], + duration_secs: None, + title: None, + tool_data: None, + }, + ); + app.bump_display_messages_version(); + render_and_snap(&app, &mut terminal); + let resolved = crate::tui::ui::last_resolved_chat_scroll(); + + assert!(app.reconcile_history_anchor(), "reconcile should apply once"); + assert!( + app.pending_history_anchor.is_none(), + "anchor should be consumed after reconcile" + ); + assert_eq!( + app.scroll_offset, resolved, + "scroll_offset should adopt the resolved on-screen position" + ); + assert!(app.auto_scroll_paused, "anchored view stays paused"); +} + +/// Build a session whose compacted prefix is large enough to actually truncate +/// (the render window only hides history past ~80 messages / >5 turns), with one +/// live prompt at the tail. Returns the app with the truncated window applied. +fn compacted_history_app_with_remaining(turns: usize) -> App { + let mut app = create_test_app(); + for turn in 0..turns { + app.session.add_message( + crate::message::Role::User, + vec![crate::message::ContentBlock::Text { + text: format!("old prompt {turn}"), + cache_control: None, + }], + ); + app.session.add_message( + crate::message::Role::Assistant, + vec![crate::message::ContentBlock::Text { + text: format!("old response {turn}"), + cache_control: None, + }], + ); + } + app.session.add_message( + crate::message::Role::User, + vec![crate::message::ContentBlock::Text { + text: "current prompt".to_string(), + cache_control: None, + }], + ); + let compacted_count = turns * 2; + app.session.compaction = Some(crate::session::StoredCompactionState { + summary_text: "older turns".to_string(), + openai_encrypted_content: None, + covers_up_to_turn: turns, + original_turn_count: turns, + compacted_count, + }); + + let (rendered_messages, _images, _info) = + 
crate::session::render_messages_and_images_with_compacted_history(&app.session, 0); + let rendered = rendered_messages + .into_iter() + .map(|msg| DisplayMessage { + role: msg.role, + content: msg.content, + tool_calls: msg.tool_calls, + duration_secs: None, + title: None, + tool_data: msg.tool_data, + }) + .collect(); + app.replace_display_messages(rendered); + app +} + +#[test] +fn test_local_compacted_history_scroll_up_is_anchored_not_snapped() { + let _render_lock = scroll_render_test_lock(); + + let mut app = compacted_history_app_with_remaining(50); + assert!( + app.compacted_history_has_remaining(), + "older history should be hidden initially" + ); + + let backend = ratatui::backend::TestBackend::new(80, 12); + let mut terminal = ratatui::Terminal::new(backend).expect("failed to create test terminal"); + render_and_snap(&app, &mut terminal); + + // Scroll up to the top of the loaded window; this should both pull older + // history in and anchor the viewport rather than snapping to the new top. + app.scroll_offset = 0; + app.auto_scroll_paused = true; + app.scroll_up(3); + + assert!( + app.display_messages().len() > 2, + "scroll-up near the top should have loaded older messages into the transcript" + ); + // An anchor must have been captured so the next render keeps the view stable. 
+ assert!( + app.pending_history_anchor.is_some(), + "scroll-up that loads history should capture a viewport anchor" + ); +} + +#[test] +fn test_prompt_jump_loads_older_history_when_at_top() { + let _render_lock = scroll_render_test_lock(); + + let mut app = compacted_history_app_with_remaining(50); + assert!(app.compacted_history_has_remaining()); + let messages_before = app.display_messages().len(); + + let backend = ratatui::backend::TestBackend::new(80, 12); + let mut terminal = ratatui::Terminal::new(backend).expect("failed to create test terminal"); + render_and_snap(&app, &mut terminal); + + // At the top with no earlier loaded prompt, a prompt-up jump should pull in + // the older compacted history instead of doing nothing. + app.scroll_offset = 0; + app.auto_scroll_paused = true; + app.scroll_to_prev_prompt(); + + assert!( + app.display_messages().len() > messages_before, + "prompt-up at the top should load older history" + ); +} + #[cfg(test)] #[path = "../tests_input_scroll.rs"] mod input_scroll_tests; diff --git a/crates/jcode-tui/src/tui/app/tui_lifecycle.rs b/crates/jcode-tui/src/tui/app/tui_lifecycle.rs index dbccf193a9..5c899a8f64 100644 --- a/crates/jcode-tui/src/tui/app/tui_lifecycle.rs +++ b/crates/jcode-tui/src/tui/app/tui_lifecycle.rs @@ -287,6 +287,7 @@ impl App { display_user_message_count: 0, display_edit_tool_message_count: 0, compacted_history_lazy: CompactedHistoryLazyState::default(), + pending_history_anchor: None, input: String::new(), command_candidates_cache: RefCell::new(None), cursor_pos: 0, @@ -665,6 +666,7 @@ impl App { display_user_message_count: 0, display_edit_tool_message_count: 0, compacted_history_lazy: CompactedHistoryLazyState::default(), + pending_history_anchor: None, input: String::new(), command_candidates_cache: RefCell::new(None), cursor_pos: 0, diff --git a/crates/jcode-tui/src/tui/app/tui_state.rs b/crates/jcode-tui/src/tui/app/tui_state.rs index c11dfd6110..07aea71f11 100644 --- 
a/crates/jcode-tui/src/tui/app/tui_state.rs +++ b/crates/jcode-tui/src/tui/app/tui_state.rs @@ -469,6 +469,11 @@ impl crate::tui::TuiState for App { self.auto_scroll_paused } + fn pending_history_anchor_lines_from_bottom(&self) -> Option<usize> { + self.pending_history_anchor + .map(|anchor| anchor.lines_from_bottom) + } + fn chat_overscroll_active(&self) -> bool { self.chat_overscroll_active() } diff --git a/crates/jcode-tui/src/tui/info_widget_layout.rs b/crates/jcode-tui/src/tui/info_widget_layout.rs index a7168063ec..57b90f172e 100644 --- a/crates/jcode-tui/src/tui/info_widget_layout.rs +++ b/crates/jcode-tui/src/tui/info_widget_layout.rs @@ -24,6 +24,13 @@ pub(crate) struct WidgetAnchor { pub placement: WidgetPlacement, /// Consecutive frames this anchor has been retained but not rendered. pub hidden_frames: u16, + /// Absolute transcript line the widget's top row is pinned to. When the user is + /// scrolling ([`Margins::content_anchored`]) the widget rides with this content + /// line instead of holding a fixed screen row, so it sticks to the same pocket + /// of negative space and scrolls along with the text. Refreshed every frame in + /// screen-anchored mode so a later switch into content-anchored mode hands off + /// seamlessly. + pub content_top: usize, } /// Result of a placement pass: what to render now, plus the anchor memory to feed @@ -50,6 +57,14 @@ pub struct Margins { /// fall back to the instantaneous widths (no look-ahead). pub right_reliable: Vec<u16>, pub left_reliable: Vec<u16>, + /// Absolute transcript line shown on the first visible row this frame. Lets the + /// placement engine translate a content-anchored widget by the scroll delta so + /// it rides the transcript instead of holding a fixed screen row. + pub scroll_top: usize, + /// When true (the user is actively scrolling), anchored widgets stick to their + /// transcript line and scroll with the content. 
When false (pinned at the + /// bottom / streaming) they hold a fixed screen row as before. + pub content_anchored: bool, } impl Margins { @@ -113,6 +128,7 @@ pub(crate) fn calculate_placements( .map(|placement| WidgetAnchor { placement, hidden_frames: 0, + content_top: 0, }) .collect(); calculate_placements_anchored(messages_area, margins, data, enabled, &prev_anchors).visible @@ -235,8 +251,35 @@ pub(crate) fn calculate_placements_anchored( continue; } - let row_start = prev.rect.y.saturating_sub(messages_area.y) as usize; + // Resolve which screen row the widget occupies this frame. + // + // Content-anchored (the user is scrolling): the widget is pinned to a + // transcript line (`anchor.content_top`) and rides with it, so it sticks to + // the same pocket of negative space and simply scrolls along with the text + // rather than churning against a fixed screen row. Because the rows it now + // covers map back to the *same* content lines, the free-width profile under + // it is invariant frame-to-frame, so its width is stable too. If its content + // line has scrolled above the viewport, drop the anchor and let Phase 2 home + // a fresh widget into the newly exposed space. + // + // Screen-anchored (pinned at the bottom / streaming): hold the exact screen + // row as before, and refresh `content_top` so a later switch into scrolling + // hands off seamlessly. 
let height = prev.rect.height as usize; + let (row_start, target_y, content_top) = if margins.content_anchored { + if anchor.content_top < margins.scroll_top { + continue; + } + let row = anchor.content_top - margins.scroll_top; + ( + row, + messages_area.y.saturating_add(row as u16), + anchor.content_top, + ) + } else { + let row = prev.rect.y.saturating_sub(messages_area.y) as usize; + (row, prev.rect.y, margins.scroll_top + row) + }; let row_end = row_start + height; let widths = match prev.side { Side::Right => &margins.right_widths, @@ -278,6 +321,7 @@ pub(crate) fn calculate_placements_anchored( next_anchors.push(WidgetAnchor { placement: prev.clone(), hidden_frames, + content_top, }); // Overview will pop back into its slot, so keep suppressing its // mergeable widgets while it is only transiently hidden. @@ -301,13 +345,14 @@ pub(crate) fn calculate_placements_anchored( }; let placement = WidgetPlacement { kind: prev.kind, - rect: Rect::new(kept_x, prev.rect.y, kept_width, prev.rect.height), + rect: Rect::new(kept_x, target_y, kept_width, prev.rect.height), side: prev.side, }; placements.push(placement.clone()); next_anchors.push(WidgetAnchor { placement, hidden_frames: 0, + content_top, }); kept.insert(prev.kind); anchored.insert(prev.kind); @@ -364,15 +409,33 @@ pub(crate) fn calculate_placements_anchored( continue; } + // Where inside the pocket to seat the widget. + // + // Content-anchored (scrolling): seat it at the *bottom* of the pocket so it + // has the maximum runway to ride upward with the transcript before scrolling + // off the top - otherwise a widget born at the pocket's top row would fall off + // and re-home every single frame (a constant recycle). The leftover free space + // is then the region above it. + // + // Screen-anchored: seat it at the top as before; it holds a fixed screen row. 
+ let placed_top = if margins.content_anchored { + top + height.saturating_sub(widget_height) + } else { + top + }; + let placement = WidgetPlacement { kind, - rect: Rect::new(x, messages_area.y + top, width, widget_height), + rect: Rect::new(x, messages_area.y + placed_top, width, widget_height), side, }; placements.push(placement.clone()); next_anchors.push(WidgetAnchor { placement, hidden_frames: 0, + // Bind this fresh widget to the transcript line currently under its top + // row, so the moment the user keeps scrolling it rides with the content. + content_top: margins.scroll_top + placed_top as usize, }); if kind == WidgetKind::Overview { overview_placed = true; @@ -384,7 +447,13 @@ pub(crate) fn calculate_placements_anchored( continue; } - let new_top = top + widget_height; + // The leftover pocket: below the widget when top-aligned, above it when the + // widget was seated at the bottom (content-anchored). + let new_top = if margins.content_anchored { + top + } else { + top + widget_height + }; all_rects[idx].1 = new_top; all_rects[idx].2 = remaining_height; diff --git a/crates/jcode-tui/src/tui/info_widget_stability.rs b/crates/jcode-tui/src/tui/info_widget_stability.rs index 6e52fc404a..f34f9ea4c2 100644 --- a/crates/jcode-tui/src/tui/info_widget_stability.rs +++ b/crates/jcode-tui/src/tui/info_widget_stability.rs @@ -67,6 +67,17 @@ pub struct WidgetMotion { pub height_churn: u32, /// Largest single-frame top-left jump (Chebyshev distance). pub max_jump: u16, + /// Total *content-relative* vertical travel: |actual dy - expected scroll-ride|, + /// counting only small frame-to-frame residuals (<= [`RIDE_TOLERANCE`]). A widget + /// glued to its transcript line scores 0 here; a widget holding a fixed screen row + /// while the text scrolls under it accrues ~1 per frame (it drifts against the + /// text). This is the per-frame "jiggle" the user notices while scrolling. 
+ pub content_y_travel: u32, + /// Number of steps where the widget moved much more than the scroll-ride (a + /// "recycle": it left the viewport at one edge and a fresh instance entered at + /// another). Visually this is one widget leaving with its content and another + /// joining new content, not a slide, so it is tracked separately from jiggle. + pub recycles: usize, } impl WidgetMotion { @@ -83,6 +94,13 @@ impl WidgetMotion { /// equivalent cells of positional travel. const FLICKER_WEIGHT: u32 = 8; +/// Maximum content-relative residual (rows) that still counts as "riding the scroll" +/// rather than a recycle. A perfectly content-anchored widget has residual 0; a +/// screen-anchored widget drifts ~1 row/frame against the text. A jump larger than +/// this means the widget was recycled to a different pocket (left at one edge, +/// re-entered at another), which is counted separately from per-frame jiggle. +const RIDE_TOLERANCE: i64 = 2; + /// Aggregate stability report over a scroll sequence. #[derive(Debug, Clone, Default, Serialize)] pub struct StabilityReport { @@ -98,10 +116,22 @@ pub struct StabilityReport { pub total_flicker: usize, /// Total positional travel (x + y) across all widgets. pub total_travel: u32, + /// Total *content-relative* vertical travel across all widgets (residual after + /// subtracting the expected scroll-ride). This is near-zero when widgets stick to + /// their negative-space spot in the transcript and only grows when they actually + /// jump between pockets. Headline metric for the "ride the scroll" behaviour. + pub total_content_travel: u32, + /// Total recycles across all widgets: a widget left the viewport at one edge and a + /// fresh instance entered at another (transcript scrolled past its pocket). This is + /// expected and calm, unlike per-frame jiggle, so it is reported on its own. + pub total_recycles: usize, /// Total size churn (width + height) across all widgets. 
pub total_size_churn: u32, /// Positional travel per 100 scroll lines (the headline distraction metric). pub travel_per_100_lines: f64, + /// Content-relative vertical travel per 100 scroll lines. Lower means widgets + /// ride the transcript more faithfully (stick to one negative-space spot). + pub content_travel_per_100_lines: f64, /// Flicker transitions per 100 scroll lines. pub flicker_per_100_lines: f64, /// Composite distraction score per 100 scroll lines (travel + weighted flicker). @@ -136,8 +166,24 @@ pub struct KindVisibility { /// Analyze a sequence of frames (each a list of placed widget rects) and compute /// movement/flicker metrics. Frames are assumed to be consecutive scroll positions -/// differing by one content line. +/// differing by one content line, scrolling downward (transcript top advances by 1). +/// For the content-relative travel metric to reflect real scroll deltas, prefer +/// [`analyze_frames_with_scroll`]. pub fn analyze_frames(frames: &[Vec<PlacedRect>]) -> StabilityReport { + // Default: assume each step advances the transcript top by exactly one line. + let scroll_tops: Vec<i64> = (0..frames.len() as i64).collect(); + analyze_frames_with_scroll(frames, &scroll_tops) +} + +/// Like [`analyze_frames`] but with explicit per-frame transcript tops, so the +/// content-relative travel metric subtracts the *real* scroll-ride. `scroll_tops[i]` +/// is the absolute transcript line shown on the first visible row of frame `i`. A +/// widget that rides its transcript line perfectly contributes zero content travel +/// even though its absolute `y` moves with the scroll. 
+pub fn analyze_frames_with_scroll( + frames: &[Vec<PlacedRect>], + scroll_tops: &[i64], +) -> StabilityReport { let mut report = StabilityReport { frames: frames.len(), steps: frames.len().saturating_sub(1), @@ -175,6 +221,16 @@ pub fn analyze_frames(frames: &[Vec<PlacedRect>]) -> StabilityReport { let cur = &frames[step + 1]; let mut step_unstable = false; + // Signed scroll delta for this step: how far the transcript top advanced. A + // content-anchored widget should move by `-scroll_delta` rows on screen + // (content scrolls up as the top line advances), so the residual is + // `signed_dy + scroll_delta`. + let scroll_delta = scroll_tops + .get(step + 1) + .copied() + .unwrap_or(step as i64 + 1) + - scroll_tops.get(step).copied().unwrap_or(step as i64); + // Index current frame by kind for lookup. let cur_index = |kind: &str| cur.iter().find(|r| r.kind == kind).copied(); let prev_index = |kind: &str| prev.iter().find(|r| r.kind == kind).copied(); @@ -206,6 +262,17 @@ pub fn analyze_frames(frames: &[Vec<PlacedRect>]) -> StabilityReport { } m.x_travel += dx as u32; m.y_travel += dy as u32; + // Residual after removing the expected scroll-ride. Small residuals + // are per-frame jiggle (drift against the text); a large residual + // means the widget jumped to a different pocket (a recycle), which + // is counted separately so it doesn't masquerade as smooth travel. 
+ let signed_dy = c.y as i64 - p.y as i64; + let residual = (signed_dy + scroll_delta).abs(); + if residual <= RIDE_TOLERANCE { + m.content_y_travel += residual as u32; + } else { + m.recycles += 1; + } m.width_churn += dw as u32; m.height_churn += dh as u32; m.max_jump = m.max_jump.max(dx.max(dy)); @@ -243,6 +310,8 @@ pub fn analyze_frames(frames: &[Vec<PlacedRect>]) -> StabilityReport { .map(|w| w.appearances + w.disappearances) .sum(); report.total_travel = widgets.iter().map(|w| w.x_travel + w.y_travel).sum(); + report.total_content_travel = widgets.iter().map(|w| w.x_travel + w.content_y_travel).sum(); + report.total_recycles = widgets.iter().map(|w| w.recycles).sum(); report.total_size_churn = widgets.iter().map(|w| w.width_churn + w.height_churn).sum(); report.worst_widget = widgets .first() @@ -251,6 +320,7 @@ pub fn analyze_frames(frames: &[Vec<PlacedRect>]) -> StabilityReport { let steps = report.steps.max(1) as f64; report.travel_per_100_lines = report.total_travel as f64 / steps * 100.0; + report.content_travel_per_100_lines = report.total_content_travel as f64 / steps * 100.0; report.flicker_per_100_lines = report.total_flicker as f64 / steps * 100.0; let distraction: u32 = widgets.iter().map(|w| w.distraction()).sum(); report.distraction_per_100_lines = distraction as f64 / steps * 100.0; @@ -374,6 +444,10 @@ pub enum SimMode { /// Look-ahead sizing with NO anchor carry (re-solve each frame). Isolates how /// much stability comes from the smoothed profile alone vs the anchor logic. LookAheadFresh(u16), + /// Anchored carry PLUS content anchoring: widgets are pinned to a transcript line + /// and ride the scroll (the "stick to one negative-space spot" behaviour). This + /// is what the live renderer uses while the user is actively scrolling. + ContentAnchored, } /// Build the per-row free-width profile for `scroll`. 
When `window > 0`, each row's @@ -434,6 +508,7 @@ pub fn simulate_scroll_mode( _ => 0, }; let greedy = matches!(mode, SimMode::Greedy | SimMode::LookAheadFresh(_)); + let content_anchored = matches!(mode, SimMode::ContentAnchored); // Carry anchors across frames exactly like the live renderer does, so the // HUD pinning / hide-in-place behaviour is exercised identically. @@ -452,6 +527,8 @@ pub fn simulate_scroll_mode( centered: false, right_reliable, left_reliable: Vec::new(), + scroll_top: scroll, + content_anchored, }; // Greedy mode forgets all anchors each frame, so every frame independently // maximizes coverage (the old "fill the biggest pocket now" philosophy). diff --git a/crates/jcode-tui/src/tui/info_widget_stability_tests.rs b/crates/jcode-tui/src/tui/info_widget_stability_tests.rs index 5bc53bc5cf..77a8f3ce97 100644 --- a/crates/jcode-tui/src/tui/info_widget_stability_tests.rs +++ b/crates/jcode-tui/src/tui/info_widget_stability_tests.rs @@ -215,6 +215,92 @@ fn widgets_never_overlap_while_scrolling() { } } +/// Content anchoring is the "stick to one negative-space spot while scrolling" +/// behaviour: a widget pins to a transcript line and rides the scroll, so its motion +/// *relative to the surrounding text* is ~0 even though its absolute screen `y` +/// tracks the scroll. This must dramatically reduce content-relative travel versus +/// the screen-anchored mode on ragged content (where the old behaviour churned). 
+#[test] +fn content_anchoring_reduces_content_relative_travel() { + for period in [7usize, 9, 11, 13] { + let content: Vec<u16> = (0..240) + .map(|i| if i % period == 0 { 95 } else { 28 }) + .collect(); + let screen = measure_scroll_mode(&content, 100, 24, &sample_data(), SimMode::Anchored); + let stuck = measure_scroll_mode(&content, 100, 24, &sample_data(), SimMode::ContentAnchored); + assert!( + stuck.widgets.iter().any(|w| w.frames_present > 0), + "period {period}: expected a widget to be placed" + ); + assert!( + stuck.content_travel_per_100_lines <= screen.content_travel_per_100_lines + 1e-9, + "period {period}: content anchoring should not increase content-relative travel \ + (content={:.1} vs screen={:.1})", + stuck.content_travel_per_100_lines, + screen.content_travel_per_100_lines, + ); + // A perfectly stuck widget rides the scroll with zero residual travel. + assert_eq!( + stuck.total_content_travel, 0, + "period {period}: content-anchored widget should not drift relative to the transcript, \ + got {} ({:#?})", + stuck.total_content_travel, stuck + ); + } +} + +/// Content-anchored widgets must still never overlap while riding the scroll. +#[test] +fn content_anchored_widgets_never_overlap_while_scrolling() { + for period in [7usize, 9, 11, 14, 17] { + let content: Vec<u16> = (0..240) + .map(|i| if i % period == 0 { 95 } else { 26 }) + .collect(); + let report = measure_scroll_mode(&content, 100, 24, &rich_data(), SimMode::ContentAnchored); + assert_eq!( + report.overlap_frames, 0, + "period {period}: widgets overlapped in {} frames (max {} pairs)", + report.overlap_frames, report.max_overlap_pairs + ); + } +} + +/// A/B: stick-to-the-transcript anchoring vs holding a fixed screen row. 
Run with: +/// cargo test -p jcode-tui info_widget_stability::tests::demo_content_anchor -- --ignored --nocapture +#[test] +#[ignore] +fn demo_content_anchor() { + fn row(name: &str, content: &[u16]) { + let s = measure_scroll_mode(content, 100, 24, &rich_data(), SimMode::Anchored); + let c = measure_scroll_mode(content, 100, 24, &rich_data(), SimMode::ContentAnchored); + println!( + "{:<20} | screen-anchored: travel/100={:>6.1} content-travel/100={:>6.1} flicker/100={:>5.1} keepVis={:>3.0}% \ + | content-anchored: travel/100={:>6.1} content-travel/100={:>6.1} flicker/100={:>5.1} keepVis={:>3.0}%", + name, + s.travel_per_100_lines, s.content_travel_per_100_lines, s.flicker_per_100_lines, s.mean_kind_visibility * 100.0, + c.travel_per_100_lines, c.content_travel_per_100_lines, c.flicker_per_100_lines, c.mean_kind_visibility * 100.0, + ); + } + + println!( + "\n=== content-anchor A/B (100x24, rich widget set) ===\n\ + content-travel = vertical motion relative to the transcript (scroll-ride removed); lower = sticks to its spot\n" + ); + row("flat narrow", &vec![20; 300]); + row( + "long line every 7", + &(0..300).map(|i| if i % 7 == 0 { 95 } else { 28 }).collect::<Vec<_>>(), + ); + row( + "long line every 14", + &(0..300).map(|i| if i % 14 == 0 { 95 } else { 28 }).collect::<Vec<_>>(), + ); + row( + "code-like (ragged)", + &(0..300).map(|i| 20 + ((i * 37) % 70) as u16).collect::<Vec<_>>(), + ); +} + /// A/B: does stable (anchored) placement cost information vs greedy max-info? 
/// Run with: /// cargo test -p jcode-tui info_widget_stability::tests::demo_info_tradeoff -- --ignored --nocapture diff --git a/crates/jcode-tui/src/tui/mermaid.rs b/crates/jcode-tui/src/tui/mermaid.rs index 76fc06aca1..10a958f493 100644 --- a/crates/jcode-tui/src/tui/mermaid.rs +++ b/crates/jcode-tui/src/tui/mermaid.rs @@ -11,7 +11,8 @@ pub use jcode_tui_mermaid::{ debug_test_resize_stability, debug_test_scroll, deferred_render_epoch, diagram_placeholder_lines, error_lines_for, error_to_lines, estimate_image_height, evict_old_cache, get_active_diagrams, get_cached_path, get_cached_png, get_font_size, - image_protocol_available, image_widget_placeholder_markdown, init_picker, + image_protocol_available, image_widget_placeholder_markdown, init_picker, inline_image_dims, + inline_image_id, materialize_inline_image, invalidate_render_state, is_mermaid_lang, is_video_export_mode, normalize_aspect_ratio, parse_image_placeholder, preferred_aspect_ratio_bucket, protocol_type, register_active_diagram, register_external_image, register_inline_image, render_image_widget, render_image_widget_fit, diff --git a/crates/jcode-tui/src/tui/mod.rs b/crates/jcode-tui/src/tui/mod.rs index a9e445c90f..3b38ead753 100644 --- a/crates/jcode-tui/src/tui/mod.rs +++ b/crates/jcode-tui/src/tui/mod.rs @@ -133,6 +133,23 @@ pub trait TuiState { } fn has_display_edit_tool_messages(&self) -> bool; fn side_pane_images(&self) -> Vec<crate::session::RenderedImage>; + /// Cheap signature of the current inline-image set: `(count, content_hash)`. + /// Used by the prepared-frame cache so the inline image section invalidates + /// when images are added/removed without cloning the payloads every frame. + /// The default implementation derives it from `side_pane_images`; overrides + /// can provide a cheaper path. 
+ fn side_pane_images_signature(&self) -> (usize, u64) { + use std::hash::{Hash, Hasher}; + let images = self.side_pane_images(); + let mut hasher = std::collections::hash_map::DefaultHasher::new(); + for image in &images { + image.media_type.hash(&mut hasher); + image.data.len().hash(&mut hasher); + // A short prefix is enough to distinguish distinct payloads cheaply. + image.data.as_bytes().iter().take(64).for_each(|b| b.hash(&mut hasher)); + } + (images.len(), hasher.finish()) + } /// Version counter for display_messages (monotonic, increments on mutation) fn display_messages_version(&self) -> u64; fn streaming_text(&self) -> &str; @@ -150,6 +167,13 @@ pub trait TuiState { fn scroll_offset(&self) -> usize; /// Whether auto-scroll to bottom is paused (user scrolled up during streaming) fn auto_scroll_paused(&self) -> bool; + /// When older compacted history is being loaded in, this is the reader's + /// captured distance (in wrapped lines) from the bottom of the transcript. + /// The renderer uses it to keep the viewport anchored to the same content as + /// older messages are prepended above, instead of snapping to the new top. + fn pending_history_anchor_lines_from_bottom(&self) -> Option<usize> { + None + } /// Whether the elastic overscroll status line (revealed by scrolling past /// the bottom of the transcript) is currently shown. fn chat_overscroll_active(&self) -> bool { diff --git a/crates/jcode-tui/src/tui/ui.rs b/crates/jcode-tui/src/tui/ui.rs index 2e1fa03574..c1fd231f89 100644 --- a/crates/jcode-tui/src/tui/ui.rs +++ b/crates/jcode-tui/src/tui/ui.rs @@ -80,6 +80,8 @@ mod header; mod inline_interactive_ui; #[path = "ui_inline.rs"] mod inline_ui; +#[path = "ui_inline_image.rs"] +pub(crate) mod inline_image_ui; #[path = "ui_input.rs"] pub(crate) mod input_ui; #[path = "ui_memory_estimates.rs"] @@ -189,6 +191,17 @@ static PINNED_PANE_TOTAL_LINES: AtomicUsize = AtomicUsize::new(0); /// Effective scroll position of the side pane after render-time clamping. 
#[cfg(not(test))] static LAST_DIFF_PANE_EFFECTIVE_SCROLL: AtomicUsize = AtomicUsize::new(0); +/// Total wrapped line count of the chat transcript on the most recent frame. +/// Used together with `LAST_RESOLVED_CHAT_SCROLL` to anchor the viewport when +/// older compacted history is loaded in (so the content under the reader stays +/// put instead of teleporting to the new absolute top). +#[cfg(not(test))] +static LAST_TOTAL_WRAPPED_LINES: AtomicUsize = AtomicUsize::new(0); +/// The chat scroll offset the renderer actually used on the most recent frame +/// (after clamping and after resolving any pending history anchor). Scroll +/// handlers adopt this so manual scrolling resumes from the on-screen position. +#[cfg(not(test))] +static LAST_RESOLVED_CHAT_SCROLL: AtomicUsize = AtomicUsize::new(0); /// Wrapped line indices where each user prompt starts (updated each render frame). /// Used by prompt-jump keybindings (Ctrl+5..9, Ctrl+[/]) for accurate positioning. #[cfg(not(test))] @@ -200,6 +213,8 @@ thread_local! { static TEST_LAST_CHAT_SCROLLBAR_VISIBLE: Cell<bool> = const { Cell::new(false) }; static TEST_PINNED_PANE_TOTAL_LINES: Cell<usize> = const { Cell::new(0) }; static TEST_LAST_DIFF_PANE_EFFECTIVE_SCROLL: Cell<usize> = const { Cell::new(0) }; + static TEST_LAST_TOTAL_WRAPPED_LINES: Cell<usize> = const { Cell::new(0) }; + static TEST_LAST_RESOLVED_CHAT_SCROLL: Cell<usize> = const { Cell::new(0) }; static TEST_LAST_USER_PROMPT_POSITIONS: RefCell<Vec<usize>> = const { RefCell::new(Vec::new()) }; static TEST_LAST_LAYOUT: RefCell<Option<LayoutSnapshot>> = const { RefCell::new(None) }; static TEST_LAST_STATUS_AREA: RefCell<Option<Rect>> = const { RefCell::new(None) }; @@ -346,6 +361,56 @@ pub(crate) fn set_last_diff_pane_effective_scroll(value: usize) { } } +/// Total wrapped line count of the chat transcript on the most recent frame. +/// Returns 0 if no frame has been rendered yet. 
+pub fn last_total_wrapped_lines() -> usize { + #[cfg(test)] + { + return TEST_LAST_TOTAL_WRAPPED_LINES.with(Cell::get); + } + #[cfg(not(test))] + { + LAST_TOTAL_WRAPPED_LINES.load(Ordering::Relaxed) + } +} + +pub(crate) fn set_last_total_wrapped_lines(value: usize) { + #[cfg(test)] + { + TEST_LAST_TOTAL_WRAPPED_LINES.with(|cell| cell.set(value)); + return; + } + #[cfg(not(test))] + { + LAST_TOTAL_WRAPPED_LINES.store(value, Ordering::Relaxed); + } +} + +/// The chat scroll offset the renderer actually used on the most recent frame +/// (after clamping and after resolving any pending history anchor). +pub fn last_resolved_chat_scroll() -> usize { + #[cfg(test)] + { + return TEST_LAST_RESOLVED_CHAT_SCROLL.with(Cell::get); + } + #[cfg(not(test))] + { + LAST_RESOLVED_CHAT_SCROLL.load(Ordering::Relaxed) + } +} + +pub(crate) fn set_last_resolved_chat_scroll(value: usize) { + #[cfg(test)] + { + TEST_LAST_RESOLVED_CHAT_SCROLL.with(|cell| cell.set(value)); + return; + } + #[cfg(not(test))] + { + LAST_RESOLVED_CHAT_SCROLL.store(value, Ordering::Relaxed); + } +} + pub(super) fn hash_text_for_cache(text: &str) -> u64 { let mut hasher = DefaultHasher::new(); text.hash(&mut hasher); @@ -935,6 +1000,7 @@ struct FullPrepCacheKey { streaming_text_hash: u64, batch_progress_hash: u64, reasoning_trace_hash: u64, + inline_images_signature: (usize, u64), } #[derive(Clone)] @@ -1169,6 +1235,8 @@ pub(crate) fn clear_test_render_state_for_tests() { set_last_max_scroll(0); set_pinned_pane_total_lines(0); set_last_diff_pane_effective_scroll(0); + set_last_total_wrapped_lines(0); + set_last_resolved_chat_scroll(0); update_user_prompt_positions(&[]); TEST_LAST_LAYOUT.with(|snapshot| { *snapshot.borrow_mut() = None; @@ -2084,14 +2152,15 @@ fn draw_inner(frame: &mut Frame, app: &dyn TuiState) { let pane_position = app.diagram_pane_position(); let has_side_panel_content = app.side_panel().focused_page().is_some(); let diff_mode = app.diff_mode(); - let pin_images = app.pin_images(); let 
collect_diffs = diff_mode.is_pinned(); - let has_pinned_content = if collect_diffs || pin_images { + // Images now render inline in the transcript, so the side panel only handles + // pinned file diffs. `pin_images` no longer feeds the side-panel surface. + let has_pinned_content = if collect_diffs { collect_pinned_content_cached( app.display_messages(), &app.side_pane_images(), collect_diffs, - pin_images, + false, app.display_messages_version(), ) } else { diff --git a/crates/jcode-tui/src/tui/ui_inline_image.rs b/crates/jcode-tui/src/tui/ui_inline_image.rs new file mode 100644 index 0000000000..b865233db8 --- /dev/null +++ b/crates/jcode-tui/src/tui/ui_inline_image.rs @@ -0,0 +1,319 @@ +//! Inline image transcript section. +//! +//! Images attached to the conversation (pasted screenshots, `read` of an image +//! file, generated images) render directly in the chat flow, sized to fit the +//! chat width with a capped height. This replaces the old "pinned image side +//! panel" surface. +//! +//! Design goals: +//! * **Lazy.** Prepare only needs each image's `(id, width, height)`, obtained +//! from a cheap header parse (no full decode, no disk write, no retained +//! bytes). The full decode + terminal transmit happens at draw time, and only +//! for images currently on screen. +//! * **Single source of pixels.** The base64 payloads stay in their existing +//! owner (`App::side_pane_images()`); this section keeps only ids and a small +//! ingest-time payload registry so the draw step can materialize on demand. +//! * **Correct fit.** Images scale to fit width (preserving aspect) and cap at a +//! fraction of the viewport so a tall screenshot never buries the transcript. 
use crate::tui::mermaid;
use jcode_tui_messages::{ImageRegion, ImageRegionRender, PreparedMessages};
use ratatui::style::{Modifier, Style};
use ratatui::text::{Line, Span};
use std::sync::{LazyLock, Mutex};

/// Ceiling division of `value` by `divisor`.
///
/// A zero `divisor` is treated as 1, so this never divides by zero.
#[inline]
fn div_ceil_u32(value: u32, divisor: u32) -> u32 {
    let divisor = divisor.max(1);
    value.div_ceil(divisor)
}

/// One image to render inline, resolved from a `RenderedImage`.
pub(crate) struct InlineImageItem {
    /// Image id as returned by `mermaid::inline_image_dims`; it is also the
    /// payload-registry key and the `hash` of the emitted `ImageRegion`.
    pub id: u64,
    /// Image width as reported by the header parse (treated as pixels when
    /// computing the fitted row count).
    pub width: u32,
    /// Image height as reported by the header parse (treated as pixels when
    /// computing the fitted row count).
    pub height: u32,
    /// Text shown on the dim label line above the image: the image's own
    /// label, or its media type when it has none.
    pub label: String,
}

/// Default font cell size when the terminal has not reported one yet.
const DEFAULT_CELL: (u16, u16) = (8, 16);
/// Cap an inline image at this fraction of the chat viewport height so a tall
/// image cannot push the rest of the transcript off-screen.
const MAX_VIEWPORT_FRACTION_PERCENT: u16 = 55;
/// Never shrink an inline image below this many rows.
const MIN_IMAGE_ROWS: u16 = 3;

/// Ingest-time registry mapping image id -> (media_type, base64) so the draw
/// step can materialize bytes without threading payloads through the cached
/// prepared-frame model. Bounded by entry count; note that each entry holds an
/// owned copy of the media type and of the full base64 payload.
+static PAYLOAD_REGISTRY: LazyLock<Mutex<PayloadRegistry>> = + LazyLock::new(|| Mutex::new(PayloadRegistry::new())); + +const PAYLOAD_REGISTRY_MAX: usize = 512; + +struct PayloadRegistry { + map: std::collections::HashMap<u64, (String, String)>, + order: std::collections::VecDeque<u64>, +} + +impl PayloadRegistry { + fn new() -> Self { + Self { + map: std::collections::HashMap::new(), + order: std::collections::VecDeque::new(), + } + } + + fn insert(&mut self, id: u64, media_type: String, data_b64: String) { + if self.map.contains_key(&id) { + return; + } + self.map.insert(id, (media_type, data_b64)); + self.order.push_back(id); + while self.order.len() > PAYLOAD_REGISTRY_MAX { + if let Some(old) = self.order.pop_front() { + self.map.remove(&old); + } + } + } + + fn get(&self, id: u64) -> Option<(String, String)> { + self.map.get(&id).cloned() + } +} + +/// Record an image payload so [`materialize_visible`] can decode it on demand. +pub(crate) fn register_payload(id: u64, media_type: &str, data_b64: &str) { + if let Ok(mut reg) = PAYLOAD_REGISTRY.lock() { + reg.insert(id, media_type.to_string(), data_b64.to_string()); + } +} + +/// Ensure the image with `id` is materialized (decoded + cached) so it can be +/// drawn. Returns true on success. Cheap and idempotent on repeat. +pub(crate) fn materialize_visible(id: u64) -> bool { + if let Some((media_type, data_b64)) = PAYLOAD_REGISTRY + .lock() + .ok() + .and_then(|reg| reg.get(id)) + { + return mermaid::materialize_inline_image(&media_type, &data_b64).is_some(); + } + false +} + +/// Resolve the app's rendered images into lazily-sized inline items. Performs +/// only header-level work (no full decode) and registers each payload for the +/// later draw-time materialize. 
+pub(crate) fn resolve_items(images: &[crate::session::RenderedImage]) -> Vec<InlineImageItem> { + let mut items = Vec::new(); + for image in images { + let Some((id, width, height)) = + mermaid::inline_image_dims(&image.media_type, &image.data) + else { + continue; + }; + register_payload(id, &image.media_type, &image.data); + let label = image + .label + .clone() + .unwrap_or_else(|| image.media_type.clone()); + items.push(InlineImageItem { + id, + width, + height, + label, + }); + } + items +} + +/// Compute how many rows an inline image should occupy at `chat_width`, given a +/// viewport height to cap against. +fn fit_rows(width: u32, height: u32, chat_width: u16, viewport_height: u16) -> u16 { + if width == 0 || height == 0 { + return MIN_IMAGE_ROWS; + } + let (cell_w, cell_h) = mermaid::get_font_size().unwrap_or(DEFAULT_CELL); + let cell_w = cell_w.max(1) as u32; + let cell_h = cell_h.max(1) as u32; + + // Available width in pixels (leave 1 cell for the left border bar). + let avail_cells = chat_width.saturating_sub(1).max(1) as u32; + let avail_px = avail_cells * cell_w; + + // Native pixel height, unless the image is wider than the pane, in which + // case it scales down to fit width (preserving aspect ratio). + let scaled_h_px = if width <= avail_px { + height + } else { + height.saturating_mul(avail_px) / width.max(1) + }; + + let rows = div_ceil_u32(scaled_h_px.max(1), cell_h).max(MIN_IMAGE_ROWS as u32) as u16; + + // Cap to a fraction of the viewport so tall images stay manageable. + let cap = ((viewport_height as u32 * MAX_VIEWPORT_FRACTION_PERCENT as u32) / 100) + .max(MIN_IMAGE_ROWS as u32) as u16; + rows.min(cap.max(MIN_IMAGE_ROWS)) +} + +/// Build the inline-images prepared section: a heading + correctly-sized +/// placeholder per image, with explicit `image_regions` (render = Fit) that the +/// viewport draws lazily. 
+pub(crate) fn build_section( + items: &[InlineImageItem], + width: u16, + viewport_height: u16, + prefix_blank: bool, +) -> PreparedMessages { + use std::sync::Arc; + + let mut lines: Vec<Line<'static>> = Vec::new(); + let mut image_regions: Vec<ImageRegion> = Vec::new(); + + if items.is_empty() { + return empty(); + } + + if prefix_blank { + lines.push(Line::from("")); + } + + for item in items { + // Label line (dim), e.g. "🖼 screenshot.png 1920×1080". + let dims = format!("{}×{}", item.width, item.height); + let label = if item.label.is_empty() { + dims.clone() + } else { + format!("{} {}", item.label, dims) + }; + lines.push(Line::from(vec![ + Span::styled( + " 🖼 ", + Style::default().add_modifier(Modifier::DIM), + ), + Span::styled(label, Style::default().add_modifier(Modifier::DIM)), + ])); + + let rows = fit_rows(item.width, item.height, width, viewport_height); + let region_start = lines.len(); + for _ in 0..rows { + lines.push(Line::from("")); + } + image_regions.push(ImageRegion { + abs_line_idx: region_start, + end_line: region_start + rows as usize, + hash: item.id, + height: rows, + render: ImageRegionRender::Fit, + }); + // Trailing spacer between images. 
+ lines.push(Line::from("")); + } + + let line_count = lines.len(); + let plain: Vec<String> = lines.iter().map(jcode_tui_render::line_plain_text).collect(); + + PreparedMessages { + wrapped_lines: lines, + wrapped_plain_lines: Arc::new(plain), + wrapped_copy_offsets: Arc::new(vec![0; line_count]), + raw_plain_lines: Arc::new(Vec::new()), + wrapped_line_map: Arc::new(Vec::new()), + wrapped_user_indices: Vec::new(), + wrapped_user_prompt_starts: Vec::new(), + wrapped_user_prompt_ends: Vec::new(), + user_prompt_texts: Vec::new(), + image_regions, + edit_tool_ranges: Vec::new(), + copy_targets: Vec::new(), + } +} + +fn empty() -> PreparedMessages { + use std::sync::Arc; + PreparedMessages { + wrapped_lines: Vec::new(), + wrapped_plain_lines: Arc::new(Vec::new()), + wrapped_copy_offsets: Arc::new(Vec::new()), + raw_plain_lines: Arc::new(Vec::new()), + wrapped_line_map: Arc::new(Vec::new()), + wrapped_user_indices: Vec::new(), + wrapped_user_prompt_starts: Vec::new(), + wrapped_user_prompt_ends: Vec::new(), + user_prompt_texts: Vec::new(), + image_regions: Vec::new(), + edit_tool_ranges: Vec::new(), + copy_targets: Vec::new(), + } +} + +#[cfg(test)] +mod tests { + use super::*; + + fn item(width: u32, height: u32) -> InlineImageItem { + InlineImageItem { + id: 0xABCD, + width, + height, + label: "test.png".to_string(), + } + } + + #[test] + fn fit_rows_caps_tall_image_to_viewport_fraction() { + // A very tall image must be capped so it cannot bury the transcript. 
+ let rows = fit_rows(100, 100_000, 80, 40); + let cap = ((40u32 * MAX_VIEWPORT_FRACTION_PERCENT as u32) / 100) as u16; + assert!(rows <= cap, "rows {rows} should be <= cap {cap}"); + assert!(rows >= MIN_IMAGE_ROWS); + } + + #[test] + fn fit_rows_never_below_minimum() { + let rows = fit_rows(10, 10, 80, 40); + assert!(rows >= MIN_IMAGE_ROWS); + } + + #[test] + fn build_section_emits_one_fit_region_per_image_with_label() { + let items = vec![item(600, 400), item(800, 600)]; + let section = build_section(&items, 80, 40, true); + assert_eq!(section.image_regions.len(), 2); + for region in &section.image_regions { + assert_eq!(region.render, ImageRegionRender::Fit); + assert_eq!(region.hash, 0xABCD); + // The region must point at blank placeholder lines, never the label. + let first = &section.wrapped_lines[region.abs_line_idx]; + assert!( + jcode_tui_render::line_plain_text(first).trim().is_empty(), + "region should start on a blank placeholder line" + ); + // Region height must match its line span. + assert_eq!( + region.end_line - region.abs_line_idx, + region.height as usize + ); + } + // A dim label line precedes the first region.
+ let label_line = jcode_tui_render::line_plain_text(&section.wrapped_lines[1]); + assert!(label_line.contains("test.png"), "label missing: {label_line:?}"); + } + + #[test] + fn build_section_is_empty_for_no_items() { + let section = build_section(&[], 80, 40, false); + assert!(section.wrapped_lines.is_empty()); + assert!(section.image_regions.is_empty()); + } + + #[test] + fn payload_registry_roundtrips() { + register_payload(0xDEAD_BEEF, "image/png", "AAAA"); + let got = PAYLOAD_REGISTRY.lock().unwrap().get(0xDEAD_BEEF); + assert_eq!(got, Some(("image/png".to_string(), "AAAA".to_string()))); + } +} diff --git a/crates/jcode-tui/src/tui/ui_prepare.rs b/crates/jcode-tui/src/tui/ui_prepare.rs index 78de00a07c..65af0587ac 100644 --- a/crates/jcode-tui/src/tui/ui_prepare.rs +++ b/crates/jcode-tui/src/tui/ui_prepare.rs @@ -234,6 +234,7 @@ fn compute_image_regions(wrapped_lines: &[ratatui::text::Line<'static>]) -> Vec< end_line: idx + height as usize, hash, height, + render: jcode_tui_messages::ImageRegionRender::Crop, }); } } @@ -516,6 +517,7 @@ pub(super) fn prepare_messages( streaming_text_hash: super::hash_text_for_cache(app.streaming_text()), batch_progress_hash: active_batch_progress_hash(app), reasoning_trace_hash: reasoning_trace_hash(app), + inline_images_signature: app.side_pane_images_signature(), }; super::note_full_prep_request(); @@ -565,6 +567,26 @@ fn prepare_messages_inner(app: &dyn TuiState, width: u16, height: u16) -> Prepar let body_prepared = prepare_body_cached(app, width); let body_ms = body_start.elapsed().as_secs_f64() * 1000.0; + // Inline images render in the transcript flow just below the body. Sized + // lazily (header-only) so a session with many images never decodes ones + // that are off-screen.
+ let inline_images_prepared = if app.pin_images() { + let items = super::inline_image_ui::resolve_items(&app.side_pane_images()); + if items.is_empty() { + Arc::new(empty_prepared_messages()) + } else { + let prefix_blank = !body_prepared.wrapped_lines.is_empty(); + Arc::new(super::inline_image_ui::build_section( + &items, + width, + height, + prefix_blank, + )) + } + } else { + Arc::new(empty_prepared_messages()) + }; + let batch_start = Instant::now(); let has_batch_progress = active_batch_progress(app).is_some(); let batch_prefix_blank = has_batch_progress && !body_prepared.wrapped_lines.is_empty(); @@ -705,6 +727,7 @@ fn prepare_messages_inner(app: &dyn TuiState, width: u16, height: u16) -> Prepar let frame = PreparedChatFrame::from_sections(vec![ (PreparedSectionKind::Header, header_prepared), (PreparedSectionKind::Body, body_prepared), + (PreparedSectionKind::InlineImages, inline_images_prepared), (PreparedSectionKind::BatchProgress, batch_progress_prepared), (PreparedSectionKind::Reasoning, reasoning_prepared), (PreparedSectionKind::Streaming, streaming_prepared), diff --git a/crates/jcode-tui/src/tui/ui_tests/basic/body_cache.rs b/crates/jcode-tui/src/tui/ui_tests/basic/body_cache.rs index a7bef4dec1..75ae5227e7 100644 --- a/crates/jcode-tui/src/tui/ui_tests/basic/body_cache.rs +++ b/crates/jcode-tui/src/tui/ui_tests/basic/body_cache.rs @@ -307,6 +307,7 @@ fn test_full_prep_cache_state_keeps_multiple_width_entries() { streaming_text_hash: 0, batch_progress_hash: 0, reasoning_trace_hash: 0, + inline_images_signature: (0, 0), }; let key_b = FullPrepCacheKey { width: 39, @@ -375,6 +376,7 @@ fn test_full_prep_cache_state_evicts_oldest_entries() { streaming_text_hash: 0, batch_progress_hash: 0, reasoning_trace_hash: 0, + inline_images_signature: (0, 0), }; let prepared = make_prepared_chat_frame(Arc::new(PreparedMessages { wrapped_lines: vec![Line::from(format!("{idx}"))], @@ -414,6 +416,7 @@ fn 
test_full_prep_cache_state_accepts_large_single_entry_within_total_budget() { streaming_text_hash: 0, batch_progress_hash: 0, reasoning_trace_hash: 0, + inline_images_signature: (0, 0), }; let prepared = make_prepared_chat_frame_with_content_bytes(3 * 1024 * 1024, "full-large-"); @@ -442,6 +445,7 @@ fn test_full_prep_cache_state_retains_oversized_hot_entry() { streaming_text_hash: 12345, batch_progress_hash: 0, reasoning_trace_hash: 0, + inline_images_signature: (0, 0), }; let prepared = make_oversized_prepared_chat_frame("full-oversized-"); @@ -472,6 +476,7 @@ fn test_full_prep_cache_state_keeps_two_oversized_width_entries_hot() { streaming_text_hash: 12345, batch_progress_hash: 0, reasoning_trace_hash: 0, + inline_images_signature: (0, 0), }; let key_b = FullPrepCacheKey { width: 139, diff --git a/crates/jcode-tui/src/tui/ui_tests/basic/input_layout.rs b/crates/jcode-tui/src/tui/ui_tests/basic/input_layout.rs index 45cd12cbd4..f96afb85e2 100644 --- a/crates/jcode-tui/src/tui/ui_tests/basic/input_layout.rs +++ b/crates/jcode-tui/src/tui/ui_tests/basic/input_layout.rs @@ -227,6 +227,7 @@ fn test_copy_badge_reserves_right_margin_for_info_widgets() { centered: false, right_reliable: Vec::new(), left_reliable: Vec::new(), + ..Default::default() }; let copy_badge_ui = crate::tui::app::CopyBadgeUiState::default(); diff --git a/crates/jcode-tui/src/tui/ui_viewport.rs b/crates/jcode-tui/src/tui/ui_viewport.rs index 6308929080..02e9346051 100644 --- a/crates/jcode-tui/src/tui/ui_viewport.rs +++ b/crates/jcode-tui/src/tui/ui_viewport.rs @@ -284,13 +284,28 @@ pub(super) fn draw_messages( super::set_last_max_scroll(max_scroll); update_user_prompt_positions(wrapped_user_prompt_starts); + // When older compacted history is being loaded in, the app hands us the + // reader's distance-from-bottom instead of an absolute offset. 
Distance from + // the bottom is invariant under a top-side prepend, so resolving it against + // the *current* total keeps the same content under the reader and the load + // is seamless (no jump to the new absolute top). + let anchored_scroll = app + .pending_history_anchor_lines_from_bottom() + .map(|lines_from_bottom| total_lines.saturating_sub(lines_from_bottom).min(max_scroll)); let user_scroll = app.scroll_offset().min(max_scroll); - let scroll = if app.auto_scroll_paused() { + let scroll = if let Some(anchored) = anchored_scroll { + anchored + } else if app.auto_scroll_paused() { user_scroll.min(max_scroll) } else { max_scroll }; + // Publish the resolved geometry so scroll handlers and the anchor-reconcile + // tick can adopt the exact on-screen position after a prepend. + super::set_last_total_wrapped_lines(total_lines); + super::set_last_resolved_chat_scroll(scroll); + let prompt_preview_lines = if crate::config::config().display.prompt_preview && scroll > 0 { compute_prompt_preview_line_count( wrapped_user_prompt_starts, @@ -366,6 +381,14 @@ pub(super) fn draw_messages( right_widths: vec![0; prompt_preview_lines as usize], left_widths: vec![0; prompt_preview_lines as usize], centered: content_margins.centered, + // Bind row `r` of the margin to transcript line `scroll_top + r` so a + // content-anchored info widget rides the transcript while the user scrolls + // instead of churning against a fixed screen row. The prompt-preview band at + // the top is synthetic (not part of the scrolled transcript), so offset by it + // to keep the content rows aligned. While pinned at the bottom (auto-follow), + // widgets stay screen-anchored as before. 
+ scroll_top: scroll.saturating_sub(prompt_preview_lines as usize), + content_anchored: app.auto_scroll_paused(), ..Default::default() }; margins @@ -706,6 +729,13 @@ pub(super) fn draw_messages( let hash = region.hash; let total_height = region.height; let image_end = region.end_line; + let is_fit = region.render == jcode_tui_messages::ImageRegionRender::Fit; + + // Inline raster images are materialized lazily: only decode + cache + // the ones actually on screen this frame. + if is_fit && image_end > scroll && abs_idx < visible_end { + super::inline_image_ui::materialize_visible(hash); + } if image_end > scroll && abs_idx < visible_end { let marker_visible = abs_idx >= scroll && abs_idx < visible_end; @@ -722,14 +752,26 @@ pub(super) fn draw_messages( width: content_area.width, height: render_height, }; - let rows = crate::tui::mermaid::render_image_widget( - hash, - image_area, - frame.buffer_mut(), - centered, - false, - ); - if rows == 0 { + let rows = if is_fit { + // Scale-to-fit with a left border bar, so resizes and + // font-metric mismatches never slice the image. + crate::tui::mermaid::render_image_widget_fit( + hash, + image_area, + frame.buffer_mut(), + centered, + true, + ) + } else { + crate::tui::mermaid::render_image_widget( + hash, + image_area, + frame.buffer_mut(), + centered, + false, + ) + }; + if rows == 0 && !is_fit { frame.render_widget( Paragraph::new(Line::from(Span::styled( "↗ mermaid diagram unavailable", @@ -752,13 +794,25 @@ pub(super) fn draw_messages( width: content_area.width, height: render_height, }; - crate::tui::mermaid::render_image_widget( - hash, - image_area, - frame.buffer_mut(), - centered, - true, - ); + if is_fit { + // Top scrolled off: scale-to-fit into the visible + // portion rather than cropping arbitrarily. 
+ crate::tui::mermaid::render_image_widget_fit( + hash, + image_area, + frame.buffer_mut(), + centered, + true, + ); + } else { + crate::tui::mermaid::render_image_widget( + hash, + image_area, + frame.buffer_mut(), + centered, + true, + ); + } } } } diff --git a/docs/proposals/computer-use-maximal-control.md b/docs/proposals/computer-use-maximal-control.md new file mode 100644 index 0000000000..233f8416b5 --- /dev/null +++ b/docs/proposals/computer-use-maximal-control.md @@ -0,0 +1,229 @@ +# Roadmap: maximal macOS control for the `computer` tool + +Goal: give the agent as much reliable control over macOS as the platform allows, +including **background control that does not disturb what the user is looking at**. + +This builds on the v1 `computer` tool (PR #345): screenshot, coordinate +mouse/keyboard, scroll, AX-tree read, cursor, permission check. + +Everything below is implementable in Rust with crates that are already in the +lockfile or available on crates.io (`accessibility-sys`, `screencapturekit`, +`objc2-app-kit`, `core-graphics`). No Swift/ObjC build step. + +--- + +## The one hard constraint + +macOS has **one HID cursor and one keyboard-focus** shared by the whole login +session. Synthetic *coordinate* input (CGEvent) is therefore always visible: it +moves the real cursor and types into the focused app. + +**Background / not-in-view control must avoid CGEvent** and instead go through: + +1. **Accessibility (AX) actions** - act on a specific element by reference. +2. **Apple Events / scripting** - drive scriptable apps with no UI. +3. **Per-window capture** - "see" a window without raising it. + +True simultaneous "you work + I work independently" needs a **separate display +or login session** (see Tier 4). 
+ +--- + +## Tier 0 - done (v1, PR #345) + +- `screenshot` (main display, point/pixel scale aware) +- `move` / `click` / `double_click` / `right_click` / `drag` / `scroll` +- `type` / `key` (chords) +- `ui` (AX tree read via osascript) +- `cursor`, `check_permissions` + +## Tier 1 - AX semantic actions ← highest leverage for background control + +Read + act on elements by reference, no cursor movement, target app need not be +frontmost. Uses `accessibility-sys` (`AXUIElementPerformAction`, +`AXUIElementSetAttributeValue`, `AXUIElementCopyElementAtPosition`, +`AXUIElementCopyAttributeValue`). + +- `find_element { role?, title?, value?, pid?, app? }` -> stable element handles +- `element_at { x, y }` -> element under a point (AXUIElementCopyElementAtPosition) +- `press { element }` -> `AXPress` (click a button in a background window) +- `set_value { element, value }` -> type into a field without focus +- `get_value { element }` +- `perform_action { element, ax_action }` -> any advertised AX action +- `select_menu { app, path: ["File","Export…"] }` -> drive the menu bar of any app + +Handle format: `pid` + AX path (index chain) or a session-scoped element id cache, +so the model can act structurally instead of by pixels. + +Why it matters: this is the actual "click things you're not looking at" capability. + +## Tier 2 - app / window / system management + +Mostly `objc2-app-kit` (`NSWorkspace`, `NSRunningApplication`) + AX window +attributes + CoreGraphics window list. + +- `list_apps` / `activate_app { app }` / `hide_app` / `quit_app` +- `list_windows { pid? 
}` (CGWindowList) with ids, titles, bounds, on/off-screen +- `focus_window` / `move_window` / `resize_window` / `minimize_window` / `close_window` + (AX window actions - can target background windows) +- `window_screenshot { window_id }` -> capture a specific window even if occluded + (`CGWindowListCreateImage` now, ScreenCaptureKit later) +- `spaces` awareness (which Space an app is on; activating may switch Spaces - visible) + +## Tier 3 - clipboard, input fidelity, observation + +- `get_clipboard` / `set_clipboard { text }` (`NSPasteboard` via objc2-app-kit) +- `key_down` / `key_up` (hold modifiers, game-style input) +- `type_into { element, text }` (AX set value + confirm) for reliability over blind typing +- `wait_for { element|condition, timeout }` using `AXObserver*` notifications + (e.g. wait for a sheet to appear) instead of sleep-and-poll +- `paste_type { text }` - set clipboard + Cmd-V for fast/large text entry + +## Tier 4 - true background / parallel operation (advanced) + +These give genuinely off-screen, non-interfering control. Higher setup cost. + +- **Apple Events scripting bridge**: `run_applescript { script }` / `run_jxa`. + Fully headless for scriptable apps (Mail, Notes, Safari, Finder, Music, System + Settings panes, Terminal, many pro apps). No cursor, no focus. Per-app + Automation permission (prompts on first use). +- **Virtual / headless display**: route the agent's cursor+windows to a second + (virtual) display the user isn't looking at. Options: a virtual display driver + (e.g. BetterDisplay/`CGVirtualDisplay` private API) or a real unused monitor. + Lets the agent move windows there and use coordinate input without touching the + user's screen. +- **Separate login / Screen Sharing session**: a second macOS session has its own + cursor and focus; the agent drives that one. Strongest isolation, most setup. +- **Shortcuts integration**: invoke the user's `Shortcuts` automations + (`shortcuts run …`) as high-level, sanctioned actions. 
+ +## Tier 5 - sensors / extras (optional, opt-in) + +- `ocr { region|window }` via Vision framework (read text in images / non-AX apps). +- `screen_record { seconds }` short clips via ScreenCaptureKit. +- Audio in/out control, notifications, `do_not_disturb` toggling via scripting. +- Camera/mic are separate TCC permissions; keep strictly opt-in. + +--- + +## Permissions (TCC) - the gatekeeping reality + +| Permission | Unlocks | Auto-grantable? | +|---|---|---| +| **Accessibility** | CGEvent input, all AX read/act, window control | No - user toggles once (we can prompt + deep-link) | +| **Screen Recording** | screenshots, window/ocr capture | Request API exists (`CGRequestScreenCaptureAccess`) | +| **Automation (Apple Events)** | scripting each app | Prompts per target app on first send | +| **Input Monitoring** | reading global input stream (only if we add capture) | Request API exists | + +Plan: a `request_permissions` action that calls +`AXIsProcessTrustedWithOptions(prompt=true)` (adds jcode to the list + shows the +dialog) and deep-links to the exact System Settings pane, then polls +`AXIsProcessTrusted()`. One prompt + one toggle; never zero-touch for Accessibility +(Apple's anti-malware boundary). + +Important: the permission attaches to the **host binary/terminal** running jcode. +For a stable experience we likely want a signed jcode.app with a fixed bundle id so +the grant persists across updates (otherwise each new binary path re-prompts). + +## Safety model (high blast radius) + +- Gated like `bash`: refuses early if required permission missing. +- `dry_run` on mutating actions: resolve + report target without acting. +- Prefer AX semantic actions over blind coordinate clicks (auditable, robust). +- Screenshot/element echo on destructive coordinate clicks. +- No global input *capture* unless explicitly enabled (keeps us out of Input + Monitoring by default). +- Per-action audit log; optional allowlist/denylist of target apps. 
+ +## Suggested build order + +1. **Tier 1 (AX actions)** - biggest capability jump, enables background control. +2. **Tier 2 window mgmt + per-window screenshot** - "see and act on hidden windows". +3. **Tier 3 clipboard + AXObserver waits** - reliability. +4. **`run_applescript`/JXA bridge (Tier 4)** - headless scripting for many apps. +5. **Virtual-display / second-session (Tier 4)** - true parallel, non-interfering. +6. Signed jcode.app bundle for durable permissions. +7. Vision OCR (Tier 5) as needed. + +## Crates + +- `accessibility-sys` 0.2 (AX read/act/observe) - on crates.io +- `screencapturekit` 7 (modern capture) - on crates.io; `core-graphics` window list as fallback +- `objc2-app-kit` / `objc2-foundation` 0.3 - already in lockfile (NSWorkspace, NSPasteboard) +- `core-graphics` 0.23 - already a direct dep (CGEvent, CGWindowList, CGDisplay) + +--- + +## Tool interface design (decided) + +### Single tool, progressive disclosure + +One `computer` tool, `action`-dispatched (like `browser`). To keep always-on +context flat regardless of how many tiers exist, the schema uses **progressive +disclosure**: + +- **Always-on core (~370 tokens, measured with tiktoken cl100k_base):** + `screenshot, ui, ocr, click, type, key, press, set_value, run_applescript, + setup, discover`. +- **`discover { category }`** returns full specs for advanced actions on demand + (`mouse|keyboard|ax|windows|apps|clipboard|scripting|displays|system|all`), + ~130 tokens per category, paid only when used. +- **Shared handle types** (`element`, `window_id`, `region`) defined once and + reused, so params do not multiply with actions. 
+ +Measured always-on cost: + +| Design | Actions visible | Always-on tokens | +|---|---|---| +| Current v1 tool | 12 | ~720 | +| Flat, all tiers (~46 actions) | 46 | ~1,020 | +| **Progressive core** | 11 | **~370** | + +Background control is a property of the *mechanism*, not the tier: CGEvent = +visible; **AX actions (press/set_value/select_menu) + Apple Events = background**. + +### `setup` / `check_permissions` action + +A first-class `setup` action that: + +1. **Reports** status of every requirement: Accessibility (`AXIsProcessTrusted`), + Screen Recording (`CGPreflightScreenCaptureAccess`), Automation (per-app, via + first Apple Event), plus install/bundle health. +2. **Requests** what it can programmatically: + - `AXIsProcessTrustedWithOptions(prompt=true)` — shows the Accessibility dialog + and pre-adds jcode to the list (toggled off). + - `CGRequestScreenCaptureAccess()` — prompts for Screen Recording. + - First Apple Event to a target app — triggers its Automation prompt. +3. **Deep-links** to the exact System Settings pane for anything still missing: + - `x-apple.systempreferences:com.apple.preference.security?Privacy_Accessibility` + - `…?Privacy_ScreenCapture` + - `…?Privacy_Automation` +4. **Polls** `AXIsProcessTrusted()` until granted, then reports "ready". + +**Hard limit:** the Accessibility *toggle itself cannot be flipped by any API* +(Apple anti-malware boundary). `tccutil` can only reset, not grant. So the best +achievable UX is **"one or two prompts + one toggle,"** never zero-touch. + +### Durable permissions: signed app bundle + +TCC permissions attach to the **running binary's identity**. A bare dev/cli binary +changes path/signature across updates, so macOS re-prompts every time. To make a +grant stick: + +- Ship/install jcode as a **signed `.app` bundle with a stable bundle id** + (e.g. `com.jcode.app`) and a Designated Requirement, so the Accessibility / + Screen Recording grant persists across updates. 
+- `setup` should detect "running from an unstable/unsigned path" and offer to + install the proper bundle, so the user grants **once**. + +### Build order (updated) + +1. Progressive-disclosure refactor of the v1 tool (core + `discover`). +2. `setup` action (check + request + deep-link + poll). +3. Tier 1 AX actions (background control). +4. Tier 2 window/app management + per-window screenshot. +5. Tier 3 clipboard + AXObserver waits. +6. `run_applescript`/JXA bridge (Tier 4 headless scripting). +7. Signed app bundle for durable permissions. +8. Tier 5 OCR (Vision). (Camera/audio intentionally excluded.) +9. Virtual-display / second-session for true parallel work (advanced). diff --git a/docs/proposals/computer-use-tool.md b/docs/proposals/computer-use-tool.md new file mode 100644 index 0000000000..e56b586ed1 --- /dev/null +++ b/docs/proposals/computer-use-tool.md @@ -0,0 +1,149 @@ +# Proposal: native `computer` tool for macOS computer use + +## Summary + +Add a single native tool, **`computer`**, that lets the agent observe and control +the macOS GUI — screenshots, the accessibility (AX) tree, mouse/keyboard input, +window/app management, and clipboard — through one `action`-dispatched interface. + +This mirrors the existing **`browser`** tool (`crates/jcode-app-core/src/tool/browser.rs`): +one registered tool, an `action: String` that selects a sub-operation, with optional +typed params. It gives jcode a closed control loop (*see screen → decide → act*) +without depending on a browser or external automation tooling. + +## Motivation + +- The agent can already drive a browser; it cannot drive native macOS apps, system + UI, or anything outside the browser sandbox. +- "Computer use" agents need exactly three primitives: **read the screen**, **read + UI structure**, and **synthesize input**. macOS exposes all three through the + Accessibility / Quartz Event Services / ScreenCaptureKit stack. 
+- A single, well-scoped tool keeps the tool surface small and the permission story + in one place. + +## Architecture + +``` +crates/jcode-macos-control/ (new) cfg(target_os = "macos") platform crate + └─ AX (accessibility-sys), CGEvent (core-graphics), + CoreFoundation (core-foundation), screenshots (ScreenCaptureKit / CGDisplay), + app/window control (objc2 + objc2-app-kit), clipboard (objc2 NSPasteboard) + +crates/jcode-app-core/src/tool/computer.rs (new) ComputerTool + └─ thin dispatch layer: parse input -> call jcode-macos-control -> ToolOutput + └─ registered in crates/jcode-app-core/src/tool/mod.rs base_tools() +``` + +- All native APIs are reached through existing Rust bindings (`objc2`, + `accessibility-sys`, `core-graphics`, `core-foundation`) — **no Swift/ObjC build + step**. +- On non-macOS targets the tool still registers but every action returns a clean + `unsupported on this platform` error, so the tool list stays stable across OSes. +- `screenshot` returns its image via `ToolOutput::with_image` (base64), matching how + `browser` returns screenshots today. + +## Permissions (the important part) + +macOS splits this across **four** TCC permissions. Programmatic *request* support +differs per permission: + +| Permission | Used for | Programmatic request | +|---|---|---| +| **Accessibility** | drive other apps' UI, inject `CGEvent` input | ⚠️ prompt + deep-link only; user must toggle | +| **Screen Recording** | screenshots / `get_ui_tree` of some apps | ✅ `CGRequestScreenCaptureAccess()` | +| **Input Monitoring** | reading the global input stream | ✅ `IOHIDRequestAccess(...)` | +| **Automation** (Apple Events) | scripting cooperating apps | ✅ prompts on first send, per target app | + +**Accessibility is the one that cannot be auto-granted** (Apple's anti-malware +boundary), and it is required for input injection. Best achievable flow, exposed via +the `request_permissions` action: + +1. 
`AXIsProcessTrustedWithOptions([kAXTrustedCheckOptionPrompt: true])` — shows the + system dialog *and auto-adds jcode to the Accessibility list* (toggled off). +2. Deep-link to the exact pane: + `open "x-apple.systempreferences:com.apple.preference.security?Privacy_Accessibility"`. +3. Poll `AXIsProcessTrusted()` until granted, then report ready. + +So the experience becomes **one prompt + one toggle**, not "go hunt in Settings" — +but never zero-touch for Accessibility. + +`check_permissions` reports the current state of all four so the agent can tell the +user precisely what is missing before attempting control. + +## Actions + +`action` (required) selects the operation. Params below are optional and validated +per action. + +**Permissions** +- `check_permissions` → status of accessibility / screen-recording / input-monitoring / automation +- `request_permissions` → prompt + deep-link flow above + +**Observe** +- `screenshot` — `{ display?, window_id?, region? }` → image +- `get_ui_tree` — `{ pid? | frontmost, max_depth? }` → serialized AX tree (role, title, value, position, size, actions) +- `find_element` — `{ role?, title?, value?, pid? }` → matching elements + their identifiers +- `element_at` — `{ x, y }` → element under the point + +**Mouse** +- `move` — `{ x, y }` +- `click` / `double_click` / `right_click` — `{ x?, y? }` (current position if omitted) +- `drag` — `{ from: [x,y], to: [x,y] }` +- `scroll` — `{ x?, y?, dx, dy }` + +**Keyboard** +- `type` — `{ text }` +- `key` — `{ keys: "cmd+shift+4" }` (chord) +- `key_down` / `key_up` — `{ key }` + +**Semantic AX (preferred over raw input when available)** +- `press` — `{ element }` (AXPress) +- `set_value` — `{ element, value }` +- `get_value` — `{ element }` +- `perform_action` — `{ element, ax_action }` +- `select_menu` — `{ app, path: ["File", "Export…"] }` + +**Window / app** +- `list_apps`, `activate_app` `{ app }` +- `list_windows` `{ pid? 
}`, `focus_window` `{ window }` +- `move_window` `{ window, x, y }`, `resize_window` `{ window, w, h }` +- `minimize_window` / `close_window` `{ window }` + +**Clipboard** +- `get_clipboard`, `set_clipboard` `{ text }` + +> Element identifiers: `find_element` / `get_ui_tree` return stable-enough handles +> (e.g. `pid` + AX path or a session-scoped element id) that semantic actions accept, +> so the agent can act structurally instead of by pixel coordinates when possible. + +## Safety + +Computer use is high-blast-radius, so: + +- **Permission-gated** like other powerful tools; refuses early with a clear message + if Accessibility/Screen Recording is missing. +- **`dry_run` param** on mutating actions — resolves and reports the target without + acting. +- **Screenshot-assisted confirmation** for destructive coordinate clicks (return the + region/element being targeted). +- **No global input *capture*** in v1 (we synthesize input but do not log the user's + keystrokes), keeping us out of Input Monitoring unless a future feature needs it. +- Prefer **semantic AX actions** over blind coordinate input wherever the element is + resolvable — more robust and more auditable. + +## Implementation plan + +1. `jcode-macos-control` crate: permissions, screenshot, AX read, AX action, + CGEvent input, window/app control, clipboard. Unit-test the pure parts + (input parsing, chord parsing, tree serialization). +2. `ComputerTool` in `tool/computer.rs`: input struct + `action` dispatch + + schema + description; register `"computer"` in `tool/mod.rs` `base_tools()`. +3. Default-off / gated rollout + docs in `docs/`. +4. Follow-up: Windows/Linux backends behind the same tool surface. + +## Open questions + +- Element handle format — `pid`+AX-path vs an opaque session-scoped id cache? +- Should `request_permissions` block-and-poll, or return immediately with status and + let the agent re-check? +- Default enablement: opt-in flag vs always-registered-but-gated? 
diff --git a/src/cli/commands/provider_setup.rs b/src/cli/commands/provider_setup.rs index 442571b539..ebe1abb2d5 100644 --- a/src/cli/commands/provider_setup.rs +++ b/src/cli/commands/provider_setup.rs @@ -180,6 +180,7 @@ pub(crate) fn configure_provider_profile( context_window: options.context_window, input: Vec::new(), }], + extra_body: None, }; let config_path = Config::path().ok_or_else(|| anyhow::anyhow!("No config path"))?;