From 933637bb1c2c0237e233ae536659aa353d41217d Mon Sep 17 00:00:00 2001 From: Hunter B Date: Thu, 4 Jun 2026 19:48:45 -0700 Subject: [PATCH] feat(search): harvest custom duckduckgo endpoint Add optional [search].base_url support for DuckDuckGo-compatible private search endpoints, including a preferred CODEWHALE_SEARCH_BASE_URL env override and the legacy DEEPSEEK_SEARCH_BASE_URL alias. Network policy now gates the configured endpoint host, custom endpoints do not fall back to public Bing, non-DuckDuckGo provider/base_url combinations and challenge pages return explicit errors, and custom endpoint results report the configured host as their source. Fixes #2436 Reported by @Artenx Harvested from PR #2510 by @cyq1017 Co-authored-by: cyq1017 <61975706+cyq1017@users.noreply.github.com> --- CHANGELOG.md | 11 +- config.example.toml | 3 + crates/tui/CHANGELOG.md | 11 +- crates/tui/src/config.rs | 85 ++++++++++++ crates/tui/src/core/engine.rs | 4 + crates/tui/src/main.rs | 3 + crates/tui/src/runtime_threads.rs | 1 + crates/tui/src/tools/spec.rs | 5 + crates/tui/src/tools/web_search.rs | 208 ++++++++++++++++++++++++++--- crates/tui/src/tui/ui.rs | 1 + docs/CONFIGURATION.md | 8 ++ docs/V0_9_0_EXECUTION_MAP.md | 6 +- 12 files changed, 323 insertions(+), 23 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 04805cf38..d03fc62c9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -41,6 +41,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 were previously unpriced: `mimo-v2.5-pro` / `xiaomi/mimo-v2.5-pro` reuse the DeepSeek V4-Pro rate table and `mimo-v2.5` / `xiaomi/mimo-v2.5` reuse the DeepSeek V4-Flash rates. Existing DeepSeek pricing is unchanged (#2731, #2750). +- Added optional `[search].base_url` / `CODEWHALE_SEARCH_BASE_URL` support for + DuckDuckGo-compatible private search endpoints, while keeping + `DEEPSEEK_SEARCH_BASE_URL` as a legacy alias. 
Custom endpoints are gated by + their configured host, do not fall back to public Bing, and report the custom + host as the result source for diagnostics (#2436, #2510). ### Changed @@ -154,8 +159,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Community Thanks to **@sximelon** for reporting and fixing the saved-session resume -footer hint (#2758, #2760), **@cyq1017** for the restore-listing implementation -(#2513) and pending-input delivery-mode label work (#2532, #2054), +footer hint (#2758, #2760), **@cyq1017** for the custom +DuckDuckGo-compatible search endpoint, restore-listing implementation, and +pending-input delivery-mode label work (#2510, #2513, #2532, #2054), +**@Artenx** for the private-search endpoint report (#2436), **@wywsoor** for the broader macOS/iTerm rollback UX report (#2494), **@HUQIANTAO** for the `web_run` lock-splitting work (#2502), turn-metadata prefix-cache stability work (#2517), and project-context cache direction diff --git a/config.example.toml b/config.example.toml index b53435359..f4b8bd791 100644 --- a/config.example.toml +++ b/config.example.toml @@ -409,6 +409,7 @@ max_subagents = 10 # optional (1-20) # # baidu: 百度 AI Search via qianfan.baidubce.com,需 api_key # # volcengine: 火山引擎 Ark web_search (免费 2 万次/月), 需 api_key # # 也回退到 VOLCENGINE_API_KEY / VOLCENGINE_ARK_API_KEY / ARK_API_KEY 环境变量 +# base_url = "https://search.example/html/" # optional DuckDuckGo-compatible HTML endpoint # api_key = "YOUR_SEARCH_KEY" # required for tavily, bocha, and baidu; optional for metaso # # WARNING: treat config.toml like a secret file when # # storing API keys. Prefer env vars for local smoke tests. 
@@ -416,6 +417,8 @@ max_subagents = 10 # optional (1-20) # Env-var overrides: # DEEPSEEK_SEARCH_PROVIDER → search.provider # DEEPSEEK_SEARCH_API_KEY → search.api_key +# CODEWHALE_SEARCH_BASE_URL → search.base_url +# DEEPSEEK_SEARCH_BASE_URL → search.base_url (legacy alias) # METASO_API_KEY → metaso key fallback # BAIDU_SEARCH_API_KEY → baidu key fallback diff --git a/crates/tui/CHANGELOG.md b/crates/tui/CHANGELOG.md index 04805cf38..d03fc62c9 100644 --- a/crates/tui/CHANGELOG.md +++ b/crates/tui/CHANGELOG.md @@ -41,6 +41,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 were previously unpriced: `mimo-v2.5-pro` / `xiaomi/mimo-v2.5-pro` reuse the DeepSeek V4-Pro rate table and `mimo-v2.5` / `xiaomi/mimo-v2.5` reuse the DeepSeek V4-Flash rates. Existing DeepSeek pricing is unchanged (#2731, #2750). +- Added optional `[search].base_url` / `CODEWHALE_SEARCH_BASE_URL` support for + DuckDuckGo-compatible private search endpoints, while keeping + `DEEPSEEK_SEARCH_BASE_URL` as a legacy alias. Custom endpoints are gated by + their configured host, do not fall back to public Bing, and report the custom + host as the result source for diagnostics (#2436, #2510). 
### Changed @@ -154,8 +159,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Community Thanks to **@sximelon** for reporting and fixing the saved-session resume -footer hint (#2758, #2760), **@cyq1017** for the restore-listing implementation -(#2513) and pending-input delivery-mode label work (#2532, #2054), +footer hint (#2758, #2760), **@cyq1017** for the custom +DuckDuckGo-compatible search endpoint, restore-listing implementation, and +pending-input delivery-mode label work (#2510, #2513, #2532, #2054), +**@Artenx** for the private-search endpoint report (#2436), **@wywsoor** for the broader macOS/iTerm rollback UX report (#2494), **@HUQIANTAO** for the `web_run` lock-splitting work (#2502), turn-metadata prefix-cache stability work (#2517), and project-context cache direction diff --git a/crates/tui/src/config.rs b/crates/tui/src/config.rs index 3b3bf8623..184b3da98 100644 --- a/crates/tui/src/config.rs +++ b/crates/tui/src/config.rs @@ -1116,6 +1116,11 @@ pub struct SearchConfig { /// Search provider: `bing` | `duckduckgo` | `tavily` | `bocha` | `metaso` | `baidu` | `volcengine`. Default: `duckduckgo`. #[serde(default)] pub provider: Option<SearchProvider>, + /// Optional DuckDuckGo-compatible HTML endpoint. When set with the + /// DuckDuckGo provider, `web_search` appends the `q` query parameter to + /// this URL instead of using `https://html.duckduckgo.com/html/`. + #[serde(default)] + pub base_url: Option<String>, /// API key for Tavily, Bocha, Metaso, Baidu, or Volcengine. Not required for Bing or DuckDuckGo. /// Metaso also falls back to `METASO_API_KEY` env var, then a built-in default. /// Baidu also falls back to `BAIDU_SEARCH_API_KEY` env var. 
@@ -3803,6 +3808,12 @@ fn apply_env_overrides(config: &mut Config) { .get_or_insert_with(SearchConfig::default) .api_key = Some(value); } + if let Ok(value) = codewhale_env_var("CODEWHALE_SEARCH_BASE_URL", "DEEPSEEK_SEARCH_BASE_URL") { + config + .search + .get_or_insert_with(SearchConfig::default) + .base_url = Some(value); + } if let Ok(value) = std::env::var("DEEPSEEK_REQUIREMENTS_PATH") { config.requirements_path = Some(value); } @@ -5524,6 +5535,25 @@ mod tests { ); } + #[test] + fn search_config_preserves_custom_base_url() { + let config: Config = toml::from_str( + r#" + [search] + provider = "duckduckgo" + base_url = "https://search.internal.example/html/" + "#, + ) + .expect("search config"); + + let search = config.search.expect("search table"); + assert_eq!(search.provider, Some(SearchProvider::DuckDuckGo)); + assert_eq!( + search.base_url.as_deref(), + Some("https://search.internal.example/html/") + ); + } + #[test] fn explicit_baidu_search_provider_is_preserved() { let config: Config = toml::from_str( @@ -5667,6 +5697,61 @@ mod tests { ); } + #[test] + fn apply_env_overrides_sets_search_base_url() { + let _guard = lock_test_env(); + let prev_codewhale = env::var_os("CODEWHALE_SEARCH_BASE_URL"); + let prev_deepseek = env::var_os("DEEPSEEK_SEARCH_BASE_URL"); + unsafe { + env::remove_var("CODEWHALE_SEARCH_BASE_URL"); + env::set_var( + "DEEPSEEK_SEARCH_BASE_URL", + "https://search.internal.example/html/", + ) + }; + let mut config = Config::default(); + + apply_env_overrides(&mut config); + + unsafe { + EnvGuard::restore_var("CODEWHALE_SEARCH_BASE_URL", prev_codewhale); + EnvGuard::restore_var("DEEPSEEK_SEARCH_BASE_URL", prev_deepseek); + } + assert_eq!( + config.search.and_then(|search| search.base_url), + Some("https://search.internal.example/html/".to_string()) + ); + } + + #[test] + fn codewhale_search_base_url_env_wins_over_legacy_alias() { + let _guard = lock_test_env(); + let prev_codewhale = env::var_os("CODEWHALE_SEARCH_BASE_URL"); + let 
prev_deepseek = env::var_os("DEEPSEEK_SEARCH_BASE_URL"); + unsafe { + env::set_var( + "CODEWHALE_SEARCH_BASE_URL", + "https://codewhale-search.example/html/", + ); + env::set_var( + "DEEPSEEK_SEARCH_BASE_URL", + "https://legacy-search.example/html/", + ); + } + let mut config = Config::default(); + + apply_env_overrides(&mut config); + + unsafe { + EnvGuard::restore_var("CODEWHALE_SEARCH_BASE_URL", prev_codewhale); + EnvGuard::restore_var("DEEPSEEK_SEARCH_BASE_URL", prev_deepseek); + } + assert_eq!( + config.search.and_then(|search| search.base_url), + Some("https://codewhale-search.example/html/".to_string()) + ); + } + #[test] fn search_provider_resolution_ignores_invalid_env_override() { let _guard = lock_test_env(); diff --git a/crates/tui/src/core/engine.rs b/crates/tui/src/core/engine.rs index 27960fe85..ff7552a66 100644 --- a/crates/tui/src/core/engine.rs +++ b/crates/tui/src/core/engine.rs @@ -344,6 +344,8 @@ pub struct EngineConfig { /// Metaso also falls back to `METASO_API_KEY` env var, then a built-in key. /// Baidu also falls back to `BAIDU_SEARCH_API_KEY`. pub search_api_key: Option<String>, + /// Optional DuckDuckGo-compatible HTML endpoint override. + pub search_base_url: Option<String>, /// Per-step DeepSeek API timeout for sub-agent `create_message` requests. /// Resolved from `[subagents] api_timeout_secs` (clamped to 1..=1800) /// once at engine construction, then threaded onto every @@ -408,6 +410,7 @@ impl Default for EngineConfig { workshop: None, search_provider: crate::config::SearchProvider::default(), search_api_key: None, + search_base_url: None, subagent_api_timeout: Duration::from_secs( crate::config::DEFAULT_SUBAGENT_API_TIMEOUT_SECS, ), @@ -2251,6 +2254,7 @@ In {new} mode: {policy}\n\n\ // Wire search provider config. 
ctx.search_provider = self.config.search_provider; ctx.search_api_key = self.config.search_api_key.clone(); + ctx.search_base_url = self.config.search_base_url.clone(); let policy = sandbox_policy_for_mode(mode, &self.session.workspace); let mut ctx = ctx.with_elevated_sandbox_policy(policy); diff --git a/crates/tui/src/main.rs b/crates/tui/src/main.rs index cb06f5a77..2e238c193 100644 --- a/crates/tui/src/main.rs +++ b/crates/tui/src/main.rs @@ -5745,6 +5745,7 @@ async fn run_exec_agent( workshop: config.workshop.clone(), search_provider: config.search_provider(), search_api_key: config.search.as_ref().and_then(|s| s.api_key.clone()), + search_base_url: config.search.as_ref().and_then(|s| s.base_url.clone()), tools_always_load: config.tools_always_load(), tools: config.tools.clone(), }; @@ -6317,6 +6318,7 @@ mod doctor_endpoint_tests { let config = Config { search: Some(crate::config::SearchConfig { provider: Some(crate::config::SearchProvider::DuckDuckGo), + base_url: None, api_key: None, }), ..Default::default() @@ -6356,6 +6358,7 @@ mod doctor_endpoint_tests { let config = Config { search: Some(crate::config::SearchConfig { provider: Some(crate::config::SearchProvider::Bing), + base_url: None, api_key: None, }), ..Default::default() diff --git a/crates/tui/src/runtime_threads.rs b/crates/tui/src/runtime_threads.rs index 69e12e15d..48bf3e44e 100644 --- a/crates/tui/src/runtime_threads.rs +++ b/crates/tui/src/runtime_threads.rs @@ -2083,6 +2083,7 @@ impl RuntimeThreadManager { workshop: self.config.workshop.clone(), search_provider: self.config.search_provider(), search_api_key: self.config.search.as_ref().and_then(|s| s.api_key.clone()), + search_base_url: self.config.search.as_ref().and_then(|s| s.base_url.clone()), tools_always_load: self.config.tools_always_load(), tools: self.config.tools.clone(), }; diff --git a/crates/tui/src/tools/spec.rs b/crates/tui/src/tools/spec.rs index 52553cdfb..63ac165b1 100644 --- a/crates/tui/src/tools/spec.rs +++ 
b/crates/tui/src/tools/spec.rs @@ -169,6 +169,8 @@ pub struct ToolContext { /// Metaso also falls back to `METASO_API_KEY` env var, then a built-in key. /// Baidu also falls back to `BAIDU_SEARCH_API_KEY`. pub search_api_key: Option<String>, + /// Optional DuckDuckGo-compatible HTML endpoint override for `web_search`. + pub search_base_url: Option<String>, /// Per-session workshop variable store (#548). Holds the raw content of /// the most recent large-tool routing event so the parent can call @@ -210,6 +212,7 @@ impl ToolContext { large_output_router: None, search_provider: crate::config::SearchProvider::default(), search_api_key: None, + search_base_url: None, workshop_vars: None, } } @@ -247,6 +250,7 @@ impl ToolContext { large_output_router: None, search_provider: crate::config::SearchProvider::default(), search_api_key: None, + search_base_url: None, workshop_vars: None, } } @@ -284,6 +288,7 @@ impl ToolContext { large_output_router: None, search_provider: crate::config::SearchProvider::default(), search_api_key: None, + search_base_url: None, workshop_vars: None, } } diff --git a/crates/tui/src/tools/web_search.rs b/crates/tui/src/tools/web_search.rs index 5984d7916..cc33276ce 100644 --- a/crates/tui/src/tools/web_search.rs +++ b/crates/tui/src/tools/web_search.rs @@ -7,6 +7,7 @@ //! //! Set `[search]` in config.toml to switch providers: //! provider = "duckduckgo" # or tavily/bocha/metaso/baidu/volcengine +//! base_url = "https://search.example/html/" # optional DDG-compatible URL //! api_key = "tvly-..." 
use super::spec::{ @@ -22,7 +23,7 @@ use serde_json::{Value, json}; use std::sync::OnceLock; use std::time::Duration; -const DUCKDUCKGO_HOST: &str = "html.duckduckgo.com"; +const DUCKDUCKGO_ENDPOINT: &str = "https://html.duckduckgo.com/html/"; const BING_HOST: &str = "www.bing.com"; const TAVILY_ENDPOINT: &str = "https://api.tavily.com/search"; const BOCHA_ENDPOINT: &str = "https://api.bochaai.com/v1/ai/search"; @@ -139,7 +140,7 @@ impl ToolSpec for WebSearchTool { } fn description(&self) -> &'static str { - "Search the web and return ranked results with URLs and snippets. Default backend is DuckDuckGo with Bing fallback; set `[search] provider = \"bing\" | \"tavily\" | \"bocha\" | \"metaso\" | \"baidu\"` in config.toml to switch backends. Use this instead of scraping search engines with `curl` in `exec_shell`. For a known canonical URL, prefer `fetch_url` directly." + "Search the web and return ranked results with URLs and snippets. Default backend is DuckDuckGo with Bing fallback; set `[search] provider = \"bing\" | \"tavily\" | \"bocha\" | \"metaso\" | \"baidu\"` in config.toml to switch backends, or `[search] base_url` for a DuckDuckGo-compatible endpoint. Use this instead of scraping search engines with `curl` in `exec_shell`. For a known canonical URL, prefer `fetch_url` directly." 
} fn input_schema(&self) -> Value { @@ -200,6 +201,15 @@ impl ToolSpec for WebSearchTool { let max_results = max_results.clamp(1, MAX_RESULTS); let timeout_ms = optional_u64(&input, "timeout_ms", DEFAULT_TIMEOUT_MS).min(60_000); + if configured_search_base_url(context.search_base_url.as_deref()).is_some() + && !matches!(context.search_provider, SearchProvider::DuckDuckGo) + { + return Err(ToolError::invalid_input(format!( + "[search].base_url is only supported with provider = \"duckduckgo\"; current provider is \"{}\"", + context.search_provider.as_str() + ))); + } + // Dispatch to the configured API-backed search providers before // building the HTML-scraping client used by Bing/DuckDuckGo. match context.search_provider { @@ -265,13 +275,16 @@ impl ToolSpec for WebSearchTool { } // Per-domain network policy gate (#135). The "host" for web search is - // the upstream search engine domain — DuckDuckGo first, Bing on - // fallback. We gate DuckDuckGo here; Bing is gated separately inside - // the fallback path so a deny on one engine doesn't block the other. - check_policy(decider, DUCKDUCKGO_HOST)?; + // the upstream search engine domain — DuckDuckGo-compatible first, + // Bing on fallback. We gate the configured endpoint here; Bing is + // gated separately inside the fallback path so a deny on one engine + // doesn't silently allow the other. 
+ let (url, duckduckgo_host) = + duckduckgo_search_url(context.search_base_url.as_deref(), &query)?; + let allow_bing_fallback = + duckduckgo_allows_bing_fallback(context.search_base_url.as_deref()); + check_policy(decider, &duckduckgo_host)?; - let encoded = url_encode(&query); - let url = format!("https://html.duckduckgo.com/html/?q={encoded}"); let resp = client .get(&url) .header( @@ -297,7 +310,11 @@ impl ToolSpec for WebSearchTool { } let mut results = parse_duckduckgo_results(&body, max_results); - let mut source = "duckduckgo"; + let mut source = if allow_bing_fallback { + "duckduckgo".to_string() + } else { + duckduckgo_host.clone() + }; let mut message_suffix: Option<&str> = None; // When Bing returned zero and we fell through to DuckDuckGo, surface @@ -306,15 +323,21 @@ impl ToolSpec for WebSearchTool { message_suffix = Some("Bing returned no results; used DuckDuckGo fallback"); } - if results.is_empty() { - let duckduckgo_blocked = is_duckduckgo_challenge(&body); + let duckduckgo_blocked = is_duckduckgo_challenge(&body); + if results.is_empty() && duckduckgo_blocked && !allow_bing_fallback { + return Err(ToolError::execution_failed(format!( + "DuckDuckGo-compatible search endpoint at {duckduckgo_host} returned a bot challenge; check the private search service, credentials, or network policy" + ))); + } + + if results.is_empty() && allow_bing_fallback { // Bing is a separate host — gate it independently so a deny on // DuckDuckGo doesn't silently let Bing through (and vice versa). 
check_policy(decider, BING_HOST)?; match run_bing_search(&client, &query, max_results).await { Ok(fallback_results) if !fallback_results.is_empty() => { results = fallback_results; - source = "bing"; + source = "bing".to_string(); message_suffix = Some(if duckduckgo_blocked { "DuckDuckGo returned a bot challenge; used Bing fallback" } else { @@ -341,7 +364,7 @@ impl ToolSpec for WebSearchTool { fn search_tool_result( query: String, - source: &'static str, + source: impl Into, results: Vec, message_suffix: Option<&str>, ) -> Result { @@ -355,7 +378,7 @@ fn search_tool_result( let response = WebSearchResponse { query, - source: source.to_string(), + source: source.into(), count: results.len(), message, results, @@ -1336,6 +1359,31 @@ fn normalize_bing_url(href: &str) -> String { href.to_string() } +fn duckduckgo_search_url( + base_url: Option<&str>, + query: &str, +) -> Result<(String, String), ToolError> { + let raw = configured_search_base_url(base_url).unwrap_or(DUCKDUCKGO_ENDPOINT); + let mut url = reqwest::Url::parse(raw).map_err(|err| { + ToolError::invalid_input(format!( + "Invalid DuckDuckGo-compatible search base_url: {err}" + )) + })?; + url.query_pairs_mut().append_pair("q", query); + let host = url.host_str().ok_or_else(|| { + ToolError::invalid_input("DuckDuckGo-compatible search base_url must include a host") + })?; + Ok((url.to_string(), host.to_string())) +} + +fn configured_search_base_url(base_url: Option<&str>) -> Option<&str> { + base_url.map(str::trim).filter(|value| !value.is_empty()) +} + +fn duckduckgo_allows_bing_fallback(base_url: Option<&str>) -> bool { + configured_search_base_url(base_url).is_none() +} + fn normalize_text(text: &str) -> String { let stripped = strip_html_tags(text); let decoded = decode_html_entities(&stripped); @@ -1439,9 +1487,9 @@ fn extract_query_param(url: &str, key: &str) -> Option { mod tests { use super::{ ERROR_BODY_PREVIEW_BYTES, WebSearchEntry, WebSearchTool, baidu_search_payload, - decode_html_entities, 
extract_search_query, is_likely_spam_results, normalize_bing_url, - optional_search_max_results, parse_baidu_results, root_domain, sanitize_error_body, - truncate_error_body, volcengine_extract_text, + decode_html_entities, duckduckgo_search_url, extract_search_query, is_likely_spam_results, + normalize_bing_url, optional_search_max_results, parse_baidu_results, root_domain, + sanitize_error_body, truncate_error_body, volcengine_extract_text, }; use serde_json::json; @@ -1979,4 +2027,130 @@ mod tests { "should not complain about missing API key (built-in default); got `{msg}`" ); } + + #[test] + fn duckduckgo_compatible_url_uses_custom_base_url_and_preserves_query() { + let (url, host) = duckduckgo_search_url( + Some("https://search.internal.example/html/?region=us"), + "rust async", + ) + .expect("custom duckduckgo-compatible url"); + + assert_eq!(host, "search.internal.example"); + assert_eq!( + url, + "https://search.internal.example/html/?region=us&q=rust+async" + ); + } + + #[test] + fn custom_duckduckgo_endpoint_disables_public_bing_fallback() { + assert!(super::duckduckgo_allows_bing_fallback(None)); + assert!(super::duckduckgo_allows_bing_fallback(Some(" "))); + assert!(!super::duckduckgo_allows_bing_fallback(Some( + "https://search.internal.example/html/" + ))); + } + + #[tokio::test] + async fn custom_duckduckgo_results_report_custom_host_source() { + use crate::config::SearchProvider; + use crate::tools::spec::{ToolContext, ToolSpec}; + use wiremock::matchers::{method, path, query_param}; + use wiremock::{Mock, MockServer, ResponseTemplate}; + + let server = MockServer::start().await; + Mock::given(method("GET")) + .and(path("/html/")) + .and(query_param("q", "rust async")) + .respond_with(ResponseTemplate::new(200).set_body_string( + r#" + + Rust async +
Async Rust result
+ + "#, + )) + .mount(&server) + .await; + + let tmp = tempfile::tempdir().expect("tempdir"); + let mut ctx = ToolContext::new(tmp.path().to_path_buf()); + ctx.search_provider = SearchProvider::DuckDuckGo; + let base_url = format!("{}/html/", server.uri()); + let expected_host = reqwest::Url::parse(&base_url) + .expect("mock server url") + .host_str() + .expect("mock server host") + .to_string(); + ctx.search_base_url = Some(base_url); + + let result = WebSearchTool + .execute(json!({"query": "rust async"}), &ctx) + .await + .expect("custom endpoint should return results"); + let value: serde_json::Value = + serde_json::from_str(&result.content).expect("web search json response"); + + assert_eq!(value["source"].as_str(), Some(expected_host.as_str())); + assert_eq!(value["count"].as_u64(), Some(1)); + } + + #[tokio::test] + async fn custom_duckduckgo_challenge_returns_actionable_error() { + use crate::config::SearchProvider; + use crate::tools::spec::{ToolContext, ToolSpec}; + use wiremock::matchers::{method, path, query_param}; + use wiremock::{Mock, MockServer, ResponseTemplate}; + + let server = MockServer::start().await; + Mock::given(method("GET")) + .and(path("/html/")) + .and(query_param("q", "rust async")) + .respond_with(ResponseTemplate::new(200).set_body_string( + r#"
Unfortunately, bots use DuckDuckGo too
"#, + )) + .mount(&server) + .await; + + let tmp = tempfile::tempdir().expect("tempdir"); + let mut ctx = ToolContext::new(tmp.path().to_path_buf()); + ctx.search_provider = SearchProvider::DuckDuckGo; + ctx.search_base_url = Some(format!("{}/html/", server.uri())); + + let err = WebSearchTool + .execute(json!({"query": "rust async"}), &ctx) + .await + .expect_err("custom endpoint challenge should error"); + let msg = err.to_string(); + assert!( + msg.contains("DuckDuckGo-compatible search endpoint") + && msg.contains("bot challenge") + && msg.contains("private search service"), + "got `{msg}`" + ); + } + + #[tokio::test] + async fn search_base_url_with_non_duckduckgo_provider_is_explicit_error() { + use crate::config::SearchProvider; + use crate::tools::spec::{ToolContext, ToolSpec}; + + let tmp = tempfile::tempdir().expect("tempdir"); + let mut ctx = ToolContext::new(tmp.path().to_path_buf()); + ctx.search_provider = SearchProvider::Tavily; + ctx.search_base_url = Some("https://search.internal.example/html/".to_string()); + + let err = WebSearchTool + .execute(json!({"query": "rust async"}), &ctx) + .await + .expect_err("non-duckduckgo provider with base_url should error"); + let msg = err.to_string(); + assert!( + msg.contains("[search].base_url") + && msg.contains("provider = \"duckduckgo\"") + && msg.contains("tavily"), + "got `{msg}`" + ); + } } diff --git a/crates/tui/src/tui/ui.rs b/crates/tui/src/tui/ui.rs index f957a10b5..ed07841c4 100644 --- a/crates/tui/src/tui/ui.rs +++ b/crates/tui/src/tui/ui.rs @@ -920,6 +920,7 @@ fn build_engine_config(app: &App, config: &Config) -> EngineConfig { workshop: config.workshop.clone(), search_provider: config.search_provider(), search_api_key: config.search.as_ref().and_then(|s| s.api_key.clone()), + search_base_url: config.search.as_ref().and_then(|s| s.base_url.clone()), tools_always_load: config.tools_always_load(), tools: config.tools.clone(), } diff --git a/docs/CONFIGURATION.md b/docs/CONFIGURATION.md index 
9119cbba1..4e75b264a 100644 --- a/docs/CONFIGURATION.md +++ b/docs/CONFIGURATION.md @@ -1081,6 +1081,13 @@ parseable results. Bing remains selectable for users who explicitly want it, and Tavily, Bocha, Metaso, or Baidu can be selected when an API-backed provider is preferred. +For a private/internal search service that serves DuckDuckGo-compatible HTML, +keep `provider = "duckduckgo"` and set `base_url`; CodeWhale appends the `q` +query parameter to that endpoint and applies network policy to its host. +Custom endpoints do not fall back to public Bing. `CODEWHALE_SEARCH_BASE_URL` +can override this per process; `DEEPSEEK_SEARCH_BASE_URL` remains accepted as +the legacy alias. + **Metaso** ([metaso.cn](https://metaso.cn)) has a 100 searches/day free quota; set `METASO_API_KEY` or `[search] api_key` for a higher quota. @@ -1092,6 +1099,7 @@ only; it does not add a Baidu model provider. ```toml [search] provider = "baidu" # duckduckgo | bing | tavily | bocha | metaso | baidu +# base_url = "https://search.example/html/" # optional with provider = "duckduckgo" # api_key = "YOUR_KEY" # required for tavily, bocha, and baidu; optional for metaso ``` diff --git a/docs/V0_9_0_EXECUTION_MAP.md b/docs/V0_9_0_EXECUTION_MAP.md index 5a0c7fb58..a6462e83a 100644 --- a/docs/V0_9_0_EXECUTION_MAP.md +++ b/docs/V0_9_0_EXECUTION_MAP.md @@ -64,9 +64,11 @@ harvest/stewardship commits: | #2581 provider fallback chain design doc | Harvested; original closed on 2026-06-05 after public integration branch. | Manually harvested as `docs/rfcs/2574-provider-fallback-chain.md` because the current PR head had no net file changes. Credit @idling11 in commit `5dc1a63cd`; keep issue #2574 open for implementation. | | #2530 mention depth-cap hint | Already present; original closed on 2026-06-05 after public integration branch. | Present in the current v0.9 stack as `a97675824` and `29f57665e`. `cargo test -p codewhale-tui --locked try_autocomplete_file_mention_no_match` passed. 
| | #2513 restore snapshot listing | Harvested; original closed on 2026-06-05 after public integration branch. | Manually harvested as `311eb4002` with explicit `/restore list 101` cap rejection. `cargo test -p codewhale-tui --locked restore_`; `cargo fmt --all -- --check`; `cargo clippy -p codewhale-tui --locked -- -D warnings` passed. Keep #2494 open because this is only the restore-listing slice. | +| #2510 custom DuckDuckGo-compatible endpoint | Harvested into a focused review branch; close original after review PR lands. | Adds `[search].base_url`, preferred `CODEWHALE_SEARCH_BASE_URL`, and legacy `DEEPSEEK_SEARCH_BASE_URL` for private DDG-compatible HTML endpoints. Network policy gates the configured host, custom endpoints do not fall back to public Bing, non-DDG provider/base_url combinations and challenge pages return explicit errors, and custom results report the configured host as `source`. Credit @cyq1017 for #2510 and @Artenx for the DDG-style endpoint clarification in #2436. | | #2576 PrefixCacheChange first-freeze event | Already present; original closed on 2026-06-05 after public integration branch. | Present in the current v0.9 stack through `29acb87a9d`. `cargo test -p codewhale-tui --locked prefix_cache` passed. | | #2502 web_run RwLock split | Harvested; original closed on 2026-06-05 after public integration branch. | Manually harvested as `60f8e7d62` with panic-safe state write-back, `Arc` cache reads, and serialized cache tests. `cargo test -p codewhale-tui --locked web_run`; `cargo clippy -p codewhale-tui --locked -- -D warnings`; `cargo fmt --all -- --check` passed. | | #2517 turn_meta tail relocation | Manually harvested with the user-text content block first and volatile turn metadata last. 
| `cargo test -p codewhale-tui --locked turn_metadata`; `cargo test -p codewhale-tui --locked user_message_turn_meta_is_appended_not_prepended`; `cargo test -p codewhale-tui --locked post_edit_hook_injects_diagnostics_message_before_next_request`; `cargo test -p codewhale-tui --locked request_builder_keeps_tail_turn_meta_after_user_text_for_wire`; `cargo clippy -p codewhale-tui --locked -- -D warnings` passed. | +| #2528 background completion wait | Harvested through review PR #2765; original closed as harvested. | Widened the focused background-shell completion wait to 30 seconds so slow Windows runners do not leave lightweight completed background commands reported as `Running` before assertions fire. `cargo test -p codewhale-tui --bin codewhale-tui --locked test_background_execution -- --nocapture`, `... test_completed_background_shell_releases_process_handles ...`, and `cargo clippy -p codewhale-tui --bin codewhale-tui --locked -- -D warnings` passed. Credit @cyq1017; refs #2525/#2526. | ## Stabilization Gate Evidence (#2721) @@ -113,7 +115,7 @@ v0.9 branch so the remaining Windows/manual checks are explicit. | #2507 stream chunk timeout config | Draft/conflicting | Defer unless stabilization needs it. | | #2508 configurable path suffix | Conflicting / superseded | #2089 is already closed. The current implementation covers #1874's third-party gateway need without the broader env/CLI surface from #2508. Docs now show `[providers.openai].path_suffix = "/chat/completions"` and state that model/beta paths are not rewritten. Credit @hongqitai for the follow-up PR and @shuxiangxuebiancheng for the original #1874 report; close/comment after branch is public. | | #2509 parallel read-only web search | Closed / already merged via #2504 | Already present in `origin/main` as `a09af2024`; closed as harvested/superseded on 2026-06-04. | -| #2510 custom DuckDuckGo endpoint | Draft/mergeable | Low priority; defer unless docs/search lane takes it. 
| +| #2510 custom DuckDuckGo endpoint | Draft/mergeable / harvested in focused branch | Close/comment after the focused review PR lands. Keep credit for @cyq1017 and issue reporter @Artenx. | | #2511 ToolCallBefore hooks | Conflicting | Defer to hook lifecycle lane. | | #2512 custom completion sounds | Draft/conflicting | Defer. | | #2513 restore snapshot listing | Closed / harvested | Manually harvested as `311eb4002` with cap-rejection polish; original closed on 2026-06-05, leave #2494 open. | @@ -121,7 +123,7 @@ v0.9 branch so the remaining Windows/manual checks are explicit. | #2520 prompt base disk cache | Mergeable | Defer. Review found unused prompt-cache infrastructure with no runtime wiring, cache keys that still require building the prompt first, real-home cache writes in tests, and a contract that depends on the deferred #2687 prompt split. | | #2522 hard compaction preserving system segment | Mergeable | Defer. Review found a dormant hard path that would duplicate/cache summaries into the mutable system prompt if wired through current engine flow, and a simple tail split that can break tool-call pair and pinning invariants. | | #2526 shell tool availability docs | Draft/conflicting | Likely superseded by tool-surface docs; verify before closing. | -| #2528 background completion wait | Draft/conflicting | Defer unless failing tests prove need. | +| #2528 background completion wait | Closed / harvested | Harvested through #2765 with a 30-second focused wait for background-shell completion tests. Original closed as harvested, crediting @cyq1017; refs #2525/#2526. | | #2529 workspace shell opt-in | Draft/conflicting | Review with permissions/sandbox stabilization. | | #2530 mention depth cap hint | Closed / already present | Already present locally as `a97675824` and `29f57665e`; original closed on 2026-06-05. | | #2576 PrefixCacheChange events | Closed / already present | Already present locally through `29acb87a9d`; original closed on 2026-06-05. |