From 07e371786462e869e9a91e1de2e9e4159469a277 Mon Sep 17 00:00:00 2001 From: Mykhailo Chalyi Date: Sat, 4 Jul 2026 15:57:42 -0500 Subject: [PATCH] fix: accept bare host URLs --- crates/fetchkit/src/client.rs | 3 +- crates/fetchkit/src/fetchers/mod.rs | 6 +- crates/fetchkit/src/tool.rs | 39 +++++++-- crates/fetchkit/src/types.rs | 50 ++++++++++- crates/fetchkit/tests/transport.rs | 125 +++++++++++++++++++++++++++- specs/initial.md | 3 +- specs/threat-model.md | 12 ++- 7 files changed, 218 insertions(+), 20 deletions(-) diff --git a/crates/fetchkit/src/client.rs b/crates/fetchkit/src/client.rs index f8d94f9..73df494 100644 --- a/crates/fetchkit/src/client.rs +++ b/crates/fetchkit/src/client.rs @@ -192,13 +192,14 @@ pub async fn fetch(req: FetchRequest) -> Result { /// Uses the default fetcher registry with all built-in fetchers. /// For custom fetcher configuration, use [`FetcherRegistry`] directly. pub async fn fetch_with_options( - req: FetchRequest, + mut req: FetchRequest, options: FetchOptions, ) -> Result { // Validate URL early if req.url.is_empty() { return Err(FetchError::MissingUrl); } + req.normalize_url_for_fetch()?; // Use registry with default fetchers let registry = FetcherRegistry::with_defaults(); diff --git a/crates/fetchkit/src/fetchers/mod.rs b/crates/fetchkit/src/fetchers/mod.rs index 1060c91..d12d0c5 100644 --- a/crates/fetchkit/src/fetchers/mod.rs +++ b/crates/fetchkit/src/fetchers/mod.rs @@ -224,9 +224,10 @@ impl FetcherRegistry { /// (shouldn't happen with DefaultFetcher registered). pub async fn fetch( &self, - request: FetchRequest, + mut request: FetchRequest, options: FetchOptions, ) -> Result { + request.normalize_url_for_fetch()?; let (fetcher, _) = self.validate_and_find_fetcher(&request, &options)?; debug!(fetcher = fetcher.name(), url = %request.url, "Using fetcher"); fetcher.fetch(&request, &options).await @@ -235,10 +236,11 @@ impl FetcherRegistry { /// Fetch a URL and save to file using the appropriate fetcher pub async fn fetch_to_file( &self, - request: FetchRequest, + mut request: FetchRequest, options: FetchOptions, saver: &dyn FileSaver, ) -> Result { + request.normalize_url_for_fetch()?; let (fetcher, _) = self.validate_and_find_fetcher(&request, &options)?; tracing::debug!(fetcher = fetcher.name(), url = %request.url, "Using fetcher (save to file)"); fetcher.fetch_to_file(&request, &options, saver).await diff --git a/crates/fetchkit/src/tool.rs b/crates/fetchkit/src/tool.rs index 3176457..266dbe8 100644 --- a/crates/fetchkit/src/tool.rs +++ b/crates/fetchkit/src/tool.rs @@ -562,8 +562,9 @@ impl Tool { /// Create a single-use tool execution from JSON arguments. pub fn execution(&self, args: Value) -> Result { validate_args(self, &args)?; - let request: FetchRequest = serde_json::from_value(args) + let mut request: FetchRequest = serde_json::from_value(args) .map_err(|err| invalid_arguments_error(self.locale(), &err.to_string()))?; + normalize_request(self, &mut request)?; validate_request(self, &request)?; Ok(ToolExecution { @@ -580,7 +581,7 @@ impl Tool { /// Execute the tool with status updates. pub async fn execute_with_status( &self, - req: FetchRequest, + mut req: FetchRequest, mut status_callback: F, ) -> Result where @@ -592,9 +593,7 @@ impl Tool { return Err(FetchError::MissingUrl); } - if !req.url.starts_with("http://") && !req.url.starts_with("https://") { - return Err(FetchError::InvalidUrlScheme); - } + req.normalize_url_for_fetch()?; status_callback(ToolStatus::new("connect").with_percent(10.0)); status_callback(ToolStatus::new("fetch").with_percent(20.0)); @@ -618,6 +617,12 @@ impl Tool { req: FetchRequest, saver: Option<&dyn FileSaver>, ) -> Result { + let mut req = req; + if req.url.is_empty() { + return Err(FetchError::MissingUrl); + } + req.normalize_url_for_fetch()?; + if req.save_to_file.is_some() { if !self.enable_save_to_file { return Err(FetchError::SaverNotAvailable); @@ -773,6 +778,16 @@ fn validate_request(tool: &Tool, request: &FetchRequest) -> Result<(), ToolError Ok(()) } +fn normalize_request(tool: &Tool, request: &mut FetchRequest) -> Result<(), ToolError> { + if request.url.is_empty() { + return Err(map_fetch_error(tool.locale(), FetchError::MissingUrl)); + } + + request + .normalize_url_for_fetch() + .map_err(|err| map_fetch_error(tool.locale(), err)) +} + fn build_input_schema( enable_markdown: bool, enable_text: bool, @@ -781,7 +796,10 @@ fn build_input_schema( let mut properties = Map::new(); properties.insert( "url".to_string(), - json!({"type": "string", "format": "uri"}), + json!({ + "type": "string", + "description": "HTTP/HTTPS URL, or a bare domain URL normalized to https://" + }), ); properties.insert( "method".to_string(), @@ -1068,12 +1086,12 @@ fn build_help(tool: &Tool) -> String { fn parameter_description(locale: &str, field: &str) -> &'static str { match (is_ukrainian(locale), field) { - (true, "url") => "HTTP або HTTPS URL", + (true, "url") => "HTTP/HTTPS URL або доменний URL, який нормалізується до https://", (true, "method") => "`GET` або `HEAD`", (true, "as_markdown") => "Перетворити HTML у markdown", (true, "as_text") => "Перетворити HTML у plain text", (true, "save_to_file") => "Шлях призначення, визначений адаптером", - (false, "url") => "HTTP or HTTPS URL", + (false, "url") => "HTTP/HTTPS URL, or a bare domain URL normalized to `https://`", (false, "method") => "`GET` or `HEAD`", (false, "as_markdown") => "Convert HTML to markdown", (false, "as_text") => "Convert HTML to plain text", @@ -1269,7 +1287,10 @@ mod tests { let output_schema = tool.output_schema(); assert_eq!(input_schema["type"], "object"); - assert_eq!(input_schema["properties"]["url"]["format"], "uri"); + assert!(input_schema["properties"]["url"]["description"] + .as_str() + .unwrap() + .contains("bare domain URL")); assert_eq!(input_schema["properties"]["method"]["default"], "GET"); assert!(input_schema["properties"]["if_none_match"].is_object()); assert!(input_schema["properties"]["if_modified_since"].is_object()); diff --git a/crates/fetchkit/src/types.rs b/crates/fetchkit/src/types.rs index e50aadc..f4080b8 100644 --- a/crates/fetchkit/src/types.rs +++ b/crates/fetchkit/src/types.rs @@ -2,7 +2,11 @@ use schemars::JsonSchema; use serde::{Deserialize, Serialize}; +use std::net::IpAddr; use std::str::FromStr; +use url::Url; + +use crate::error::FetchError; /// HTTP method for the request /// @@ -70,7 +74,8 @@ impl std::fmt::Display for HttpMethod { /// ``` #[derive(Debug, Clone, Default, Serialize, Deserialize, JsonSchema)] pub struct FetchRequest { - /// The URL to fetch (required, must be http:// or https://) + /// The URL to fetch. HTTP/HTTPS URLs are accepted as-is; bare domain URLs + /// such as `example.com/docs` are normalized to `https://example.com/docs`. pub url: String, /// HTTP method (optional, default GET) @@ -115,6 +120,12 @@ impl FetchRequest { } } + /// Normalize browser-like bare host URLs before validation and fetching. + pub(crate) fn normalize_url_for_fetch(&mut self) -> Result<(), FetchError> { + self.url = canonical_fetch_url(&self.url)?; + Ok(()) + } + /// Set the HTTP method pub fn method(mut self, method: HttpMethod) -> Self { self.method = Some(method); @@ -181,6 +192,43 @@ impl FetchRequest { } } +fn canonical_fetch_url(raw_url: &str) -> Result { + if raw_url.starts_with("http://") || raw_url.starts_with("https://") { + return Ok(raw_url.to_string()); + } + + if raw_url.is_empty() + || raw_url.starts_with("//") + || raw_url.starts_with('/') + || raw_url.contains("://") + || raw_url.contains('\\') + || raw_url.chars().any(char::is_whitespace) + || raw_url.chars().any(char::is_control) + { + return Err(FetchError::InvalidUrlScheme); + } + + let candidate = format!("https://{raw_url}"); + let parsed = Url::parse(&candidate).map_err(|_| FetchError::InvalidUrlScheme)?; + let host = parsed.host_str().ok_or(FetchError::InvalidUrlScheme)?; + + if !is_domain_like_bare_host(host) + || !parsed.username().is_empty() + || parsed.password().is_some() + { + return Err(FetchError::InvalidUrlScheme); + } + + Ok(candidate) +} + +fn is_domain_like_bare_host(host: &str) -> bool { + let host = host.trim_end_matches('.'); + host.contains('.') + && host.parse::().is_err() + && host.chars().any(|ch| ch.is_ascii_alphabetic()) +} + /// A link extracted from the page with its text and href. /// /// # Examples diff --git a/crates/fetchkit/tests/transport.rs b/crates/fetchkit/tests/transport.rs index d6afc12..ce03fb6 100644 --- a/crates/fetchkit/tests/transport.rs +++ b/crates/fetchkit/tests/transport.rs @@ -7,9 +7,10 @@ use async_trait::async_trait; use bytes::Bytes; use fetchkit::{ - DnsPolicy, FetchOptions, FetchRequest, Fetcher, HttpMethod, HttpTransport, TransportError, - TransportMethod, TransportRequest, TransportResponse, + DnsPolicy, FetchError, FetchOptions, FetchRequest, Fetcher, HttpMethod, HttpTransport, + TransportError, TransportMethod, TransportRequest, TransportResponse, }; +use serde_json::json; use std::sync::{Arc, Mutex}; /// One canned response keyed by URL substring. @@ -273,6 +274,126 @@ async fn tool_execute_uses_injected_transport() { assert!(calls[0].0.contains("example.test/tool")); } +#[tokio::test] +async fn tool_execute_defaults_bare_host_urls_to_https() { + let mock = Arc::new(MockTransport::new().route( + "paseo.sh/docs/custom-providers", + 200, + "text/plain", + b"bare host normalized", + )); + + let tool = fetchkit::Tool::builder() + .block_private_ips(false) + .transport(mock.clone()) + .build(); + + let response = tool + .execute(FetchRequest::new("paseo.sh/docs/custom-providers")) + .await + .unwrap(); + + assert_eq!(response.status_code, 200); + assert_eq!(response.url, "https://paseo.sh/docs/custom-providers"); + assert_eq!(response.content.as_deref(), Some("bare host normalized")); + + let calls = mock.calls(); + assert_eq!(calls.len(), 1); + assert_eq!(calls[0].0, "https://paseo.sh/docs/custom-providers"); +} + +#[tokio::test] +async fn tool_execution_defaults_bare_host_urls_to_https() { + let mock = Arc::new(MockTransport::new().route( + "paseo.sh/docs/custom-providers", + 200, + "text/plain", + b"toolkit normalized", + )); + + let tool = fetchkit::Tool::builder() + .block_private_ips(false) + .transport(mock.clone()) + .build(); + + let output = tool + .execution(json!({ + "url": "paseo.sh/docs/custom-providers" + })) + .unwrap() + .execute() + .await + .unwrap(); + + assert_eq!( + output.result["url"], + "https://paseo.sh/docs/custom-providers" + ); + assert_eq!(output.result["content"], "toolkit normalized"); + + let calls = mock.calls(); + assert_eq!(calls.len(), 1); + assert_eq!(calls[0].0, "https://paseo.sh/docs/custom-providers"); +} + +#[tokio::test] +async fn bare_host_policy_checks_use_canonical_https_url() { + let mock = Arc::new(MockTransport::new().route( + "paseo.sh/docs/custom-providers", + 200, + "text/plain", + b"allowed canonical URL", + )); + + let allow_tool = fetchkit::Tool::builder() + .block_private_ips(false) + .allow_prefix("https://paseo.sh/docs") + .transport(mock.clone()) + .build(); + + let allowed = allow_tool + .execute(FetchRequest::new("paseo.sh/docs/custom-providers")) + .await + .unwrap(); + assert_eq!(allowed.url, "https://paseo.sh/docs/custom-providers"); + + let block_tool = fetchkit::Tool::builder() + .block_private_ips(false) + .block_prefix("https://paseo.sh/docs") + .transport(mock.clone()) + .build(); + + let blocked = block_tool + .execute(FetchRequest::new("paseo.sh/docs/custom-providers")) + .await + .unwrap_err(); + assert!(matches!(blocked, FetchError::BlockedUrl)); +} + +#[tokio::test] +async fn bare_url_normalization_rejects_non_web_and_localish_inputs() { + let mock = Arc::new(MockTransport::new()); + let tool = fetchkit::Tool::builder() + .block_private_ips(false) + .transport(mock.clone()) + .build(); + + for url in [ + "ftp://example.com/file", + "file:///etc/passwd", + "mailto:test@example.com", + "//example.com/path", + "localhost/path", + "intranet/path", + "127.0.0.1/path", + ] { + let err = tool.execute(FetchRequest::new(url)).await.unwrap_err(); + assert!(matches!(err, FetchError::InvalidUrlScheme), "{url}: {err}"); + } + + assert!(mock.calls().is_empty()); +} + #[tokio::test] async fn tool_execute_with_saver_uses_injected_transport() { let mock = Arc::new(MockTransport::new().route( diff --git a/specs/initial.md b/specs/initial.md index f24dc55..fb990e8 100644 --- a/specs/initial.md +++ b/specs/initial.md @@ -64,7 +64,8 @@ Provide a builder to configure tool options, including: #### Types - `FetchRequest` - - `url: String` (required) + - `url: String` (required; explicit `http://` and `https://` are accepted as-is; + bare domain URLs such as `example.com/docs` are normalized to `https://example.com/docs`) - `method: HttpMethod` (optional, default GET) - `as_markdown: bool` (optional, feature-gated) - `as_text: bool` (optional, feature-gated) diff --git a/specs/threat-model.md b/specs/threat-model.md index 6633bf8..fb9b8e4 100644 --- a/specs/threat-model.md +++ b/specs/threat-model.md @@ -226,11 +226,15 @@ expected outbound path. **TM-INPUT-001 — Scheme validation (MITIGATED):** ```rust // THREAT[TM-INPUT-001]: Block non-HTTP schemes (file://, ftp://, data:, etc.) -// Mitigation: Early return with InvalidUrlScheme error -if !request.url.starts_with("http://") && !request.url.starts_with("https://") { - return Err(FetchError::InvalidUrlScheme); -} +// Mitigation: normalize domain-like bare hosts to https://, then reject any +// remaining URL without an explicit http:// or https:// scheme. +request.normalize_url_for_fetch()?; ``` +Bare URL normalization is deliberately narrow: domain-like hosts such as +`example.com/docs` default to `https://example.com/docs`; non-web schemes, +protocol-relative URLs, bare IP literals, and single-label local names are rejected. +SSRF, DNS, port, allow-prefix, and block-prefix policies run after normalization +against the canonical URL that will be fetched. **TM-INPUT-002 — URL prefix bypass via encoding (MITIGATED):** Prefix matching now uses the `url` crate to parse both the URL and the prefix,