diff --git a/crates/fetchkit/src/error.rs b/crates/fetchkit/src/error.rs index adba5e2..7877476 100644 --- a/crates/fetchkit/src/error.rs +++ b/crates/fetchkit/src/error.rs @@ -19,7 +19,7 @@ pub enum FetchError { MissingUrl, /// URL has invalid scheme - #[error("Invalid URL: must start with http:// or https://")] + #[error("Invalid URL: must be http://, https://, or a bare domain URL")] InvalidUrlScheme, /// Invalid HTTP method @@ -110,7 +110,7 @@ mod tests { ); assert_eq!( FetchError::InvalidUrlScheme.to_string(), - "Invalid URL: must start with http:// or https://" + "Invalid URL: must be http://, https://, or a bare domain URL" ); assert_eq!( FetchError::InvalidMethod.to_string(), diff --git a/crates/fetchkit/src/fetchers/arxiv.rs b/crates/fetchkit/src/fetchers/arxiv.rs index a8f651d..084bb61 100644 --- a/crates/fetchkit/src/fetchers/arxiv.rs +++ b/crates/fetchkit/src/fetchers/arxiv.rs @@ -95,6 +95,7 @@ impl Fetcher for ArXivFetcher { request: &FetchRequest, options: &FetchOptions, ) -> Result { + let request = request.normalized_for_fetch()?; let url = Url::parse(&request.url).map_err(|_| FetchError::InvalidUrlScheme)?; let paper_id = Self::parse_url(&url) diff --git a/crates/fetchkit/src/fetchers/default.rs b/crates/fetchkit/src/fetchers/default.rs index 1f7778f..c30c690 100644 --- a/crates/fetchkit/src/fetchers/default.rs +++ b/crates/fetchkit/src/fetchers/default.rs @@ -227,6 +227,7 @@ impl Fetcher for DefaultFetcher { request: &FetchRequest, options: &FetchOptions, ) -> Result { + let request = request.normalized_for_fetch()?; if request.url.is_empty() { return Err(FetchError::MissingUrl); } @@ -244,7 +245,7 @@ impl Fetcher for DefaultFetcher { "*/*" }; - let headers = build_headers(options, accept, request); + let headers = build_headers(options, accept, &request); let parsed_url = url::Url::parse(&request.url).map_err(|_| FetchError::InvalidUrlScheme)?; let reqwest_method = match method { @@ -416,6 +417,7 @@ impl Fetcher for DefaultFetcher { Some(path) => path.clone(), None => return self.fetch(request, options).await, }; + let request = request.normalized_for_fetch()?; if request.url.is_empty() { return Err(FetchError::MissingUrl); @@ -424,7 +426,7 @@ impl Fetcher for DefaultFetcher { let method = request.effective_method(); let max_body_size = options.max_body_size.unwrap_or(DEFAULT_MAX_BODY_SIZE); - let headers = build_headers(options, "*/*", request); + let headers = build_headers(options, "*/*", &request); let parsed_url = url::Url::parse(&request.url).map_err(|_| FetchError::InvalidUrlScheme)?; let reqwest_method = match method { diff --git a/crates/fetchkit/src/fetchers/docs_site.rs b/crates/fetchkit/src/fetchers/docs_site.rs index ea4bb0b..1054bf6 100644 --- a/crates/fetchkit/src/fetchers/docs_site.rs +++ b/crates/fetchkit/src/fetchers/docs_site.rs @@ -110,6 +110,7 @@ impl Fetcher for DocsSiteFetcher { request: &FetchRequest, options: &FetchOptions, ) -> Result { + let request = request.normalized_for_fetch()?; let url = Url::parse(&request.url).map_err(|_| FetchError::InvalidUrlScheme)?; let user_agent = options.user_agent.as_deref().unwrap_or(DEFAULT_USER_AGENT); let ua_header = HeaderValue::from_str(user_agent) diff --git a/crates/fetchkit/src/fetchers/github_code.rs b/crates/fetchkit/src/fetchers/github_code.rs index e4aa616..7bd61f1 100644 --- a/crates/fetchkit/src/fetchers/github_code.rs +++ b/crates/fetchkit/src/fetchers/github_code.rs @@ -142,6 +142,7 @@ impl Fetcher for GitHubCodeFetcher { request: &FetchRequest, options: &FetchOptions, ) -> Result { + let request = request.normalized_for_fetch()?; let url = Url::parse(&request.url).map_err(|_| FetchError::InvalidUrlScheme)?; let parsed = Self::parse_url(&url) diff --git a/crates/fetchkit/src/fetchers/github_issue.rs b/crates/fetchkit/src/fetchers/github_issue.rs index d018929..b8ce40e 100644 --- a/crates/fetchkit/src/fetchers/github_issue.rs +++ b/crates/fetchkit/src/fetchers/github_issue.rs @@ -177,6 +177,7 @@ impl Fetcher for GitHubIssueFetcher { request: &FetchRequest, options: &FetchOptions, ) -> Result { + let request = request.normalized_for_fetch()?; let url = Url::parse(&request.url).map_err(|_| FetchError::InvalidUrlScheme)?; let (owner, repo, number) = Self::parse_url(&url).ok_or_else(|| { diff --git a/crates/fetchkit/src/fetchers/github_repo.rs b/crates/fetchkit/src/fetchers/github_repo.rs index 8cf1c9e..954a749 100644 --- a/crates/fetchkit/src/fetchers/github_repo.rs +++ b/crates/fetchkit/src/fetchers/github_repo.rs @@ -175,6 +175,7 @@ impl Fetcher for GitHubRepoFetcher { request: &FetchRequest, options: &FetchOptions, ) -> Result { + let request = request.normalized_for_fetch()?; let url = Url::parse(&request.url).map_err(|_| FetchError::InvalidUrlScheme)?; let (owner, repo) = Self::parse_github_url(&url).ok_or_else(|| { diff --git a/crates/fetchkit/src/fetchers/hackernews.rs b/crates/fetchkit/src/fetchers/hackernews.rs index c78cec1..8315cbb 100644 --- a/crates/fetchkit/src/fetchers/hackernews.rs +++ b/crates/fetchkit/src/fetchers/hackernews.rs @@ -87,6 +87,7 @@ impl Fetcher for HackerNewsFetcher { request: &FetchRequest, options: &FetchOptions, ) -> Result { + let request = request.normalized_for_fetch()?; let url = Url::parse(&request.url).map_err(|_| FetchError::InvalidUrlScheme)?; let item_id = Self::parse_url(&url) diff --git a/crates/fetchkit/src/fetchers/package_registry.rs b/crates/fetchkit/src/fetchers/package_registry.rs index 9d90f60..d6668bb 100644 --- a/crates/fetchkit/src/fetchers/package_registry.rs +++ b/crates/fetchkit/src/fetchers/package_registry.rs @@ -173,6 +173,7 @@ impl Fetcher for PackageRegistryFetcher { request: &FetchRequest, options: &FetchOptions, ) -> Result { + let request = request.normalized_for_fetch()?; let url = Url::parse(&request.url).map_err(|_| FetchError::InvalidUrlScheme)?; let registry = Self::parse_url(&url).ok_or_else(|| { diff --git a/crates/fetchkit/src/fetchers/rss_feed.rs b/crates/fetchkit/src/fetchers/rss_feed.rs index aa1a167..ad92de2 100644 --- a/crates/fetchkit/src/fetchers/rss_feed.rs +++ b/crates/fetchkit/src/fetchers/rss_feed.rs @@ -89,6 +89,7 @@ impl Fetcher for RSSFeedFetcher { request: &FetchRequest, options: &FetchOptions, ) -> Result { + let request = request.normalized_for_fetch()?; let user_agent = options.user_agent.as_deref().unwrap_or(DEFAULT_USER_AGENT); let mut headers = HeaderMap::new(); let ua_header = HeaderValue::from_str(user_agent) diff --git a/crates/fetchkit/src/fetchers/stackoverflow.rs b/crates/fetchkit/src/fetchers/stackoverflow.rs index 94e2dd4..51ade4d 100644 --- a/crates/fetchkit/src/fetchers/stackoverflow.rs +++ b/crates/fetchkit/src/fetchers/stackoverflow.rs @@ -120,6 +120,7 @@ impl Fetcher for StackOverflowFetcher { request: &FetchRequest, options: &FetchOptions, ) -> Result { + let request = request.normalized_for_fetch()?; let url = Url::parse(&request.url).map_err(|_| FetchError::InvalidUrlScheme)?; let (site, question_id) = Self::parse_url(&url).ok_or_else(|| { diff --git a/crates/fetchkit/src/fetchers/twitter.rs b/crates/fetchkit/src/fetchers/twitter.rs index fc20a31..82a2ac0 100644 --- a/crates/fetchkit/src/fetchers/twitter.rs +++ b/crates/fetchkit/src/fetchers/twitter.rs @@ -439,6 +439,7 @@ impl Fetcher for TwitterFetcher { request: &FetchRequest, options: &FetchOptions, ) -> Result { + let request = request.normalized_for_fetch()?; let url = Url::parse(&request.url).map_err(|_| FetchError::InvalidUrlScheme)?; let (_username, tweet_id) = Self::parse_tweet_url(&url).ok_or_else(|| { diff --git a/crates/fetchkit/src/fetchers/wikipedia.rs b/crates/fetchkit/src/fetchers/wikipedia.rs index 709fb3e..94dfe3c 100644 --- a/crates/fetchkit/src/fetchers/wikipedia.rs +++ b/crates/fetchkit/src/fetchers/wikipedia.rs @@ -107,6 +107,7 @@ impl Fetcher for WikipediaFetcher { request: &FetchRequest, options: &FetchOptions, ) -> Result { + let request = request.normalized_for_fetch()?; let url = Url::parse(&request.url).map_err(|_| FetchError::InvalidUrlScheme)?; let (lang, title) = Self::parse_url(&url) diff --git a/crates/fetchkit/src/fetchers/youtube.rs b/crates/fetchkit/src/fetchers/youtube.rs index 3cb0882..41bf8ee 100644 --- a/crates/fetchkit/src/fetchers/youtube.rs +++ b/crates/fetchkit/src/fetchers/youtube.rs @@ -98,6 +98,7 @@ impl Fetcher for YouTubeFetcher { request: &FetchRequest, options: &FetchOptions, ) -> Result { + let request = request.normalized_for_fetch()?; let url = Url::parse(&request.url).map_err(|_| FetchError::InvalidUrlScheme)?; let video_id = Self::parse_video_id(&url) diff --git a/crates/fetchkit/src/tool.rs b/crates/fetchkit/src/tool.rs index 266dbe8..dc25df4 100644 --- a/crates/fetchkit/src/tool.rs +++ b/crates/fetchkit/src/tool.rs @@ -1020,25 +1020,25 @@ fn build_help(tool: &Tool) -> String { let errors = if is_ukrainian(tool.locale()) { if tool.enable_save_to_file { "- `MissingUrl` — параметр `url` обов’язковий\n\ - - `InvalidUrlScheme` — схема URL має бути `http` або `https`\n\ + - `InvalidUrlScheme` — URL має бути `http://`, `https://` або доменним URL\n\ - `BlockedUrl` — URL заблокований політикою SSRF або allow/block правилами\n\ - `FirstByteTimeout` — сервер не відповів протягом 1 секунди\n\ - `SaverNotAvailable` — `save_to_file` потребує адаптер `FileSaver`\n" } else { "- `MissingUrl` — параметр `url` обов’язковий\n\ - - `InvalidUrlScheme` — схема URL має бути `http` або `https`\n\ + - `InvalidUrlScheme` — URL має бути `http://`, `https://` або доменним URL\n\ - `BlockedUrl` — URL заблокований політикою SSRF або allow/block правилами\n\ - `FirstByteTimeout` — сервер не відповів протягом 1 секунди\n" } } else if tool.enable_save_to_file { "- `MissingUrl` — `url` is required\n\ - - `InvalidUrlScheme` — URL scheme must be `http` or `https`\n\ + - `InvalidUrlScheme` — URL must be `http://`, `https://`, or a bare domain URL\n\ - `BlockedUrl` — URL blocked by SSRF policy or allow/block rules\n\ - `FirstByteTimeout` — server did not respond within 1 second\n\ - `SaverNotAvailable` — `save_to_file` requires a `FileSaver` adapter\n" } else { "- `MissingUrl` — `url` is required\n\ - - `InvalidUrlScheme` — URL scheme must be `http` or `https`\n\ + - `InvalidUrlScheme` — URL must be `http://`, `https://`, or a bare domain URL\n\ - `BlockedUrl` — URL blocked by SSRF policy or allow/block rules\n\ - `FirstByteTimeout` — server did not respond within 1 second\n" }; @@ -1139,7 +1139,7 @@ fn unknown_parameter_error(locale: &str, key: &str) -> ToolError { fn user_text(locale: &str, key: &str) -> &'static str { match (is_ukrainian(locale), key) { (true, "missing_url") => "Параметр url обов’язковий.", - (true, "invalid_scheme") => "Схема URL має бути http або https.", + (true, "invalid_scheme") => "URL має бути http://, https:// або доменним URL.", (true, "invalid_method") => "Метод має бути GET або HEAD.", (true, "blocked_url") => "URL заблокований політикою безпеки.", (true, "timeout") => { @@ -1153,7 +1153,7 @@ fn user_text(locale: &str, key: &str) -> &'static str { (true, "save_error") => "Не вдалося зберегти файл. Перевірте шлях призначення.", (true, "saver_missing") => "save_to_file потребує адаптер FileSaver.", (false, "missing_url") => "url is required.", - (false, "invalid_scheme") => "URL scheme must be http or https.", + (false, "invalid_scheme") => "URL must be http://, https://, or a bare domain URL.", (false, "invalid_method") => "Method must be GET or HEAD.", (false, "blocked_url") => "URL is blocked by security policy.", (false, "timeout") => { @@ -1357,7 +1357,7 @@ mod tests { assert!(err .unwrap_err() .to_string() - .contains("URL scheme must be http or https")); + .contains("URL must be http://, https://, or a bare domain URL")); } #[test] diff --git a/crates/fetchkit/src/types.rs b/crates/fetchkit/src/types.rs index f4080b8..cb5d692 100644 --- a/crates/fetchkit/src/types.rs +++ b/crates/fetchkit/src/types.rs @@ -126,6 +126,13 @@ impl FetchRequest { Ok(()) } + /// Return a normalized copy for APIs that accept `&FetchRequest`. + pub(crate) fn normalized_for_fetch(&self) -> Result { + let mut request = self.clone(); + request.normalize_url_for_fetch()?; + Ok(request) + } + /// Set the HTTP method pub fn method(mut self, method: HttpMethod) -> Self { self.method = Some(method); diff --git a/crates/fetchkit/tests/integration.rs b/crates/fetchkit/tests/integration.rs index e9af093..dc7337c 100644 --- a/crates/fetchkit/tests/integration.rs +++ b/crates/fetchkit/tests/integration.rs @@ -430,7 +430,7 @@ async fn test_invalid_url_scheme() { assert!(result .unwrap_err() .to_string() - .contains("http:// or https://")); + .contains("http://, https://, or a bare domain URL")); } #[tokio::test] diff --git a/crates/fetchkit/tests/transport.rs b/crates/fetchkit/tests/transport.rs index ce03fb6..5dfaa20 100644 --- a/crates/fetchkit/tests/transport.rs +++ b/crates/fetchkit/tests/transport.rs @@ -7,8 +7,11 @@ use async_trait::async_trait; use bytes::Bytes; use fetchkit::{ - DnsPolicy, FetchError, FetchOptions, FetchRequest, Fetcher, HttpMethod, HttpTransport, - TransportError, TransportMethod, TransportRequest, TransportResponse, + ArXivFetcher, DefaultFetcher, DnsPolicy, DocsSiteFetcher, FetchError, FetchOptions, + FetchRequest, Fetcher, GitHubCodeFetcher, GitHubIssueFetcher, GitHubRepoFetcher, + HackerNewsFetcher, HttpMethod, HttpTransport, PackageRegistryFetcher, RSSFeedFetcher, + StackOverflowFetcher, TransportError, TransportMethod, TransportRequest, TransportResponse, + TwitterFetcher, WikipediaFetcher, YouTubeFetcher, }; use serde_json::json; use std::sync::{Arc, Mutex}; @@ -122,6 +125,86 @@ async fn default_fetcher_get_uses_injected_transport() { assert!(calls[0].0.contains("example.test/page")); } +#[tokio::test] +async fn direct_default_fetcher_defaults_bare_host_urls_to_https() { + let mock = Arc::new(MockTransport::new().route( + "example.test/direct", + 200, + "text/plain", + b"direct default fetcher", + )); + let options = options_with(mock.clone()); + + let fetcher = DefaultFetcher::new(); + let request = FetchRequest::new("example.test/direct"); + let response = fetcher.fetch(&request, &options).await.unwrap(); + + assert_eq!(response.status_code, 200); + assert_eq!(response.url, "https://example.test/direct"); + + let calls = mock.calls(); + assert_eq!(calls.len(), 1); + assert_eq!(calls[0].0, "https://example.test/direct"); +} + +#[tokio::test] +async fn direct_specialized_fetchers_normalize_bare_host_urls_before_parsing() { + let cases: Vec<(Box, &str)> = vec![ + ( + Box::new(GitHubCodeFetcher::new()), + "github.com/owner/repo/blob/main/README.md", + ), + ( + Box::new(GitHubIssueFetcher::new()), + "github.com/owner/repo/issues/42", + ), + (Box::new(GitHubRepoFetcher::new()), "github.com/owner/repo"), + ( + Box::new(TwitterFetcher::new()), + "x.com/user/status/123456789", + ), + ( + Box::new(StackOverflowFetcher::new()), + "stackoverflow.com/questions/12345/title", + ), + ( + Box::new(PackageRegistryFetcher::new()), + "pypi.org/project/requests", + ), + ( + Box::new(WikipediaFetcher::new()), + "en.wikipedia.org/wiki/Rust", + ), + ( + Box::new(YouTubeFetcher::new()), + "youtube.com/watch?v=abc123", + ), + (Box::new(ArXivFetcher::new()), "arxiv.org/abs/2301.07041"), + ( + Box::new(HackerNewsFetcher::new()), + "news.ycombinator.com/item?id=123", + ), + (Box::new(RSSFeedFetcher::new()), "example.com/feed"), + ( + Box::new(DocsSiteFetcher::new()), + "docs.rs/tokio/latest/tokio", + ), + ]; + + for (fetcher, bare_url) in cases { + let mock = Arc::new(MockTransport::new()); + let options = options_with(mock); + let request = FetchRequest::new(bare_url); + let result = fetcher.fetch(&request, &options).await; + + assert!( + !matches!(result, Err(FetchError::InvalidUrlScheme)), + "{} rejected bare URL at scheme parsing", + fetcher.name() + ); + } +} + #[tokio::test] async fn default_fetcher_head_uses_injected_transport() { let mock = Arc::new(MockTransport::new().route("example.test/head", 200, "text/html", b"")); diff --git a/specs/fetchers.md b/specs/fetchers.md index 48155ce..80b97ed 100644 --- a/specs/fetchers.md +++ b/specs/fetchers.md @@ -272,6 +272,11 @@ pub async fn fetch_with_options(req: FetchRequest, options: FetchOptions) -> Result; ``` +Built-in fetchers normalize `FetchRequest::url` before parsing, so direct calls to +`Fetcher::fetch` accept the same URL forms as the registry and tool surfaces: +explicit `http://`, explicit `https://`, or bare domain URLs normalized to +`https://`. + ## Testing ### Unit Tests diff --git a/specs/initial.md b/specs/initial.md index fb990e8..3319975 100644 --- a/specs/initial.md +++ b/specs/initial.md @@ -170,8 +170,9 @@ Provide a builder to configure tool options, including: ### Request Validation - `url` is required. -- Only `http://` and `https://` URLs allowed. -- Invalid URL: `Invalid URL: must start with http:// or https://`. +- Explicit `http://` and `https://` URLs are allowed. Bare domain URLs are + accepted and normalized to `https://` before validation and fetch. +- Invalid URL: `Invalid URL: must be http://, https://, or a bare domain URL`. - Invalid method: `Invalid method: must be GET or HEAD`. - Allow/block list prefixes (if configured) are applied before fetch. - If allow list is non-empty, URL must match at least one allow prefix. @@ -321,7 +322,7 @@ Content is HTML if: ### Error Handling - Missing url -> tool error string "Missing required parameter: url". -- Invalid URL -> tool error string "Invalid URL: must start with http:// or https://". +- Invalid URL -> tool error string "Invalid URL: must be http://, https://, or a bare domain URL". - Invalid method -> tool error string "Invalid method: must be GET or HEAD". - Blocked URL (prefix or DNS policy) -> tool error string "Blocked URL: not allowed by policy". - First-byte timeout -> "Request timed out: server did not respond within 1 second".