Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion crates/fetchkit/src/client.rs
Original file line number Diff line number Diff line change
Expand Up @@ -192,13 +192,14 @@ pub async fn fetch(req: FetchRequest) -> Result<FetchResponse, FetchError> {
/// Uses the default fetcher registry with all built-in fetchers.
/// For custom fetcher configuration, use [`FetcherRegistry`] directly.
pub async fn fetch_with_options(
req: FetchRequest,
mut req: FetchRequest,
options: FetchOptions,
) -> Result<FetchResponse, FetchError> {
// Validate URL early
if req.url.is_empty() {
return Err(FetchError::MissingUrl);
}
req.normalize_url_for_fetch()?;

// Use registry with default fetchers
let registry = FetcherRegistry::with_defaults();
Expand Down
6 changes: 4 additions & 2 deletions crates/fetchkit/src/fetchers/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -224,9 +224,10 @@ impl FetcherRegistry {
/// (shouldn't happen with DefaultFetcher registered).
pub async fn fetch(
&self,
request: FetchRequest,
mut request: FetchRequest,
options: FetchOptions,
) -> Result<FetchResponse, FetchError> {
request.normalize_url_for_fetch()?;
let (fetcher, _) = self.validate_and_find_fetcher(&request, &options)?;
debug!(fetcher = fetcher.name(), url = %request.url, "Using fetcher");
fetcher.fetch(&request, &options).await
Expand All @@ -235,10 +236,11 @@ impl FetcherRegistry {
/// Fetch a URL and save to file using the appropriate fetcher
pub async fn fetch_to_file(
&self,
request: FetchRequest,
mut request: FetchRequest,
options: FetchOptions,
saver: &dyn FileSaver,
) -> Result<FetchResponse, FetchError> {
request.normalize_url_for_fetch()?;
let (fetcher, _) = self.validate_and_find_fetcher(&request, &options)?;
tracing::debug!(fetcher = fetcher.name(), url = %request.url, "Using fetcher (save to file)");
fetcher.fetch_to_file(&request, &options, saver).await
Expand Down
39 changes: 30 additions & 9 deletions crates/fetchkit/src/tool.rs
Original file line number Diff line number Diff line change
Expand Up @@ -562,8 +562,9 @@ impl Tool {
/// Create a single-use tool execution from JSON arguments.
pub fn execution(&self, args: Value) -> Result<ToolExecution, ToolError> {
validate_args(self, &args)?;
let request: FetchRequest = serde_json::from_value(args)
let mut request: FetchRequest = serde_json::from_value(args)
.map_err(|err| invalid_arguments_error(self.locale(), &err.to_string()))?;
normalize_request(self, &mut request)?;
validate_request(self, &request)?;

Ok(ToolExecution {
Expand All @@ -580,7 +581,7 @@ impl Tool {
/// Execute the tool with status updates.
pub async fn execute_with_status<F>(
&self,
req: FetchRequest,
mut req: FetchRequest,
mut status_callback: F,
) -> Result<FetchResponse, FetchError>
where
Expand All @@ -592,9 +593,7 @@ impl Tool {
return Err(FetchError::MissingUrl);
}

if !req.url.starts_with("http://") && !req.url.starts_with("https://") {
return Err(FetchError::InvalidUrlScheme);
}
req.normalize_url_for_fetch()?;

status_callback(ToolStatus::new("connect").with_percent(10.0));
status_callback(ToolStatus::new("fetch").with_percent(20.0));
Expand All @@ -618,6 +617,12 @@ impl Tool {
req: FetchRequest,
saver: Option<&dyn FileSaver>,
) -> Result<FetchResponse, FetchError> {
let mut req = req;
if req.url.is_empty() {
return Err(FetchError::MissingUrl);
}
req.normalize_url_for_fetch()?;

if req.save_to_file.is_some() {
if !self.enable_save_to_file {
return Err(FetchError::SaverNotAvailable);
Expand Down Expand Up @@ -773,6 +778,16 @@ fn validate_request(tool: &Tool, request: &FetchRequest) -> Result<(), ToolError
Ok(())
}

fn normalize_request(tool: &Tool, request: &mut FetchRequest) -> Result<(), ToolError> {
if request.url.is_empty() {
return Err(map_fetch_error(tool.locale(), FetchError::MissingUrl));
}

request
.normalize_url_for_fetch()
.map_err(|err| map_fetch_error(tool.locale(), err))
}

fn build_input_schema(
enable_markdown: bool,
enable_text: bool,
Expand All @@ -781,7 +796,10 @@ fn build_input_schema(
let mut properties = Map::new();
properties.insert(
"url".to_string(),
json!({"type": "string", "format": "uri"}),
json!({
"type": "string",
"description": "HTTP/HTTPS URL, or a bare domain URL normalized to https://"
}),
);
properties.insert(
"method".to_string(),
Expand Down Expand Up @@ -1068,12 +1086,12 @@ fn build_help(tool: &Tool) -> String {

fn parameter_description(locale: &str, field: &str) -> &'static str {
match (is_ukrainian(locale), field) {
(true, "url") => "HTTP або HTTPS URL",
(true, "url") => "HTTP/HTTPS URL або доменний URL, який нормалізується до https://",
(true, "method") => "`GET` або `HEAD`",
(true, "as_markdown") => "Перетворити HTML у markdown",
(true, "as_text") => "Перетворити HTML у plain text",
(true, "save_to_file") => "Шлях призначення, визначений адаптером",
(false, "url") => "HTTP or HTTPS URL",
(false, "url") => "HTTP/HTTPS URL, or a bare domain URL normalized to `https://`",
(false, "method") => "`GET` or `HEAD`",
(false, "as_markdown") => "Convert HTML to markdown",
(false, "as_text") => "Convert HTML to plain text",
Expand Down Expand Up @@ -1269,7 +1287,10 @@ mod tests {
let output_schema = tool.output_schema();

assert_eq!(input_schema["type"], "object");
assert_eq!(input_schema["properties"]["url"]["format"], "uri");
assert!(input_schema["properties"]["url"]["description"]
.as_str()
.unwrap()
.contains("bare domain URL"));
assert_eq!(input_schema["properties"]["method"]["default"], "GET");
assert!(input_schema["properties"]["if_none_match"].is_object());
assert!(input_schema["properties"]["if_modified_since"].is_object());
Expand Down
50 changes: 49 additions & 1 deletion crates/fetchkit/src/types.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,11 @@

use schemars::JsonSchema;
use serde::{Deserialize, Serialize};
use std::net::IpAddr;
use std::str::FromStr;
use url::Url;

use crate::error::FetchError;

/// HTTP method for the request
///
Expand Down Expand Up @@ -70,7 +74,8 @@ impl std::fmt::Display for HttpMethod {
/// ```
#[derive(Debug, Clone, Default, Serialize, Deserialize, JsonSchema)]
pub struct FetchRequest {
/// The URL to fetch (required, must be http:// or https://)
/// The URL to fetch. HTTP/HTTPS URLs are accepted as-is; bare domain URLs
/// such as `example.com/docs` are normalized to `https://example.com/docs`.
pub url: String,

/// HTTP method (optional, default GET)
Expand Down Expand Up @@ -115,6 +120,12 @@ impl FetchRequest {
}
}

/// Normalize browser-like bare host URLs before validation and fetching.
pub(crate) fn normalize_url_for_fetch(&mut self) -> Result<(), FetchError> {
self.url = canonical_fetch_url(&self.url)?;
Ok(())
}

/// Set the HTTP method
pub fn method(mut self, method: HttpMethod) -> Self {
self.method = Some(method);
Expand Down Expand Up @@ -181,6 +192,43 @@ impl FetchRequest {
}
}

fn canonical_fetch_url(raw_url: &str) -> Result<String, FetchError> {
if raw_url.starts_with("http://") || raw_url.starts_with("https://") {
return Ok(raw_url.to_string());
}

if raw_url.is_empty()
|| raw_url.starts_with("//")
|| raw_url.starts_with('/')
|| raw_url.contains("://")
|| raw_url.contains('\\')
|| raw_url.chars().any(char::is_whitespace)
|| raw_url.chars().any(char::is_control)
{
return Err(FetchError::InvalidUrlScheme);
}

let candidate = format!("https://{raw_url}");
let parsed = Url::parse(&candidate).map_err(|_| FetchError::InvalidUrlScheme)?;
let host = parsed.host_str().ok_or(FetchError::InvalidUrlScheme)?;

if !is_domain_like_bare_host(host)
|| !parsed.username().is_empty()
|| parsed.password().is_some()
{
return Err(FetchError::InvalidUrlScheme);
}

Ok(candidate)
}

fn is_domain_like_bare_host(host: &str) -> bool {
let host = host.trim_end_matches('.');
host.contains('.')
&& host.parse::<IpAddr>().is_err()
&& host.chars().any(|ch| ch.is_ascii_alphabetic())
}

/// A link extracted from the page with its text and href.
///
/// # Examples
Expand Down
125 changes: 123 additions & 2 deletions crates/fetchkit/tests/transport.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,10 @@
use async_trait::async_trait;
use bytes::Bytes;
use fetchkit::{
DnsPolicy, FetchOptions, FetchRequest, Fetcher, HttpMethod, HttpTransport, TransportError,
TransportMethod, TransportRequest, TransportResponse,
DnsPolicy, FetchError, FetchOptions, FetchRequest, Fetcher, HttpMethod, HttpTransport,
TransportError, TransportMethod, TransportRequest, TransportResponse,
};
use serde_json::json;
use std::sync::{Arc, Mutex};

/// One canned response keyed by URL substring.
Expand Down Expand Up @@ -273,6 +274,126 @@ async fn tool_execute_uses_injected_transport() {
assert!(calls[0].0.contains("example.test/tool"));
}

#[tokio::test]
async fn tool_execute_defaults_bare_host_urls_to_https() {
let mock = Arc::new(MockTransport::new().route(
"paseo.sh/docs/custom-providers",
200,
"text/plain",
b"bare host normalized",
));

let tool = fetchkit::Tool::builder()
.block_private_ips(false)
.transport(mock.clone())
.build();

let response = tool
.execute(FetchRequest::new("paseo.sh/docs/custom-providers"))
.await
.unwrap();

assert_eq!(response.status_code, 200);
assert_eq!(response.url, "https://paseo.sh/docs/custom-providers");
assert_eq!(response.content.as_deref(), Some("bare host normalized"));

let calls = mock.calls();
assert_eq!(calls.len(), 1);
assert_eq!(calls[0].0, "https://paseo.sh/docs/custom-providers");
}

#[tokio::test]
async fn tool_execution_defaults_bare_host_urls_to_https() {
let mock = Arc::new(MockTransport::new().route(
"paseo.sh/docs/custom-providers",
200,
"text/plain",
b"toolkit normalized",
));

let tool = fetchkit::Tool::builder()
.block_private_ips(false)
.transport(mock.clone())
.build();

let output = tool
.execution(json!({
"url": "paseo.sh/docs/custom-providers"
}))
.unwrap()
.execute()
.await
.unwrap();

assert_eq!(
output.result["url"],
"https://paseo.sh/docs/custom-providers"
);
assert_eq!(output.result["content"], "toolkit normalized");

let calls = mock.calls();
assert_eq!(calls.len(), 1);
assert_eq!(calls[0].0, "https://paseo.sh/docs/custom-providers");
}

#[tokio::test]
async fn bare_host_policy_checks_use_canonical_https_url() {
let mock = Arc::new(MockTransport::new().route(
"paseo.sh/docs/custom-providers",
200,
"text/plain",
b"allowed canonical URL",
));

let allow_tool = fetchkit::Tool::builder()
.block_private_ips(false)
.allow_prefix("https://paseo.sh/docs")
.transport(mock.clone())
.build();

let allowed = allow_tool
.execute(FetchRequest::new("paseo.sh/docs/custom-providers"))
.await
.unwrap();
assert_eq!(allowed.url, "https://paseo.sh/docs/custom-providers");

let block_tool = fetchkit::Tool::builder()
.block_private_ips(false)
.block_prefix("https://paseo.sh/docs")
.transport(mock.clone())
.build();

let blocked = block_tool
.execute(FetchRequest::new("paseo.sh/docs/custom-providers"))
.await
.unwrap_err();
assert!(matches!(blocked, FetchError::BlockedUrl));
}

#[tokio::test]
async fn bare_url_normalization_rejects_non_web_and_localish_inputs() {
let mock = Arc::new(MockTransport::new());
let tool = fetchkit::Tool::builder()
.block_private_ips(false)
.transport(mock.clone())
.build();

for url in [
"ftp://example.com/file",
"file:///etc/passwd",
"mailto:test@example.com",
"//example.com/path",
"localhost/path",
"intranet/path",
"127.0.0.1/path",
] {
let err = tool.execute(FetchRequest::new(url)).await.unwrap_err();
assert!(matches!(err, FetchError::InvalidUrlScheme), "{url}: {err}");
}

assert!(mock.calls().is_empty());
}

#[tokio::test]
async fn tool_execute_with_saver_uses_injected_transport() {
let mock = Arc::new(MockTransport::new().route(
Expand Down
3 changes: 2 additions & 1 deletion specs/initial.md
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,8 @@ Provide a builder to configure tool options, including:
#### Types

- `FetchRequest`
- `url: String` (required)
- `url: String` (required; explicit `http://` and `https://` are accepted as-is;
bare domain URLs such as `example.com/docs` are normalized to `https://example.com/docs`)
- `method: HttpMethod` (optional, default GET)
- `as_markdown: bool` (optional, feature-gated)
- `as_text: bool` (optional, feature-gated)
Expand Down
12 changes: 8 additions & 4 deletions specs/threat-model.md
Original file line number Diff line number Diff line change
Expand Up @@ -226,11 +226,15 @@ expected outbound path.
**TM-INPUT-001 — Scheme validation (MITIGATED):**
```rust
// THREAT[TM-INPUT-001]: Block non-HTTP schemes (file://, ftp://, data:, etc.)
// Mitigation: Early return with InvalidUrlScheme error
if !request.url.starts_with("http://") && !request.url.starts_with("https://") {
return Err(FetchError::InvalidUrlScheme);
}
// Mitigation: normalize domain-like bare hosts to https://, then reject any
// remaining URL without an explicit http:// or https:// scheme.
request.normalize_url_for_fetch()?;
```
Bare URL normalization is deliberately narrow: domain-like hosts such as
`example.com/docs` default to `https://example.com/docs`; non-web schemes,
protocol-relative URLs, bare IP literals, and single-label local names are rejected.
SSRF, DNS, port, allow-prefix, and block-prefix policies run after normalization
against the canonical URL that will be fetched.

**TM-INPUT-002 — URL prefix bypass via encoding (MITIGATED):**
Prefix matching now uses the `url` crate to parse both the URL and the prefix,
Expand Down
Loading