Skip to content
12 changes: 1 addition & 11 deletions crates/trusted-server-core/src/html_processor.rs
Original file line number Diff line number Diff line change
Expand Up @@ -455,7 +455,6 @@ pub fn create_html_processor(config: HtmlProcessorConfig) -> impl StreamProcesso
}),
];

let has_script_rewriters = !script_rewriters.is_empty();
for script_rewriter in script_rewriters {
let selector = script_rewriter.selector();
let rewriter = script_rewriter.clone();
Expand Down Expand Up @@ -493,16 +492,7 @@ pub fn create_html_processor(config: HtmlProcessorConfig) -> impl StreamProcesso
..RewriterSettings::default()
};

// Use buffered mode when script rewriters are registered. lol_html fragments
// text nodes across input chunk boundaries, breaking rewriters that expect
// complete text (e.g., __NEXT_DATA__, GTM). Buffered mode feeds the entire
// document in one write() call, preserving text node integrity.
// Phase 3 will make rewriters fragment-safe, enabling streaming for all configs.
let inner = if has_script_rewriters {
HtmlRewriterAdapter::new_buffered(rewriter_settings)
} else {
HtmlRewriterAdapter::new(rewriter_settings)
};
let inner = HtmlRewriterAdapter::new(rewriter_settings);

HtmlWithPostProcessing {
inner,
Expand Down
267 changes: 262 additions & 5 deletions crates/trusted-server-core/src/integrations/google_tag_manager.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
//! | `GET/POST` | `.../collect` | Proxies GA analytics beacons |
//! | `GET/POST` | `.../g/collect` | Proxies GA4 analytics beacons |

use std::sync::{Arc, LazyLock};
use std::sync::{Arc, LazyLock, Mutex};

use async_trait::async_trait;
use error_stack::{Report, ResultExt};
Expand Down Expand Up @@ -132,11 +132,22 @@ fn validate_container_id(container_id: &str) -> Result<(), validator::Validation

pub struct GoogleTagManagerIntegration {
config: GoogleTagManagerConfig,
/// Accumulates text fragments when `lol_html` splits a text node across
/// chunk boundaries. Drained on `is_last_in_text_node`.
///
/// Uses `Mutex` to satisfy the `Sync` bound on `IntegrationScriptRewriter`.
/// The pipeline is single-threaded (`lol_html::HtmlRewriter` is `!Send`),
/// so the lock is uncontended. `lol_html` delivers text chunks sequentially
/// per element — the buffer is always empty when a new element's text begins.
accumulated_text: Mutex<String>,
}

impl GoogleTagManagerIntegration {
fn new(config: GoogleTagManagerConfig) -> Arc<Self> {
Arc::new(Self { config })
Arc::new(Self {
config,
accumulated_text: Mutex::new(String::new()),
})
}

fn error(message: impl Into<String>) -> TrustedServerError {
Expand Down Expand Up @@ -488,14 +499,40 @@ impl IntegrationScriptRewriter for GoogleTagManagerIntegration {
"script" // Match all scripts to find inline GTM snippets
}

fn rewrite(&self, content: &str, _ctx: &IntegrationScriptContext<'_>) -> ScriptRewriteAction {
fn rewrite(&self, content: &str, ctx: &IntegrationScriptContext<'_>) -> ScriptRewriteAction {
let mut buf = self
.accumulated_text
.lock()
.unwrap_or_else(std::sync::PoisonError::into_inner);

if !ctx.is_last_in_text_node {
// Intermediate fragment — accumulate and suppress output.
buf.push_str(content);
return ScriptRewriteAction::RemoveNode;
}

// Last fragment. If we accumulated prior fragments, combine them.
let full_content: Option<String> = if buf.is_empty() {
None
} else {
buf.push_str(content);
Some(std::mem::take(&mut *buf))
};
let text = full_content.as_deref().unwrap_or(content);

// Look for the GTM snippet pattern.
// Standard snippet contains: "googletagmanager.com/gtm.js"
// Note: analytics.google.com is intentionally excluded — gtag.js stores
// that domain as a bare string and constructs URLs dynamically, so
// rewriting it in scripts produces broken URLs.
if content.contains("googletagmanager.com") || content.contains("google-analytics.com") {
return ScriptRewriteAction::replace(Self::rewrite_gtm_urls(content));
if text.contains("googletagmanager.com") || text.contains("google-analytics.com") {
return ScriptRewriteAction::replace(Self::rewrite_gtm_urls(text));
}

// No GTM content — if we accumulated fragments, emit them unchanged.
// Intermediate fragments were already suppressed via RemoveNode.
if full_content.is_some() {
return ScriptRewriteAction::replace(text.to_string());
}

ScriptRewriteAction::keep()
Expand Down Expand Up @@ -1632,4 +1669,224 @@ container_id = "GTM-DEFAULT"
other => panic!("Expected Integration error, got {:?}", other),
}
}

#[test]
fn fragmented_gtm_snippet_is_accumulated_and_rewritten() {
let config = GoogleTagManagerConfig {
enabled: true,
container_id: "GTM-FRAG1".to_string(),
upstream_url: "https://www.googletagmanager.com".to_string(),
cache_max_age: default_cache_max_age(),
max_beacon_body_size: default_max_beacon_body_size(),
};
let integration = GoogleTagManagerIntegration::new(config);

let document_state = IntegrationDocumentState::default();

// Simulate lol_html splitting the GTM snippet mid-domain.
let fragment1 = r#"(function(w,d,s,l,i){j.src='https://www.google"#;
let fragment2 = r#"tagmanager.com/gtm.js?id='+i;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-FRAG1');"#;

let ctx_intermediate = IntegrationScriptContext {
selector: "script",
request_host: "publisher.example.com",
request_scheme: "https",
origin_host: "origin.example.com",
is_last_in_text_node: false,
document_state: &document_state,
};
let ctx_last = IntegrationScriptContext {
is_last_in_text_node: true,
..ctx_intermediate
};

// Intermediate fragment: should be suppressed.
let action1 =
IntegrationScriptRewriter::rewrite(&*integration, fragment1, &ctx_intermediate);
assert_eq!(
action1,
ScriptRewriteAction::RemoveNode,
"should suppress intermediate fragment"
);

// Last fragment: should emit full rewritten content.
let action2 = IntegrationScriptRewriter::rewrite(&*integration, fragment2, &ctx_last);
match action2 {
ScriptRewriteAction::Replace(rewritten) => {
assert!(
rewritten.contains("/integrations/google_tag_manager/gtm.js"),
"should rewrite GTM URL. Got: {rewritten}"
);
assert!(
!rewritten.contains("googletagmanager.com"),
"should not contain original GTM domain. Got: {rewritten}"
);
}
other => panic!("expected Replace for fragmented GTM, got {other:?}"),
}
}

#[test]
fn non_gtm_fragmented_script_is_passed_through() {
let config = GoogleTagManagerConfig {
enabled: true,
container_id: "GTM-PASS1".to_string(),
upstream_url: "https://www.googletagmanager.com".to_string(),
cache_max_age: default_cache_max_age(),
max_beacon_body_size: default_max_beacon_body_size(),
};
let integration = GoogleTagManagerIntegration::new(config);

let document_state = IntegrationDocumentState::default();

let fragment1 = "console.log('hel";
let fragment2 = "lo world');";

let ctx_intermediate = IntegrationScriptContext {
selector: "script",
request_host: "publisher.example.com",
request_scheme: "https",
origin_host: "origin.example.com",
is_last_in_text_node: false,
document_state: &document_state,
};
let ctx_last = IntegrationScriptContext {
is_last_in_text_node: true,
..ctx_intermediate
};

let action1 =
IntegrationScriptRewriter::rewrite(&*integration, fragment1, &ctx_intermediate);
assert_eq!(
action1,
ScriptRewriteAction::RemoveNode,
"should suppress intermediate"
);

// Last fragment: should emit full unchanged content since it's not GTM.
let action2 = IntegrationScriptRewriter::rewrite(&*integration, fragment2, &ctx_last);
match action2 {
ScriptRewriteAction::Replace(content) => {
assert_eq!(
content, "console.log('hello world');",
"should emit full accumulated non-GTM content"
);
}
other => panic!("expected Replace with passthrough, got {other:?}"),
}
}

/// Verify the accumulation buffer drains correctly between two consecutive
/// `<script>` elements. The first is a fragmented GTM script, the second
/// is a fragmented non-GTM script. Both must produce correct output.
#[test]
fn accumulation_buffer_drains_between_consecutive_script_elements() {
let config = GoogleTagManagerConfig {
enabled: true,
container_id: "GTM-MULTI1".to_string(),
upstream_url: "https://www.googletagmanager.com".to_string(),
cache_max_age: default_cache_max_age(),
max_beacon_body_size: default_max_beacon_body_size(),
};
let integration = GoogleTagManagerIntegration::new(config);
let document_state = IntegrationDocumentState::default();

// --- First <script>: fragmented GTM snippet ---
let gtm_frag1 = r#"j.src='https://www.google"#;
let gtm_frag2 = r#"tagmanager.com/gtm.js?id=GTM-MULTI1';"#;

let ctx_intermediate = IntegrationScriptContext {
selector: "script",
request_host: "publisher.example.com",
request_scheme: "https",
origin_host: "origin.example.com",
is_last_in_text_node: false,
document_state: &document_state,
};
let ctx_last = IntegrationScriptContext {
is_last_in_text_node: true,
..ctx_intermediate
};

let action =
IntegrationScriptRewriter::rewrite(&*integration, gtm_frag1, &ctx_intermediate);
assert_eq!(action, ScriptRewriteAction::RemoveNode);

let action = IntegrationScriptRewriter::rewrite(&*integration, gtm_frag2, &ctx_last);
assert!(
matches!(action, ScriptRewriteAction::Replace(ref s) if s.contains("/integrations/google_tag_manager/gtm.js")),
"first element: should rewrite GTM URL. Got: {action:?}"
);

// --- Second <script>: fragmented non-GTM script ---
// Buffer must be empty here — no leftover from the first element.
let other_frag1 = "console.log('hel";
let other_frag2 = "lo');";

let action =
IntegrationScriptRewriter::rewrite(&*integration, other_frag1, &ctx_intermediate);
assert_eq!(
action,
ScriptRewriteAction::RemoveNode,
"second element intermediate should suppress"
);

let action = IntegrationScriptRewriter::rewrite(&*integration, other_frag2, &ctx_last);
match action {
ScriptRewriteAction::Replace(content) => {
assert_eq!(
content, "console.log('hello');",
"second element should contain only its own content, no GTM leftover"
);
}
other => panic!("expected Replace for second element, got {other:?}"),
}
}

/// Regression test: with a small chunk size, `lol_html` fragments the
/// inline GTM script text node. The rewriter must accumulate fragments
/// and produce correct output through the full HTML pipeline.
#[test]
fn small_chunk_gtm_rewrite_survives_fragmentation() {
let mut settings = make_settings();
settings
.integrations
.insert_config(
"google_tag_manager",
&serde_json::json!({
"enabled": true,
"container_id": "GTM-SMALL1"
}),
)
.expect("should update config");

let registry = IntegrationRegistry::new(&settings).expect("should create registry");
let config = config_from_settings(&settings, &registry);
let processor = create_html_processor(config);

// Use a very small chunk size to force fragmentation mid-domain.
let pipeline_config = PipelineConfig {
input_compression: Compression::None,
output_compression: Compression::None,
chunk_size: 32,
};
let mut pipeline = StreamingPipeline::new(pipeline_config, processor);

let html_input = r#"<html><head><script>j.src='https://www.googletagmanager.com/gtm.js?id=GTM-SMALL1';</script></head><body></body></html>"#;

let mut output = Vec::new();
pipeline
.process(Cursor::new(html_input.as_bytes()), &mut output)
.expect("should process with small chunks");
let processed = String::from_utf8_lossy(&output);

assert!(
processed.contains("/integrations/google_tag_manager/gtm.js"),
"should rewrite fragmented GTM URL. Got: {processed}"
);
assert!(
!processed.contains("googletagmanager.com"),
"should not contain original GTM domain. Got: {processed}"
);
}
}
51 changes: 51 additions & 0 deletions crates/trusted-server-core/src/integrations/nextjs/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -599,4 +599,55 @@ mod tests {
final_html
);
}

/// Regression test: with a small chunk size, `lol_html` fragments the
/// `__NEXT_DATA__` text node across chunks. The rewriter must accumulate
/// fragments and produce correct output.
#[test]
fn small_chunk_next_data_rewrite_survives_fragmentation() {
// Build a __NEXT_DATA__ payload large enough to cross a 32-byte chunk boundary.
let html = r#"<html><body><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{"href":"https://origin.example.com/reviews","title":"Hello World"}}}</script></body></html>"#;

let mut settings = create_test_settings();
settings
.integrations
.insert_config(
"nextjs",
&json!({
"enabled": true,
"rewrite_attributes": ["href", "link", "url"],
}),
)
.expect("should update nextjs config");
let registry = IntegrationRegistry::new(&settings).expect("should create registry");
let config = config_from_settings(&settings, &registry);
let processor = create_html_processor(config);

// Use a very small chunk size to force fragmentation.
let pipeline_config = PipelineConfig {
input_compression: Compression::None,
output_compression: Compression::None,
chunk_size: 32,
};
let mut pipeline = StreamingPipeline::new(pipeline_config, processor);

let mut output = Vec::new();
pipeline
.process(Cursor::new(html.as_bytes()), &mut output)
.expect("should process with small chunks");

let processed = String::from_utf8_lossy(&output);
assert!(
processed.contains("test.example.com") && processed.contains("/reviews"),
"should rewrite fragmented __NEXT_DATA__ href. Got: {processed}"
);
assert!(
!processed.contains("origin.example.com/reviews"),
"should not contain original origin href. Got: {processed}"
);
assert!(
processed.contains("Hello World"),
"should preserve non-URL content. Got: {processed}"
);
}
}
Loading
Loading