From b2d1705c16be859ffd15f7dbb55d680cabf6ad57 Mon Sep 17 00:00:00 2001
From: jichao wang <jichaowang02@gmail.com>
Date: Tue, 23 Jun 2026 02:02:19 +0100
Subject: [PATCH] Fix compute_head_fingerprint folding case-only title/meta
 changes

compute_head_fingerprint lowercased the entire <head> (`head_html.lower()`)
before extracting the title and meta values, so the captured signal values
were lowercased too. Two heads that differ only in the case of a title or
meta value (e.g. "iPhone" vs "IPHONE", "Buy Now" vs "BUY NOW") therefore
hashed to the same fingerprint. CacheValidator treats an equal fingerprint as
unchanged, so a genuinely updated page was reported FRESH and stale cached
content was served.

Match tags/attributes case-insensitively (re.IGNORECASE) against the original
head instead, so the extracted values keep their original case. Tag/attribute
case-insensitivity is preserved; identical content still hashes identically.

Add regression tests: a case-only value change now changes the fingerprint
(fails on the old code), while tag/attribute-only case differences do not.
---
 crawl4ai/utils.py                             | 10 +++++---
 .../cache_validation/test_head_fingerprint.py | 23 +++++++++++++++++++
 2 files changed, 30 insertions(+), 3 deletions(-)
diff --git a/crawl4ai/utils.py b/crawl4ai/utils.py
index 89fb782d9..0c7d0d75b 100644
--- a/crawl4ai/utils.py
+++ b/crawl4ai/utils.py
@@ -2920,11 +2920,15 @@ def compute_head_fingerprint(head_html: str) -> str:
     if not head_html:
         return ""
 
-    head_lower = head_html.lower()
     signals = []
 
+    # Match tags/attributes case-insensitively, but extract values from the
+    # ORIGINAL head: lowercasing the whole head first would fold a case-only
+    # title/meta change (e.g. "iPhone" -> "IPHONE") to the same fingerprint, so
+    # the cache validator would treat a genuinely changed page as unchanged.
+
     # Extract title
-    title_match = re.search(r'<title[^>]*>(.*?)</title>', head_lower, re.DOTALL)
+    title_match = re.search(r'<title[^>]*>(.*?)</title>', head_html, re.DOTALL | re.IGNORECASE)
     if title_match:
         signals.append(title_match.group(1).strip())
 
@@ -2946,7 +2950,7 @@ def compute_head_fingerprint(head_html: str) -> str:
             rf'<meta[^>]*content=["\']([^"\']*)["\'][^>]*{attr_type}=["\']{re.escape(attr_value)}["\']',
         ]
         for pattern in patterns:
-            match = re.search(pattern, head_lower)
+            match = re.search(pattern, head_html, re.IGNORECASE)
             if match:
                 signals.append(match.group(1).strip())
                 break  # Found this tag, move to next
diff --git a/tests/cache_validation/test_head_fingerprint.py b/tests/cache_validation/test_head_fingerprint.py
index 287f255d8..3d10a162e 100644
--- a/tests/cache_validation/test_head_fingerprint.py
+++ b/tests/cache_validation/test_head_fingerprint.py
@@ -95,3 +95,26 @@ def test_real_world_head(self):
         assert fp != ""
         # Should be deterministic
         assert fp == compute_head_fingerprint(head)
+
+    def test_value_case_change_changes_fingerprint(self):
+        """A case-only change in a title/meta *value* must change the
+        fingerprint; otherwise the cache validator would treat a genuinely
+        updated page as unchanged and serve stale content (regression test)."""
+        assert compute_head_fingerprint(
+            "<head><title>iPhone</title></head>"
+        ) != compute_head_fingerprint("<head><title>IPHONE</title></head>")
+        assert compute_head_fingerprint(
+            '<head><meta name="description" content="Buy Now"></head>'
+        ) != compute_head_fingerprint(
+            '<head><meta name="description" content="BUY NOW"></head>'
+        )
+
+    def test_tag_and_attribute_case_does_not_change_fingerprint(self):
+        """Tag and attribute names are matched case-insensitively: heads that
+        differ only in markup case (same values) share one fingerprint."""
+        assert compute_head_fingerprint(
+            "<HEAD><TITLE>Hello</TITLE></HEAD>"
+        ) == compute_head_fingerprint("<head><title>Hello</title></head>")
+        assert compute_head_fingerprint(
+            '<head><META NAME="description" CONTENT="Hi"></head>'
+        ) == compute_head_fingerprint('<head><meta name="description" content="Hi"></head>')