From b2d1705c16be859ffd15f7dbb55d680cabf6ad57 Mon Sep 17 00:00:00 2001 From: jichao wang Date: Tue, 23 Jun 2026 02:02:19 +0100 Subject: [PATCH] Fix compute_head_fingerprint folding case-only title/meta changes compute_head_fingerprint lowercased the entire head (`head_html.lower()`) before extracting the title and meta values, so the captured signal values were lowercased too. Two heads that differ only in the case of a title or meta value (e.g. "iPhone" vs "IPHONE", "Buy Now" vs "BUY NOW") therefore hashed to the same fingerprint. CacheValidator treats an equal fingerprint as unchanged, so a genuinely updated page was reported FRESH and stale cached content was served. Match tags/attributes case-insensitively (re.IGNORECASE) against the original head instead, so the extracted values keep their original case. Tag/attribute case-insensitivity is preserved; identical content still hashes identically. Adds regression tests: a case-only value change now changes the fingerprint (fails on the old code), while tag/attribute-only case differences do not. --- crawl4ai/utils.py | 10 +++++--- .../cache_validation/test_head_fingerprint.py | 23 +++++++++++++++++++ 2 files changed, 30 insertions(+), 3 deletions(-) diff --git a/crawl4ai/utils.py b/crawl4ai/utils.py index 89fb782d9..0c7d0d75b 100644 --- a/crawl4ai/utils.py +++ b/crawl4ai/utils.py @@ -2920,11 +2920,15 @@ def compute_head_fingerprint(head_html: str) -> str: if not head_html: return "" - head_lower = head_html.lower() signals = [] + # Match tags/attributes case-insensitively, but extract values from the + # ORIGINAL head: lowercasing the whole head first would fold a case-only + # title/meta change (e.g. "iPhone" -> "IPHONE") to the same fingerprint, so + # the cache validator would treat a genuinely changed page as unchanged. 
+ # Extract title - title_match = re.search(r']*>(.*?)', head_lower, re.DOTALL) + title_match = re.search(r']*>(.*?)', head_html, re.DOTALL | re.IGNORECASE) if title_match: signals.append(title_match.group(1).strip()) @@ -2946,7 +2950,7 @@ def compute_head_fingerprint(head_html: str) -> str: rf']*content=["\']([^"\']*)["\'][^>]*{attr_type}=["\']{re.escape(attr_value)}["\']', ] for pattern in patterns: - match = re.search(pattern, head_lower) + match = re.search(pattern, head_html, re.IGNORECASE) if match: signals.append(match.group(1).strip()) break # Found this tag, move to next diff --git a/tests/cache_validation/test_head_fingerprint.py b/tests/cache_validation/test_head_fingerprint.py index 287f255d8..3d10a162e 100644 --- a/tests/cache_validation/test_head_fingerprint.py +++ b/tests/cache_validation/test_head_fingerprint.py @@ -95,3 +95,26 @@ def test_real_world_head(self): assert fp != "" # Should be deterministic assert fp == compute_head_fingerprint(head) + + def test_value_case_change_changes_fingerprint(self): + """A case-only change in a title/meta *value* must change the + fingerprint, otherwise the cache validator treats a genuinely updated + page as unchanged and serves stale content. Regression.""" + assert compute_head_fingerprint( + "iPhone" + ) != compute_head_fingerprint("IPHONE") + assert compute_head_fingerprint( + '' + ) != compute_head_fingerprint( + '' + ) + + def test_tag_and_attribute_case_does_not_change_fingerprint(self): + """Tag/attribute case is still matched case-insensitively; only the + markup case (not the values) differing yields the same fingerprint.""" + assert compute_head_fingerprint( + "Hello" + ) == compute_head_fingerprint("Hello") + assert compute_head_fingerprint( + '' + ) == compute_head_fingerprint('')