diff --git a/crawl4ai/utils.py b/crawl4ai/utils.py index 89fb782d9..0c7d0d75b 100644 --- a/crawl4ai/utils.py +++ b/crawl4ai/utils.py @@ -2920,11 +2920,15 @@ def compute_head_fingerprint(head_html: str) -> str: if not head_html: return "" - head_lower = head_html.lower() signals = [] + # Match tags/attributes case-insensitively, but extract values from the + # ORIGINAL head: lowercasing the whole head first would fold a case-only + # title/meta change (e.g. "iPhone" -> "IPHONE") to the same fingerprint, so + # the cache validator would treat a genuinely changed page as unchanged. + # Extract title - title_match = re.search(r'<title[^>]*>(.*?)</title>', head_lower, re.DOTALL) + title_match = re.search(r'<title[^>]*>(.*?)</title>', head_html, re.DOTALL | re.IGNORECASE) if title_match: signals.append(title_match.group(1).strip()) @@ -2946,7 +2950,7 @@ def compute_head_fingerprint(head_html: str) -> str: rf'<meta[^>]*content=["\']([^"\']*)["\'][^>]*{attr_type}=["\']{re.escape(attr_value)}["\']', ] for pattern in patterns: - match = re.search(pattern, head_lower) + match = re.search(pattern, head_html, re.IGNORECASE) if match: signals.append(match.group(1).strip()) break # Found this tag, move to next diff --git a/tests/cache_validation/test_head_fingerprint.py b/tests/cache_validation/test_head_fingerprint.py index 287f255d8..3d10a162e 100644 --- a/tests/cache_validation/test_head_fingerprint.py +++ b/tests/cache_validation/test_head_fingerprint.py @@ -95,3 +95,26 @@ def test_real_world_head(self): assert fp != "" # Should be deterministic assert fp == compute_head_fingerprint(head) + + def test_value_case_change_changes_fingerprint(self): + """A case-only change in a title/meta *value* must change the + fingerprint, otherwise the cache validator treats a genuinely updated + page as unchanged and serves stale content. 
Regression.""" + assert compute_head_fingerprint( + "iPhone" + ) != compute_head_fingerprint("IPHONE") + assert compute_head_fingerprint( + '' + ) != compute_head_fingerprint( + '' + ) + + def test_tag_and_attribute_case_does_not_change_fingerprint(self): + """Tag/attribute case is still matched case-insensitively; only the + markup case (not the values) differing yields the same fingerprint.""" + assert compute_head_fingerprint( + "Hello" + ) == compute_head_fingerprint("Hello") + assert compute_head_fingerprint( + '' + ) == compute_head_fingerprint('')