From 9ce8ffd50286dc5ca3215febb3c4408a400b8681 Mon Sep 17 00:00:00 2001 From: jichao wang Date: Sun, 21 Jun 2026 17:56:40 +0100 Subject: [PATCH] Fix is_external_url misclassifying sibling domains as internal is_external_url decided same-site vs external with `not url_domain.endswith(base)`, a raw string-suffix test with no domain-label boundary. Any host that merely ends with the base string was treated as internal, so look-alike / sibling domains were mislabeled: is_external_url("https://notexample.com/x", "example.com") -> False (internal) even though notexample.com is a different registrable domain. This corrupts the internal/external link buckets that quick_extract_links() and deep-crawl scoping rely on (and lets a phishing-style look-alike host pass as same-site). Require a label boundary: same site iff the domain equals base or is a true sub-domain (`url_domain == base or url_domain.endswith("." + base)`). Real subdomains (sub.example.com) and the www variant stay internal; unrelated and look-alike domains are correctly external. Adds TestIsExternalUrl in tests/regression/test_reg_utils.py; the sibling-domain cases fail on the old code and pass with this fix. --- crawl4ai/utils.py | 7 ++-- tests/regression/test_reg_utils.py | 51 ++++++++++++++++++++++++++++++ 2 files changed, 56 insertions(+), 2 deletions(-) diff --git a/crawl4ai/utils.py b/crawl4ai/utils.py index 89fb782d9..cd9e3be72 100644 --- a/crawl4ai/utils.py +++ b/crawl4ai/utils.py @@ -2610,8 +2610,11 @@ def is_external_url(url: str, base_domain: str) -> bool: url_domain = parsed.netloc.lower().split(":")[0].replace("www.", "") base = base_domain.lower().split(":")[0].replace("www.", "") - # Check if URL domain ends with base domain - return not url_domain.endswith(base) + # Same site iff the domains are equal or url_domain is a sub-domain of + # base (a label boundary, i.e. a leading dot, is required). A bare + # ``endswith`` has no boundary, so it wrongly treats ``notexample.com`` + # as internal to ``example.com``. + return not (url_domain == base or url_domain.endswith("." + base)) except Exception: return False diff --git a/tests/regression/test_reg_utils.py b/tests/regression/test_reg_utils.py index dfc63c42d..2fbad0e24 100644 --- a/tests/regression/test_reg_utils.py +++ b/tests/regression/test_reg_utils.py @@ -10,6 +10,8 @@ from crawl4ai.utils import ( extract_xml_data, extract_xml_data_legacy, + get_base_domain, + is_external_url, normalize_url, normalize_url_for_deep_crawl, efficient_normalize_url_for_deep_crawl, @@ -498,3 +500,52 @@ def test_image_description_threshold_exists(self): """IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD should exist.""" from crawl4ai.config import IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD assert isinstance(IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD, (int, float)) + + +# =================================================================== +# is_external_url +# =================================================================== + +class TestIsExternalUrl: + """is_external_url must use a domain-label boundary, not a raw string suffix. + + Before the fix, ``not url_domain.endswith(base)`` treated any host that + merely *ended with* the base string as internal, so sibling/look-alike + domains (``notexample.com``, ``evilexample.com``) were mislabeled as + internal to ``example.com`` — corrupting internal/external link bucketing + and deep-crawl scoping. + """ + + BASE = get_base_domain("https://example.com/page") # -> "example.com" + + def test_same_domain_is_internal(self): + assert is_external_url("https://example.com/other", self.BASE) is False + + def test_www_variant_is_internal(self): + assert is_external_url("https://www.example.com/x", self.BASE) is False + + def test_real_subdomain_is_internal(self): + assert is_external_url("https://sub.example.com/x", self.BASE) is False + assert is_external_url("https://deep.sub.example.com/x", self.BASE) is False + + def test_unrelated_domain_is_external(self): + assert is_external_url("https://other.com/x", self.BASE) is True + + @pytest.mark.parametrize( + "url", + [ + "https://notexample.com/landing", + "https://myexample.com/x", + "https://evilexample.com/phish", + ], + ) + def test_sibling_lookalike_domain_is_external(self, url): + # The regression: a label boundary is required, so a host that only + # shares the trailing string is external, not internal. + assert is_external_url(url, self.BASE) is True + + def test_special_scheme_is_external(self): + assert is_external_url("mailto:a@b.com", self.BASE) is True + + def test_relative_url_is_internal(self): + assert is_external_url("/relative/path", self.BASE) is False