From 9ce8ffd50286dc5ca3215febb3c4408a400b8681 Mon Sep 17 00:00:00 2001
From: jichao wang <jichaowang02@gmail.com>
Date: Sun, 21 Jun 2026 17:56:40 +0100
Subject: [PATCH] Fix is_external_url misclassifying sibling domains as
 internal

is_external_url decided same-site vs external with `not url_domain.endswith(base)`,
a raw string-suffix test with no domain-label boundary. Any host that merely
ends with the base string was treated as internal, so look-alike / sibling
domains were mislabeled:

    is_external_url("https://notexample.com/x", "example.com")  -> False (internal)

even though notexample.com is a different registrable domain. This corrupts the
internal/external link buckets that quick_extract_links() and deep-crawl scoping
rely on (and lets a phishing-style look-alike host pass as same-site).

Require a label boundary: same site iff the domain equals base or is a true
sub-domain (`url_domain == base or url_domain.endswith("." + base)`). Real
subdomains (sub.example.com) and the www variant stay internal; unrelated and
look-alike domains are correctly external.

Adds TestIsExternalUrl in tests/regression/test_reg_utils.py; the sibling-domain
cases fail on the old code and pass with this fix.
---
 crawl4ai/utils.py                  |  7 ++--
 tests/regression/test_reg_utils.py | 51 ++++++++++++++++++++++++++++++
 2 files changed, 56 insertions(+), 2 deletions(-)

diff --git a/crawl4ai/utils.py b/crawl4ai/utils.py
index 89fb782d9..cd9e3be72 100644
--- a/crawl4ai/utils.py
+++ b/crawl4ai/utils.py
@@ -2610,8 +2610,11 @@ def is_external_url(url: str, base_domain: str) -> bool:
         url_domain = parsed.netloc.lower().split(":")[0].replace("www.", "")
         base = base_domain.lower().split(":")[0].replace("www.", "")
 
-        # Check if URL domain ends with base domain
-        return not url_domain.endswith(base)
+        # Same site iff the domains are equal or url_domain is a sub-domain of
+        # base (a label boundary, i.e. a leading dot, is required). A bare
+        # ``endswith`` has no boundary, so it wrongly treats ``notexample.com``
+        # as internal to ``example.com``.
+        return not (url_domain == base or url_domain.endswith("." + base))
     except Exception:
         return False
 
diff --git a/tests/regression/test_reg_utils.py b/tests/regression/test_reg_utils.py
index dfc63c42d..2fbad0e24 100644
--- a/tests/regression/test_reg_utils.py
+++ b/tests/regression/test_reg_utils.py
@@ -10,6 +10,8 @@
 from crawl4ai.utils import (
     extract_xml_data,
     extract_xml_data_legacy,
+    get_base_domain,
+    is_external_url,
     normalize_url,
     normalize_url_for_deep_crawl,
     efficient_normalize_url_for_deep_crawl,
@@ -498,3 +500,52 @@ def test_image_description_threshold_exists(self):
         """IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD should exist."""
         from crawl4ai.config import IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD
         assert isinstance(IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD, (int, float))
+
+
+# ===================================================================
+# is_external_url
+# ===================================================================
+
+class TestIsExternalUrl:
+    """is_external_url must use a domain-label boundary, not a raw string suffix.
+
+    Before the fix, ``not url_domain.endswith(base)`` treated any host merely
+    *ending with* the base string as internal, so sibling/look-alike domains
+    (``notexample.com``, ``evilexample.com``) were mislabeled as internal to
+    ``example.com``, corrupting internal/external link bucketing and deep-crawl
+    scoping. Internal now requires host == base or a ``"." + base`` suffix.
+    """
+
+    BASE = get_base_domain("https://example.com/page")  # -> "example.com"
+
+    def test_same_domain_is_internal(self):
+        assert is_external_url("https://example.com/other", self.BASE) is False
+
+    def test_www_variant_is_internal(self):
+        assert is_external_url("https://www.example.com/x", self.BASE) is False
+
+    def test_real_subdomain_is_internal(self):
+        assert is_external_url("https://sub.example.com/x", self.BASE) is False
+        assert is_external_url("https://deep.sub.example.com/x", self.BASE) is False
+
+    def test_unrelated_domain_is_external(self):
+        assert is_external_url("https://other.com/x", self.BASE) is True
+
+    @pytest.mark.parametrize(
+        "url",
+        [
+            "https://notexample.com/landing",
+            "https://myexample.com/x",
+            "https://evilexample.com/phish",
+        ],
+    )
+    def test_sibling_lookalike_domain_is_external(self, url):
+        # The regression: a label boundary is required, so a host that only
+        # shares the trailing string is external, not internal.
+        assert is_external_url(url, self.BASE) is True
+
+    def test_special_scheme_is_external(self):
+        assert is_external_url("mailto:a@b.com", self.BASE) is True
+
+    def test_relative_url_is_internal(self):
+        assert is_external_url("/relative/path", self.BASE) is False