diff --git a/crawl4ai/async_url_seeder.py b/crawl4ai/async_url_seeder.py
index 22fa8f630..16fc89997 100644
--- a/crawl4ai/async_url_seeder.py
+++ b/crawl4ai/async_url_seeder.py
@@ -1694,8 +1694,12 @@ def _is_nonsense_url(self, url: str) -> bool:
         if any(pattern in url_lower for pattern in ['?print=', '&print=', '/print/', '_print.']):
             return True
 
-        # 12. Very short paths (likely homepage redirects or errors)
-        if len(path.strip('/')) < 3 and path not in ['/', '/en', '/de', '/fr', '/es', '/it']:
+        # 12. Very short paths (likely homepage redirects or errors).
+        # Compare the slash-stripped path against the whitelist too, so the
+        # canonical trailing-slash form of a language root (e.g. "/en/") is
+        # kept just like "/en" instead of being dropped as nonsense.
+        stripped_path = path.strip('/')
+        if len(stripped_path) < 3 and stripped_path not in ['', 'en', 'de', 'fr', 'es', 'it']:
             return True
 
         return False
diff --git a/tests/unit/test_nonsense_url_language_roots.py b/tests/unit/test_nonsense_url_language_roots.py
new file mode 100644
index 000000000..e3d9ad740
--- /dev/null
+++ b/tests/unit/test_nonsense_url_language_roots.py
@@ -0,0 +1,53 @@
+"""Regression test for ``AsyncUrlSeeder._is_nonsense_url`` language roots.
+
+The "very short path" filter measured length on the slash-stripped path but
+matched the *un-stripped* path against the language-root whitelist, so the
+canonical trailing-slash form (e.g. ``/en/``) was dropped as nonsense even
+though ``/en`` was kept.
+"""
+import sys
+from types import SimpleNamespace
+
+import pytest
+
+# Avoid the optional rank_bm25 dependency at import time (mirrors the sibling
+# unit test); _is_nonsense_url itself does not use BM25.
+sys.modules.setdefault("rank_bm25", SimpleNamespace(BM25Okapi=object))
+
+from crawl4ai.async_url_seeder import AsyncUrlSeeder
+
+
+@pytest.fixture
+def seeder():
+    # _is_nonsense_url only uses its url argument (no instance state), so it is
+    # safe to bypass __init__.
+    return AsyncUrlSeeder.__new__(AsyncUrlSeeder)
+
+
+@pytest.mark.parametrize(
+    "url",
+    [
+        "https://example.com/en",
+        "https://example.com/en/",  # the bug: canonical trailing-slash form
+        "https://example.com/de/",
+        "https://example.com/fr/",
+        "https://example.com/es/",
+        "https://example.com/it/",
+        "https://example.com/",
+        "https://example.com/about",
+    ],
+)
+def test_language_roots_are_not_filtered(seeder, url):
+    assert seeder._is_nonsense_url(url) is False
+
+
+@pytest.mark.parametrize(
+    "url",
+    [
+        "https://example.com/ab/",  # genuinely short, non-language path
+        "https://example.com/x",
+        "https://example.com/robots.txt",
+    ],
+)
+def test_short_junk_and_utility_still_filtered(seeder, url):
+    assert seeder._is_nonsense_url(url) is True