Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 6 additions & 2 deletions crawl4ai/async_url_seeder.py
Original file line number Diff line number Diff line change
Expand Up @@ -1694,8 +1694,12 @@ def _is_nonsense_url(self, url: str) -> bool:
if any(pattern in url_lower for pattern in ['?print=', '&print=', '/print/', '_print.']):
return True

# 12. Very short paths (likely homepage redirects or errors)
if len(path.strip('/')) < 3 and path not in ['/', '/en', '/de', '/fr', '/es', '/it']:
# 12. Very short paths (likely homepage redirects or errors).
# Compare the slash-stripped path against the whitelist too, so the
# canonical trailing-slash form of a language root (e.g. "/en/") is
# kept just like "/en" instead of being dropped as nonsense.
stripped_path = path.strip('/')
if len(stripped_path) < 3 and stripped_path not in ['', 'en', 'de', 'fr', 'es', 'it']:
return True

return False
Expand Down
53 changes: 53 additions & 0 deletions tests/unit/test_nonsense_url_language_roots.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
"""Regression test for ``AsyncUrlSeeder._is_nonsense_url`` language roots.

The "very short path" filter measured length on the slash-stripped path but
matched the *un-stripped* path against the language-root whitelist, so the
canonical trailing-slash form (e.g. ``/en/``) was dropped as nonsense even
though ``/en`` was kept.
"""
import sys
from types import SimpleNamespace

import pytest

# Avoid the optional rank_bm25 dependency at import time (mirrors the sibling
# unit test); _is_nonsense_url itself does not use BM25.
sys.modules.setdefault("rank_bm25", SimpleNamespace(BM25Okapi=object))

from crawl4ai.async_url_seeder import AsyncUrlSeeder


@pytest.fixture
def seeder():
    """Provide an ``AsyncUrlSeeder`` instance created without running ``__init__``.

    The tests in this module only call ``_is_nonsense_url``, which works
    purely from its ``url`` argument, so skipping the constructor is safe.
    """
    bare_instance = AsyncUrlSeeder.__new__(AsyncUrlSeeder)
    return bare_instance


@pytest.mark.parametrize(
    "url",
    (
        "https://example.com/en",
        # The regression: canonical trailing-slash form of a language root.
        "https://example.com/en/",
        "https://example.com/de/",
        "https://example.com/fr/",
        "https://example.com/es/",
        "https://example.com/it/",
        "https://example.com/",
        "https://example.com/about",
    ),
)
def test_language_roots_are_not_filtered(seeder, url):
    """Language roots, the site root and ordinary pages must be kept."""
    flagged = seeder._is_nonsense_url(url)
    assert flagged is False


@pytest.mark.parametrize(
    "url",
    (
        # Genuinely short path that is not a whitelisted language root.
        "https://example.com/ab/",
        "https://example.com/x",
        "https://example.com/robots.txt",
    ),
)
def test_short_junk_and_utility_still_filtered(seeder, url):
    """Short non-language paths and utility files must still be rejected."""
    flagged = seeder._is_nonsense_url(url)
    assert flagged is True