Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 13 additions & 10 deletions crawl4ai/link_preview.py
Original file line number Diff line number Diff line change
Expand Up @@ -154,24 +154,27 @@ def _filter_links(self, links: Links, link_config: Dict[str, Any]) -> List[str]:
self._log("debug", "After exclude patterns: {count} links remain",
params={"count": len(filtered_urls)})

# Limit number of links
max_links = link_config.max_links
if max_links > 0 and len(filtered_urls) > max_links:
filtered_urls = filtered_urls[:max_links]
self._log("debug", "Limited to {max_links} links",
params={"max_links": max_links})

# Remove duplicates while preserving order
# Remove duplicates while preserving order. This must run BEFORE the
# max_links limit: truncating first spends the budget on duplicate
# copies of the same URL (common for repeated nav/footer links), so the
# subsequent dedup would yield far fewer than max_links unique URLs.
seen = set()
unique_urls = []
for url in filtered_urls:
if url not in seen:
seen.add(url)
unique_urls.append(url)


# Limit number of links (counting distinct URLs)
max_links = link_config.max_links
if max_links > 0 and len(unique_urls) > max_links:
unique_urls = unique_urls[:max_links]
self._log("debug", "Limited to {max_links} links",
params={"max_links": max_links})

self._log("debug", "Final filtered URLs: {count} unique links",
params={"count": len(unique_urls)})

return unique_urls

async def _extract_heads_parallel(
Expand Down
35 changes: 35 additions & 0 deletions tests/test_merge_head_data_scoring.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import pytest
from unittest.mock import MagicMock

from crawl4ai.async_configs import LinkPreviewConfig
from crawl4ai.models import Link, Links
from crawl4ai.link_preview import LinkPreview
from crawl4ai.utils import calculate_total_score
Expand Down Expand Up @@ -181,3 +182,37 @@ def test_scoring_disabled_returns_neutral_score(self):
updated = preview._merge_head_data(links, head_results, config)

assert updated.internal[0].total_score == 5.0


class TestFilterLinksDeduplication:
    """Regression tests for the order of dedup and ``max_links`` in ``_filter_links``.

    Duplicates have to be collapsed first and the ``max_links`` cap applied
    afterwards, so that the cap counts distinct URLs. With the opposite order
    (slice ``[:max_links]`` first, dedup second), duplicate links at the head
    of the list use up the budget and the result holds far fewer than
    ``max_links`` unique URLs.
    """

    @staticmethod
    def _internal(hrefs):
        """Build a ``Links`` with one internal ``Link`` per href and no external links."""
        internal_links = [Link(href=href) for href in hrefs]
        return Links(internal=internal_links, external=[])

    def _filter_with_limit_three(self, hrefs):
        """Run ``_filter_links`` over *hrefs* as internal links with ``max_links=3``."""
        config = LinkPreviewConfig(
            include_internal=True,
            include_external=False,
            max_links=3,
        )
        preview = LinkPreview()
        return preview._filter_links(self._internal(hrefs), config)

    def test_dedup_runs_before_max_links(self):
        # "a" is repeated three times up front. A limit of three must still
        # produce three DISTINCT URLs rather than collapsing to just "a".
        duplicated_head = ["https://x.com/a"] * 3
        distinct_tail = ["https://x.com/b", "https://x.com/c", "https://x.com/d"]
        expected = ["https://x.com/a", "https://x.com/b", "https://x.com/c"]

        assert self._filter_with_limit_three(duplicated_head + distinct_tail) == expected

    def test_keeps_all_when_uniques_below_max(self):
        # Only two distinct URLs exist, so the limit of three never applies.
        hrefs = ["https://x.com/a", "https://x.com/a", "https://x.com/b"]
        expected = ["https://x.com/a", "https://x.com/b"]

        assert self._filter_with_limit_three(hrefs) == expected

    def test_caps_distinct_urls_at_max(self):
        # Five distinct URLs, limit three: the first three are kept in order.
        hrefs = [f"https://x.com/{i}" for i in range(5)]
        expected = ["https://x.com/0", "https://x.com/1", "https://x.com/2"]

        assert self._filter_with_limit_three(hrefs) == expected