diff --git a/crawl4ai/link_preview.py b/crawl4ai/link_preview.py index 3ed29666e..2158e061e 100644 --- a/crawl4ai/link_preview.py +++ b/crawl4ai/link_preview.py @@ -154,24 +154,27 @@ def _filter_links(self, links: Links, link_config: Dict[str, Any]) -> List[str]: self._log("debug", "After exclude patterns: {count} links remain", params={"count": len(filtered_urls)}) - # Limit number of links - max_links = link_config.max_links - if max_links > 0 and len(filtered_urls) > max_links: - filtered_urls = filtered_urls[:max_links] - self._log("debug", "Limited to {max_links} links", - params={"max_links": max_links}) - - # Remove duplicates while preserving order + # Remove duplicates while preserving order. This must run BEFORE the + # max_links limit: truncating first spends the budget on duplicate + # copies of the same URL (common for repeated nav/footer links), so the + # subsequent dedup would yield far fewer than max_links unique URLs. seen = set() unique_urls = [] for url in filtered_urls: if url not in seen: seen.add(url) unique_urls.append(url) - + + # Limit number of links (counting distinct URLs) + max_links = link_config.max_links + if max_links > 0 and len(unique_urls) > max_links: + unique_urls = unique_urls[:max_links] + self._log("debug", "Limited to {max_links} links", + params={"max_links": max_links}) + self._log("debug", "Final filtered URLs: {count} unique links", params={"count": len(unique_urls)}) - + return unique_urls async def _extract_heads_parallel( diff --git a/tests/test_merge_head_data_scoring.py b/tests/test_merge_head_data_scoring.py index 65aad0468..f8bd5b2ba 100644 --- a/tests/test_merge_head_data_scoring.py +++ b/tests/test_merge_head_data_scoring.py @@ -10,6 +10,7 @@ import pytest from unittest.mock import MagicMock +from crawl4ai.async_configs import LinkPreviewConfig from crawl4ai.models import Link, Links from crawl4ai.link_preview import LinkPreview from crawl4ai.utils import calculate_total_score @@ -181,3 +182,37 @@ def test_scoring_disabled_returns_neutral_score(self): updated = preview._merge_head_data(links, head_results, config) assert updated.internal[0].total_score == 5.0 + + +class TestFilterLinksDeduplication: + """_filter_links must deduplicate BEFORE applying max_links, so the limit + counts distinct URLs instead of being spent on duplicate copies. + + Regression for the truncate-before-dedup bug: with duplicate links at the + head of the list, the old code sliced [:max_links] first and then collapsed + the duplicates, returning far fewer than max_links unique URLs. + """ + + @staticmethod + def _internal(hrefs): + return Links(internal=[Link(href=h) for h in hrefs], external=[]) + + def test_dedup_runs_before_max_links(self): + # First three entries are duplicates of "a"; max_links=3 must still + # yield three DISTINCT urls, not collapse to one. + hrefs = ["https://x.com/a"] * 3 + ["https://x.com/b", "https://x.com/c", "https://x.com/d"] + cfg = LinkPreviewConfig(include_internal=True, include_external=False, max_links=3) + result = LinkPreview()._filter_links(self._internal(hrefs), cfg) + assert result == ["https://x.com/a", "https://x.com/b", "https://x.com/c"] + + def test_keeps_all_when_uniques_below_max(self): + hrefs = ["https://x.com/a", "https://x.com/a", "https://x.com/b"] + cfg = LinkPreviewConfig(include_internal=True, include_external=False, max_links=3) + result = LinkPreview()._filter_links(self._internal(hrefs), cfg) + assert result == ["https://x.com/a", "https://x.com/b"] + + def test_caps_distinct_urls_at_max(self): + hrefs = [f"https://x.com/{i}" for i in range(5)] + cfg = LinkPreviewConfig(include_internal=True, include_external=False, max_links=3) + result = LinkPreview()._filter_links(self._internal(hrefs), cfg) + assert result == ["https://x.com/0", "https://x.com/1", "https://x.com/2"]