unclecode · jichaowang02-lang · Jun 21, 2026
diff --git a/crawl4ai/link_preview.py b/crawl4ai/link_preview.py
@@ -154,24 +154,27 @@ def _filter_links(self, links: Links, link_config: Dict[str, Any]) -> List[str]:
             self._log("debug", "After exclude patterns: {count} links remain",
                       params={"count": len(filtered_urls)})
 
-        # Limit number of links
-        max_links = link_config.max_links
-        if max_links > 0 and len(filtered_urls) > max_links:
-            filtered_urls = filtered_urls[:max_links]
-            self._log("debug", "Limited to {max_links} links",
-                      params={"max_links": max_links})
-
-        # Remove duplicates while preserving order
+        # Remove duplicates while preserving order. This must run BEFORE the
+        # max_links limit: truncating first spends the budget on duplicate
+        # copies of the same URL (common for repeated nav/footer links), so the
+        # subsequent dedup would yield far fewer than max_links unique URLs.
         seen = set()
         unique_urls = []
         for url in filtered_urls:
             if url not in seen:
                 seen.add(url)
                 unique_urls.append(url)
-
+
+        # Limit number of links (counting distinct URLs)
+        max_links = link_config.max_links
+        if max_links > 0 and len(unique_urls) > max_links:
+            unique_urls = unique_urls[:max_links]
+            self._log("debug", "Limited to {max_links} links",
+                      params={"max_links": max_links})
+
         self._log("debug", "Final filtered URLs: {count} unique links",
                   params={"count": len(unique_urls)})
-        
+
         return unique_urls
 
     async def _extract_heads_parallel(

diff --git a/tests/test_merge_head_data_scoring.py b/tests/test_merge_head_data_scoring.py
@@ -10,6 +10,7 @@
 import pytest
 from unittest.mock import MagicMock
 
+from crawl4ai.async_configs import LinkPreviewConfig
 from crawl4ai.models import Link, Links
 from crawl4ai.link_preview import LinkPreview
 from crawl4ai.utils import calculate_total_score
@@ -181,3 +182,37 @@ def test_scoring_disabled_returns_neutral_score(self):
         updated = preview._merge_head_data(links, head_results, config)
 
         assert updated.internal[0].total_score == 5.0
+
+
+class TestFilterLinksDeduplication:
+    """_filter_links must deduplicate BEFORE applying max_links, so the limit
+    counts distinct URLs instead of being spent on duplicate copies.
+
+    Regression for the truncate-before-dedup bug: with duplicate links at the
+    head of the list, the old code sliced [:max_links] first and then collapsed
+    the duplicates, returning far fewer than max_links unique URLs.
+    """
+
+    @staticmethod
+    def _internal(hrefs):  # test helper: one Link per href, all placed in `internal`; `external` stays empty
+        return Links(internal=[Link(href=h) for h in hrefs], external=[])
+
+    def test_dedup_runs_before_max_links(self):
+        # The first three entries are copies of ".../a"; with max_links=3 the
+        # result must still hold three DISTINCT URLs, not collapse to one.
+        hrefs = ["https://x.com/a"] * 3 + ["https://x.com/b", "https://x.com/c", "https://x.com/d"]
+        cfg = LinkPreviewConfig(include_internal=True, include_external=False, max_links=3)
+        result = LinkPreview()._filter_links(self._internal(hrefs), cfg)
+        assert result == ["https://x.com/a", "https://x.com/b", "https://x.com/c"]  # first-seen order; ".../d" falls past the cap
+
+    def test_keeps_all_when_uniques_below_max(self):
+        hrefs = ["https://x.com/a", "https://x.com/a", "https://x.com/b"]  # three links, only two distinct
+        cfg = LinkPreviewConfig(include_internal=True, include_external=False, max_links=3)
+        result = LinkPreview()._filter_links(self._internal(hrefs), cfg)
+        assert result == ["https://x.com/a", "https://x.com/b"]  # two distinct URLs < max_links=3, so nothing is cut
+
+    def test_caps_distinct_urls_at_max(self):
+        hrefs = [f"https://x.com/{i}" for i in range(5)]  # five distinct URLs, no duplicates
+        cfg = LinkPreviewConfig(include_internal=True, include_external=False, max_links=3)
+        result = LinkPreview()._filter_links(self._internal(hrefs), cfg)
+        assert result == ["https://x.com/0", "https://x.com/1", "https://x.com/2"]  # only the first max_links=3 remain, in input order