2424import ssl
2525import logging
2626import asyncio
27+ import random
2728from pathlib import Path
2829from typing import Optional , Set , Dict , List
2930from urllib .parse import urlparse , urljoin
3031
3132import httpx
3233import protego
3334
34- from .config import DEFAULT_EXCLUDE_DOMAINS , DEFAULT_EXCLUDE_EXTENSIONS
35+ from .config import DEFAULT_EXCLUDE_DOMAINS , DEFAULT_EXCLUDE_EXTENSIONS , DEFAULT_USER_AGENTS
3536from .utils import print_stats , estimate_dataset_size , merge_datasets
3637
3738# Configure logging
3839logging .basicConfig (level = logging .INFO , format = '%(message)s' )
3940
4041
4142# =============================================================================
42- # ROBOTS.TXT CACHE
43+ # ROBOTS.TXT CACHE & USER AGENT ROTATION
4344# =============================================================================
4445
4546# Global cache for robots.txt parsers
4647_robots_cache : Dict [str , Optional [protego .Protego ]] = {}
4748
4849
49- async def get_robots_parser ( client : httpx . AsyncClient , url : str ) -> Optional [ protego . Protego ] :
50- """Get robots.txt parser for a domain (cached).
def get_random_user_agent() -> str:
    """Return one User-Agent string picked uniformly from DEFAULT_USER_AGENTS.

    Used to rotate the UA header between requests so the crawler does
    not present a single fixed identity on every fetch.
    """
    pool = DEFAULT_USER_AGENTS
    return pool[random.randrange(len(pool))]
5152
52- Args:
53- client: httpx AsyncClient
54- url: URL to get robots.txt for
5553
56- Returns:
57- Protego parser or None if robots.txt not available
58- """
54+ async def get_robots_parser (client : httpx .AsyncClient , url : str ) -> Optional [protego .Protego ]:
5955 parsed = urlparse (url )
6056 domain = f"{ parsed .scheme } ://{ parsed .netloc } "
6157
@@ -77,16 +73,6 @@ async def get_robots_parser(client: httpx.AsyncClient, url: str) -> Optional[pro
7773
7874
def can_fetch(robots_parser: Optional[protego.Protego], url: str, user_agent: str = "textnano") -> bool:
    """Return True if *url* may be fetched according to robots.txt.

    Args:
        robots_parser: Parsed robots.txt (Protego), or None when no
            robots.txt could be retrieved for the domain.
        url: Absolute URL to check.
        user_agent: Agent name matched against robots.txt rules.

    Returns:
        bool: True when fetching is allowed. A missing robots.txt
        (None parser) is treated as permission to crawl.
    """
    # No robots.txt -> allow by default; otherwise defer to the parser.
    return robots_parser is None or robots_parser.can_fetch(url, user_agent)
@@ -124,7 +110,7 @@ async def download_text_async(url: str, client: httpx.AsyncClient, timeout: int
124110 if delay :
125111 await asyncio .sleep (delay )
126112
127- headers = {'User-Agent' : f'Mozilla/5.0 (compatible; { user_agent } /1.0)' }
113+ headers = {'User-Agent' : get_random_user_agent () }
128114 response = await client .get (url , timeout = timeout , headers = headers , follow_redirects = True )
129115 response .raise_for_status ()
130116
@@ -143,13 +129,8 @@ async def download_text_async(url: str, client: httpx.AsyncClient, timeout: int
143129
144130
145131def download_text (url : str , timeout : int = 30 ) -> str :
146- """Download and extract text from a URL (synchronous fallback).
147-
148- Returns:
149- str: Cleaned text content, or empty string if failed
150- """
151132 try :
152- headers = {'User-Agent' : 'Mozilla/5.0' }
133+ headers = {'User-Agent' : get_random_user_agent () }
153134 req = urllib .request .Request (url , headers = headers )
154135
155136 context = ssl .create_default_context ()
@@ -177,14 +158,6 @@ def download_text(url: str, timeout: int = 30) -> str:
177158# =============================================================================
178159
179160def clean_html (html_content : str ) -> str :
180- """Remove HTML tags and clean text.
181-
182- Args:
183- html_content: Raw HTML string
184-
185- Returns:
186- str: Clean text
187- """
188161 text = re .sub (r'<script[^>]*>.*?</script>' , '' , html_content , flags = re .DOTALL | re .IGNORECASE )
189162 text = re .sub (r'<style[^>]*>.*?</style>' , '' , text , flags = re .DOTALL | re .IGNORECASE )
190163 text = re .sub (r'<[^>]+>' , '' , text )
@@ -201,30 +174,12 @@ def clean_html(html_content: str) -> str:
201174# =============================================================================
202175
def text_fingerprint(text: str, n: int = 8) -> str:
    """Fingerprint *text* by hashing its first *n* words.

    The text is lower-cased and whitespace-split, so fingerprints are
    insensitive to case and to runs of whitespace.

    Args:
        text: Input text (may be empty).
        n: How many leading words participate in the hash (default: 8).

    Returns:
        str: Hex MD5 digest of the first *n* normalized words.
    """
    # maxsplit=n leaves the remainder as one trailing chunk; slice it off.
    leading_words = text.lower().split(maxsplit=n)
    key = ' '.join(leading_words[:n])
    return hashlib.md5(key.encode()).hexdigest()
216180
217181
218182def is_duplicate (text : str , seen_fingerprints : Set [str ]) -> bool :
219- """Check if text is duplicate based on fingerprint.
220-
221- Args:
222- text: Text to check
223- seen_fingerprints: Set of seen fingerprints
224-
225- Returns:
226- bool: True if duplicate
227- """
228183 fp = text_fingerprint (text )
229184
230185 if fp in seen_fingerprints :
0 commit comments