Skip to content

Commit 78cce51

Browse files
author
Rustem
committed
Add user agent rotation and reduce docstrings
1 parent 140931d commit 78cce51

2 files changed

Lines changed: 23 additions & 53 deletions

File tree

textnano/config.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,3 +46,18 @@
4646
'mov', 'mp3', 'mp4', 'ogv', 'pdf', 'png', 'pps', 'ppt', 'pptx', 'svg',
4747
'tar', 'tgz', 'webm', 'wma', 'wmv', 'xml', 'xz', 'zip'
4848
]
49+
50+
# =============================================================================
# USER AGENT ROTATION
# =============================================================================

# Pool of realistic desktop-browser User-Agent strings (Chrome, Firefox,
# Safari, Edge on Windows/macOS/Linux).  One entry is picked at random per
# request so successive fetches do not all present the same client identity.
DEFAULT_USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.1 Safari/605.1.15',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    # Chromium-based Edge sends the "Edg/" token (not "Edge/") and includes
    # the Chrome/Safari tokens; the previous bare "Edge/120.0.0.0" string was
    # not a signature any real browser emits, making it easy to flag as a bot.
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0',
    'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:121.0) Gecko/20100101 Firefox/121.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
]

textnano/core.py

Lines changed: 8 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -24,38 +24,34 @@
2424
import ssl
2525
import logging
2626
import asyncio
27+
import random
2728
from pathlib import Path
2829
from typing import Optional, Set, Dict, List
2930
from urllib.parse import urlparse, urljoin
3031

3132
import httpx
3233
import protego
3334

34-
from .config import DEFAULT_EXCLUDE_DOMAINS, DEFAULT_EXCLUDE_EXTENSIONS
35+
from .config import DEFAULT_EXCLUDE_DOMAINS, DEFAULT_EXCLUDE_EXTENSIONS, DEFAULT_USER_AGENTS
3536
from .utils import print_stats, estimate_dataset_size, merge_datasets
3637

3738
# Configure logging
3839
logging.basicConfig(level=logging.INFO, format='%(message)s')
3940

4041

4142
# =============================================================================
42-
# ROBOTS.TXT CACHE
43+
# ROBOTS.TXT CACHE & USER AGENT ROTATION
4344
# =============================================================================
4445

4546
# Global cache for robots.txt parsers
4647
_robots_cache: Dict[str, Optional[protego.Protego]] = {}
4748

4849

49-
async def get_robots_parser(client: httpx.AsyncClient, url: str) -> Optional[protego.Protego]:
50-
"""Get robots.txt parser for a domain (cached).
50+
def get_random_user_agent() -> str:
    """Return a randomly selected browser User-Agent string.

    Draws uniformly from the DEFAULT_USER_AGENTS pool so that successive
    requests rotate between different browser identities.

    Returns:
        str: One entry from DEFAULT_USER_AGENTS.
    """
    return random.choice(DEFAULT_USER_AGENTS)
5152

52-
Args:
53-
client: httpx AsyncClient
54-
url: URL to get robots.txt for
5553

56-
Returns:
57-
Protego parser or None if robots.txt not available
58-
"""
54+
async def get_robots_parser(client: httpx.AsyncClient, url: str) -> Optional[protego.Protego]:
5955
parsed = urlparse(url)
6056
domain = f"{parsed.scheme}://{parsed.netloc}"
6157

@@ -77,16 +73,6 @@ async def get_robots_parser(client: httpx.AsyncClient, url: str) -> Optional[pro
7773

7874

7975
def can_fetch(robots_parser: Optional[protego.Protego], url: str, user_agent: str = "textnano") -> bool:
    """Decide whether *url* may be fetched according to robots.txt.

    Args:
        robots_parser: Parsed robots.txt, or None when it was unavailable.
        url: URL to check.
        user_agent: User agent name to match against robots.txt rules.

    Returns:
        bool: True if fetching is allowed.  A missing parser counts as
        permission granted (the conventional crawler behavior when no
        robots.txt exists).
    """
    allowed = True if robots_parser is None else robots_parser.can_fetch(url, user_agent)
    return allowed
@@ -124,7 +110,7 @@ async def download_text_async(url: str, client: httpx.AsyncClient, timeout: int
124110
if delay:
125111
await asyncio.sleep(delay)
126112

127-
headers = {'User-Agent': f'Mozilla/5.0 (compatible; {user_agent}/1.0)'}
113+
headers = {'User-Agent': get_random_user_agent()}
128114
response = await client.get(url, timeout=timeout, headers=headers, follow_redirects=True)
129115
response.raise_for_status()
130116

@@ -143,13 +129,8 @@ async def download_text_async(url: str, client: httpx.AsyncClient, timeout: int
143129

144130

145131
def download_text(url: str, timeout: int = 30) -> str:
146-
"""Download and extract text from a URL (synchronous fallback).
147-
148-
Returns:
149-
str: Cleaned text content, or empty string if failed
150-
"""
151132
try:
152-
headers = {'User-Agent': 'Mozilla/5.0'}
133+
headers = {'User-Agent': get_random_user_agent()}
153134
req = urllib.request.Request(url, headers=headers)
154135

155136
context = ssl.create_default_context()
@@ -177,14 +158,6 @@ def download_text(url: str, timeout: int = 30) -> str:
177158
# =============================================================================
178159

179160
def clean_html(html_content: str) -> str:
180-
"""Remove HTML tags and clean text.
181-
182-
Args:
183-
html_content: Raw HTML string
184-
185-
Returns:
186-
str: Clean text
187-
"""
188161
text = re.sub(r'<script[^>]*>.*?</script>', '', html_content, flags=re.DOTALL | re.IGNORECASE)
189162
text = re.sub(r'<style[^>]*>.*?</style>', '', text, flags=re.DOTALL | re.IGNORECASE)
190163
text = re.sub(r'<[^>]+>', '', text)
@@ -201,30 +174,12 @@ def clean_html(html_content: str) -> str:
201174
# =============================================================================
202175

203176
def text_fingerprint(text: str, n: int = 8) -> str:
    """Fingerprint *text* by hashing its first *n* words.

    The text is lower-cased and split on whitespace before hashing, so
    differences in case and in spacing do not change the fingerprint.

    Args:
        text: Input text.
        n: Number of leading words to include (default: 8).

    Returns:
        str: MD5 hex digest of the first n words joined by single spaces.
    """
    leading = text.lower().split(maxsplit=n)[:n]
    digest = hashlib.md5(' '.join(leading).encode())
    return digest.hexdigest()
216180

217181

218182
def is_duplicate(text: str, seen_fingerprints: Set[str]) -> bool:
219-
"""Check if text is duplicate based on fingerprint.
220-
221-
Args:
222-
text: Text to check
223-
seen_fingerprints: Set of seen fingerprints
224-
225-
Returns:
226-
bool: True if duplicate
227-
"""
228183
fp = text_fingerprint(text)
229184

230185
if fp in seen_fingerprints:

0 commit comments

Comments
 (0)