# DOWNLOAD
# =============================================================================
4040
def download_text(url: str, timeout: int = 30) -> str:
    """Download a URL and return its text content with HTML stripped.

    Args:
        url: Address to fetch.
        timeout: Timeout in seconds passed to ``urllib.request.urlopen``.

    Returns:
        str: The decoded response body passed through ``clean_html``, or an
        empty string if anything went wrong (the error is logged, never
        raised).
    """
    try:
        # Some servers reject urllib's default User-Agent.
        headers = {'User-Agent': 'Mozilla/5.0'}
        req = urllib.request.Request(url, headers=headers)

        # NOTE(security): hostname and certificate verification are turned
        # off, so HTTPS responses are NOT authenticated. Kept as-is because
        # callers may rely on fetching hosts with broken certificates; do not
        # feed the result into anything security-sensitive.
        context = ssl.create_default_context()
        context.check_hostname = False
        context.verify_mode = ssl.CERT_NONE

        with urllib.request.urlopen(req, timeout=timeout, context=context) as response:
            raw = response.read()
            # Honour the charset declared in Content-Type; fall back to UTF-8.
            charset = response.headers.get_content_charset() or 'utf-8'

        try:
            content = raw.decode(charset, errors='ignore')
        except LookupError:
            # The server declared a codec Python does not know.
            content = raw.decode('utf-8', errors='ignore')

        return clean_html(content)

    except urllib.error.HTTPError as e:
        # HTTPError is a subclass of URLError, so it must be handled first.
        logging.error("HTTP error %s for %s: %s", e.code, url, e.reason)
        return ""
    except urllib.error.URLError as e:
        logging.error("URL error for %s: %s", url, e.reason)
        return ""
    except Exception as e:
        # Boundary handler: this function promises to return "" on any failure.
        logging.error("Unexpected error for %s: %s: %s", url, type(e).__name__, e)
        return ""
7469
7570
# =============================================================================
def clean_html(html_content: str) -> str:
    """Strip markup from an HTML document and return plain text.

    Args:
        html_content: Raw HTML source.

    Returns:
        str: Clean text with script/style blocks and tags removed, HTML
        entities decoded, and every run of whitespace (including newlines)
        collapsed to a single space.
    """
    # Drop <script>/<style> elements together with their contents.
    text = re.sub(r'<script[^>]*>.*?</script>', '', html_content, flags=re.DOTALL | re.IGNORECASE)
    text = re.sub(r'<style[^>]*>.*?</style>', '', text, flags=re.DOTALL | re.IGNORECASE)

    # Remove the remaining tags.
    text = re.sub(r'<[^>]+>', '', text)

    # Decode entities only after tag removal, so escaped markup such as
    # "&lt;b&gt;" survives as literal text instead of being stripped as a tag.
    text = html.unescape(text)

    # Collapse all whitespace. A later paragraph-break substitution
    # (r'\n\s*\n') could never match after this step, so it is omitted.
    text = re.sub(r'\s+', ' ', text)

    return text.strip()
def text_fingerprint(text: str, n: int = 8) -> str:
    """Return a fingerprint of the first ``n`` words of ``text``.

    The text is lower-cased and split on whitespace, so the result ignores
    case and spacing differences.

    Args:
        text: Input text.
        n: Number of leading words to hash. Values <= 0 use no words.

    Returns:
        str: MD5 hex digest of the first N words joined by single spaces.
    """
    if n > 0:
        # maxsplit stops splitting after n words; the slice drops the
        # unsplit remainder.
        words = text.lower().split(maxsplit=n)[:n]
    else:
        # A negative n would otherwise mean "no split limit" and slice words
        # off the end instead of taking a prefix.
        words = []
    fingerprint_text = ' '.join(words)
    # MD5 is used as a cheap content fingerprint, not for security.
    return hashlib.md5(fingerprint_text.encode('utf-8')).hexdigest()
126112
0 commit comments