@@ -52,25 +52,44 @@ def _getURLs(path):
5252 return urls
5353
5454def _checkURL (url ):
55+ import sys
56+ print (f'[checkLinks] Checking { url } ' , flush = True , file = sys .stderr )
57+ timeout = 5
58+ headers = {'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36' }
59+
5560 try :
56- rc = urllib2 .urlopen (url ).getcode ()
61+ req = urllib2 .Request (url , headers = headers )
62+ rc = urllib2 .urlopen (req , timeout = timeout ).getcode ()
63+ print (f'[checkLinks] -> { rc } ' , flush = True , file = sys .stderr )
5764 return (url , rc )
58- except :
59- pass
60- try :
61- headers = {'User-Agent' : 'Mozilla/5.0 (Windows NT 6.1; Win64; x64)' }
62- rc = urllib2 .urlopen (urllib2 .Request (url , None , headers ), context = ssl ._create_unverified_context ()).getcode ()
6365 except urllib2 .HTTPError as e :
6466 rc = e .code
67+ print (f'[checkLinks] -> HTTPError { rc } ' , flush = True , file = sys .stderr )
6568 if rc == 429 :
6669 # Ignore too many requests
67- rc = 200
70+ return (url , 200 )
71+ elif rc == 403 :
72+ # Ignore forbidden (server blocking automated requests)
73+ return (url , 200 )
74+ elif rc == 418 :
75+ # Warn but don't fail on teapot (rate limiting from academic sites)
76+ print (f'[checkLinks] WARNING: { url } returned 418 (rate limited?)' , flush = True , file = sys .stderr )
77+ return (url , 200 )
78+ elif rc == 500 :
79+ # Warn but don't fail on server errors (often transient, work in browser)
80+ print (f'[checkLinks] WARNING: { url } returned 500 (server error, may be transient)' , flush = True , file = sys .stderr )
81+ return (url , 200 )
6882 elif rc in (301 , 302 ):
6983 # Handle redirect errors
70- rc = urllib2 .build_opener (urllib2 .HTTPCookieProcessor ).open (url ).code
71- except :
72- rc = 0
73- return (url , rc )
84+ try :
85+ rc = urllib2 .build_opener (urllib2 .HTTPCookieProcessor ).open (url , timeout = timeout ).getcode ()
86+ except Exception :
87+ pass
88+ return (url , rc )
89+ except Exception as e :
90+ print (f'[checkLinks] -> Timeout/error: { type (e ).__name__ } ' , flush = True , file = sys .stderr )
91+ # Treat all timeouts/errors as 0 (skip them)
92+ return (url , 0 )
7493
7594def checkLinks (path ):
7695 if os .path .isdir (path ):
0 commit comments