88"""
99
1010import os
11- import time
1211import random
12+ import time
13+ from typing import Any , Dict , List , Optional
14+
1315import requests
14- from typing import List , Optional
16+ from fake_useragent import UserAgent
17+
1518from urlchecker .core import fileproc
1619from urlchecker .core .exclude import excluded
17- from urlchecker .logger import print_success , print_failure
20+ from urlchecker .logger import print_failure , print_success
1821
1922
2023def check_response_status_code (
@@ -46,51 +49,48 @@ def check_response_status_code(
4649 return True
4750
4851
def get_user_agent() -> Dict[str, str]:
    """
    Return browser-like request headers with a randomly chosen user agent.

    Despite the name, this returns a full headers dict rather than a bare
    user agent string: a browser family is picked at random, the matching
    faux headers are looked up, and a "User-Agent" entry is added to them.

    Returns:
        headers dict to include with a request, including "User-Agent".
    """
    # Pick the browser first so the faux headers and the agent string agree.
    browser = random.choice(("chrome", "firefox"))
    headers = get_faux_headers(browser)

    # UserAgent exposes one attribute per browser family; the attribute name
    # is the same key used to select the faux headers above.
    # NOTE(review): a new UserAgent() is built on every call - confirm the
    # construction cost is acceptable for how often callers invoke this.
    headers["User-Agent"] = getattr(UserAgent(), browser)
    return headers
63+
64+
def get_faux_headers(browser: str) -> Dict[str, str]:
    """
    Get faux headers to populate based on user agent

    Args:
        browser: browser family to imitate, either "chrome" or "firefox".

    Returns:
        a newly built dict of header names to header values for that browser.

    Raises:
        KeyError: if browser is not one of the supported names.
    """
    # The table is rebuilt on every call, so the dict handed back is never
    # shared between callers; get_user_agent adds a "User-Agent" key to it.
    headers = {
        "chrome": {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            "Accept-Encoding": "gzip, deflate, br",
            "Accept-Language": "en-US,en;q=0.9",
            "Sec-Ch-Ua": '"Chromium";v="92", " Not A;Brand";v="99", "Google Chrome";v="92"',
            "Sec-Ch-Ua-Mobile": "?0",
            "Sec-Fetch-Dest": "document",
            "Sec-Fetch-Mode": "navigate",
            "Sec-Fetch-Site": "none",
            "Sec-Fetch-User": "?1",
            "Upgrade-Insecure-Requests": "1",
        },
        "firefox": {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "Accept-Encoding": "gzip, deflate, br",
            "Accept-Language": "en-US,en;q=0.5",
            "Sec-Fetch-Dest": "document",
            "Sec-Fetch-Mode": "navigate",
            "Sec-Fetch-Site": "none",
            "Sec-Fetch-User": "?1",
            "Upgrade-Insecure-Requests": "1",
        },
    }
    return headers[browser]
9494
9595
9696class UrlCheckResult :
@@ -185,13 +185,12 @@ def check_urls(
185185 # init seen urls list
186186 seen = set ()
187187
188- # Some sites will return 403 if it's not a "human" user agent
189- user_agent = get_user_agent ()
190- headers = {"User-Agent" : user_agent }
191-
192188 # check links
193189 for url in [url for url in urls if "http" in url ]:
194190
191+ # Some sites will return 403 if it's not a "human" user agent
192+ headers = get_user_agent ()
193+
195194 # init do retrails and retrails counts
196195 do_retry = True
197196 rcount = retry_count
@@ -211,7 +210,6 @@ def check_urls(
211210 response = None
212211 try :
213212 response = requests .get (url , timeout = pause , headers = headers )
214-
215213 except requests .exceptions .Timeout as e :
216214 print (e )
217215
0 commit comments