Skip to content

Commit 2416589

Browse files
authored
Better method to derive User-Agent strings (#76)
* Updating user agent string generation to be more random (and get better results).

Signed-off-by: vsoch <vsoch@users.noreply.github.com>
1 parent ea187df commit 2416589

14 files changed

Lines changed: 82 additions & 72 deletions

File tree

.github/workflows/main.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ jobs:
99
testing-docker:
1010
runs-on: ubuntu-latest
1111
steps:
12-
- uses: actions/checkout@v2
12+
- uses: actions/checkout@v3
1313
- name: Build container image
1414
run: |
1515
docker build -t quay.io/urlstechie/urlchecker .

.github/workflows/test.yml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,11 +11,11 @@ jobs:
1111
formatting:
1212
runs-on: ubuntu-latest
1313
steps:
14-
- uses: actions/checkout@v2
14+
- uses: actions/checkout@v3
1515

1616
- name: Setup black environment
1717
run: conda create --quiet --name black pyflakes
18-
18+
1919
- name: Check formatting with black
2020
run: |
2121
export PATH="/usr/share/miniconda/bin:$PATH"
@@ -28,7 +28,7 @@ jobs:
2828
needs: formatting
2929
runs-on: ubuntu-latest
3030
steps:
31-
- uses: actions/checkout@v2
31+
- uses: actions/checkout@v3
3232

3333
- name: Setup mypy environment
3434
run: conda create --quiet --name type_checking mypy
@@ -46,7 +46,7 @@ jobs:
4646
needs: type_checking
4747
runs-on: ubuntu-latest
4848
steps:
49-
- uses: actions/checkout@v2
49+
- uses: actions/checkout@v3
5050
- name: Setup testing environment
5151
run: conda create --quiet --name testing pytest
5252

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ and **Merged pull requests**. Critical items to know are:
1212
Referenced versions in headers are tagged on Github, in parentheses are for pypi.
1313

1414
## [vxx](https://github.com/urlstechie/urlschecker-python/tree/master) (master)
15+
- use ANSI escape sequences for colors, fake-useragent for agents (0.0.30)
1516
- adding type hints to code, more tests and logging bug fix (0.0.29)
1617
- decrease verbosity when filename is None (0.0.28)
1718
- don't exit and fail if no URLs to check (0.0.27)

mypy.ini

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
[mypy]
2+
ignore_missing_imports = True

tests/test_core_urlproc.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -55,8 +55,9 @@ def test_check_urls(file):
5555

5656

5757
def test_get_user_agent():
58-
user_agent = get_user_agent()
59-
assert isinstance(user_agent, str)
58+
ua = get_user_agent()
59+
assert isinstance(ua, dict)
60+
assert "User-Agent" in ua
6061

6162

6263
def test_check_response_status_code():

urlchecker/client/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,8 +10,9 @@
1010
"""
1111

1212

13-
import sys
1413
import argparse
14+
import sys
15+
1516
import urlchecker
1617

1718

urlchecker/client/check.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -8,15 +8,15 @@
88
99
"""
1010

11-
import re
11+
import logging
1212
import os
13+
import re
1314
import sys
14-
import logging
1515

16-
from urlchecker.main.github import clone_repo, delete_repo
17-
from urlchecker.core.fileproc import remove_empty
1816
from urlchecker.core.check import UrlChecker
17+
from urlchecker.core.fileproc import remove_empty
1918
from urlchecker.logger import print_failure
19+
from urlchecker.main.github import clone_repo, delete_repo
2020

2121
logger = logging.getLogger("urlchecker")
2222

@@ -114,11 +114,11 @@ def main(args, extra):
114114
if result["failed"]:
115115
print_failure(file_name + ":")
116116
for url in result["failed"]:
117-
print_failure(" " + url)
117+
print_failure(" ❌️ " + url)
118118
else:
119119
print("\n\U0001F914 Uh oh... The following urls did not pass:")
120120
for failed_url in check_results["failed"]:
121-
print_failure(failed_url)
121+
print_failure("❌️ " + failed_url)
122122

123123
# If we have failures and it's not a force pass, exit with 1
124124
if not args.force_pass and check_results["failed"]:

urlchecker/core/check.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,10 +11,11 @@
1111
import os
1212
import re
1313
import sys
14-
from typing import List, Dict
14+
from typing import Dict, List
15+
1516
from urlchecker.core import fileproc
16-
from urlchecker.core.worker import Workers
1717
from urlchecker.core.urlproc import UrlCheckResult
18+
from urlchecker.core.worker import Workers
1819

1920

2021
class UrlChecker:

urlchecker/core/fileproc.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,9 +8,10 @@
88
"""
99

1010
import fnmatch
11-
import re
1211
import os
12+
import re
1313
from typing import List
14+
1415
from urlchecker.core import urlmarker
1516

1617

urlchecker/core/urlproc.py

Lines changed: 47 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -8,13 +8,16 @@
88
"""
99

1010
import os
11-
import time
1211
import random
12+
import time
13+
from typing import Any, Dict, List, Optional
14+
1315
import requests
14-
from typing import List, Optional
16+
from fake_useragent import UserAgent
17+
1518
from urlchecker.core import fileproc
1619
from urlchecker.core.exclude import excluded
17-
from urlchecker.logger import print_success, print_failure
20+
from urlchecker.logger import print_failure, print_success
1821

1922

2023
def check_response_status_code(
@@ -46,51 +49,48 @@ def check_response_status_code(
4649
return True
4750

4851

49-
def get_user_agent() -> str:
52+
def get_user_agent() -> dict:
5053
"""
51-
Return a randomly chosen user agent for requests
54+
Return a randomly chosen user agent and headers for requests
5255
5356
Returns:
54-
user agent string to include with User-Agent.
57+
headers dict to include with request.
5558
"""
56-
agents = [
57-
(
58-
"Mozilla/5.0 (X11; Linux x86_64) "
59-
"AppleWebKit/537.36 (KHTML, like Gecko) "
60-
"Chrome/57.0.2987.110 "
61-
"Safari/537.36"
62-
), # chrome
63-
(
64-
"Mozilla/5.0 (X11; Linux x86_64) "
65-
"AppleWebKit/537.36 (KHTML, like Gecko) "
66-
"Chrome/61.0.3163.79 "
67-
"Safari/537.36"
68-
), # chrome
69-
(
70-
"Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:55.0) "
71-
"Gecko/20100101 "
72-
"Firefox/55.0"
73-
), # firefox
74-
(
75-
"Mozilla/5.0 (X11; Linux x86_64) "
76-
"AppleWebKit/537.36 (KHTML, like Gecko) "
77-
"Chrome/61.0.3163.91 "
78-
"Safari/537.36"
79-
), # chrome
80-
(
81-
"Mozilla/5.0 (X11; Linux x86_64) "
82-
"AppleWebKit/537.36 (KHTML, like Gecko) "
83-
"Chrome/62.0.3202.89 "
84-
"Safari/537.36"
85-
), # chrome
86-
(
87-
"Mozilla/5.0 (X11; Linux x86_64) "
88-
"AppleWebKit/537.36 (KHTML, like Gecko) "
89-
"Chrome/63.0.3239.108 "
90-
"Safari/537.36"
91-
), # chrome
92-
]
93-
return random.choice(agents)
59+
browser = random.choice(["chrome", "firefox"])
60+
headers = get_faux_headers(browser)
61+
headers["User-Agent"] = getattr(UserAgent(), browser)
62+
return headers
63+
64+
65+
def get_faux_headers(browser) -> Dict[Any, Any]:
66+
"""
67+
Get faux headers to populate based on user agent
68+
"""
69+
headers = {
70+
"chrome": {
71+
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
72+
"Accept-Encoding": "gzip, deflate, br",
73+
"Accept-Language": "en-US,en;q=0.9",
74+
"Sec-Ch-Ua": '"Chromium";v="92", " Not A;Brand";v="99", "Google Chrome";v="92"',
75+
"Sec-Ch-Ua-Mobile": "?0",
76+
"Sec-Fetch-Dest": "document",
77+
"Sec-Fetch-Mode": "navigate",
78+
"Sec-Fetch-Site": "none",
79+
"Sec-Fetch-User": "?1",
80+
"Upgrade-Insecure-Requests": "1",
81+
},
82+
"firefox": {
83+
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
84+
"Accept-Encoding": "gzip, deflate, br",
85+
"Accept-Language": "en-US,en;q=0.5",
86+
"Sec-Fetch-Dest": "document",
87+
"Sec-Fetch-Mode": "navigate",
88+
"Sec-Fetch-Site": "none",
89+
"Sec-Fetch-User": "?1",
90+
"Upgrade-Insecure-Requests": "1",
91+
},
92+
}
93+
return headers[browser]
9494

9595

9696
class UrlCheckResult:
@@ -185,13 +185,12 @@ def check_urls(
185185
# init seen urls list
186186
seen = set()
187187

188-
# Some sites will return 403 if it's not a "human" user agent
189-
user_agent = get_user_agent()
190-
headers = {"User-Agent": user_agent}
191-
192188
# check links
193189
for url in [url for url in urls if "http" in url]:
194190

191+
# Some sites will return 403 if it's not a "human" user agent
192+
headers = get_user_agent()
193+
195194
# init do retrails and retrails counts
196195
do_retry = True
197196
rcount = retry_count
@@ -211,7 +210,6 @@ def check_urls(
211210
response = None
212211
try:
213212
response = requests.get(url, timeout=pause, headers=headers)
214-
215213
except requests.exceptions.Timeout as e:
216214
print(e)
217215

0 commit comments

Comments (0)