Skip to content

Commit 109d0d9

Browse files
committed
feat: add html2pdf4doc_fuzzer script and the first fuzz test
1 parent ecb86b4 commit 109d0d9

64 files changed

Lines changed: 45795 additions & 1 deletion

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.github/workflows/ci-mac.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ jobs:
1111
strategy:
1212
matrix:
1313
python-version: [
14-
"3.8", "3.12"
14+
"3.9", "3.13"
1515
]
1616

1717
steps:
Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
# Fuzz-tests HTML2PDF4Doc on Linux: bootstraps and builds html2pdf4doc.js,
# runs the fuzzer, and uploads whatever ended up in output/ as an artifact.
name: "HTML2PDF4Doc Fuzz Testing on Linux"

# NOTE: only pull requests trigger this workflow. The "schedule" branch in the
# "Run tests" step below stays unreachable until a `schedule:` trigger is
# added here.
on:
  pull_request:
    branches: [ "**" ]

jobs:
  build:
    runs-on: ubuntu-latest
    timeout-minutes: 120  # 2 hours

    strategy:
      matrix:
        python-version: [
          "3.12"
        ]

    steps:
    - uses: actions/checkout@v3

    - name: Set up Python ${{ matrix.python-version }}
      uses: actions/setup-python@v1
      with:
        python-version: ${{ matrix.python-version }}

    - name: Upgrade pip
      run: |
        python -m pip install --upgrade pip

    - name: Install Python packages
      run: |
        pip install -r requirements.development.txt

    - name: Clone html2pdf4doc.js
      run: |
        invoke bootstrap
      env:
        GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}

    - name: Install html2pdf4doc dependencies.
      run: |
        python developer/pip_install_html2pdf4doc_deps.py

    - name: Run Lint tasks
      run: |
        invoke lint

    - name: Build HTML2PDF4Doc.js
      run: |
        invoke build

    - name: Run tests
      run: |
        # Scheduled runs fuzz for longer; every other event gets the short run.
        if [ "${{ github.event_name }}" = "schedule" ]; then
          echo "🕒 Running long fuzzing..."
          invoke test-fuzz --long
        else
          echo "🚀 Running short fuzzing..."
          invoke test-fuzz
        fi

    - name: Upload broken PDFs as artifact
      # Always upload, even if job fails. always() already covers the failure
      # case, so no extra failure() check is needed.
      if: always()
      uses: actions/upload-artifact@v4
      with:
        name: broken-pdfs
        path: output/
        retention-days: 30

.gitignore

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,3 +9,7 @@ tests/integration/.lit_test_times.txt
99
tests/integration/**/Output/
1010
output/
1111

12+
__pycache__/
13+
14+
# Fuzz testing files.
15+
**.mut.**

html2pdf4doc/html2pdf4doc.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323

2424
__version__ = "0.0.22"
2525

26+
PATH_TO_HTML2PDF4DOC_PY = __file__
2627
PATH_TO_HTML2PDF4DOC_JS = os.path.join(
2728
os.path.dirname(os.path.join(__file__)),
2829
"html2pdf4doc_js",
Lines changed: 157 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,157 @@
1+
import argparse
2+
import contextlib
3+
import datetime
4+
import os.path
5+
import random
6+
import shutil
7+
import sys
8+
from pathlib import Path
9+
from subprocess import CalledProcessError, CompletedProcess, TimeoutExpired, run
10+
from time import time
11+
from typing import Iterator, List
12+
13+
from faker import Faker
14+
from lxml import etree, html
15+
16+
from html2pdf4doc import PATH_TO_HTML2PDF4DOC_PY
17+
18+
19+
@contextlib.contextmanager
20+
def measure_performance(title: str) -> Iterator[None]:
21+
time_start = time()
22+
yield
23+
time_end = time()
24+
25+
time_diff = time_end - time_start
26+
padded_name = f"{title} ".ljust(60, ".")
27+
padded_time = f" {time_diff:0.2f}".rjust(6, ".")
28+
print(f"{padded_name}{padded_time}s", flush=True) # noqa: T201
29+
30+
31+
def _save_failed_artifacts(path_to_mut_html: str, path_to_mut_pdf: str) -> None:
    """
    Copy the artifacts of a failed print run into the ``output/`` directory.

    The copies are named after the mutated HTML file plus a timestamp. The PDF
    is copied only if it exists, because a failed print run may not have
    produced one.
    """
    Path("output/").mkdir(exist_ok=True)

    mut_html_filename = Path(path_to_mut_html).stem
    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")

    path_to_mut_html_out = os.path.join(
        "output", f"{mut_html_filename}.{timestamp}.html"
    )
    shutil.copy(path_to_mut_html, path_to_mut_html_out)

    if os.path.isfile(path_to_mut_pdf):
        path_to_mut_pdf_out = os.path.join(
            "output", f"{mut_html_filename}.{timestamp}.pdf"
        )
        shutil.copy(path_to_mut_pdf, path_to_mut_pdf_out)
    else:
        path_to_mut_pdf_out = "(not produced)"

    print(  # noqa: T201
        f"Saved failed mutated HTML as:\n"
        f"HTML: {path_to_mut_html_out}\n"
        f"PDF: {path_to_mut_pdf_out}",
        flush=True,
    )


def mutate_and_print(path_to_input_file: str) -> bool:
    """
    Mutate the given HTML file with random text and print the result to PDF.

    Up to ten randomly chosen ``<p>``/``<td>`` elements get their text replaced
    with Faker-generated text. The mutated document is written next to the
    input file as ``<input>.mut.html`` and printed to ``<input>.mut.html.pdf``
    by running html2pdf4doc in ``--strict`` mode as a subprocess.

    Returns:
        True if the print command succeeded. False if it exited with a
        non-zero status, in which case the mutated HTML (and the PDF, if one
        was produced) is archived in ``output/``.

    Raises:
        TimeoutError: if the print subprocess reports a timeout.
    """
    assert os.path.isfile(path_to_input_file), path_to_input_file

    with open(path_to_input_file, encoding="utf-8") as input_file:
        text = input_file.read()

    # Parse HTML into DOM
    tree = html.fromstring(text)

    # Pick random elements and replace their text.
    elems = tree.xpath("//p | //td")
    if elems:
        # One generator is enough for all mutations of this document.
        fake = Faker()
        for _ in range(10):
            node = random.choice(elems)

            print("Mutating node:", node.tag, flush=True)  # noqa: T201

            n_sentences = random.randint(1, 100)
            node.text = fake.text(max_nb_chars=10 * n_sentences)

    # Serialize back to HTML
    mutated_html = etree.tostring(
        tree, pretty_print=False, method="html", encoding="unicode"
    )

    # Save next to input file
    path_to_mut_html = path_to_input_file + ".mut.html"
    path_to_mut_pdf = path_to_input_file + ".mut.html.pdf"
    with open(path_to_mut_html, "w", encoding="utf-8") as f:
        f.write(mutated_html)

    print("Wrote mutated file:", path_to_mut_html, flush=True)  # noqa: T201

    # Drop a PDF left over from a previous cycle so that a failed run can never
    # be archived together with a stale PDF that belongs to another mutation.
    with contextlib.suppress(FileNotFoundError):
        os.remove(path_to_mut_pdf)

    cmd: List[str] = [
        sys.executable,
        PATH_TO_HTML2PDF4DOC_PY,
        "print",
        "--strict",
        path_to_mut_html,
        path_to_mut_pdf,
    ]

    with measure_performance(
        "html2pdf4doc_fuzzer: printing HTML to PDF using HTML2PDF and Chrome Driver"
    ):
        try:
            _: CompletedProcess[bytes] = run(
                cmd, capture_output=False, check=True, bufsize=1
            )
        except CalledProcessError as called_process_error_:
            print(called_process_error_, flush=True)  # noqa: T201
            _save_failed_artifacts(path_to_mut_html, path_to_mut_pdf)
            return False
        except TimeoutExpired:
            # run() is called without a timeout above, so this only triggers
            # once a timeout argument is added to the call.
            raise TimeoutError from None
    return True
113+
114+
115+
def main() -> None:
    """
    Command-line entry point of the fuzzer.

    Runs ``mutate_and_print`` on the given input file 20 times (200 times with
    ``--long``), prints a running and a final success/failure summary, and
    exits with status 1 if at least one cycle failed.
    """
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "input_file",
        type=str,
        help="Path to the HTML file to mutate and print to PDF.",
    )
    parser.add_argument(
        "--long",
        action="store_true",
        help="Run the fuzzer in long mode (more iterations).",
    )

    args = parser.parse_args()

    path_to_input_file = args.input_file

    total_runs = 200 if args.long else 20
    success_count, failure_count = 0, 0
    for i in range(1, total_runs + 1):
        print(  # noqa: T201
            f"html2pdf4doc_fuzzer print cycle #{i} — "
            f"So far: 🟢{success_count} / 🔴{failure_count}",
            flush=True,
        )
        if mutate_and_print(path_to_input_file):
            success_count += 1
        else:
            failure_count += 1

    success_rate_percent = (success_count / total_runs) * 100

    # One decimal is enough for a report line and hides float noise such as
    # 35.00000000000001.
    print(  # noqa: T201
        f"html2pdf4doc_fuzzer: finished {'✅' if failure_count == 0 else '❌'} — "
        f"Success rate: {success_count}/{total_runs} "
        f"({success_rate_percent:.1f}%)",
        flush=True,
    )

    if failure_count > 0:
        sys.exit(1)
154+
155+
156+
if __name__ == "__main__":
157+
main()

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,7 @@ development = [
7070

7171
[project.scripts]
7272
html2pdf4doc = "html2pdf4doc.html2pdf4doc:main"
73+
html2pdf4doc_fuzzer = "html2pdf4doc.html2pdf4doc_fuzzer:main"
7374

7475
[project.urls]
7576
Changelog = "https://github.com/mettta/html2pdf_python/releases/"

requirements.development.txt

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,3 +16,9 @@ ruff>=0.9
1616
#
1717
lit
1818
filecheck==0.0.24
19+
20+
#
21+
# Fuzz tests
22+
#
23+
faker>=37.8.0
24+
lxml>=5.3.0

tasks.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -192,6 +192,17 @@ def test_integration(
192192
run_invoke(context, itest_command)
193193

194194

195+
@task(aliases=["tf"])
def test_fuzz(context, long=False):
    """
    Run the html2pdf4doc fuzzer against the StrictDoc user guide fixture.

    ``long`` is exposed by Invoke as the ``--long`` flag, which the CI workflow
    uses as ``invoke test-fuzz --long``. When set, the flag is forwarded to the
    fuzzer script so that it runs more iterations.
    """
    long_argument = "--long" if long else ""
    run_invoke(
        context,
        f"""
            python html2pdf4doc/html2pdf4doc_fuzzer.py
                tests/fuzz/01_strictdoc_guide_202510/strictdoc/docs/strictdoc_01_user_guide-PDF.html
                {long_argument}
        """,
    )
204+
205+
195206
@task(aliases=["t"])
196207
def test(context):
197208
test_integration(context)

0 commit comments

Comments
 (0)