Skip to content

Commit 394215e

Browse files
committed
feat: add html2pdf4doc_fuzzer script and the first fuzz test
1 parent ecb86b4 commit 394215e

64 files changed

Lines changed: 45769 additions & 1 deletion

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.github/workflows/ci-mac.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ jobs:
1111
strategy:
1212
matrix:
1313
python-version: [
14-
"3.8", "3.12"
14+
"3.9", "3.13"
1515
]
1616

1717
steps:
Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
# Runs the HTML2PDF4Doc fuzz test task on every pull request and uploads
# the contents of output/ as a build artifact.
name: "HTML2PDF4Doc Fuzz Testing on Linux"

on:
  pull_request:
    branches: [ "**" ]

jobs:
  build:
    runs-on: ubuntu-latest

    strategy:
      matrix:
        python-version: [
          "3.12"
        ]

    steps:
    - uses: actions/checkout@v3

    - name: Set up Python ${{ matrix.python-version }}
      uses: actions/setup-python@v1
      with:
        python-version: ${{ matrix.python-version }}

    - name: Upgrade pip
      run: |
        python -m pip install --upgrade pip

    - name: Install Python packages
      run: |
        pip install -r requirements.development.txt

    - name: Clone html2pdf4doc.js
      run: |
        invoke bootstrap
      env:
        GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}

    - name: Install html2pdf4doc dependencies.
      run: |
        python developer/pip_install_html2pdf4doc_deps.py

    - name: Run Lint tasks
      run: |
        invoke lint

    - name: Build HTML2PDF4Doc.js
      run: |
        invoke build

    - name: Run tests
      run: |
        invoke test-fuzz

    - name: Upload broken PDFs as artifact
      # Always upload, even if job fails. always() is already true for
      # success, failure and cancellation, so no extra failure() is needed.
      if: always()
      uses: actions/upload-artifact@v4
      with:
        name: broken-pdfs
        path: output/
        retention-days: 30

.gitignore

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,3 +9,7 @@ tests/integration/.lit_test_times.txt
99
tests/integration/**/Output/
1010
output/
1111

12+
__pycache__/
13+
14+
# Fuzz testing files.
15+
**.mut.**

html2pdf4doc/html2pdf4doc.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323

2424
__version__ = "0.0.22"
2525

26+
PATH_TO_HTML2PDF4DOC_PY = __file__
2627
PATH_TO_HTML2PDF4DOC_JS = os.path.join(
2728
os.path.dirname(os.path.join(__file__)),
2829
"html2pdf4doc_js",
Lines changed: 138 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,138 @@
1+
import argparse
2+
import contextlib
3+
import datetime
4+
import os.path
5+
import random
6+
import shutil
7+
import sys
8+
from pathlib import Path
9+
from subprocess import CalledProcessError, CompletedProcess, TimeoutExpired, run
10+
from time import time
11+
from typing import Iterator, List
12+
13+
from faker import Faker
14+
from lxml import etree, html
15+
16+
from html2pdf4doc import PATH_TO_HTML2PDF4DOC_PY
17+
18+
19+
@contextlib.contextmanager
def measure_performance(title: str) -> Iterator[None]:
    """
    Time the body of a ``with`` block and print a one-line report.

    The report is ``title`` padded with dots to 60 characters, followed by
    the elapsed wall-clock time in seconds with two decimals.

    The report is printed even when the body raises; the exception then
    propagates to the caller unchanged.
    """
    time_start = time()
    try:
        yield
    finally:
        # Reporting from ``finally`` keeps the timing line for failing runs,
        # which are the ones most worth looking at.
        time_diff = time() - time_start
        padded_name = f"{title} ".ljust(60, ".")
        padded_time = f" {time_diff:0.2f}".rjust(6, ".")
        print(f"{padded_name}{padded_time}s", flush=True)  # noqa: T201
29+
30+
31+
def mutate_and_print(path_to_input_file: str) -> bool:
    """
    Mutate an HTML file with random text and print the result to PDF.

    Up to ten randomly chosen ``<p>``/``<td>`` elements get their text
    replaced with Faker-generated text. The mutated document is written next
    to the input file as ``<input>.mut.html`` and then printed to
    ``<input>.mut.html.pdf`` by running the html2pdf4doc script in a
    subprocess with ``print --strict``.

    :param path_to_input_file: Path to an existing HTML file.
    :return: ``True`` if the print subprocess exited successfully, ``False``
        if it exited with a non-zero code. On failure the mutated HTML (and
        the PDF, if one was produced) is copied to ``output/`` under a
        timestamped name.
    :raises TimeoutError: If the subprocess raises ``TimeoutExpired``.
    """
    assert os.path.isfile(path_to_input_file), path_to_input_file

    # Read through a context manager so the file handle is closed promptly.
    with open(path_to_input_file, encoding="utf-8") as input_file:
        text = input_file.read()

    # Parse HTML into DOM
    tree = html.fromstring(text)

    # Mutate randomly picked elements. The same element may be picked more
    # than once, in which case the last generated text wins.
    elems = tree.xpath("//p | //td")
    if elems:
        # The generator is loop-invariant, so build it once.
        fake = Faker()
        for _i in range(10):
            node = random.choice(elems)

            print("Mutating node:", node.tag)  # noqa: T201

            n_sentences = random.randint(1, 100)
            node.text = fake.text(max_nb_chars=10 * n_sentences)

    # Serialize back to HTML
    mutated_html = etree.tostring(
        tree, pretty_print=False, method="html", encoding="unicode"
    )

    # Save next to input file
    path_to_mut_html = path_to_input_file + ".mut.html"
    path_to_mut_pdf = path_to_input_file + ".mut.html.pdf"
    with open(path_to_mut_html, "w", encoding="utf-8") as f:
        f.write(mutated_html)

    print("Wrote mutated file:", path_to_mut_html)  # noqa: T201

    cmd: List[str] = [
        sys.executable,
        PATH_TO_HTML2PDF4DOC_PY,
        "print",
        "--strict",
        path_to_mut_html,
        path_to_mut_pdf,
    ]

    with measure_performance(
        "PDFPrintDriver: printing HTML to PDF using HTML2PDF and Chrome Driver"
    ):
        try:
            run(
                cmd,
                capture_output=False,
                check=True,
            )
        except CalledProcessError as called_process_error_:
            print(called_process_error_)  # noqa: T201

            # Keep the failing input (and output, when there is one) so the
            # failure can be reproduced later.
            Path("output/").mkdir(exist_ok=True)

            mut_html_filename = Path(path_to_mut_html).stem
            timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
            path_to_mut_html_out = os.path.join(
                "output", f"{mut_html_filename}.{timestamp}.html"
            )
            shutil.copy(path_to_mut_html, path_to_mut_html_out)

            # The subprocess may have failed before writing any PDF; a
            # missing PDF must not abort the whole fuzzing session.
            if os.path.isfile(path_to_mut_pdf):
                path_to_mut_pdf_out = os.path.join(
                    "output", f"{mut_html_filename}.{timestamp}.pdf"
                )
                shutil.copy(path_to_mut_pdf, path_to_mut_pdf_out)
            else:
                path_to_mut_pdf_out = "(no PDF was produced)"

            print(  # noqa: T201
                f"Saved failed mutated HTML as:\n"
                f"HTML: {path_to_mut_html_out}\n"
                f"PDF: {path_to_mut_pdf_out}"
            )
            return False
        except TimeoutExpired:
            # NOTE(review): run() is called without a timeout above, so this
            # handler only fires if a timeout is added to that call.
            raise TimeoutError from None
    return True
115+
116+
117+
def main() -> None:
    """
    Command-line entry point of the fuzzer.

    Parses the path to an input HTML file from the command line and runs
    ``mutate_and_print`` on it for cycles 1 through 99, printing the running
    success/failure tally before each cycle and the final tally at the end.
    """
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "input_file",
        type=str,
        help="Path to the HTML file to mutate and print to PDF.",
    )
    args = parser.parse_args()

    path_to_input_file = args.input_file

    success_count, failure_count = 0, 0
    for i in range(1, 100):
        print(  # noqa: T201
            f"--- Printing cycle #{i} — So far: 🟢{success_count} / 🔴{failure_count} ---"
        )
        success = mutate_and_print(path_to_input_file)
        if success:
            success_count += 1
        else:
            failure_count += 1

    # The per-cycle line is printed before each cycle runs, so without this
    # final line the outcome of the last cycle would never be reported.
    print(  # noqa: T201
        f"--- Finished — Total: 🟢{success_count} / 🔴{failure_count} ---"
    )
135+
136+
137+
if __name__ == "__main__":
138+
main()

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,7 @@ development = [
7070

7171
[project.scripts]
7272
html2pdf4doc = "html2pdf4doc.html2pdf4doc:main"
73+
html2pdf4doc_fuzzer = "html2pdf4doc.html2pdf4doc_fuzzer:main"
7374

7475
[project.urls]
7576
Changelog = "https://github.com/mettta/html2pdf_python/releases/"

requirements.development.txt

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,3 +16,9 @@ ruff>=0.9
1616
#
1717
lit
1818
filecheck==0.0.24
19+
20+
#
21+
# Fuzz tests
22+
#
23+
faker>=37.8.0
24+
lxml>=5.3.0

tasks.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -192,6 +192,17 @@ def test_integration(
192192
run_invoke(context, itest_command)
193193

194194

195+
@task(aliases=["tf"])
def test_fuzz(context):
    """Run the fuzzer script on the StrictDoc user guide HTML fixture."""
    fuzz_command = """
        python html2pdf4doc/html2pdf4doc_fuzzer.py
        tests/fuzz/01_strictdoc_guide_202510/strictdoc/docs/strictdoc_01_user_guide-PDF.html
    """
    run_invoke(context, fuzz_command)
204+
205+
195206
@task(aliases=["t"])
196207
def test(context):
197208
test_integration(context)

0 commit comments

Comments
 (0)