Skip to content

Commit 83669cc

Browse files
committed
feat: add html2pdf4doc_fuzzer script and the first fuzz test
1 parent ecb86b4 commit 83669cc

65 files changed

Lines changed: 70885 additions & 1 deletion

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.github/workflows/ci-mac.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ jobs:
1111
strategy:
1212
matrix:
1313
python-version: [
14-
"3.8", "3.12"
14+
"3.9", "3.13"
1515
]
1616

1717
steps:
Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
# Runs the HTML2PDF4Doc fuzzer on every pull request and publishes the
# contents of output/ (the HTML/PDF pairs that failed to print) as an artifact.
name: "HTML2PDF4Doc Fuzz Testing on Linux"

on:
  pull_request:
    branches: [ "**" ]

jobs:
  build:
    runs-on: ubuntu-latest

    strategy:
      matrix:
        python-version: [
          "3.12"
        ]

    steps:
    - uses: actions/checkout@v4

    - name: Set up Python ${{ matrix.python-version }}
      uses: actions/setup-python@v5
      with:
        python-version: ${{ matrix.python-version }}

    - name: Upgrade pip
      run: |
        python -m pip install --upgrade pip

    - name: Install Python packages
      run: |
        pip install -r requirements.development.txt

    - name: Clone html2pdf4doc.js
      run: |
        invoke bootstrap
      env:
        GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}

    - name: Install html2pdf4doc dependencies.
      run: |
        python developer/pip_install_html2pdf4doc_deps.py

    - name: Run Lint tasks
      run: |
        invoke lint

    - name: Build HTML2PDF4Doc.js
      run: |
        invoke build

    - name: Run tests
      run: |
        invoke test-fuzz

    - name: Upload broken PDFs as artifact
      # Always upload, even if job fails. always() already covers the
      # failure case, so no additional failure() check is needed.
      if: always()
      uses: actions/upload-artifact@v4
      with:
        name: broken-pdfs
        path: output/
        retention-days: 30

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,3 +9,5 @@ tests/integration/.lit_test_times.txt
99
tests/integration/**/Output/
1010
output/
1111

12+
__pycache__/
13+
Lines changed: 126 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,126 @@
1+
import argparse
2+
import contextlib
3+
import datetime
4+
import os.path
5+
import random
6+
import shutil
7+
from pathlib import Path
8+
from subprocess import CalledProcessError, CompletedProcess, TimeoutExpired, run
9+
from time import time
10+
from typing import Iterator, List
11+
12+
from faker import Faker
13+
from lxml import etree, html
14+
15+
16+
@contextlib.contextmanager
def measure_performance(title: str) -> Iterator[None]:
    """
    Measure the wall-clock duration of the enclosed ``with`` body.

    When the body finishes, one line is printed to stdout: ``title`` padded
    with dots to 60 characters, followed by the elapsed seconds with two
    decimals and an ``s`` suffix.

    The line is printed even if the body raises; the exception then
    propagates to the caller unchanged.

    :param title: Human-readable name of the measured operation.
    """
    time_start = time()
    try:
        yield
    finally:
        # Report from ``finally`` so that a failing body still gets its
        # duration printed before the exception continues to propagate.
        time_diff = time() - time_start
        padded_name = f"{title} ".ljust(60, ".")
        padded_time = f" {time_diff:0.2f}".rjust(6, ".")
        print(f"{padded_name}{padded_time}s", flush=True)  # noqa: T201
26+
27+
28+
def _archive_failed_print(path_to_mut_html: str, path_to_mut_pdf: str) -> None:
    """Copy the artifacts of a failed print run into the ``output`` folder."""
    # The folder is not guaranteed to exist yet (e.g. on a fresh checkout).
    os.makedirs("output", exist_ok=True)

    mut_html_filename = Path(path_to_mut_html).stem
    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")

    path_to_mut_html_out = os.path.join(
        "output", f"{mut_html_filename}.{timestamp}.html"
    )
    shutil.copy(path_to_mut_html, path_to_mut_html_out)

    # A failed run may not have produced a PDF at all; only archive the PDF
    # when it exists instead of crashing with FileNotFoundError.
    if os.path.isfile(path_to_mut_pdf):
        path_to_mut_pdf_out = os.path.join(
            "output", f"{mut_html_filename}.{timestamp}.pdf"
        )
        shutil.copy(path_to_mut_pdf, path_to_mut_pdf_out)
        pdf_report = path_to_mut_pdf_out
    else:
        pdf_report = "(not produced)"

    print(  # noqa: T201
        f"Saved failed mutated HTML as:\n"
        f"HTML: {path_to_mut_html_out}\n"
        f"PDF: {pdf_report}"
    )


def mutate_and_print(path_to_input_file: str) -> bool:
    """
    Randomly mutate an HTML file and print the result to PDF.

    The text of randomly chosen ``<p>``/``<td>`` elements is replaced with
    Faker-generated text (10 picks per call). The mutated document is written
    next to the input file as ``<input>.mut.html`` and then passed to
    ``html2pdf4doc print --strict`` which is asked to write
    ``<input>.mut.html.pdf``.

    If the command exits with a non-zero status, the mutated HTML (and the
    PDF, when one was written) is copied to the ``output`` folder with a
    timestamp in the file name.

    :param path_to_input_file: Path to an existing HTML file.
    :return: True if the print command succeeded, False if it exited with a
        non-zero status.
    :raises TimeoutError: If the print command raises ``TimeoutExpired``.
    """
    assert os.path.isfile(path_to_input_file), path_to_input_file

    with open(path_to_input_file, encoding="utf-8") as input_file:
        text = input_file.read()

    # Parse HTML into DOM
    tree = html.fromstring(text)

    # Pick random elements and replace their text.
    elems = tree.xpath("//p | //td")
    if elems:
        # One generator is enough for all mutations of this call.
        fake = Faker()
        for _ in range(10):
            node = random.choice(elems)

            print("Mutating node:", node.tag)  # noqa: T201

            n_sentences = random.randint(1, 100)
            node.text = fake.text(max_nb_chars=10 * n_sentences)

    # Serialize back to HTML
    mutated_html = etree.tostring(
        tree, pretty_print=False, method="html", encoding="unicode"
    )

    # Save next to input file
    path_to_mut_html = path_to_input_file + ".mut.html"
    path_to_mut_pdf = path_to_input_file + ".mut.html.pdf"
    with open(path_to_mut_html, "w", encoding="utf-8") as f:
        f.write(mutated_html)

    print("Wrote mutated file:", path_to_mut_html)  # noqa: T201

    # The PDF path is reused by every call. Remove a PDF left over from an
    # earlier run so that a failure never archives a stale file.
    with contextlib.suppress(FileNotFoundError):
        os.remove(path_to_mut_pdf)

    cmd: List[str] = [
        "html2pdf4doc",
        "print",
        "--strict",
        path_to_mut_html,
        path_to_mut_pdf,
    ]

    with measure_performance(
        "PDFPrintDriver: printing HTML to PDF using HTML2PDF and Chrome Driver"
    ):
        try:
            _: CompletedProcess[bytes] = run(
                cmd,
                capture_output=False,
                check=True,
            )
        except CalledProcessError:
            _archive_failed_print(path_to_mut_html, path_to_mut_pdf)
            return False
        except TimeoutExpired:
            # subprocess.run() only raises this when a timeout is passed;
            # kept so that adding a timeout later needs no further changes.
            raise TimeoutError from None
    return True
103+
104+
105+
def main() -> None:
    """
    Command-line entry point of the fuzzer.

    Parses the path to an input HTML file and repeatedly calls
    ``mutate_and_print`` on it, printing the running success/failure counts
    before each cycle and the final counts once all cycles are done.
    """
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "input_file",
        type=str,
        help="Path to the HTML file to mutate and print to PDF.",
    )
    args = parser.parse_args()

    path_to_input_file = args.input_file

    success_count, failure_count = 0, 0
    # NOTE(review): range(1, 100) yields 99 cycles; confirm whether 100 cycles
    # were intended.
    for i in range(1, 100):
        print(  # noqa: T201
            f"--- Printing cycle #{i} — So far: 🟢{success_count} / 🔴{failure_count} ---"
        )
        if mutate_and_print(path_to_input_file):
            success_count += 1
        else:
            failure_count += 1

    # The per-cycle banner is printed before each run, so without this line
    # the result of the last cycle would never be reported.
    # NOTE(review): the process still exits with status 0 when cycles failed;
    # confirm whether failures should fail the calling CI job.
    print(  # noqa: T201
        f"--- Fuzzing finished — Total: 🟢{success_count} / 🔴{failure_count} ---"
    )
123+
124+
125+
# Run the fuzzer when this file is executed directly as a script.
if __name__ == "__main__":
    main()

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,7 @@ development = [
7070

7171
[project.scripts]
7272
html2pdf4doc = "html2pdf4doc.html2pdf4doc:main"
73+
html2pdf4doc_fuzzer = "html2pdf4doc.html2pdf4doc_fuzzer:main"
7374

7475
[project.urls]
7576
Changelog = "https://github.com/mettta/html2pdf_python/releases/"

requirements.development.txt

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,3 +16,9 @@ ruff>=0.9
1616
#
1717
lit
1818
filecheck==0.0.24
19+
20+
#
21+
# Fuzz tests
22+
#
23+
faker>=37.8.0
24+
lxml>=5.3.0

tasks.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -192,6 +192,17 @@ def test_integration(
192192
run_invoke(context, itest_command)
193193

194194

195+
@task(aliases=["tf"])
def test_fuzz(context):
    """
    Run the fuzzer script on the StrictDoc user guide HTML file from tests/fuzz.
    """
    run_invoke(
        context,
        """
            python html2pdf4doc/html2pdf4doc_fuzzer.py
                tests/fuzz/01_strictdoc_guide_202510/strictdoc/docs/strictdoc_01_user_guide-PDF.html
        """,
    )
204+
205+
195206
@task(aliases=["t"])
196207
def test(context):
197208
test_integration(context)

0 commit comments

Comments
 (0)