Skip to content

Commit 3e79117

Browse files
committed
feat: validate page count: real PDF pages vs html2pdf4doc pages
1 parent 3584a4f commit 3e79117

3 files changed

Lines changed: 45 additions & 10 deletions

File tree

html2pdf4doc/html2pdf4doc.py

Lines changed: 42 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -10,9 +10,10 @@
1010
from datetime import datetime
1111
from pathlib import Path
1212
from time import sleep
13-
from typing import Dict, List, Optional
13+
from typing import Dict, List, Optional, Tuple
1414

1515
import requests
16+
from pypdf import PdfReader
1617
from requests import Response
1718
from selenium import webdriver
1819
from selenium.webdriver.chrome.options import Options
@@ -39,6 +40,16 @@
3940
sys.stdout = open(sys.stdout.fileno(), mode="w", encoding="utf8", closefd=False)
4041

4142

43+
def extract_page_count(logs: List[Dict[str, str]]) -> int:
    """Return the page count reported in the browser console logs.

    The entries are scanned in order, and the integer that follows the
    first '"[HTML2PDF4DOC] Page count:"' marker found in an entry's
    "message" field is returned.

    Raises:
        ValueError: if none of the entries contains the marker.
    """
    page_count_re = re.compile(r'"\[HTML2PDF4DOC]\s*Page count:"\s*(\d+)')

    # Lazy on purpose: entries after the first hit are never inspected.
    hits = (page_count_re.search(record["message"]) for record in logs)
    first_hit = next((hit for hit in hits if hit is not None), None)

    if first_hit is None:
        raise ValueError("No page count found in logs.")
    return int(first_hit.group(1))
51+
52+
4253
class ChromeDriverManager:
4354
def get_chrome_driver(self, path_to_cache_dir: str) -> str:
4455
chrome_version: Optional[str] = self.get_chrome_version()
@@ -253,7 +264,7 @@ def get_inches_from_millimeters(mm: float) -> float:
253264
return mm / 25.4
254265

255266

256-
def get_pdf_from_html(driver: webdriver.Chrome, url: str) -> bytes:
267+
def get_pdf_from_html(driver: webdriver.Chrome, url: str) -> Tuple[bytes, int]:
257268
print(f"html2pdf4doc: opening URL with ChromeDriver: {url}") # noqa: T201
258269

259270
driver.get(url)
@@ -285,21 +296,27 @@ def get_pdf_from_html(driver: webdriver.Chrome, url: str) -> bytes:
285296
}
286297

287298
class Done(Exception):
288-
pass
299+
def __init__(self, page_count: int):
300+
super().__init__()
301+
self.page_count: int = page_count
289302

290303
datetime_start = datetime.today()
291304

292305
logs: List[Dict[str, str]] = []
306+
page_count: int = 0
293307
try:
294308
while True:
295309
logs = driver.get_log("browser") # type: ignore[no-untyped-call]
296310
for entry_ in logs:
297311
if "[HTML2PDF4DOC] Total time:" in entry_["message"]:
298312
print("success: HTML2PDF4Doc completed its job.") # noqa: T201
299-
raise Done
313+
314+
page_count = extract_page_count(logs)
315+
316+
raise Done(page_count)
300317
if (datetime.today() - datetime_start).total_seconds() > 60:
301318
raise TimeoutError
302-
sleep(0.5)
319+
sleep(0.1)
303320
except Done:
304321
pass
305322
except TimeoutError:
@@ -322,7 +339,13 @@ class Done(Exception):
322339
result = driver.execute_cdp_cmd("Page.printToPDF", calculated_print_options)
323340

324341
data = base64.b64decode(result["data"])
325-
return data
342+
343+
if page_count == 0:
344+
raise RuntimeError(
345+
"html2pdf4doc: Something went wrong. "
346+
"Could not capture the printed page count from Chrome."
347+
)
348+
return data, page_count
326349

327350

328351
def create_webdriver(
@@ -521,9 +544,21 @@ def exit_handler() -> None:
521544

522545
url = Path(os.path.abspath(path_to_input_html)).as_uri()
523546

524-
pdf_bytes = get_pdf_from_html(driver, url)
547+
pdf_bytes, page_count = get_pdf_from_html(driver, url)
525548
with open(path_to_output_pdf, "wb") as f:
526549
f.write(pdf_bytes)
550+
551+
# Cross-check the page count that html2pdf4doc reported in the browser
# against the number of pages in the PDF file that was just written.
reader = PdfReader(path_to_output_pdf)
pdf_page_count = len(reader.pages)
if pdf_page_count != page_count:
    # Report the numeric page count; interpolating reader.pages itself
    # would print the page collection object, not the number of pages.
    raise RuntimeError(
        "Something went wrong with the printed page. "
        "Page count mismatch: "
        f"PDF pages: {pdf_page_count}, "
        f"html2pdf4doc pages: {page_count}."
    )
561+
527562
else:
528563
print("html2pdf4doc: unknown command.") # noqa: T201
529564
sys.exit(1)

pyproject.toml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,9 @@ dependencies = [
5656

5757
# requests is used for downloading the Chrome driver.
5858
"requests",
59+
60+
# pypdf is used for validating the printed PDF.
61+
"pypdf>=3.9.0",
5962
]
6063

6164
[project.optional-dependencies]

requirements.development.txt

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,3 @@ ruff>=0.9
1616
#
1717
lit
1818
filecheck==0.0.24
19-
20-
# Integration tests use PyPDF to check the contents of the printed PDF.
21-
pypdf==3.9.0

0 commit comments

Comments
 (0)