1010from datetime import datetime
1111from pathlib import Path
1212from time import sleep
13- from typing import Dict , List , Optional
13+ from typing import Dict , List , Optional , Tuple
1414
1515import requests
16+ from pypdf import PdfReader
1617from requests import Response
1718from selenium import webdriver
1819from selenium .webdriver .chrome .options import Options
3940sys .stdout = open (sys .stdout .fileno (), mode = "w" , encoding = "utf8" , closefd = False )
4041
4142
def extract_page_count(logs: List[Dict[str, str]]) -> int:
    """
    Return the page count reported in the given browser log entries.

    Each entry's "message" text is searched for the quoted
    "[HTML2PDF4DOC] Page count:" marker followed by an integer. The number
    from the first entry that matches is returned.

    :param logs: log entries, each a dict expected to carry a "message" key.
    :return: the integer that follows the page count marker.
    :raises ValueError: if no entry contains the page count marker.
    """
    pattern = re.compile(r'"\[HTML2PDF4DOC]\s*Page count:"\s*(\d+)')
    for entry_ in logs:
        # Skip entries without a message instead of failing with KeyError,
        # so that a missing page count always surfaces as the ValueError below.
        log_message = entry_.get("message")
        if not log_message:
            continue
        match = pattern.search(log_message)
        if match:
            return int(match.group(1))
    raise ValueError("No page count found in logs.")
51+
52+
4253class ChromeDriverManager :
4354 def get_chrome_driver (self , path_to_cache_dir : str ) -> str :
4455 chrome_version : Optional [str ] = self .get_chrome_version ()
@@ -253,7 +264,7 @@ def get_inches_from_millimeters(mm: float) -> float:
253264 return mm / 25.4
254265
255266
256- def get_pdf_from_html (driver : webdriver .Chrome , url : str ) -> bytes :
267+ def get_pdf_from_html (driver : webdriver .Chrome , url : str ) -> Tuple [ bytes , int ] :
257268 print (f"html2pdf4doc: opening URL with ChromeDriver: { url } " ) # noqa: T201
258269
259270 driver .get (url )
@@ -285,21 +296,27 @@ def get_pdf_from_html(driver: webdriver.Chrome, url: str) -> bytes:
285296 }
286297
287298 class Done (Exception ):
288- pass
299+ def __init__ (self , page_count : int ):
300+ super ().__init__ ()
301+ self .page_count : int = page_count
289302
290303 datetime_start = datetime .today ()
291304
292305 logs : List [Dict [str , str ]] = []
306+ page_count : int = 0
293307 try :
294308 while True :
295309 logs = driver .get_log ("browser" ) # type: ignore[no-untyped-call]
296310 for entry_ in logs :
297311 if "[HTML2PDF4DOC] Total time:" in entry_ ["message" ]:
298312 print ("success: HTML2PDF4Doc completed its job." ) # noqa: T201
299- raise Done
313+
314+ page_count = extract_page_count (logs )
315+
316+ raise Done (page_count )
300317 if (datetime .today () - datetime_start ).total_seconds () > 60 :
301318 raise TimeoutError
302- sleep (0.5 )
319+ sleep (0.1 )
303320 except Done :
304321 pass
305322 except TimeoutError :
@@ -322,7 +339,13 @@ class Done(Exception):
322339 result = driver .execute_cdp_cmd ("Page.printToPDF" , calculated_print_options )
323340
324341 data = base64 .b64decode (result ["data" ])
325- return data
342+
343+ if page_count == 0 :
344+ raise RuntimeError (
345+ "html2pdf4doc: Something went wrong. "
346+ "Could not capture the printed page count from Chrome."
347+ )
348+ return data , page_count
326349
327350
328351def create_webdriver (
@@ -521,9 +544,21 @@ def exit_handler() -> None:
521544
522545 url = Path (os .path .abspath (path_to_input_html )).as_uri ()
523546
524- pdf_bytes = get_pdf_from_html (driver , url )
547+ pdf_bytes , page_count = get_pdf_from_html (driver , url )
525548 with open (path_to_output_pdf , "wb" ) as f :
526549 f .write (pdf_bytes )
550+
551+ reader = PdfReader (path_to_output_pdf )
552+ if len (reader .pages ) != page_count :
553+ raise RuntimeError (
554+ "Something went wrong with the printed page. "
555+ f"Page count mismatch: "
556+ f"PDF pages: { reader .pages } , "
557+ f"html2pdf4doc pages: { page_count } ."
558+ )
559+
560+ assert reader .pages [0 ].extract_text () == "Hello world!"
561+
527562 else :
528563 print ("html2pdf4doc: unknown command." ) # noqa: T201
529564 sys .exit (1 )
0 commit comments