|
| 1 | +import argparse |
| 2 | +import contextlib |
| 3 | +import datetime |
| 4 | +import os.path |
| 5 | +import random |
| 6 | +import shutil |
| 7 | +from pathlib import Path |
| 8 | +from subprocess import CalledProcessError, CompletedProcess, TimeoutExpired, run |
| 9 | +from time import time |
| 10 | +from typing import Iterator, List |
| 11 | + |
| 12 | +from faker import Faker |
| 13 | +from lxml import etree, html |
| 14 | + |
| 15 | + |
@contextlib.contextmanager
def measure_performance(title: str) -> Iterator[None]:
    """Time the enclosed ``with`` body and print one aligned report line.

    The title (plus one space) is right-padded with dots to 60 columns and
    the elapsed seconds, formatted with two decimals, are left-padded with
    dots to at least 6 columns, followed by ``s``.

    The report is printed even when the body raises, so a failing step
    still shows how long it ran; the exception then propagates unchanged.

    Args:
        title: Human-readable label of the step being measured.

    Yields:
        None. The context manager only measures; it provides no value.
    """
    # Local import: a monotonic clock is immune to wall-clock adjustments
    # that can skew (or even negate) a difference of two time() readings.
    from time import perf_counter

    time_start = perf_counter()
    try:
        yield
    finally:
        # Runs on both normal exit and exception, so the line is never lost.
        time_diff = perf_counter() - time_start
        padded_name = f"{title} ".ljust(60, ".")
        padded_time = f" {time_diff:0.2f}".rjust(6, ".")
        print(f"{padded_name}{padded_time}s", flush=True)  # noqa: T201
| 26 | + |
| 27 | + |
def _archive_failed_print(path_to_mut_html: str, path_to_mut_pdf: str) -> None:
    """Copy the artifacts of a failed print run into ``output/`` for triage."""
    mut_html_filename = Path(path_to_mut_html).stem
    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")

    # The directory may not exist yet; copying into a missing directory
    # would raise and abort the whole fuzzing session.
    os.makedirs("output", exist_ok=True)

    path_to_mut_html_out = os.path.join(
        "output", f"{mut_html_filename}.{timestamp}.html"
    )
    shutil.copy(path_to_mut_html, path_to_mut_html_out)

    path_to_mut_pdf_out = os.path.join(
        "output", f"{mut_html_filename}.{timestamp}.pdf"
    )
    # A failed run is not guaranteed to have written a PDF, so only copy
    # it when it is actually there.
    if os.path.isfile(path_to_mut_pdf):
        shutil.copy(path_to_mut_pdf, path_to_mut_pdf_out)
        pdf_report = path_to_mut_pdf_out
    else:
        pdf_report = "<not produced>"

    print(  # noqa: T201
        f"Saved failed mutated HTML as:\n"
        f"HTML: {path_to_mut_html_out}\n"
        f"PDF: {pdf_report}"
    )


def mutate_and_print(path_to_input_file: str) -> bool:
    """Randomly mutate an HTML file and print the result to PDF.

    The text of up to ten randomly chosen ``<p>``/``<td>`` elements is
    replaced with Faker-generated text of random length. The mutated
    document is written next to the input as ``<input>.mut.html`` and
    handed to ``html2pdf4doc print --strict``, which is asked to produce
    ``<input>.mut.html.pdf``.

    Args:
        path_to_input_file: Path to an existing HTML file.

    Returns:
        True when the print command exits successfully. False when it
        exits with a non-zero status; in that case the mutated HTML (and
        the PDF, if one was written) are copied into ``output/`` under a
        timestamped name.

    Raises:
        AssertionError: If ``path_to_input_file`` is not an existing file.
        TimeoutError: If the print command raises ``TimeoutExpired``.
    """
    assert os.path.isfile(path_to_input_file), path_to_input_file

    # Use a context manager so the input handle is closed deterministically.
    with open(path_to_input_file, encoding="utf-8") as input_file:
        text = input_file.read()

    # Parse HTML into DOM
    tree = html.fromstring(text)

    # Replace the text of randomly picked elements; the same element may be
    # picked more than once.
    elems = tree.xpath("//p | //td")
    if elems:
        # One generator is enough; constructing it per iteration is wasted work.
        fake = Faker()
        for _ in range(10):
            node = random.choice(elems)

            print("Mutating node:", node.tag)  # noqa: T201

            n_sentences = random.randint(1, 100)
            node.text = fake.text(max_nb_chars=10 * n_sentences)

    # Serialize back to HTML
    mutated_html = etree.tostring(
        tree, pretty_print=False, method="html", encoding="unicode"
    )

    # Save next to input file
    path_to_mut_html = path_to_input_file + ".mut.html"
    path_to_mut_pdf = path_to_input_file + ".mut.html.pdf"
    with open(path_to_mut_html, "w", encoding="utf-8") as f:
        f.write(mutated_html)

    print("Wrote mutated file:", path_to_mut_html)  # noqa: T201

    # html2pdf4doc takes (input HTML, output PDF) path pairs as arguments.
    cmd: List[str] = [
        "html2pdf4doc",
        "print",
        "--strict",
        path_to_mut_html,
        path_to_mut_pdf,
    ]

    with measure_performance(
        "PDFPrintDriver: printing HTML to PDF using HTML2PDF and Chrome Driver"
    ):
        try:
            run(
                cmd,
                capture_output=False,
                check=True,
            )
        except CalledProcessError:
            # Keep the offending input (and output, if any) for later triage.
            _archive_failed_print(path_to_mut_html, path_to_mut_pdf)
            return False
        except TimeoutExpired:
            # NOTE(review): run() is called without timeout=, so this branch
            # cannot fire as written; kept for when a timeout is introduced.
            raise TimeoutError from None
    return True
| 103 | + |
| 104 | + |
def main() -> None:
    """Run repeated mutate-and-print cycles on one HTML file.

    Command-line arguments:
        input_file: Path to the HTML file that is mutated on every cycle.
        --cycles: Number of cycles to run (default: 99).

    Before each cycle a status line with the running success/failure
    counts is printed; after the last cycle the final totals are printed,
    so the outcome of the last cycle is reported too.
    """
    parser = argparse.ArgumentParser(
        description=(
            "Repeatedly mutate an HTML file with random text and print "
            "each mutation to PDF, counting successes and failures."
        )
    )

    parser.add_argument(
        "input_file",
        type=str,
        help="Path to the HTML file to mutate and print on every cycle.",
    )
    parser.add_argument(
        "--cycles",
        type=int,
        default=99,
        help="Number of mutate-and-print cycles to run (default: 99).",
    )
    args = parser.parse_args()

    path_to_input_file = args.input_file

    success_count, failure_count = 0, 0
    for i in range(1, args.cycles + 1):
        print(  # noqa: T201
            f"--- Printing cycle #{i} — So far: 🟢{success_count} / 🔴{failure_count} ---"
        )
        if mutate_and_print(path_to_input_file):
            success_count += 1
        else:
            failure_count += 1

    # The per-cycle line is printed before the cycle runs, so without this
    # summary the result of the final cycle would never be shown.
    total_cycles = success_count + failure_count
    print(  # noqa: T201
        f"--- Finished {total_cycles} cycles — Total: 🟢{success_count} / 🔴{failure_count} ---"
    )
| 123 | + |
| 124 | + |
# Start the mutate-and-print loop only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()
0 commit comments