-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathconverter.py
More file actions
93 lines (73 loc) · 2.51 KB
/
converter.py
File metadata and controls
93 lines (73 loc) · 2.51 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
from __future__ import annotations
import hashlib
from pathlib import Path
from typing import TYPE_CHECKING, Optional
import pypandoc
if TYPE_CHECKING:
from docxplain.formats import PandocFormat
__all__ = ["convert_file", "get_hash"]
def convert_file(
    filename: str,
    output_format: PandocFormat,
    suffix: Optional[str] = None,
    header: Optional[str] = None,
) -> bool:
    """Convert a docx file into a plain text file stored next to it.

    The output path is the source path with its suffix replaced. Any
    existing output file is overwritten, and its previous SHA256 digest is
    used to report whether the conversion changed it.

    Parameters
    ----------
    filename : `str`
        Path of the docx file.
    output_format : `docxplain.formats.PandocFormat`
        The requested output format for the converted file.
    suffix : `str`, optional
        Custom suffix for the output file, including the ``"."`` prefix
        (for example ``".extracted.txt"``). When omitted, ``".txt"`` is
        used.
    header : `str`, optional
        Content that is added to the top of the converted file. Skipped
        when empty or `None`.

    Returns
    -------
    changed : `bool`
        `True` if the output file did not exist before, or if its content
        differs from what was there before the conversion; `False`
        otherwise.

    Raises
    ------
    RuntimeError
        Raised if ``filename`` does not point to an existing file.
    """
    # NOTE(review): ``output_format`` is accepted but never consulted below;
    # the pandoc target is always "plain" and the default suffix is always
    # ".txt". Confirm whether the format should drive either of those.
    source = Path(filename)
    if not source.is_file():
        raise RuntimeError(f"Source file {source} does not exist.")

    target = source.with_suffix(".txt" if suffix is None else suffix)

    # Digest of the pre-existing output (if any), captured before pandoc
    # overwrites it. ``None`` means there was nothing to compare against.
    previous_digest = get_hash(target) if target.is_file() else None

    pypandoc.convert_file(str(source), "plain", outputfile=str(target))

    # Post-process the pandoc output in place.
    if header:
        insert_header(target, header, source.name)
    trim_trailing_whitespace(target)

    if previous_digest is None:
        # A brand-new output file always counts as a change.
        return True
    return get_hash(target) != previous_digest
def insert_header(path: Path, header: str, docx_name: str) -> None:
    """Add a header to the beginning of a plain text file.

    The file is rewritten in place with the rendered header, a blank line,
    and then the original content.

    Parameters
    ----------
    path : `pathlib.Path`
        Path of the plain text file to update.
    header : `str`
        Header template, rendered with `str.format`. The ``{docx}``
        placeholder is replaced with ``docx_name``; literal braces in the
        template must be doubled (``{{`` / ``}}``).
    docx_name : `str`
        Name of the source docx file, made available to the template as
        ``docx``.
    """
    # Pandoc always emits UTF-8, so read and write with an explicit encoding
    # rather than the locale default (which is not UTF-8 on every platform).
    content = path.read_text(encoding="utf-8")
    context = {"docx": docx_name}
    content = "\n\n".join((header.format(**context), content))
    path.write_text(content, encoding="utf-8")
def trim_trailing_whitespace(path: Path) -> None:
    """Trim trailing whitespace from the plain text file, updating it
    in place.

    Trailing whitespace is removed from every line, lines are re-joined
    with ``"\\n"``, and the result always ends with a single appended
    newline. Blank lines are kept (as empty lines).

    Parameters
    ----------
    path : `pathlib.Path`
        Path of the plain text file to clean up.
    """
    # Pandoc always emits UTF-8, so read and write with an explicit encoding
    # rather than the locale default (which is not UTF-8 on every platform).
    content = path.read_text(encoding="utf-8")
    formatted_lines = [line.rstrip() for line in content.splitlines()]
    new_content = "\n".join(formatted_lines) + "\n"
    path.write_text(new_content, encoding="utf-8")
def get_hash(path: Path) -> str:
    """Return the hexadecimal SHA256 digest of a file's bytes.

    Parameters
    ----------
    path : `pathlib.Path`
        Path of the file to hash.

    Returns
    -------
    digest : `str`
        SHA256 digest of the file content, as a hexadecimal string.
    """
    with path.open("rb") as stream:
        digest = hashlib.sha256(stream.read())
    return digest.hexdigest()