22
33import hashlib
44from pathlib import Path
5+ from typing import Optional
56
67import pypandoc
78
89__all__ = ["convert_file" , "get_hash" ]
910
1011
11- def convert_file (filename : str , suffix : str = ".txt" ) -> bool :
12+ def convert_file (
13+ filename : str , suffix : str = ".txt" , header : Optional [str ] = None
14+ ) -> bool :
1215 """Convert the docx file to plaintext.
1316
1417 Parameters
@@ -19,6 +22,8 @@ def convert_file(filename: str, suffix: str = ".txt") -> bool:
1922 Suffix for the output plain text file, including ``"."`` prefix.
2023 Default is ``".txt"``, but a suffix like ``".extracted.txt"``
2124 could be useful.
25+ header : `str`, optional
26+ Content that is added to the top of the plain text file.
2227
2328 Returns
2429 -------
@@ -38,13 +43,23 @@ def convert_file(filename: str, suffix: str = ".txt") -> bool:
3843
3944 pypandoc .convert_file (str (docx_path ), "plain" , outputfile = str (plain_path ))
4045
46+ if header :
47+ insert_header (plain_path , header )
48+
4149 if exists :
4250 final_hash = get_hash (plain_path )
4351 return final_hash != initial_hash
4452 else :
4553 return True
4654
4755
def insert_header(path: Path, header: str) -> None:
    """Add a header to the beginning of a plain text file.

    Parameters
    ----------
    path : `pathlib.Path`
        Path of the plain text file, which is rewritten in place.
    header : `str`
        Content that is placed at the top of the file, separated from the
        existing content by a blank line.
    """
    # Pin the encoding rather than relying on the platform's locale default,
    # so non-ASCII text survives the read/write round trip on every platform.
    # NOTE(review): UTF-8 is assumed because the file is produced by pandoc;
    # confirm if other producers ever call this helper.
    content = path.read_text(encoding="utf-8")
    path.write_text("\n\n".join((header, content)), encoding="utf-8")
61+
62+
4863def get_hash (path : Path ) -> str :
4964 """Get the SHA256 hash digest of a file."""
5065 m = hashlib .sha256 ()
0 commit comments