22
33import hashlib
44from pathlib import Path
5+ from typing import Optional
56
67import pypandoc
78
89__all__ = ["convert_file" , "get_hash" ]
910
1011
11- def convert_file (filename : str , suffix : str = ".txt" ) -> bool :
12+ def convert_file (
13+ filename : str , suffix : str = ".txt" , header : Optional [str ] = None
14+ ) -> bool :
1215 """Convert the docx file to plaintext.
1316
1417 Parameters
@@ -19,6 +22,8 @@ def convert_file(filename: str, suffix: str = ".txt") -> bool:
1922 Suffix for the output plain text file, including ``"."`` prefix.
2023 Default is ``".txt"``, but a suffix like ``".extracted.txt"``
2124 could be useful.
25+ header : `str`, optional
26+ Content that is added to the top of the plain text file.
2227
2328 Returns
2429 -------
@@ -38,15 +43,26 @@ def convert_file(filename: str, suffix: str = ".txt") -> bool:
3843
3944 pypandoc .convert_file (str (docx_path ), "plain" , outputfile = str (plain_path ))
4045
46+ if header :
47+ insert_header (plain_path , header , docx_path .name )
48+
4149 if exists :
4250 final_hash = get_hash (plain_path )
4351 return final_hash != initial_hash
4452 else :
4553 return True
4654
4755
def insert_header(path: Path, header: str, docx_name: str) -> None:
    """Add a header to the beginning of a plain text file.

    The file is rewritten in place as the rendered header, one blank line,
    and then the original content.

    Parameters
    ----------
    path : `pathlib.Path`
        Path of the plain text file to modify.
    header : `str`
        Header template, rendered with `str.format`. The only available
        replacement field is ``{docx}``; any literal braces in the header
        must be doubled (``{{`` / ``}}``).
    docx_name : `str`
        Name of the source docx file, substituted for ``{docx}``.
    """
    # Pin the encoding: pandoc emits UTF-8, while the pathlib default follows
    # the locale and can corrupt or reject non-ASCII text on some platforms.
    body = path.read_text(encoding="utf-8")
    rendered_header = header.format(docx=docx_name)
    # A bare "\n\n" separator gives exactly one empty line after the header
    # and leaves the first line of the body unindented.
    path.write_text("\n\n".join((rendered_header, body)), encoding="utf-8")
62+
63+
def get_hash(path: Path) -> str:
    """Get the SHA256 hash digest of a file.

    Parameters
    ----------
    path : `pathlib.Path`
        Path of the file to hash. The whole file is read into memory.

    Returns
    -------
    digest : `str`
        Hexadecimal SHA256 digest of the file's bytes.
    """
    file_bytes = path.read_bytes()
    return hashlib.sha256(file_bytes).hexdigest()
0 commit comments