Skip to content

Commit 2202cdb

Browse files
committed
Initial implementation
- Package set up - docxplain command-line entrypoint that is usable as a pre-commit hook. - Integration with pandoc for document conversion - Hashing of the plain text artifact to determine if it has changed.
1 parent 9c2d2e5 commit 2202cdb

10 files changed

Lines changed: 248 additions & 1 deletion

File tree

.pre-commit-config.yaml

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
repos:
2+
- repo: https://github.com/pre-commit/pre-commit-hooks
3+
rev: v3.4.0
4+
hooks:
5+
- id: check-yaml
6+
- id: check-toml
7+
- id: check-json
8+
- id: trailing-whitespace
9+
10+
- repo: https://github.com/Lucas-C/pre-commit-hooks-markup
11+
rev: v1.0.1
12+
hooks:
13+
- id: rst-linter
14+
files: (README\.rst)|(CHANGELOG\.rst)
15+
16+
- repo: https://github.com/PyCQA/isort
17+
rev: 5.7.0
18+
hooks:
19+
- id: isort
20+
additional_dependencies:
21+
- toml
22+
23+
- repo: https://github.com/psf/black
24+
rev: 20.8b1
25+
hooks:
26+
- id: black
27+
28+
- repo: https://github.com/asottile/blacken-docs
29+
rev: v1.10.0
30+
hooks:
31+
- id: blacken-docs
32+
additional_dependencies: [black==20.8b1]
33+
args: [-l, "79", -t, py38]
34+
35+
- repo: https://gitlab.com/pycqa/flake8
36+
rev: 3.9.0
37+
hooks:
38+
- id: flake8

CHANGELOG.rst

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
Change log
2+
==========
3+
4+
Unreleased
5+
----------
6+
7+
Initial release.
Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,5 @@
1-
# pre-commit-docx-plain
1+
#####################
2+
pre-commit-docx-plain
3+
#####################
4+
25
Pre-commit hook for converting Office (docx) files into plain text (using Pandoc).

pyproject.toml

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
[build-system]
2+
requires = [
3+
"setuptools>=42",
4+
"wheel",
5+
"setuptools_scm[toml]>=3.4"
6+
]
7+
build-backend = 'setuptools.build_meta'
8+
9+
[tool.setuptools_scm]
10+
11+
[tool.black]
12+
line-length = 79
13+
target-version = ['py38']
14+
exclude = '''
15+
/(
16+
\.eggs
17+
| \.git
18+
| \.mypy_cache
19+
| \.tox
20+
| \.venv
21+
| _build
22+
| build
23+
| dist
24+
)/
25+
'''
26+
# Use single-quoted strings so TOML treats the string like a Python r-string
27+
# Multi-line strings are implicitly treated by black as regular expressions
28+
29+
[tool.isort]
30+
include_trailing_comma = true
31+
multi_line_output = 3
32+
known_first_party = "docxplain"
33+
skip = ["docs/conf.py"]

setup.cfg

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
[metadata]
2+
name = docxplain
3+
description = Convert Office (docx) files to plain text using pandoc.
4+
author = Jonathan Sick
5+
author_email = hi@jsick.codes
6+
long_description = file: README.rst, CHANGELOG.rst, LICENSE
7+
long_description_content_type = text/x-rst
8+
url = https://github.com/jsickcodes/pre-commit-docx-plain
9+
project_urls =
10+
Change log = https://github.com/jsickcodes/pre-commit-docx-plain/blob/master/CHANGELOG.rst
11+
Source code = https://github.com/jsickcodes/pre-commit-docx-plain
12+
Issue tracker = https://github.com/jsickcodes/pre-commit-docx-plain/issues
13+
classifiers =
14+
Development Status :: 4 - Beta
15+
License :: OSI Approved :: MIT License
16+
Programming Language :: Python
17+
Programming Language :: Python :: 3
18+
Intended Audience :: Developers
19+
Natural Language :: English
20+
Operating System :: POSIX
21+
keywords =
22+
precommit
23+
24+
[options]
25+
zip_safe = False
26+
include_package_data = True
27+
package_dir =
28+
= src
29+
packages = find:
30+
python_requires = >=3.7
31+
setup_requires =
32+
setuptools_scm
33+
install_requires =
34+
importlib_metadata; python_version < "3.8"
35+
pypandoc
36+
py-pandoc
37+
38+
[options.packages.find]
39+
where = src
40+
41+
[options.entry_points]
42+
console_scripts =
43+
docxplain = docxplain.cli:main
44+
45+
[options.extras_require]
46+
dev =
47+
pytest
48+
49+
[flake8]
50+
max-line-length = 79
51+
# E203: whitespace before :, flake8 disagrees with PEP 8
52+
# W503: line break before binary operator, flake8 disagrees with PEP 8
53+
ignore = E203, W503
54+
55+
[mypy]
56+
disallow_untyped_defs = True
57+
disallow_incomplete_defs = True
58+
ignore_missing_imports = True
59+
show_error_codes = True
60+
strict_equality = True
61+
warn_redundant_casts = True
62+
warn_unreachable = True
63+
warn_unused_ignores = True

setup.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
from setuptools import setup
2+
3+
# Static package metadata is declared in setup.cfg; use_scm_version=True
# hands version discovery to setuptools_scm (configured in pyproject.toml).
setup(use_scm_version=True)

src/docxplain/__init__.py

Whitespace-only changes.

src/docxplain/cli.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
from __future__ import annotations
2+
3+
import argparse
4+
import sys
5+
6+
from docxplain.converter import convert_file
7+
8+
9+
def main() -> None:
    """Run the docxplain command-line interface.

    Parses the command line, converts the file given as ``source`` and
    then terminates the process through `sys.exit`: status 1 when the
    conversion reports a change, status 0 otherwise.
    """
    cli_args = create_parser().parse_args()
    # NOTE(review): the non-zero status on change presumably exists so a
    # pre-commit run fails when the text artifact was regenerated -- confirm.
    exit_status = 1 if convert_file(cli_args.source) else 0
    sys.exit(exit_status)
18+
19+
20+
def create_parser() -> argparse.ArgumentParser:
    """Build the argument parser for the command-line entrypoint.

    Returns
    -------
    parser : argparse.ArgumentParser
        Parser that accepts a single positional argument, ``source``.
    """
    cli_parser = argparse.ArgumentParser(
        description="Convert docx to plain text."
    )
    cli_parser.add_argument("source")
    return cli_parser

src/docxplain/converter.py

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
from __future__ import annotations
2+
3+
import hashlib
4+
from pathlib import Path
5+
6+
import pypandoc
7+
8+
__all__ = ["convert_file", "get_hash"]
9+
10+
11+
def convert_file(filename: str) -> bool:
    """Render a docx file as plain text next to the source document.

    The output path is the source path with its suffix replaced by
    ``.txt``; the conversion itself is delegated to
    ``pypandoc.convert_file`` with the ``plain`` output format.

    Parameters
    ----------
    filename : str
        Path of the document to convert.

    Returns
    -------
    changed : bool
        True if the text file did not exist before the conversion, or if
        its SHA256 digest differs afterwards; False otherwise.

    Raises
    ------
    RuntimeError
        If ``filename`` does not point to an existing file.
    """
    docx_path = Path(filename)
    if not docx_path.is_file():
        raise RuntimeError(f"Source file {docx_path} does not exist.")

    target = docx_path.with_suffix(".txt")
    # None means "no previous output"; a newly created text file is always
    # reported as changed without hashing it.
    previous_digest = get_hash(target) if target.is_file() else None

    pypandoc.convert_file(str(docx_path), "plain", outputfile=str(target))

    if previous_digest is None:
        return True
    return get_hash(target) != previous_digest
37+
38+
39+
def get_hash(path: Path) -> str:
    """Return the hexadecimal SHA256 digest of a file's contents.

    The whole file is read into memory as bytes and hashed in one call.
    """
    return hashlib.sha256(path.read_bytes()).hexdigest()

tox.ini

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
[tox]
2+
envlist =
3+
typing
4+
lint
5+
packaging
6+
isolated_build = True
7+
skip_missing_interpreters = True
8+
9+
[testenv:lint]
10+
description = Lint codebase by running pre-commit (Black, isort, Flake8).
11+
skip_install = true
12+
deps =
13+
pre-commit
14+
commands = pre-commit run --all-files
15+
16+
[testenv:typing]
17+
description = Run mypy.
18+
deps =
19+
mypy
20+
commands =
21+
mypy src tests setup.py
22+
23+
[testenv:packaging]
24+
description = Check packaging for PyPI with twine
25+
skip_install = true
26+
allowlist_externals =
27+
rm
28+
deps =
29+
twine
30+
commands =
31+
rm -rf dist
32+
python setup.py sdist bdist_wheel
33+
twine check dist/*

0 commit comments

Comments
 (0)