Skip to content

Commit 7272b2a

Browse files
committed
add script scripts/remove_gremlins.py
1 parent 44e8cd4 commit 7272b2a

1 file changed

Lines changed: 186 additions & 0 deletions

File tree

scripts/remove_gremlins.py

Lines changed: 186 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,186 @@
1+
#!/usr/bin/env python3
2+
import argparse
3+
import os
4+
import re
5+
import shutil
6+
import sys
7+
import unicodedata
8+
from pathlib import Path
9+
from tempfile import NamedTemporaryFile
10+
11+
# ----- Patterns & Tables -----
12+
13+
# Zero-width and bidi control junk to remove.
# All of these are invisible formatting characters; every match is deleted
# (replaced with the empty string) by the normalizer.
ZERO_WIDTH_BIDI_RE = re.compile(
    "["  # open char class
    "\u200B-\u200D"  # zero-width space / non-joiner / joiner
    "\u2060"  # word joiner
    "\u200E\u200F"  # LTR/RTL marks
    "\u202A-\u202E"  # bidi embedding/override
    "\u2066-\u2069"  # bidi isolates (LRI, RLI, FSI, PDI)
    "\u061C"  # Arabic Letter Mark
    "\uFEFF"  # BOM
    "]"
)
24+
25+
# NBSP-like spaces to turn into a normal ASCII space
26+
NBSP_RE = re.compile(
    "["
    "\u00A0"  # NBSP
    "\u202F"  # narrow NBSP
    "\u180E"  # deprecated Mongolian vowel separator
    "]"
)
27+
28+
# Common typographic → ASCII replacements (leave French « » intact).
# Every key is a single code point; the value may be longer than one
# character (the ellipsis expands to three dots).
TYPO_MAP = {
    # Dashes and hyphens
    "\u2010": "-",  # hyphen
    "\u2011": "-",  # non-breaking hyphen
    "\u2012": "-",  # figure dash
    "\u2013": "-",  # en dash
    "\u2014": "-",  # em dash
    "\u2212": "-",  # minus sign
    # Single quotes
    "\u2018": "'",  # left single quotation mark
    "\u2019": "'",  # right single quotation mark / apostrophe
    "\u201B": "'",  # single high-reversed-9 quotation mark
    "\u2032": "'",  # prime (often used as apostrophe)
    # Double quotes
    "\u201C": '"',  # left double quotation mark
    "\u201D": '"',  # right double quotation mark
    "\u201F": '"',  # double high-reversed-9 quotation mark
    "\u2033": '"',  # double prime
    # Ellipsis
    "\u2026": "...",  # ellipsis
}

# Translation table for str.translate, built once at import time from TYPO_MAP
TRANSLATE_TABLE = str.maketrans(TYPO_MAP)
52+
53+
# Heuristic markers that commonly show up in UTF-8→Latin-1 mojibake
54+
MOJIBAKE_MARKERS = (
    "Ã",  # how UTF-8 lead byte 0xC3 reads in Latin-1 / cp1252
    "Â",  # how UTF-8 lead byte 0xC2 reads in Latin-1 / cp1252
    "â",  # how UTF-8 lead byte 0xE2 reads in Latin-1 / cp1252
    "€",  # byte 0x80 read as cp1252
    "œ",  # byte 0x9C read as cp1252
    "�",  # U+FFFD replacement character
)
55+
56+
57+
# ----- Helpers -----
58+
59+
def is_binary(path: Path, probe_size: int = 4096) -> bool:
    """
    Basic binary detection on the first ``probe_size`` bytes of ``path``.

    Returns True when the probe contains a NUL byte, when it is not valid
    UTF-8, or when the file cannot be opened/read. Returns False otherwise
    (including for an empty file).

    A multi-byte UTF-8 character that is merely cut in half by the end of the
    probe is NOT treated as a decoding failure; only genuinely invalid byte
    sequences are.
    """
    import codecs  # local import: only this helper needs it

    try:
        with open(path, "rb") as f:
            chunk = f.read(probe_size)
    except (OSError, ValueError):
        # Unreadable or unopenable paths are reported as binary so that
        # callers skip them instead of crashing.
        return True

    if b"\x00" in chunk:
        return True

    # If the probe was filled completely, the file may continue and the last
    # character may be truncated. Decoding with final=False lets the decoder
    # buffer such an incomplete tail instead of raising. When the whole file
    # fit in the probe, a dangling tail is a real error, so final=True.
    reached_eof = len(chunk) < probe_size
    decoder = codecs.getincrementaldecoder("utf-8")(errors="strict")
    try:
        decoder.decode(chunk, final=reached_eof)
    except UnicodeDecodeError:
        return True
    return False
73+
74+
75+
def looks_like_mojibake(text: str) -> bool:
    """Return True if ``text`` contains at least one entry of MOJIBAKE_MARKERS."""
    for marker in MOJIBAKE_MARKERS:
        if marker in text:
            return True
    return False
77+
78+
79+
def mojibake_score(text: str) -> int:
    """Return the total number of MOJIBAKE_MARKERS occurrences found in ``text``."""
    total = 0
    for marker in MOJIBAKE_MARKERS:
        total += text.count(marker)
    return total
81+
82+
83+
def try_fix_mojibake(text: str) -> str:
    """
    Attempt to repair common mojibake where UTF-8 bytes were decoded with a
    single-byte Western codec (Latin-1 or Windows-1252).

    The text is re-encoded with the suspected wrong codec and decoded again as
    strict UTF-8. A candidate is accepted only if the round trip succeeds for
    the whole text AND it lowers ``mojibake_score``; otherwise the input is
    returned unchanged.

    Latin-1 is tried first. Windows-1252 is the fallback because sequences
    such as "â€™" (a mis-decoded "’") contain "€" and "™", which do not exist
    in Latin-1, so the Latin-1 round trip can never repair them.
    """
    if not looks_like_mojibake(text):
        return text

    baseline = mojibake_score(text)
    for wrong_codec in ("latin-1", "cp1252"):
        try:
            repaired = text.encode(wrong_codec, errors="strict").decode(
                "utf-8", errors="strict"
            )
        except (UnicodeEncodeError, UnicodeDecodeError):
            # Either the text has characters outside this codec, or the bytes
            # are not valid UTF-8: this was not mojibake of that flavour.
            continue
        # Accept only a strict improvement, never a sideways change.
        if mojibake_score(repaired) < baseline:
            return repaired
    return text
101+
102+
103+
def normalize_text_block(s: str) -> str:
    """
    Return ``s`` with Unicode "gremlins" cleaned up.

    Steps, in order:
      1. mojibake repair (``try_fix_mojibake``),
      2. NFC normalization,
      3. removal of zero-width/bidi controls (``ZERO_WIDTH_BIDI_RE``),
      4. NBSP-like spaces replaced by an ASCII space (``NBSP_RE``),
      5. typographic punctuation mapped to ASCII (``TRANSLATE_TABLE``).

    The input string is not modified; a new string is returned.
    """
    # Fix mojibake first (works better before other rules)
    s = try_fix_mojibake(s)

    # Normalize to composed form (keep accents, avoid compatibility folding)
    s = unicodedata.normalize("NFC", s)

    # Remove zero-width/bidi controls
    s = ZERO_WIDTH_BIDI_RE.sub("", s)

    # Convert NBSP-like to a normal ASCII space
    s = NBSP_RE.sub(" ", s)

    # Replace typographic gremlins with ASCII equivalents (quotes, dashes, ellipsis)
    s = s.translate(TRANSLATE_TABLE)

    # Keep French guillemets « » as-is; do NOT touch accents

    return s
124+
125+
126+
# ----- Processing -----
127+
128+
def process_file(path: Path) -> None:
    """
    Normalize one file in place and print a one-line status for it.

    The file is skipped when ``is_binary`` flags it or when it is not strict
    UTF-8. Otherwise its full text is passed through ``normalize_text_block``;
    if the result differs, the file is replaced via a temporary file created
    in the same directory.

    Line endings are preserved as found, and the original file's permission
    bits are copied onto the replacement.
    """
    print(f"[FOUND] {path}")
    if is_binary(path):
        print(" -> Skipped (binary file)")
        return

    try:
        # Read full text to allow safe mojibake detection and normalization.
        # newline="" disables newline translation so CRLF/LF are kept verbatim.
        with open(path, "r", encoding="utf-8", errors="strict", newline="") as fin:
            text = fin.read()
    except UnicodeDecodeError:
        # If strict UTF-8 fails, treat as binary/skip — avoids accidental corruption
        print(" -> Skipped (encoding not UTF-8)")
        return

    normalized = normalize_text_block(text)
    if normalized == text:
        print(" -> OK (clean)")
        return

    # Write to a temp file in the same directory, then swap it into place.
    fout = NamedTemporaryFile(
        "w",
        delete=False,
        encoding="utf-8",
        newline="",  # write line endings exactly as they are in the text
        dir=str(path.parent),
    )
    temp_name = fout.name
    try:
        with fout:
            fout.write(normalized)
        # The temp file is created with restrictive permissions; carry over
        # the original file's mode so the replacement stays accessible.
        shutil.copymode(path, temp_name)
        # os.replace overwrites an existing destination on every platform.
        os.replace(temp_name, path)
    except BaseException:
        # Best-effort cleanup so a failed write does not leave a stray temp
        # file behind; the original error is always re-raised.
        try:
            os.unlink(temp_name)
        except OSError:
            pass
        raise
    print(" -> ✅ FIXED")
153+
154+
155+
def main():
    """
    Command-line entry point.

    Takes a root folder and a file extension, then runs ``process_file`` on
    every regular, non-symlink file under the root whose suffix matches the
    extension (case-insensitive). Exits with status 1 if the root is not an
    existing directory.
    """
    parser = argparse.ArgumentParser(
        description=(
            "Recursive Unicode normalizer for text files.\n"
            "- Fixes dashes/quotes/ellipsis to ASCII punctuation\n"
            "- Repairs French accents (NFC) and common UTF-8 mojibake\n"
            "- Removes zero-width/BOM/bidi controls\n"
            "- Converts NBSP/narrow NBSP to normal space\n"
            "- Keeps « » and French letters intact"
        ),
        # The default formatter re-wraps the description and collapses the
        # bullet list into a single paragraph; keep the line breaks as written.
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument("folder", help="Root folder to scan recursively")
    parser.add_argument("extension", help="File extension to match (e.g. .txt, .md, .csv)")
    args = parser.parse_args()

    root = Path(args.folder)
    # is_dir() also rejects a path that exists but is a regular file.
    if not root.is_dir():
        print(f"Folder not found: {root}", file=sys.stderr)
        sys.exit(1)

    ext = args.extension.lower()
    if not ext.startswith("."):
        ext = "." + ext

    # Recursive search. The matches are collected before any file is touched,
    # because process_file creates temp files and replaces entries inside the
    # directories being walked.
    targets = [
        p
        for p in root.rglob("*")
        if p.is_file() and p.suffix.lower() == ext and not p.is_symlink()
    ]
    for p in targets:
        process_file(p)


if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)