|
| 1 | +#!/usr/bin/env python3 |
| 2 | +import argparse |
| 3 | +import os |
| 4 | +import re |
| 5 | +import shutil |
| 6 | +import sys |
| 7 | +import unicodedata |
| 8 | +from pathlib import Path |
| 9 | +from tempfile import NamedTemporaryFile |
| 10 | + |
# ----- Patterns & Tables -----

# Invisible characters that should simply vanish: zero-width
# space/non-joiner/joiner, word joiner, LTR/RTL marks, bidi
# embedding/override controls, the Arabic Letter Mark, and the BOM.
ZERO_WIDTH_BIDI_RE = re.compile(
    "[\u200B-\u200D\u2060\u200E\u200F\u202A-\u202E\u061C\uFEFF]"
)

# Non-breaking-space lookalikes that become a plain ASCII space:
# NBSP, narrow NBSP, and the deprecated Mongolian vowel separator.
NBSP_RE = re.compile("[\u00A0\u202F\u180E]")

# Typographic punctuation folded to ASCII. French guillemets « » are
# deliberately absent from this table and pass through untouched.
TYPO_MAP = {
    # hyphen, non-breaking hyphen, figure dash, en dash, em dash, minus sign
    **dict.fromkeys("\u2010\u2011\u2012\u2013\u2014\u2212", "-"),
    # left/right single quotes, high-reversed-9 single quote, prime
    **dict.fromkeys("\u2018\u2019\u201B\u2032", "'"),
    # left/right double quotes, high-reversed-9 double quote, double prime
    **dict.fromkeys("\u201C\u201D\u201F\u2033", '"'),
    # horizontal ellipsis
    "\u2026": "...",
}

# Pre-built translation table so str.translate() does all single-char
# replacements in one C-level pass.
TRANSLATE_TABLE = str.maketrans(TYPO_MAP)

# Character sequences that typically appear when UTF-8 bytes were decoded
# as Latin-1/CP1252 ("mojibake"), plus the replacement character.
MOJIBAKE_MARKERS = ("Ã", "Â", "â", "€", "œ", "�")
| 55 | + |
| 56 | + |
| 57 | +# ----- Helpers ----- |
| 58 | + |
def is_binary(path: Path, probe_size: int = 4096) -> bool:
    """
    Heuristically decide whether *path* is a binary (non-text) file.

    A file counts as binary when its first *probe_size* bytes contain a NUL
    byte or do not decode as UTF-8.  A decode error confined to the last few
    bytes of a full probe is ignored: the probe may have cut a multi-byte
    UTF-8 character in half, which says nothing about the file itself.

    Unreadable files (permission errors, vanished paths) are reported as
    binary so the caller skips them.
    """
    try:
        with open(path, "rb") as f:
            chunk = f.read(probe_size)
    except OSError:
        # Can't read it — treat like binary so it gets skipped, not crashed on.
        return True
    if b"\x00" in chunk:
        return True
    try:
        chunk.decode("utf-8")
    except UnicodeDecodeError as err:
        # A UTF-8 sequence is at most 4 bytes, so an error starting within
        # the final 3 bytes of a *full* probe is mere truncation, not binary.
        if len(chunk) == probe_size and err.start >= probe_size - 3:
            return False
        return True
    return False
| 73 | + |
| 74 | + |
def looks_like_mojibake(text: str) -> bool:
    """Return True when *text* contains any known mojibake marker character."""
    for marker in MOJIBAKE_MARKERS:
        if marker in text:
            return True
    return False
| 77 | + |
| 78 | + |
def mojibake_score(text: str) -> int:
    """Count occurrences of mojibake markers in *text* (higher = more garbled)."""
    total = 0
    for marker in MOJIBAKE_MARKERS:
        total += text.count(marker)
    return total
| 81 | + |
| 82 | + |
def try_fix_mojibake(text: str) -> str:
    """
    Attempt to repair UTF-8 text that was mistakenly decoded as Latin-1
    (mojibake, e.g. "Ã©" where "é" was meant).

    The repair is the inverse round-trip: re-encode as Latin-1, re-decode as
    UTF-8.  It is attempted only when marker characters are present, and the
    result is kept only if it strictly reduces the marker count; in every
    other case the input is returned unchanged.
    """
    if not looks_like_mojibake(text):
        return text

    try:
        # Strict round-trip: succeeds only when every code point fits in
        # Latin-1 AND the resulting bytes are valid UTF-8.
        repaired = text.encode("latin-1", errors="strict").decode("utf-8", errors="strict")
    except UnicodeError:
        # Not Latin-1-representable, or bytes aren't valid UTF-8 — the text
        # wasn't this flavor of mojibake; leave it alone.
        return text

    # Accept the repair only when it demonstrably improves the text.
    if mojibake_score(repaired) < mojibake_score(text):
        return repaired
    return text
| 101 | + |
| 102 | + |
def normalize_text_block(s: str) -> str:
    """
    Normalize one block of text, in this order:

    1. repair UTF-8/Latin-1 mojibake (must run before the other rules,
       which could otherwise destroy the marker characters);
    2. Unicode NFC normalization — composes accents, no compatibility folding;
    3. strip zero-width / bidi control characters and the BOM;
    4. turn NBSP-like spaces into a normal ASCII space;
    5. fold typographic dashes/quotes/ellipsis to ASCII.

    French guillemets « » and accented letters are intentionally untouched.
    """
    # (The previous revision kept an unused `original = s` snapshot; removed.)
    s = try_fix_mojibake(s)
    s = unicodedata.normalize("NFC", s)
    s = ZERO_WIDTH_BIDI_RE.sub("", s)
    s = NBSP_RE.sub(" ", s)
    return s.translate(TRANSLATE_TABLE)
| 124 | + |
| 125 | + |
| 126 | +# ----- Processing ----- |
| 127 | + |
def process_file(path: Path) -> None:
    """
    Normalize one UTF-8 text file in place.

    Binary and non-UTF-8 files are skipped.  When the content changes, the
    file is rewritten atomically through a temp file in the same directory;
    the original permission bits and line endings are preserved, and the
    temp file is removed if the rewrite fails partway.
    """
    print(f"[FOUND] {path}")
    if is_binary(path):
        print(" -> Skipped (binary file)")
        return

    try:
        # newline="" disables universal-newline translation so CRLF/CR files
        # are not silently rewritten with LF endings.
        with open(path, "r", encoding="utf-8", errors="strict", newline="") as fin:
            text = fin.read()
    except UnicodeDecodeError:
        # If strict UTF-8 fails, skip — avoids accidental corruption.
        print(" -> Skipped (encoding not UTF-8)")
        return

    normalized = normalize_text_block(text)
    if normalized == text:
        print(" -> OK (clean)")
        return

    # Atomic replace: write a sibling temp file, then rename over the original.
    mode = path.stat().st_mode
    temp_name = None
    try:
        with NamedTemporaryFile(
            "w", delete=False, encoding="utf-8", newline="", dir=str(path.parent)
        ) as fout:
            temp_name = fout.name
            fout.write(normalized)
        # NamedTemporaryFile creates files with 0600; restore the original mode.
        os.chmod(temp_name, mode)
        # os.replace is atomic within one filesystem (same dir guarantees that).
        os.replace(temp_name, path)
    except OSError:
        # Don't leave a half-written temp file lying around.
        if temp_name is not None and os.path.exists(temp_name):
            os.unlink(temp_name)
        raise
    print(" -> ✅ FIXED")
| 153 | + |
| 154 | + |
def main() -> None:
    """CLI entry point: recursively normalize all files with a given extension."""
    parser = argparse.ArgumentParser(
        # The default HelpFormatter reflows the description into a single
        # paragraph; the raw formatter keeps the bullet list readable.
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description=(
            "Recursive Unicode normalizer for text files.\n"
            "- Fixes dashes/quotes/ellipsis to ASCII punctuation\n"
            "- Repairs French accents (NFC) and common UTF-8 mojibake\n"
            "- Removes zero-width/BOM/bidi controls\n"
            "- Converts NBSP/narrow NBSP to normal space\n"
            "- Keeps « » and French letters intact"
        ),
    )
    parser.add_argument("folder", help="Root folder to scan recursively")
    parser.add_argument("extension", help="File extension to match (e.g. .txt, .md, .csv)")
    args = parser.parse_args()

    root = Path(args.folder)
    if not root.exists():
        print(f"Folder not found: {root}", file=sys.stderr)
        sys.exit(1)

    # Accept the extension with or without the leading dot, case-insensitively.
    ext = args.extension.lower()
    if not ext.startswith("."):
        ext = "." + ext

    # Recursive scan; symlinked files are skipped so we only touch real files.
    for p in root.rglob("*"):
        if p.is_file() and p.suffix.lower() == ext and not p.is_symlink():
            process_file(p)
| 183 | + |
| 184 | + |
# Run the CLI only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()