|
| 1 | +#!/usr/bin/env python3 |
| 2 | +import argparse |
| 3 | +import os |
| 4 | +import re |
| 5 | +import shutil |
| 6 | +import sys |
| 7 | +import unicodedata |
| 8 | +from pathlib import Path |
| 9 | +from tempfile import NamedTemporaryFile |
| 10 | + |
# ----- Patterns & Tables -----

# Invisible characters that should simply vanish: zero-width
# space/non-joiner/joiner, word joiner, LTR/RTL marks, bidi
# embedding/override controls, the Arabic Letter Mark, and the BOM.
ZERO_WIDTH_BIDI_RE = re.compile(
    "[\u200B-\u200D\u2060\u200E\u200F\u202A-\u202E\u061C\uFEFF]"
)

# Non-breaking-space lookalikes that become a plain ASCII space:
# NBSP, narrow NBSP, and the deprecated Mongolian vowel separator.
NBSP_RE = re.compile("[\u00A0\u202F\u180E]")

# Typographic punctuation folded to ASCII. French guillemets « » are
# deliberately absent from this table and pass through untouched.
TYPO_MAP = {
    # hyphen, non-breaking hyphen, figure dash, en dash, em dash, minus sign
    **dict.fromkeys("\u2010\u2011\u2012\u2013\u2014\u2212", "-"),
    # left/right single quotes, high-reversed-9 single quote, prime
    **dict.fromkeys("\u2018\u2019\u201B\u2032", "'"),
    # left/right double quotes, high-reversed-9 double quote, double prime
    **dict.fromkeys("\u201C\u201D\u201F\u2033", '"'),
    # horizontal ellipsis
    "\u2026": "...",
}

# Pre-built translation table so str.translate() does all single-char
# replacements in one C-level pass.
TRANSLATE_TABLE = str.maketrans(TYPO_MAP)

# Character sequences that typically appear when UTF-8 bytes were decoded
# as Latin-1/CP1252 ("mojibake"), plus the replacement character.
MOJIBAKE_MARKERS = ("Ã", "Â", "â", "€", "œ", "�")
| 55 | + |
| 56 | + |
| 57 | +# ----- Helpers ----- |
| 58 | + |
def is_binary(path: Path, probe_size: int = 4096) -> bool:
    """
    Heuristically decide whether *path* is a binary (non-text) file.

    A file counts as binary when its first *probe_size* bytes contain a NUL
    byte or do not decode as UTF-8.  A decode error confined to the last few
    bytes of a full probe is ignored: the probe may have cut a multi-byte
    UTF-8 character in half, which says nothing about the file itself.

    Unreadable files (permission errors, vanished paths) are reported as
    binary so the caller skips them.
    """
    try:
        with open(path, "rb") as f:
            chunk = f.read(probe_size)
    except OSError:
        # Can't read it — treat like binary so it gets skipped, not crashed on.
        return True
    if b"\x00" in chunk:
        return True
    try:
        chunk.decode("utf-8")
    except UnicodeDecodeError as err:
        # A UTF-8 sequence is at most 4 bytes, so an error starting within
        # the final 3 bytes of a *full* probe is mere truncation, not binary.
        if len(chunk) == probe_size and err.start >= probe_size - 3:
            return False
        return True
    return False
| 73 | + |
| 74 | + |
def looks_like_mojibake(text: str) -> bool:
    """Return True when *text* contains any known mojibake marker character."""
    for marker in MOJIBAKE_MARKERS:
        if marker in text:
            return True
    return False
| 77 | + |
| 78 | + |
def mojibake_score(text: str) -> int:
    """Count occurrences of mojibake markers in *text* (higher = more garbled)."""
    total = 0
    for marker in MOJIBAKE_MARKERS:
        total += text.count(marker)
    return total
| 81 | + |
| 82 | + |
def try_fix_mojibake(text: str) -> str:
    """
    Attempt to repair UTF-8 text that was mistakenly decoded as Latin-1
    (mojibake, e.g. "Ã©" where "é" was meant).

    The repair is the inverse round-trip: re-encode as Latin-1, re-decode as
    UTF-8.  It is attempted only when marker characters are present, and the
    result is kept only if it strictly reduces the marker count; in every
    other case the input is returned unchanged.
    """
    if not looks_like_mojibake(text):
        return text

    try:
        # Strict round-trip: succeeds only when every code point fits in
        # Latin-1 AND the resulting bytes are valid UTF-8.
        repaired = text.encode("latin-1", errors="strict").decode("utf-8", errors="strict")
    except UnicodeError:
        # Not Latin-1-representable, or bytes aren't valid UTF-8 — the text
        # wasn't this flavor of mojibake; leave it alone.
        return text

    # Accept the repair only when it demonstrably improves the text.
    if mojibake_score(repaired) < mojibake_score(text):
        return repaired
    return text
| 101 | + |
| 102 | + |
def normalize_text_block(s: str) -> str:
    """
    Normalize one block of text, in this order:

    1. repair UTF-8/Latin-1 mojibake (must run before the other rules,
       which could otherwise destroy the marker characters);
    2. Unicode NFC normalization — composes accents, no compatibility folding;
    3. strip zero-width / bidi control characters and the BOM;
    4. turn NBSP-like spaces into a normal ASCII space;
    5. fold typographic dashes/quotes/ellipsis to ASCII.

    French guillemets « » and accented letters are intentionally untouched.
    """
    # (The previous revision kept an unused `original = s` snapshot; removed.)
    s = try_fix_mojibake(s)
    s = unicodedata.normalize("NFC", s)
    s = ZERO_WIDTH_BIDI_RE.sub("", s)
    s = NBSP_RE.sub(" ", s)
    return s.translate(TRANSLATE_TABLE)
| 124 | + |
| 125 | + |
| 126 | +# ----- Processing ----- |
| 127 | + |
def process_file(path: Path) -> None:
    """
    Normalize one UTF-8 text file in place.

    Binary and non-UTF-8 files are skipped.  When the content changes, the
    file is rewritten atomically through a temp file in the same directory;
    the original permission bits and line endings are preserved, and the
    temp file is removed if the rewrite fails partway.
    """
    print(f"[FOUND] {path}")
    if is_binary(path):
        print(" -> Skipped (binary file)")
        return

    try:
        # newline="" disables universal-newline translation so CRLF/CR files
        # are not silently rewritten with LF endings.
        with open(path, "r", encoding="utf-8", errors="strict", newline="") as fin:
            text = fin.read()
    except UnicodeDecodeError:
        # If strict UTF-8 fails, skip — avoids accidental corruption.
        print(" -> Skipped (encoding not UTF-8)")
        return

    normalized = normalize_text_block(text)
    if normalized == text:
        print(" -> OK (clean)")
        return

    # Atomic replace: write a sibling temp file, then rename over the original.
    mode = path.stat().st_mode
    temp_name = None
    try:
        with NamedTemporaryFile(
            "w", delete=False, encoding="utf-8", newline="", dir=str(path.parent)
        ) as fout:
            temp_name = fout.name
            fout.write(normalized)
        # NamedTemporaryFile creates files with 0600; restore the original mode.
        os.chmod(temp_name, mode)
        # os.replace is atomic within one filesystem (same dir guarantees that).
        os.replace(temp_name, path)
    except OSError:
        # Don't leave a half-written temp file lying around.
        if temp_name is not None and os.path.exists(temp_name):
            os.unlink(temp_name)
        raise
    print(" -> ✅ FIXED")
| 153 | + |
| 154 | + |
def main() -> None:
    """CLI entry point: recursively normalize all files with a given extension."""
    parser = argparse.ArgumentParser(
        # The default HelpFormatter reflows the description into a single
        # paragraph; the raw formatter keeps the bullet list readable.
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description=(
            "Recursive Unicode normalizer for text files.\n"
            "- Fixes dashes/quotes/ellipsis to ASCII punctuation\n"
            "- Repairs French accents (NFC) and common UTF-8 mojibake\n"
            "- Removes zero-width/BOM/bidi controls\n"
            "- Converts NBSP/narrow NBSP to normal space\n"
            "- Keeps « » and French letters intact"
        ),
    )
    parser.add_argument("folder", help="Root folder to scan recursively")
    parser.add_argument("extension", help="File extension to match (e.g. .txt, .md, .csv)")
    args = parser.parse_args()

    root = Path(args.folder)
    if not root.exists():
        print(f"Folder not found: {root}", file=sys.stderr)
        sys.exit(1)

    # Accept the extension with or without the leading dot, case-insensitively.
    ext = args.extension.lower()
    if not ext.startswith("."):
        ext = "." + ext

    # Recursive scan; symlinked files are skipped so we only touch real files.
    for p in root.rglob("*"):
        if p.is_file() and p.suffix.lower() == ext and not p.is_symlink():
            process_file(p)
| 183 | + |
| 184 | + |
# Run the CLI only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()