|
| 1 | +#!/usr/bin/env python |
| 2 | + |
| 3 | +""" |
| 4 | +Unicode Text Cleaner |
| 5 | +
|
| 6 | +This script normalizes problematic Unicode characters to their ASCII equivalents. |
| 7 | +It handles common issues like fancy quotes, em/en dashes, and zero-width spaces |
| 8 | +that can cause problems in text processing. |
| 9 | +
|
| 10 | +The script takes one or more input files and creates cleaned versions with |
| 11 | +".clean" inserted before the extension (foo.txt -> foo.clean.txt). It skips |
| 12 | +duplicate files and handles errors gracefully. |
| 13 | +
|
| 14 | +Example: |
| 15 | + $ python cleanup-text.py file1.txt file2.txt |
| 16 | + [✓] Cleaned: file1.txt → file1.clean.txt |
| 17 | + [✓] Cleaned: file2.txt → file2.clean.txt |
| 18 | +""" |
| 19 | + |
| 20 | +import argparse |
| 21 | +import os |
| 22 | +import re |
| 23 | +import sys |
| 24 | + |
| 25 | +# Check for unidecode dependency early, with a clear message if missing |
| 26 | +try: |
| 27 | + from unidecode import unidecode # noqa: F401 |
| 28 | +except ImportError: |
| 29 | + print( |
| 30 | + "[✗] Missing dependency: 'Unidecode'. Please install it with:\n" |
| 31 | + " pip install Unidecode\n" |
| 32 | + "Or install all requirements with:\n" |
| 33 | + " pip install -r requirements.txt", |
| 34 | + file=sys.stderr |
| 35 | + ) |
| 36 | + sys.exit(1) |
| 37 | + |
| 38 | + |
class CustomArgumentParser(argparse.ArgumentParser):
    """ArgumentParser that frames its help and exit messages with blank lines."""

    def print_help(self, file=None):
        """Write the help text to *file*, preceded and followed by a blank line.

        When *file* is None the help goes to ``sys.stderr``.
        """
        stream = sys.stderr if file is None else file
        print('', file=stream)  # leading blank line
        super().print_help(stream)
        print('', file=stream)  # trailing blank line

    def exit(self, status=0, message=None):
        """Terminate with *status*, first echoing *message* (if any) to stderr.

        The message is surrounded by blank lines; without a message nothing
        is printed before exiting.
        """
        if message:
            err = sys.stderr
            print('', file=err)  # leading blank line
            self._print_message(message, err)
            print('', file=err)  # trailing blank line
        sys.exit(status)
| 53 | + |
| 54 | + |
def clean_text(text: str, preserve_invisible: bool = False) -> str:
    """
    Normalize problematic or invisible Unicode characters to safe ASCII equivalents.

    Processing steps, in order:
      1. Smart quotes, the non-breaking hyphen (U+2011) and the en dash
         (U+2013) are mapped to their plain ASCII counterparts.
      2. Unless ``preserve_invisible`` is set, zero-width characters
         (U+200B, U+200C, U+200D, U+FEFF) are removed and the no-break space
         (U+00A0) becomes a regular space.
      3. Each em dash (U+2014) becomes '-'. If it already has whitespace on
         both sides that whitespace is kept; otherwise it is rendered as
         ' - '. Line breaks next to the dash are never consumed.
      4. Spaces and tabs directly before a line break are stripped.

    Args:
        text (str): The input text containing Unicode characters
        preserve_invisible (bool): If True, do not remove invisible characters

    Returns:
        str: The cleaned text with normalized ASCII characters
    """
    table = {
        '\u2018': "'", '\u2019': "'",  # Smart single quotes
        '\u201C': '"', '\u201D': '"',  # Smart double quotes
        '\u2011': '-',                 # Non-breaking hyphen
        '\u2013': '-',                 # En dash, spacing untouched
    }
    if not preserve_invisible:
        # Zero-width characters and the BOM have no width: delete them.
        table.update(dict.fromkeys('\u200B\u200C\u200D\uFEFF'))
        # A no-break space still separates words; deleting it would glue
        # them together ("10\u00a0km" -> "10km"), so use a plain space.
        table['\u00A0'] = ' '
    # One pass over the text for all single-character substitutions. This
    # runs before the em-dash step so that converted spaces count as the
    # dash's surrounding whitespace.
    text = text.translate(str.maketrans(table))

    def em_dash_replacer(match):
        before, after = match.group(1), match.group(2)
        if before and after:
            # Already spaced on both sides: keep the author's spacing.
            return before + '-' + after
        return ' - '

    # [^\S\r\n] = any whitespace except CR/LF. Using plain \s here would
    # swallow line breaks and merge "foo—\nbar" into a single line.
    text = re.sub(r'([^\S\r\n]*)\u2014([^\S\r\n]*)', em_dash_replacer, text)

    # Remove trailing spaces/tabs before each line break (LF or CRLF).
    text = re.sub(r'[ \t]+(\r?\n)', r'\1', text)

    return text
| 94 | + |
| 95 | + |
def ensure_single_newline(text: str) -> str:
    """
    Return *text* terminated by exactly one '\\n'.

    Any run of trailing CR/LF characters is dropped first, then a single
    line feed is appended. Used for all text files.
    """
    body = text.rstrip('\r\n')
    return f"{body}\n"
| 101 | + |
| 102 | + |
def _finalize_text(raw, preserve_invisible, no_newline):
    """Clean *raw* and apply the end-of-text newline policy."""
    cleaned = clean_text(raw, preserve_invisible=preserve_invisible)
    if no_newline:
        # -n/--no-newline: strip every trailing line break instead.
        return cleaned.rstrip('\r\n')
    return ensure_single_newline(cleaned)


def _read_text(path):
    """Read *path* as UTF-8, substituting U+FFFD for undecodable bytes."""
    with open(path, "r", encoding="utf-8", errors="replace") as f:
        return f.read()


def _write_text(path, text):
    """Write *text* to *path* as UTF-8, replacing any existing file."""
    with open(path, "w", encoding="utf-8") as f:
        f.write(text)


def _clean_in_place(infile, args):
    """Clean *infile* in place, keeping the original as ``<infile>.tmp`` meanwhile."""
    tmpfile = infile + ".tmp"
    os.rename(infile, tmpfile)
    try:
        cleaned = _finalize_text(_read_text(tmpfile), args.invisible, args.no_newline)
        _write_text(infile, cleaned)
    except Exception:
        # Do not leave the user's file stranded under the .tmp name (with a
        # possibly partial file at the original name): put the original back.
        os.replace(tmpfile, infile)
        raise
    print(f"[✓] Cleaned (in-place): {infile}")
    if args.preserve_tmp:
        print(f"[i] Preserved temp file: {tmpfile}")
    else:
        os.remove(tmpfile)


def main():
    """
    Main function that handles command-line interface and file processing.

    Modes:
      * no input files: read STDIN, write the cleaned text to STDOUT;
      * ``-t``: clean each input file in place (original kept as ``.tmp``
        while working, deleted afterwards unless ``-p`` is given);
      * ``-o -``: write every cleaned input to STDOUT;
      * ``-o FILE``: write the single input's cleaned text to FILE;
      * otherwise: write ``<base>.clean<ext>`` next to each input.

    A failure on one file is reported on STDERR and processing continues
    with the next file; the process then exits with status 1 if any file
    failed. Duplicate inputs (same resolved path) are skipped.
    """
    parser = CustomArgumentParser(
        description=(
            "Clean Unicode quirks from text.\n"
            "If no input files are given, reads from STDIN and writes to STDOUT (filter mode).\n"
            "If input files are given, creates cleaned files with .clean before the extension "
            "(e.g., foo.txt -> foo.clean.txt).\n"
            "Use -o - to force output to STDOUT for all input files, or -o <file> to specify a single output file "
            "(only with one input file)."
        ),
        epilog="\n"
    )
    parser.add_argument("infile", nargs="*", help="Input file(s)")
    parser.add_argument(
        "-i", "--invisible",
        action="store_true",
        help="Preserve invisible Unicode characters (zero-width, non-breaking, etc.)"
    )
    parser.add_argument(
        "-o", "--output",
        help="Output file name, or '-' for STDOUT. Only valid with one input file, or use '-' for STDOUT with multiple files."
    )
    parser.add_argument(
        "-t", "--temp",
        action="store_true",
        help=(
            "In-place cleaning:\n"
            "  Move each input file to .tmp, clean it, write cleaned output to original name,\n"
            "  and delete .tmp after success."
        )
    )
    parser.add_argument(
        "-p", "--preserve-tmp",
        action="store_true",
        help=(
            "With -t, preserve the .tmp file after cleaning (do not delete it).\n"
            "  Useful for backup or manual recovery."
        )
    )
    parser.add_argument(
        "-n", "--no-newline",
        action="store_true",
        help="Do not add a newline at the end of the output file (suppress final newline)."
    )
    args = parser.parse_args()

    if not args.infile:
        # No files provided: filter mode (STDIN to STDOUT)
        sys.stdout.write(
            _finalize_text(sys.stdin.read(), args.invisible, args.no_newline)
        )
        return

    if args.output and args.output != '-' and len(args.infile) > 1:
        print(
            "[✗] -o/--output with a filename is only allowed when processing a single input file.",
            file=sys.stderr
        )
        sys.exit(1)

    seen = set()
    failures = 0
    for infile in args.infile:
        # Compare resolved paths so "a.txt" and "./a.txt" count as one file.
        key = os.path.realpath(infile)
        if key in seen:
            # Diagnostics go to STDERR so they never mix with "-o -" output.
            print(f"[!] Skipping duplicate: {infile}", file=sys.stderr)
            continue
        seen.add(key)

        try:
            if args.temp:
                _clean_in_place(infile, args)
                continue

            cleaned = _finalize_text(
                _read_text(infile), args.invisible, args.no_newline
            )

            if args.output == '-':
                sys.stdout.write(cleaned)
                continue

            if args.output:
                outfile = args.output
            else:
                base, ext = os.path.splitext(infile)
                outfile = f"{base}.clean{ext}"

            _write_text(outfile, cleaned)
            print(f"[✓] Cleaned: {infile} → {outfile}")
        except Exception as e:
            # Per-file boundary: report the problem and keep going with the
            # remaining inputs rather than aborting the whole run.
            failures += 1
            print(f"[✗] Failed to process {infile}: {e}", file=sys.stderr)

    if failures:
        # Let shells and scripts detect that at least one file was not cleaned.
        sys.exit(1)
| 223 | + |
| 224 | + |
| 225 | +if __name__ == '__main__': |
| 226 | + main() |
0 commit comments