|
22 | 22 | import re |
23 | 23 | import sys |
24 | 24 |
|
25 | | -# Check for unidecode dependency early, with a clear message if missing |
| 25 | +# Check for ftfy dependency early, with a clear message if missing |
26 | 26 | try: |
27 | | - from unidecode import unidecode # noqa: F401 |
| 27 | + import ftfy |
28 | 28 | except ImportError: |
29 | 29 | print( |
30 | | - "[✗] Missing dependency: 'Unidecode'. Please install it with:\n" |
31 | | - " pip install Unidecode\n" |
| 30 | + "[✗] Missing dependency: 'ftfy'. Please install it with:\n" |
| 31 | + " pip install ftfy\n" |
32 | 32 | "Or install all requirements with:\n" |
33 | 33 | " pip install -r requirements.txt", |
34 | 34 | file=sys.stderr |
@@ -63,6 +63,10 @@ def clean_text(text: str, preserve_invisible: bool = False) -> str: |
63 | 63 | Returns: |
64 | 64 | str: The cleaned text with normalized ASCII characters |
65 | 65 | """ |
| 66 | + # Use ftfy for intelligent text fixing and normalization |
| 67 | + text = ftfy.fix_text(text) |
| 68 | + |
| 69 | + # Handle specific cases that ftfy might not handle perfectly |
66 | 70 | replacements = { |
67 | 71 | '\u2018': "'", '\u2019': "'", # Smart single quotes |
68 | 72 | '\u201C': '"', '\u201D': '"', # Smart double quotes |
@@ -153,11 +157,14 @@ def main(): |
153 | 157 | # No files provided: filter mode (STDIN to STDOUT) |
154 | 158 | raw = sys.stdin.read() |
155 | 159 | cleaned = clean_text(raw, preserve_invisible=args.invisible) |
156 | | - # Add or suppress newline at EOF based on -n/--no-newline |
| 160 | + |
| 161 | + # Handle newline at EOF based on -n/--no-newline |
157 | 162 | if not args.no_newline: |
158 | | - cleaned = ensure_single_newline(cleaned) |
159 | | - else: |
160 | | - cleaned = cleaned.rstrip('\r\n') |
| 163 | + # Only add newline if there isn't one already |
| 164 | + if not cleaned.endswith('\n'): |
| 165 | + cleaned += '\n' |
| 166 | + # If --no-newline is specified, leave the output exactly as is (no changes to newlines) |
| 167 | + |
161 | 168 | sys.stdout.write(cleaned) |
162 | 169 | return |
163 | 170 |
|
|
0 commit comments