|
21 | 21 | import os |
22 | 22 | import re |
23 | 23 | import sys |
| 24 | +import unicodedata |
24 | 25 |
|
25 | 26 | # Check for ftfy dependency early, with a clear message if missing |
26 | 27 | try: |
@@ -66,15 +67,62 @@ def clean_text(text: str, preserve_invisible: bool = False) -> str: |
66 | 67 | # Use ftfy for intelligent text fixing and normalization |
67 | 68 | text = ftfy.fix_text(text) |
68 | 69 |
|
69 | | - # Handle specific cases that unidecode might not handle perfectly |
| 70 | + # For shell-friendliness, map specific characters that ftfy might leave as non-ASCII to ASCII equivalents |
70 | 71 | replacements = { |
71 | | - '\u2018': "'", '\u2019': "'", # Smart single quotes |
72 | | - '\u201C': '"', '\u201D': '"', # Smart double quotes |
73 | | - '\u2011': '-', # Non-breaking hyphen to regular hyphen |
| 72 | + # Smart/apostrophe variants → ' |
| 73 | + '\u2018': "'", # left single quotation mark |
| 74 | + '\u2019': "'", # right single quotation mark |
| 75 | + '\u201B': "'", # single high-reversed-9 quotation mark |
| 76 | + '\u201A': "'", # single low-9 quotation mark |
| 77 | + '\u2039': "'", # single left-pointing angle quotation mark |
| 78 | + '\u203A': "'", # single right-pointing angle quotation mark |
| 79 | + '\u02BC': "'", # modifier letter apostrophe |
| 80 | + '\uFF07': "'", # fullwidth apostrophe |
| 81 | + |
| 82 | + # Double-quote variants → " |
| 83 | + '\u201C': '"', # left double quotation mark |
| 84 | + '\u201D': '"', # right double quotation mark |
| 85 | + '\u201E': '"', # double low-9 quotation mark |
| 86 | + '\u201F': '"', # double high-reversed-9 quotation mark |
| 87 | + '\u00AB': '"', # left-pointing double angle quotation mark |
| 88 | + '\u00BB': '"', # right-pointing double angle quotation mark |
| 89 | + '\uFF02': '"', # fullwidth quotation mark |
| 90 | + |
| 91 | + # Non-breaking hyphen → ASCII hyphen-minus |
| 92 | + '\u2011': '-', |
74 | 93 | } |
75 | 94 | for orig, repl in replacements.items(): |
76 | 95 | text = text.replace(orig, repl) |
77 | 96 |
|
| 97 | + # Fallback: map any remaining initial/final quote punctuation (Unicode categories Pi/Pf) to an ASCII quote |
| 98 | + single_like = { |
| 99 | + '\u2018', '\u2019', '\u201B', '\u201A', # various single quotes |
| 100 | + '\u2039', '\u203A', # angle single |
| 101 | + '\u02BC', '\uFF07', # apostrophes |
| 102 | + '\u2032', '\u2035', # prime marks often used like ' |
| 103 | + } |
| 104 | + double_like = { |
| 105 | + '\u201C', '\u201D', '\u201E', '\u201F', # various double quotes |
| 106 | + '\u00AB', '\u00BB', # angle double |
| 107 | + '\uFF02', # fullwidth double quote |
| 108 | + '\u2033', '\u2036', # double prime marks often used like " |
| 109 | + } |
| 110 | + mapped_chars = [] |
| 111 | + for ch in text: |
| 112 | + cat = unicodedata.category(ch) |
| 113 | + if cat in ("Pi", "Pf"): |
| 114 | + cp = f"\\u{ord(ch):04X}" |
| 115 | + if cp in single_like: |
| 116 | + mapped_chars.append("'") |
| 117 | + elif cp in double_like: |
| 118 | + mapped_chars.append('"') |
| 119 | + else: |
| 120 | + # Default quote fallback to double quote |
| 121 | + mapped_chars.append('"') |
| 122 | + else: |
| 123 | + mapped_chars.append(ch) |
| 124 | + text = ''.join(mapped_chars) |
| 125 | + |
78 | 126 | # Replace EM dashes (U+2014) with space-dash-space, unless already surrounded by spaces |
79 | 127 | def em_dash_replacer(match): |
80 | 128 | before = match.group(1) |
@@ -111,21 +159,10 @@ def handle_newlines(text: str, no_newline: bool = False) -> str: |
111 | 159 | if no_newline: |
112 | 160 | return text # Leave exactly as is |
113 | 161 |
|
114 | | - # Check if running inside VS Code extension host (but not CI/CD pipeline) |
115 | | - vscode_extension = False |
116 | | - process_title = os.environ.get('VSCODE_PROCESS_TITLE', '') |
117 | | - app_insights = os.environ.get('APPLICATION_INSIGHTS_NO_DIAGNOSTIC_CHANNEL', '') |
118 | | - if process_title.startswith('extension-host') and app_insights != 'true': |
119 | | - vscode_extension = True |
120 | | - |
121 | 162 | # Only add newline if there isn't one already |
122 | 163 | if not text.endswith('\n'): |
123 | 164 | text += '\n' |
124 | 165 |
|
125 | | - # Add extra newline if running in VS Code extension host (to compensate for stripping) |
126 | | - if vscode_extension: |
127 | | - text += '\n' |
128 | | - |
129 | 166 | return text |
130 | 167 |
|
131 | 168 |
|
@@ -182,7 +219,19 @@ def main(): |
182 | 219 | # No files provided: filter mode (STDIN to STDOUT) |
183 | 220 | raw = sys.stdin.read() |
184 | 221 | cleaned = clean_text(raw, preserve_invisible=args.invisible) |
| 222 | + |
| 223 | + # Check if running inside VS Code extension host (but not CI/CD pipeline) |
| 224 | + vscode_extension = False |
| 225 | + process_title = os.environ.get('VSCODE_PROCESS_TITLE', '') |
| 226 | + app_insights = os.environ.get('APPLICATION_INSIGHTS_NO_DIAGNOSTIC_CHANNEL', '') |
| 227 | + if process_title.startswith('extension-host') and app_insights != 'true': |
| 228 | + vscode_extension = True |
| 229 | + |
| 230 | + # Base newline handling |
185 | 231 | cleaned = handle_newlines(cleaned, args.no_newline) |
| 232 | + # VS Code compensation only in filter mode |
| 233 | + if not args.no_newline and vscode_extension: |
| 234 | + cleaned += '\n' |
186 | 235 | sys.stdout.write(cleaned) |
187 | 236 | return |
188 | 237 |
|
|
0 commit comments