Skip to content

Commit d8b4602

Browse files
committed
20250812_00 patch - Expand Unicode quote normalization; refine VS Code filter newline handling; preserve extended ASCII
1 parent aa9d1ac commit d8b4602

File tree

2 files changed

+71
-15
lines changed

2 files changed

+71
-15
lines changed

CHANGELOG.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,12 @@
11
# Changelog for UnicodeFix
22

3+
## 2025-08-12
4+
5+
### Minor patch
6+
- Expanded quote normalization: map additional Unicode quote, prime, angle, and fullwidth marks to the ASCII characters `'` and `"` for shell-safe output
7+
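- Refined VS Code newline handling: the extra trailing newline that compensates for the extension host's stripping is now added only in filter mode (STDIN to STDOUT), never in file-write modes, and is skipped when running in a CI/CD pipeline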
8+
- No breaking changes; behavior unchanged for already-clean inputs
9+
310
## 2025-07-28
411

512
### **Extended ASCII Preservation Fix**

bin/cleanup-text.py

Lines changed: 64 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
import os
2222
import re
2323
import sys
24+
import unicodedata
2425

2526
# Check for ftfy dependency early, with a clear message if missing
2627
try:
@@ -66,15 +67,62 @@ def clean_text(text: str, preserve_invisible: bool = False) -> str:
6667
# Use ftfy for intelligent text fixing and normalization
6768
text = ftfy.fix_text(text)
6869

69-
# Handle specific cases that unidecode might not handle perfectly
70+
# Handle specific cases that ftfy might not map to ASCII for shell-friendliness
7071
replacements = {
71-
'\u2018': "'", '\u2019': "'", # Smart single quotes
72-
'\u201C': '"', '\u201D': '"', # Smart double quotes
73-
'\u2011': '-', # Non-breaking hyphen to regular hyphen
72+
# Smart/apostrophe variants → '
73+
'\u2018': "'", # left single quotation mark
74+
'\u2019': "'", # right single quotation mark
75+
'\u201B': "'", # single high-reversed-9 quotation mark
76+
'\u201A': "'", # single low-9 quotation mark
77+
'\u2039': "'", # single left-pointing angle quotation mark
78+
'\u203A': "'", # single right-pointing angle quotation mark
79+
'\u02BC': "'", # modifier letter apostrophe
80+
'\uFF07': "'", # fullwidth apostrophe
81+
82+
# Double-quote variants → "
83+
'\u201C': '"', # left double quotation mark
84+
'\u201D': '"', # right double quotation mark
85+
'\u201E': '"', # double low-9 quotation mark
86+
'\u201F': '"', # double high-reversed-9 quotation mark
87+
'\u00AB': '"', # left-pointing double angle quotation mark
88+
'\u00BB': '"', # right-pointing double angle quotation mark
89+
'\uFF02': '"', # fullwidth quotation mark
90+
91+
# Non-breaking hyphen → ASCII hyphen-minus
92+
'\u2011': '-',
7493
}
7594
for orig, repl in replacements.items():
7695
text = text.replace(orig, repl)
7796

97+
# Fallback: map any remaining initial/final quote punctuation (Unicode categories Pi/Pf) to ASCII
98+
single_like = {
99+
'\u2018', '\u2019', '\u201B', '\u201A', # various single quotes
100+
'\u2039', '\u203A', # angle single
101+
'\u02BC', '\uFF07', # apostrophes
102+
'\u2032', '\u2035', # prime marks often used like '
103+
}
104+
double_like = {
105+
'\u201C', '\u201D', '\u201E', '\u201F', # various double quotes
106+
'\u00AB', '\u00BB', # angle double
107+
'\uFF02', # fullwidth double quote
108+
'\u2033', '\u2036', # double prime marks often used like "
109+
}
110+
mapped_chars = []
111+
for ch in text:
112+
cat = unicodedata.category(ch)
113+
if cat in ("Pi", "Pf"):
114+
cp = f"\\u{ord(ch):04X}"
115+
if cp in single_like:
116+
mapped_chars.append("'")
117+
elif cp in double_like:
118+
mapped_chars.append('"')
119+
else:
120+
# Any other initial/final quote character falls back to an ASCII double quote
121+
mapped_chars.append('"')
122+
else:
123+
mapped_chars.append(ch)
124+
text = ''.join(mapped_chars)
125+
78126
# Replace EM dashes (U+2014) with space-dash-space, unless already surrounded by spaces
79127
def em_dash_replacer(match):
80128
before = match.group(1)
@@ -111,21 +159,10 @@ def handle_newlines(text: str, no_newline: bool = False) -> str:
111159
if no_newline:
112160
return text # Leave exactly as is
113161

114-
# Check if running inside VS Code extension host (but not CI/CD pipeline)
115-
vscode_extension = False
116-
process_title = os.environ.get('VSCODE_PROCESS_TITLE', '')
117-
app_insights = os.environ.get('APPLICATION_INSIGHTS_NO_DIAGNOSTIC_CHANNEL', '')
118-
if process_title.startswith('extension-host') and app_insights != 'true':
119-
vscode_extension = True
120-
121162
# Only add newline if there isn't one already
122163
if not text.endswith('\n'):
123164
text += '\n'
124165

125-
# Add extra newline if running in VS Code extension host (to compensate for stripping)
126-
if vscode_extension:
127-
text += '\n'
128-
129166
return text
130167

131168

@@ -182,7 +219,19 @@ def main():
182219
# No files provided: filter mode (STDIN to STDOUT)
183220
raw = sys.stdin.read()
184221
cleaned = clean_text(raw, preserve_invisible=args.invisible)
222+
223+
# Check if running inside VS Code extension host (but not CI/CD pipeline)
224+
vscode_extension = False
225+
process_title = os.environ.get('VSCODE_PROCESS_TITLE', '')
226+
app_insights = os.environ.get('APPLICATION_INSIGHTS_NO_DIAGNOSTIC_CHANNEL', '')
227+
if process_title.startswith('extension-host') and app_insights != 'true':
228+
vscode_extension = True
229+
230+
# Base newline handling
185231
cleaned = handle_newlines(cleaned, args.no_newline)
232+
# VS Code compensation only in filter mode
233+
if not args.no_newline and vscode_extension:
234+
cleaned += '\n'
186235
sys.stdout.write(cleaned)
187236
return
188237

0 commit comments

Comments
 (0)