Skip to content

Commit ff74caf

Browse files
authored
Merge pull request #3200 from kena0ki/issue3032
Filter U+FEFF (BOM) when decoding input data
2 parents 4da5d55 + 3d941f2 commit ff74caf

File tree

2 files changed: +42 additions, -8 deletions

src/common/input/TextDecoder.test.ts

Lines changed: 34 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -58,8 +58,8 @@ describe('text encodings', () => {
5858
const decoder = new StringToUtf32();
5959
const target = new Uint32Array(5);
6060
for (let i = 0; i < 65536; ++i) {
61-
// skip surrogate pairs
62-
if (i >= 0xD800 && i <= 0xDFFF) {
61+
// skip surrogate pairs and a BOM
62+
if ((i >= 0xD800 && i <= 0xDFFF) || i === 0xFEFF) {
6363
continue;
6464
}
6565
const length = decoder.decode(String.fromCharCode(i), target);
@@ -84,6 +84,14 @@ describe('text encodings', () => {
8484
decoder.clear();
8585
}
8686
});
87+
88+
it('0xFEFF(BOM)', () => {
89+
const decoder = new StringToUtf32();
90+
const target = new Uint32Array(5);
91+
const length = decoder.decode(String.fromCharCode(0xFEFF), target);
92+
assert.equal(length, 0);
93+
decoder.clear();
94+
});
8795
});
8896

8997
it('test strings', () => {
@@ -118,8 +126,8 @@ describe('text encodings', () => {
118126
const decoder = new Utf8ToUtf32();
119127
const target = new Uint32Array(5);
120128
for (let i = 0; i < 65536; ++i) {
121-
// skip surrogate pairs
122-
if (i >= 0xD800 && i <= 0xDFFF) {
129+
// skip surrogate pairs and a BOM
130+
if ((i >= 0xD800 && i <= 0xDFFF) || i === 0xFEFF) {
123131
continue;
124132
}
125133
const utf8Data = fromByteString(encode(String.fromCharCode(i)));
@@ -142,6 +150,15 @@ describe('text encodings', () => {
142150
decoder.clear();
143151
}
144152
});
153+
154+
it('0xFEFF(BOM)', () => {
155+
const decoder = new Utf8ToUtf32();
156+
const target = new Uint32Array(5);
157+
const utf8Data = fromByteString(encode(String.fromCharCode(0xFEFF)));
158+
const length = decoder.decode(utf8Data, target);
159+
assert.equal(length, 0);
160+
decoder.clear();
161+
});
145162
});
146163

147164
it('test strings', () => {
@@ -215,6 +232,19 @@ describe('text encodings', () => {
215232
}
216233
assert(decoded, 'Ä€𝄞Ö𝄞€Ü𝄞€');
217234
});
235+
236+
it('BOMs (3 byte sequences) - advance by 2', () => {
237+
const decoder = new Utf8ToUtf32();
238+
const target = new Uint32Array(5);
239+
const utf8Data = fromByteString('\xef\xbb\xbf\xef\xbb\xbf');
240+
let decoded = '';
241+
for (let i = 0; i < utf8Data.length; i += 2) {
242+
const written = decoder.decode(utf8Data.slice(i, i + 2), target);
243+
decoded += toString(target, written);
244+
}
245+
assert.equal(decoded, '');
246+
});
247+
218248
it('test break after 3 bytes - issue #2495', () => {
219249
const decoder = new Utf8ToUtf32();
220250
const target = new Uint32Array(5);

src/common/input/TextDecoder.ts

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,10 @@ export class StringToUtf32 {
105105
}
106106
continue;
107107
}
108+
if (code === 0xFEFF) {
109+
// BOM
110+
continue;
111+
}
108112
target[size++] = code;
109113
}
110114
return size;
@@ -188,8 +192,8 @@ export class Utf8ToUtf32 {
188192
target[size++] = cp;
189193
}
190194
} else if (type === 3) {
191-
if (cp < 0x0800 || (cp >= 0xD800 && cp <= 0xDFFF)) {
192-
// illegal codepoint
195+
if (cp < 0x0800 || (cp >= 0xD800 && cp <= 0xDFFF) || cp === 0xFEFF) {
196+
// illegal codepoint or BOM
193197
} else {
194198
target[size++] = cp;
195199
}
@@ -286,8 +290,8 @@ export class Utf8ToUtf32 {
286290
continue;
287291
}
288292
codepoint = (byte1 & 0x0F) << 12 | (byte2 & 0x3F) << 6 | (byte3 & 0x3F);
289-
if (codepoint < 0x0800 || (codepoint >= 0xD800 && codepoint <= 0xDFFF)) {
290-
// illegal codepoint, no i-- here
293+
if (codepoint < 0x0800 || (codepoint >= 0xD800 && codepoint <= 0xDFFF) || codepoint === 0xFEFF) {
294+
// illegal codepoint or BOM, no i-- here
291295
continue;
292296
}
293297
target[size++] = codepoint;

0 commit comments

Comments
 (0)