Merge pull request #3200 from kena0ki/issue3032

jerch · web-flow · commit ff74caf50d6d · 2021-01-23T13:03:57.000+01:00
Filter U+FEFF (BOM) when decoding input data
diff --git a/src/common/input/TextDecoder.test.ts b/src/common/input/TextDecoder.test.ts
@@ -58,8 +58,8 @@ describe('text encodings', () => {
         const decoder = new StringToUtf32();
         const target = new Uint32Array(5);
         for (let i = 0; i < 65536; ++i) {
-          // skip surrogate pairs
-          if (i >= 0xD800 && i <= 0xDFFF) {
+          // skip surrogate pairs and a BOM
+          if ((i >= 0xD800 && i <= 0xDFFF) || i === 0xFEFF) {
             continue;
           }
           const length = decoder.decode(String.fromCharCode(i), target);
@@ -84,6 +84,14 @@ describe('text encodings', () => {
           decoder.clear();
         }
       });
+
+      it('0xFEFF(BOM)', () => {
+        const decoder = new StringToUtf32();
+        const target = new Uint32Array(5);
+        const length = decoder.decode(String.fromCharCode(0xFEFF), target);
+        assert.equal(length, 0);
+        decoder.clear();
+      });
     });
 
     it('test strings', () => {
@@ -118,8 +126,8 @@ describe('text encodings', () => {
         const decoder = new Utf8ToUtf32();
         const target = new Uint32Array(5);
         for (let i = 0; i < 65536; ++i) {
-          // skip surrogate pairs
-          if (i >= 0xD800 && i <= 0xDFFF) {
+          // skip surrogate pairs and a BOM
+          if ((i >= 0xD800 && i <= 0xDFFF) || i === 0xFEFF) {
             continue;
           }
           const utf8Data = fromByteString(encode(String.fromCharCode(i)));
@@ -142,6 +150,15 @@ describe('text encodings', () => {
           decoder.clear();
         }
       });
+
+      it('0xFEFF(BOM)', () => {
+        const decoder = new Utf8ToUtf32();
+        const target = new Uint32Array(5);
+        const utf8Data = fromByteString(encode(String.fromCharCode(0xFEFF)));
+        const length = decoder.decode(utf8Data, target);
+        assert.equal(length, 0);
+        decoder.clear();
+      });
     });
 
     it('test strings', () => {
@@ -215,6 +232,19 @@ describe('text encodings', () => {
         }
         assert(decoded, 'Ä€𝄞Ö𝄞€Ü𝄞€');
       });
+
+      it('BOMs (3 byte sequences) - advance by 2', () => {
+        const decoder = new Utf8ToUtf32();
+        const target = new Uint32Array(5);
+        const utf8Data = fromByteString('\xef\xbb\xbf\xef\xbb\xbf');
+        let decoded = '';
+        for (let i = 0; i < utf8Data.length; i += 2) {
+          const written = decoder.decode(utf8Data.slice(i, i + 2), target);
+          decoded += toString(target, written);
+        }
+        assert.equal(decoded, '');
+      });
+
       it('test break after 3 bytes - issue #2495', () => {
         const decoder = new Utf8ToUtf32();
         const target = new Uint32Array(5);
diff --git a/src/common/input/TextDecoder.ts b/src/common/input/TextDecoder.ts
@@ -105,6 +105,10 @@ export class StringToUtf32 {
         }
         continue;
       }
+      if (code === 0xFEFF) {
+        // BOM
+        continue;
+      }
       target[size++] = code;
     }
     return size;
@@ -188,8 +192,8 @@ export class Utf8ToUtf32 {
             target[size++] = cp;
           }
         } else if (type === 3) {
-          if (cp < 0x0800 || (cp >= 0xD800 && cp <= 0xDFFF)) {
-            // illegal codepoint
+          if (cp < 0x0800 || (cp >= 0xD800 && cp <= 0xDFFF) || cp === 0xFEFF) {
+            // illegal codepoint or BOM
           } else {
             target[size++] = cp;
           }
@@ -286,8 +290,8 @@ export class Utf8ToUtf32 {
           continue;
         }
         codepoint = (byte1 & 0x0F) << 12 | (byte2 & 0x3F) << 6 | (byte3 & 0x3F);
-        if (codepoint < 0x0800 || (codepoint >= 0xD800 && codepoint <= 0xDFFF)) {
-          // illegal codepoint, no i-- here
+        if (codepoint < 0x0800 || (codepoint >= 0xD800 && codepoint <= 0xDFFF) || codepoint === 0xFEFF) {
+          // illegal codepoint or BOM, no i-- here
           continue;
         }
         target[size++] = codepoint;

Original file line number	Diff line number	Diff line change
`@@ -105,6 +105,10 @@ export class StringToUtf32 {`
`105`	`105`	`}`
`106`	`106`	`continue;`
`107`	`107`	`}`
	`108`	`+ if (code === 0xFEFF) {`
	`109`	`+ // BOM`
	`110`	`+ continue;`
	`111`	`+ }`
`108`	`112`	`target[size++] = code;`
`109`	`113`	`}`
`110`	`114`	`return size;`
`@@ -188,8 +192,8 @@ export class Utf8ToUtf32 {`
`188`	`192`	`target[size++] = cp;`
`189`	`193`	`}`
`190`	`194`	`} else if (type === 3) {`
`191`		`- if (cp < 0x0800 \|\| (cp >= 0xD800 && cp <= 0xDFFF)) {`
`192`		`- // illegal codepoint`
	`195`	`+ if (cp < 0x0800 \|\| (cp >= 0xD800 && cp <= 0xDFFF) \|\| cp === 0xFEFF) {`
	`196`	`+ // illegal codepoint or BOM`
`193`	`197`	`} else {`
`194`	`198`	`target[size++] = cp;`
`195`	`199`	`}`
`@@ -286,8 +290,8 @@ export class Utf8ToUtf32 {`
`286`	`290`	`continue;`
`287`	`291`	`}`
`288`	`292`	`codepoint = (byte1 & 0x0F) << 12 \| (byte2 & 0x3F) << 6 \| (byte3 & 0x3F);`
`289`		`- if (codepoint < 0x0800 \|\| (codepoint >= 0xD800 && codepoint <= 0xDFFF)) {`
`290`		`- // illegal codepoint, no i-- here`
	`293`	`+ if (codepoint < 0x0800 \|\| (codepoint >= 0xD800 && codepoint <= 0xDFFF) \|\| codepoint === 0xFEFF) {`
	`294`	`+ // illegal codepoint or BOM, no i-- here`
`291`	`295`	`continue;`
`292`	`296`	`}`
`293`	`297`	`target[size++] = codepoint;`