@@ -155,7 +155,9 @@ def _hash_text(s: str) -> str:
155155
156156# Ampersand-encoding based entirely on Nat Irons's Amputator MT plugin:
157157# http://bumppo.net/projects/amputator/
158- _AMPERSAND_RE = re .compile (r'&(?!#?[xX]?(?:[0-9a-fA-F]+|\w+);)' )
158+ _AMPERSAND_BODY_RE = r'#?[xX]?(?:[0-9a-fA-F]+|\w+);'
159+ _AMPERSAND_RE = re .compile (r'&(?!%s)' % _AMPERSAND_BODY_RE )
160+ _ESCAPED_AMPERSAND_RE = re .compile (r'(?:\\\\)*\\&(%s)' % _AMPERSAND_BODY_RE )
159161
160162
161163# ---- exceptions
@@ -1287,6 +1289,10 @@ def _run_span_gamut(self, text: str) -> str:
12871289 )
12881290 """ , re .X )
12891291
1292+ # regex that checks that the start of a string is NOT escaped
1293+ # it does this by matching pairs of `\` chars and checking that they're NOT followed by another `\`
1294+ _is_unescaped_re = re .compile (r'^((?:\\\\)*(?!\\))' )
1295+
# NOTE(review): reconstructed from a line-number-mangled diff; in the real
# file this is a method of the Markdown class. The scrape also HTML-entity-
# decoded the replacement strings below; '&lt;'/'&gt;' are presumably the
# originals (a backslash-escaped angle bracket must render as literal text,
# not open a tag) -- confirm against upstream markdown2.
@mark_stage(Stage.ESCAPE_SPECIAL)
def _escape_special_chars(self, text: str) -> str:
    # Python markdown note: the HTML tokenization here differs from
    # Markdown.pl's (two comment lines were lost between diff hunks in
    # the scrape -- restore from upstream). Note, however, that '>' is
    # not allowed in an auto-link URL here.
    escaped = []
    is_html_markup = False
    for token in self._sorta_html_tokenize_re.split(text):
        # check token is preceded by 0 or more PAIRS of escapes, because
        # escape pairs escape themselves and don't affect the token
        if is_html_markup and self._is_unescaped_re.match(token):
            # Within tags/HTML-comments/auto-links, encode * and _
            # so they don't conflict with their use in Markdown for
            # italics and strong. We're replacing each such
            # character with its corresponding MD5 checksum value;
            # this is likely overkill, but it should prevent us from
            # colliding with the escape values by accident.
            escape_seq, token = self._is_unescaped_re.split(token)[1:] or ('', token)
            escaped.append(
                escape_seq.replace('\\\\', self._escape_table['\\'])
                + token.replace('*', self._escape_table['*'])
                       .replace('_', self._escape_table['_'])
            )
        else:
            escaped.append(
                self._encode_backslash_escapes(
                    token.replace('\\<', '&lt;').replace('\\>', '&gt;')
                )
            )
        # tokens from the tokenizing split alternate text / markup
        is_html_markup = not is_html_markup
    return ''.join(escaped)
13211330
@@ -1351,20 +1360,32 @@ def _is_comment(token):
13511360
13521361 tokens = []
13531362 split_tokens = self ._sorta_html_tokenize_re .split (text )
1354- is_html_markup = False
1355- for index , token in enumerate (split_tokens ):
1356- if is_html_markup and not self ._is_auto_link (token ) and not _is_code_span (index , token ):
1363+ index = 0
1364+ while index < len (split_tokens ):
1365+ is_html_markup = index % 2 != 0
1366+ token = split_tokens [index ]
1367+ is_code = _is_code_span (index , token )
1368+
1369+ if is_html_markup and not self ._is_auto_link (token ) and not is_code :
13571370 is_comment = _is_comment (token )
13581371 if is_comment :
13591372 tokens .append (self ._hash_span (self ._sanitize_html (is_comment .group (1 ))))
13601373 # sanitise but leave comment body intact for further markdown processing
13611374 tokens .append (self ._sanitize_html (is_comment .group (2 )))
13621375 tokens .append (self ._hash_span (self ._sanitize_html (is_comment .group (3 ))))
1376+ elif self ._is_unescaped_re .match (token ) is None :
1377+ # if the HTML is escaped then escape any special chars and add the token as-is
1378+ tokens .append (self ._escape_special_chars (token ))
13631379 else :
13641380 tokens .append (self ._hash_span (self ._sanitize_html (token )))
1381+ elif is_html_markup and is_code :
1382+ # code span contents are hashed, so should be safe to just add directly
1383+ tokens .extend (split_tokens [index : index + 3 ])
1384+ index += 3
1385+ continue
13651386 else :
13661387 tokens .append (self ._encode_incomplete_tags (token ))
1367- is_html_markup = not is_html_markup
1388+ index += 1
13681389 return '' .join (tokens )
13691390
13701391 def _unhash_html_spans (self , text : str , spans = True , code = False ) -> str :
@@ -2187,6 +2208,7 @@ def _encode_amps_and_angles(self, text: str) -> str:
21872208 # Smart processing for ampersands and angle brackets that need
21882209 # to be encoded.
21892210 text = _AMPERSAND_RE .sub ('&' , text )
2211+ text = _ESCAPED_AMPERSAND_RE .sub (r'&\1' , text )
21902212
21912213 # Encode naked <'s
21922214 text = self ._naked_lt_re .sub ('<' , text )
@@ -2206,10 +2228,25 @@ def _encode_incomplete_tags(self, text: str) -> str:
22062228 if self ._is_auto_link (text ):
22072229 return text # this is not an incomplete tag, this is a link in the form <http://x.y.z>
22082230
2231+ # protect code blocks. code blocks may have stuff like `C:\<folder>` in which is NOT a tag
2232+ # and will get encoded anyway in _encode_code
2233+ hashes = {}
2234+ for span in self ._code_span_re .findall (text ):
2235+ # the regex matches 2 groups: the syntax and the context. Reconstruct the entire match for easier processing
2236+ span = span [0 ] + span [1 ] + span [0 ]
2237+ hashed = _hash_text (span )
2238+ hashes [hashed ] = span
2239+ text = text .replace (span , hashed )
2240+
22092241 def incomplete_tags_sub (match ):
22102242 return match .group ().replace ('<' , '<' )
22112243
2212- return self ._incomplete_tags_re .sub (incomplete_tags_sub , text )
2244+ text = self ._incomplete_tags_re .sub (incomplete_tags_sub , text )
2245+
2246+ for hashed , original in hashes .items ():
2247+ text = text .replace (hashed , original )
2248+
2249+ return text
22132250
22142251 def _encode_backslash_escapes (self , text : str ) -> str :
22152252 for ch , escape in list (self ._escape_table .items ()):
@@ -3047,8 +3084,10 @@ def test(self, text):
30473084 if '```' not in text :
30483085 return False
30493086 if self .md .stage == Stage .PREPROCESS and not self .md .safe_mode :
3087+ # if safe mode is off then run before HASH_HTML and not worry about the tags getting messed up
30503088 return True
30513089 if self .md .stage == Stage .LINK_DEFS and self .md .safe_mode :
3090+ # if safe mode is on then run after HASH_HTML is done
30523091 return True
30533092 return self .md .stage == Stage .BLOCK_GAMUT
30543093
@@ -3127,7 +3166,19 @@ def sub(self, match: re.Match) -> str:
31273166
31283167 tags = self .tags (lexer_name )
31293168
3130- return "\n {}{}{}\n {}{}\n " .format (leading_indent , tags [0 ], codeblock , leading_indent , tags [1 ])
3169+ # when not in safe-mode, we convert fenced code blocks before Stage.HASH_HTML, which means the text
3170+ # ends up as `\n\nmd5-...\n\n`, thanks to the hashing stages adding in some newlines
3171+ # in safe mode, we run fenced code blocks AFTER the hashing, so we don't end up with that same
3172+ # `\n\n` wrap. We can correct that here
3173+ surrounding_newlines = '\n \n ' if self .md .safe_mode else '\n '
3174+
3175+ return (
3176+ f'{ surrounding_newlines } '
3177+ f'{ leading_indent } { tags [0 ]} '
3178+ f'{ codeblock } '
3179+ f'\n { leading_indent } { tags [1 ]} '
3180+ f'{ surrounding_newlines } '
3181+ )
31313182
# NOTE(review): reconstructed from a line-number-mangled diff; in the real
# file this is a method of the fenced-code-blocks extra class.
def run(self, text):
    """Replace every fenced code block in *text* using self.sub."""
    return self.fenced_code_block_re.sub(self.sub, text)
0 commit comments