diff --git a/CHANGES.md b/CHANGES.md index 60c03486..c1c77f13 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -5,6 +5,7 @@ - [pull #639] Fix middle-word-em interfering with strongs (#637) - [pull #640] Fix code friendly extra stopping other syntax being processed (#638) - [pull #644] Fix a number of em/strong issues (#641, #642, #643) +- [pull #659] Fix a number of safemode issues (#647) ## python-markdown2 2.5.4 diff --git a/lib/markdown2.py b/lib/markdown2.py index cc926ab4..913559ba 100755 --- a/lib/markdown2.py +++ b/lib/markdown2.py @@ -155,7 +155,9 @@ def _hash_text(s: str) -> str: # Ampersand-encoding based entirely on Nat Irons's Amputator MT plugin: # http://bumppo.net/projects/amputator/ -_AMPERSAND_RE = re.compile(r'&(?!#?[xX]?(?:[0-9a-fA-F]+|\w+);)') +_AMPERSAND_BODY_RE = r'#?[xX]?(?:[0-9a-fA-F]+|\w+);' +_AMPERSAND_RE = re.compile(r'&(?!%s)' % _AMPERSAND_BODY_RE) +_ESCAPED_AMPERSAND_RE = re.compile(r'(?:\\\\)*\\&(%s)' % _AMPERSAND_BODY_RE) # ---- exceptions @@ -1287,6 +1289,10 @@ def _run_span_gamut(self, text: str) -> str: ) """, re.X) + # regex that checks that the start of a string is NOT escaped + # it does this by matching pairs of `\` chars and checking that they're NOT followed by another `\` + _is_unescaped_re = re.compile(r'^((?:\\\\)*(?!\\))') + @mark_stage(Stage.ESCAPE_SPECIAL) def _escape_special_chars(self, text: str) -> str: # Python markdown note: the HTML tokenization here differs from @@ -1295,27 +1301,30 @@ def _escape_special_chars(self, text: str) -> str: # it isn't susceptible to unmatched '<' and '>' in HTML tags). # Note, however, that '>' is not allowed in an auto-link URL # here. - lead_escape_re = re.compile(r'^((?:\\\\)*(?!\\))') escaped = [] is_html_markup = False for token in self._sorta_html_tokenize_re.split(text): # check token is preceded by 0 or more PAIRS of escapes, because escape pairs # escape themselves and don't affect the token - if is_html_markup and lead_escape_re.match(token): + if is_html_markup and self._is_unescaped_re.match(token): # Within tags/HTML-comments/auto-links, encode * and _ # so they don't conflict with their use in Markdown for # italics and strong. We're replacing each such # character with its corresponding MD5 checksum value; # this is likely overkill, but it should prevent us from # colliding with the escape values by accident. - escape_seq, token = lead_escape_re.split(token)[1:] or ('', token) + escape_seq, token = self._is_unescaped_re.split(token)[1:] or ('', token) escaped.append( escape_seq.replace('\\\\', self._escape_table['\\']) + token.replace('*', self._escape_table['*']) .replace('_', self._escape_table['_']) ) else: - escaped.append(self._encode_backslash_escapes(token.replace('\\<', '<'))) + escaped.append( + self._encode_backslash_escapes( + token.replace('\\<', '<').replace('\\>', '>') + ) + ) is_html_markup = not is_html_markup return ''.join(escaped) @@ -1351,20 +1360,32 @@ def _is_comment(token): tokens = [] split_tokens = self._sorta_html_tokenize_re.split(text) - is_html_markup = False - for index, token in enumerate(split_tokens): - if is_html_markup and not self._is_auto_link(token) and not _is_code_span(index, token): + index = 0 + while index < len(split_tokens): + is_html_markup = index % 2 != 0 + token = split_tokens[index] + is_code = _is_code_span(index, token) + + if is_html_markup and not self._is_auto_link(token) and not is_code: is_comment = _is_comment(token) if is_comment: tokens.append(self._hash_span(self._sanitize_html(is_comment.group(1)))) # sanitise but leave comment body intact for further markdown processing tokens.append(self._sanitize_html(is_comment.group(2))) tokens.append(self._hash_span(self._sanitize_html(is_comment.group(3)))) + elif self._is_unescaped_re.match(token) is None: + # if the HTML is escaped then escape any special chars and add the token as-is + tokens.append(self._escape_special_chars(token)) else: tokens.append(self._hash_span(self._sanitize_html(token))) + elif is_html_markup and is_code: + # code span contents are hashed, so should be safe to just add directly + tokens.extend(split_tokens[index: index + 3]) + index += 3 + continue else: tokens.append(self._encode_incomplete_tags(token)) - is_html_markup = not is_html_markup + index += 1 return ''.join(tokens) def _unhash_html_spans(self, text: str, spans=True, code=False) -> str: @@ -2187,6 +2208,7 @@ def _encode_amps_and_angles(self, text: str) -> str: # Smart processing for ampersands and angle brackets that need # to be encoded. text = _AMPERSAND_RE.sub('&', text) + text = _ESCAPED_AMPERSAND_RE.sub(r'&\1', text) # Encode naked <'s text = self._naked_lt_re.sub('<', text) @@ -2206,10 +2228,25 @@ def _encode_incomplete_tags(self, text: str) -> str: if self._is_auto_link(text): return text # this is not an incomplete tag, this is a link in the form + # protect code blocks. code blocks may have stuff like `C:\` in which is NOT a tag + # and will get encoded anyway in _encode_code + hashes = {} + for span in self._code_span_re.findall(text): + # the regex matches 2 groups: the syntax and the context. Reconstruct the entire match for easier processing + span = span[0] + span[1] + span[0] + hashed = _hash_text(span) + hashes[hashed] = span + text = text.replace(span, hashed) + def incomplete_tags_sub(match): return match.group().replace('<', '<') - return self._incomplete_tags_re.sub(incomplete_tags_sub, text) + text = self._incomplete_tags_re.sub(incomplete_tags_sub, text) + + for hashed, original in hashes.items(): + text = text.replace(hashed, original) + + return text def _encode_backslash_escapes(self, text: str) -> str: for ch, escape in list(self._escape_table.items()): @@ -3047,8 +3084,10 @@ def test(self, text): if '```' not in text: return False if self.md.stage == Stage.PREPROCESS and not self.md.safe_mode: + # if safe mode is off then run before HASH_HTML and not worry about the tags getting messed up return True if self.md.stage == Stage.LINK_DEFS and self.md.safe_mode: + # if safe mode is on then run after HASH_HTML is done return True return self.md.stage == Stage.BLOCK_GAMUT @@ -3127,7 +3166,19 @@ def sub(self, match: re.Match) -> str: tags = self.tags(lexer_name) - return "\n{}{}{}\n{}{}\n".format(leading_indent, tags[0], codeblock, leading_indent, tags[1]) + # when not in safe-mode, we convert fenced code blocks before Stage.HASH_HTML, which means the text + # ends up as `\n\nmd5-...\n\n`, thanks to the hashing stages adding in some newlines + # in safe mode, we run fenced code blocks AFTER the hashing, so we don't end up with that same + # `\n\n` wrap. We can correct that here + surrounding_newlines = '\n\n' if self.md.safe_mode else '\n' + + return ( + f'{surrounding_newlines}' + f'{leading_indent}{tags[0]}' + f'{codeblock}' + f'\n{leading_indent}{tags[1]}' + f'{surrounding_newlines}' + ) def run(self, text): return self.fenced_code_block_re.sub(self.sub, text) diff --git a/test/markdowntest-cases/Backslash escapes.html b/test/markdowntest-cases/Backslash escapes.html index 77823c3c..5658c795 100644 --- a/test/markdowntest-cases/Backslash escapes.html +++ b/test/markdowntest-cases/Backslash escapes.html @@ -20,7 +20,7 @@

Right paren: )

-

Greater-than: >

+

Greater-than: >

Hash: #

diff --git a/test/tm-cases/escaped_ampersands.html b/test/tm-cases/escaped_ampersands.html new file mode 100644 index 00000000..23bc7879 --- /dev/null +++ b/test/tm-cases/escaped_ampersands.html @@ -0,0 +1,5 @@ +

&amp; +&lt; +&gt; +&quot; +&#8217;

diff --git a/test/tm-cases/escaped_ampersands.text b/test/tm-cases/escaped_ampersands.text new file mode 100644 index 00000000..20e6b84e --- /dev/null +++ b/test/tm-cases/escaped_ampersands.text @@ -0,0 +1,5 @@ +\& +\< +\> +\" +\’ \ No newline at end of file diff --git a/test/tm-cases/escaped_html_in_safe_mode.html b/test/tm-cases/escaped_html_in_safe_mode.html new file mode 100644 index 00000000..0b4ad213 --- /dev/null +++ b/test/tm-cases/escaped_html_in_safe_mode.html @@ -0,0 +1,2 @@ +

<abc> +<abc>

diff --git a/test/tm-cases/escaped_html_in_safe_mode.opts b/test/tm-cases/escaped_html_in_safe_mode.opts new file mode 100644 index 00000000..de64198e --- /dev/null +++ b/test/tm-cases/escaped_html_in_safe_mode.opts @@ -0,0 +1 @@ +{'safe_mode': 'escape'} \ No newline at end of file diff --git a/test/tm-cases/escaped_html_in_safe_mode.text b/test/tm-cases/escaped_html_in_safe_mode.text new file mode 100644 index 00000000..73f44be6 --- /dev/null +++ b/test/tm-cases/escaped_html_in_safe_mode.text @@ -0,0 +1,2 @@ +\ +\ \ No newline at end of file diff --git a/test/tm-cases/incomplete_tags_in_code_spans.html b/test/tm-cases/incomplete_tags_in_code_spans.html new file mode 100644 index 00000000..34ee38e6 --- /dev/null +++ b/test/tm-cases/incomplete_tags_in_code_spans.html @@ -0,0 +1,3 @@ +

This appears to be an incomplete tag, but it's not because it's in a code span.

+ +

Path: C:\<folder 1>

diff --git a/test/tm-cases/incomplete_tags_in_code_spans.opts b/test/tm-cases/incomplete_tags_in_code_spans.opts new file mode 100644 index 00000000..de64198e --- /dev/null +++ b/test/tm-cases/incomplete_tags_in_code_spans.opts @@ -0,0 +1 @@ +{'safe_mode': 'escape'} \ No newline at end of file diff --git a/test/tm-cases/incomplete_tags_in_code_spans.text b/test/tm-cases/incomplete_tags_in_code_spans.text new file mode 100644 index 00000000..cc06cfba --- /dev/null +++ b/test/tm-cases/incomplete_tags_in_code_spans.text @@ -0,0 +1,3 @@ +This appears to be an incomplete tag, but it's not because it's in a code span. + +Path: `C:\` \ No newline at end of file diff --git a/test/tm-cases/safe_mode_code_spans_in_links.html b/test/tm-cases/safe_mode_code_spans_in_links.html new file mode 100644 index 00000000..56d812d5 --- /dev/null +++ b/test/tm-cases/safe_mode_code_spans_in_links.html @@ -0,0 +1 @@ +

example.com

diff --git a/test/tm-cases/safe_mode_code_spans_in_links.opts b/test/tm-cases/safe_mode_code_spans_in_links.opts new file mode 100644 index 00000000..54de31a8 --- /dev/null +++ b/test/tm-cases/safe_mode_code_spans_in_links.opts @@ -0,0 +1 @@ +{"safe_mode": "escape"} \ No newline at end of file diff --git a/test/tm-cases/safe_mode_code_spans_in_links.text b/test/tm-cases/safe_mode_code_spans_in_links.text new file mode 100644 index 00000000..0722751d --- /dev/null +++ b/test/tm-cases/safe_mode_code_spans_in_links.text @@ -0,0 +1 @@ +[`example.com`](https://example.com) \ No newline at end of file diff --git a/test/tm-cases/safe_mode_fenced_code_joined_to_lists.html b/test/tm-cases/safe_mode_fenced_code_joined_to_lists.html new file mode 100644 index 00000000..b9f93e82 --- /dev/null +++ b/test/tm-cases/safe_mode_fenced_code_joined_to_lists.html @@ -0,0 +1,7 @@ +
    +
  • Item 1
  • +
  • Item 2
  • +
+ +
// Some code
+
diff --git a/test/tm-cases/safe_mode_fenced_code_joined_to_lists.opts b/test/tm-cases/safe_mode_fenced_code_joined_to_lists.opts new file mode 100644 index 00000000..d5258749 --- /dev/null +++ b/test/tm-cases/safe_mode_fenced_code_joined_to_lists.opts @@ -0,0 +1 @@ +{'safe_mode': 'escape', 'extras': ['fenced-code-blocks']} \ No newline at end of file diff --git a/test/tm-cases/safe_mode_fenced_code_joined_to_lists.text b/test/tm-cases/safe_mode_fenced_code_joined_to_lists.text new file mode 100644 index 00000000..6fd1058c --- /dev/null +++ b/test/tm-cases/safe_mode_fenced_code_joined_to_lists.text @@ -0,0 +1,6 @@ +* Item 1 +* Item 2 + +``` +// Some code +``` \ No newline at end of file