Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
- [pull #639] Fix middle-word-em interfering with strongs (#637)
- [pull #640] Fix code friendly extra stopping other syntax being processed (#638)
- [pull #644] Fix a number of em/strong issues (#641, #642, #643)
- [pull #659] Fix a number of safemode issues (#647)


## python-markdown2 2.5.4
Expand Down
73 changes: 62 additions & 11 deletions lib/markdown2.py
Original file line number Diff line number Diff line change
Expand Up @@ -155,7 +155,9 @@ def _hash_text(s: str) -> str:

# Ampersand-encoding based entirely on Nat Irons's Amputator MT plugin:
# http://bumppo.net/projects/amputator/
_AMPERSAND_RE = re.compile(r'&(?!#?[xX]?(?:[0-9a-fA-F]+|\w+);)')
_AMPERSAND_BODY_RE = r'#?[xX]?(?:[0-9a-fA-F]+|\w+);'
_AMPERSAND_RE = re.compile(r'&(?!%s)' % _AMPERSAND_BODY_RE)
_ESCAPED_AMPERSAND_RE = re.compile(r'(?:\\\\)*\\&(%s)' % _AMPERSAND_BODY_RE)


# ---- exceptions
Expand Down Expand Up @@ -1287,6 +1289,10 @@ def _run_span_gamut(self, text: str) -> str:
)
""", re.X)

# Regex that checks that the start of a string is NOT escaped.
# It does this by matching zero or more pairs of `\` chars at the start of the string
# and checking that they're NOT followed by another (unpaired) `\`.
_is_unescaped_re = re.compile(r'^((?:\\\\)*(?!\\))')

@mark_stage(Stage.ESCAPE_SPECIAL)
def _escape_special_chars(self, text: str) -> str:
# Python markdown note: the HTML tokenization here differs from
Expand All @@ -1295,27 +1301,30 @@ def _escape_special_chars(self, text: str) -> str:
# it isn't susceptible to unmatched '<' and '>' in HTML tags).
# Note, however, that '>' is not allowed in an auto-link URL
# here.
lead_escape_re = re.compile(r'^((?:\\\\)*(?!\\))')
escaped = []
is_html_markup = False
for token in self._sorta_html_tokenize_re.split(text):
# check token is preceded by 0 or more PAIRS of escapes, because escape pairs
# escape themselves and don't affect the token
if is_html_markup and lead_escape_re.match(token):
if is_html_markup and self._is_unescaped_re.match(token):
# Within tags/HTML-comments/auto-links, encode * and _
# so they don't conflict with their use in Markdown for
# italics and strong. We're replacing each such
# character with its corresponding MD5 checksum value;
# this is likely overkill, but it should prevent us from
# colliding with the escape values by accident.
escape_seq, token = lead_escape_re.split(token)[1:] or ('', token)
escape_seq, token = self._is_unescaped_re.split(token)[1:] or ('', token)
escaped.append(
escape_seq.replace('\\\\', self._escape_table['\\'])
+ token.replace('*', self._escape_table['*'])
.replace('_', self._escape_table['_'])
)
else:
escaped.append(self._encode_backslash_escapes(token.replace('\\<', '&lt;')))
escaped.append(
self._encode_backslash_escapes(
token.replace('\\<', '&lt;').replace('\\>', '&gt;')
)
)
is_html_markup = not is_html_markup
return ''.join(escaped)

Expand Down Expand Up @@ -1351,20 +1360,32 @@ def _is_comment(token):

tokens = []
split_tokens = self._sorta_html_tokenize_re.split(text)
is_html_markup = False
for index, token in enumerate(split_tokens):
if is_html_markup and not self._is_auto_link(token) and not _is_code_span(index, token):
index = 0
while index < len(split_tokens):
is_html_markup = index % 2 != 0
token = split_tokens[index]
is_code = _is_code_span(index, token)

if is_html_markup and not self._is_auto_link(token) and not is_code:
is_comment = _is_comment(token)
if is_comment:
tokens.append(self._hash_span(self._sanitize_html(is_comment.group(1))))
# sanitise but leave comment body intact for further markdown processing
tokens.append(self._sanitize_html(is_comment.group(2)))
tokens.append(self._hash_span(self._sanitize_html(is_comment.group(3))))
elif self._is_unescaped_re.match(token) is None:
# if the HTML is escaped then escape any special chars and add the token as-is
tokens.append(self._escape_special_chars(token))
else:
tokens.append(self._hash_span(self._sanitize_html(token)))
elif is_html_markup and is_code:
# code span contents are hashed, so should be safe to just add directly
tokens.extend(split_tokens[index: index + 3])
index += 3
continue
else:
tokens.append(self._encode_incomplete_tags(token))
is_html_markup = not is_html_markup
index += 1
return ''.join(tokens)

def _unhash_html_spans(self, text: str, spans=True, code=False) -> str:
Expand Down Expand Up @@ -2187,6 +2208,7 @@ def _encode_amps_and_angles(self, text: str) -> str:
# Smart processing for ampersands and angle brackets that need
# to be encoded.
text = _AMPERSAND_RE.sub('&amp;', text)
text = _ESCAPED_AMPERSAND_RE.sub(r'&amp;\1', text)

# Encode naked <'s
text = self._naked_lt_re.sub('&lt;', text)
Expand All @@ -2206,10 +2228,25 @@ def _encode_incomplete_tags(self, text: str) -> str:
if self._is_auto_link(text):
return text # this is not an incomplete tag, this is a link in the form <http://x.y.z>

# Protect code spans: they may contain text like `C:\<folder>`, which is NOT a tag
# and will get encoded anyway in _encode_code
hashes = {}
for span in self._code_span_re.findall(text):
# the regex matches 2 groups: the delimiter syntax and the content between the delimiters. Reconstruct the entire match for easier processing
span = span[0] + span[1] + span[0]
hashed = _hash_text(span)
hashes[hashed] = span
text = text.replace(span, hashed)

def incomplete_tags_sub(match):
return match.group().replace('<', '&lt;')

return self._incomplete_tags_re.sub(incomplete_tags_sub, text)
text = self._incomplete_tags_re.sub(incomplete_tags_sub, text)

for hashed, original in hashes.items():
text = text.replace(hashed, original)

return text

def _encode_backslash_escapes(self, text: str) -> str:
for ch, escape in list(self._escape_table.items()):
Expand Down Expand Up @@ -3047,8 +3084,10 @@ def test(self, text):
if '```' not in text:
return False
if self.md.stage == Stage.PREPROCESS and not self.md.safe_mode:
# if safe mode is off then run before HASH_HTML and not worry about the tags getting messed up
return True
if self.md.stage == Stage.LINK_DEFS and self.md.safe_mode:
# if safe mode is on then run after HASH_HTML is done
return True
return self.md.stage == Stage.BLOCK_GAMUT

Expand Down Expand Up @@ -3127,7 +3166,19 @@ def sub(self, match: re.Match) -> str:

tags = self.tags(lexer_name)

return "\n{}{}{}\n{}{}\n".format(leading_indent, tags[0], codeblock, leading_indent, tags[1])
# When not in safe mode, we convert fenced code blocks before Stage.HASH_HTML, which means the text
# ends up as `\n\nmd5-...\n\n`, thanks to the hashing stages adding in some newlines.
# In safe mode, we run fenced code blocks AFTER the hashing, so we don't end up with that same
# `\n\n` wrap. We correct for that here.
surrounding_newlines = '\n\n' if self.md.safe_mode else '\n'

return (
f'{surrounding_newlines}'
f'{leading_indent}{tags[0]}'
f'{codeblock}'
f'\n{leading_indent}{tags[1]}'
f'{surrounding_newlines}'
)

def run(self, text):
return self.fenced_code_block_re.sub(self.sub, text)
Expand Down
2 changes: 1 addition & 1 deletion test/markdowntest-cases/Backslash escapes.html
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@

<p>Right paren: )</p>

<p>Greater-than: ></p>
<p>Greater-than: &gt;</p>

<p>Hash: #</p>

Expand Down
5 changes: 5 additions & 0 deletions test/tm-cases/escaped_ampersands.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
<p>&amp;amp;
&amp;lt;
&amp;gt;
&amp;quot;
&amp;#8217;</p>
5 changes: 5 additions & 0 deletions test/tm-cases/escaped_ampersands.text
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
\&amp;
\&lt;
\&gt;
\&quot;
\&#8217;
2 changes: 2 additions & 0 deletions test/tm-cases/escaped_html_in_safe_mode.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
<p>&lt;abc&gt;
&lt;abc></p>
1 change: 1 addition & 0 deletions test/tm-cases/escaped_html_in_safe_mode.opts
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{'safe_mode': 'escape'}
2 changes: 2 additions & 0 deletions test/tm-cases/escaped_html_in_safe_mode.text
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
\<abc\>
\<abc>
3 changes: 3 additions & 0 deletions test/tm-cases/incomplete_tags_in_code_spans.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
<p>This appears to be an incomplete tag, but it's not because it's in a code span.</p>

<p>Path: <code>C:\&lt;folder 1&gt;</code></p>
1 change: 1 addition & 0 deletions test/tm-cases/incomplete_tags_in_code_spans.opts
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{'safe_mode': 'escape'}
3 changes: 3 additions & 0 deletions test/tm-cases/incomplete_tags_in_code_spans.text
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
This appears to be an incomplete tag, but it's not because it's in a code span.

Path: `C:\<folder 1>`
1 change: 1 addition & 0 deletions test/tm-cases/safe_mode_code_spans_in_links.html
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
<p><a href="https://example.com"><code>example.com</code></a></p>
1 change: 1 addition & 0 deletions test/tm-cases/safe_mode_code_spans_in_links.opts
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"safe_mode": "escape"}
1 change: 1 addition & 0 deletions test/tm-cases/safe_mode_code_spans_in_links.text
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
[`example.com`](https://example.com)
7 changes: 7 additions & 0 deletions test/tm-cases/safe_mode_fenced_code_joined_to_lists.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
<ul>
<li>Item 1</li>
<li>Item 2</li>
</ul>

<pre><code>// Some code
</code></pre>
1 change: 1 addition & 0 deletions test/tm-cases/safe_mode_fenced_code_joined_to_lists.opts
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{'safe_mode': 'escape', 'extras': ['fenced-code-blocks']}
6 changes: 6 additions & 0 deletions test/tm-cases/safe_mode_fenced_code_joined_to_lists.text
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
* Item 1
* Item 2

```
// Some code
```