Skip to content

Commit 3fe9325

Browse files
Merge pull request #659 from Crozzers/fix-safemode-issues
Fix a number of safemode issues (#647)
2 parents 0896a1f + 42f3dd4 commit 3fe9325

17 files changed

+103
-12
lines changed

CHANGES.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
- [pull #639] Fix middle-word-em interfering with strongs (#637)
66
- [pull #640] Fix code friendly extra stopping other syntax being processed (#638)
77
- [pull #644] Fix a number of em/strong issues (#641, #642, #643)
8+
- [pull #659] Fix a number of safemode issues (#647)
89

910

1011
## python-markdown2 2.5.4

lib/markdown2.py

Lines changed: 62 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -155,7 +155,9 @@ def _hash_text(s: str) -> str:
155155

156156
# Ampersand-encoding based entirely on Nat Irons's Amputator MT plugin:
157157
# http://bumppo.net/projects/amputator/
158-
_AMPERSAND_RE = re.compile(r'&(?!#?[xX]?(?:[0-9a-fA-F]+|\w+);)')
158+
_AMPERSAND_BODY_RE = r'#?[xX]?(?:[0-9a-fA-F]+|\w+);'
159+
_AMPERSAND_RE = re.compile(r'&(?!%s)' % _AMPERSAND_BODY_RE)
160+
_ESCAPED_AMPERSAND_RE = re.compile(r'(?:\\\\)*\\&(%s)' % _AMPERSAND_BODY_RE)
159161

160162

161163
# ---- exceptions
@@ -1287,6 +1289,10 @@ def _run_span_gamut(self, text: str) -> str:
12871289
)
12881290
""", re.X)
12891291

1292+
# regex that checks that the start of a string is NOT escaped
1293+
# it does this by matching pairs of `\` chars and checking that they're NOT followed by another `\`
1294+
_is_unescaped_re = re.compile(r'^((?:\\\\)*(?!\\))')
1295+
12901296
@mark_stage(Stage.ESCAPE_SPECIAL)
12911297
def _escape_special_chars(self, text: str) -> str:
12921298
# Python markdown note: the HTML tokenization here differs from
@@ -1295,27 +1301,30 @@ def _escape_special_chars(self, text: str) -> str:
12951301
# it isn't susceptible to unmatched '<' and '>' in HTML tags).
12961302
# Note, however, that '>' is not allowed in an auto-link URL
12971303
# here.
1298-
lead_escape_re = re.compile(r'^((?:\\\\)*(?!\\))')
12991304
escaped = []
13001305
is_html_markup = False
13011306
for token in self._sorta_html_tokenize_re.split(text):
13021307
# check token is preceded by 0 or more PAIRS of escapes, because escape pairs
13031308
# escape themselves and don't affect the token
1304-
if is_html_markup and lead_escape_re.match(token):
1309+
if is_html_markup and self._is_unescaped_re.match(token):
13051310
# Within tags/HTML-comments/auto-links, encode * and _
13061311
# so they don't conflict with their use in Markdown for
13071312
# italics and strong. We're replacing each such
13081313
# character with its corresponding MD5 checksum value;
13091314
# this is likely overkill, but it should prevent us from
13101315
# colliding with the escape values by accident.
1311-
escape_seq, token = lead_escape_re.split(token)[1:] or ('', token)
1316+
escape_seq, token = self._is_unescaped_re.split(token)[1:] or ('', token)
13121317
escaped.append(
13131318
escape_seq.replace('\\\\', self._escape_table['\\'])
13141319
+ token.replace('*', self._escape_table['*'])
13151320
.replace('_', self._escape_table['_'])
13161321
)
13171322
else:
1318-
escaped.append(self._encode_backslash_escapes(token.replace('\\<', '&lt;')))
1323+
escaped.append(
1324+
self._encode_backslash_escapes(
1325+
token.replace('\\<', '&lt;').replace('\\>', '&gt;')
1326+
)
1327+
)
13191328
is_html_markup = not is_html_markup
13201329
return ''.join(escaped)
13211330

@@ -1351,20 +1360,32 @@ def _is_comment(token):
13511360

13521361
tokens = []
13531362
split_tokens = self._sorta_html_tokenize_re.split(text)
1354-
is_html_markup = False
1355-
for index, token in enumerate(split_tokens):
1356-
if is_html_markup and not self._is_auto_link(token) and not _is_code_span(index, token):
1363+
index = 0
1364+
while index < len(split_tokens):
1365+
is_html_markup = index % 2 != 0
1366+
token = split_tokens[index]
1367+
is_code = _is_code_span(index, token)
1368+
1369+
if is_html_markup and not self._is_auto_link(token) and not is_code:
13571370
is_comment = _is_comment(token)
13581371
if is_comment:
13591372
tokens.append(self._hash_span(self._sanitize_html(is_comment.group(1))))
13601373
# sanitise but leave comment body intact for further markdown processing
13611374
tokens.append(self._sanitize_html(is_comment.group(2)))
13621375
tokens.append(self._hash_span(self._sanitize_html(is_comment.group(3))))
1376+
elif self._is_unescaped_re.match(token) is None:
1377+
# if the HTML is escaped then escape any special chars and add the token as-is
1378+
tokens.append(self._escape_special_chars(token))
13631379
else:
13641380
tokens.append(self._hash_span(self._sanitize_html(token)))
1381+
elif is_html_markup and is_code:
1382+
# code span contents are hashed, so should be safe to just add directly
1383+
tokens.extend(split_tokens[index: index + 3])
1384+
index += 3
1385+
continue
13651386
else:
13661387
tokens.append(self._encode_incomplete_tags(token))
1367-
is_html_markup = not is_html_markup
1388+
index += 1
13681389
return ''.join(tokens)
13691390

13701391
def _unhash_html_spans(self, text: str, spans=True, code=False) -> str:
@@ -2187,6 +2208,7 @@ def _encode_amps_and_angles(self, text: str) -> str:
21872208
# Smart processing for ampersands and angle brackets that need
21882209
# to be encoded.
21892210
text = _AMPERSAND_RE.sub('&amp;', text)
2211+
text = _ESCAPED_AMPERSAND_RE.sub(r'&amp;\1', text)
21902212

21912213
# Encode naked <'s
21922214
text = self._naked_lt_re.sub('&lt;', text)
@@ -2206,10 +2228,25 @@ def _encode_incomplete_tags(self, text: str) -> str:
22062228
if self._is_auto_link(text):
22072229
return text # this is not an incomplete tag, this is a link in the form <http://x.y.z>
22082230

2231+
# protect code spans. Code spans may contain text like `C:\<folder>`, which is NOT a tag
2232+
# and will get encoded anyway in _encode_code
2233+
hashes = {}
2234+
for span in self._code_span_re.findall(text):
2235+
# the regex matches 2 groups: the syntax (the delimiter) and the content. Reconstruct the entire match for easier processing
2236+
span = span[0] + span[1] + span[0]
2237+
hashed = _hash_text(span)
2238+
hashes[hashed] = span
2239+
text = text.replace(span, hashed)
2240+
22092241
def incomplete_tags_sub(match):
22102242
return match.group().replace('<', '&lt;')
22112243

2212-
return self._incomplete_tags_re.sub(incomplete_tags_sub, text)
2244+
text = self._incomplete_tags_re.sub(incomplete_tags_sub, text)
2245+
2246+
for hashed, original in hashes.items():
2247+
text = text.replace(hashed, original)
2248+
2249+
return text
22132250

22142251
def _encode_backslash_escapes(self, text: str) -> str:
22152252
for ch, escape in list(self._escape_table.items()):
@@ -3047,8 +3084,10 @@ def test(self, text):
30473084
if '```' not in text:
30483085
return False
30493086
if self.md.stage == Stage.PREPROCESS and not self.md.safe_mode:
3087+
# if safe mode is off then run before HASH_HTML; there is no need to worry about the tags getting messed up
30503088
return True
30513089
if self.md.stage == Stage.LINK_DEFS and self.md.safe_mode:
3090+
# if safe mode is on then run after HASH_HTML is done
30523091
return True
30533092
return self.md.stage == Stage.BLOCK_GAMUT
30543093

@@ -3127,7 +3166,19 @@ def sub(self, match: re.Match) -> str:
31273166

31283167
tags = self.tags(lexer_name)
31293168

3130-
return "\n{}{}{}\n{}{}\n".format(leading_indent, tags[0], codeblock, leading_indent, tags[1])
3169+
# when not in safe-mode, we convert fenced code blocks before Stage.HASH_HTML, which means the text
3170+
# ends up as `\n\nmd5-...\n\n`, thanks to the hashing stages adding in some newlines
3171+
# in safe mode, we run fenced code blocks AFTER the hashing, so we don't end up with that same
3172+
# `\n\n` wrap. We can correct that here
3173+
surrounding_newlines = '\n\n' if self.md.safe_mode else '\n'
3174+
3175+
return (
3176+
f'{surrounding_newlines}'
3177+
f'{leading_indent}{tags[0]}'
3178+
f'{codeblock}'
3179+
f'\n{leading_indent}{tags[1]}'
3180+
f'{surrounding_newlines}'
3181+
)
31313182

31323183
def run(self, text):
31333184
return self.fenced_code_block_re.sub(self.sub, text)

test/markdowntest-cases/Backslash escapes.html

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020

2121
<p>Right paren: )</p>
2222

23-
<p>Greater-than: ></p>
23+
<p>Greater-than: &gt;</p>
2424

2525
<p>Hash: #</p>
2626

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
<p>&amp;amp;
2+
&amp;lt;
3+
&amp;gt;
4+
&amp;quot;
5+
&amp;#8217;</p>
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
\&amp;
2+
\&lt;
3+
\&gt;
4+
\&quot;
5+
\&#8217;
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
<p>&lt;abc&gt;
2+
&lt;abc></p>
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
{'safe_mode': 'escape'}
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
\<abc\>
2+
\<abc>
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
<p>This appears to be an incomplete tag, but it's not because it's in a code span.</p>
2+
3+
<p>Path: <code>C:\&lt;folder 1&gt;</code></p>
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
{'safe_mode': 'escape'}

0 commit comments

Comments
 (0)