@@ -155,7 +155,9 @@ def _hash_text(s: str) -> str:
155155
156156# Ampersand-encoding based entirely on Nat Irons's Amputator MT plugin:
157157# http://bumppo.net/projects/amputator/
158- _AMPERSAND_RE = re .compile (r'&(?!#?[xX]?(?:[0-9a-fA-F]+|\w+);)' )
158+ _AMPERSAND_BODY_RE = r'#?[xX]?(?:[0-9a-fA-F]+|\w+);'
159+ _AMPERSAND_RE = re .compile (r'&(?!%s)' % _AMPERSAND_BODY_RE )
160+ _ESCAPED_AMPERSAND_RE = re .compile (r'(?:\\\\)*\\&(%s)' % _AMPERSAND_BODY_RE )
159161
160162
161163# ---- exceptions
@@ -1287,6 +1289,10 @@ def _run_span_gamut(self, text: str) -> str:
12871289 )
12881290 """ , re .X )
12891291
1292+ # regex that checks that the start of a string is NOT escaped
1293+ # it does this by matching pairs of `\` chars and checking that they're NOT followed by another `\`
1294+ _is_unescaped_re = re .compile (r'^((?:\\\\)*(?!\\))' )
1295+
# NOTE(review): reconstructed from a line-number-mangled diff; in the real
# file this is a method of the Markdown class. The scrape also HTML-entity-
# decoded the replacement strings below; '&lt;'/'&gt;' are presumably the
# originals (a backslash-escaped angle bracket must render as literal text,
# not open a tag) -- confirm against upstream markdown2.
@mark_stage(Stage.ESCAPE_SPECIAL)
def _escape_special_chars(self, text: str) -> str:
    # Python markdown note: the HTML tokenization here differs from
    # Markdown.pl's (two comment lines were lost between diff hunks in
    # the scrape -- restore from upstream). Note, however, that '>' is
    # not allowed in an auto-link URL here.
    escaped = []
    is_html_markup = False
    for token in self._sorta_html_tokenize_re.split(text):
        # check token is preceded by 0 or more PAIRS of escapes, because
        # escape pairs escape themselves and don't affect the token
        if is_html_markup and self._is_unescaped_re.match(token):
            # Within tags/HTML-comments/auto-links, encode * and _
            # so they don't conflict with their use in Markdown for
            # italics and strong. We're replacing each such
            # character with its corresponding MD5 checksum value;
            # this is likely overkill, but it should prevent us from
            # colliding with the escape values by accident.
            escape_seq, token = self._is_unescaped_re.split(token)[1:] or ('', token)
            escaped.append(
                escape_seq.replace('\\\\', self._escape_table['\\'])
                + token.replace('*', self._escape_table['*'])
                       .replace('_', self._escape_table['_'])
            )
        else:
            escaped.append(
                self._encode_backslash_escapes(
                    token.replace('\\<', '&lt;').replace('\\>', '&gt;')
                )
            )
        # tokens from the tokenizing split alternate text / markup
        is_html_markup = not is_html_markup
    return ''.join(escaped)
13211330
@@ -1351,20 +1360,32 @@ def _is_comment(token):
13511360
13521361 tokens = []
13531362 split_tokens = self ._sorta_html_tokenize_re .split (text )
1354- is_html_markup = False
1355- for index , token in enumerate (split_tokens ):
1356- if is_html_markup and not self ._is_auto_link (token ) and not _is_code_span (index , token ):
1363+ index = 0
1364+ while index < len (split_tokens ):
1365+ is_html_markup = index % 2 != 0
1366+ token = split_tokens [index ]
1367+ is_code = _is_code_span (index , token )
1368+
1369+ if is_html_markup and not self ._is_auto_link (token ) and not is_code :
13571370 is_comment = _is_comment (token )
13581371 if is_comment :
13591372 tokens .append (self ._hash_span (self ._sanitize_html (is_comment .group (1 ))))
13601373 # sanitise but leave comment body intact for further markdown processing
13611374 tokens .append (self ._sanitize_html (is_comment .group (2 )))
13621375 tokens .append (self ._hash_span (self ._sanitize_html (is_comment .group (3 ))))
1376+ elif self ._is_unescaped_re .match (token ) is None :
1377+ # if the HTML is escaped then escape any special chars and add the token as-is
1378+ tokens .append (self ._escape_special_chars (token ))
13631379 else :
13641380 tokens .append (self ._hash_span (self ._sanitize_html (token )))
1381+ elif is_html_markup and is_code :
1382+ # code span contents are hashed, so should be safe to just add directly
1383+ tokens .extend (split_tokens [index : index + 3 ])
1384+ index += 3
1385+ continue
13651386 else :
13661387 tokens .append (self ._encode_incomplete_tags (token ))
1367- is_html_markup = not is_html_markup
1388+ index += 1
13681389 return '' .join (tokens )
13691390
13701391 def _unhash_html_spans (self , text : str , spans = True , code = False ) -> str :
@@ -2187,6 +2208,7 @@ def _encode_amps_and_angles(self, text: str) -> str:
21872208 # Smart processing for ampersands and angle brackets that need
21882209 # to be encoded.
21892210 text = _AMPERSAND_RE .sub ('&' , text )
2211+ text = _ESCAPED_AMPERSAND_RE .sub (r'&\1' , text )
21902212
21912213 # Encode naked <'s
21922214 text = self ._naked_lt_re .sub ('<' , text )
@@ -2206,10 +2228,25 @@ def _encode_incomplete_tags(self, text: str) -> str:
22062228 if self ._is_auto_link (text ):
22072229 return text # this is not an incomplete tag, this is a link in the form <http://x.y.z>
22082230
2231+ # protect code blocks. code blocks may have stuff like `C:\<folder>` in which is NOT a tag
2232+ # and will get encoded anyway in _encode_code
2233+ hashes = {}
2234+ for span in self ._code_span_re .findall (text ):
2235+ # the regex matches 2 groups: the syntax and the context. Reconstruct the entire match for easier processing
2236+ span = span [0 ] + span [1 ] + span [0 ]
2237+ hashed = _hash_text (span )
2238+ hashes [hashed ] = span
2239+ text = text .replace (span , hashed )
2240+
22092241 def incomplete_tags_sub (match ):
22102242 return match .group ().replace ('<' , '<' )
22112243
2212- return self ._incomplete_tags_re .sub (incomplete_tags_sub , text )
2244+ text = self ._incomplete_tags_re .sub (incomplete_tags_sub , text )
2245+
2246+ for hashed , original in hashes .items ():
2247+ text = text .replace (hashed , original )
2248+
2249+ return text
22132250
22142251 def _encode_backslash_escapes (self , text : str ) -> str :
22152252 for ch , escape in list (self ._escape_table .items ()):
@@ -3047,8 +3084,10 @@ def test(self, text):
30473084 if '```' not in text :
30483085 return False
30493086 if self .md .stage == Stage .PREPROCESS and not self .md .safe_mode :
3087+ # if safe mode is off then run before HASH_HTML and not worry about the tags getting messed up
30503088 return True
30513089 if self .md .stage == Stage .LINK_DEFS and self .md .safe_mode :
3090+ # if safe mode is on then run after HASH_HTML is done
30523091 return True
30533092 return self .md .stage == Stage .BLOCK_GAMUT
30543093
@@ -3127,7 +3166,19 @@ def sub(self, match: re.Match) -> str:
31273166
31283167 tags = self .tags (lexer_name )
31293168
3130- return "\n {}{}{}\n {}{}\n " .format (leading_indent , tags [0 ], codeblock , leading_indent , tags [1 ])
3169+ # when not in safe-mode, we convert fenced code blocks before Stage.HASH_HTML, which means the text
3170+ # ends up as `\n\nmd5-...\n\n`, thanks to the hashing stages adding in some newlines
3171+ # in safe mode, we run fenced code blocks AFTER the hashing, so we don't end up with that same
3172+ # `\n\n` wrap. We can correct that here
3173+ surrounding_newlines = '\n \n ' if self .md .safe_mode else '\n '
3174+
3175+ return (
3176+ f'{ surrounding_newlines } '
3177+ f'{ leading_indent } { tags [0 ]} '
3178+ f'{ codeblock } '
3179+ f'\n { leading_indent } { tags [1 ]} '
3180+ f'{ surrounding_newlines } '
3181+ )
31313182
# NOTE(review): reconstructed from a line-number-mangled diff; in the real
# file this is a method of the fenced-code-blocks extra class.
def run(self, text):
    """Replace every fenced code block in *text* using self.sub."""
    return self.fenced_code_block_re.sub(self.sub, text)
0 commit comments