diff --git a/lib/markdown2.py b/lib/markdown2.py index 913559ba..71b19f67 100755 --- a/lib/markdown2.py +++ b/lib/markdown2.py @@ -1358,6 +1358,14 @@ def _is_comment(token): return return re.match(r'()', token) + # protect raw code spans from processing, as they can often contain anything that looks like HTML and + # trips up the regex. These are encoded and processed later on anyway + code_hashes = {} + text = self._code_span_re.sub( + lambda m: self._hash_span(m.string[m.start(): m.end()], code_hashes), + text + ) + tokens = [] split_tokens = self._sorta_html_tokenize_re.split(text) index = 0 @@ -1386,7 +1394,12 @@ def _is_comment(token): else: tokens.append(self._encode_incomplete_tags(token)) index += 1 - return ''.join(tokens) + + text = ''.join(tokens) + # put markdown code spans back into the text for processing + for key, code in code_hashes.items(): + text = text.replace(key, code) + return text def _unhash_html_spans(self, text: str, spans=True, code=False) -> str: ''' @@ -2219,7 +2232,7 @@ def _encode_amps_and_angles(self, text: str) -> str: text = self._naked_gt_re.sub('>', text) return text - _incomplete_tags_re = re.compile(r"<(!--|/?\w+?(?!\w)\s*?.+?(?:[\s/]+?|$))") + _incomplete_tags_re = re.compile(r"\\*<(!--|/?\w+?(?!\w)\s*?.+?(?:[\s/]+?|$))") def _encode_incomplete_tags(self, text: str) -> str: if self.safe_mode not in ("replace", "escape"): @@ -2228,24 +2241,15 @@ def _encode_incomplete_tags(self, text: str) -> str: if self._is_auto_link(text): return text # this is not an incomplete tag, this is a link in the form - # protect code blocks. code blocks may have stuff like `C:\` in which is NOT a tag - # and will get encoded anyway in _encode_code - hashes = {} - for span in self._code_span_re.findall(text): - # the regex matches 2 groups: the syntax and the context. Reconstruct the entire match for easier processing - span = span[0] + span[1] + span[0] - hashed = _hash_text(span) - hashes[hashed] = span - text = text.replace(span, hashed) - def incomplete_tags_sub(match): - return match.group().replace('<', '<') + text = match.group() + # ensure that we handle escaped incomplete tags properly by consuming and replacing the escapes + if not self._is_unescaped_re.match(text): + text = text.replace('\\<', '<') + return text.replace('<', '<') text = self._incomplete_tags_re.sub(incomplete_tags_sub, text) - for hashed, original in hashes.items(): - text = text.replace(hashed, original) - return text def _encode_backslash_escapes(self, text: str) -> str: @@ -2314,13 +2318,23 @@ def _outdent(self, text: str) -> str: # Remove one level of line-leading tabs or spaces return self._outdent_re.sub('', text) - def _hash_span(self, text: str) -> str: + def _hash_span(self, text: str, hash_table: Optional[dict] = None) -> str: ''' Wrapper around `_hash_text` that also adds the hash to `self.hash_spans`, meaning it will be automatically unhashed during conversion. + + Args: + text: the text to hash + hash_table: the dict to insert the hash into. If omitted will default to `self.html_spans` + + Returns: + The hashed text ''' key = _hash_text(text) - self.html_spans[key] = text + if hash_table is not None: + hash_table[key] = text + else: + self.html_spans[key] = text return key @staticmethod @@ -2559,9 +2573,7 @@ def sub(self, match: re.Match) -> str: def sub_hash(self, match: re.Match) -> str: substr = match.string[match.start(): match.end()] - key = _hash_text(substr) - self.hash_table[key] = substr - return key + return self.md._hash_span(substr, self.hash_table) def test(self, text): if self.md.order < Stage.ITALIC_AND_BOLD: @@ -3124,7 +3136,7 @@ def unhash_code(codeblock): **formatter_opts) # add back the indent to all lines - return "\n%s\n" % self.md._uniform_indent(colored, leading_indent, True) + return self.md._uniform_indent(colored, leading_indent, True) def tags(self, lexer_name: str) -> tuple[str, str]: ''' @@ -3149,12 +3161,20 @@ def sub(self, match: re.Match) -> str: codeblock = match.group(3) codeblock = codeblock[:-1] # drop one trailing newline + # figure out what newlines were already surrounding the code block and preserve them in the output + leading_newlines = match.string[match.start(): match.regs[1][0]] + trailing_newlines = re.search(r'\n*$', match.group()).group() + # Use pygments only if not using the highlightjs-lang extra if lexer_name and "highlightjs-lang" not in self.md.extras: lexer = self.md._get_pygments_lexer(lexer_name) if lexer: - leading_indent = ' '*(len(match.group(1)) - len(match.group(1).lstrip())) - return self._code_block_with_lexer_sub(codeblock, leading_indent, lexer) + leading_indent = ' ' * (len(match.group(1)) - len(match.group(1).lstrip())) + return ( + leading_newlines + + self._code_block_with_lexer_sub(codeblock, leading_indent, lexer) + + trailing_newlines + ) # Fenced code blocks need to be outdented before encoding, and then reapplied leading_indent = ' ' * (len(match.group(1)) - len(match.group(1).lstrip())) @@ -3166,18 +3186,12 @@ def sub(self, match: re.Match) -> str: tags = self.tags(lexer_name) - # when not in safe-mode, we convert fenced code blocks before Stage.HASH_HTML, which means the text - # ends up as `\n\nmd5-...\n\n`, thanks to the hashing stages adding in some newlines - # in safe mode, we run fenced code blocks AFTER the hashing, so we don't end up with that same - # `\n\n` wrap. We can correct that here - surrounding_newlines = '\n\n' if self.md.safe_mode else '\n' - return ( - f'{surrounding_newlines}' + f'{leading_newlines}' f'{leading_indent}{tags[0]}' f'{codeblock}' f'\n{leading_indent}{tags[1]}' - f'{surrounding_newlines}' + f'{trailing_newlines}' ) def run(self, text): @@ -3296,8 +3310,7 @@ def run(self, text): .replace('*', self.md._escape_table['*']) .replace('_', self.md._escape_table['_'])) link = '{}'.format(escaped_href, text[start:end]) - hash = _hash_text(link) - link_from_hash[hash] = link + hash = self.md._hash_span(link, link_from_hash) text = text[:start] + hash + text[end:] for hash, link in list(link_from_hash.items()): text = text.replace(hash, link) diff --git a/test/tm-cases/escaped_html_in_safe_mode.html b/test/tm-cases/escaped_html_in_safe_mode.html index 0b4ad213..ddb13060 100644 --- a/test/tm-cases/escaped_html_in_safe_mode.html +++ b/test/tm-cases/escaped_html_in_safe_mode.html @@ -1,2 +1,3 @@

<abc> -<abc>

+<abc> +<why?

diff --git a/test/tm-cases/escaped_html_in_safe_mode.text b/test/tm-cases/escaped_html_in_safe_mode.text index 73f44be6..ea5c1876 100644 --- a/test/tm-cases/escaped_html_in_safe_mode.text +++ b/test/tm-cases/escaped_html_in_safe_mode.text @@ -1,2 +1,3 @@ \ -\ \ No newline at end of file +\ +\This appears to be an incomplete tag, but it's not because it's in a code span.

+

This appears to be incomplete tags, but they're not because they're in code spans.

Path: C:\<folder 1>

+ +

Path: C:\<folder>

diff --git a/test/tm-cases/incomplete_tags_in_code_spans.text b/test/tm-cases/incomplete_tags_in_code_spans.text index cc06cfba..3bdf62cb 100644 --- a/test/tm-cases/incomplete_tags_in_code_spans.text +++ b/test/tm-cases/incomplete_tags_in_code_spans.text @@ -1,3 +1,5 @@ -This appears to be an incomplete tag, but it's not because it's in a code span. +This appears to be incomplete tags, but they're not because they're in code spans. -Path: `C:\` \ No newline at end of file +Path: `C:\` + +Path: `C:\` \ No newline at end of file diff --git a/test/tm-cases/safe_mode_fenced_code_joined_to_lists.text b/test/tm-cases/safe_mode_fenced_code_joined_to_lists.text index 6fd1058c..6acfd342 100644 --- a/test/tm-cases/safe_mode_fenced_code_joined_to_lists.text +++ b/test/tm-cases/safe_mode_fenced_code_joined_to_lists.text @@ -3,4 +3,4 @@ ``` // Some code -``` \ No newline at end of file +``` diff --git a/test/tm-cases/safe_mode_fenced_code_with_lexer_joined_to_lists.html b/test/tm-cases/safe_mode_fenced_code_with_lexer_joined_to_lists.html new file mode 100644 index 00000000..f4fe4322 --- /dev/null +++ b/test/tm-cases/safe_mode_fenced_code_with_lexer_joined_to_lists.html @@ -0,0 +1,9 @@ +
    +
  • Item 1
  • +
  • Item 2
  • +
+ +
+
// Some code
+
+
diff --git a/test/tm-cases/safe_mode_fenced_code_with_lexer_joined_to_lists.opts b/test/tm-cases/safe_mode_fenced_code_with_lexer_joined_to_lists.opts new file mode 100644 index 00000000..d5258749 --- /dev/null +++ b/test/tm-cases/safe_mode_fenced_code_with_lexer_joined_to_lists.opts @@ -0,0 +1 @@ +{'safe_mode': 'escape', 'extras': ['fenced-code-blocks']} \ No newline at end of file diff --git a/test/tm-cases/safe_mode_fenced_code_with_lexer_joined_to_lists.tags b/test/tm-cases/safe_mode_fenced_code_with_lexer_joined_to_lists.tags new file mode 100644 index 00000000..e2282d2f --- /dev/null +++ b/test/tm-cases/safe_mode_fenced_code_with_lexer_joined_to_lists.tags @@ -0,0 +1 @@ +fenced-code-blocks pygments \ No newline at end of file diff --git a/test/tm-cases/safe_mode_fenced_code_with_lexer_joined_to_lists.text b/test/tm-cases/safe_mode_fenced_code_with_lexer_joined_to_lists.text new file mode 100644 index 00000000..230ba98d --- /dev/null +++ b/test/tm-cases/safe_mode_fenced_code_with_lexer_joined_to_lists.text @@ -0,0 +1,6 @@ +* Item 1 +* Item 2 + +```cpp +// Some code +```