66
77# Original code Copyright 2008 [Jack Miller](https://codezen.org/)
88
9- # All changes Copyright 2008-2014 The Python Markdown Project
9+ # All changes Copyright 2008-2024 The Python Markdown Project
1010
1111# License: [BSD](https://opensource.org/licenses/bsd-license.php)
1212
2121
2222from . import Extension
2323from ..treeprocessors import Treeprocessor
24- from ..util import code_escape , parseBoolValue , AMP_SUBSTITUTE , HTML_PLACEHOLDER_RE , AtomicString
24+ from ..util import parseBoolValue , AMP_SUBSTITUTE , deprecated , HTML_PLACEHOLDER_RE , AtomicString
2525from ..treeprocessors import UnescapeTreeprocessor
26+ from ..serializers import RE_AMP
2627import re
2728import html
2829import unicodedata
30+ from copy import deepcopy
2931import xml .etree .ElementTree as etree
3032from typing import TYPE_CHECKING , Any , Iterator , MutableSet
3133
@@ -63,6 +65,7 @@ def unique(id: str, ids: MutableSet[str]) -> str:
6365 return id
6466
6567
68+ @deprecated ('Use `render_inner_html` and `striptags` instead.' )
6669def get_name (el : etree .Element ) -> str :
6770 """Get title name."""
6871
@@ -75,6 +78,7 @@ def get_name(el: etree.Element) -> str:
7578 return '' .join (text ).strip ()
7679
7780
81+ @deprecated ('Use `run_postprocessors`, `render_inner_html` and/or `striptags` instead.' )
7882def stashedHTML2text (text : str , md : Markdown , strip_entities : bool = True ) -> str :
7983 """ Extract raw HTML from stash, reduce to plain text and swap with placeholder. """
8084 def _html_sub (m : re .Match [str ]) -> str :
@@ -93,11 +97,80 @@ def _html_sub(m: re.Match[str]) -> str:
9397
9498
def unescape(text: str) -> str:
    """
    Unescape Markdown backslash escaped text.

    A fresh `UnescapeTreeprocessor` is created on every call and its
    `unescape` method is applied to `text`; the result is returned as is.
    """
    c = UnescapeTreeprocessor()
    return c.unescape(text)
99103
100104
def strip_tags(text: str) -> str:
    """
    Strip HTML tags and return plain text.

    Comments (`<!-- ... -->`) are removed first, then anything from a `<` up
    to the next `>`. Finally every run of whitespace is collapsed to a single
    space and leading/trailing whitespace is dropped.

    Note: HTML entities are unaffected. An opening delimiter without a
    matching closing delimiter after it is left in place.
    """
    # A comment could contain a tag, so strip comments first
    while (start := text.find('<!--')) != -1 and (end := text.find('-->', start)) != -1:
        # Join the two sides directly; nothing may be inserted in place of the comment.
        text = f'{text[:start]}{text[end + 3:]}'

    while (start := text.find('<')) != -1 and (end := text.find('>', start)) != -1:
        # Same here: the text on either side of a tag must stay adjacent.
        text = f'{text[:start]}{text[end + 1:]}'

    # Collapse whitespace
    text = ' '.join(text.split())
    return text
117+
118+
def escape_cdata(text: str) -> str:
    """
    Escape character data.

    Replaces `&`, `<` and `>` with the entities `&amp;`, `&lt;` and `&gt;`
    and returns the resulting string. Ampersands are substituted through
    `RE_AMP` rather than a plain `str.replace`.
    """
    if "&" in text:
        # Only replace & when not part of an entity
        # NOTE(review): relies on `RE_AMP` matching only bare ampersands - confirm in `serializers`.
        text = RE_AMP.sub('&amp;', text)
    if "<" in text:
        text = text.replace("<", "&lt;")
    if ">" in text:
        text = text.replace(">", "&gt;")
    return text
129+
130+
def run_postprocessors(text: str, md: Markdown) -> str:
    """
    Run postprocessors from Markdown instance on text.

    Each item of `md.postprocessors` is applied in iteration order, the output
    of one `run` call being the input of the next. The final string is
    returned with leading and trailing whitespace stripped.
    """
    for pp in md.postprocessors:
        text = pp.run(text)
    return text.strip()
136+
137+
def render_inner_html(el: etree.Element, md: Markdown) -> str:
    """
    Fully render inner html of an `etree` element as a string.

    The element is serialized with `md.serializer`, passed through
    `unescape`, cut down to what lies between the first `>` and the last `<`
    (i.e. without the element's own tags), and finally handed to
    `run_postprocessors`.

    Raises `ValueError` (from `str.index`/`str.rindex`) if the serialized
    text contains no `>` or no `<`.
    """
    # The `UnescapeTreeprocessor` runs after `toc` extension so run here.
    text = unescape(md.serializer(el))

    # strip parent tag
    start = text.index('>') + 1
    end = text.rindex('<')
    text = text[start:end].strip()

    return run_postprocessors(text, md)
149+
150+
def remove_fnrefs(root: etree.Element) -> etree.Element:
    """
    Remove footnote references from a copy of the element, if any are present.

    When `root` contains no `sup` element at all, `root` itself is returned
    untouched. Otherwise a deep copy is made and every `sup` whose `id` starts
    with `fnref` is removed from the copy; the text that followed a removed
    reference (its `tail`) is re-attached to the preceding sibling, or to the
    parent's `text` when there is no preceding sibling. The passed-in element
    is never modified.
    """
    # Remove footnote references, which look like this: `<sup id="fnref:1">...</sup>`.
    # If there are no `sup` elements, then nothing to do.
    if next(root.iter('sup'), None) is None:
        return root
    root = deepcopy(root)
    # Find parent elements that contain `sup` elements.
    for parent in root.findall('.//sup/..'):
        carry_text = ""
        for child in reversed(parent):  # Reversed for the ability to mutate during iteration.
            # Remove matching footnote references but carry any `tail` text to preceding elements.
            if child.tag == 'sup' and child.get('id', '').startswith('fnref'):
                # Prepend, because iteration runs right-to-left.
                carry_text = f'{child.tail or ""}{carry_text}'
                parent.remove(child)
            elif carry_text:
                child.tail = f'{child.tail or ""}{carry_text}'
                carry_text = ""
        if carry_text:
            # No surviving sibling precedes the removed reference(s).
            parent.text = f'{parent.text or ""}{carry_text}'
    return root
172+
173+
101174def nest_toc_tokens (toc_list ):
102175 """Given an unsorted list with errors and skips, return a nested one.
103176
@@ -300,27 +373,30 @@ def run(self, doc: etree.Element) -> None:
300373 for el in doc .iter ():
301374 if isinstance (el .tag , str ) and self .header_rgx .match (el .tag ):
302375 self .set_level (el )
303- text = get_name (el )
376+ innerhtml = render_inner_html (remove_fnrefs (el ), self .md )
377+ name = strip_tags (innerhtml )
304378
305379 # Do not override pre-existing ids
306380 if "id" not in el .attrib :
307- innertext = unescape (stashedHTML2text (text , self .md ))
308- el .attrib ["id" ] = unique (self .slugify (innertext , self .sep ), used_ids )
381+ el .attrib ["id" ] = unique (self .slugify (html .unescape (name ), self .sep ), used_ids )
382+
383+ data_toc_label = ''
384+ if 'data-toc-label' in el .attrib :
385+ data_toc_label = run_postprocessors (unescape (el .attrib ['data-toc-label' ]), self .md )
386+ # Overwrite name with sanitized value of `data-toc-label`.
387+ name = escape_cdata (strip_tags (data_toc_label ))
388+ # Remove the data-toc-label attribute as it is no longer needed
389+ del el .attrib ['data-toc-label' ]
309390
310391 if int (el .tag [- 1 ]) >= self .toc_top and int (el .tag [- 1 ]) <= self .toc_bottom :
311392 toc_tokens .append ({
312393 'level' : int (el .tag [- 1 ]),
313394 'id' : el .attrib ["id" ],
314- 'name' : unescape (stashedHTML2text (
315- code_escape (el .attrib .get ('data-toc-label' , text )),
316- self .md , strip_entities = False
317- ))
395+ 'name' : name ,
396+ 'html' : innerhtml ,
397+ 'data-toc-label' : data_toc_label
318398 })
319399
320- # Remove the data-toc-label attribute as it is no longer needed
321- if 'data-toc-label' in el .attrib :
322- del el .attrib ['data-toc-label' ]
323-
324400 if self .use_anchors :
325401 self .add_anchor (el , el .attrib ["id" ])
326402 if self .use_permalinks not in [False , None ]:
0 commit comments