Skip to content

Commit 66db0d1

Browse files
committed
Refactor _do_links and simplify link handling logic
Removed a whole bunch of duplication and tried to make the function flow much easier to understand
1 parent 1f314c1 commit 66db0d1

File tree

1 file changed

+85
-116
lines changed

1 file changed

+85
-116
lines changed

lib/markdown2.py

Lines changed: 85 additions & 116 deletions
Original file line number | Diff line number | Diff line change
@@ -1524,7 +1524,8 @@ def _do_links(self, text: str) -> str:
15241524
anchor_allowed_pos = 0
15251525

15261526
curr_pos = 0
1527-
while True: # Handle the next link.
1527+
1528+
while True:
15281529
# The next '[' is the start of:
15291530
# - an inline anchor: [text](url "title")
15301531
# - a reference anchor: [text][id]
@@ -1552,8 +1553,11 @@ def _do_links(self, text: str) -> str:
15521553
# matching brackets in img alt text -- we'll differ in that
15531554
# regard.
15541555
bracket_depth = 0
1555-
for p in range(start_idx+1, min(start_idx+MAX_LINK_TEXT_SENTINEL,
1556-
text_length)):
1556+
1557+
for p in range(
1558+
start_idx + 1,
1559+
min(start_idx + MAX_LINK_TEXT_SENTINEL, text_length)
1560+
):
15571561
ch = text[p]
15581562
if ch == ']':
15591563
bracket_depth -= 1
@@ -1566,7 +1570,7 @@ def _do_links(self, text: str) -> str:
15661570
# This isn't markup.
15671571
curr_pos = start_idx + 1
15681572
continue
1569-
link_text = text[start_idx+1:p]
1573+
link_text = text[start_idx + 1: p]
15701574

15711575
# Fix for issue 341 - Injecting XSS into link text
15721576
if self.safe_mode:
@@ -1578,72 +1582,30 @@ def _do_links(self, text: str) -> str:
15781582
normed_id = re.sub(r'\W', '-', link_text[1:])
15791583
if normed_id in self.footnotes:
15801584
self.footnote_ids.append(normed_id)
1581-
result = '<sup class="footnote-ref" id="fnref-%s">' \
1582-
'<a href="#fn-%s">%s</a></sup>' \
1583-
% (normed_id, normed_id, len(self.footnote_ids))
1585+
result = (
1586+
f'<sup class="footnote-ref" id="fnref-{normed_id}">'
1587+
f'<a href="#fn-{normed_id}">{len(self.footnote_ids)}</a></sup>'
1588+
)
15841589
text = text[:start_idx] + result + text[p+1:]
15851590
else:
15861591
# This id isn't defined, leave the markup alone.
1587-
curr_pos = p+1
1592+
curr_pos = p + 1
15881593
continue
15891594

15901595
# Now determine what this is by the remainder.
15911596
p += 1
15921597

1593-
# Inline anchor or img?
1594-
if text[p:p + 1] == '(': # attempt at perf improvement
1595-
url, title, url_end_idx = self._extract_url_and_title(text, p)
1596-
if url is not None:
1597-
# Handle an inline anchor or img.
1598-
is_img = start_idx > 0 and text[start_idx-1] == "!"
1599-
if is_img:
1600-
start_idx -= 1
1598+
# -- Extract the URL, title and end index from the link
16011599

1602-
# We've got to encode these to avoid conflicting
1603-
# with italics/bold.
1604-
url = self._unhash_html_spans(url, code=True) \
1605-
.replace('*', self._escape_table['*']) \
1606-
.replace('_', self._escape_table['_'])
1607-
if title:
1608-
title_str = ' title="%s"' % (
1609-
_xml_escape_attr(title)
1610-
.replace('*', self._escape_table['*'])
1611-
.replace('_', self._escape_table['_']))
1612-
else:
1613-
title_str = ''
1614-
if is_img:
1615-
img_class_str = self._html_class_str_from_tag("img")
1616-
result = '<img src="%s" alt="%s"%s%s%s' \
1617-
% (self._protect_url(url),
1618-
self._hash_span(_xml_escape_attr(link_text)),
1619-
title_str,
1620-
img_class_str,
1621-
self.empty_element_suffix)
1622-
if "smarty-pants" in self.extras:
1623-
result = result.replace('"', self._escape_table['"'])
1624-
curr_pos = start_idx + len(result)
1625-
anchor_allowed_pos = start_idx + len(result)
1626-
text = text[:start_idx] + result + text[url_end_idx:]
1627-
elif start_idx >= anchor_allowed_pos:
1628-
safe_link = self._safe_href.match(url)
1629-
if self.safe_mode and not safe_link:
1630-
result_head = '<a href="#"%s>' % (title_str)
1631-
else:
1632-
result_head = '<a href="{}"{}>'.format(self._protect_url(url), title_str)
1633-
result = '{}{}</a>'.format(result_head, link_text)
1634-
if "smarty-pants" in self.extras:
1635-
result = result.replace('"', self._escape_table['"'])
1636-
# <img> allowed from curr_pos on, <a> from
1637-
# anchor_allowed_pos on.
1638-
curr_pos = start_idx + len(result_head)
1639-
anchor_allowed_pos = start_idx + len(result)
1640-
text = text[:start_idx] + result + text[url_end_idx:]
1641-
else:
1642-
# Anchor not allowed here.
1643-
curr_pos = start_idx + 1
1600+
# inline anchor or inline img
1601+
if text[p:p + 1] == '(':
1602+
url, title, url_end_idx = self._extract_url_and_title(text, p)
1603+
if url is None:
1604+
# text isn't markup
1605+
curr_pos = start_idx + 1
16441606
continue
1645-
1646-
# Reference anchor or img?
1607+
url = self._unhash_html_spans(url, code=True)
1608+
# reference anchor or reference img
16471609
else:
16481610
match = None
16491611
if 'link-shortrefs' in self.extras:
@@ -1662,64 +1624,71 @@ def _do_links(self, text: str) -> str:
16621624
match = None
16631625

16641626
match = match or self._tail_of_reference_link_re.match(text, p)
1665-
if match:
1666-
# Handle a reference-style anchor or img.
1667-
is_img = start_idx > 0 and text[start_idx-1] == "!"
1668-
if is_img:
1669-
start_idx -= 1
1670-
link_id = match.group("id").lower()
1671-
if not link_id:
1672-
link_id = link_text.lower() # for links like [this][]
1673-
if link_id in self.urls:
1674-
url = self.urls[link_id]
1675-
# We've got to encode these to avoid conflicting
1676-
# with italics/bold.
1677-
url = url.replace('*', self._escape_table['*']) \
1678-
.replace('_', self._escape_table['_'])
1679-
title = self.titles.get(link_id)
1680-
if title:
1681-
title = _xml_escape_attr(title) \
1682-
.replace('*', self._escape_table['*']) \
1683-
.replace('_', self._escape_table['_'])
1684-
title_str = ' title="%s"' % title
1685-
else:
1686-
title_str = ''
1687-
if is_img:
1688-
img_class_str = self._html_class_str_from_tag("img")
1689-
result = '<img src="%s" alt="%s"%s%s%s' \
1690-
% (self._protect_url(url),
1691-
self._hash_span(_xml_escape_attr(link_text)),
1692-
title_str,
1693-
img_class_str,
1694-
self.empty_element_suffix)
1695-
if "smarty-pants" in self.extras:
1696-
result = result.replace('"', self._escape_table['"'])
1697-
curr_pos = start_idx + len(result)
1698-
text = text[:start_idx] + result + text[match.end():]
1699-
elif start_idx >= anchor_allowed_pos:
1700-
if self.safe_mode and not self._safe_href.match(url):
1701-
result_head = '<a href="#"%s>' % (title_str)
1702-
else:
1703-
result_head = '<a href="{}"{}>'.format(self._protect_url(url), title_str)
1704-
result = '{}{}</a>'.format(result_head, link_text)
1705-
if "smarty-pants" in self.extras:
1706-
result = result.replace('"', self._escape_table['"'])
1707-
# <img> allowed from curr_pos on, <a> from
1708-
# anchor_allowed_pos on.
1709-
curr_pos = start_idx + len(result_head)
1710-
anchor_allowed_pos = start_idx + len(result)
1711-
text = text[:start_idx] + result + text[match.end():]
1712-
else:
1713-
# Anchor not allowed here.
1714-
curr_pos = start_idx + 1
1715-
else:
1716-
# This id isn't defined, leave the markup alone.
1717-
# set current pos to end of link title and continue from there
1718-
curr_pos = p
1627+
if not match:
1628+
# text isn't markup
1629+
curr_pos = start_idx + 1
17191630
continue
17201631

1721-
# Otherwise, it isn't markup.
1722-
curr_pos = start_idx + 1
1632+
link_id = match.group("id").lower() or link_text.lower() # for links like [this][]
1633+
1634+
if link_id not in self.urls:
1635+
# This id isn't defined, leave the markup alone.
1636+
# set current pos to end of link title and continue from there
1637+
curr_pos = p
1638+
continue
1639+
1640+
url = self.urls[link_id]
1641+
title = self.titles.get(link_id)
1642+
url_end_idx = match.end()
1643+
1644+
# -- Encode and hash the URL and title to avoid conflicts with italics/bold
1645+
1646+
url = (
1647+
url
1648+
.replace('*', self._escape_table['*'])
1649+
.replace('_', self._escape_table['_'])
1650+
)
1651+
if title:
1652+
title = (
1653+
_xml_escape_attr(title)
1654+
.replace('*', self._escape_table['*'])
1655+
.replace('_', self._escape_table['_'])
1656+
)
1657+
title_str = f' title="{title}"'
1658+
else:
1659+
title_str = ''
1660+
1661+
# -- Process the anchor/image
1662+
1663+
is_img = start_idx > 0 and text[start_idx-1] == "!"
1664+
if is_img:
1665+
start_idx -= 1
1666+
img_class_str = self._html_class_str_from_tag("img")
1667+
result = result_head = (
1668+
f'<img src="{self._protect_url(url)}"'
1669+
f' alt="{self._hash_span(_xml_escape_attr(link_text))}"'
1670+
f'{title_str}{img_class_str}{self.empty_element_suffix}'
1671+
)
1672+
elif start_idx >= anchor_allowed_pos:
1673+
if self.safe_mode and not self._safe_href.match(url):
1674+
result_head = f'<a href="#"{title_str}>'
1675+
else:
1676+
result_head = f'<a href="{self._protect_url(url)}"{title_str}>'
1677+
result = f'{result_head}{link_text}</a>'
1678+
else:
1679+
# anchor not allowed here/invalid markup
1680+
curr_pos = start_idx + 1
1681+
continue
1682+
1683+
if "smarty-pants" in self.extras:
1684+
result = result.replace('"', self._escape_table['"'])
1685+
1686+
# <img> allowed from curr_pos onwards, <a> allowed from anchor_allowed_pos onwards.
1687+
# this means images can exist within `<a>` tags but anchors can only come after the
1688+
# current anchor has been closed
1689+
curr_pos = start_idx + len(result_head)
1690+
anchor_allowed_pos = start_idx + len(result)
1691+
text = text[:start_idx] + result + text[url_end_idx:]
17231692

17241693
return text
17251694

0 commit comments

Comments
 (0)