Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 27 additions & 0 deletions doc/usage/configuration.rst
Original file line number Diff line number Diff line change
Expand Up @@ -3813,6 +3813,33 @@ and the number of workers to use.

.. versionadded:: 7.3

.. confval:: linkcheck_ignore_case
:type: :code-py:`bool`
:default: :code-py:`False`

When :code-py:`True`, the *linkcheck* builder compares URLs
and anchors case-insensitively during validation.
This is useful for checking links hosted on case-insensitive servers
(for example, some web servers or hosting platforms),
which may respond with a URL whose letter case differs from that of the original link.

When this option is enabled:

* The requested URL and the URL of the final response are compared
case-insensitively as whole strings
(e.g., ``/Path`` and ``/path`` are considered equal)
* HTML anchors are compared case-insensitively
(e.g., ``#MyAnchor`` and ``#myanchor`` are considered equal)

By default, this option is disabled and checking is case-sensitive.

Example:

.. code-block:: python

linkcheck_ignore_case = True

.. versionadded:: 8.2

.. confval:: linkcheck_rate_limit_timeout
:type: :code-py:`int`
:default: :code-py:`300`
Expand Down
36 changes: 28 additions & 8 deletions sphinx/builders/linkcheck.py
Original file line number Diff line number Diff line change
Expand Up @@ -409,6 +409,7 @@ def __init__(
self.user_agent = config.user_agent
self.tls_verify = config.tls_verify
self.tls_cacerts = config.tls_cacerts
self.ignore_case = config.linkcheck_ignore_case

self._session = requests._Session(
_ignored_redirects=tuple(map(re.compile, config.linkcheck_ignore))
Expand Down Expand Up @@ -545,7 +546,9 @@ def _check_uri(self, uri: str, hyperlink: Hyperlink) -> _URIProperties:
) as response:
if anchor and self.check_anchors and response.ok:
try:
found = contains_anchor(response, anchor)
found = contains_anchor(
response, anchor, ignore_case=self.ignore_case
)
except UnicodeDecodeError:
return (
_Status.IGNORED,
Expand Down Expand Up @@ -629,8 +632,16 @@ def _check_uri(self, uri: str, hyperlink: Hyperlink) -> _URIProperties:
netloc = urlsplit(req_url).netloc
self.rate_limits.pop(netloc, None)

# Compare URLs, optionally case-insensitively
response_url_stripped = response_url.rstrip('/')
req_url_stripped = req_url.rstrip('/')
if self.ignore_case:
urls_match = response_url_stripped.lower() == req_url_stripped.lower()
else:
urls_match = response_url_stripped == req_url_stripped

if (
(response_url.rstrip('/') == req_url.rstrip('/'))
urls_match
or _allowed_redirect(req_url, response_url, self.allowed_redirects)
): # fmt: skip
return _Status.WORKING, '', 0
Expand Down Expand Up @@ -695,9 +706,11 @@ def _get_request_headers(
return {}


def contains_anchor(response: Response, anchor: str) -> bool:
def contains_anchor(
response: Response, anchor: str, *, ignore_case: bool = False
) -> bool:
"""Determine if an anchor is contained within an HTTP response."""
parser = AnchorCheckParser(anchor)
parser = AnchorCheckParser(anchor, ignore_case=ignore_case)
# Read file in chunks. If we find a matching anchor, we break
# the loop early in hopes not to have to download the whole thing.
for chunk in response.iter_content(chunk_size=4096, decode_unicode=True):
Expand All @@ -715,17 +728,23 @@ def contains_anchor(response: Response, anchor: str) -> bool:
class AnchorCheckParser(HTMLParser):
    """Specialised HTML parser that looks for a specific anchor.

    Feed HTML to the parser and inspect :attr:`found` afterwards: it becomes
    ``True`` once a start tag carries an ``id`` or ``name`` attribute whose
    value equals *search_anchor*.

    :param search_anchor: The anchor (fragment identifier) to look for.
    :param ignore_case: When true, attribute values and *search_anchor* are
        compared after lower-casing both sides.
    """

    def __init__(self, search_anchor: str, *, ignore_case: bool = False) -> None:
        super().__init__()

        self.search_anchor = search_anchor
        self.ignore_case = ignore_case
        self.found = False

    def handle_starttag(self, tag: Any, attrs: Any) -> None:
        """Set :attr:`found` if any ``id``/``name`` attribute matches the anchor."""
        wanted = self.search_anchor
        if self.ignore_case:
            wanted = wanted.lower()

        for key, value in attrs:
            # HTMLParser reports valueless attributes (e.g. ``<a id>``) with
            # a value of None; such an attribute can never match an anchor,
            # and calling ``.lower()`` on it would raise AttributeError.
            if key not in {'id', 'name'} or value is None:
                continue
            candidate = value.lower() if self.ignore_case else value
            if candidate == wanted:
                self.found = True
                break


def _allowed_redirect(
Expand Down Expand Up @@ -816,6 +835,7 @@ def setup(app: Sphinx) -> ExtensionMetadata:
app.add_config_value(
'linkcheck_report_timeouts_as_broken', False, '', types=frozenset({bool})
)
app.add_config_value('linkcheck_ignore_case', False, '', types=frozenset({bool}))

app.add_event('linkcheck-process-uri')

Expand Down
2 changes: 1 addition & 1 deletion sphinx/transforms/i18n.py
Original file line number Diff line number Diff line change
Expand Up @@ -415,7 +415,7 @@ def apply(self, **kwargs: Any) -> None:
# There is no point in having noqa on literal blocks because
# they cannot contain references. Recognizing it would just
# completely prevent escaping the noqa. Outside of literal
# blocks, one can always write \#noqa.
# blocks, one can always write \\#noqa.
if not isinstance(node, LITERAL_TYPE_NODES):
msgstr, _ = parse_noqa(msgstr)

Expand Down
2 changes: 1 addition & 1 deletion tests/roots/test-root/wrongenc.inc
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
This file is encoded in latin-1 but at first read as utf-8.

Max Strauß aß in München eine Leberkässemmel.
Max Strauß aß in München eine Leberkässemmel.
2 changes: 1 addition & 1 deletion tests/roots/test-warnings/wrongenc.inc
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
This file is encoded in latin-1 but at first read as utf-8.

Max Strauß aß in München eine Leberkässemmel.
Max Strauß aß in München eine Leberkässemmel.
3 changes: 1 addition & 2 deletions tests/test_builders/test_build_html_numfig.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,7 @@
from sphinx.testing.util import SphinxTestApp


@pytest.mark.sphinx('html', testroot='numfig')
@pytest.mark.test_params(shared_result='test_build_html_numfig')
@pytest.mark.sphinx('html', testroot='numfig', freshenv=True)
def test_numfig_disabled_warn(app: SphinxTestApp) -> None:
app.build()
warnings = app.warning.getvalue()
Expand Down
143 changes: 143 additions & 0 deletions tests/test_builders/test_build_linkcheck.py
Original file line number Diff line number Diff line change
Expand Up @@ -1439,3 +1439,146 @@ def test_linkcheck_exclude_documents(app: SphinxTestApp) -> None:
'uri': 'https://www.sphinx-doc.org/this-is-another-broken-link',
'info': 'br0ken_link matched br[0-9]ken_link from linkcheck_exclude_documents',
} in content


class CaseSensitiveHandler(BaseHTTPRequestHandler):
    """Minimal HTTP handler backing the ``linkcheck_ignore_case`` tests.

    Two resources are served; every other path yields ``404 Not Found``:

    * ``/path`` -- a short plain body;
    * ``/anchor.html`` -- an HTML page whose only anchor is ``id="MyAnchor"``.

    Request paths are matched exactly (case-sensitively).  The handler itself
    never redirects and never alters the case of a URL; tests that need a
    response URL with different case patch the HTTP session instead.
    """

    protocol_version = 'HTTP/1.1'

    # Response bodies keyed by the exact request path.
    _pages = {
        '/path': b'ok\n\n',
        '/anchor.html': (
            b'<!DOCTYPE html><html><body><a id="MyAnchor"></a></body></html>'
        ),
    }

    def _respond(self, *, include_body: bool) -> None:
        """Send the response for ``self.path``, with or without its body."""
        page = self._pages.get(self.path)
        if page is None:
            self.send_response(404, 'Not Found')
            self.send_header('Content-Length', '0')
            self.end_headers()
            return

        # HEAD responses advertise a zero-length body and send none.
        payload = page if include_body else b''
        self.send_response(200, 'OK')
        self.send_header('Content-Length', str(len(payload)))
        self.end_headers()
        if payload:
            self.wfile.write(payload)

    def do_HEAD(self) -> None:
        """Answer HEAD requests: status and headers only."""
        self._respond(include_body=False)

    def do_GET(self) -> None:
        """Answer GET requests: status, headers and the page body."""
        self._respond(include_body=True)


@pytest.mark.sphinx(
    'linkcheck',
    testroot='linkcheck-localserver',
    # Private source directory: this test rewrites index.rst, and that change
    # must not leak into other tests that use the same test root.
    srcdir='linkcheck_case_sensitive',
    freshenv=True,
    confoverrides={'linkcheck_ignore_case': False},
)
def test_linkcheck_case_sensitive(app: SphinxTestApp) -> None:
    """With ``linkcheck_ignore_case=False`` a case-only URL change is a redirect."""
    from unittest.mock import patch

    original_request = requests._Session.request

    def mock_request(self, method, url, **kwargs):
        # Simulate a case-insensitive server: the request is performed
        # normally, then the reported response URL has its path upper-cased.
        response = original_request(self, method, url, **kwargs)
        if '/path' in str(response.url).lower():
            response.url = str(response.url).replace('/path', '/PATH')
        return response

    with serve_application(app, CaseSensitiveHandler) as address:
        lowercase_uri = f'http://{address}/path'
        # Make sure the checked document really contains the link under test;
        # otherwise the assertions below would have nothing to look at.
        index = app.srcdir / 'index.rst'
        index.write_text(
            f'* `Link to path <{lowercase_uri}>`_\n',
            encoding='utf-8',
        )
        with patch.object(requests._Session, 'request', mock_request):
            app.build()

    content = (app.outdir / 'output.json').read_text(encoding='utf8')
    rows = [json.loads(x) for x in content.splitlines()]
    rowsby = {row['uri']: row for row in rows}

    # The link must have been checked -- fail loudly instead of silently
    # skipping the status assertion when it is missing from the report.
    assert lowercase_uri in rowsby
    # Case differs between request and response URL, so with case-sensitive
    # comparison the link is reported as redirected.
    assert rowsby[lowercase_uri]['status'] == 'redirected'


@pytest.mark.sphinx(
    'linkcheck',
    testroot='linkcheck-localserver',
    # Private source directory: this test rewrites index.rst, and that change
    # must not leak into other tests that use the same test root.
    srcdir='linkcheck_case_insensitive',
    freshenv=True,
    confoverrides={'linkcheck_ignore_case': True},
)
def test_linkcheck_case_insensitive(app: SphinxTestApp) -> None:
    """Test that linkcheck_ignore_case=True ignores case differences in URLs."""
    from unittest.mock import patch

    original_request = requests._Session.request

    def mock_request(self, method, url, **kwargs):
        # Simulate a case-insensitive server: the request is performed
        # normally, then the reported response URL has its path upper-cased.
        response = original_request(self, method, url, **kwargs)
        if '/path' in str(response.url).lower():
            response.url = str(response.url).replace('/path', '/PATH')
        return response

    with serve_application(app, CaseSensitiveHandler) as address:
        lowercase_uri = f'http://{address}/path'
        # Make sure the checked document really contains the link under test;
        # otherwise the assertions below would have nothing to look at.
        index = app.srcdir / 'index.rst'
        index.write_text(
            f'* `Link to path <{lowercase_uri}>`_\n',
            encoding='utf-8',
        )
        with patch.object(requests._Session, 'request', mock_request):
            app.build()

    content = (app.outdir / 'output.json').read_text(encoding='utf8')
    rows = [json.loads(x) for x in content.splitlines()]
    rowsby = {row['uri']: row for row in rows}

    # The link must have been checked -- fail loudly instead of silently
    # skipping the status assertion when it is missing from the report.
    assert lowercase_uri in rowsby
    # Request and response URL differ only in case, which is ignored here,
    # so the link is reported as working.
    assert rowsby[lowercase_uri]['status'] == 'working'


@pytest.mark.sphinx(
    'linkcheck',
    testroot='linkcheck-localserver-anchor',
    freshenv=True,
    confoverrides={'linkcheck_ignore_case': True},
)
def test_linkcheck_anchors_case_insensitive(app: SphinxTestApp) -> None:
    """Check that anchors match regardless of case when linkcheck_ignore_case=True.

    The served page declares ``id="MyAnchor"`` while the document links to
    ``#myanchor``; the link is expected to be reported as working.
    """
    with serve_application(app, CaseSensitiveHandler) as address:
        # Replace the document with a single link using a lowercase anchor.
        source = f'* `Link with anchor <http://{address}/anchor.html#myanchor>`_\n'
        (app.srcdir / 'index.rst').write_text(source, encoding='utf-8')
        app.build()

    report = (app.outdir / 'output.json').read_text(encoding='utf8')
    rows = list(map(json.loads, report.splitlines()))

    # Exactly one link was checked, and it resolved despite the case mismatch.
    assert len(rows) == 1
    assert rows[0]['status'] == 'working'
    assert rows[0]['uri'] == f'http://{address}/anchor.html#myanchor'
18 changes: 15 additions & 3 deletions tests/test_command_line.py
Original file line number Diff line number Diff line change
Expand Up @@ -179,7 +179,11 @@ def test_make_mode_parse_arguments_pos_last(
with pytest.raises(SystemExit):
run_make_mode(args)
stderr = capsys.readouterr().err.splitlines()
assert stderr[-1].endswith('error: argument --builder/-b: expected one argument')
# Strip ANSI color codes before checking
import re

stderr_clean = re.sub(r'\x1b\[[0-9;]+m', '', stderr[-1])
assert stderr_clean.endswith('error: argument --builder/-b: expected one argument')


def test_make_mode_parse_arguments_pos_middle(
Expand All @@ -196,7 +200,11 @@ def test_make_mode_parse_arguments_pos_middle(
with pytest.raises(SystemExit):
run_make_mode(args)
stderr = capsys.readouterr().err.splitlines()
assert stderr[-1].endswith('error: argument --builder/-b: expected one argument')
# Strip ANSI color codes before checking
import re

stderr_clean = re.sub(r'\x1b\[[0-9;]+m', '', stderr[-1])
assert stderr_clean.endswith('error: argument --builder/-b: expected one argument')


@pytest.mark.xfail(
Expand Down Expand Up @@ -233,4 +241,8 @@ def test_make_mode_parse_arguments_pos_intermixed(
with pytest.raises(SystemExit):
run_make_mode(args)
stderr = capsys.readouterr().err.splitlines()
assert stderr[-1].endswith('error: argument --builder/-b: expected one argument')
# Strip ANSI color codes before checking
import re

stderr_clean = re.sub(r'\x1b\[[0-9;]+m', '', stderr[-1])
assert stderr_clean.endswith('error: argument --builder/-b: expected one argument')
Loading