diff --git a/CHANGES.rst b/CHANGES.rst
index cd36d83957b..01d23b6fb26 100644
--- a/CHANGES.rst
+++ b/CHANGES.rst
@@ -48,6 +48,11 @@ Features added
 * #13439: linkcheck: Permit warning on every redirect with
   ``linkcheck_allowed_redirects = {}``.
   Patch by Adam Turner and James Addison.
+* #14046: linkcheck: Add :confval:`linkcheck_case_insensitive` configuration to
+  allow case-insensitive URL comparison for specific URL patterns.
+  This is useful for links to websites that normalise URL casing (for example,
+  GitHub) or case-insensitive servers.
+  Patch by Fazeel Usmani.
 * #13497: Support C domain objects in the table of contents.
 * #13500: LaTeX: add support for ``fontawesome6`` package.
   Patch by Jean-François B.
diff --git a/doc/usage/configuration.rst b/doc/usage/configuration.rst
index ff903fa4f6c..151e7a011d5 100644
--- a/doc/usage/configuration.rst
+++ b/doc/usage/configuration.rst
@@ -3813,6 +3813,39 @@ and the number of workers to use.
 
    .. versionadded:: 7.3
 
+.. confval:: linkcheck_case_insensitive
+   :type: :code-py:`list` of :code-py:`str`
+   :default: :code-py:`[]`
+
+   A list of regular expressions that match URLs for which the *linkcheck*
+   builder should perform case-insensitive comparisons. This is useful for
+   links to websites that normalise URL casing (for example, GitHub) or
+   servers that are case-insensitive (for example, Windows-based servers).
+
+   By default, *linkcheck* requires the destination URL to match the
+   documented URL case-sensitively. For example, a link to
+   ``http://example.com/PATH`` that redirects to ``http://example.com/path``
+   will be reported as ``redirected``.
+
+   If the documented URL matches a pattern in this list, such redirects will
+   instead be reported as ``working``.
+   Patterns are matched from the start of the URL (as with ``re.match``).
+
+   For example, to treat all GitHub URLs as case-insensitive:
+
+   .. code-block:: python
+
+      linkcheck_case_insensitive = [
+          r'https://github\.com/.*',
+      ]
+
+   .. note::
+
+      HTML anchor checking is always case-sensitive and is not affected by
+      this setting.
+
+   .. versionadded:: 8.2
+
 .. confval:: linkcheck_rate_limit_timeout
    :type: :code-py:`int`
    :default: :code-py:`300`
diff --git a/sphinx/builders/linkcheck.py b/sphinx/builders/linkcheck.py
index d3ce638fea4..037fb23e642 100644
--- a/sphinx/builders/linkcheck.py
+++ b/sphinx/builders/linkcheck.py
@@ -409,6 +409,9 @@ def __init__(
         self.user_agent = config.user_agent
         self.tls_verify = config.tls_verify
         self.tls_cacerts = config.tls_cacerts
+        self.case_insensitive_patterns: list[re.Pattern[str]] = list(
+            map(re.compile, config.linkcheck_case_insensitive)
+        )
 
         self._session = requests._Session(
             _ignored_redirects=tuple(map(re.compile, config.linkcheck_ignore))
@@ -629,8 +632,28 @@ def _check_uri(self, uri: str, hyperlink: Hyperlink) -> _URIProperties:
         netloc = urlsplit(req_url).netloc
         self.rate_limits.pop(netloc, None)
 
+        # Check if URL should be compared case-insensitively based on patterns
+        is_case_insensitive = any(
+            pattern.match(req_url) for pattern in self.case_insensitive_patterns
+        )
+
+        # Compare URLs, optionally case-insensitively
+        def _normalise_url(url: str) -> str:
+            """Reduces a URL to a normal/equality-comparable form."""
+            normalised_url = url.rstrip('/')
+            if is_case_insensitive:
+                # Only casefold the URL before the fragment; fragments are
+                # case-sensitive. partition() leaves the separator and fragment
+                # empty when the URL has no '#', so one code path covers both.
+                url_part, separator, fragment = normalised_url.partition('#')
+                normalised_url = url_part.casefold() + separator + fragment
+            return normalised_url
+
+        normalised_request_url = _normalise_url(req_url)
+        normalised_response_url = _normalise_url(response_url)
+
         if (
-            (response_url.rstrip('/') == req_url.rstrip('/'))
+            normalised_request_url == normalised_response_url
             or _allowed_redirect(req_url, response_url, self.allowed_redirects)
         ):  # fmt: skip
             return _Status.WORKING, '', 0
@@ -816,6 +839,9 @@ def setup(app: Sphinx) -> ExtensionMetadata:
     app.add_config_value(
         'linkcheck_report_timeouts_as_broken', False, '', types=frozenset({bool})
     )
+    app.add_config_value(
+        'linkcheck_case_insensitive', [], '', types=frozenset({list, tuple})
+    )
 
     app.add_event('linkcheck-process-uri')
 
diff --git a/tests/roots/test-linkcheck-case-check/conf.py b/tests/roots/test-linkcheck-case-check/conf.py
new file mode 100644
index 00000000000..71319b6d4a5
--- /dev/null
+++ b/tests/roots/test-linkcheck-case-check/conf.py
@@ -0,0 +1 @@
+# Empty config for linkcheck case sensitivity tests
diff --git a/tests/roots/test-linkcheck-case-check/index.rst b/tests/roots/test-linkcheck-case-check/index.rst
new file mode 100644
index 00000000000..3a0c282ab66
--- /dev/null
+++ b/tests/roots/test-linkcheck-case-check/index.rst
@@ -0,0 +1,3 @@
+`path1 <http://localhost:7777/path1>`_
+
+`path2 <http://localhost:7777/path2>`_
diff --git a/tests/test_builders/test_build_linkcheck.py b/tests/test_builders/test_build_linkcheck.py
index a09a4a42216..778f12c3a20 100644
--- a/tests/test_builders/test_build_linkcheck.py
+++ b/tests/test_builders/test_build_linkcheck.py
@@ -1439,3 +1439,105 @@ def test_linkcheck_exclude_documents(app: SphinxTestApp) -> None:
         'uri': 'https://www.sphinx-doc.org/this-is-another-broken-link',
         'info': 'br0ken_link matched br[0-9]ken_link from linkcheck_exclude_documents',
     } in content
+
+
+class CapitalisePathHandler(BaseHTTPRequestHandler):
+    """Test server that capitalises URL paths via redirects.
+
+    Lowercase paths (``/path``) are redirected to their capitalised form
+    (``/Path``), capitalised paths are answered with ``200 OK``, and any
+    other path yields ``404 Not Found``.
+    """
+
+    protocol_version = 'HTTP/1.1'
+
+    def do_HEAD(self):
+        # Same routing as GET, but never send a body
+        self._respond(send_body=False)
+
+    def do_GET(self):
+        self._respond(send_body=True)
+
+    def _respond(self, *, send_body: bool) -> None:
+        """Route the request; shared by ``do_HEAD`` and ``do_GET``."""
+        path = self.path
+        has_name = path.startswith('/') and len(path) > 1
+        content = b''
+        if has_name and path[1:].islower():
+            # Redirect lowercase paths to capitalised versions
+            self.send_response(301, 'Moved Permanently')
+            self.send_header('Location', '/' + path[1:].capitalize())
+        elif has_name and path[1].isupper() and path[2:].islower():
+            # Serve capitalised paths; only GET carries a body
+            self.send_response(200, 'OK')
+            if send_body:
+                content = b'ok\n\n'
+        else:
+            self.send_response(404, 'Not Found')
+        self.send_header('Content-Length', str(len(content)))
+        self.end_headers()
+        if content:
+            self.wfile.write(content)
+
+
+@pytest.mark.sphinx(
+    'linkcheck',
+    testroot='linkcheck-case-check',
+    freshenv=True,
+)
+def test_linkcheck_case_sensitive(app: SphinxTestApp) -> None:
+    """Test that case-sensitive checking is the default behavior."""
+    with serve_application(app, CapitalisePathHandler) as address:
+        app.build()
+
+    content = (app.outdir / 'output.json').read_text(encoding='utf8')
+    rows = [json.loads(x) for x in content.splitlines()]
+    rowsby = {row['uri']: row for row in rows}
+
+    # With case-sensitive checking (default), URLs that redirect to different case
+    # should be marked as redirected
+    assert rowsby[f'http://{address}/path1']['status'] == 'redirected'
+    assert rowsby[f'http://{address}/path2']['status'] == 'redirected'
+
+
+@pytest.mark.sphinx(
+    'linkcheck',
+    testroot='linkcheck-case-check',
+    freshenv=True,
+    confoverrides={'linkcheck_case_insensitive': [r'http://localhost:\d+/.*']},
+)
+def test_linkcheck_case_insensitive(app: SphinxTestApp) -> None:
+    """Test that URLs matching linkcheck_case_insensitive patterns ignore case differences."""
+    with serve_application(app, CapitalisePathHandler) as address:
+        app.build()
+
+    content = (app.outdir / 'output.json').read_text(encoding='utf8')
+    rows = [json.loads(x) for x in content.splitlines()]
+    rowsby = {row['uri']: row for row in rows}
+
+    # With case-insensitive pattern matching, URLs that differ only in case
+    # should be marked as working
+    assert rowsby[f'http://{address}/path1']['status'] == 'working'
+    assert rowsby[f'http://{address}/path2']['status'] == 'working'
+
+
+@pytest.mark.sphinx(
+    'linkcheck',
+    testroot='linkcheck-case-check',
+    freshenv=True,
+    confoverrides={'linkcheck_case_insensitive': [r'http://localhost:\d+/path1']},
+)
+def test_linkcheck_mixed_case_sensitivity(app: SphinxTestApp) -> None:
+    """Test both case-sensitive and case-insensitive checking in one test."""
+    with serve_application(app, CapitalisePathHandler) as address:
+        app.build()
+
+    content = (app.outdir / 'output.json').read_text(encoding='utf8')
+    rows = [json.loads(x) for x in content.splitlines()]
+    rowsby = {row['uri']: row for row in rows}
+
+    # path1 matches case-insensitive pattern → should be 'working'
+    assert rowsby[f'http://{address}/path1']['status'] == 'working'
+
+    # path2 doesn't match pattern → should be 'redirected'
+    assert rowsby[f'http://{address}/path2']['status'] == 'redirected'