Skip to content
143 changes: 94 additions & 49 deletions src/mkdocs_llmstxt/_internal/plugin.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,32 +122,17 @@ def on_page_content(self, html: str, *, page: Page, **kwargs: Any) -> str | None
page: The page object.
"""
if (src_uri := page.file.src_uri) in self._file_uris:
path_md = Path(page.file.abs_dest_path).with_suffix(".md")
page_md = _generate_page_markdown(
html,
should_autoclean=self.config.autoclean,
preprocess=self.config.preprocess,
path=str(path_md),
)
page_md = self._generate_page_markdown(html, page)

md_url = Path(page.file.dest_uri).with_suffix(".md").as_posix()
# Apply the same logic as in the `Page.url` property.
if md_url in (".", "./"):
md_url = ""

# Use `base_url` if it exists.
if self.config.base_url is not None:
base = cast("str", self.config.base_url)
else:
# Use `site_url`, which we assume to be always specified.
base = cast("str", self.mkdocs_config.site_url)
if not base.endswith("/"):
base += "/"
md_url = urljoin(base, md_url)
md_url = urljoin(self._get_base_url(), md_url)

self._md_pages[src_uri] = _MDPageInfo(
title=page.title if page.title is not None else src_uri,
path_md=path_md,
title=str(page.title) if page.title is not None else src_uri,
path_md=_get_page_md_path(page),
md_url=md_url,
content=page_md,
)
Expand Down Expand Up @@ -199,6 +184,96 @@ def on_post_build(self, *, config: MkDocsConfig, **kwargs: Any) -> None: # noqa
full_output_file.write_text(full_markdown, encoding="utf8")
_logger.debug(f"Generated file /{self.config.full_output}.txt")

def _generate_page_markdown(self, html: str, page: Page) -> str:
    """Render a page's HTML as formatted Markdown.

    The HTML is parsed, optionally autocleaned and preprocessed according to
    the plugin configuration, its links are passed through the absolute-link
    conversion, and the result is converted to Markdown and formatted with
    mdformat.

    Parameters:
        html: The HTML content.
        page: The page object.

    Returns:
        The Markdown content.
    """
    document = Soup(html, "html.parser")

    if self.config.autoclean:
        autoclean(document)

    preprocess_module = self.config.preprocess
    if preprocess_module:
        # The preprocess hook receives the output path of the Markdown file.
        _preprocess(document, preprocess_module, str(_get_page_md_path(page)))

    # Relative links are resolved against the site base URL and the
    # directory of the page being rendered.
    site_base = self._get_base_url()
    page_dir = _get_parent_directory(page.file.dest_uri)
    self._convert_to_absolute_links(document, site_base, page_dir)

    markdown = _converter.convert_soup(document)
    return mdformat.text(
        markdown,
        options={"wrap": "no"},
        extensions=("tables",),
    )

def _convert_to_absolute_links(self, soup: Soup, base_uri: str, current_dir: str) -> None:
    """Rewrite site-relative links in the HTML to absolute URLs, in place.

    Links left untouched: empty or non-string `href` values, pure anchor
    links (`#...`), links carrying a URL scheme (`https:`, `mailto:`, ...),
    protocol-relative links (`//host/...`), and malformed URLs.

    Every other link is resolved against `base_uri` (when it starts with
    `/`) or against the page's directory (otherwise). When the resolved
    *path* ends with `/`, `index.md` is appended to the path, keeping any
    query string or fragment intact.

    Parameters:
        soup: The soup to modify.
        base_uri: The base URI of the site.
        current_dir: The current directory of the page (relative to site root).
    """
    # Function-scope import: this block only relies on `urljoin` being
    # available at module level.
    from urllib.parse import urlsplit, urlunsplit

    # Base for page-relative links; loop-invariant, so computed once.
    relative_base = urljoin(base_uri, current_dir + "/") if current_dir else base_uri

    for link in soup.find_all("a", href=True):
        href = link.get("href")

        # Skip if href is not a string or is empty.
        if not isinstance(href, str) or not href:
            continue

        # Skip in-page anchor links.
        if href.startswith("#"):
            continue

        try:
            parts = urlsplit(href)
        except ValueError:
            # Malformed URL (e.g. broken IPv6 literal): leave it as written.
            continue

        # Skip links that already name a scheme (http:, mailto:, tel:, ...)
        # or a host (protocol-relative `//host/path`). Parsing the URL rather
        # than searching for ":" keeps relative links such as
        # `../page/#fn:1` eligible for conversion.
        if parts.scheme or parts.netloc:
            continue

        # A leading "/" means "from the site root"; anything else is
        # relative to the directory of the current page.
        final_href = urljoin(base_uri if href.startswith("/") else relative_base, href)

        # Directory paths point to the directory's index.md file. The check
        # is made on the path component so that `dir/#anchor` and `dir/?q=1`
        # are handled like `dir/`.
        resolved = urlsplit(final_href)
        if resolved.path.endswith("/"):
            final_href = urlunsplit(resolved._replace(path=resolved.path + "index.md"))

        link["href"] = final_href

def _get_base_url(self) -> str:
    """Return the base URL used to build absolute URLs.

    The plugin's `base_url` option is used when it is not `None`;
    otherwise the MkDocs `site_url` is used.

    Returns:
        The base URL, with a trailing `/` appended when missing.
    """
    configured = self.config.base_url
    if configured is None:
        # No explicit override: fall back to the site URL.
        url = cast("str", self.mkdocs_config.site_url)
    else:
        url = cast("str", configured)
    return url if url.endswith("/") else url + "/"


def _get_page_md_path(page: Page) -> Path:
    """Return the page's absolute destination path with a `.md` suffix.

    Parameters:
        page: The page object.

    Returns:
        The path built from `page.file.abs_dest_path`, suffix replaced by `.md`.
    """
    dest_path = Path(page.file.abs_dest_path)
    return dest_path.with_suffix(".md")


def _get_parent_directory(dest_uri: str) -> str:
    """Return the directory part of a destination URI.

    The result always uses forward slashes, since it is later joined into
    a URL, and is the empty string for anything located at the site root
    (including the special value `"."`).

    Parameters:
        dest_uri: The destination URI of a page, relative to the site root.

    Returns:
        The parent directory of `dest_uri`, or `""` for the site root.
    """
    # `as_posix()` rather than `str()`: on Windows `str(Path)` would produce
    # backslashes, which are not valid URL path separators.
    parent = Path(dest_uri).parent.as_posix()
    # pathlib spells "no parent directory" as "."; callers expect "".
    return "" if parent == "." else parent


def _language_callback(tag: Tag) -> str:
for css_class in chain(tag.get("class") or (), (tag.parent.get("class") or ()) if tag.parent else ()):
Expand All @@ -213,33 +288,3 @@ def _language_callback(tag: Tag) -> str:
escape_underscores=False,
heading_style=ATX,
)


def _generate_page_markdown(
    html: str,
    *,
    should_autoclean: bool,
    preprocess: str | None,
    path: str,
) -> str:
    """Render HTML as formatted Markdown.

    The HTML is parsed, optionally autocleaned and preprocessed, converted
    to Markdown, and finally formatted with mdformat.

    Parameters:
        html: The HTML content.
        should_autoclean: Whether to autoclean the HTML.
        preprocess: An optional path of a Python module containing a `preprocess` function.
        path: The output path of the relevant Markdown file.

    Returns:
        The Markdown content.
    """
    document = Soup(html, "html.parser")

    if should_autoclean:
        autoclean(document)

    if preprocess:
        # The preprocess hook receives the output path of the Markdown file.
        _preprocess(document, preprocess, path)

    markdown = _converter.convert_soup(document)
    return mdformat.text(
        markdown,
        options={"wrap": "no"},
        extensions=("tables",),
    )
13 changes: 13 additions & 0 deletions tests/test_plugin.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
"sections": {
"Index": ["index.md"],
"Usage": [{"page1.md": "Some usage docs."}],
"Links": [{"page2.md": "Page with links."}],
},
},
},
Expand All @@ -28,6 +29,7 @@
"pages": {
"index.md": "# Hello world",
"page1.md": "# Usage\n\nSome paragraph.",
"page2.md": "# Links\n\n[Relative link](../index.md)\n[Absolute link](/page1.md)\n[External link](https://example.com)\n[Anchor link](#section)",
},
},
],
Expand Down Expand Up @@ -56,3 +58,14 @@ def test_plugin(mkdocs_conf: MkDocsConfig) -> None:
page1md = Path(mkdocs_conf.site_dir, "page1/index.md")
assert page1md.exists()
assert "Some paragraph." in page1md.read_text()

# Test relative link conversion
page2md = Path(mkdocs_conf.site_dir, "page2/index.md")
assert page2md.exists()
page2md_content = page2md.read_text()

# Check that relative links are converted to absolute URLs
assert "https://example.org/en/0.1.34/index.md" in page2md_content # ../index.md converted
assert "https://example.org/page1.md" in page2md_content # /page1.md converted (absolute from domain root)
assert "https://example.com" in page2md_content # External link unchanged
assert "#section" in page2md_content # Anchor link unchanged