Skip to content

Commit 9f63040

Browse files
committed
Improve llms.txt formatting
1 parent 3d9b6b0 commit 9f63040

File tree

3 files changed

+85
-41
lines changed

3 files changed

+85
-41
lines changed

docs/other/build-llms-txt.ts

Lines changed: 55 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,10 @@
11
import { unified } from "unified";
22
import rehypeParse from "rehype-parse";
3+
import rehypeRemoveComments from "rehype-remove-comments";
34
import rehypeRemark from "rehype-remark";
45
import remarkStringify from "remark-stringify";
56
import remarkGfm from "remark-gfm";
7+
import { visitParents } from "unist-util-visit-parents";
68
import { basename, dirname, join, relative } from "node:path";
79
import { fileURLToPath } from "node:url";
810
import { mkdir, readdir, readFile, writeFile } from "node:fs/promises";
@@ -43,20 +45,55 @@ async function collectFiles(currentDir: string, baseDir: string): Promise<FileMa
4345
}
4446
}
4547

46-
const REGEX_PATTERNS = {
47-
multipleNewlines: /\n{3,}/g,
48-
bulletSpacing: /- \n\s+/g,
49-
multiLineBullets: /(- [^\n]*)(?:\n\s+([^\n-][^\n]*))/g,
50-
startLineSpaces: /(\n|^)[ \t]+\n/g,
51-
endLineSpaces: /\n[ \t]+($|\n)/g,
52-
inlineCodeBefore: /(\S+)\s*\n\s*(`[^`]+?`)/g,
53-
inlineCodeAfter: /(`[^`]+?`)\s*\n\s*(\S+)/g,
54-
parenCodeStart: /\(\s*\n\s*(`[^`]+?`)/g,
55-
parenCodeEnd: /(`[^`]+?`)\s*\n\s*\)/g,
56-
escapedBackticks: /\\`([^`]+?)\\`/g,
57-
codeBlockIndent: /```([a-z]*)\n\t/g,
58-
htmlComments: /<!--.*?-->/gs,
59-
} as const;
48+
/**
49+
* Custom remark plugin that cleans up code blocks by removing every blank or whitespace-only line and trimming the result.
50+
*/
51+
function remarkCleanCodeBlocks() {
52+
return (tree: any) => {
53+
function visit(node: any) {
54+
if (node.type === 'code' && node.value) {
55+
// Remove lines that only contain whitespace
56+
node.value = node.value
57+
.split('\n')
58+
.filter((line: string) => line.trim().length > 0)
59+
.join('\n').trim();
60+
}
61+
62+
if (node.children) {
63+
for (const child of node.children) {
64+
visit(child);
65+
}
66+
}
67+
}
68+
69+
visit(tree);
70+
};
71+
}
72+
73+
/**
74+
* Text inside parameter tables is double-HTML-encoded, meaning > becomes &gt;, which then becomes &amp;gt;.
75+
* This is a special case, and the easiest way to handle it is to just manually decode it.
76+
*/
77+
function remarkDecodeTableEntities() {
78+
return (tree: any) => {
79+
visitParents(tree, 'text', (node: any, ancestors: any[]) => {
80+
// Check if any ancestor is a tableCell
81+
const isInTableCell = ancestors.some((ancestor) => ancestor.type === 'tableCell');
82+
83+
if (isInTableCell) {
84+
node.value = node.value
85+
.replace(/\\&#123;/g, "{")
86+
.replace(/\\&#125;/g, "}")
87+
.replace(/\\&amp;/g, "&")
88+
.replace(/&#123;/g, "{")
89+
.replace(/&#125;/g, "}")
90+
.replace(/&lt;/g, "<")
91+
.replace(/&gt;/g, ">")
92+
.replace(/&amp;/g, "&");
93+
}
94+
});
95+
};
96+
}
6097

6198
async function transformAndSaveMarkdown(rawHtml: string) {
6299
const dom = new JSDOM(rawHtml);
@@ -84,8 +121,11 @@ async function transformAndSaveMarkdown(rawHtml: string) {
84121

85122
const file = await unified()
86123
.use(rehypeParse)
124+
.use(rehypeRemoveComments)
87125
.use(rehypeRemark)
88126
.use(remarkGfm)
127+
.use(remarkCleanCodeBlocks)
128+
.use(remarkDecodeTableEntities)
89129
.use(remarkStringify, {
90130
bullet: "-",
91131
listItemIndent: "one",
@@ -94,27 +134,7 @@ async function transformAndSaveMarkdown(rawHtml: string) {
94134
})
95135
.process(html);
96136

97-
const sanitizedFile = String(file)
98-
.replace(REGEX_PATTERNS.htmlComments, "")
99-
.replace(REGEX_PATTERNS.multipleNewlines, "\n\n")
100-
.replace(REGEX_PATTERNS.bulletSpacing, "- ")
101-
.replace(REGEX_PATTERNS.multiLineBullets, "$1 $2")
102-
.replace(REGEX_PATTERNS.startLineSpaces, "$1")
103-
.replace(REGEX_PATTERNS.endLineSpaces, "$1")
104-
.replace(REGEX_PATTERNS.inlineCodeBefore, "$1 $2")
105-
.replace(REGEX_PATTERNS.inlineCodeAfter, "$1 $2")
106-
.replace(REGEX_PATTERNS.parenCodeStart, "($1")
107-
.replace(REGEX_PATTERNS.parenCodeEnd, "$1)")
108-
.replace(REGEX_PATTERNS.escapedBackticks, "`$1`")
109-
.replace(REGEX_PATTERNS.codeBlockIndent, "```$1\n")
110-
.replace(/\u00C2/g, "") // Â
111-
.replace(/\u2014/g, "") // —
112-
// oxlint-disable-next-line no-control-regex
113-
.replace(/[^\u0000-\u007F]/g, "")
114-
.replaceAll("\t", " ")
115-
.trim();
116-
117-
return sanitizedFile;
137+
return String(file).trim();
118138
}
119139

120140
async function generateRootLLMsTxt(fileNames: string[]) {

docs/package.json

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,9 @@
6161
"unist-util-visit": "^5.0.0",
6262
"velite": "^0.2.4",
6363
"vite": "catalog:",
64-
"vite-plugin-devtools-json": "^1.0.0"
64+
"vite-plugin-devtools-json": "^1.0.0",
65+
"rehype-remove-comments": "^6.1.1",
66+
"unist-util-visit-parents": "^6.0.2"
6567
},
6668
"type": "module"
6769
}

pnpm-lock.yaml

Lines changed: 27 additions & 5 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)