11import { unified } from "unified" ;
22import rehypeParse from "rehype-parse" ;
3+ import rehypeRemoveComments from "rehype-remove-comments" ;
34import rehypeRemark from "rehype-remark" ;
45import remarkStringify from "remark-stringify" ;
56import remarkGfm from "remark-gfm" ;
7+ import { visitParents } from "unist-util-visit-parents" ;
68import { basename , dirname , join , relative } from "node:path" ;
79import { fileURLToPath } from "node:url" ;
810import { mkdir , readdir , readFile , writeFile } from "node:fs/promises" ;
@@ -43,20 +45,55 @@ async function collectFiles(currentDir: string, baseDir: string): Promise<FileMa
4345 }
4446}
4547
46- const REGEX_PATTERNS = {
47- multipleNewlines : / \n { 3 , } / g,
48- bulletSpacing : / - \n \s + / g,
49- multiLineBullets : / ( - [ ^ \n ] * ) (?: \n \s + ( [ ^ \n - ] [ ^ \n ] * ) ) / g,
50- startLineSpaces : / ( \n | ^ ) [ \t ] + \n / g,
51- endLineSpaces : / \n [ \t ] + ( $ | \n ) / g,
52- inlineCodeBefore : / ( \S + ) \s * \n \s * ( ` [ ^ ` ] + ?` ) / g,
53- inlineCodeAfter : / ( ` [ ^ ` ] + ?` ) \s * \n \s * ( \S + ) / g,
54- parenCodeStart : / \( \s * \n \s * ( ` [ ^ ` ] + ?` ) / g,
55- parenCodeEnd : / ( ` [ ^ ` ] + ?` ) \s * \n \s * \) / g,
56- escapedBackticks : / \\ ` ( [ ^ ` ] + ?) \\ ` / g,
57- codeBlockIndent : / ` ` ` ( [ a - z ] * ) \n \t / g,
58- htmlComments : / < ! - - .* ?- - > / gs,
59- } as const ;
/**
 * Remark plugin that compacts the contents of code blocks.
 *
 * For every `code` node with a non-empty value, each line that is empty or
 * contains only whitespace is dropped, and the remaining text is trimmed at
 * both ends. Note that this removes *every* blank line inside a code block,
 * not just runs of them, and that the final trim also strips leading
 * whitespace from the first remaining line.
 *
 * @returns A transformer that mutates the given tree in place.
 */
function remarkCleanCodeBlocks() {
  const isBlank = (line: string) => line.trim().length === 0;

  return (tree: any) => {
    // Explicit stack instead of recursion; children are pushed in reverse so
    // nodes are still processed in document (pre-order) sequence.
    const pending: any[] = [tree];

    while (pending.length > 0) {
      const current = pending.pop();

      if (current.type === 'code' && current.value) {
        const keptLines: string[] = [];
        for (const line of current.value.split('\n')) {
          if (!isBlank(line)) {
            keptLines.push(line);
          }
        }
        current.value = keptLines.join('\n').trim();
      }

      if (current.children) {
        const ordered = [...current.children];
        for (let i = ordered.length - 1; i >= 0; i--) {
          pending.push(ordered[i]);
        }
      }
    }
  };
}
72+
/**
 * Remark plugin that decodes leftover HTML entities in table cell text.
 *
 * Text inside parameter tables is double-HTML-encoded, meaning `>` becomes
 * `&gt;`, which in turn becomes `&amp;gt;`. This is a special case, and the
 * easiest way to handle it is to decode the affected entities manually.
 *
 * Only `text` nodes that have a `tableCell` somewhere among their ancestors
 * are rewritten; all other text is left untouched.
 *
 * @returns A transformer that mutates the given tree in place.
 */
function remarkDecodeTableEntities() {
  // Applied strictly in this order: backslash-prefixed forms first, then the
  // bare entities, with `&amp;` last.
  const replacements: ReadonlyArray<readonly [RegExp, string]> = [
    [/\\&#123;/g, "{"],
    [/\\&#125;/g, "}"],
    [/\\&amp;/g, "&"],
    [/&#123;/g, "{"],
    [/&#125;/g, "}"],
    [/&lt;/g, "<"],
    [/&gt;/g, ">"],
    [/&amp;/g, "&"],
  ];

  return (tree: any) => {
    visitParents(tree, 'text', (node: any, ancestors: any[]) => {
      // Skip text that is not nested (at any depth) inside a table cell.
      const insideCell = ancestors.some((ancestor) => ancestor.type === 'tableCell');
      if (!insideCell) {
        return;
      }

      let decoded = node.value;
      for (const [pattern, replacement] of replacements) {
        decoded = decoded.replace(pattern, replacement);
      }
      node.value = decoded;
    });
  };
}
6097
6198async function transformAndSaveMarkdown ( rawHtml : string ) {
6299 const dom = new JSDOM ( rawHtml ) ;
@@ -84,8 +121,11 @@ async function transformAndSaveMarkdown(rawHtml: string) {
84121
85122 const file = await unified ( )
86123 . use ( rehypeParse )
124+ . use ( rehypeRemoveComments )
87125 . use ( rehypeRemark )
88126 . use ( remarkGfm )
127+ . use ( remarkCleanCodeBlocks )
128+ . use ( remarkDecodeTableEntities )
89129 . use ( remarkStringify , {
90130 bullet : "-" ,
91131 listItemIndent : "one" ,
@@ -94,27 +134,7 @@ async function transformAndSaveMarkdown(rawHtml: string) {
94134 } )
95135 . process ( html ) ;
96136
97- const sanitizedFile = String ( file )
98- . replace ( REGEX_PATTERNS . htmlComments , "" )
99- . replace ( REGEX_PATTERNS . multipleNewlines , "\n\n" )
100- . replace ( REGEX_PATTERNS . bulletSpacing , "- " )
101- . replace ( REGEX_PATTERNS . multiLineBullets , "$1 $2" )
102- . replace ( REGEX_PATTERNS . startLineSpaces , "$1" )
103- . replace ( REGEX_PATTERNS . endLineSpaces , "$1" )
104- . replace ( REGEX_PATTERNS . inlineCodeBefore , "$1 $2" )
105- . replace ( REGEX_PATTERNS . inlineCodeAfter , "$1 $2" )
106- . replace ( REGEX_PATTERNS . parenCodeStart , "($1" )
107- . replace ( REGEX_PATTERNS . parenCodeEnd , "$1)" )
108- . replace ( REGEX_PATTERNS . escapedBackticks , "`$1`" )
109- . replace ( REGEX_PATTERNS . codeBlockIndent , "```$1\n" )
110- . replace ( / \u00C2 / g, "" ) // Â
111- . replace ( / \u2014 / g, "" ) // —
112- // oxlint-disable-next-line no-control-regex
113- . replace ( / [ ^ \u0000 - \u007F ] / g, "" )
114- . replaceAll ( "\t" , " " )
115- . trim ( ) ;
116-
117- return sanitizedFile ;
137+ return String ( file ) . trim ( ) ;
118138}
119139
120140async function generateRootLLMsTxt ( fileNames : string [ ] ) {
0 commit comments