|
1 | | -import sax from 'sax' |
2 | | -import Message from 'vfile-message' |
3 | | - |
// Constructor of sax's parser, aliased so `fromXml` can write `new Parser()`.
var Parser = sax.SAXParser

// Alias used by the character-class helpers at the bottom of this file to
// turn a character code back into a one-character string.
var fromCharCode = String.fromCharCode

// Matches a single line ending (CRLF, LF, or CR).
// It is global so that `ontext` can set `lastIndex` and scan from an offset.
var search = /\r?\n|\r/g
9 | | - |
/**
 * Parse a serialized XML document into a tree.
 *
 * The document is fed to a strict sax parser (`position` and
 * `strictEntities` turned on).
 * Each sax event is turned into a node (`doctype`, `instruction`, `comment`,
 * `cdata`, `text`, or `element`) that is appended to the children of the
 * node currently on top of the stack, starting with a `root` node.
 * Every node gets a `position` with `start` and `end` points, where a point
 * is `{line, column, offset}` (line and column are 1-based).
 *
 * @param doc
 *   Serialized XML; passed as is to `parser.write`.
 * @returns
 *   The `root` node, holding everything that was parsed in `children`.
 * @throws {Message}
 *   When sax reports an error, when an SGML declaration is found, or when
 *   the doctype is malformed or contains an internal subset.
 *   The rule id of the message is prefixed with `xast-util-from-xml:`.
 */
export function fromXml(doc) {
  const parser = new Parser(true, {position: true, strictEntities: true})
  // Open nodes; the last item is the node that receives new children.
  const stack = [{type: 'root', children: []}]
  // Point where the previous token ended, used as the start of the next node.
  let position = now()

  parser.ondoctype = ondoctype
  parser.onsgmldeclaration = onsgmldeclaration
  parser.onprocessinginstruction = onprocessinginstruction
  parser.ontext = ontext
  parser.oncomment = oncomment
  parser.onopencdata = oncdataopen
  parser.oncdata = oncdatavalue
  parser.onclosecdata = exit
  parser.onopentag = onopen
  parser.onclosetag = exit
  parser.onerror = onerror

  parser.write(doc).close()

  return stack[0]

  /**
   * Rethrow a sax error as a message, without the `\nLine…` details that sax
   * appends to its error messages.
   *
   * @param error
   *   Error given by sax.
   */
  function onerror(error) {
    const index = error.message.indexOf('\nLine')
    // The substring should always be included, but this guards against
    // changes in newer sax versions.
    /* c8 ignore next */
    fail(index === -1 ? error.message : error.message.slice(0, index), 'sax')
  }

  /**
   * SGML declarations are not supported: always fail.
   */
  function onsgmldeclaration() {
    fail('Unexpected SGML declaration', 'unexpected-sgml')
  }

  /**
   * Parse the raw doctype (everything sax found after `<!DOCTYPE`) with a
   * small state machine, and add a `doctype` node with `name`, `public`,
   * and `system` fields.
   *
   * The loop runs one step past the last character: that extra step sees
   * `null` as `code`, which stands for EOF.
   *
   * @param value
   *   Raw doctype.
   */
  // eslint-disable-next-line complexity
  function ondoctype(value) {
    const node = {type: 'doctype', name: '', public: null, system: null}
    let index = -1
    let state = 'BEGIN'
    // State to switch to once the keyword in `buffer` was matched.
    let returnState
    // Keyword (`PUBLIC` or `SYSTEM`) being matched, and how far we are in it.
    let buffer
    let bufferIndex
    // Where the name or literal being parsed starts in `value`.
    let start
    // Quote or apostrophe that opened the current literal.
    let marker

    while (++index <= value.length) {
      const code =
        index === value.length ? null /* EOF */ : value.charCodeAt(index)

      switch (state) {
        case 'BEGIN':
          if (isSpace(code)) {
            state = 'BEFORE_NAME'
          } else {
            fail('Expected doctype name', 'doctype-name')
          }

          break
        case 'BEFORE_NAME':
          if (isSpace(code)) {
            // As expected.
          } else if (isNameStartChar(code)) {
            state = 'IN_NAME'
            start = index
          } else {
            fail('Expected start of doctype name', 'doctype-name')
          }

          break
        case 'IN_NAME':
          if (isNameChar(code)) {
            // As expected.
          } else if (isSpace(code) || code === null /* EOF */) {
            state = 'AFTER_NAME'
            node.name = value.slice(start, index)
          } else if (code === 91 /* `[` */) {
            fail('Unexpected internal subset', 'doctype-internal-subset')
          } else {
            fail(
              'Expected doctype name character, whitespace, or doctype end',
              'doctype-name'
            )
          }

          break
        case 'AFTER_NAME':
          if (code === null /* EOF */) {
            // Done.
          } else if (isSpace(code)) {
            // As expected.
          } else
            switch (code) {
              case 80 /* `P` */: {
                // The first character matched already, so start at index 0
                // and compare from the second character on in `IN_EID`.
                state = 'IN_EID'
                returnState = 'AFTER_PUBLIC'
                buffer = 'PUBLIC'
                bufferIndex = 0

                break
              }

              case 83 /* `S` */: {
                state = 'IN_EID'
                returnState = 'AFTER_SYSTEM'
                buffer = 'SYSTEM'
                bufferIndex = 0

                break
              }

              case 91 /* `[` */: {
                fail('Unexpected internal subset', 'doctype-internal-subset')

                break
              }

              default: {
                fail(
                  'Expected external identifier (`PUBLIC` or `SYSTEM`), whitespace, or doctype end',
                  'doctype-external-identifier'
                )
              }
            }

          break
        case 'IN_EID':
          if (code === buffer.charCodeAt(++bufferIndex)) {
            if (bufferIndex === buffer.length - 1) {
              state = returnState
            }
          } else {
            fail(
              'Expected external identifier (`PUBLIC` or `SYSTEM`)',
              'doctype-external-identifier'
            )
          }

          break
        case 'AFTER_PUBLIC':
          if (isSpace(code)) {
            state = 'BEFORE_PUBLIC_LITERAL'
          } else {
            fail('Expected whitespace after `PUBLIC`', 'doctype-public-literal')
          }

          break
        case 'AFTER_SYSTEM':
          if (isSpace(code)) {
            state = 'BEFORE_SYSTEM_LITERAL'
          } else {
            fail('Expected whitespace after `SYSTEM`', 'doctype-system-literal')
          }

          break
        case 'BEFORE_PUBLIC_LITERAL':
          if (isSpace(code)) {
            // As expected.
          } else if (code === 34 /* `"` */ || code === 39 /* `'` */) {
            state = 'IN_PUBLIC_LITERAL'
            start = index + 1
            marker = code
          } else {
            fail(
              'Expected quote or apostrophe to start public literal',
              'doctype-public-literal'
            )
          }

          break
        case 'IN_PUBLIC_LITERAL':
          // The closing marker is checked first: an apostrophe is also a
          // valid pubid character.
          if (code === marker) {
            state = 'AFTER_PUBLIC_LITERAL'
            node.public = value.slice(start, index)
          } else if (isPubidChar(code)) {
            // As expected.
          } else {
            fail(
              'Expected pubid character in public literal',
              'doctype-public-literal'
            )
          }

          break
        case 'AFTER_PUBLIC_LITERAL':
          if (isSpace(code)) {
            // As expected.
            state = 'BEFORE_SYSTEM_LITERAL'
          } else {
            fail(
              'Expected whitespace after public literal',
              'doctype-system-literal'
            )
          }

          break
        case 'BEFORE_SYSTEM_LITERAL':
          if (isSpace(code)) {
            // As expected.
          } else if (code === 34 /* `"` */ || code === 39 /* `'` */) {
            state = 'IN_SYSTEM_LITERAL'
            start = index + 1
            marker = code
          } else {
            fail(
              'Expected quote or apostrophe to start system literal',
              'doctype-system-literal'
            )
          }

          break
        case 'IN_SYSTEM_LITERAL':
          // Handled by SAX, but keep it to guard against changes in newer sax
          // versions.
          /* c8 ignore next 5 */
          if (code === null /* EOF */) {
            fail(
              'Expected quote or apostrophe to end system literal',
              'doctype-system-literal'
            )
          } else if (code === marker) {
            state = 'AFTER_SYSTEM_LITERAL'
            node.system = value.slice(start, index)
          } else {
            // As expected.
          }

          break

        case 'AFTER_SYSTEM_LITERAL':
          // Rule ids carry the same `doctype-` prefix as every other doctype
          // failure in this function.
          if (code === null /* EOF */) {
            // Done.
          } else if (isSpace(code)) {
            // As expected.
          } else if (code === 91 /* `[` */) {
            fail('Unexpected internal subset', 'doctype-internal-subset')
          } else {
            fail(
              'Expected whitespace or end of doctype',
              'doctype-system-literal'
            )
          }

          break
        // Guard against new states.
        /* c8 ignore next 2 */
        default:
          throw new Error('Unhandled state `' + state + '`')
      }
    }

    enter(node)
    exit()
  }

  /**
   * Add an `instruction` node.
   *
   * @param value
   *   Processing instruction given by sax; its `name` and `body` are used.
   */
  function onprocessinginstruction(value) {
    enter({
      type: 'instruction',
      name: String(value.name),
      value: String(value.body)
    })
    exit()
  }

  /**
   * Add a `comment` node.
   *
   * @param value
   *   Content of the comment.
   */
  function oncomment(value) {
    const node = {type: 'comment', value}

    // Comment has a positional bug… 😢
    // They end right before the last character (`>`), so let’s add that:
    const actualEnd = now()
    actualEnd.column++
    actualEnd.offset++

    enter(node)
    exit()

    // Overwrite what `exit` set: both the end of the node and the point the
    // next node starts at.
    node.position.end = Object.assign({}, actualEnd)
    position = actualEnd
  }

  /**
   * Open a `cdata` node; its value is filled by `oncdatavalue` and it is
   * closed by `exit` (registered as `onclosecdata`).
   */
  function oncdataopen() {
    enter({type: 'cdata', value: ''})
  }

  /**
   * Append a chunk to the value of the open `cdata` node.
   *
   * @param value
   *   Chunk of character data.
   */
  function oncdatavalue(value) {
    stack[stack.length - 1].value += value
  }

  /**
   * Add a `text` node.
   *
   * @param value
   *   Text given by sax.
   */
  function ontext(value) {
    const node = {type: 'text', value}
    // Text has a positional bug… 😢
    // When they are added, the position is already at the next token.
    // So let’s reverse that.
    // The end is instead computed by walking `value` from where the previous
    // token ended: a line ending moves to the first column of the next line,
    // anything else moves columns.
    const actualEnd = Object.assign({}, position)
    let start = 0

    while (start < value.length) {
      // `search` is a shared global regex: always say where to start.
      search.lastIndex = start
      const match = search.exec(value)

      if (match) {
        actualEnd.line++
        actualEnd.column = 1
        start = match.index + match[0].length
      } else {
        actualEnd.column += value.length - start
        start = value.length
      }
    }

    actualEnd.offset += value.length

    enter(node)
    exit()

    // Overwrite what `exit` set: both the end of the node and the point the
    // next node starts at.
    node.position.end = Object.assign({}, actualEnd)
    position = actualEnd
  }

  /**
   * Open an `element` node; it is closed by `exit` (registered as
   * `onclosetag`).
   *
   * @param value
   *   Tag given by sax; its `name` and `attributes` are used.
   */
  function onopen(value) {
    enter({
      type: 'element',
      name: value.name,
      attributes: value.attributes,
      children: []
    })
  }

  /**
   * Start `node` where the previous token ended, add it to the children of
   * the current node, and make it the current node.
   *
   * @param node
   *   Node to open.
   */
  function enter(node) {
    node.position = {start: Object.assign({}, position)}
    stack[stack.length - 1].children.push(node)
    stack.push(node)
    position = now()
  }

  /**
   * Close the current node: end it at the current point of the parser.
   */
  function exit() {
    position = now()
    stack.pop().position.end = Object.assign({}, position)
  }

  /**
   * Get the current point of the parser.
   *
   * @returns
   *   Point: 1-based `line` and `column` (sax’s values plus one), and
   *   `offset` (sax’s `position`).
   */
  function now() {
    return {
      line: parser.line + 1,
      column: parser.column + 1,
      offset: parser.position
    }
  }

  /**
   * Throw a message at the current point of the parser.
   *
   * @param reason
   *   Why parsing failed.
   * @param id
   *   Rule id, without the `xast-util-from-xml:` prefix.
   * @throws {Message}
   *   Always.
   */
  function fail(reason, id) {
    throw new Message(reason, now(), 'xast-util-from-xml:' + id)
  }
}
358 | | - |
// See: <https://www.w3.org/TR/xml/#NT-NameStartChar>
/**
 * Check whether `code` can start an XML name.
 *
 * @param code
 *   Character code (as given by `charCodeAt`), code point, or `null` for EOF.
 * @returns
 *   Whether `code` is a `NameStartChar`; `false` for `null`.
 */
function isNameStartChar(code) {
  // `fromCharCode` only keeps the low 16 bits of its argument, which would
  // fold an astral code point onto an unrelated character.
  // The production allows exactly `[#x10000-#xEFFFF]` there, so compare
  // those values as numbers instead.
  if (code > 0xffff) {
    return code <= 0xeffff
  }

  return /[:A-Z_a-z\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u02FF\u0370-\u037D\u037F-\u1FFF\u200C\u200D\u2070-\u218F\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD]/.test(
    fromCharCode(code)
  )
}
365 | | - |
// See: <https://www.w3.org/TR/xml/#NT-NameChar>
/**
 * Check whether `code` can continue an XML name.
 *
 * @param code
 *   Character code (as given by `charCodeAt`), code point, or `null` for EOF.
 * @returns
 *   Whether `code` is a `NameChar`: a `NameStartChar`, or one of the extra
 *   characters (`-`, `.`, ASCII digits, U+00B7, U+0300–U+036F, U+203F,
 *   U+2040); `false` for `null`.
 */
function isNameChar(code) {
  if (isNameStartChar(code)) {
    return true
  }

  // All extra characters are below U+10000.
  // `fromCharCode` only keeps the low 16 bits of its argument, so a larger
  // value must not reach the regex, where it would be mistaken for an
  // unrelated character.
  if (code > 0xffff) {
    return false
  }

  return /[-.\d\u00B7\u0300-\u036F\u203F\u2040]/.test(fromCharCode(code))
}
373 | | - |
/**
 * Check whether `code` is XML whitespace: tab, line feed, carriage return,
 * or space.
 *
 * The code is compared as a number, so no string or regex work is needed
 * per character, and values above 0xFFFF are not mistaken for whitespace
 * (turning them into a string with `String.fromCharCode` would keep only
 * their low 16 bits).
 *
 * @param code
 *   Character code (as given by `charCodeAt`), or `null` for EOF.
 * @returns
 *   Whether `code` is whitespace; `false` for `null`.
 */
function isSpace(code) {
  return (
    code === 9 /* `\t` */ ||
    code === 10 /* `\n` */ ||
    code === 13 /* `\r` */ ||
    code === 32 /* ` ` */
  )
}
377 | | - |
/**
 * Check whether `code` is allowed in a public identifier literal.
 *
 * Both quote characters are handled by the caller: an apostrophe is allowed
 * here, a quote (`"`) is not.
 *
 * @param code
 *   Character code (as given by `charCodeAt`), or `null` for EOF.
 * @returns
 *   Whether `code` matches the character class below; `false` for `null`.
 */
function isPubidChar(code) {
  // Every allowed character is ASCII.
  // `fromCharCode` only keeps the low 16 bits of its argument, so a larger
  // value must not reach the regex, where it would be mistaken for an
  // unrelated character.
  if (code > 0xffff) {
    return false
  }

  return /[\n\r !#$%'-;=?-Z_a-z]/.test(fromCharCode(code))
}
| 1 | +export {fromXml} from './lib/index.js' |
0 commit comments