@@ -3,7 +3,7 @@ import { getFaviconList } from './favicon.js';
33import { isDuckAi , isBeingFramed , getTabUrl } from '../utils.js' ;
44const MSG_PAGE_CONTEXT_RESPONSE = 'collectionResult' ;
55
6- function checkNodeIsVisible ( node ) {
6+ export function checkNodeIsVisible ( node ) {
77 try {
88 const style = window . getComputedStyle ( node ) ;
99
@@ -36,6 +36,29 @@ function isHtmlElement(node) {
3636 * @returns {Document | null }
3737 */
3838function getSameOriginIframeDocument ( iframe ) {
39+ // Pre-check conditions that would prevent access without triggering security errors
40+ const src = iframe . src ;
41+
42+ // Skip sandboxed iframes unless they explicitly allow scripts
43+ // Avoids: Blocked script execution in 'about:blank' because the document's frame is sandboxed and the 'allow-scripts' permission is not set.
44+ // Note: iframe.sandbox always returns a DOMTokenList, so check hasAttribute instead
45+ if ( iframe . hasAttribute ( 'sandbox' ) && ! iframe . sandbox . contains ( 'allow-scripts' ) ) {
46+ return null ;
47+ }
48+
49+ // Check for cross-origin URLs (but allow about:blank and empty src as they inherit parent origin)
50+ if ( src && src !== 'about:blank' && src !== '' ) {
51+ try {
52+ const iframeUrl = new URL ( src , window . location . href ) ;
53+ if ( iframeUrl . origin !== window . location . origin ) {
54+ return null ;
55+ }
56+ } catch ( e ) {
57+ // Invalid URL, skip
58+ return null ;
59+ }
60+ }
61+
3962 try {
4063 // Try to access the contentDocument - this will throw if cross-origin
4164 const doc = iframe . contentDocument ;
@@ -76,8 +99,9 @@ function domToMarkdownChildren(childNodes, settings, depth = 0) {
7699 * @typedef {Object } DomToMarkdownSettings
77100 * @property {number } maxLength - Maximum length of content
78101 * @property {number } maxDepth - Maximum depth to traverse
79- * @property {string } excludeSelectors - CSS selectors to exclude from processing
102+ * @property {string | null } excludeSelectors - CSS selectors to exclude from processing
80103 * @property {boolean } includeIframes - Whether to include iframe content
104+ * @property {boolean } trimBlankLinks - Whether to omit links whose text is empty after whitespace is collapsed and trimmed
81105 */
82106
83107/**
@@ -87,7 +111,7 @@ function domToMarkdownChildren(childNodes, settings, depth = 0) {
87111 * @param {number } depth
88112 * @returns {string }
89113 */
90- function domToMarkdown ( node , settings , depth = 0 ) {
114+ export function domToMarkdown ( node , settings , depth = 0 ) {
91115 if ( depth > settings . maxDepth ) {
92116 return '' ;
93117 }
@@ -97,7 +121,7 @@ function domToMarkdown(node, settings, depth = 0) {
97121 if ( ! isHtmlElement ( node ) ) {
98122 return '' ;
99123 }
100- if ( ! checkNodeIsVisible ( node ) || node . matches ( settings . excludeSelectors ) ) {
124+ if ( ! checkNodeIsVisible ( node ) || ( settings . excludeSelectors && node . matches ( settings . excludeSelectors ) ) ) {
101125 return '' ;
102126 }
103127
@@ -127,12 +151,15 @@ function domToMarkdown(node, settings, depth = 0) {
127151 return `${ children } \n` ;
128152 case 'br' :
129153 return `\n` ;
154+ case 'img' :
155+ return `\n![${ getAttributeOrBlank ( node , 'alt' ) } ](${ getAttributeOrBlank ( node , 'src' ) } )\n` ;
130156 case 'ul' :
157+ case 'ol' :
131158 return `\n${ children } \n` ;
132159 case 'li' :
133- return `\n- ${ children . trim ( ) } \n` ;
160+ return `\n- ${ collapseAndTrim ( children ) } \n` ;
134161 case 'a' :
135- return getLinkText ( node ) ;
162+ return getLinkText ( node , children , settings ) ;
136163 case 'iframe' : {
137164 if ( ! settings . includeIframes ) {
138165 return children ;
@@ -151,13 +178,30 @@ function domToMarkdown(node, settings, depth = 0) {
151178 }
152179}
153180
/**
 * Reads an attribute from an element and returns it with leading and
 * trailing whitespace removed. When `getAttribute` yields null or
 * undefined (attribute not present), an empty string is returned instead.
 *
 * @param {Element} node - Element to read the attribute from
 * @param {string} attr - Name of the attribute to read
 * @returns {string} The trimmed attribute value, or '' when it is absent
 */
function getAttributeOrBlank(node, attr) {
    const raw = node.getAttribute(attr);
    if (raw === null || raw === undefined) {
        return '';
    }
    return raw.trim();
}
190+
/**
 * Passes the string through `collapseWhitespace` and then strips leading
 * and trailing whitespace from the result.
 *
 * @param {string} str - Text to normalise
 * @returns {string} The collapsed, trimmed text
 */
function collapseAndTrim(str) {
    const collapsed = collapseWhitespace(str);
    return collapsed.trim();
}
157194
/**
 * Renders an anchor element as markdown from its already-converted children.
 *
 * @param {Element} node - The anchor element; only its `href` attribute is read
 * @param {string} children - Markdown produced for the anchor's child nodes
 * @param {{ trimBlankLinks: boolean }} settings - Conversion settings; only `trimBlankLinks` is read here
 * @returns {string} `[text](href)` when an href is present, the whitespace-collapsed
 *   children when it is not, or '' for a blank link when `trimBlankLinks` is set
 */
function getLinkText(node, children, settings) {
    const target = node.getAttribute('href');
    const linkText = collapseAndTrim(children);

    // Blank links carry no readable content, so drop them entirely when asked to.
    if (settings.trimBlankLinks && linkText.length === 0) {
        return '';
    }

    // The difference in whitespace handling is intentional here.
    // Where we don't wrap in a link:
    // we should retain at least one preceding and following space.
    if (!target) {
        return collapseWhitespace(children);
    }
    return `[${linkText}](${target})`;
}
162206
163207export default class PageContext extends ContentFeature {
@@ -420,6 +464,7 @@ export default class PageContext extends ContentFeature {
420464 const maxDepth = this . getFeatureSetting ( 'maxDepth' ) || 5000 ;
421465 let excludeSelectors = this . getFeatureSetting ( 'excludeSelectors' ) || [ '.ad' , '.sidebar' , '.footer' , '.nav' , '.header' ] ;
422466 const excludedInertElements = this . getFeatureSetting ( 'excludedInertElements' ) || [
467+ 'img' , // Note we're currently disabling images which we're handling in domToMarkdown (this can be per-site enabled in the config if needed).
423468 'script' ,
424469 'style' ,
425470 'link' ,
@@ -436,22 +481,34 @@ export default class PageContext extends ContentFeature {
436481 const mainContentSelector = this . getFeatureSetting ( 'mainContentSelector' ) || 'main, article, .content, .main, #content, #main' ;
437482 let mainContent = document . querySelector ( mainContentSelector ) ;
438483 const mainContentLength = this . getFeatureSetting ( 'mainContentLength' ) || 100 ;
484+ // Fast path to avoid processing main content if it's too short
439485 if ( mainContent && mainContent . innerHTML . trim ( ) . length <= mainContentLength ) {
440486 mainContent = null ;
441487 }
442- const contentRoot = mainContent || document . body ;
488+ let contentRoot = mainContent || document . body ;
443489
444- if ( contentRoot ) {
445- this . log . info ( 'Getting main content' , contentRoot ) ;
446- content += domToMarkdown ( contentRoot , {
490+ // Use a closure to reuse the domToMarkdown parameters
491+ const extractContent = ( root ) => {
492+ this . log . info ( 'Getting content' , root ) ;
493+ const result = domToMarkdown ( root , {
447494 maxLength : upperLimit ,
448495 maxDepth,
449496 includeIframes : this . getFeatureSettingEnabled ( 'includeIframes' , 'enabled' ) ,
450497 excludeSelectors : excludeSelectorsString ,
451- } ) ;
452- this . log . info ( 'Content markdown' , content , contentRoot ) ;
498+ trimBlankLinks : this . getFeatureSettingEnabled ( 'trimBlankLinks' , 'enabled' ) ,
499+ } ) . trim ( ) ;
500+ this . log . info ( 'Content markdown' , result , root ) ;
501+ return result ;
502+ } ;
503+
504+ if ( contentRoot ) {
505+ content += extractContent ( contentRoot ) ;
506+ }
507+ // If the main content is empty, use the body
508+ if ( content . length === 0 && contentRoot !== document . body && this . getFeatureSettingEnabled ( 'bodyFallback' , 'enabled' ) ) {
509+ contentRoot = document . body ;
510+ content += extractContent ( contentRoot ) ;
453511 }
454- content = content . trim ( ) ;
455512
456513 // Store the full content length before truncation
457514 this . fullContentLength = content . length ;
0 commit comments