Skip to content

Commit 1f66b13

Browse files
Trim links and include images (#2011)
* Trim links and include images * Update img tag to use alt attribute for markdown * Update image rendering to use getAttributeOrBlank Refactor image rendering to use getAttributeOrBlank for safer attribute retrieval. * Update comments on whitespace handling Clarified whitespace handling comments in the code. * Clarify handling of excluded inert elements Added a comment explaining the disabling of images in excluded inert elements. * Update parameter type from Node to Element * Lint fixes * Add fallback to body collection if main is empty * Avoid triggering security errors for iframes * Ignore blank sandbox attr * Update comment for excluded inert elements handling Clarify comment regarding image handling in excluded inert elements. * Fixes sandbox attr check * Add test cases * Lint fix
1 parent dc1f82d commit 1f66b13

26 files changed

+954
-17
lines changed

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,9 @@ Sources/ContentScopeScripts/dist/
1212
test-results
1313
!Sources/ContentScopeScripts/dist/pages/.gitignore
1414

15+
# Test output files (generated during tests)
16+
injected/unit-test/fixtures/page-context/output/
17+
1518
# Local Netlify folder
1619
.netlify
1720
# VS Code user config

injected/package.json

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,12 +26,12 @@
2626
},
2727
"type": "module",
2828
"dependencies": {
29+
"@duckduckgo/privacy-configuration": "github:duckduckgo/privacy-configuration#1752154773643",
30+
"esbuild": "^0.25.10",
2931
"minimist": "^1.2.8",
3032
"parse-address": "^1.1.2",
3133
"seedrandom": "^3.0.5",
3234
"sjcl": "^1.0.8",
33-
"@duckduckgo/privacy-configuration": "github:duckduckgo/privacy-configuration#1752154773643",
34-
"esbuild": "^0.25.10",
3535
"urlpattern-polyfill": "^10.1.0"
3636
},
3737
"devDependencies": {
@@ -43,6 +43,7 @@
4343
"@typescript-eslint/eslint-plugin": "^8.46.0",
4444
"fast-check": "^4.2.0",
4545
"jasmine": "^5.12.0",
46+
"jsdom": "^27.0.0",
4647
"web-ext": "^9.0.0"
4748
}
4849
}

injected/src/features/page-context.js

Lines changed: 72 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ import { getFaviconList } from './favicon.js';
33
import { isDuckAi, isBeingFramed, getTabUrl } from '../utils.js';
44
const MSG_PAGE_CONTEXT_RESPONSE = 'collectionResult';
55

6-
function checkNodeIsVisible(node) {
6+
export function checkNodeIsVisible(node) {
77
try {
88
const style = window.getComputedStyle(node);
99

@@ -36,6 +36,29 @@ function isHtmlElement(node) {
3636
* @returns {Document | null}
3737
*/
3838
function getSameOriginIframeDocument(iframe) {
39+
// Pre-check conditions that would prevent access without triggering security errors
40+
const src = iframe.src;
41+
42+
// Skip sandboxed iframes unless they explicitly allow scripts
43+
// Avoids: Blocked script execution in 'about:blank' because the document's frame is sandboxed and the 'allow-scripts' permission is not set.
44+
// Note: iframe.sandbox always returns a DOMTokenList, so check hasAttribute instead
45+
if (iframe.hasAttribute('sandbox') && !iframe.sandbox.contains('allow-scripts')) {
46+
return null;
47+
}
48+
49+
// Check for cross-origin URLs (but allow about:blank and empty src as they inherit parent origin)
50+
if (src && src !== 'about:blank' && src !== '') {
51+
try {
52+
const iframeUrl = new URL(src, window.location.href);
53+
if (iframeUrl.origin !== window.location.origin) {
54+
return null;
55+
}
56+
} catch (e) {
57+
// Invalid URL, skip
58+
return null;
59+
}
60+
}
61+
3962
try {
4063
// Try to access the contentDocument - this will throw if cross-origin
4164
const doc = iframe.contentDocument;
@@ -76,8 +99,9 @@ function domToMarkdownChildren(childNodes, settings, depth = 0) {
7699
* @typedef {Object} DomToMarkdownSettings
77100
* @property {number} maxLength - Maximum length of content
78101
* @property {number} maxDepth - Maximum depth to traverse
79-
* @property {string} excludeSelectors - CSS selectors to exclude from processing
102+
* @property {string | null} excludeSelectors - CSS selectors to exclude from processing
80103
* @property {boolean} includeIframes - Whether to include iframe content
104+
* @property {boolean} trimBlankLinks - Whether to trim blank links
81105
*/
82106

83107
/**
@@ -87,7 +111,7 @@ function domToMarkdownChildren(childNodes, settings, depth = 0) {
87111
* @param {number} depth
88112
* @returns {string}
89113
*/
90-
function domToMarkdown(node, settings, depth = 0) {
114+
export function domToMarkdown(node, settings, depth = 0) {
91115
if (depth > settings.maxDepth) {
92116
return '';
93117
}
@@ -97,7 +121,7 @@ function domToMarkdown(node, settings, depth = 0) {
97121
if (!isHtmlElement(node)) {
98122
return '';
99123
}
100-
if (!checkNodeIsVisible(node) || node.matches(settings.excludeSelectors)) {
124+
if (!checkNodeIsVisible(node) || (settings.excludeSelectors && node.matches(settings.excludeSelectors))) {
101125
return '';
102126
}
103127

@@ -127,12 +151,15 @@ function domToMarkdown(node, settings, depth = 0) {
127151
return `${children}\n`;
128152
case 'br':
129153
return `\n`;
154+
case 'img':
155+
return `\n![${getAttributeOrBlank(node, 'alt')}](${getAttributeOrBlank(node, 'src')})\n`;
130156
case 'ul':
157+
case 'ol':
131158
return `\n${children}\n`;
132159
case 'li':
133-
return `\n- ${children.trim()}\n`;
160+
return `\n- ${collapseAndTrim(children)}\n`;
134161
case 'a':
135-
return getLinkText(node);
162+
return getLinkText(node, children, settings);
136163
case 'iframe': {
137164
if (!settings.includeIframes) {
138165
return children;
@@ -151,13 +178,30 @@ function domToMarkdown(node, settings, depth = 0) {
151178
}
152179
}
153180

181+
/**
182+
* @param {Element} node
183+
* @param {string} attr
184+
* @returns {string}
185+
*/
186+
function getAttributeOrBlank(node, attr) {
187+
const attrValue = node.getAttribute(attr) ?? '';
188+
return attrValue.trim();
189+
}
190+
154191
function collapseAndTrim(str) {
155192
return collapseWhitespace(str).trim();
156193
}
157194

158-
function getLinkText(node) {
195+
function getLinkText(node, children, settings) {
159196
const href = node.getAttribute('href');
160-
return href ? `[${collapseAndTrim(node.textContent)}](${href})` : collapseWhitespace(node.textContent);
197+
const trimmedContent = collapseAndTrim(children);
198+
if (settings.trimBlankLinks && trimmedContent.length === 0) {
199+
return '';
200+
}
201+
// The difference in whitespace handling is intentional here.
202+
// Where we don't wrap in a link:
203+
// we should retain at least one preceding and following space.
204+
return href ? `[${trimmedContent}](${href})` : collapseWhitespace(children);
161205
}
162206

163207
export default class PageContext extends ContentFeature {
@@ -420,6 +464,7 @@ export default class PageContext extends ContentFeature {
420464
const maxDepth = this.getFeatureSetting('maxDepth') || 5000;
421465
let excludeSelectors = this.getFeatureSetting('excludeSelectors') || ['.ad', '.sidebar', '.footer', '.nav', '.header'];
422466
const excludedInertElements = this.getFeatureSetting('excludedInertElements') || [
467+
'img', // Note we're currently disabling images which we're handling in domToMarkdown (this can be per-site enabled in the config if needed).
423468
'script',
424469
'style',
425470
'link',
@@ -436,22 +481,34 @@ export default class PageContext extends ContentFeature {
436481
const mainContentSelector = this.getFeatureSetting('mainContentSelector') || 'main, article, .content, .main, #content, #main';
437482
let mainContent = document.querySelector(mainContentSelector);
438483
const mainContentLength = this.getFeatureSetting('mainContentLength') || 100;
484+
// Fast path to avoid processing main content if it's too short
439485
if (mainContent && mainContent.innerHTML.trim().length <= mainContentLength) {
440486
mainContent = null;
441487
}
442-
const contentRoot = mainContent || document.body;
488+
let contentRoot = mainContent || document.body;
443489

444-
if (contentRoot) {
445-
this.log.info('Getting main content', contentRoot);
446-
content += domToMarkdown(contentRoot, {
490+
// Use a closure to reuse the domToMarkdown parameters
491+
const extractContent = (root) => {
492+
this.log.info('Getting content', root);
493+
const result = domToMarkdown(root, {
447494
maxLength: upperLimit,
448495
maxDepth,
449496
includeIframes: this.getFeatureSettingEnabled('includeIframes', 'enabled'),
450497
excludeSelectors: excludeSelectorsString,
451-
});
452-
this.log.info('Content markdown', content, contentRoot);
498+
trimBlankLinks: this.getFeatureSettingEnabled('trimBlankLinks', 'enabled'),
499+
}).trim();
500+
this.log.info('Content markdown', result, root);
501+
return result;
502+
};
503+
504+
if (contentRoot) {
505+
content += extractContent(contentRoot);
506+
}
507+
// If the main content is empty, use the body
508+
if (content.length === 0 && contentRoot !== document.body && this.getFeatureSettingEnabled('bodyFallback', 'enabled')) {
509+
contentRoot = document.body;
510+
content += extractContent(contentRoot);
453511
}
454-
content = content.trim();
455512

456513
// Store the full content length before truncation
457514
this.fullContentLength = content.length;
Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
# Page Context DOM-to-Markdown Tests
2+
3+
This directory contains test fixtures for testing the `domToMarkdown` function from `page-context.js`.
4+
5+
## Directory Structure
6+
7+
- `output/` - Generated markdown files from test runs (temporary, regenerated on each run)
8+
- `expected/` - Expected markdown output files (committed to git)
9+
10+
## How It Works
11+
12+
The test suite (`page-context-dom.spec.js`) does the following:
13+
14+
1. **Creates test cases** with HTML snippets and settings for `domToMarkdown`
15+
2. **Converts HTML to Markdown** using JSDom to simulate a browser environment
16+
3. **Writes output** to `output/` directory for inspection
17+
4. **Compares output** with expected files in `expected/` directory
18+
5. **Fails if different** - Any difference between output and expected causes test failure
19+
20+
## Test Cases
21+
22+
The suite includes 20 test cases covering:
23+
24+
- Basic HTML elements (paragraphs, headings, lists, links, images)
25+
- Formatting (bold, italic, mixed formatting)
26+
- Complex structures (nested lists, articles, blog posts)
27+
- Edge cases (hidden content, empty links, whitespace handling)
28+
- Configuration options (max length truncation, excluded selectors, trim blank links)
29+
30+
## Updating Expected Output
31+
32+
When the `domToMarkdown` function behavior changes:
33+
34+
1. Review the changes in `output/` directory
35+
2. If changes are correct, copy them to `expected/`:
36+
```bash
37+
cp unit-test/fixtures/page-context/output/*.md unit-test/fixtures/page-context/expected/
38+
```
39+
3. Commit the updated expected files
40+
41+
## Running Tests
42+
43+
```bash
44+
npm run test-unit -- unit-test/page-context-dom.spec.js
45+
```
46+
47+
## Why This Approach?
48+
49+
- **Visibility**: Output files make it easy to review markdown generation
50+
- **Regression detection**: Tests fail on any unintended changes
51+
- **Documentation**: Expected files serve as examples of the function's behavior
52+
- **Easy updates**: Simple to update baselines when behavior intentionally changes
53+
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
# Article Title
2+
By **Author Name**
3+
This is the introduction paragraph with some *emphasis*.
4+
5+
## First Section
6+
Content of the first section.
7+
8+
9+
- Point one
10+
11+
- Point two
12+
13+
14+
## Second Section
15+
Content with a [link](https://example.com).
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
# Blog Post Title
2+
Published on January 1, 2024
3+
4+
![Header image](header.jpg)
5+
Lorem ipsum dolor sit amet, consectetur adipiscing elit.
6+
7+
## Key Takeaways
8+
9+
10+
- First takeaway
11+
12+
- Second takeaway
13+
14+
- Third takeaway
15+
16+
Read more on [our blog](https://blog.example.com).
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
This is **bold** and this is *italic*.
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
# Article Title
2+
Introduction paragraph.
3+
4+
## Section 1
5+
Section content with **bold** text.

injected/unit-test/fixtures/page-context/expected/empty-link-with-trim.md

Whitespace-only changes.
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
[](https://example.com)

0 commit comments

Comments
 (0)