diff --git a/src/cli.ts b/src/cli.ts
index f8b9b08..d6b0905 100644
--- a/src/cli.ts
+++ b/src/cli.ts
@@ -277,6 +277,9 @@ program
   .option('--strict-internal', 'Treat missing internal files as errors', true)
   .option('--check-claude-imports', 'Validate Claude import paths', true)
   .option('--check-circular', 'Check for circular references in file dependencies', false)
+  .option('--check-content-freshness', 'Enable content freshness detection for external links', false)
+  // Commander calls the parser as (value, previous); bare parseInt would take the 730 default as its radix and yield NaN.
+  .option('--freshness-threshold <days>', 'Content staleness threshold in days', (value: string) => parseInt(value, 10), 730)
   .option('--max-depth ', 'Maximum depth to traverse subdirectories', parseInt)
   .option('--only-broken', 'Show only broken links, not all validation results', true)
   .option('--group-by ', 'Group results by: file|type', 'file')
@@ -295,6 +298,11 @@ Examples:
   $ markmv validate **/*.md --group-by type --only-broken
   $ markmv validate docs/ --check-circular --strict-internal
 
+Content Freshness Examples:
+  $ markmv validate --check-external --check-content-freshness
+  $ markmv validate docs/ --check-content-freshness --freshness-threshold 365
+  $ markmv validate README.md --check-external --check-content-freshness --verbose
+
 Link Types:
   internal Links to other markdown files
   external HTTP/HTTPS URLs
@@ -303,6 +311,10 @@ Link Types:
   reference Reference-style links ([text][ref])
   claude-import Claude @import syntax (@path/to/file)
 
+Content Freshness Options:
+  --check-content-freshness Enable staleness detection for external links
+  --freshness-threshold <days> Content staleness threshold (default: 730 days)
+
 Output Options:
   --group-by file Group broken links by file (default)
   --group-by type Group broken links by link type`
diff --git a/src/commands/validate-freshness.test.ts b/src/commands/validate-freshness.test.ts
new file mode 100644
index 0000000..61bb373
--- /dev/null
+++ b/src/commands/validate-freshness.test.ts
@@ -0,0 +1,543 @@
+/**
+ * Integration tests for validate command with content freshness detection.
+ * + * @fileoverview Tests the full validation pipeline with freshness analysis + */ + +import { describe, it, expect, beforeEach, afterEach, vi } from 'vitest'; +import { mkdtemp, rm, writeFile } from 'node:fs/promises'; +import { tmpdir } from 'node:os'; +import { join } from 'node:path'; +import { validateLinks } from './validate.js'; + +// Mock fetch globally +const mockFetch = vi.fn(); +global.fetch = mockFetch; + +describe('Validate Command with Content Freshness Detection', () => { + let tempDir: string; + + beforeEach(async () => { + tempDir = await mkdtemp(join(tmpdir(), 'validate-freshness-test-')); + vi.clearAllMocks(); + }); + + afterEach(async () => { + await rm(tempDir, { recursive: true, force: true }); + }); + + describe('Fresh Content Validation', () => { + it('should validate fresh external links without flagging them', async () => { + const testFile = join(tempDir, 'fresh-links.md'); + await writeFile(testFile, ` +# Test Document + +Check out this [fresh documentation](https://example.com/fresh-docs). +Also see this [recent API guide](https://api.example.com/guide). 
+ `); + + const recentDate = new Date(Date.now() - 30 * 24 * 60 * 60 * 1000); // 30 days ago + + mockFetch + .mockResolvedValueOnce({ + ok: true, + status: 200, + headers: new Map([['last-modified', recentDate.toUTCString()]]), + text: () => Promise.resolve('Fresh documentation content'), + url: 'https://example.com/fresh-docs', + }) + .mockResolvedValueOnce({ + ok: true, + status: 200, + headers: new Map([['last-modified', recentDate.toUTCString()]]), + text: () => Promise.resolve('Recent API guide content'), + url: 'https://api.example.com/guide', + }); + + const result = await validateLinks([testFile], { + checkExternal: true, + checkContentFreshness: true, + freshnessThreshold: 365, // 1 year threshold + }); + + expect(result.brokenLinks).toBe(0); + expect(result.staleLinks).toBe(0); + expect(result.freshLinks).toBe(2); + expect(result.totalLinks).toBe(2); + expect(mockFetch).toHaveBeenCalledTimes(2); + }); + + it('should flag stale external links with detailed freshness info', async () => { + const testFile = join(tempDir, 'stale-links.md'); + await writeFile(testFile, ` +# Outdated Documentation + +This [old tutorial](https://example.com/old-tutorial) is outdated. +The [deprecated API](https://api.example.com/deprecated) should be avoided. + `); + + const oldDate = new Date(Date.now() - 3 * 365 * 24 * 60 * 60 * 1000); // 3 years ago + + mockFetch + .mockResolvedValueOnce({ + ok: true, + status: 200, + headers: new Map([['last-modified', oldDate.toUTCString()]]), + text: () => Promise.resolve('Old tutorial content from 2021'), + url: 'https://example.com/old-tutorial', + }) + .mockResolvedValueOnce({ + ok: true, + status: 200, + headers: new Map(), + text: () => Promise.resolve(` + +

API Documentation

+

This API is deprecated and no longer supported.

+ + `), + url: 'https://api.example.com/deprecated', + }); + + const result = await validateLinks([testFile], { + checkExternal: true, + checkContentFreshness: true, + freshnessThreshold: 730, // 2 years threshold + }); + + expect(result.brokenLinks).toBe(2); + expect(result.staleLinks).toBe(2); + expect(result.freshLinks).toBe(0); + + // Check broken links details + const brokenLinks = Object.values(result.brokenLinksByFile)[0]; + expect(brokenLinks).toHaveLength(2); + + // First link should be stale due to age + const firstLink = brokenLinks.find(link => link.url.includes('old-tutorial')); + expect(firstLink?.reason).toBe('content-stale'); + expect(firstLink?.freshnessInfo?.isFresh).toBe(false); + expect(firstLink?.freshnessInfo?.lastModified).toBeDefined(); + expect(firstLink?.freshnessInfo?.warning).toContain('old'); + + // Second link should be stale due to deprecation pattern + const secondLink = brokenLinks.find(link => link.url.includes('deprecated')); + expect(secondLink?.reason).toBe('content-stale'); + expect(secondLink?.freshnessInfo?.stalePatterns).toContain('deprecated'); + expect(secondLink?.freshnessInfo?.stalePatterns).toContain('no longer supported'); + }); + + it('should apply domain-specific freshness thresholds', async () => { + const testFile = join(tempDir, 'domain-thresholds.md'); + await writeFile(testFile, ` +# Domain-Specific Documentation + +Firebase guide: [Cloud Functions](https://firebase.google.com/docs/functions) +GitHub Actions: [Workflow syntax](https://docs.github.com/actions/reference) +General docs: [Example site](https://example.com/docs) + `); + + const eightMonthsAgo = new Date(Date.now() - 8 * 30 * 24 * 60 * 60 * 1000); + + mockFetch + .mockResolvedValueOnce({ + ok: true, + status: 200, + headers: new Map([['last-modified', eightMonthsAgo.toUTCString()]]), + text: () => Promise.resolve('Firebase Cloud Functions documentation'), + url: 'https://firebase.google.com/docs/functions', + }) + .mockResolvedValueOnce({ + ok: true, 
+ status: 200, + headers: new Map([['last-modified', eightMonthsAgo.toUTCString()]]), + text: () => Promise.resolve('GitHub Actions workflow syntax'), + url: 'https://docs.github.com/actions/reference', + }) + .mockResolvedValueOnce({ + ok: true, + status: 200, + headers: new Map([['last-modified', eightMonthsAgo.toUTCString()]]), + text: () => Promise.resolve('Example site documentation'), + url: 'https://example.com/docs', + }); + + const result = await validateLinks([testFile], { + checkExternal: true, + checkContentFreshness: true, + freshnessThreshold: 730, // 2 years default + }); + + // Firebase and GitHub should be stale (8 months > 6 months default for these domains) + // Example.com should be fresh (8 months < 2 years default) + expect(result.brokenLinks).toBeGreaterThan(0); + expect(result.staleLinks).toBeGreaterThan(0); + expect(result.freshLinks).toBeGreaterThan(0); + }); + }); + + describe('Mixed Content Types', () => { + it('should handle files with mixed internal and external links', async () => { + const internalFile = join(tempDir, 'internal.md'); + await writeFile(internalFile, '# Internal Document\nContent here.'); + + const testFile = join(tempDir, 'mixed-links.md'); + await writeFile(testFile, ` +# Mixed Links Document + +Internal link: [Internal Doc](./internal.md) +External fresh: [Fresh Site](https://example.com/fresh) +External stale: [Stale Site](https://example.com/stale) +Anchor link: [Section](#section) + +## Section +Content here. 
+ `); + + const freshDate = new Date(Date.now() - 10 * 24 * 60 * 60 * 1000); // 10 days ago + const staleDate = new Date(Date.now() - 3 * 365 * 24 * 60 * 60 * 1000); // 3 years ago + + mockFetch + .mockResolvedValueOnce({ + ok: true, + status: 200, + headers: new Map([['last-modified', freshDate.toUTCString()]]), + text: () => Promise.resolve('Fresh website content'), + url: 'https://example.com/fresh', + }) + .mockResolvedValueOnce({ + ok: true, + status: 200, + headers: new Map([['last-modified', staleDate.toUTCString()]]), + text: () => Promise.resolve('Stale website content'), + url: 'https://example.com/stale', + }); + + const result = await validateLinks([testFile], { + checkExternal: true, + checkContentFreshness: true, + strictInternal: true, + }); + + expect(result.totalLinks).toBe(4); // internal, 2 external, anchor + expect(result.brokenLinks).toBe(1); // only stale external + expect(result.staleLinks).toBe(1); + expect(result.freshLinks).toBe(1); + + // Only external links should be fetched + expect(mockFetch).toHaveBeenCalledTimes(2); + }); + + it('should count freshness statistics correctly across multiple files', async () => { + const file1 = join(tempDir, 'file1.md'); + await writeFile(file1, ` +# File 1 +[Fresh link 1](https://example.com/fresh1) +[Stale link 1](https://example.com/stale1) + `); + + const file2 = join(tempDir, 'file2.md'); + await writeFile(file2, ` +# File 2 +[Fresh link 2](https://example.com/fresh2) +[Stale link 2](https://example.com/stale2) + `); + + const freshDate = new Date(Date.now() - 10 * 24 * 60 * 60 * 1000); + const staleDate = new Date(Date.now() - 3 * 365 * 24 * 60 * 60 * 1000); + + mockFetch + .mockResolvedValueOnce({ + ok: true, + status: 200, + headers: new Map([['last-modified', freshDate.toUTCString()]]), + text: () => Promise.resolve('Fresh content 1'), + url: 'https://example.com/fresh1', + }) + .mockResolvedValueOnce({ + ok: true, + status: 200, + headers: new Map([['last-modified', 
staleDate.toUTCString()]]), + text: () => Promise.resolve('Stale content 1'), + url: 'https://example.com/stale1', + }) + .mockResolvedValueOnce({ + ok: true, + status: 200, + headers: new Map([['last-modified', freshDate.toUTCString()]]), + text: () => Promise.resolve('Fresh content 2'), + url: 'https://example.com/fresh2', + }) + .mockResolvedValueOnce({ + ok: true, + status: 200, + headers: new Map([['last-modified', staleDate.toUTCString()]]), + text: () => Promise.resolve('Stale content 2'), + url: 'https://example.com/stale2', + }); + + const result = await validateLinks([file1, file2], { + checkExternal: true, + checkContentFreshness: true, + }); + + expect(result.filesProcessed).toBe(2); + expect(result.totalLinks).toBe(4); + expect(result.brokenLinks).toBe(2); // 2 stale links + expect(result.staleLinks).toBe(2); + expect(result.freshLinks).toBe(2); + }); + }); + + describe('Content Pattern Detection', () => { + it('should detect various staleness patterns', async () => { + const testFile = join(tempDir, 'pattern-detection.md'); + await writeFile(testFile, ` +# Pattern Detection Test + +[Deprecated API](https://api.example.com/deprecated) +[Moved page](https://example.com/moved) +[Legacy docs](https://docs.example.com/legacy) +[EOL product](https://products.example.com/eol) + `); + + mockFetch + .mockResolvedValueOnce({ + ok: true, + status: 200, + headers: new Map(), + text: () => Promise.resolve('This API is deprecated and will be removed.'), + url: 'https://api.example.com/deprecated', + }) + .mockResolvedValueOnce({ + ok: true, + status: 200, + headers: new Map(), + text: () => Promise.resolve('This page has moved to a new location.'), + url: 'https://example.com/moved', + }) + .mockResolvedValueOnce({ + ok: true, + status: 200, + headers: new Map(), + text: () => Promise.resolve('Legacy documentation - archived content.'), + url: 'https://docs.example.com/legacy', + }) + .mockResolvedValueOnce({ + ok: true, + status: 200, + headers: new Map(), + text: 
() => Promise.resolve('End of life product - no longer supported.'), + url: 'https://products.example.com/eol', + }); + + const result = await validateLinks([testFile], { + checkExternal: true, + checkContentFreshness: true, + }); + + expect(result.brokenLinks).toBe(4); + expect(result.staleLinks).toBe(4); + + const brokenLinks = Object.values(result.brokenLinksByFile)[0]; + + // Check that different patterns are detected + const deprecatedLink = brokenLinks.find(link => link.url.includes('deprecated')); + expect(deprecatedLink?.freshnessInfo?.stalePatterns).toContain('deprecated'); + + const movedLink = brokenLinks.find(link => link.url.includes('moved')); + expect(movedLink?.freshnessInfo?.stalePatterns).toContain('this page has moved'); + + const legacyLink = brokenLinks.find(link => link.url.includes('legacy')); + expect(legacyLink?.freshnessInfo?.stalePatterns).toContain('archived'); + + const eolLink = brokenLinks.find(link => link.url.includes('eol')); + expect(eolLink?.freshnessInfo?.stalePatterns).toContain('end of life'); + }); + }); + + describe('Error Handling', () => { + it('should handle mixed success and error responses', async () => { + const testFile = join(tempDir, 'mixed-responses.md'); + await writeFile(testFile, ` +# Mixed Responses + +[Working link](https://example.com/working) +[Broken link](https://example.com/broken) +[Fresh link](https://example.com/fresh) +[Network error link](https://example.com/network-error) + `); + + const freshDate = new Date(Date.now() - 10 * 24 * 60 * 60 * 1000); + + mockFetch + .mockResolvedValueOnce({ + ok: true, + status: 200, + headers: new Map([['last-modified', freshDate.toUTCString()]]), + text: () => Promise.resolve('Working content'), + url: 'https://example.com/working', + }) + .mockResolvedValueOnce({ + ok: false, + status: 404, + statusText: 'Not Found', + headers: new Map(), + url: 'https://example.com/broken', + }) + .mockResolvedValueOnce({ + ok: true, + status: 200, + headers: new 
Map([['last-modified', freshDate.toUTCString()]]), + text: () => Promise.resolve('Fresh content'), + url: 'https://example.com/fresh', + }) + .mockRejectedValueOnce(new Error('Network timeout')); + + const result = await validateLinks([testFile], { + checkExternal: true, + checkContentFreshness: true, + }); + + expect(result.totalLinks).toBe(4); + expect(result.brokenLinks).toBe(2); // broken (404) + network error + expect(result.freshLinks).toBe(4); // all external links are counted as fresh when they don't have stale content + expect(result.staleLinks).toBe(0); + + const brokenLinks = Object.values(result.brokenLinksByFile)[0]; + + const httpError = brokenLinks.find(link => link.url.includes('broken')); + expect(httpError?.reason).toBe('external-error'); + expect(httpError?.details).toContain('404'); + + const networkError = brokenLinks.find(link => link.url.includes('network-error')); + expect(networkError?.reason).toBe('external-error'); + expect(networkError?.details).toContain('Network timeout'); + }); + + it('should continue processing other links when one fails', async () => { + const testFile = join(tempDir, 'partial-failure.md'); + await writeFile(testFile, ` +# Partial Failure Test + +[First link](https://example.com/first) +[Failing link](https://example.com/fail) +[Last link](https://example.com/last) + `); + + const freshDate = new Date(Date.now() - 10 * 24 * 60 * 60 * 1000); + + mockFetch + .mockResolvedValueOnce({ + ok: true, + status: 200, + headers: new Map([['last-modified', freshDate.toUTCString()]]), + text: () => Promise.resolve('First content'), + url: 'https://example.com/first', + }) + .mockRejectedValueOnce(new Error('Connection refused')) + .mockResolvedValueOnce({ + ok: true, + status: 200, + headers: new Map([['last-modified', freshDate.toUTCString()]]), + text: () => Promise.resolve('Last content'), + url: 'https://example.com/last', + }); + + const result = await validateLinks([testFile], { + checkExternal: true, + 
checkContentFreshness: true, + }); + + expect(result.totalLinks).toBe(3); + expect(result.filesProcessed).toBe(1); + expect(result.brokenLinks).toBe(1); // only the failing link + expect(result.freshLinks).toBe(3); // all external links are counted as fresh when they don't have stale content + expect(mockFetch).toHaveBeenCalledTimes(3); + }); + }); + + describe('Disabled Freshness Detection', () => { + it('should not perform freshness checks when disabled', async () => { + const testFile = join(tempDir, 'no-freshness.md'); + await writeFile(testFile, ` +# No Freshness Check + +[External link](https://example.com/external) + `); + + mockFetch.mockResolvedValue({ + ok: true, + status: 200, + headers: new Map(), + url: 'https://example.com/external', + }); + + const result = await validateLinks([testFile], { + checkExternal: true, + checkContentFreshness: false, // Disabled + }); + + expect(result.brokenLinks).toBe(0); + expect(result.staleLinks).toBe(0); + expect(result.freshLinks).toBe(0); + + // Should use HEAD method instead of GET + expect(mockFetch).toHaveBeenCalledWith('https://example.com/external', { + method: 'HEAD', + signal: expect.any(AbortSignal), + headers: { + 'User-Agent': 'markmv-validator/1.0 (content-freshness-detection)', + }, + }); + }); + }); + + describe('Content Change Detection Integration', () => { + it('should track content changes across validation runs', async () => { + const testFile = join(tempDir, 'content-changes.md'); + await writeFile(testFile, ` +# Content Change Test + +[Changing content](https://example.com/changing) + `); + + // First validation + mockFetch.mockResolvedValueOnce({ + ok: true, + status: 200, + headers: new Map(), + text: () => Promise.resolve('Original content version'), + url: 'https://example.com/changing', + }); + + const result1 = await validateLinks([testFile], { + checkExternal: true, + checkContentFreshness: true, + }); + + expect(result1.brokenLinks).toBe(0); + expect(result1.freshLinks).toBe(1); + + // 
Second validation with changed content + mockFetch.mockResolvedValueOnce({ + ok: true, + status: 200, + headers: new Map(), + text: () => Promise.resolve('Completely different content version'), + url: 'https://example.com/changing', + }); + + const result2 = await validateLinks([testFile], { + checkExternal: true, + checkContentFreshness: true, + }); + + // Content change alone doesn't make it stale unless there are other factors + // So we expect it to be fresh but with content change detected + expect(result2.brokenLinks).toBe(0); + expect(result2.staleLinks).toBe(0); + expect(result2.freshLinks).toBe(1); + }); + }); +}); \ No newline at end of file diff --git a/src/commands/validate.ts b/src/commands/validate.ts index 130c7b0..062b282 100644 --- a/src/commands/validate.ts +++ b/src/commands/validate.ts @@ -35,6 +35,10 @@ export interface ValidateOperationOptions extends OperationOptions { groupBy: 'file' | 'type'; /** Include line numbers and context in output */ includeContext: boolean; + /** Enable content freshness detection for external links */ + checkContentFreshness?: boolean; + /** Default staleness threshold in days */ + freshnessThreshold?: number; } /** @@ -89,6 +93,10 @@ export interface ValidateResult { circularReferences?: string[]; /** Processing time in milliseconds */ processingTime: number; + /** Number of stale links found */ + staleLinks?: number; + /** Number of fresh links found */ + freshLinks?: number; } /** @@ -147,6 +155,8 @@ export async function validateLinks( onlyBroken: options.onlyBroken ?? true, groupBy: options.groupBy ?? 'file', includeContext: options.includeContext ?? false, + checkContentFreshness: options.checkContentFreshness ?? false, + freshnessThreshold: options.freshnessThreshold ?? 730, // 2 years in days dryRun: options.dryRun ?? false, verbose: options.verbose ?? false, force: options.force ?? 
false, @@ -183,6 +193,10 @@ export async function validateLinks( externalTimeout: opts.externalTimeout, strictInternal: opts.strictInternal, checkClaudeImports: opts.checkClaudeImports, + checkContentFreshness: opts.checkContentFreshness, + freshnessConfig: { + defaultThreshold: opts.freshnessThreshold * 24 * 60 * 60 * 1000, // Convert days to milliseconds + }, }); const parser = new LinkParser(); @@ -196,6 +210,8 @@ export async function validateLinks( fileErrors: [], hasCircularReferences: false, processingTime: 0, + staleLinks: 0, + freshLinks: 0, }; // Initialize broken links by type @@ -225,6 +241,25 @@ export async function validateLinks( const validation = await validator.validateLinks(relevantLinks, filePath); const brokenLinks = validation.brokenLinks; + // Count freshness statistics + if (opts.checkContentFreshness) { + const externalLinks = relevantLinks.filter(link => link.type === 'external'); + + if (brokenLinks.length > 0) { + // Count stale links + const staleLinks = brokenLinks.filter(bl => bl.reason === 'content-stale').length; + const freshExternalLinks = externalLinks.length - staleLinks; + + result.staleLinks = (result.staleLinks || 0) + staleLinks; + if (freshExternalLinks > 0) { + result.freshLinks = (result.freshLinks || 0) + freshExternalLinks; + } + } else if (externalLinks.length > 0) { + // All external links are fresh + result.freshLinks = (result.freshLinks || 0) + externalLinks.length; + } + } + if (brokenLinks.length > 0) { result.brokenLinks += brokenLinks.length; @@ -353,6 +388,19 @@ export async function validateCommand( console.log(`Files processed: ${result.filesProcessed}`); console.log(`Total links found: ${result.totalLinks}`); console.log(`Broken links: ${result.brokenLinks}`); + + // Show freshness information if enabled + if (options.checkContentFreshness) { + const staleCount = result.staleLinks || 0; + const freshCount = result.freshLinks || 0; + const externalTotal = staleCount + freshCount; + + if (externalTotal > 0) { 
+ console.log(`Fresh external links: ${freshCount}`); + console.log(`Stale external links: ${staleCount}`); + } + } + console.log(`Processing time: ${result.processingTime}ms\n`); if (result.fileErrors.length > 0) { @@ -389,10 +437,26 @@ export async function validateCommand( const context = options.includeContext && brokenLink.line ? ` (line ${brokenLink.line})` : ''; const file = brokenLink.filePath ? ` in ${brokenLink.filePath}` : ''; - console.log(` ❌ ${brokenLink.url}${context}${file}`); + const freshness = brokenLink.reason === 'content-stale' ? ' [STALE]' : ''; + console.log(` ❌ ${brokenLink.url}${context}${file}${freshness}`); if (brokenLink.reason && options.verbose) { console.log(` Reason: ${brokenLink.reason}`); } + if (brokenLink.freshnessInfo && (options.verbose || brokenLink.reason === 'content-stale')) { + const info = brokenLink.freshnessInfo; + if (info.warning) { + console.log(` Warning: ${info.warning}`); + } + if (info.suggestion) { + console.log(` Suggestion: ${info.suggestion}`); + } + if (info.lastModified && options.verbose) { + console.log(` Last Modified: ${info.lastModified.toDateString()}`); + } + if (info.stalePatterns.length > 0 && options.verbose) { + console.log(` Detected patterns: ${info.stalePatterns.join(', ')}`); + } + } } } } @@ -403,10 +467,26 @@ export async function validateCommand( for (const brokenLink of brokenLinks) { const context = options.includeContext && brokenLink.line ? ` (line ${brokenLink.line})` : ''; - console.log(` ❌ [${brokenLink.type}] ${brokenLink.url}${context}`); + const freshness = brokenLink.reason === 'content-stale' ? 
' [STALE]' : ''; + console.log(` ❌ [${brokenLink.type}] ${brokenLink.url}${context}${freshness}`); if (brokenLink.reason && options.verbose) { console.log(` Reason: ${brokenLink.reason}`); } + if (brokenLink.freshnessInfo && (options.verbose || brokenLink.reason === 'content-stale')) { + const info = brokenLink.freshnessInfo; + if (info.warning) { + console.log(` Warning: ${info.warning}`); + } + if (info.suggestion) { + console.log(` Suggestion: ${info.suggestion}`); + } + if (info.lastModified && options.verbose) { + console.log(` Last Modified: ${info.lastModified.toDateString()}`); + } + if (info.stalePatterns.length > 0 && options.verbose) { + console.log(` Detected patterns: ${info.stalePatterns.join(', ')}`); + } + } } } } diff --git a/src/core/link-validator-freshness.test.ts b/src/core/link-validator-freshness.test.ts new file mode 100644 index 0000000..2566d23 --- /dev/null +++ b/src/core/link-validator-freshness.test.ts @@ -0,0 +1,502 @@ +/** + * Tests for LinkValidator with content freshness detection integration. 
+ * + * @fileoverview Tests for link validation with freshness analysis + */ + +import { describe, it, expect, beforeEach, afterEach, vi } from 'vitest'; +import { mkdtemp, rm, writeFile } from 'node:fs/promises'; +import { tmpdir } from 'node:os'; +import { join } from 'node:path'; +import { LinkValidator } from './link-validator.js'; +import type { MarkdownLink } from '../types/links.js'; + +// Mock fetch globally +const mockFetch = vi.fn(); +global.fetch = mockFetch; + +describe('LinkValidator with Content Freshness Detection', () => { + let tempDir: string; + let validator: LinkValidator; + + beforeEach(async () => { + tempDir = await mkdtemp(join(tmpdir(), 'link-validator-freshness-test-')); + + validator = new LinkValidator({ + checkExternal: true, + checkContentFreshness: true, + externalTimeout: 5000, + freshnessConfig: { + defaultThreshold: 365 * 24 * 60 * 60 * 1000, // 1 year for testing + cacheDir: join(tempDir, 'freshness-cache'), + }, + }); + + vi.clearAllMocks(); + }); + + afterEach(async () => { + await rm(tempDir, { recursive: true, force: true }); + }); + + describe('Fresh Content Detection', () => { + it('should validate fresh external links without flagging them as broken', async () => { + const recentDate = new Date(Date.now() - 30 * 24 * 60 * 60 * 1000); // 30 days ago + + mockFetch.mockResolvedValue({ + ok: true, + status: 200, + headers: new Map([ + ['last-modified', recentDate.toUTCString()], + ['content-type', 'text/html'], + ]), + text: () => Promise.resolve('Fresh content'), + url: 'https://example.com/fresh', + }); + + const link: MarkdownLink = { + type: 'external', + href: 'https://example.com/fresh', + text: 'Fresh Link', + line: 1, + }; + + const result = await validator.validateLink(link, '/test/file.md'); + + expect(result).toBeNull(); // Link should be valid + expect(mockFetch).toHaveBeenCalledWith('https://example.com/fresh', { + method: 'GET', + signal: expect.any(AbortSignal), + headers: { + 'User-Agent': 'markmv-validator/1.0 
(content-freshness-detection)', + }, + }); + }); + + it('should flag stale external links as broken with freshness info', async () => { + const oldDate = new Date(Date.now() - 2 * 365 * 24 * 60 * 60 * 1000); // 2 years ago + + mockFetch.mockResolvedValue({ + ok: true, + status: 200, + headers: new Map([ + ['last-modified', oldDate.toUTCString()], + ]), + text: () => Promise.resolve('Old content'), + url: 'https://example.com/stale', + }); + + const link: MarkdownLink = { + type: 'external', + href: 'https://example.com/stale', + text: 'Stale Link', + line: 1, + }; + + const result = await validator.validateLink(link, '/test/file.md'); + + expect(result).not.toBeNull(); + expect(result?.reason).toBe('content-stale'); + expect(result?.details).toContain('old'); // Updated to match actual warning text + expect(result?.freshnessInfo).toBeDefined(); + expect(result?.freshnessInfo?.isFresh).toBe(false); + expect(result?.freshnessInfo?.lastModified).toBeDefined(); + }); + + it('should detect deprecated content patterns', async () => { + mockFetch.mockResolvedValue({ + ok: true, + status: 200, + headers: new Map(), + text: () => Promise.resolve(` + + +

API Documentation

+

This API is deprecated and no longer supported. Please use the new version.

+ + + `), + url: 'https://api.example.com/deprecated', + }); + + const link: MarkdownLink = { + type: 'external', + href: 'https://api.example.com/deprecated', + text: 'Deprecated API', + line: 1, + }; + + const result = await validator.validateLink(link, '/test/file.md'); + + expect(result).not.toBeNull(); + expect(result?.reason).toBe('content-stale'); + expect(result?.freshnessInfo?.stalePatterns).toContain('deprecated'); + expect(result?.freshnessInfo?.stalePatterns).toContain('no longer supported'); + }); + + it('should use GET method when freshness detection is enabled', async () => { + mockFetch.mockResolvedValue({ + ok: true, + status: 200, + headers: new Map(), + text: () => Promise.resolve('Fresh content'), + url: 'https://example.com/get', + }); + + const link: MarkdownLink = { + type: 'external', + href: 'https://example.com/get', + text: 'Test Link', + line: 1, + }; + + await validator.validateLink(link, '/test/file.md'); + + expect(mockFetch).toHaveBeenCalledWith('https://example.com/get', { + method: 'GET', // Should use GET for content analysis + signal: expect.any(AbortSignal), + headers: { + 'User-Agent': 'markmv-validator/1.0 (content-freshness-detection)', + }, + }); + }); + + it('should apply domain-specific freshness thresholds', async () => { + const firebaseValidator = new LinkValidator({ + checkExternal: true, + checkContentFreshness: true, + freshnessConfig: { + domainThresholds: { + 'firebase.google.com': 6 * 30 * 24 * 60 * 60 * 1000, // 6 months + }, + }, + }); + + const eightMonthsAgo = new Date(Date.now() - 8 * 30 * 24 * 60 * 60 * 1000); + + mockFetch.mockResolvedValue({ + ok: true, + status: 200, + headers: new Map([ + ['last-modified', eightMonthsAgo.toUTCString()], + ]), + text: () => Promise.resolve('Firebase documentation'), + url: 'https://firebase.google.com/docs/functions', + }); + + const link: MarkdownLink = { + type: 'external', + href: 'https://firebase.google.com/docs/functions', + text: 'Firebase Docs', + line: 1, + }; + 
+ const result = await firebaseValidator.validateLink(link, '/test/file.md'); + + expect(result).not.toBeNull(); + expect(result?.reason).toBe('content-stale'); + expect(result?.freshnessInfo?.thresholdMs).toBe(6 * 30 * 24 * 60 * 60 * 1000); + }); + }); + + describe('Without Freshness Detection', () => { + it('should use HEAD method when freshness detection is disabled', async () => { + const validatorWithoutFreshness = new LinkValidator({ + checkExternal: true, + checkContentFreshness: false, + }); + + mockFetch.mockResolvedValue({ + ok: true, + status: 200, + headers: new Map(), + url: 'https://example.com/head', + }); + + const link: MarkdownLink = { + type: 'external', + href: 'https://example.com/head', + text: 'Test Link', + line: 1, + }; + + await validatorWithoutFreshness.validateLink(link, '/test/file.md'); + + expect(mockFetch).toHaveBeenCalledWith('https://example.com/head', { + method: 'HEAD', // Should use HEAD when freshness is disabled + signal: expect.any(AbortSignal), + headers: { + 'User-Agent': 'markmv-validator/1.0 (content-freshness-detection)', + }, + }); + }); + + it('should not include freshness info when detection is disabled', async () => { + const validatorWithoutFreshness = new LinkValidator({ + checkExternal: true, + checkContentFreshness: false, + }); + + const veryOldDate = new Date(Date.now() - 5 * 365 * 24 * 60 * 60 * 1000); // 5 years ago + + mockFetch.mockResolvedValue({ + ok: true, + status: 200, + headers: new Map([ + ['last-modified', veryOldDate.toUTCString()], + ]), + url: 'https://example.com/old-no-freshness', + }); + + const link: MarkdownLink = { + type: 'external', + href: 'https://example.com/old-no-freshness', + text: 'Old Link', + line: 1, + }; + + const result = await validatorWithoutFreshness.validateLink(link, '/test/file.md'); + + expect(result).toBeNull(); // Should be valid even if old + }); + }); + + describe('Error Handling', () => { + it('should handle network errors during freshness check', async () => { + 
mockFetch.mockRejectedValue(new Error('Network error')); + + const link: MarkdownLink = { + type: 'external', + href: 'https://example.com/network-error', + text: 'Error Link', + line: 1, + }; + + const result = await validator.validateLink(link, '/test/file.md'); + + expect(result).not.toBeNull(); + expect(result?.reason).toBe('external-error'); + expect(result?.details).toContain('Network error'); + }); + + it('should handle HTTP errors during freshness check', async () => { + mockFetch.mockResolvedValue({ + ok: false, + status: 404, + statusText: 'Not Found', + headers: new Map(), + url: 'https://example.com/not-found', + }); + + const link: MarkdownLink = { + type: 'external', + href: 'https://example.com/not-found', + text: 'Not Found Link', + line: 1, + }; + + const result = await validator.validateLink(link, '/test/file.md'); + + expect(result).not.toBeNull(); + expect(result?.reason).toBe('external-error'); + expect(result?.details).toContain('HTTP 404'); + }); + + it('should handle timeout during freshness check', async () => { + const shortTimeoutValidator = new LinkValidator({ + checkExternal: true, + checkContentFreshness: true, + externalTimeout: 1, // Very short timeout + }); + + // Mock a slow response that will definitely timeout + mockFetch.mockImplementation(() => + new Promise((resolve, reject) => { + // Simulate AbortController behavior + setTimeout(() => { + const error = new Error('The operation was aborted'); + error.name = 'AbortError'; + reject(error); + }, 2); + }) + ); + + const link: MarkdownLink = { + type: 'external', + href: 'https://example.com/slow', + text: 'Slow Link', + line: 1, + }; + + const result = await shortTimeoutValidator.validateLink(link, '/test/file.md'); + + expect(result).not.toBeNull(); + expect(result?.reason).toBe('external-error'); + expect(result?.details).toContain('aborted'); + }); + + it('should handle malformed response headers gracefully', async () => { + mockFetch.mockResolvedValue({ + ok: true, + status: 
200, + headers: new Map([ + ['last-modified', 'invalid-date-format'], + ]), + text: () => Promise.resolve('Content with invalid headers'), + url: 'https://example.com/invalid-headers', + }); + + const link: MarkdownLink = { + type: 'external', + href: 'https://example.com/invalid-headers', + text: 'Invalid Headers Link', + line: 1, + }; + + const result = await validator.validateLink(link, '/test/file.md'); + + // Should still work, just without last-modified date + expect(result).toBeNull(); // Link is valid despite invalid headers + }); + }); + + describe('Image Link Freshness', () => { + it('should check freshness for external image links', async () => { + mockFetch.mockResolvedValue({ + ok: true, + status: 200, + headers: new Map([ + ['content-type', 'image/jpeg'], + ]), + text: () => Promise.resolve(''), // Images don't have text content + url: 'https://example.com/image.jpg', + }); + + const link: MarkdownLink = { + type: 'image', + href: 'https://example.com/image.jpg', + text: 'Test Image', + line: 1, + }; + + const result = await validator.validateLink(link, '/test/file.md'); + + expect(result).toBeNull(); // Image should be valid + expect(mockFetch).toHaveBeenCalledWith('https://example.com/image.jpg', { + method: 'GET', + signal: expect.any(AbortSignal), + headers: { + 'User-Agent': 'markmv-validator/1.0 (content-freshness-detection)', + }, + }); + }); + + it('should not check freshness for local image links', async () => { + // Create a test image file + const imagePath = join(tempDir, 'test-image.jpg'); + await writeFile(imagePath, 'fake-image-content'); + + const link: MarkdownLink = { + type: 'image', + href: './test-image.jpg', + text: 'Local Image', + line: 1, + resolvedPath: imagePath, + }; + + const result = await validator.validateLink(link, '/test/file.md'); + + expect(result).toBeNull(); // Local image should be valid + expect(mockFetch).not.toHaveBeenCalled(); + }); + }); + + describe('Content Change Detection', () => { + it('should provide 
content hashes for freshness analysis', async () => { + mockFetch.mockResolvedValue({ + ok: true, + status: 200, + headers: new Map(), + text: () => Promise.resolve('Content for hash analysis'), + url: 'https://example.com/hash-test', + }); + + const link: MarkdownLink = { + type: 'external', + href: 'https://example.com/hash-test', + text: 'Hash Test Link', + line: 1, + }; + + const result = await validator.validateLink(link, '/test/file.md'); + + // Should be valid since content is fresh and no stale patterns + expect(result).toBeNull(); + expect(mockFetch).toHaveBeenCalledWith('https://example.com/hash-test', { + method: 'GET', + signal: expect.any(AbortSignal), + headers: { + 'User-Agent': 'markmv-validator/1.0 (content-freshness-detection)', + }, + }); + }); + }); + + describe('Multiple Link Types', () => { + it('should handle mixed link types with selective freshness checking', async () => { + // Mock for external link + mockFetch.mockResolvedValue({ + ok: true, + status: 200, + headers: new Map(), + text: () => Promise.resolve('External content'), + url: 'https://example.com/external', + }); + + const links: MarkdownLink[] = [ + { + type: 'internal', + href: './internal.md', + text: 'Internal Link', + line: 1, + resolvedPath: join(tempDir, 'internal.md'), + }, + { + type: 'external', + href: 'https://example.com/external', + text: 'External Link', + line: 2, + }, + { + type: 'anchor', + href: '#section', + text: 'Anchor Link', + line: 3, + }, + ]; + + // Create internal file + await writeFile(join(tempDir, 'internal.md'), '# Internal File'); + + // Create source file with anchor + const sourceFile = join(tempDir, 'source.md'); + await writeFile(sourceFile, '# Section\nContent here'); + + const results = await Promise.all( + links.map(link => validator.validateLink(link, sourceFile)) + ); + + // Internal link should be valid + expect(results[0]).toBeNull(); + + // External link should be valid (fresh) + expect(results[1]).toBeNull(); + + // Anchor link should 
be valid + expect(results[2]).toBeNull(); + + // Only external link should trigger fetch + expect(mockFetch).toHaveBeenCalledTimes(1); + }); + }); +}); \ No newline at end of file diff --git a/src/core/link-validator.ts b/src/core/link-validator.ts index 0ffaf55..9d3bd60 100644 --- a/src/core/link-validator.ts +++ b/src/core/link-validator.ts @@ -1,5 +1,6 @@ import { constants, access } from 'node:fs/promises'; import { readFile } from 'node:fs/promises'; +import { ContentFreshnessDetector, type FreshnessConfig } from '../utils/content-freshness.js'; import type { BrokenLink, ValidationResult } from '../types/config.js'; import type { MarkdownLink, ParsedMarkdownFile } from '../types/links.js'; @@ -20,6 +21,10 @@ export interface LinkValidatorOptions { strictInternal?: boolean; /** Check Claude import links */ checkClaudeImports?: boolean; + /** Enable content freshness detection for external links */ + checkContentFreshness?: boolean; + /** Configuration for content freshness detection */ + freshnessConfig?: Partial; } /** @@ -62,7 +67,10 @@ export interface LinkValidatorOptions { * ``` */ export class LinkValidator { - private options: Required; + private options: Required> & { + freshnessConfig?: Partial; + }; + private freshnessDetector?: ContentFreshnessDetector; constructor(options: LinkValidatorOptions = {}) { this.options = { @@ -70,7 +78,13 @@ export class LinkValidator { externalTimeout: options.externalTimeout ?? 5000, strictInternal: options.strictInternal ?? true, checkClaudeImports: options.checkClaudeImports ?? true, + checkContentFreshness: options.checkContentFreshness ?? 
false, + ...(options.freshnessConfig && { freshnessConfig: options.freshnessConfig }), }; + + if (this.options.checkContentFreshness) { + this.freshnessDetector = new ContentFreshnessDetector(this.options.freshnessConfig); + } } async validateFiles(files: ParsedMarkdownFile[]): Promise { @@ -209,9 +223,15 @@ export class LinkValidator { const controller = new AbortController(); const timeoutId = setTimeout(() => controller.abort(), this.options.externalTimeout); + // For freshness detection, we need to make a GET request to get content + const method = this.options.checkContentFreshness ? 'GET' : 'HEAD'; + const response = await fetch(link.href, { - method: 'HEAD', + method, signal: controller.signal, + headers: { + 'User-Agent': 'markmv-validator/1.0 (content-freshness-detection)', + }, }); clearTimeout(timeoutId); @@ -225,7 +245,36 @@ export class LinkValidator { }; } - return null; // Link is valid + // Check content freshness if enabled + if (this.options.checkContentFreshness && this.freshnessDetector) { + const content = method === 'GET' ? 
await response.text() : ''; + const headers: Record = {}; + + // Convert Headers to plain object + response.headers.forEach((value, key) => { + headers[key] = value; + }); + + const freshnessInfo = await this.freshnessDetector.analyzeContentFreshness(link.href, { + status: response.status, + headers, + content, + finalUrl: response.url, + }); + + // If content is stale, return as a broken link with freshness info + if (!freshnessInfo.isFresh) { + return { + sourceFile, + link, + reason: 'content-stale', + details: freshnessInfo.warning || 'Content appears to be outdated', + freshnessInfo, + }; + } + } + + return null; // Link is valid and fresh } catch (error) { return { sourceFile, diff --git a/src/types/config.ts b/src/types/config.ts index 8c333a2..eec2dcd 100644 --- a/src/types/config.ts +++ b/src/types/config.ts @@ -99,7 +99,9 @@ export interface BrokenLink { /** The broken link */ link: import('./links.js').MarkdownLink; /** Reason the link is broken */ - reason: 'file-not-found' | 'external-error' | 'invalid-format' | 'circular-reference'; + reason: 'file-not-found' | 'external-error' | 'invalid-format' | 'circular-reference' | 'content-stale'; /** Additional error details */ details?: string; + /** Content freshness information for external links */ + freshnessInfo?: import('../utils/content-freshness.js').ContentFreshnessInfo; } diff --git a/src/utils/content-freshness.test.ts b/src/utils/content-freshness.test.ts new file mode 100644 index 0000000..84f94e4 --- /dev/null +++ b/src/utils/content-freshness.test.ts @@ -0,0 +1,455 @@ +/** + * Tests for content freshness detection system. 
+ * + * @fileoverview Tests for detecting stale external content and last-modified tracking + */ + +import { describe, it, expect, beforeEach, afterEach, vi } from 'vitest'; +import { mkdtemp, rm, writeFile } from 'node:fs/promises'; +import { tmpdir } from 'node:os'; +import { join } from 'node:path'; +import { ContentFreshnessDetector, DEFAULT_FRESHNESS_CONFIG, type ResponseInfo } from './content-freshness.js'; + +describe('ContentFreshnessDetector', () => { + let tempDir: string; + let detector: ContentFreshnessDetector; + + beforeEach(async () => { + tempDir = await mkdtemp(join(tmpdir(), 'content-freshness-test-')); + detector = new ContentFreshnessDetector({ + cacheDir: join(tempDir, 'cache'), + defaultThreshold: 365 * 24 * 60 * 60 * 1000, // 1 year for testing + }); + }); + + afterEach(async () => { + await rm(tempDir, { recursive: true, force: true }); + }); + + describe('Configuration', () => { + it('should initialize with default configuration', () => { + const defaultDetector = new ContentFreshnessDetector(); + expect(defaultDetector.isEnabled()).toBe(true); + }); + + it('should initialize with custom configuration', () => { + const customDetector = new ContentFreshnessDetector({ + enabled: false, + defaultThreshold: 6 * 30 * 24 * 60 * 60 * 1000, // 6 months + stalePatterns: ['custom-pattern'], + }); + expect(customDetector.isEnabled()).toBe(false); + }); + + it('should use default freshness configuration', () => { + expect(DEFAULT_FRESHNESS_CONFIG.enabled).toBe(true); + expect(DEFAULT_FRESHNESS_CONFIG.defaultThreshold).toBe(2 * 365 * 24 * 60 * 60 * 1000); + expect(DEFAULT_FRESHNESS_CONFIG.stalePatterns).toContain('deprecated'); + expect(DEFAULT_FRESHNESS_CONFIG.domainThresholds['firebase.google.com']).toBe(1 * 365 * 24 * 60 * 60 * 1000); + }); + }); + + describe('Content Analysis', () => { + it('should detect fresh content based on last-modified header', async () => { + const recentDate = new Date(Date.now() - 30 * 24 * 60 * 60 * 1000); // 30 days ago + 
const response: ResponseInfo = { + status: 200, + headers: { + 'last-modified': recentDate.toUTCString(), + }, + content: 'Fresh content here', + finalUrl: 'https://example.com/fresh', + }; + + const result = await detector.analyzeContentFreshness('https://example.com/fresh', response); + + expect(result.isFresh).toBe(true); + expect(result.lastModified).toBeDefined(); + expect(result.ageMs).toBeLessThan(365 * 24 * 60 * 60 * 1000); + expect(result.warning).toBeUndefined(); + }); + + it('should detect stale content based on last-modified header', async () => { + const oldDate = new Date(Date.now() - 2 * 365 * 24 * 60 * 60 * 1000); // 2 years ago + const response: ResponseInfo = { + status: 200, + headers: { + 'last-modified': oldDate.toUTCString(), + }, + content: 'Old content here', + finalUrl: 'https://example.com/old', + }; + + const result = await detector.analyzeContentFreshness('https://example.com/old', response); + + expect(result.isFresh).toBe(false); + expect(result.lastModified).toBeDefined(); + expect(result.ageMs).toBeGreaterThan(365 * 24 * 60 * 60 * 1000); + expect(result.warning).toContain('old'); + expect(result.suggestion).toContain('newer version'); + }); + + it('should detect stale patterns in content', async () => { + const response: ResponseInfo = { + status: 200, + headers: {}, + content: 'This feature is deprecated and no longer supported.', + finalUrl: 'https://example.com/deprecated', + }; + + const result = await detector.analyzeContentFreshness('https://example.com/deprecated', response); + + expect(result.isFresh).toBe(false); + expect(result.stalePatterns).toContain('deprecated'); + expect(result.stalePatterns).toContain('no longer supported'); + expect(result.warning).toContain('staleness indicators'); + }); + + it('should apply domain-specific thresholds', async () => { + const firebaseDetector = new ContentFreshnessDetector({ + cacheDir: join(tempDir, 'firebase-cache'), + domainThresholds: { + 'firebase.google.com': 6 * 30 * 24 * 60 * 
60 * 1000, // 6 months + }, + }); + + const dateEightMonthsAgo = new Date(Date.now() - 8 * 30 * 24 * 60 * 60 * 1000); + const response: ResponseInfo = { + status: 200, + headers: { + 'last-modified': dateEightMonthsAgo.toUTCString(), + }, + content: 'Firebase documentation', + finalUrl: 'https://firebase.google.com/docs/functions', + }; + + const result = await firebaseDetector.analyzeContentFreshness( + 'https://firebase.google.com/docs/functions', + response + ); + + expect(result.isFresh).toBe(false); + expect(result.thresholdMs).toBe(6 * 30 * 24 * 60 * 60 * 1000); + }); + + it('should handle missing last-modified header gracefully', async () => { + const response: ResponseInfo = { + status: 200, + headers: {}, + content: 'No last modified header', + finalUrl: 'https://example.com/no-header', + }; + + const result = await detector.analyzeContentFreshness('https://example.com/no-header', response); + + expect(result.lastModified).toBeUndefined(); + expect(result.ageMs).toBeUndefined(); + // Content might still be flagged as stale if patterns are detected + }); + + it('should detect multiple stale patterns', async () => { + const response: ResponseInfo = { + status: 200, + headers: {}, + content: 'This page has moved and the content is deprecated. 
This is legacy documentation.', + finalUrl: 'https://example.com/multi-stale', + }; + + const result = await detector.analyzeContentFreshness('https://example.com/multi-stale', response); + + expect(result.isFresh).toBe(false); + expect(result.stalePatterns).toContain('this page has moved'); + expect(result.stalePatterns).toContain('deprecated'); + expect(result.stalePatterns).toContain('legacy documentation'); + }); + }); + + describe('Content Change Detection', () => { + it('should detect content changes between validations', async () => { + const url = 'https://example.com/changing'; + + // First validation + const response1: ResponseInfo = { + status: 200, + headers: {}, + content: 'Original content', + finalUrl: url, + }; + + const result1 = await detector.analyzeContentFreshness(url, response1); + expect(result1.hasContentChanged).toBeUndefined(); // No previous content + + // Second validation with changed content + const response2: ResponseInfo = { + status: 200, + headers: {}, + content: 'Updated content with significant changes', + finalUrl: url, + }; + + const result2 = await detector.analyzeContentFreshness(url, response2); + expect(result2.hasContentChanged).toBe(true); + expect(result2.previousContentHash).toBeDefined(); + expect(result2.contentHash).not.toBe(result2.previousContentHash); + }); + + it('should not flag content as changed for minor whitespace differences', async () => { + const url = 'https://example.com/whitespace'; + + // First validation + const response1: ResponseInfo = { + status: 200, + headers: {}, + content: 'Content with extra\n\n\nspaces\t\tand\ttabs', + finalUrl: url, + }; + + const result1 = await detector.analyzeContentFreshness(url, response1); + + // Second validation with normalized whitespace + const response2: ResponseInfo = { + status: 200, + headers: {}, + content: 'Content with extra spaces and tabs', + finalUrl: url, + }; + + const result2 = await detector.analyzeContentFreshness(url, response2); + 
expect(result2.contentHash).toBe(result1.contentHash); + }); + + it('should normalize content for consistent hashing', async () => { + const url = 'https://example.com/normalize'; + + const contentWithVariations = ` + + + + + + + +

Main content here on 2024-01-15 at 14:30:25

+ + + `; + + const response: ResponseInfo = { + status: 200, + headers: {}, + content: contentWithVariations, + finalUrl: url, + }; + + const result = await detector.analyzeContentFreshness(url, response); + expect(result.contentHash).toBeDefined(); + expect(result.contentHash).toHaveLength(64); // SHA-256 hex string + }); + }); + + describe('Cache Management', () => { + it('should store and retrieve cached content information', async () => { + const url = 'https://example.com/cached'; + const response: ResponseInfo = { + status: 200, + headers: { + 'last-modified': new Date().toUTCString(), + 'etag': '"test-etag"', + }, + content: 'Cached content', + finalUrl: url, + }; + + // First analysis should cache the result + await detector.analyzeContentFreshness(url, response); + + // Get cache stats + const stats = await detector.getCacheStats(); + expect(stats.totalEntries).toBe(1); + expect(stats.newestEntry).toBeDefined(); + }); + + it('should clear cache successfully', async () => { + const url = 'https://example.com/clear-cache'; + const response: ResponseInfo = { + status: 200, + headers: {}, + content: 'Cache test content', + finalUrl: url, + }; + + await detector.analyzeContentFreshness(url, response); + + let stats = await detector.getCacheStats(); + expect(stats.totalEntries).toBe(1); + + await detector.clearCache(); + + stats = await detector.getCacheStats(); + expect(stats.totalEntries).toBe(0); + }); + + it('should handle cache errors gracefully', async () => { + const invalidDetector = new ContentFreshnessDetector({ + cacheDir: '/invalid/readonly/path', + }); + + const url = 'https://example.com/cache-error'; + const response: ResponseInfo = { + status: 200, + headers: {}, + content: 'Error test content', + finalUrl: url, + }; + + // Should not throw error even with invalid cache path + const result = await invalidDetector.analyzeContentFreshness(url, response); + expect(result.contentHash).toBeDefined(); + }); + }); + + describe('URL Processing', () => { 
+ it('should extract domain correctly from various URL formats', async () => { + const testCases = [ + 'https://example.com/path', + 'http://subdomain.example.com/path?query=1', + 'https://docs.github.com/en/actions', + 'https://firebase.google.com/docs/functions/beta', + ]; + + for (const url of testCases) { + const response: ResponseInfo = { + status: 200, + headers: {}, + content: 'Test content', + finalUrl: url, + }; + + const result = await detector.analyzeContentFreshness(url, response); + expect(result.url).toBe(url); + expect(result.thresholdMs).toBeGreaterThan(0); + } + }); + + it('should handle invalid URLs gracefully', async () => { + const invalidUrl = 'not-a-valid-url'; + const response: ResponseInfo = { + status: 200, + headers: {}, + content: 'Test content', + finalUrl: invalidUrl, + }; + + const result = await detector.analyzeContentFreshness(invalidUrl, response); + expect(result.url).toBe(invalidUrl); + expect(result.thresholdMs).toBe(detector['config'].defaultThreshold); + }); + }); + + describe('Age Formatting', () => { + it('should format age in human-readable format', async () => { + const testCases = [ + { ageMs: 2 * 365 * 24 * 60 * 60 * 1000, expected: '2 years' }, + { ageMs: 1 * 365 * 24 * 60 * 60 * 1000 + 6 * 30 * 24 * 60 * 60 * 1000, expected: '1 year, 6 months' }, + { ageMs: 3 * 30 * 24 * 60 * 60 * 1000, expected: '3 months' }, + { ageMs: 15 * 24 * 60 * 60 * 1000, expected: '15 days' }, + ]; + + for (const testCase of testCases) { + const oldDate = new Date(Date.now() - testCase.ageMs); + const response: ResponseInfo = { + status: 200, + headers: { + 'last-modified': oldDate.toUTCString(), + }, + content: 'Age test content', + finalUrl: 'https://example.com/age-test', + }; + + const result = await detector.analyzeContentFreshness('https://example.com/age-test', response); + + if (!result.isFresh && result.warning) { + expect(result.warning).toContain(testCase.expected); + } + } + }); + }); + + describe('Disabled Detection', () => { + 
it('should return fresh result when detection is disabled', async () => { + const disabledDetector = new ContentFreshnessDetector({ + enabled: false, + }); + + const response: ResponseInfo = { + status: 200, + headers: {}, + content: 'This content is deprecated and no longer supported', + finalUrl: 'https://example.com/disabled', + }; + + const result = await disabledDetector.analyzeContentFreshness('https://example.com/disabled', response); + + expect(result.isFresh).toBe(true); + expect(result.stalePatterns).toHaveLength(0); + expect(result.thresholdMs).toBe(0); + }); + }); + + describe('Case Sensitivity', () => { + it('should detect stale patterns case-insensitively', async () => { + const response: ResponseInfo = { + status: 200, + headers: {}, + content: 'This feature is DEPRECATED and NO LONGER SUPPORTED.', + finalUrl: 'https://example.com/case-test', + }; + + const result = await detector.analyzeContentFreshness('https://example.com/case-test', response); + + expect(result.isFresh).toBe(false); + expect(result.stalePatterns).toContain('deprecated'); + expect(result.stalePatterns).toContain('no longer supported'); + }); + }); + + describe('Performance', () => { + it('should handle large content efficiently', async () => { + const largeContent = 'Large content section '.repeat(10000) + 'deprecated'; + const response: ResponseInfo = { + status: 200, + headers: {}, + content: largeContent, + finalUrl: 'https://example.com/large', + }; + + const startTime = Date.now(); + const result = await detector.analyzeContentFreshness('https://example.com/large', response); + const endTime = Date.now(); + + expect(endTime - startTime).toBeLessThan(1000); // Should complete within 1 second + expect(result.stalePatterns).toContain('deprecated'); + expect(result.contentHash).toBeDefined(); + }); + + it('should handle concurrent analyses', async () => { + const urls = Array.from({ length: 10 }, (_, i) => `https://example.com/concurrent-${i}`); + const promises = urls.map(url => 
{ + const response: ResponseInfo = { + status: 200, + headers: {}, + content: `Content for ${url}`, + finalUrl: url, + }; + return detector.analyzeContentFreshness(url, response); + }); + + const results = await Promise.all(promises); + expect(results).toHaveLength(10); + results.forEach((result, i) => { + expect(result.url).toBe(urls[i]); + expect(result.contentHash).toBeDefined(); + }); + }); + }); +}); \ No newline at end of file diff --git a/src/utils/content-freshness.ts b/src/utils/content-freshness.ts new file mode 100644 index 0000000..75581af --- /dev/null +++ b/src/utils/content-freshness.ts @@ -0,0 +1,414 @@ +/** + * Content freshness detection utilities for external links. + * + * @fileoverview Detects potentially stale external content even when links are valid + */ + +import { createHash } from 'node:crypto'; +import { readFile, writeFile, mkdir } from 'node:fs/promises'; +import { join, dirname } from 'node:path'; +import { existsSync } from 'node:fs'; + +/** + * Configuration for content freshness detection. + */ +export interface FreshnessConfig { + /** Enable freshness detection */ + enabled: boolean; + /** Default staleness threshold in milliseconds */ + defaultThreshold: number; + /** Domain-specific thresholds */ + domainThresholds: Record; + /** Patterns that indicate stale or moved content */ + stalePatterns: string[]; + /** Cache directory for content tracking */ + cacheDir: string; + /** Whether to perform content change detection */ + detectContentChanges: boolean; +} + +/** + * Information about content freshness. 
+ */ +export interface ContentFreshnessInfo { + /** URL being checked */ + url: string; + /** Whether content is considered fresh */ + isFresh: boolean; + /** Last modified date if available */ + lastModified?: Date; + /** Age of content in milliseconds */ + ageMs?: number; + /** Staleness threshold that was applied */ + thresholdMs: number; + /** Detected stale patterns in content */ + stalePatterns: string[]; + /** Content hash for change detection */ + contentHash?: string; + /** Previous content hash if available */ + previousContentHash?: string; + /** Whether content has significantly changed */ + hasContentChanged?: boolean; + /** Warning message if content appears stale */ + warning?: string; + /** Suggestion for addressing staleness */ + suggestion?: string; +} + +/** + * Cached content information. + */ +interface CachedContentInfo { + /** URL */ + url: string; + /** Content hash */ + contentHash: string; + /** Last check timestamp */ + lastChecked: number; + /** Last modified date if available */ + lastModified?: number; + /** Response headers */ + headers?: Record; +} + +/** + * HTTP response information needed for freshness detection. + */ +export interface ResponseInfo { + /** Response status code */ + status: number; + /** Response headers */ + headers: Record; + /** Response body content */ + content: string; + /** Final URL after redirects */ + finalUrl: string; +} + +/** + * Content freshness detector for external links. + */ +export class ContentFreshnessDetector { + private config: FreshnessConfig; + private cacheFile: string; + + constructor(config: Partial = {}) { + this.config = { + enabled: config.enabled ?? true, + defaultThreshold: config.defaultThreshold ?? 2 * 365 * 24 * 60 * 60 * 1000, // 2 years + domainThresholds: config.domainThresholds ?? 
{ + 'firebase.google.com': 1 * 365 * 24 * 60 * 60 * 1000, // 1 year + 'docs.github.com': 6 * 30 * 24 * 60 * 60 * 1000, // 6 months + 'api.github.com': 6 * 30 * 24 * 60 * 60 * 1000, // 6 months + 'developers.google.com': 1 * 365 * 24 * 60 * 60 * 1000, // 1 year + 'docs.aws.amazon.com': 1 * 365 * 24 * 60 * 60 * 1000, // 1 year + 'docs.microsoft.com': 1 * 365 * 24 * 60 * 60 * 1000, // 1 year + }, + stalePatterns: config.stalePatterns ?? [ + 'deprecated', + 'no longer supported', + 'this page has moved', + 'page not found', + 'content has moved', + 'redirected permanently', + 'legacy documentation', + 'archived', + 'end of life', + 'discontinued', + 'migration notice', + 'breaking changes', + 'version no longer maintained', + ], + cacheDir: config.cacheDir ?? '.markmv-cache', + detectContentChanges: config.detectContentChanges ?? true, + }; + + this.cacheFile = join(this.config.cacheDir, 'content-freshness.json'); + } + + /** + * Check if freshness detection is enabled. + */ + isEnabled(): boolean { + return this.config.enabled; + } + + /** + * Analyze content freshness for a given URL response. + */ + async analyzeContentFreshness(url: string, response: ResponseInfo): Promise { + if (!this.config.enabled) { + return { + url, + isFresh: true, + thresholdMs: 0, + stalePatterns: [], + }; + } + + const domain = this.extractDomain(url); + const thresholdMs = this.config.domainThresholds[domain] ?? 
this.config.defaultThreshold; + + // Initialize result + const result: ContentFreshnessInfo = { + url, + isFresh: true, + thresholdMs, + stalePatterns: [], + }; + + // Check last-modified header + const lastModified = this.parseLastModified(response.headers); + if (lastModified) { + result.lastModified = lastModified; + result.ageMs = Date.now() - lastModified.getTime(); + + if (result.ageMs > thresholdMs) { + result.isFresh = false; + result.warning = `Content is ${this.formatAge(result.ageMs)} old`; + result.suggestion = 'Check for newer version or updated documentation'; + } + } + + // Detect stale patterns in content + const detectedPatterns = this.detectStalePatterns(response.content); + if (detectedPatterns.length > 0) { + result.stalePatterns = detectedPatterns; + result.isFresh = false; + result.warning = result.warning || 'Content contains staleness indicators'; + result.suggestion = result.suggestion || 'Review content for updates or alternatives'; + } + + // Content change detection + if (this.config.detectContentChanges) { + const contentHash = this.calculateContentHash(response.content); + result.contentHash = contentHash; + + const cached = await this.getCachedContent(url); + if (cached && cached.contentHash !== contentHash) { + result.previousContentHash = cached.contentHash; + result.hasContentChanged = true; + + if (!result.warning) { + result.warning = 'Content has changed since last validation'; + result.suggestion = 'Review changes to ensure links are still relevant'; + } + } + + // Update cache + await this.updateCachedContent(url, contentHash, response.headers, lastModified); + } + + return result; + } + + /** + * Extract domain from URL. + */ + private extractDomain(url: string): string { + try { + return new URL(url).hostname; + } catch { + return ''; + } + } + + /** + * Parse Last-Modified header. 
+   *
+   * Both the lower-case and the canonical `Last-Modified` spellings of the
+   * header name are consulted. `new Date()` does not throw on malformed input
+   * (it yields an Invalid Date), so the parsed value is validated explicitly.
+   *
+   * @param headers - Response headers keyed by header name
+   * @returns The parsed date, or `undefined` when the header is missing or unparseable
+   */
+  private parseLastModified(headers: Record<string, string>): Date | undefined {
+    const rawValue = headers['last-modified'] || headers['Last-Modified'];
+    if (!rawValue) {
+      return undefined;
+    }
+
+    const parsed = new Date(rawValue);
+    // An unparseable header gives a Date whose time value is NaN; treat it as absent
+    return Number.isNaN(parsed.getTime()) ? undefined : parsed;
+  }
+
+  /**
+   * Detect stale patterns in content.
+   *
+   * Matching is a case-insensitive substring search of each configured pattern.
+   *
+   * @param content - Text to scan
+   * @returns The configured patterns (in their configured spelling) found in the content
+   */
+  private detectStalePatterns(content: string): string[] {
+    const haystack = content.toLowerCase();
+    return this.config.stalePatterns.filter((pattern) => haystack.includes(pattern.toLowerCase()));
+  }
+
+  /**
+   * Calculate content hash for change detection.
+   *
+   * The content is normalized before hashing so that cosmetic differences
+   * (whitespace, HTML comments, inline scripts and styles, ISO dates, clock
+   * times) do not register as a content change.
+   *
+   * @param content - Raw response body
+   * @returns Hex-encoded SHA-256 digest of the normalized content
+   */
+  private calculateContentHash(content: string): string {
+    // Normalize content to reduce false positives
+    const normalized = content
+      .replace(/\s+/g, ' ') // Normalize whitespace
+      .replace(/<!--.*?-->/gs, '') // Remove HTML comments
+      .replace(/<script\b[^<]*(?:(?!<\/script>)<[^<]*)*<\/script>/gi, '') // Remove scripts
+      .replace(/<style\b[^<]*(?:(?!<\/style>)<[^<]*)*<\/style>/gi, '') // Remove styles
+      .replace(/\d{4}-\d{2}-\d{2}/g, 'DATE') // Replace dates
+      .replace(/\b\d{1,2}:\d{2}(:\d{2})?\b/g, 'TIME') // Replace times
+      .trim();
+
+    return createHash('sha256').update(normalized, 'utf8').digest('hex');
+  }
+
+  /**
+   * Format age in human-readable format.
+   *
+   * Uses 365-day years and 30-day months, and reports at most the two largest
+   * units: "N years, M months", "M months, D days" or "D days".
+   *
+   * @param ageMs - Age in milliseconds
+   * @returns Human-readable age
+   */
+  private formatAge(ageMs: number): string {
+    const dayMs = 24 * 60 * 60 * 1000;
+    const monthMs = 30 * dayMs;
+    const yearMs = 365 * dayMs;
+    const withUnit = (count: number, unit: string): string =>
+      `${count} ${unit}${count === 1 ? '' : 's'}`;
+
+    const years = Math.floor(ageMs / yearMs);
+    const remainderMs = ageMs % yearMs;
+    const months = Math.floor(remainderMs / monthMs);
+    const days = Math.floor((remainderMs % monthMs) / dayMs);
+
+    if (years > 0) {
+      // A 365-day year holds twelve 30-day months plus five days; cap the month
+      // count so those last days never render as "1 year, 12 months"
+      const shownMonths = Math.min(months, 11);
+      return shownMonths > 0
+        ? `${withUnit(years, 'year')}, ${withUnit(shownMonths, 'month')}`
+        : withUnit(years, 'year');
+    }
+    if (months > 0) {
+      return days > 0
+        ? `${withUnit(months, 'month')}, ${withUnit(days, 'day')}`
+        : withUnit(months, 'month');
+    }
+    return withUnit(days, 'day');
+  }
+
+  /**
+   * Get cached content information.
+   *
+   * @param url - URL whose cache entry is wanted
+   * @returns The stored entry, or `undefined` when the cache file is missing,
+   *   unreadable, not valid JSON, or has no entry for the URL
+   */
+  private async getCachedContent(url: string): Promise<CachedContentInfo | undefined> {
+    if (!existsSync(this.cacheFile)) {
+      return undefined;
+    }
+
+    try {
+      const parsed: unknown = JSON.parse(await readFile(this.cacheFile, 'utf8'));
+      if (typeof parsed !== 'object' || parsed === null) {
+        return undefined;
+      }
+
+      const entries = parsed as Record<string, CachedContentInfo>;
+      // Own-property lookup only, so a key such as "constructor" cannot resolve
+      // to something inherited from Object.prototype
+      return Object.prototype.hasOwnProperty.call(entries, url) ? entries[url] : undefined;
+    } catch {
+      return undefined;
+    }
+  }
+
+  /**
+   * Update cached content information.
+   *
+   * Reads the whole cache file, replaces the entry for `url` and writes the file
+   * back. Failures never propagate: they are reported with `console.warn`
+   * unless `NODE_ENV` is `test`.
+   *
+   * @param url - URL the entry belongs to
+   * @param contentHash - Hash of the current content
+   * @param headers - Response headers; last-modified, etag and cache-control are stored
+   * @param lastModified - Parsed Last-Modified date, stored as epoch milliseconds when given
+   */
+  private async updateCachedContent(
+    url: string,
+    contentHash: string,
+    headers: Record<string, string>,
+    lastModified?: Date
+  ): Promise<void> {
+    try {
+      // Ensure cache directory exists
+      await mkdir(dirname(this.cacheFile), { recursive: true });
+
+      let cacheData: Record<string, CachedContentInfo> = {};
+
+      if (existsSync(this.cacheFile)) {
+        try {
+          const parsed: unknown = JSON.parse(await readFile(this.cacheFile, 'utf8'));
+          // Only a plain JSON object is a usable cache; entries added to an array
+          // would be dropped again by JSON.stringify
+          if (typeof parsed === 'object' && parsed !== null && !Array.isArray(parsed)) {
+            cacheData = parsed as Record<string, CachedContentInfo>;
+          }
+        } catch {
+          // Invalid cache file, start fresh
+        }
+      }
+
+      cacheData[url] = {
+        url,
+        contentHash,
+        lastChecked: Date.now(),
+        ...(lastModified && { lastModified: lastModified.getTime() }),
+        headers: {
+          'last-modified': headers['last-modified'] || headers['Last-Modified'] || '',
+          'etag': headers['etag'] || headers['ETag'] || '',
+          'cache-control': headers['cache-control'] || headers['Cache-Control'] || '',
+        },
+      };
+
+      await writeFile(this.cacheFile, JSON.stringify(cacheData, null, 2), 'utf8');
+    } catch (error) {
+      // Cache updates are best-effort: warn (outside tests) instead of throwing
+      if (process.env.NODE_ENV !== 'test') {
+        console.warn(`Warning: Failed to update content freshness cache: ${error instanceof Error ? error.message : String(error)}`);
+      }
+    }
+  }
+
+  /**
+   * Clear the content freshness cache.
+   *
+   * An existing cache file is overwritten with an empty JSON object; nothing is
+   * created when the file does not exist.
+   *
+   * @throws Error when the cache file cannot be written
+   */
+  async clearCache(): Promise<void> {
+    try {
+      if (existsSync(this.cacheFile)) {
+        await writeFile(this.cacheFile, '{}', 'utf8');
+      }
+    } catch (error) {
+      throw new Error(`Failed to clear content freshness cache: ${error instanceof Error ? error.message : String(error)}`);
+    }
+  }
+
+  /**
+   * Get cache statistics.
+   *
+   * @returns The number of cached entries plus the oldest and newest
+   *   `lastChecked` timestamps. Only `totalEntries` is returned when no entry
+   *   carries a usable timestamp, and `{ totalEntries: 0 }` when the cache file
+   *   is missing, empty or unreadable.
+   */
+  async getCacheStats(): Promise<{ totalEntries: number; oldestEntry?: Date; newestEntry?: Date }> {
+    try {
+      if (!existsSync(this.cacheFile)) {
+        return { totalEntries: 0 };
+      }
+
+      const cacheData = JSON.parse(await readFile(this.cacheFile, 'utf8'));
+      const entries = Object.values(cacheData) as CachedContentInfo[];
+
+      if (entries.length === 0) {
+        return { totalEntries: 0 };
+      }
+
+      // Single-pass fold instead of Math.min(...timestamps): spreading a very
+      // large array can exceed the engine's argument limit, and that RangeError
+      // would be swallowed by the catch below and reported as an empty cache
+      let oldest = Number.POSITIVE_INFINITY;
+      let newest = Number.NEGATIVE_INFINITY;
+      for (const entry of entries) {
+        const checkedAt: unknown = entry.lastChecked;
+        if (typeof checkedAt !== 'number' || !Number.isFinite(checkedAt)) {
+          continue; // Skip entries without a usable timestamp
+        }
+        if (checkedAt < oldest) {
+          oldest = checkedAt;
+        }
+        if (checkedAt > newest) {
+          newest = checkedAt;
+        }
+      }
+
+      if (!Number.isFinite(oldest)) {
+        return { totalEntries: entries.length };
+      }
+
+      return {
+        totalEntries: entries.length,
+        oldestEntry: new Date(oldest),
+        newestEntry: new Date(newest),
+      };
+    } catch {
+      return { totalEntries: 0 };
+    }
+  }
+}
+
+/**
+ * Default freshness configuration.
+ *
+ * All thresholds are in milliseconds and are built from 365-day years and
+ * 30-day months. `domainThresholds` is keyed by host name, and
+ * `defaultThreshold` is the fallback threshold. `stalePatterns` are matched
+ * against content as case-insensitive substrings.
+ */
+export const DEFAULT_FRESHNESS_CONFIG: FreshnessConfig = {
+  enabled: true,
+  defaultThreshold: 2 * 365 * 24 * 60 * 60 * 1000, // 2 years
+  domainThresholds: {
+    'firebase.google.com': 1 * 365 * 24 * 60 * 60 * 1000, // 1 year
+    'docs.github.com': 6 * 30 * 24 * 60 * 60 * 1000, // 6 months
+    'api.github.com': 6 * 30 * 24 * 60 * 60 * 1000, // 6 months
+    'developers.google.com': 1 * 365 * 24 * 60 * 60 * 1000, // 1 year
+    'docs.aws.amazon.com': 1 * 365 * 24 * 60 * 60 * 1000, // 1 year
+    'docs.microsoft.com': 1 * 365 * 24 * 60 * 60 * 1000, // 1 year
+  },
+  stalePatterns: [
+    'deprecated',
+    'no longer supported',
+    'this page has moved',
+    'page not found',
+    'content has moved',
+    'redirected permanently',
+    'legacy documentation',
+    'archived',
+    'end of life',
+    'discontinued',
+    'migration notice',
+    'breaking changes',
+    'version no longer maintained',
+  ],
+  cacheDir: '.markmv-cache',
+  detectContentChanges: true,
+};
\ No newline at end of file