From 47f3c0c719cf0cc9526bba39cea3a46355efd301 Mon Sep 17 00:00:00 2001 From: Joe Heck Date: Sat, 8 Nov 2025 14:45:09 -0800 Subject: [PATCH 1/2] a crawling script that walks a local development instance for content issues - uses playwright - dumps a report in JSON, the file name of which has been added to .gitignore - after hosting the site locally, run `npm run site-check` to drive through the site - implemented using claude/ML --- .gitignore | 3 + README.md | 70 ++ package-lock.json | 475 +++++++++- package.json | 10 +- scripts/site-check.js | 1056 +++++++++++++++++++++ scripts/site-report-format.md | 408 ++++++++ scripts/site-visualize-usage.md | 269 ++++++ scripts/site-visualize.js | 1552 +++++++++++++++++++++++++++++++ scripts/site-visualize.md | 717 ++++++++++++++ 9 files changed, 4556 insertions(+), 4 deletions(-) create mode 100644 scripts/site-check.js create mode 100644 scripts/site-report-format.md create mode 100644 scripts/site-visualize-usage.md create mode 100644 scripts/site-visualize.js create mode 100644 scripts/site-visualize.md diff --git a/.gitignore b/.gitignore index 7402a5506..5f639ffaa 100644 --- a/.gitignore +++ b/.gitignore @@ -25,3 +25,6 @@ xcuserdata # Npm modules node_modules + +# Site check reports +site-check-report.json diff --git a/README.md b/README.md index e35035784..4a0304d75 100644 --- a/README.md +++ b/README.md @@ -18,6 +18,7 @@ Swift.org uses [Jekyll](https://jekyllrb.com), a blog-aware, static site generat ### Running locally Requirements + - Git - Ruby 3.3 or higher _(a Ruby installation manager, such as @@ -38,6 +39,7 @@ open "http://localhost:4000" If you’d like to contribute to this project, please run Prettier before submitting your pull request to ensure consistent code style across the project. 
Requirements + - [Node v18.17.1 or higher](https://nodejs.org) ```shell @@ -48,6 +50,74 @@ npm install npm run prettify ``` +### Running site content checks + +The site checker tool crawls the locally running development site to identify content issues like broken links, missing images, and isolated pages. + +**Prerequisites:** + +- Node.js v18.17.1 or higher +- Site running locally at http://localhost:4000 + +**Running the checker:** + +```bash +# Install dependencies (if not already done) +npm install + +# Start the local development server in one terminal +LC_ALL=en_us.UTF-8 bundle exec jekyll serve --config _config.yml,_config_dev.yml + +# In another terminal, run the site checker +npm run site-check +``` + +The tool will generate: + +- Console output with detected issues +- `site-check-report.json` with detailed findings + +**Querying the report:** + +```bash +# List all isolated pages (pages with no incoming links) +jq '.pages | to_entries | map(select(.value.isIsolated == true) | .key) | .[]' site-check-report.json + +# Show incoming links for a specific page +jq '.pages["/install/macos"].incomingLinks' site-check-report.json + +# Find pages with the most incoming links +jq '.pages | to_entries | map({page: .key, count: (.value.incomingLinks | length)}) | sort_by(.count) | reverse | .[0:10]' site-check-report.json + +# Find which pages link to a specific page +jq '.pages | to_entries[] | select(.value.outgoingLinks.content[] == "/documentation") | .key' site-check-report.json + +# List all pages with errors and their incoming links +jq '.pages | to_entries | map(select(.value.issues.error != null) | {page: .key, error: .value.issues.error, incomingLinks: .value.incomingLinks}) | .[]' site-check-report.json +``` + +**Configuration:** + +You can customize the site check behavior using environment variables: + +```bash +# Check up to 2000 pages (default: 1000) +MAX_PAGES=2000 npm run site-check + +# Use a different base URL +SITE_URL=http://localhost:8080 npm 
run site-check + +# Add delay between page requests in milliseconds (default: 50) +# Increase this if your dev server is struggling under load +CRAWL_DELAY=250 npm run site-check + +# Enable external link checking (default: false, currently unused) +CHECK_EXTERNAL=true npm run site-check + +# Combine multiple options +MAX_PAGES=500 CRAWL_DELAY=100 npm run site-check +``` + ### Running in Docker First build the site with Docker Compose: diff --git a/package-lock.json b/package-lock.json index c17da49e4..556bca352 100644 --- a/package-lock.json +++ b/package-lock.json @@ -7,9 +7,11 @@ "name": "swift.org", "hasInstallScript": true, "dependencies": { - "animejs": "^4.0.2" + "animejs": "^4.0.2", + "d3": "^7.9.0" }, "devDependencies": { + "playwright": "^1.49.1", "prettier": "^3.5.3" } }, @@ -19,6 +21,462 @@ "integrity": "sha512-f0L/kSya2RF23iMSF/VO01pMmLwlAFoiQeNAvBXhEyLzIPd2/QTBRatwGUqkVCC6seaAJYzAkGir55N4SL+h3A==", "license": "MIT" }, + "node_modules/commander": { + "version": "7.2.0", + "resolved": "https://npm.apple.com/commander/-/commander-7.2.0.tgz", + "integrity": "sha512-QrWXB+ZQSVPmIWIhtEO9H+gwHaMGYiF5ChvoJ+K9ZGHG/sVsa6yiesAD1GC/x46sET00Xlwo1u49RVVVzvcSkw==", + "engines": { + "node": ">= 10" + } + }, + "node_modules/d3": { + "version": "7.9.0", + "resolved": "https://npm.apple.com/d3/-/d3-7.9.0.tgz", + "integrity": "sha512-e1U46jVP+w7Iut8Jt8ri1YsPOvFpg46k+K8TpCb0P+zjCkjkPnV7WzfDJzMHy1LnA+wj5pLT1wjO901gLXeEhA==", + "dependencies": { + "d3-array": "3", + "d3-axis": "3", + "d3-brush": "3", + "d3-chord": "3", + "d3-color": "3", + "d3-contour": "4", + "d3-delaunay": "6", + "d3-dispatch": "3", + "d3-drag": "3", + "d3-dsv": "3", + "d3-ease": "3", + "d3-fetch": "3", + "d3-force": "3", + "d3-format": "3", + "d3-geo": "3", + "d3-hierarchy": "3", + "d3-interpolate": "3", + "d3-path": "3", + "d3-polygon": "3", + "d3-quadtree": "3", + "d3-random": "3", + "d3-scale": "4", + "d3-scale-chromatic": "3", + "d3-selection": "3", + "d3-shape": "3", + "d3-time": "3", + 
"d3-time-format": "4", + "d3-timer": "3", + "d3-transition": "3", + "d3-zoom": "3" + }, + "engines": { + "node": ">=12" + } + }, + "node_modules/d3-array": { + "version": "3.2.4", + "resolved": "https://artifacts.apple.com/artifactory/api/npm/npm-apple/d3-array/-/d3-array-3.2.4.tgz", + "integrity": "sha512-tdQAmyA18i4J7wprpYq8ClcxZy3SC31QMeByyCFyRt7BVHdREQZ5lpzoe5mFEYZUWe+oq8HBvk9JjpibyEV4Jg==", + "license": "ISC", + "dependencies": { + "internmap": "1 - 2" + }, + "engines": { + "node": ">=12" + } + }, + "node_modules/d3-axis": { + "version": "3.0.0", + "resolved": "https://npm.apple.com/d3-axis/-/d3-axis-3.0.0.tgz", + "integrity": "sha512-IH5tgjV4jE/GhHkRV0HiVYPDtvfjHQlQfJHs0usq7M30XcSBvOotpmH1IgkcXsO/5gEQZD43B//fc7SRT5S+xw==", + "engines": { + "node": ">=12" + } + }, + "node_modules/d3-brush": { + "version": "3.0.0", + "resolved": "https://artifacts.apple.com/artifactory/api/npm/npm-apple/d3-brush/-/d3-brush-3.0.0.tgz", + "integrity": "sha512-ALnjWlVYkXsVIGlOsuWH1+3udkYFI48Ljihfnh8FZPF2QS9o+PzGLBslO0PjzVoHLZ2KCVgAM8NVkXPJB2aNnQ==", + "license": "ISC", + "dependencies": { + "d3-dispatch": "1 - 3", + "d3-drag": "2 - 3", + "d3-interpolate": "1 - 3", + "d3-selection": "3", + "d3-transition": "3" + }, + "engines": { + "node": ">=12" + } + }, + "node_modules/d3-chord": { + "version": "3.0.1", + "resolved": "https://npm.apple.com/d3-chord/-/d3-chord-3.0.1.tgz", + "integrity": "sha512-VE5S6TNa+j8msksl7HwjxMHDM2yNK3XCkusIlpX5kwauBfXuyLAtNg9jCp/iHH61tgI4sb6R/EIMWCqEIdjT/g==", + "dependencies": { + "d3-path": "1 - 3" + }, + "engines": { + "node": ">=12" + } + }, + "node_modules/d3-color": { + "version": "3.1.0", + "resolved": "https://npm.apple.com/d3-color/-/d3-color-3.1.0.tgz", + "integrity": "sha512-zg/chbXyeBtMQ1LbD/WSoW2DpC3I0mpmPdW+ynRTj/x2DAWYrIY7qeZIHidozwV24m4iavr15lNwIwLxRmOxhA==", + "engines": { + "node": ">=12" + } + }, + "node_modules/d3-contour": { + "version": "4.0.2", + "resolved": "https://npm.apple.com/d3-contour/-/d3-contour-4.0.2.tgz", + "integrity": 
"sha512-4EzFTRIikzs47RGmdxbeUvLWtGedDUNkTcmzoeyg4sP/dvCexO47AaQL7VKy/gul85TOxw+IBgA8US2xwbToNA==", + "dependencies": { + "d3-array": "^3.2.0" + }, + "engines": { + "node": ">=12" + } + }, + "node_modules/d3-delaunay": { + "version": "6.0.4", + "resolved": "https://npm.apple.com/d3-delaunay/-/d3-delaunay-6.0.4.tgz", + "integrity": "sha512-mdjtIZ1XLAM8bm/hx3WwjfHt6Sggek7qH043O8KEjDXN40xi3vx/6pYSVTwLjEgiXQTbvaouWKynLBiUZ6SK6A==", + "dependencies": { + "delaunator": "5" + }, + "engines": { + "node": ">=12" + } + }, + "node_modules/d3-dispatch": { + "version": "3.0.1", + "resolved": "https://npm.apple.com/d3-dispatch/-/d3-dispatch-3.0.1.tgz", + "integrity": "sha512-rzUyPU/S7rwUflMyLc1ETDeBj0NRuHKKAcvukozwhshr6g6c5d8zh4c2gQjY2bZ0dXeGLWc1PF174P2tVvKhfg==", + "engines": { + "node": ">=12" + } + }, + "node_modules/d3-drag": { + "version": "3.0.0", + "resolved": "https://npm.apple.com/d3-drag/-/d3-drag-3.0.0.tgz", + "integrity": "sha512-pWbUJLdETVA8lQNJecMxoXfH6x+mO2UQo8rSmZ+QqxcbyA3hfeprFgIT//HW2nlHChWeIIMwS2Fq+gEARkhTkg==", + "dependencies": { + "d3-dispatch": "1 - 3", + "d3-selection": "3" + }, + "engines": { + "node": ">=12" + } + }, + "node_modules/d3-dsv": { + "version": "3.0.1", + "resolved": "https://npm.apple.com/d3-dsv/-/d3-dsv-3.0.1.tgz", + "integrity": "sha512-UG6OvdI5afDIFP9w4G0mNq50dSOsXHJaRE8arAS5o9ApWnIElp8GZw1Dun8vP8OyHOZ/QJUKUJwxiiCCnUwm+Q==", + "dependencies": { + "commander": "7", + "iconv-lite": "0.6", + "rw": "1" + }, + "bin": { + "csv2json": "bin/dsv2json.js", + "csv2tsv": "bin/dsv2dsv.js", + "dsv2dsv": "bin/dsv2dsv.js", + "dsv2json": "bin/dsv2json.js", + "json2csv": "bin/json2dsv.js", + "json2dsv": "bin/json2dsv.js", + "json2tsv": "bin/json2dsv.js", + "tsv2csv": "bin/dsv2dsv.js", + "tsv2json": "bin/dsv2json.js" + }, + "engines": { + "node": ">=12" + } + }, + "node_modules/d3-ease": { + "version": "3.0.1", + "resolved": "https://npm.apple.com/d3-ease/-/d3-ease-3.0.1.tgz", + "integrity": 
"sha512-wR/XK3D3XcLIZwpbvQwQ5fK+8Ykds1ip7A2Txe0yxncXSdq1L9skcG7blcedkOX+ZcgxGAmLX1FrRGbADwzi0w==", + "engines": { + "node": ">=12" + } + }, + "node_modules/d3-fetch": { + "version": "3.0.1", + "resolved": "https://npm.apple.com/d3-fetch/-/d3-fetch-3.0.1.tgz", + "integrity": "sha512-kpkQIM20n3oLVBKGg6oHrUchHM3xODkTzjMoj7aWQFq5QEM+R6E4WkzT5+tojDY7yjez8KgCBRoj4aEr99Fdqw==", + "dependencies": { + "d3-dsv": "1 - 3" + }, + "engines": { + "node": ">=12" + } + }, + "node_modules/d3-force": { + "version": "3.0.0", + "resolved": "https://npm.apple.com/d3-force/-/d3-force-3.0.0.tgz", + "integrity": "sha512-zxV/SsA+U4yte8051P4ECydjD/S+qeYtnaIyAs9tgHCqfguma/aAQDjo85A9Z6EKhBirHRJHXIgJUlffT4wdLg==", + "dependencies": { + "d3-dispatch": "1 - 3", + "d3-quadtree": "1 - 3", + "d3-timer": "1 - 3" + }, + "engines": { + "node": ">=12" + } + }, + "node_modules/d3-format": { + "version": "3.1.0", + "resolved": "https://npm.apple.com/d3-format/-/d3-format-3.1.0.tgz", + "integrity": "sha512-YyUI6AEuY/Wpt8KWLgZHsIU86atmikuoOmCfommt0LYHiQSPjvX2AcFc38PX0CBpr2RCyZhjex+NS/LPOv6YqA==", + "engines": { + "node": ">=12" + } + }, + "node_modules/d3-geo": { + "version": "3.1.1", + "resolved": "https://npm.apple.com/d3-geo/-/d3-geo-3.1.1.tgz", + "integrity": "sha512-637ln3gXKXOwhalDzinUgY83KzNWZRKbYubaG+fGVuc/dxO64RRljtCTnf5ecMyE1RIdtqpkVcq0IbtU2S8j2Q==", + "dependencies": { + "d3-array": "2.5.0 - 3" + }, + "engines": { + "node": ">=12" + } + }, + "node_modules/d3-hierarchy": { + "version": "3.1.2", + "resolved": "https://artifacts.apple.com/artifactory/api/npm/npm-apple/d3-hierarchy/-/d3-hierarchy-3.1.2.tgz", + "integrity": "sha512-FX/9frcub54beBdugHjDCdikxThEqjnR93Qt7PvQTOHxyiNCAlvMrHhclk3cD5VeAaq9fxmfRp+CnWw9rEMBuA==", + "license": "ISC", + "engines": { + "node": ">=12" + } + }, + "node_modules/d3-interpolate": { + "version": "3.0.1", + "resolved": "https://artifacts.apple.com/artifactory/api/npm/npm-apple/d3-interpolate/-/d3-interpolate-3.0.1.tgz", + "integrity": 
"sha512-3bYs1rOD33uo8aqJfKP3JWPAibgw8Zm2+L9vBKEHJ2Rg+viTR7o5Mmv5mZcieN+FRYaAOWX5SJATX6k1PWz72g==", + "license": "ISC", + "dependencies": { + "d3-color": "1 - 3" + }, + "engines": { + "node": ">=12" + } + }, + "node_modules/d3-path": { + "version": "3.1.0", + "resolved": "https://npm.apple.com/d3-path/-/d3-path-3.1.0.tgz", + "integrity": "sha512-p3KP5HCf/bvjBSSKuXid6Zqijx7wIfNW+J/maPs+iwR35at5JCbLUT0LzF1cnjbCHWhqzQTIN2Jpe8pRebIEFQ==", + "engines": { + "node": ">=12" + } + }, + "node_modules/d3-polygon": { + "version": "3.0.1", + "resolved": "https://npm.apple.com/d3-polygon/-/d3-polygon-3.0.1.tgz", + "integrity": "sha512-3vbA7vXYwfe1SYhED++fPUQlWSYTTGmFmQiany/gdbiWgU/iEyQzyymwL9SkJjFFuCS4902BSzewVGsHHmHtXg==", + "engines": { + "node": ">=12" + } + }, + "node_modules/d3-quadtree": { + "version": "3.0.1", + "resolved": "https://npm.apple.com/d3-quadtree/-/d3-quadtree-3.0.1.tgz", + "integrity": "sha512-04xDrxQTDTCFwP5H6hRhsRcb9xxv2RzkcsygFzmkSIOJy3PeRJP7sNk3VRIbKXcog561P9oU0/rVH6vDROAgUw==", + "engines": { + "node": ">=12" + } + }, + "node_modules/d3-random": { + "version": "3.0.1", + "resolved": "https://npm.apple.com/d3-random/-/d3-random-3.0.1.tgz", + "integrity": "sha512-FXMe9GfxTxqd5D6jFsQ+DJ8BJS4E/fT5mqqdjovykEB2oFbTMDVdg1MGFxfQW+FBOGoB++k8swBrgwSHT1cUXQ==", + "engines": { + "node": ">=12" + } + }, + "node_modules/d3-scale": { + "version": "4.0.2", + "resolved": "https://npm.apple.com/d3-scale/-/d3-scale-4.0.2.tgz", + "integrity": "sha512-GZW464g1SH7ag3Y7hXjf8RoUuAFIqklOAq3MRl4OaWabTFJY9PN/E1YklhXLh+OQ3fM9yS2nOkCoS+WLZ6kvxQ==", + "dependencies": { + "d3-array": "2.10.0 - 3", + "d3-format": "1 - 3", + "d3-interpolate": "1.2.0 - 3", + "d3-time": "2.1.1 - 3", + "d3-time-format": "2 - 4" + }, + "engines": { + "node": ">=12" + } + }, + "node_modules/d3-scale-chromatic": { + "version": "3.1.0", + "resolved": "https://npm.apple.com/d3-scale-chromatic/-/d3-scale-chromatic-3.1.0.tgz", + "integrity": 
"sha512-A3s5PWiZ9YCXFye1o246KoscMWqf8BsD9eRiJ3He7C9OBaxKhAd5TFCdEx/7VbKtxxTsu//1mMJFrEt572cEyQ==", + "dependencies": { + "d3-color": "1 - 3", + "d3-interpolate": "1 - 3" + }, + "engines": { + "node": ">=12" + } + }, + "node_modules/d3-selection": { + "version": "3.0.0", + "resolved": "https://npm.apple.com/d3-selection/-/d3-selection-3.0.0.tgz", + "integrity": "sha512-fmTRWbNMmsmWq6xJV8D19U/gw/bwrHfNXxrIN+HfZgnzqTHp9jOmKMhsTUjXOJnZOdZY9Q28y4yebKzqDKlxlQ==", + "peer": true, + "engines": { + "node": ">=12" + } + }, + "node_modules/d3-shape": { + "version": "3.2.0", + "resolved": "https://npm.apple.com/d3-shape/-/d3-shape-3.2.0.tgz", + "integrity": "sha512-SaLBuwGm3MOViRq2ABk3eLoxwZELpH6zhl3FbAoJ7Vm1gofKx6El1Ib5z23NUEhF9AsGl7y+dzLe5Cw2AArGTA==", + "dependencies": { + "d3-path": "^3.1.0" + }, + "engines": { + "node": ">=12" + } + }, + "node_modules/d3-time": { + "version": "3.1.0", + "resolved": "https://npm.apple.com/d3-time/-/d3-time-3.1.0.tgz", + "integrity": "sha512-VqKjzBLejbSMT4IgbmVgDjpkYrNWUYJnbCGo874u7MMKIWsILRX+OpX/gTk8MqjpT1A/c6HY2dCA77ZN0lkQ2Q==", + "dependencies": { + "d3-array": "2 - 3" + }, + "engines": { + "node": ">=12" + } + }, + "node_modules/d3-time-format": { + "version": "4.1.0", + "resolved": "https://npm.apple.com/d3-time-format/-/d3-time-format-4.1.0.tgz", + "integrity": "sha512-dJxPBlzC7NugB2PDLwo9Q8JiTR3M3e4/XANkreKSUxF8vvXKqm1Yfq4Q5dl8budlunRVlUUaDUgFt7eA8D6NLg==", + "dependencies": { + "d3-time": "1 - 3" + }, + "engines": { + "node": ">=12" + } + }, + "node_modules/d3-timer": { + "version": "3.0.1", + "resolved": "https://npm.apple.com/d3-timer/-/d3-timer-3.0.1.tgz", + "integrity": "sha512-ndfJ/JxxMd3nw31uyKoY2naivF+r29V+Lc0svZxe1JvvIRmi8hUsrMvdOwgS1o6uBHmiz91geQ0ylPP0aj1VUA==", + "engines": { + "node": ">=12" + } + }, + "node_modules/d3-transition": { + "version": "3.0.1", + "resolved": "https://artifacts.apple.com/artifactory/api/npm/npm-apple/d3-transition/-/d3-transition-3.0.1.tgz", + "integrity": 
"sha512-ApKvfjsSR6tg06xrL434C0WydLr7JewBB3V+/39RMHsaXTOG0zmt/OAXeng5M5LBm0ojmxJrpomQVZ1aPvBL4w==", + "license": "ISC", + "dependencies": { + "d3-color": "1 - 3", + "d3-dispatch": "1 - 3", + "d3-ease": "1 - 3", + "d3-interpolate": "1 - 3", + "d3-timer": "1 - 3" + }, + "engines": { + "node": ">=12" + }, + "peerDependencies": { + "d3-selection": "2 - 3" + } + }, + "node_modules/d3-zoom": { + "version": "3.0.0", + "resolved": "https://npm.apple.com/d3-zoom/-/d3-zoom-3.0.0.tgz", + "integrity": "sha512-b8AmV3kfQaqWAuacbPuNbL6vahnOJflOhexLzMMNLga62+/nh0JzvJ0aO/5a5MVgUFGS7Hu1P9P03o3fJkDCyw==", + "dependencies": { + "d3-dispatch": "1 - 3", + "d3-drag": "2 - 3", + "d3-interpolate": "1 - 3", + "d3-selection": "2 - 3", + "d3-transition": "2 - 3" + }, + "engines": { + "node": ">=12" + } + }, + "node_modules/delaunator": { + "version": "5.0.1", + "resolved": "https://npm.apple.com/delaunator/-/delaunator-5.0.1.tgz", + "integrity": "sha512-8nvh+XBe96aCESrGOqMp/84b13H9cdKbG5P2ejQCh4d4sK9RL4371qou9drQjMhvnPmhWl5hnmqbEE0fXr9Xnw==", + "dependencies": { + "robust-predicates": "^3.0.2" + } + }, + "node_modules/fsevents": { + "version": "2.3.2", + "resolved": "https://artifacts.apple.com/artifactory/api/npm/npm-apple/fsevents/-/fsevents-2.3.2.tgz", + "integrity": "sha512-xiqMQR4xAeHTuB9uWm+fFRcIOgKBMiOBP+eXiyT7jsgVCq1bkVygt00oASowB7EdtpOHaaPgKt812P9ab+DDKA==", + "dev": true, + "hasInstallScript": true, + "license": "MIT", + "optional": true, + "os": [ + "darwin" + ], + "engines": { + "node": "^8.16.0 || ^10.6.0 || >=11.0.0" + } + }, + "node_modules/iconv-lite": { + "version": "0.6.3", + "resolved": "https://npm.apple.com/iconv-lite/-/iconv-lite-0.6.3.tgz", + "integrity": "sha512-4fCk79wshMdzMp2rH06qWrJE4iolqLhCUH+OiuIgU++RB0+94NlDL81atO7GX55uUKueo0txHNtvEyI6D7WdMw==", + "dependencies": { + "safer-buffer": ">= 2.1.2 < 3.0.0" + }, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/internmap": { + "version": "2.0.3", + "resolved": 
"https://npm.apple.com/internmap/-/internmap-2.0.3.tgz", + "integrity": "sha512-5Hh7Y1wQbvY5ooGgPbDaL5iYLAPzMTUrjMulskHLH6wnv/A+1q5rgEaiuqEjB+oxGXIVZs1FF+R/KPN3ZSQYYg==", + "engines": { + "node": ">=12" + } + }, + "node_modules/playwright": { + "version": "1.56.1", + "resolved": "https://npm.apple.com/playwright/-/playwright-1.56.1.tgz", + "integrity": "sha512-aFi5B0WovBHTEvpM3DzXTUaeN6eN0qWnTkKx4NQaH4Wvcmc153PdaY2UBdSYKaGYw+UyWXSVyxDUg5DoPEttjw==", + "dev": true, + "dependencies": { + "playwright-core": "1.56.1" + }, + "bin": { + "playwright": "cli.js" + }, + "engines": { + "node": ">=18" + }, + "optionalDependencies": { + "fsevents": "2.3.2" + } + }, + "node_modules/playwright-core": { + "version": "1.56.1", + "resolved": "https://npm.apple.com/playwright-core/-/playwright-core-1.56.1.tgz", + "integrity": "sha512-hutraynyn31F+Bifme+Ps9Vq59hKuUCz7H1kDOcBs+2oGguKkWTU50bBWrtz34OUWmIwpBTWDxaRPXrIXkgvmQ==", + "dev": true, + "bin": { + "playwright-core": "cli.js" + }, + "engines": { + "node": ">=18" + } + }, "node_modules/prettier": { "version": "3.5.3", "resolved": "https://registry.npmjs.org/prettier/-/prettier-3.5.3.tgz", @@ -34,6 +492,21 @@ "funding": { "url": "https://github.com/prettier/prettier?sponsor=1" } + }, + "node_modules/robust-predicates": { + "version": "3.0.2", + "resolved": "https://npm.apple.com/robust-predicates/-/robust-predicates-3.0.2.tgz", + "integrity": "sha512-IXgzBWvWQwE6PrDI05OvmXUIruQTcoMDzRsOd5CDvHCVLcLHMTSYvOK5Cm46kWqlV3yAbuSpBZdJ5oP5OUoStg==" + }, + "node_modules/rw": { + "version": "1.3.3", + "resolved": "https://npm.apple.com/rw/-/rw-1.3.3.tgz", + "integrity": "sha512-PdhdWy89SiZogBLaw42zdeqtRJ//zFd2PgQavcICDUgJT5oW10QCRKbJ6bg4r0/UY2M6BWd5tkxuGFRvCkgfHQ==" + }, + "node_modules/safer-buffer": { + "version": "2.1.2", + "resolved": "https://npm.apple.com/safer-buffer/-/safer-buffer-2.1.2.tgz", + "integrity": "sha512-YZo3K82SD7Riyi0E1EQPojLz7kpepnSQI9IyPbHHg1XXXevb5dJI7tpyN2ADxGcQbHG7vcyRHk0cbwqcQriUtg==" } } } diff --git a/package.json 
b/package.json index 718434e2d..7313d360b 100644 --- a/package.json +++ b/package.json @@ -4,12 +4,16 @@ "scripts": { "prettify": "./scripts/prettify.sh", "copy:vendor": "bash scripts/copy-vendor.sh", - "postinstall": "npm run copy:vendor" + "postinstall": "npm run copy:vendor", + "site-check": "node scripts/site-check.js", + "site-visualize": "node scripts/site-visualize.js" }, "devDependencies": { - "prettier": "^3.5.3" + "prettier": "^3.5.3", + "playwright": "^1.49.1" }, "dependencies": { - "animejs": "^4.0.2" + "animejs": "^4.0.2", + "d3": "^7.9.0" } } diff --git a/scripts/site-check.js b/scripts/site-check.js new file mode 100644 index 000000000..2755e9d97 --- /dev/null +++ b/scripts/site-check.js @@ -0,0 +1,1056 @@ +#!/usr/bin/env node +//===----------------------------------------------------------------------===// +// +// This source file is part of the Swift.org open source project +// +// Copyright (c) 2025 Apple Inc. and the Swift.org project authors +// Licensed under Apache License v2.0 +// +// See LICENSE.txt for license information +// See CONTRIBUTORS.txt for the list of Swift.org project authors +// +// SPDX-License-Identifier: Apache-2.0 +// +//===----------------------------------------------------------------------===// + +/** + * Site Check Tool + * + * Crawls the local Swift.org development site to identify content issues: + * - Broken internal links + * - Broken images + * - Isolated pages (no incoming links) + * - Pages with no outgoing links + * + * The crawler attempts to load /sitemap.xml first to get a comprehensive list + * of all pages. If the sitemap is not available, it falls back to crawling + * from the homepage. 
+ * + * Usage: npm run site-check + * + * Environment Variables: + * - SITE_URL: Base URL to crawl (default: http://localhost:4000) + * - MAX_PAGES: Maximum number of pages to crawl (default: 1000) + * - CHECK_EXTERNAL: Whether to check external links (default: false) + * - CRAWL_DELAY: Delay in milliseconds between page requests (default: 50) + * Increase this if your dev server is struggling (e.g., 250, 500, 1000) + */ + +const { chromium } = require('playwright') +const fs = require('fs').promises + +// Configuration - can be overridden via environment variables +const CONFIG = { + baseUrl: process.env.SITE_URL || 'http://localhost:4000', // Target site URL + maxPages: parseInt(process.env.MAX_PAGES, 10) || 1000, // Limit crawl to prevent runaway + timeout: 10000, // Page load timeout in milliseconds + checkExternalLinks: process.env.CHECK_EXTERNAL === 'true', // Currently unused, reserved for future + concurrency: 3, // Currently unused, reserved for concurrent crawling + outputFile: 'site-check-report.json', // Where to save the detailed JSON report + delayBetweenRequests: parseInt(process.env.CRAWL_DELAY, 10) || 50, // Delay in ms between page requests (default 50ms) +} + +// State tracking - shared across all crawl operations +// All URLs are stored as URIs (e.g., '/blog/post') for consistency +// URIs always start with '/' and represent paths from the site root +// Homepage is always represented as '/' +// External links are stored as full URLs on each page +const state = { + visited: new Set(), // URIs already crawled + toVisit: new Set(), // URIs to crawl next + pages: new Map(), // uri -> { outgoingLinks: {header: [], footer: [], content: []}, images: [], incomingLinks: [], externalLinks: [] } + brokenLinks: [], // Links that returned 404 (URIs) + brokenImages: new Map(), // Map of broken image URI -> { pages: [], alt: string } + redirects: new Map(), // Map of source URI -> target URI for all redirects discovered + errors: [], // Pages that threw errors 
during crawl (URIs) + canonicalUrls: new Map(), // Map of any URI variant -> canonical URI (resolved after redirects) +} + +// Color codes for console output - ANSI escape sequences +const colors = { + reset: '\x1b[0m', + red: '\x1b[31m', + green: '\x1b[32m', + yellow: '\x1b[33m', + blue: '\x1b[34m', + cyan: '\x1b[36m', +} + +// Helper to log colored messages to console +function log(message, color = 'reset') { + console.log(`${colors[color]}${message}${colors.reset}`) +} + +// Helper to add delay between requests +function sleep(ms) { + return new Promise((resolve) => setTimeout(resolve, ms)) +} + +// Normalize hostname to treat localhost and 0.0.0.0 as equivalent +function normalizeHost(hostname) { + if (hostname === '0.0.0.0' || hostname === 'localhost') { + return 'localhost' + } + return hostname +} + +// Check if URL belongs to the same site being crawled (not external) +function isInternalUrl(url, baseUrl) { + try { + const urlObj = new URL(url, baseUrl) + const baseObj = new URL(baseUrl) + + return ( + normalizeHost(urlObj.hostname) === normalizeHost(baseObj.hostname) && + urlObj.port === baseObj.port + ) + } catch { + return false + } +} + +// Normalize URLs for consistent comparison - removes hash only +// Keep trailing slashes and extensions as-is to preserve server behavior +function normalizeUrl(url, baseUrl) { + try { + const urlObj = new URL(url, baseUrl) + // Only remove hash for consistency - keep everything else as-is + urlObj.hash = '' + return urlObj.href + } catch { + return null + } +} + +// Strip base URL from internal links - returns just the path/URI +function stripBaseUrl(url, baseUrl) { + try { + const urlObj = new URL(url) + const baseObj = new URL(baseUrl) + + // Only strip if it's the same host (with normalization) and same port + if ( + normalizeHost(urlObj.hostname) === normalizeHost(baseObj.hostname) && + urlObj.port === baseObj.port + ) { + return urlObj.pathname + urlObj.search + urlObj.hash + } + return url + } catch { + return 
url + } +} + +/** + * Convert a full URL to a URI (path + search) + * Store URLs as-is without transformation - canonicalization happens later + * @param {string} url - Full URL to convert + * @param {string} baseUrl - Base URL to strip + * @returns {string|null} URI representation, always '/' for homepage, always starts with '/' + */ +function toUri(url, baseUrl) { + const normalized = normalizeUrl(url, baseUrl) + if (!normalized) return null + + let uri = stripBaseUrl(normalized, baseUrl) + + // Ensure URI always starts with '/' for consistency + if (uri === '') return '/' + if (!uri.startsWith('/')) { + // This shouldn't happen, but handle it defensively + uri = '/' + uri + } + + // Return URI as-is without any transformation + // This preserves the original URL structure (trailing slashes, extensions, etc.) + return uri +} + +/** + * Convert a URI back to a full URL + * @param {string} uri - URI to convert (e.g., '/blog/post') + * @param {string} baseUrl - Base URL to prepend + * @returns {string} Full URL + */ +function uriToUrl(uri, baseUrl) { + if (uri === '/') { + return baseUrl + } + return new URL(uri, baseUrl).href +} + +/** + * Create initial page data structure + * @param {number|null} status - HTTP status code (optional) + * @returns {object} Page data object + */ +function createPageData(status = null) { + const data = { + outgoingLinks: { + header: [], + footer: [], + content: [], + }, + images: [], + incomingLinks: [], + externalLinks: [], // Full URLs to external sites + } + + if (status !== null) { + data.status = status + } + + return data +} + +/** + * Get list of pages that link to the given URI + * @param {string} uri - URI to get referrers for + * @returns {Array} Array of URIs that link to this page + */ +function getReferrers(uri) { + return state.pages.has(uri) ? 
state.pages.get(uri).incomingLinks : [] +} + +/** + * Check if a redirect occurred and track it + * @param {string} requestedUri - URI that was requested + * @param {string} responseUrl - Full URL from browser response + * @param {string} baseUrl - Base URL for conversion + * @returns {string|null} Final URI if redirect occurred, null otherwise + */ +function trackRedirect(requestedUri, responseUrl, baseUrl) { + const finalUri = toUri(responseUrl, baseUrl) + + // No redirect if same URI + if (finalUri === requestedUri) return null + + // Track this redirect in our map + state.redirects.set(requestedUri, finalUri) + log(` Redirect: ${requestedUri} -> ${finalUri}`, 'yellow') + + return finalUri +} + +// Extract all links and images from the current page using browser context +// Links are categorized into header, footer, and content buckets +async function extractLinksAndImages(page) { + return await page.evaluate(() => { + const images = [] + + // Helper to extract links from a container + const extractLinks = (container) => { + const links = [] + if (!container) return links + + container.querySelectorAll('a[href]').forEach((a) => { + const href = a.getAttribute('href') + if ( + href && + !href.startsWith('javascript:') && + !href.startsWith('mailto:') + ) { + links.push({ + href: a.href, // Use fully resolved URL from browser, not raw attribute + text: a.textContent.trim().substring(0, 100), // Truncate long link text + }) + } + }) + return links + } + + // Extract header/navigation links + const headerContainer = document.querySelector('header.site-navigation') + const headerLinks = extractLinks(headerContainer) + + // Extract footer links + const footerContainer = document.querySelector('footer.global-footer') + const footerLinks = extractLinks(footerContainer) + + // Extract content links (defensive: all links minus header and footer) + const allLinks = extractLinks(document) + const headerHrefs = new Set(headerLinks.map((l) => l.href)) + const footerHrefs = 
// Main crawl function: visits one page, records its links and images in the
// module-level `state`, and queues newly discovered internal URIs.
//
// Side effects:
//   - marks `uri` visited; populates state.pages, state.errors,
//     state.canonicalUrls, and state.brokenImages
//   - enqueues unvisited internal links into state.toVisit
async function crawlPage(browser, uri) {
  // Skip if already visited or the CONFIG.maxPages budget is exhausted.
  if (state.visited.has(uri) || state.visited.size >= CONFIG.maxPages) {
    return
  }

  // Mark as visited immediately to prevent duplicate crawls.
  state.visited.add(uri)
  log(`[${state.visited.size}/${CONFIG.maxPages}] Crawling: ${uri}`, 'cyan')

  // Convert URI to full URL for the browser.
  const fullUrl = uriToUrl(uri, CONFIG.baseUrl)

  const page = await browser.newPage()
  const failedImages = new Set() // URIs of images that failed to load

  // Listen for failed image requests - convert each to a URI.
  page.on('requestfailed', (request) => {
    if (request.resourceType() === 'image') {
      const imageUri = toUri(request.url(), CONFIG.baseUrl)
      if (imageUri) {
        failedImages.add(imageUri)
      }
    }
  })

  try {
    // Load the page with the full URL.
    const response = await page.goto(fullUrl, {
      waitUntil: 'domcontentloaded',
      timeout: CONFIG.timeout,
    })

    // Track pages that fail to load.
    if (!response || response.status() !== 200) {
      state.errors.push({
        url: uri,
        error: `HTTP ${response?.status() || 'unknown'}`,
        referrers: getReferrers(uri),
      })
      await page.close()
      return
    }

    // Detect and track redirects.
    const finalUri = trackRedirect(uri, response.url(), CONFIG.baseUrl)
    const actualUri = finalUri || uri // Use redirected URI if redirect occurred

    // If this page redirected, mark the target as visited so we do not
    // crawl it again, and drop it from the pending queue.
    if (finalUri) {
      state.visited.add(finalUri)
      state.toVisit.delete(finalUri)
    }

    // Store canonical URL mapping.
    state.canonicalUrls.set(uri, actualUri)
    if (finalUri) {
      // Also map the final URI to itself.
      state.canonicalUrls.set(finalUri, finalUri)
    }

    // Extract links and images.
    const { headerLinks, footerLinks, contentLinks, images } =
      await extractLinksAndImages(page)

    // Initialize page data for the actual URI (after redirect resolution).
    if (!state.pages.has(actualUri)) {
      state.pages.set(actualUri, createPageData(response.status()))
    }

    const pageData = state.pages.get(actualUri)

    // Process one category of links ('header' | 'footer' | 'content').
    const processLinks = (links, category) => {
      for (const link of links) {
        // link.href is a fully resolved URL from the browser.
        const linkFullUrl = normalizeUrl(link.href, CONFIG.baseUrl)
        if (!linkFullUrl) continue

        if (isInternalUrl(linkFullUrl, CONFIG.baseUrl)) {
          const linkUri = toUri(linkFullUrl, CONFIG.baseUrl)
          if (!linkUri) continue

          // Internal link - record under its category (as URI, not canonical yet).
          pageData.outgoingLinks[category].push(linkUri)

          // Track incoming links for orphan detection (source is actualUri).
          if (!state.pages.has(linkUri)) {
            state.pages.set(linkUri, createPageData())
          }
          state.pages.get(linkUri).incomingLinks.push(actualUri)

          // Queue for crawling if not yet visited.
          if (!state.visited.has(linkUri)) {
            state.toVisit.add(linkUri)
          }
        } else {
          // External link - store the full URL on this page.
          pageData.externalLinks.push(linkFullUrl)
        }
      }
    }

    // Process links from each category.
    processLinks(headerLinks, 'header')
    processLinks(footerLinks, 'footer')
    processLinks(contentLinks, 'content')

    // Process images and check against failed requests.
    for (const image of images) {
      // image.src is a fully resolved URL from the browser.
      const imageFullUrl = normalizeUrl(image.src, CONFIG.baseUrl)
      if (!imageFullUrl) continue

      const imageUri = toUri(imageFullUrl, CONFIG.baseUrl)
      if (!imageUri) continue

      pageData.images.push(imageUri)

      // Check whether this image failed to load (compare URIs).
      if (failedImages.has(imageUri)) {
        // FIX: attribute the broken image to the canonical (post-redirect)
        // page URI. generateReport matches brokenImages.pages against the
        // canonicalized page keys, so recording the pre-redirect `uri`
        // here made broken images on redirected pages unreportable.
        if (!state.brokenImages.has(imageUri)) {
          state.brokenImages.set(imageUri, {
            pages: [actualUri],
            alt: image.alt,
          })
        } else {
          state.brokenImages.get(imageUri).pages.push(actualUri)
        }
      }
    }
  } catch (error) {
    state.errors.push({
      url: uri,
      error: error.message,
      referrers: getReferrers(uri),
    })
    log(`  Error: ${error.message}`, 'red')
  } finally {
    await page.close()
  }
}

// Validate internal links that were discovered but never crawled (404 check).
// External links are NOT checked. Redirects discovered here are recorded too.
async function validateLinks(browser) {
  log('\nValidating internal links...', 'blue')

  const page = await browser.newPage()
  const allLinks = new Set()

  // Collect every unique internal link URI from all crawled pages,
  // across the three link categories.
  for (const data of state.pages.values()) {
    for (const category of ['header', 'footer', 'content']) {
      for (const link of data.outgoingLinks[category]) {
        allLinks.add(link)
      }
    }
  }

  // Check only the links that were not visited during the crawl.
  for (const linkUri of allLinks) {
    if (state.visited.has(linkUri)) continue

    try {
      log(`  Checking unvisited link: ${linkUri}`, 'yellow')

      // Convert URI to full URL for the request.
      const fullUrl = uriToUrl(linkUri, CONFIG.baseUrl)

      // page.request.get() is faster than page.goto(): it follows
      // redirects but never renders or parses the response body.
      const response = await page.request.get(fullUrl, {
        timeout: CONFIG.timeout,
        maxRedirects: 3, // Follow up to 3 redirects
      })

      if (!response) {
        state.errors.push({
          url: linkUri,
          error: 'No response received',
          referrers: getReferrers(linkUri),
        })
        continue
      }

      // Track a redirect, if one occurred, by checking the final URL.
      const finalUri = trackRedirect(linkUri, response.url(), CONFIG.baseUrl)
      const actualUri = finalUri || linkUri

      // Store canonical URL mapping.
      state.canonicalUrls.set(linkUri, actualUri)
      if (finalUri) {
        state.canonicalUrls.set(finalUri, finalUri)
      }

      if (response.status() === 404) {
        // Mark as broken.
        state.brokenLinks.push({
          url: linkUri,
          referrers: getReferrers(linkUri),
          status: 404,
        })
      } else if (!state.pages.has(actualUri)) {
        // Successful validation - ensure page data exists for the actual URI.
        state.pages.set(actualUri, createPageData(response.status()))
      }

      // Throttle between validation requests.
      if (CONFIG.delayBetweenRequests > 0) {
        await sleep(CONFIG.delayBetweenRequests)
      }
    } catch (error) {
      state.errors.push({
        url: linkUri,
        error: `Validation failed: ${error.message}`,
        referrers: getReferrers(linkUri),
      })
      log(`  Error checking ${linkUri}: ${error.message}`, 'red')
    }
  }

  await page.close()
}
// Find pages with no incoming links (orphaned/isolated pages).
// Returns an array of { url, outgoingLinks } where `outgoingLinks` is the
// total count across the header/footer/content categories.
function analyzeIsolatedPages() {
  const isolated = []

  for (const [uri, data] of state.pages) {
    // The homepage is reachable by definition; skip it.
    if (uri === '/') continue

    // Ignore self-references when deciding isolation.
    const incomingLinks = data.incomingLinks.filter((link) => link !== uri)
    if (incomingLinks.length === 0) {
      const { header, footer, content } = data.outgoingLinks
      isolated.push({
        url: uri,
        outgoingLinks: header.length + footer.length + content.length,
      })
    }
  }

  return isolated
}

/**
 * Canonicalize every URI in the state to its final destination.
 *
 * Resolves redirect chains (with loop detection), rewrites all link
 * references to their canonical form, merges data from redirecting
 * pages into their targets, and deduplicates the resulting arrays.
 * Replaces state.pages with the canonicalized map.
 */
function canonicalizeUrls() {
  log('\nCanonicalizing URLs...', 'blue')

  // Follow the redirect chain for one URI until it ends or loops.
  const resolveUri = (uri) => {
    const seen = new Set()
    let current = uri

    while (state.redirects.has(current)) {
      if (seen.has(current)) {
        log(`  Warning: Redirect loop detected for ${uri}`, 'yellow')
        break
      }
      seen.add(current)
      current = state.redirects.get(current)
    }

    return current
  }

  // Resolve every known mapping through the full redirect chain.
  for (const uri of state.canonicalUrls.keys()) {
    state.canonicalUrls.set(uri, resolveUri(uri))
  }

  // Ensure every crawled page has a canonical entry too.
  for (const uri of state.pages.keys()) {
    if (!state.canonicalUrls.has(uri)) {
      state.canonicalUrls.set(uri, resolveUri(uri))
    }
  }

  // Map one link to its canonical form (identity when unknown).
  const toCanonical = (link) => state.canonicalUrls.get(link) || link

  // Rebuild the pages map keyed by canonical URI, merging redirect
  // sources into their targets.
  const newPages = new Map()

  for (const [uri, data] of state.pages) {
    const canonicalUri = toCanonical(uri)

    // Canonicalize outgoing and incoming links.
    const canonicalOutgoing = {
      header: data.outgoingLinks.header.map(toCanonical),
      footer: data.outgoingLinks.footer.map(toCanonical),
      content: data.outgoingLinks.content.map(toCanonical),
    }
    const canonicalIncoming = data.incomingLinks.map(toCanonical)

    // Get or create the target page data.
    if (!newPages.has(canonicalUri)) {
      newPages.set(canonicalUri, {
        outgoingLinks: { header: [], footer: [], content: [] },
        images: [],
        incomingLinks: [],
        externalLinks: [],
        status: data.status,
      })
    }

    const targetData = newPages.get(canonicalUri)

    // Merge unconditionally: this handles both a redirecting page AND
    // the case where its redirect target was crawled separately.
    targetData.outgoingLinks.header.push(...canonicalOutgoing.header)
    targetData.outgoingLinks.footer.push(...canonicalOutgoing.footer)
    targetData.outgoingLinks.content.push(...canonicalOutgoing.content)
    targetData.incomingLinks.push(...canonicalIncoming)
    targetData.images.push(...data.images)
    targetData.externalLinks.push(...data.externalLinks)

    // The canonical page's own status wins over a redirect source's.
    if (canonicalUri === uri) {
      targetData.status = data.status
    }
  }

  // Deduplicate all accumulated arrays.
  for (const data of newPages.values()) {
    data.outgoingLinks.header = [...new Set(data.outgoingLinks.header)]
    data.outgoingLinks.footer = [...new Set(data.outgoingLinks.footer)]
    data.outgoingLinks.content = [...new Set(data.outgoingLinks.content)]
    data.incomingLinks = [...new Set(data.incomingLinks)]
    data.externalLinks = [...new Set(data.externalLinks)]
    data.images = [...new Set(data.images)]
  }

  // Replace state.pages with the canonicalized version.
  state.pages = newPages

  log(`  Canonicalized ${state.pages.size} pages`, 'green')
}
// Load and parse sitemap.xml to seed the crawl queue.
// Returns an array of internal URIs, or null when the sitemap is missing,
// empty, or unreadable (callers fall back to crawling from the homepage).
async function loadSitemap(browser) {
  const sitemapUrl = `${CONFIG.baseUrl}/sitemap.xml`
  log(`Attempting to load sitemap from: ${sitemapUrl}`, 'blue')

  const page = await browser.newPage()
  const uris = []

  try {
    const response = await page.goto(sitemapUrl, {
      waitUntil: 'domcontentloaded',
      timeout: CONFIG.timeout,
    })

    if (!response || response.status() !== 200) {
      log(
        `  Sitemap not found (HTTP ${response?.status() || 'unknown'})`,
        'yellow',
      )
      return null
    }

    // Get the sitemap content as rendered by the browser.
    const content = await page.content()

    // Extract <loc> entries with a regex; tolerates an optional CDATA
    // wrapper and surrounding whitespace inside the tag.
    // NOTE(review): pattern reconstructed from a garbled source — the
    // capture group is required (match[1] below); confirm against a
    // real sitemap.xml before relying on CDATA handling.
    const locRegex = /<loc>\s*(?:<!\[CDATA\[)?\s*(.*?)\s*(?:\]\]>)?\s*<\/loc>/gi
    let match

    while ((match = locRegex.exec(content)) !== null) {
      const url = match[1].trim()
      if (url && isInternalUrl(url, CONFIG.baseUrl)) {
        const uri = toUri(url, CONFIG.baseUrl)
        if (uri) {
          uris.push(uri)
        }
      }
    }

    if (uris.length === 0) {
      log(`  Sitemap found but contains no valid URLs`, 'yellow')
      return null
    }

    log(`  Found ${uris.length} URLs in sitemap`, 'green')
    return uris
  } catch (error) {
    log(`  Error loading sitemap: ${error.message}`, 'yellow')
    return null
  } finally {
    await page.close()
  }
}

// Generate the JSON report with all findings.
// Shape: { timestamp, config, summary, pages } — see
// scripts/site-report-format.md for the full schema.
function generateReport() {
  const isolated = analyzeIsolatedPages()

  // Total internal links across all pages and categories.
  const totalLinks = Array.from(state.pages.values()).reduce(
    (sum, p) =>
      sum +
      p.outgoingLinks.header.length +
      p.outgoingLinks.footer.length +
      p.outgoingLinks.content.length,
    0,
  )

  // Fast lookup for the per-page isolation flag.
  const isolatedSet = new Set(isolated.map((p) => p.url))

  // Flat redirect list, used only for the summary count.
  const redirectList = Array.from(state.redirects.entries()).map(
    ([from, to]) => ({
      from,
      to,
      status: 301, // exact code is not tracked; most are 301 or 302
    }),
  )

  // Build the page-centric report structure.
  const pages = {}
  for (const [uri, data] of state.pages.entries()) {
    // Any load/validation error recorded for this page.
    const error = state.errors.find((e) => e.url === uri)

    // 404 links that this page points at (any category).
    const brokenLinksFromThisPage = state.brokenLinks
      .filter(
        (bl) =>
          data.outgoingLinks.header.includes(bl.url) ||
          data.outgoingLinks.footer.includes(bl.url) ||
          data.outgoingLinks.content.includes(bl.url),
      )
      .map((bl) => ({
        url: bl.url, // Already a URI
        status: bl.status,
      }))

    // Images on this page that failed to load.
    const brokenImagesOnPage = Array.from(state.brokenImages.entries())
      .filter(([, imageData]) => imageData.pages.includes(uri))
      .map(([imageUri, imageData]) => ({
        url: imageUri, // Already a URI
        alt: imageData.alt,
      }))

    // All URIs that redirect to this page.
    const redirectsToHere = Array.from(state.redirects.entries())
      .filter(([, target]) => target === uri)
      .map(([source]) => source)

    pages[uri] = {
      incomingLinks: [...new Set(data.incomingLinks)], // deduplicated URIs
      isIsolated: isolatedSet.has(uri),
      outgoingLinks: {
        header: data.outgoingLinks.header, // Already canonical URIs
        footer: data.outgoingLinks.footer, // Already canonical URIs
        content: data.outgoingLinks.content, // Already canonical URIs
      },
      externalLinks: [...new Set(data.externalLinks)], // Full URLs, deduplicated
      imagesCount: data.images.length,
      issues: {
        redirect: redirectsToHere.length > 0 ? { from: redirectsToHere } : null,
        error: error ? error.error : null,
        brokenLinks: brokenLinksFromThisPage,
        brokenImages: brokenImagesOnPage,
      },
    }
  }

  // Count unique external domains across all pages.
  const allExternalDomains = new Set()
  for (const pageData of state.pages.values()) {
    for (const externalUrl of pageData.externalLinks) {
      try {
        allExternalDomains.add(new URL(externalUrl).hostname)
      } catch {
        // Invalid URL - skip (optional catch binding; nothing to report)
      }
    }
  }

  return {
    timestamp: new Date().toISOString(),
    config: CONFIG,
    summary: {
      totalPages: state.visited.size,
      totalLinks: totalLinks,
      brokenLinks: state.brokenLinks.length,
      brokenImages: state.brokenImages.size,
      redirects: redirectList.length,
      isolatedPages: isolated.length,
      errors: state.errors.length,
      externalDomains: allExternalDomains.size,
    },
    pages: pages,
  }
}
// Print a colorized summary of the report to the console.
// Long lists (broken images, redirects) are truncated to 10 entries;
// the full detail lives in the JSON report file.
function printSummary(report) {
  log('\n' + '='.repeat(60), 'cyan')
  log('Site Check Summary', 'cyan')
  log('='.repeat(60), 'cyan')

  log(`\nPages Crawled: ${report.summary.totalPages}`, 'blue')
  log(`Total Links: ${report.summary.totalLinks}`, 'blue')

  // Flatten the page-centric report into per-issue lists.
  const brokenLinks = []
  const brokenImages = []
  const redirects = []
  const isolatedPages = []

  for (const [pageUrl, pageData] of Object.entries(report.pages)) {
    for (const bl of pageData.issues.brokenLinks) {
      brokenLinks.push({
        url: bl.url,
        referrer: pageUrl,
      })
    }
    for (const bi of pageData.issues.brokenImages) {
      brokenImages.push({
        url: bi.url,
        alt: bi.alt,
        page: pageUrl,
      })
    }
    if (pageData.issues.redirect) {
      // FIX: issues.redirect is { from: [sourceUri, ...] } (set in
      // generateReport) and the page key is the redirect TARGET. The old
      // code read a nonexistent `.to` property and printed
      // "target -> undefined"; emit one source -> target entry instead.
      for (const source of pageData.issues.redirect.from) {
        redirects.push({
          from: source,
          to: pageUrl,
        })
      }
    }
    if (pageData.isIsolated) {
      isolatedPages.push(pageUrl)
    }
    if (pageData.issues.error) {
      // Surface error pages in the broken-links report, one entry per
      // referrer — or a single "(no referrers)" entry so the page is
      // still reported when nothing links to it.
      if (pageData.incomingLinks.length > 0) {
        for (const referrer of pageData.incomingLinks) {
          brokenLinks.push({
            url: pageUrl,
            referrer: referrer,
            error: pageData.issues.error,
          })
        }
      } else {
        brokenLinks.push({
          url: pageUrl,
          referrer: '(no referrers)',
          error: pageData.issues.error,
        })
      }
    }
  }

  if (brokenLinks.length > 0) {
    log(`\n❌ Broken Links: ${brokenLinks.length}`, 'red')
    brokenLinks.forEach((link) => {
      log(`  ${link.url}`, 'red')
      if (link.error) {
        log(`    Error: ${link.error}`, 'red')
      }
      log(`    Referenced by: ${link.referrer}`, 'yellow')
    })
  } else {
    log(`\n✅ No broken links found`, 'green')
  }

  if (brokenImages.length > 0) {
    log(`\n❌ Broken Images: ${brokenImages.length}`, 'red')
    brokenImages.slice(0, 10).forEach((img) => {
      log(`  ${img.url}`, 'red')
      log(`    Alt text: ${img.alt}`, 'yellow')
      log(`    Found on: ${img.page}`, 'yellow')
    })
    if (brokenImages.length > 10) {
      log(
        `  ... and ${brokenImages.length - 10} more (see report file)`,
        'yellow',
      )
    }
  } else {
    log(`\n✅ No broken images found`, 'green')
  }

  if (redirects.length > 0) {
    log(`\n⚠️  Redirects: ${redirects.length}`, 'yellow')
    log('  (Pages that redirect to another URL)', 'yellow')
    redirects.slice(0, 10).forEach((redirect) => {
      log(`  ${redirect.from}`, 'yellow')
      log(`    -> ${redirect.to}`, 'cyan')
    })
    if (redirects.length > 10) {
      log(`  ... and ${redirects.length - 10} more (see report file)`, 'yellow')
    }
  } else {
    log(`\n✅ No redirects found`, 'green')
  }

  if (isolatedPages.length > 0) {
    log(`\n⚠️  Isolated Pages: ${isolatedPages.length}`, 'yellow')
    log('  (Pages with no incoming links from the site)', 'yellow')
    isolatedPages.forEach((page) => {
      log(`  ${page}`, 'yellow')
    })
  } else {
    log(`\n✅ No isolated pages found`, 'green')
  }

  if (report.summary.externalDomains > 0) {
    log(
      `\n🔗 External Domains Linked: ${report.summary.externalDomains}`,
      'blue',
    )
    log('  (See individual pages in report for details)', 'blue')
  }

  log(`\n📄 Full report saved to: ${CONFIG.outputFile}`, 'green')
  log('='.repeat(60) + '\n', 'cyan')
}

// Main entry point - orchestrates sitemap load, crawl, validation,
// canonicalization, and reporting. Exits non-zero when issues are found
// so CI can fail the build.
async function main() {
  log('Starting Swift.org Site Check Tool', 'blue')
  log(`Base URL: ${CONFIG.baseUrl}`, 'blue')
  log(`Max Pages: ${CONFIG.maxPages}\n`, 'blue')

  // Launch a headless browser via Playwright.
  const browser = await chromium.launch({ headless: true })

  try {
    // Prefer sitemap.xml as the seed list; fall back to the homepage.
    const sitemapUris = await loadSitemap(browser)

    if (sitemapUris && sitemapUris.length > 0) {
      log(`Using sitemap URLs as crawl queue`, 'green')
      for (const uri of sitemapUris) {
        state.toVisit.add(uri)
      }
    } else {
      log(`⚠️  Sitemap not available, starting from homepage only`, 'yellow')
      log(
        `   This may miss pages not linked from the site navigation\n`,
        'yellow',
      )
      state.toVisit.add('/') // Always use '/' for homepage
    }

    log('') // Empty line before crawl starts

    // Drain the queue until it is empty or CONFIG.maxPages is reached.
    while (state.toVisit.size > 0 && state.visited.size < CONFIG.maxPages) {
      // Get and remove the first URI from the Set.
      const uri = state.toVisit.values().next().value
      state.toVisit.delete(uri)
      await crawlPage(browser, uri)

      // Throttle between requests to avoid overwhelming the server.
      if (state.toVisit.size > 0 && CONFIG.delayBetweenRequests > 0) {
        await sleep(CONFIG.delayBetweenRequests)
      }
    }

    // Validate all discovered links that weren't crawled.
    await validateLinks(browser)

    // Resolve every URI to its post-redirect canonical form.
    canonicalizeUrls()

    // Generate the report and save it to CONFIG.outputFile.
    const report = generateReport()
    await fs.writeFile(
      CONFIG.outputFile,
      JSON.stringify(report, null, 2),
      'utf8',
    )

    printSummary(report)

    // Exit with an error code if issues were found (for CI integration).
    const hasIssues =
      report.summary.brokenLinks > 0 ||
      report.summary.brokenImages > 0 ||
      report.summary.errors > 0

    process.exit(hasIssues ? 1 : 0)
  } catch (error) {
    log(`Fatal error: ${error.message}`, 'red')
    console.error(error)
    process.exit(1)
  } finally {
    await browser.close()
  }
}

// Run if called directly (not imported as a module).
if (require.main === module) {
  main().catch((error) => {
    console.error(error)
    process.exit(1)
  })
}

module.exports = { main }
+ +## Overview + +The report is generated by crawling a website (typically a local development instance) and analyzing: + +- Page structure and link topology +- Internal and external links +- Image resources +- Content issues (broken links, errors, redirects) +- Page isolation and connectivity + +## Top-Level Structure + +```json +{ + "timestamp": "2025-11-11T00:40:59.540Z", + "config": { + /* Configuration object */ + }, + "summary": { + /* Summary statistics */ + }, + "pages": { + /* Page-by-page details */ + } +} +``` + +### Fields + +| Field | Type | Description | +| ----------- | ------ | ------------------------------------------------ | +| `timestamp` | string | ISO 8601 timestamp when the report was generated | +| `config` | object | Configuration settings used for the crawl | +| `summary` | object | High-level statistics about the crawl results | +| `pages` | object | Detailed information for each discovered page | + +## Configuration Object + +Records the settings used during the crawl. + +```json +{ + "baseUrl": "http://localhost:4000", + "maxPages": 1000, + "timeout": 30000, + "checkExternalLinks": false, + "concurrency": 3, + "outputFile": "site-check-report.json", + "delayBetweenRequests": 50 +} +``` + +### Fields + +| Field | Type | Description | +| ---------------------- | ------- | -------------------------------------------------------- | +| `baseUrl` | string | Base URL that was crawled | +| `maxPages` | number | Maximum number of pages to crawl (hard limit) | +| `timeout` | number | Page load timeout in milliseconds | +| `checkExternalLinks` | boolean | Whether external links were validated (currently unused) | +| `concurrency` | number | Planned concurrent crawl workers (currently unused) | +| `outputFile` | string | Path where the report was saved | +| `delayBetweenRequests` | number | Delay in milliseconds between page requests | + +## Summary Object + +Provides aggregate statistics across the entire crawl. 
+ +```json +{ + "totalPages": 441, + "totalLinks": 15070, + "brokenLinks": 0, + "brokenImages": 0, + "redirects": 0, + "isolatedPages": 116, + "errors": 19, + "externalDomains": 216 +} +``` + +### Fields + +| Field | Type | Description | +| ----------------- | ------ | -------------------------------------------------------------------------- | +| `totalPages` | number | Number of pages successfully visited | +| `totalLinks` | number | Total count of internal links across all pages (header + footer + content) | +| `brokenLinks` | number | Count of internal links that returned 404 or failed validation | +| `brokenImages` | number | Count of unique broken images found | +| `redirects` | number | Count of pages that redirect (excludes trailing slash redirects) | +| `isolatedPages` | number | Count of pages with no incoming links (orphaned pages) | +| `errors` | number | Count of pages that failed to load or threw errors | +| `externalDomains` | number | Count of unique external domains linked from the site | + +## Pages Object + +A map of page URIs to detailed page information. Each key is a URI (e.g., `/`, `/blog/post`, `/documentation`). 
+ +### URI Format + +- **Internal pages**: Stored as URIs (path + search + hash) + - Homepage: `/` + - Regular pages: `/blog/post`, `/documentation/guide` + - With query params: `/search?q=swift` + - With hash: `/docs/api#section` +- **External links**: Stored as full URLs in the `externalLinks` array + - Example: `https://github.com/apple/swift` + +### Page Object Structure + +```json +"/": { + "incomingLinks": ["/about", "/blog", "/documentation"], + "isIsolated": false, + "outgoingLinks": { + "header": ["/documentation", "/community"], + "footer": ["/", "/legal/license.html"], + "content": ["/get-started/cloud-services"] + }, + "externalLinks": [ + "https://github.com/swiftlang/sourcekit-lsp", + "https://developer.apple.com/xcode" + ], + "imagesCount": 12, + "issues": { + "redirect": null, + "error": null, + "brokenLinks": [], + "brokenImages": [] + } +} +``` + +### Page Fields + +| Field | Type | Description | +| --------------- | ------------- | ------------------------------------------------------------------ | +| `incomingLinks` | array[string] | List of URIs that link to this page (deduplicated) | +| `isIsolated` | boolean | `true` if page has no incoming links (orphaned), `false` otherwise | +| `outgoingLinks` | object | Categorized outgoing internal links (see below) | +| `externalLinks` | array[string] | Full URLs to external domains (deduplicated) | +| `imagesCount` | number | Total number of images on this page | +| `issues` | object | Content and structural issues (see below) | + +### Outgoing Links Object + +Links are categorized by their location in the page structure: + +```json +{ + "header": ["/", "/documentation", "/community"], + "footer": ["/legal/license.html", "/privacy"], + "content": ["/blog/post", "/getting-started"] +} +``` + +| Field | Type | Description | +| --------- | ------------- | ------------------------------------------------------------- | +| `header` | array[string] | URIs of links found in `