-
-
Notifications
You must be signed in to change notification settings - Fork 1.4k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
2 changed files
with
151 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
/ | ||
/changelog/ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,149 @@ | ||
/* eslint-disable no-console */ | ||
|
||
import {readFileSync} from 'fs'; | ||
import path, {dirname} from 'path'; | ||
import {fileURLToPath} from 'url'; | ||
|
||
const baseURL = 'http://localhost:3000/'; | ||
type Link = {href: string; innerText: string}; | ||
|
||
const trimSlashes = (s: string) => s.replace(/(^\/|\/$)/g, ''); | ||
|
||
// @ts-ignore | ||
const ignoreListFile = path.join(dirname(import.meta.url), './ignore-list.txt'); | ||
|
||
// Paths to skip | ||
const ignoreList: string[] = readFileSync(fileURLToPath(ignoreListFile), 'utf8') | ||
.split('\n') | ||
.map(trimSlashes) | ||
.filter(Boolean); | ||
|
||
async function fetchWithFollow(url: URL | string): Promise<Response> { | ||
const r = await fetch(url); | ||
if (r.status >= 300 && r.status < 400 && r.headers.has('location')) { | ||
return fetchWithFollow(r.headers.get('location')!); | ||
} | ||
return r; | ||
} | ||
|
||
async function main() { | ||
const sitemap = await fetch(`${baseURL}sitemap.xml`).then(r => r.text()); | ||
|
||
const slugs = [...sitemap.matchAll(/<loc>([^<]*)<\/loc>/g)] | ||
.map(l => l[1]) | ||
.map(url => trimSlashes(new URL(url).pathname)) | ||
.filter(Boolean); | ||
const allSlugsSet = new Set(slugs); | ||
|
||
console.log('Checking 404s on %d pages', slugs.length); | ||
|
||
const all404s: {page404s: Link[]; slug: string}[] = []; | ||
|
||
// check if the slug equivalent of the href is in the sitemap | ||
const isInSitemap = (href: string) => { | ||
// remove hash | ||
const pathnameSlug = trimSlashes(href.replace(/#.*$/, '')); | ||
|
||
// some #hash links result in empty slugs when stripped | ||
return pathnameSlug === '' || allSlugsSet.has(pathnameSlug); | ||
}; | ||
|
||
function shoudlSkipLink(href: string) { | ||
const isExternal = (href_: string) => | ||
href_.startsWith('http') || href_.startsWith('mailto:'); | ||
const isLocalhost = (href_: string) => | ||
href_.startsWith('http') && new URL(href_).hostname === 'localhost'; | ||
const isIp = (href_: string) => /(\d{1,3}\.){3}\d{1,3}/.test(href_); | ||
const isImage = (href_: string) => /\.(png|jpg|jpeg|gif|svg|webp)$/.test(href_); | ||
|
||
return [ | ||
isExternal, | ||
(s = '') => ignoreList.includes(trimSlashes(s)), | ||
isImage, | ||
isLocalhost, | ||
isIp, | ||
].some(fn => fn(href)); | ||
} | ||
|
||
async function is404(link: Link, pageUrl: URL): Promise<boolean> { | ||
if (shoudlSkipLink(link.href)) { | ||
return false; | ||
} | ||
|
||
const fullPath = link.href.startsWith('/') | ||
? trimSlashes(link.href) | ||
: // relative path | ||
trimSlashes(new URL(pageUrl.pathname + '/' + link.href, baseURL).pathname); | ||
|
||
if (isInSitemap(fullPath)) { | ||
return false; | ||
} | ||
const fullUrl = new URL(fullPath, baseURL); | ||
const resp = await fetchWithFollow(fullUrl); | ||
if (resp.status === 404) { | ||
return true; | ||
} | ||
return false; | ||
} | ||
|
||
for (const slug of slugs) { | ||
const pageUrl = new URL(slug, baseURL); | ||
const now = performance.now(); | ||
const html = await fetchWithFollow(pageUrl.href).then(r => r.text()); | ||
|
||
const linkRegex = /<a[^>]*href="([^"]*)"[^>]*>([^<]*)<\/a>/g; | ||
const links = Array.from(html.matchAll(linkRegex)).map(m => { | ||
const [, href, innerText] = m; | ||
return {href, innerText}; | ||
}); | ||
const page404s = ( | ||
await Promise.all( | ||
links.map(async link => { | ||
const is404_ = await is404(link, pageUrl); | ||
return [link, is404_] as [Link, boolean]; | ||
}) | ||
) | ||
) | ||
.filter(([_, is404_]) => is404_) | ||
.map(([link]) => link); | ||
|
||
if (page404s.length) { | ||
all404s.push({slug, page404s}); | ||
} | ||
|
||
console.log( | ||
page404s.length ? '❌' : '✅', | ||
`in ${(performance.now() - now).toFixed(1).padStart(4, '0')} ms | ${slug}` | ||
); | ||
} | ||
|
||
if (all404s.length === 0) { | ||
console.log('\n\n🎉 No 404s found'); | ||
return false; | ||
} | ||
console.log( | ||
'\n❌ Found %d 404s across %d %s', | ||
all404s.map(x => x.page404s.length).reduce((a, b) => a + b, 0), | ||
all404s.length, | ||
all404s.length === 1 ? 'page' : 'pages' | ||
); | ||
for (const {slug, page404s} of all404s) { | ||
console.log('\n🌐', baseURL + slug); | ||
for (const link of page404s) { | ||
console.log(` - [${link.innerText}](${link.href})`); | ||
} | ||
} | ||
|
||
console.log( | ||
'\n👉 Note: the markdown syntax is not necessarily on the source files, but the links do exist on the final pages' | ||
); | ||
// signal error | ||
return true; | ||
} | ||
const now = performance.now(); | ||
main().then(has404s => { | ||
console.log(`\n Done in ${(performance.now() - now).toFixed(1)} ms`); | ||
process.exit(has404s ? 1 : 0); | ||
}); | ||
|
||
export {}; |