Last active
June 16, 2025 22:15
-
-
Save bquast/9f28870597a5dbf23e4d4557bb062503 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
const { Readability } = require('@mozilla/readability'); | |
const jsdom = require('jsdom'); | |
const fs = require('fs'); | |
const path = require('path'); | |
const os = require('os'); | |
// Lazy wrapper around node-fetch: the package is loaded with a dynamic
// import() on each call and its default export is invoked with the caller's
// arguments. NOTE(review): presumably needed because node-fetch v3+ is
// ESM-only while this script uses require() — confirm against package.json.
const fetch = async (...args) => {
  const { default: nodeFetch } = await import('node-fetch');
  return nodeFetch(...args);
};
/**
 * Download one image and store it in `folder` as `image-<idx><ext>`.
 *
 * The extension is taken from the URL path; when the path has no extension,
 * or the extension contains anything other than word characters, `.jpg` is
 * used instead. Any failure (unparsable URL, network error, non-2xx response,
 * write error) is reported on stderr and turned into a `null` result so the
 * caller can drop the image and carry on.
 *
 * @param {string} imgUrl - Absolute URL of the image.
 * @param {string} folder - Existing directory that receives the file.
 * @param {number|string} idx - Sequence number used in the file name.
 * @returns {Promise<string|null>} The file name (not the full path), or
 *   `null` when the image could not be saved.
 */
async function downloadImage(imgUrl, folder, idx) {
  try {
    const { pathname } = new URL(imgUrl);
    // URL.pathname never includes the query string, so extname() is enough.
    const rawExt = path.extname(pathname);
    const ext = /^\.\w+$/.test(rawExt) ? rawExt : '.jpg';
    const fileName = `image-${idx}${ext}`;

    const res = await fetch(imgUrl);
    if (!res.ok) {
      throw new Error(`Failed to download image: ${imgUrl} (HTTP ${res.status})`);
    }

    const bytes = Buffer.from(await res.arrayBuffer());
    await fs.promises.writeFile(path.join(folder, fileName), bytes);
    return fileName;
  } catch (err) {
    // Best effort: a missing image must not abort the whole extraction, but
    // the reason is logged (stderr keeps stdout clean for the result path).
    console.warn(`Skipping image ${imgUrl}: ${err.message}`);
    return null;
  }
}
/**
 * Script entry point.
 *
 * CLI: node readability-extract.js URL [LIBRARY_PATH]
 *
 * Fetches URL, extracts the article with Readability, downloads the
 * article's images next to the output file, rewrites the <img> sources to
 * those local copies and writes a standalone HTML file to
 * LIBRARY_PATH/<author>/<date>-<title-slug>.html (LIBRARY_PATH defaults to
 * ~/Articles). The path of the written file is printed on stdout; all
 * diagnostics go to stderr and failures exit with status 1.
 */
(async () => {
  // Escape text for safe interpolation into HTML text and attribute values.
  const escapeHtml = (text) =>
    String(text)
      .replace(/&/g, '&amp;')
      .replace(/</g, '&lt;')
      .replace(/>/g, '&gt;')
      .replace(/"/g, '&quot;')
      .replace(/'/g, '&#39;');

  // Keep only word characters, hyphens and spaces so the value is safe as a path segment.
  const stripUnsafe = (text) => String(text).replace(/[^\w\- ]/g, '');

  // Ad/logo/banner/tracking-pixel hints must be whole tokens in the URL: a
  // bare substring test for "ad" also hits "upload", "download", "header", ...
  const AD_HINT = /(?:^|[^a-z0-9])(?:ads?|advert(?:s|isement)?|logos?|banners?|pixel)(?:[^a-z0-9]|$)/i;

  const url = process.argv[2];
  const destFolder = process.argv[3] || path.join(os.homedir(), 'Articles');
  if (!url) {
    console.error('Usage: node readability-extract.js URL [LIBRARY_PATH]');
    process.exit(1);
  }

  const res = await fetch(url);
  if (!res.ok) {
    // Without this check an HTTP error page would be parsed as the article.
    console.error(`Failed to fetch ${url}: HTTP ${res.status}`);
    process.exit(1);
  }
  const html = await res.text();
  const dom = new jsdom.JSDOM(html, { url });
  const article = new Readability(dom.window.document).parse();
  if (!article) {
    console.error('Failed to extract article');
    process.exit(1);
  }

  // Title and byline may be missing, or may sanitize down to nothing (e.g.
  // purely non-ASCII text), so every derived path segment has a fallback.
  const title = article.title || 'Untitled';
  const author = stripUnsafe(article.byline || '').replace(/\s+/g, '') || 'UnknownAuthor';
  // Date of extraction (UTC, YYYY-MM-DD), not the article's publication date.
  const date = new Date().toISOString().slice(0, 10);
  const safeTitle = stripUnsafe(title).replace(/\s+/g, '-').toLowerCase() || 'untitled';

  const authorFolder = path.join(destFolder, author);
  fs.mkdirSync(authorFolder, { recursive: true });
  const imagesDirName = `${date}-${safeTitle}_images`;
  const imagesFolder = path.join(authorFolder, imagesDirName);
  fs.mkdirSync(imagesFolder, { recursive: true });

  // Load article HTML into JSDOM for filtering and image processing
  const contentDom = new jsdom.JSDOM(article.content);
  const imgEls = contentDom.window.document.querySelectorAll('img');
  let idx = 1;
  for (const img of imgEls) {
    const src = img.getAttribute('src');
    if (!src || AD_HINT.test(src)) {
      img.remove();
      continue;
    }

    // Filter tiny images (many ads/tracking pixels are <50px). A dimension
    // that is absent or not numeric is treated as unknown and kept.
    const width = Number(img.width || img.getAttribute('width') || 0);
    const height = Number(img.height || img.getAttribute('height') || 0);
    if ((width && width < 50) || (height && height < 50)) {
      img.remove();
      continue;
    }

    // Resolve relative URLs against the article URL
    let absUrl;
    try {
      absUrl = new URL(src, url).toString();
    } catch (e) {
      img.remove();
      continue;
    }

    // Sequential on purpose: idx gives each image a stable, ordered file name.
    const localName = await downloadImage(absUrl, imagesFolder, idx++);
    if (localName) {
      img.setAttribute('src', `./${imagesDirName}/${localName}`);
    } else {
      img.remove(); // remove if download failed
    }
  }

  // Article body with image sources rewritten to the local copies
  const contentHtml = contentDom.window.document.body.innerHTML;
  const file = path.join(authorFolder, `${date}-${safeTitle}.html`);
  const fullHtml = `
<!doctype html>
<html>
<head>
<meta charset="utf-8">
<title>${escapeHtml(title)}</title>
<meta name="author" content="${escapeHtml(author)}">
</head>
<body>
<h1>${escapeHtml(title)}</h1>
<h2>By ${escapeHtml(author)}</h2>
<article>
${contentHtml}
</article>
</body>
</html>
`;
  fs.writeFileSync(file, fullHtml, 'utf8');
  console.log(file);
})().catch((err) => {
  // Network, parsing and filesystem errors end up here instead of becoming
  // an unhandled promise rejection.
  console.error(`Extraction failed: ${err.message}`);
  process.exit(1);
});
Move to the article directory (replace UnknownAuthor with the author folder):
cd ~/Articles/UnknownAuthor
Convert using pandoc:
pandoc "2025-06-23-title-of-article.html" -o "title-of-article.epub"
The output name title-of-article.epub is just an example.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Install:
npm install @mozilla/readability jsdom node-fetch
Usage:
node readability-extract.js URL [LIBRARY_PATH]