Skip to content

Instantly share code, notes, and snippets.

@bquast
Last active June 16, 2025 22:15
Show Gist options
  • Save bquast/9f28870597a5dbf23e4d4557bb062503 to your computer and use it in GitHub Desktop.
Save bquast/9f28870597a5dbf23e4d4557bb062503 to your computer and use it in GitHub Desktop.
const { Readability } = require('@mozilla/readability');
const jsdom = require('jsdom');
const fs = require('fs');
const path = require('path');
const os = require('os');
const fetch = (...args) => import('node-fetch').then(({default: fetch}) => fetch(...args));
/**
 * Download one image into `folder` and return the file name it was saved under.
 *
 * The file is named `image-<idx><ext>`. The extension comes from the URL path
 * and falls back to `.jpg` when it is missing or contains non-word characters.
 * This is deliberately best-effort: any failure (malformed URL, network error,
 * non-2xx response, write error) is reported on stderr and signalled to the
 * caller by returning `null` instead of throwing.
 *
 * @param {string} imgUrl - Absolute URL of the image to fetch.
 * @param {string} folder - Existing directory the file is written into.
 * @param {number} idx - Sequence number used to build the file name.
 * @returns {Promise<string|null>} The saved file name (not the full path), or `null` on failure.
 */
async function downloadImage(imgUrl, folder, idx) {
  try {
    // URL.pathname never includes the query string, so extname() only sees the path.
    const { pathname } = new URL(imgUrl);
    const rawExt = path.extname(pathname);
    const ext = /^\.\w+$/.test(rawExt) ? rawExt : '.jpg';
    const fileName = `image-${idx}${ext}`;

    const res = await fetch(imgUrl);
    if (!res.ok) {
      throw new Error(`Failed to download image: ${imgUrl} (HTTP ${res.status})`);
    }

    const data = Buffer.from(await res.arrayBuffer());
    await fs.promises.writeFile(path.join(folder, fileName), data);
    return fileName;
  } catch (err) {
    // Warn on stderr so stdout stays reserved for the script's real output.
    console.warn(`Skipping image ${imgUrl}: ${err.message}`);
    return null;
  }
}
// Matches image URLs that look like ads, logos, banners or tracking pixels.
// "ad" must stand alone as a token (optionally "ads", "advert...", "adserv...")
// so ordinary paths such as "/uploads/" or "header.jpg" are not treated as ads.
const JUNK_IMAGE_PATTERN =
  /(?:^|[^a-z0-9])ad(?:s|vert[a-z]*|serv[a-z]*)?(?![a-z0-9])|logo|banner|pixel/i;

// Images declaring a width or height below this many pixels are dropped
// (many ads and tracking pixels are tiny).
const MIN_IMAGE_SIDE = 50;

/** Escape text for interpolation into HTML element content or a double-quoted attribute. */
function escapeHtml(text) {
  return String(text)
    .replace(/&/g, '&amp;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;')
    .replace(/"/g, '&quot;');
}

/** Keep only word characters, hyphens and spaces, then replace each run of spaces with `spaceReplacement`. */
function toSafeName(text, spaceReplacement) {
  return text
    .replace(/[^\w\- ]/g, '')
    .trim()
    .replace(/\s+/g, spaceReplacement);
}

/** Read an <img> dimension from the element property or attribute; 0 or NaN means "not declared". */
function declaredSize(img, dimension) {
  return Number(img[dimension] || img.getAttribute(dimension) || 0);
}

/**
 * CLI entry point: node readability-extract.js URL [LIBRARY_PATH]
 *
 * Fetches URL, extracts the readable article with Readability, downloads the
 * article's images next to it, and writes a standalone HTML file to
 * LIBRARY_PATH/<author>/<date>-<title>.html (LIBRARY_PATH defaults to
 * ~/Articles). The path of the written file is printed on stdout; all
 * diagnostics go to stderr and any failure exits with status 1.
 */
(async () => {
  const [, , url, destArg] = process.argv;
  if (!url) {
    console.error('Usage: node readability-extract.js URL [LIBRARY_PATH]');
    process.exit(1);
  }
  const destFolder = destArg || path.join(os.homedir(), 'Articles');

  const res = await fetch(url);
  if (!res.ok) {
    // Without this check an error page would be parsed and saved as the article.
    throw new Error(`Failed to fetch ${url}: HTTP ${res.status}`);
  }
  const html = await res.text();
  const dom = new jsdom.JSDOM(html, { url });
  const article = new Readability(dom.window.document).parse();
  if (!article) {
    console.error('Failed to extract article');
    process.exit(1);
  }

  // Human-readable values go into the HTML (escaped); sanitized values name files and folders.
  const title = (article.title || '').trim() || 'Untitled';
  const displayAuthor = (article.byline || '').replace(/\s+/g, ' ').trim() || 'UnknownAuthor';
  const author = toSafeName(displayAuthor, '') || 'UnknownAuthor';
  const safeTitle = toSafeName(title, '-').toLowerCase() || 'untitled';
  // Date of download (UTC), not the article's publication date.
  const date = new Date().toISOString().slice(0, 10);

  const authorFolder = path.join(destFolder, author);
  const imagesDirName = `${date}-${safeTitle}_images`;
  const imagesFolder = path.join(authorFolder, imagesDirName);
  // recursive: true also creates authorFolder and destFolder when missing.
  fs.mkdirSync(imagesFolder, { recursive: true });

  // Load article HTML into JSDOM for filtering and image processing.
  const contentDom = new jsdom.JSDOM(article.content);
  const contentDoc = contentDom.window.document;
  let idx = 1;
  for (const img of contentDoc.querySelectorAll('img')) {
    const src = img.getAttribute('src');
    if (!src || JUNK_IMAGE_PATTERN.test(src)) {
      img.remove();
      continue;
    }

    const width = declaredSize(img, 'width');
    const height = declaredSize(img, 'height');
    if ((width && width < MIN_IMAGE_SIDE) || (height && height < MIN_IMAGE_SIDE)) {
      img.remove();
      continue;
    }

    // Resolve relative URLs against the page URL.
    let absUrl;
    try {
      absUrl = new URL(src, url).toString();
    } catch (err) {
      img.remove();
      continue;
    }

    // Sequential on purpose: keeps image numbering in document order.
    const localName = await downloadImage(absUrl, imagesFolder, idx++);
    if (localName) {
      img.setAttribute('src', `./${imagesDirName}/${localName}`);
    } else {
      img.remove(); // remove if download failed
    }
  }

  // article.content is Readability's extracted markup and is inserted as HTML;
  // title and author are plain text and must be escaped.
  const contentHtml = contentDoc.body.innerHTML;
  const file = path.join(authorFolder, `${date}-${safeTitle}.html`);
  const fullHtml = `<!doctype html>
<html>
<head>
<meta charset="utf-8">
<title>${escapeHtml(title)}</title>
<meta name="author" content="${escapeHtml(displayAuthor)}">
</head>
<body>
<h1>${escapeHtml(title)}</h1>
<h2>By ${escapeHtml(displayAuthor)}</h2>
<article>
${contentHtml}
</article>
</body>
</html>
`;
  fs.writeFileSync(file, fullHtml, 'utf8');
  console.log(file);
})().catch((err) => {
  // Report network/filesystem failures cleanly instead of as an unhandled rejection.
  console.error(`Error: ${err.message}`);
  process.exit(1);
});
@bquast
Copy link
Author

bquast commented Jun 16, 2025

Install:

npm install @mozilla/readability jsdom node-fetch

Usage:

node readability-extract.js "https://example.com/article"

@bquast
Copy link
Author

bquast commented Jun 16, 2025

Move to article directory (replace UnknownAuthor with the author folder):

cd ~/Articles/UnknownAuthor

Convert using pandoc:

pandoc "2025-06-23-title-of-article.html" -o "title-of-article.epub"

The title-of-article.epub is just an example.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment