bquast · June 16, 2025 22:15 · bquast · Jun 16, 2025 · bquast · Jun 16, 2025
diff --git a/readability-extract.js b/readability-extract.js
 const { Readability } = require('@mozilla/readability');
 const jsdom = require('jsdom');
 const fs = require('fs');
 const path = require('path');
 const os = require('os');
 const fetch = (...args) => import('node-fetch').then(({default: fetch}) => fetch(...args));

 /**
  * Download one image into `folder` and return the local file name.
  *
  * The file is named `image-<idx><ext>`. The extension comes from the URL path
  * and falls back to `.jpg` when it is missing or contains anything other than
  * word characters.
  *
  * Best-effort: any failure (invalid URL, network error, non-OK response,
  * write error) is reported on stderr and signalled by returning `null`, so
  * the caller can drop the image without aborting the whole run.
  *
  * @param {string} imgUrl - Absolute URL of the image.
  * @param {string} folder - Directory to write the image into.
  * @param {number} idx - Sequence number used in the file name.
  * @returns {Promise<string|null>} File name (not the full path), or `null` on failure.
  */
 async function downloadImage(imgUrl, folder, idx) {
  try {
    // URL#pathname never includes the query string, so extname alone is enough.
    const { pathname } = new URL(imgUrl);
    const rawExt = path.extname(pathname);
    const ext = /^\.\w+$/.test(rawExt) ? rawExt : '.jpg';

    const fileName = `image-${idx}${ext}`;
    const filePath = path.join(folder, fileName);

    const res = await fetch(imgUrl);
    if (!res.ok) throw new Error(`Failed to download image: ${imgUrl}`);

    const buffer = await res.arrayBuffer();
    fs.writeFileSync(filePath, Buffer.from(buffer));
    return fileName;
  } catch (e) {
    // Report on stderr so stdout stays reserved for the script's real output.
    const reason = e instanceof Error ? e.message : String(e);
    console.error(`Skipping image ${imgUrl}: ${reason}`);
    return null;
  }
 }

 // Script entry point.
 // Usage: node readability-extract.js URL [LIBRARY_PATH]
 //
 // Fetches URL, extracts the readable article with Readability, downloads the
 // article's images next to it, and writes a standalone HTML file to
 // <LIBRARY_PATH>/<author>/<date>-<title>.html (LIBRARY_PATH defaults to
 // ~/Articles). The path of the written file is printed on stdout.
 (async () => {
  // Escape text for interpolation into HTML content or attribute values.
  const escapeHtml = (text) =>
    String(text).replace(/[&<>"']/g, (ch) => `&#${ch.charCodeAt(0)};`);

  // Heuristic for ad/logo/banner/tracking images. "ad" must stand alone as a
  // token (not flanked by letters) so that paths such as "/uploads/" or
  // "header.png" are not mistaken for ads.
  const AD_LIKE_SRC = /(^|[^a-z])ads?([^a-z]|$)|logo|banner|pixel/i;
  // Images declared smaller than this (in px) are treated as ads/trackers.
  const MIN_IMAGE_SIZE = 50;

  const url = process.argv[2];
  if (!url) {
    console.error('Usage: node readability-extract.js URL [LIBRARY_PATH]');
    process.exit(1);
  }
  const destFolder = process.argv[3] || path.join(os.homedir(), 'Articles');

  const res = await fetch(url);
  if (!res.ok) {
    // Do not archive an HTTP error page as if it were the article.
    console.error(`Failed to fetch ${url}: HTTP ${res.status}`);
    process.exit(1);
  }
  const html = await res.text();
  const dom = new jsdom.JSDOM(html, { url });
  const article = new Readability(dom.window.document).parse();

  if (!article) {
    console.error('Failed to extract article');
    process.exit(1);
  }

  // Folder/file name components: keep only word chars, '-' and spaces.
  // Fall back to a placeholder when nothing survives sanitising, so we never
  // produce an empty folder or file name.
  const author =
    (article.byline || '').replace(/[^\w\- ]/g, '').replace(/\s+/g, '') ||
    'UnknownAuthor';
  const date = new Date().toISOString().slice(0, 10);
  const rawTitle = article.title || '';
  const safeTitle =
    rawTitle
      .replace(/[^\w\- ]/g, '')
      .replace(/\s+/g, '-')
      .toLowerCase() || 'untitled';

  const authorFolder = path.join(destFolder, author);
  fs.mkdirSync(authorFolder, { recursive: true });
  const imagesDirName = `${date}-${safeTitle}_images`;
  const imagesFolder = path.join(authorFolder, imagesDirName);
  fs.mkdirSync(imagesFolder, { recursive: true });

  // Load article HTML into JSDOM for filtering and image processing
  const contentDom = new jsdom.JSDOM(article.content);
  const imgEls = contentDom.window.document.querySelectorAll('img');
  let idx = 1;
  for (const img of imgEls) {
    const src = img.getAttribute('src');
    if (!src || AD_LIKE_SRC.test(src)) {
      img.remove();
      continue;
    }

    // Filter tiny images; a missing or non-numeric size is treated as unknown.
    const width = Number(img.width || img.getAttribute('width') || 0);
    const height = Number(img.height || img.getAttribute('height') || 0);
    if ((width && width < MIN_IMAGE_SIZE) || (height && height < MIN_IMAGE_SIZE)) {
      img.remove();
      continue;
    }

    // Resolve relative URLs against the article URL
    let absUrl;
    try {
      absUrl = new URL(src, url).toString();
    } catch (e) {
      img.remove();
      continue;
    }

    const localName = await downloadImage(absUrl, imagesFolder, idx++);
    if (localName) {
      img.setAttribute('src', `./${imagesDirName}/${localName}`);
    } else {
      img.remove(); // remove if download failed
    }
  }

  // Replace article.content with updated HTML (images fixed)
  article.content = contentDom.window.document.body.innerHTML;

  // The title is page-supplied text, so escape it before embedding.
  const htmlTitle = escapeHtml(rawTitle);
  const file = path.join(authorFolder, `${date}-${safeTitle}.html`);
  const fullHtml = `
    <!doctype html>
    <html>
    <head>
      <meta charset="utf-8">
      <title>${htmlTitle}</title>
      <meta name="author" content="${author}">
    </head>
    <body>
      <h1>${htmlTitle}</h1>
      <h2>By ${author}</h2>
      <article>
        ${article.content}
      </article>
    </body>
    </html>
  `;

  fs.writeFileSync(file, fullHtml, 'utf8');
  console.log(file);
 })().catch((err) => {
  // Network, parse and filesystem errors end up here instead of becoming an
  // unhandled rejection.
  console.error(err instanceof Error ? err.message : String(err));
  process.exit(1);
 });
	const { Readability } = require('@mozilla/readability');
	const jsdom = require('jsdom');
	const fs = require('fs');
	const path = require('path');
	const os = require('os');
	const fetch = (...args) => import('node-fetch').then(({default: fetch}) => fetch(...args));

	/**
	 * Download one image into `folder` and return the local file name.
	 *
	 * The file is named `image-<idx><ext>`. The extension comes from the URL path
	 * and falls back to `.jpg` when it is missing or contains anything other than
	 * word characters.
	 *
	 * Best-effort: any failure (invalid URL, network error, non-OK response,
	 * write error) is reported on stderr and signalled by returning `null`, so
	 * the caller can drop the image without aborting the whole run.
	 *
	 * @param {string} imgUrl - Absolute URL of the image.
	 * @param {string} folder - Directory to write the image into.
	 * @param {number} idx - Sequence number used in the file name.
	 * @returns {Promise<string|null>} File name (not the full path), or `null` on failure.
	 */
	async function downloadImage(imgUrl, folder, idx) {
	  try {
	    // URL#pathname never includes the query string, so extname alone is enough.
	    const { pathname } = new URL(imgUrl);
	    const rawExt = path.extname(pathname);
	    const ext = /^\.\w+$/.test(rawExt) ? rawExt : '.jpg';

	    const fileName = `image-${idx}${ext}`;
	    const filePath = path.join(folder, fileName);

	    const res = await fetch(imgUrl);
	    if (!res.ok) throw new Error(`Failed to download image: ${imgUrl}`);

	    const buffer = await res.arrayBuffer();
	    fs.writeFileSync(filePath, Buffer.from(buffer));
	    return fileName;
	  } catch (e) {
	    // Report on stderr so stdout stays reserved for the script's real output.
	    const reason = e instanceof Error ? e.message : String(e);
	    console.error(`Skipping image ${imgUrl}: ${reason}`);
	    return null;
	  }
	}

	// Script entry point.
	// Usage: node readability-extract.js URL [LIBRARY_PATH]
	//
	// Fetches URL, extracts the readable article with Readability, downloads the
	// article's images next to it, and writes a standalone HTML file to
	// <LIBRARY_PATH>/<author>/<date>-<title>.html (LIBRARY_PATH defaults to
	// ~/Articles). The path of the written file is printed on stdout.
	(async () => {
	  // Escape text for interpolation into HTML content or attribute values.
	  const escapeHtml = (text) =>
	    String(text).replace(/[&<>"']/g, (ch) => `&#${ch.charCodeAt(0)};`);

	  // Heuristic for ad/logo/banner/tracking images. "ad" must stand alone as a
	  // token (not flanked by letters) so that paths such as "/uploads/" or
	  // "header.png" are not mistaken for ads.
	  const AD_LIKE_SRC = /(^|[^a-z])ads?([^a-z]|$)|logo|banner|pixel/i;
	  // Images declared smaller than this (in px) are treated as ads/trackers.
	  const MIN_IMAGE_SIZE = 50;

	  const url = process.argv[2];
	  if (!url) {
	    console.error('Usage: node readability-extract.js URL [LIBRARY_PATH]');
	    process.exit(1);
	  }
	  const destFolder = process.argv[3] || path.join(os.homedir(), 'Articles');

	  const res = await fetch(url);
	  if (!res.ok) {
	    // Do not archive an HTTP error page as if it were the article.
	    console.error(`Failed to fetch ${url}: HTTP ${res.status}`);
	    process.exit(1);
	  }
	  const html = await res.text();
	  const dom = new jsdom.JSDOM(html, { url });
	  const article = new Readability(dom.window.document).parse();

	  if (!article) {
	    console.error('Failed to extract article');
	    process.exit(1);
	  }

	  // Folder/file name components: keep only word chars, '-' and spaces.
	  // Fall back to a placeholder when nothing survives sanitising, so we never
	  // produce an empty folder or file name.
	  const author =
	    (article.byline || '').replace(/[^\w\- ]/g, '').replace(/\s+/g, '') ||
	    'UnknownAuthor';
	  const date = new Date().toISOString().slice(0, 10);
	  const rawTitle = article.title || '';
	  const safeTitle =
	    rawTitle
	      .replace(/[^\w\- ]/g, '')
	      .replace(/\s+/g, '-')
	      .toLowerCase() || 'untitled';

	  const authorFolder = path.join(destFolder, author);
	  fs.mkdirSync(authorFolder, { recursive: true });
	  const imagesDirName = `${date}-${safeTitle}_images`;
	  const imagesFolder = path.join(authorFolder, imagesDirName);
	  fs.mkdirSync(imagesFolder, { recursive: true });

	  // Load article HTML into JSDOM for filtering and image processing
	  const contentDom = new jsdom.JSDOM(article.content);
	  const imgEls = contentDom.window.document.querySelectorAll('img');
	  let idx = 1;
	  for (const img of imgEls) {
	    const src = img.getAttribute('src');
	    if (!src || AD_LIKE_SRC.test(src)) {
	      img.remove();
	      continue;
	    }

	    // Filter tiny images; a missing or non-numeric size is treated as unknown.
	    const width = Number(img.width || img.getAttribute('width') || 0);
	    const height = Number(img.height || img.getAttribute('height') || 0);
	    if ((width && width < MIN_IMAGE_SIZE) || (height && height < MIN_IMAGE_SIZE)) {
	      img.remove();
	      continue;
	    }

	    // Resolve relative URLs against the article URL
	    let absUrl;
	    try {
	      absUrl = new URL(src, url).toString();
	    } catch (e) {
	      img.remove();
	      continue;
	    }

	    const localName = await downloadImage(absUrl, imagesFolder, idx++);
	    if (localName) {
	      img.setAttribute('src', `./${imagesDirName}/${localName}`);
	    } else {
	      img.remove(); // remove if download failed
	    }
	  }

	  // Replace article.content with updated HTML (images fixed)
	  article.content = contentDom.window.document.body.innerHTML;

	  // The title is page-supplied text, so escape it before embedding.
	  const htmlTitle = escapeHtml(rawTitle);
	  const file = path.join(authorFolder, `${date}-${safeTitle}.html`);
	  const fullHtml = `
	    <!doctype html>
	    <html>
	    <head>
	      <meta charset="utf-8">
	      <title>${htmlTitle}</title>
	      <meta name="author" content="${author}">
	    </head>
	    <body>
	      <h1>${htmlTitle}</h1>
	      <h2>By ${author}</h2>
	      <article>
	        ${article.content}
	      </article>
	    </body>
	    </html>
	  `;

	  fs.writeFileSync(file, fullHtml, 'utf8');
	  console.log(file);
	})().catch((err) => {
	  // Network, parse and filesystem errors end up here instead of becoming an
	  // unhandled rejection.
	  console.error(err instanceof Error ? err.message : String(err));
	  process.exit(1);
	});