@gbertb
Created August 3, 2025 16:43
Spider crawl TypeScript example
import { Spider } from "@spider-cloud/spider-client";
import { JSDOM } from 'jsdom';

const SPIDER_API_KEY = process.env.SPIDER_API_KEY;
if (!SPIDER_API_KEY) {
  console.error('Error: SPIDER_API_KEY environment variable is not set');
  console.error('Please run: export SPIDER_API_KEY=your-api-key');
  process.exit(1);
}
async function startCrawl(url: string) {
  const spider = new Spider({ apiKey: SPIDER_API_KEY });
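
  // Crawl configuration: limit the crawl to one page, return raw HTML,
  // block ads and stylesheets, route through a proxy, and allow up to 10s
  // for page navigation before capturing content.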
  const crawlParams = {
    limit: 1,
    request: 'smart' as const,
    full_resources: false,
    return_page_links: false,
    return_headers: false,
    concurrency_limit: 1,
    return_format: 'raw' as const,
    readability: false,
    external_domains: [],
    root_selector: 'body',
    subdomains: false,
    tld: false,
    proxy_enabled: true,
    block_ads: true,
    block_stylesheets: true,
    wait_for: {
      page_navigations: true,
      delay: {
        timeout: { secs: 10, nanos: 0 }
      }
    }
  };
  // Treat links on the crawled domain (and its subdomains) as internal
  const baseDomain = new URL(url).hostname.replace(/^www\./, '');
  const isInternal = (link: string) => {
    try {
      const parsed = new URL(link, url);
      return parsed.hostname === baseDomain || parsed.hostname.endsWith(`.${baseDomain}`);
    } catch {
      return false;
    }
  };

  let pageCount = 0;
  let skippedCount = 0;
  const skippedUrls: string[] = [];

  console.log(`👷 Starting crawl for: ${url}`);
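
  // Start the crawl; the callback below runs for each page the crawler returns.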
  await spider.crawlUrl(url, crawlParams, true, async (page: any) => {
    try {
      pageCount++;
      console.log(`🔍 Spider discovered: ${page.url}`);
      console.log(`📄 Processing page #${pageCount}: ${page.url}`);

      if (!page.url) return;
      const html = page.content || page.html;
      if (!html) return;

      // Remove <style> and <script> blocks before parsing to reduce memory usage
      const cleanedHtml = html
        .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '')
        .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '');
      const dom = new JSDOM(cleanedHtml);
      const doc = dom.window.document;

      // Extract headings h1-h6
      const extractHeadings = (level: number): string[] =>
        Array.from(doc.querySelectorAll(`h${level}`)).map(
          (el) => el.textContent?.trim() || ''
        );
      const hTags: { [k: string]: string[] } = {};
      for (let i = 1; i <= 6; i++) {
        hTags[`h${i}`] = extractHeadings(i);
      }

      const meta = page.metadata || {};
      const links: string[] = page.links || [];
      const internalLinks = links.filter((link: string) => isInternal(link));
      const externalLinks = links.filter((link: string) => !isInternal(link));

      // Print page summary
      console.log('---- Page summary ----');
      console.log(`URL: ${page.url}`);
      console.log('Meta:', meta);
      console.log('Headings:', hTags);
      console.log('Internal Links:', internalLinks.slice(0, 10));
      console.log('External Links:', externalLinks.slice(0, 10));
      console.log('Content:', page.content);
      console.log('----------------------');
    } catch (err: any) {
      console.error(`❌ Error processing page: ${page.url}, ${err.message}`);
      skippedCount++;
      skippedUrls.push(page.url);
      return;
    }
  });
  console.log(`\nFinished. Processed ${pageCount} pages. Skipped ${skippedCount} pages.`);
  if (skippedUrls.length > 0) {
    console.log('Skipped URLs:', skippedUrls);
  }
}
// ----------- CLI ----------
if (require.main === module) {
  const url = process.argv[2];
  if (!url) {
    console.error('Usage: node crawl-site.js <url>');
    process.exit(1);
  }
  startCrawl(url).catch((err) => {
    console.error('Fatal error:', err);
    process.exit(1);
  });
}
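
To try the script, one plausible workflow (assuming the file is saved as crawl-site.ts) is to export SPIDER_API_KEY, compile with tsc to produce crawl-site.js, and then run node crawl-site.js https://example.com; running it directly with ts-node should also work.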
tom-seed commented Aug 4, 2025

I'm going to be converting the .ts to .js; would import { Spider } from "@spider-cloud/spider-client"; still be the correct client to import?
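
For reference, a minimal sketch of the CommonJS form of those imports, assuming the package exposes a CommonJS build (if it is ESM-only, the import syntax stays as-is and the file runs as an ES module):

// CommonJS equivalents of the two ES-module imports at the top of the script
const { Spider } = require("@spider-cloud/spider-client");
const { JSDOM } = require("jsdom");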
