Created
August 3, 2025 16:43
-
-
Save gbertb/49dd8867822f9823d23b1431df1f6c1c to your computer and use it in GitHub Desktop.
Spider crawl TypeScript example
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import { Spider } from "@spider-cloud/spider-client"; | |
import { JSDOM } from 'jsdom'; | |
const SPIDER_API_KEY = process.env.SPIDER_API_KEY; | |
if (!SPIDER_API_KEY) { | |
console.error('Error: SPIDER_API_KEY environment variable is not set'); | |
console.error('Please run: export SPIDER_API_KEY=your-api-key'); | |
process.exit(1); | |
} | |
async function startCrawl(url: string) { | |
const spider = new Spider({ apiKey: SPIDER_API_KEY }); | |
const crawlParams = { | |
limit: 1, | |
request: 'smart' as const, | |
full_resources: false, | |
return_page_links: false, | |
return_headers: false, | |
concurrency_limit: 1, | |
return_format: 'raw' as const, | |
readability: false, | |
external_domains: [], | |
root_selector: 'body', | |
subdomains: false, | |
tld: false, | |
proxy_enabled: true, | |
block_ads: true, | |
block_stylesheets: true, | |
wait_for: { | |
page_navigations: true, | |
delay: { | |
timeout: { secs: 10, nanos: 0 } | |
} | |
} | |
}; | |
const baseDomain = new URL(url).hostname.replace('www.', ''); | |
const isInternal = (link: string) => { | |
try { | |
const parsed = new URL(link, url); | |
return parsed.hostname.endsWith(baseDomain); | |
} catch { | |
return false; | |
} | |
}; | |
let pageCount = 0; | |
let skippedCount = 0; | |
const skippedUrls: string[] = []; | |
console.log(`π· Starting crawl for: ${url}`); | |
await spider.crawlUrl(url, crawlParams, true, async (page: any) => { | |
try { | |
pageCount++; | |
console.log(`π Spider discovered: ${page.url}`); | |
console.log(`π Processing page #${pageCount}: ${page.url}`); | |
if (!page.url) return; | |
const html = page.content || page.html; | |
if (!html) return; | |
// Remove <style> and <script> blocks before parsing to reduce memory usage | |
const cleanedHtml = html | |
.replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '') | |
.replace(/<script[^>]*>[\s\S]*?<\/script>/gi, ''); | |
const dom = new JSDOM(cleanedHtml); | |
const doc = dom.window.document; | |
// Extract headings | |
const extractHeadings = (level: number): string[] => | |
Array.from(doc?.querySelectorAll(`h${level}`) || []).map( | |
(el) => el.textContent?.trim() || '' | |
); | |
const hTags: { [k: string]: string[] } = {}; | |
for (let i = 1; i <= 6; i++) { | |
hTags[`h${i}`] = extractHeadings(i); | |
} | |
const meta = page.metadata || {}; | |
const links: string[] = page.links || []; | |
const internalLinks = links.filter((link: string) => isInternal(link)); | |
const externalLinks = links.filter((link: string) => !isInternal(link)); | |
// Print page summary | |
console.log('---- Page summary ----'); | |
console.log(`URL: ${page.url}`); | |
console.log('Meta:', meta); | |
console.log('Headings:', hTags); | |
console.log('Internal Links:', internalLinks.slice(0, 10)); | |
console.log('External Links:', externalLinks.slice(0, 10)); | |
console.log('Content:', page.content); | |
console.log('----------------------'); | |
} catch (err: any) { | |
console.error(`β Error processing page: ${page.url}, ${err.message}`); | |
skippedCount++; | |
skippedUrls.push(page.url); | |
return; | |
} | |
}); | |
console.log(`\nFinished. Processed ${pageCount} pages. Skipped ${skippedCount} pages.`); | |
if (skippedUrls.length > 0) { | |
console.log('Skipped URLs:', skippedUrls); | |
} | |
} | |
// ----------- CLI ---------- | |
if (require.main === module) { | |
const url = process.argv[2]; | |
if (!url) { | |
console.error('Usage: node crawl-site.js <url>'); | |
process.exit(1); | |
} | |
startCrawl(url).catch((err) => { | |
console.error('Fatal error:', err); | |
process.exit(1); | |
}); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
I'm going to be converting the .ts to .js — would
import { Spider } from "@spider-cloud/spider-client";
still be the correct client to use?