Last active
January 12, 2024 14:22
-
-
Save sueszli/d5836b263d2840fa0c64983bfcf44620 to your computer and use it in GitHub Desktop.
kijiji.com scraper
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import axios from 'axios' | |
import * as cheerio from 'cheerio' | |
import { assert } from 'console' | |
import open from 'open' | |
/**
 * Collects listing links from a paginated result page and opens them in the default browser.
 *
 * The start URL is read from the first command-line argument (`process.argv[2]`).
 * For every page, the `href` of each `a[data-testid="listing-link"]` element is collected,
 * then the `li[data-testid="pagination-next-link"] a` link is followed until no such link
 * exists (or it points to a page that was already visited). After the user presses enter,
 * every collected link is resolved against the last visited page URL and passed to `open`.
 *
 * Side effects: writes progress to the console, reads one chunk from stdin, and terminates
 * the process (exit code 0 when done, 1 when the next-page link has no `href`).
 *
 * @returns {Promise<void>} never resolves normally because the process exits first
 * @throws {Error} when no URL argument was passed on the command line
 */
const main = async () => {
    // `console.assert` only logs a message and never stops execution, so validate explicitly.
    if (process.argv.length <= 2) {
        throw new Error('illegal number of arguments')
    }
    let url = process.argv[2]
    if (!url) {
        throw new Error('missing url as argument')
    }

    const links = []
    // Remember visited pages so circular pagination cannot loop forever.
    const visited = new Set()

    while (true) {
        visited.add(url)
        const response = await axios.get(url)
        const $ = cheerio.load(response.data)

        // get links from current page
        const newLinks = []
        $('a[data-testid="listing-link"]').each((i, a) => {
            const href = a.type === 'tag' ? a.attribs?.href : undefined
            if (href) {
                newLinks.push(href)
            }
        })
        console.log(`found ${newLinks.length} links`)
        links.push(...newLinks)

        // set link for next page, repeat
        const nextButton = $('li[data-testid="pagination-next-link"] a')
        if (!nextButton.length) {
            console.log('reached last page')
            break
        }
        const nextButtonHref = nextButton.attr('href')
        if (!nextButtonHref) {
            console.error('next button has no href')
            process.exit(1)
        }
        // The href may be relative, so resolve it against the page it was found on.
        const nextPageLink = new URL(nextButtonHref, url).href
        if (visited.has(nextPageLink)) {
            console.log('next page was already visited, stopping')
            break
        }
        url = nextPageLink
    }

    console.log(`press enter key to open the ${links.length} scraped links in your default browser`)
    await new Promise((resolve) => process.stdin.once('data', resolve))

    // iterate through links
    for (const link of links) {
        const lurl = new URL(link, url).href
        console.log(`opening: ${lurl}`)
        try {
            // Await the launch: exiting while the promise is still pending could
            // terminate the process before the browser has been started.
            await open(lurl)
        } catch (err) {
            // Best effort: one link failing to open must not prevent the others.
            console.error(`failed to open: ${lurl}`, err)
        }
    }

    // stdin is still being read at this point and would keep the process alive.
    process.exit(0)
}
// Start the scraper. A rejection (e.g. a failed HTTP request) is reported explicitly
// and turned into a non-zero exit code instead of being left as an unhandled rejection.
main().catch((err) => {
    console.error(err)
    // Exit explicitly: a stdin listener may still be keeping the process alive.
    process.exit(1)
})
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Exports a configuration object with a single `cacheDirectory` setting.
// NOTE(review): the key name and the "puppeteer" path segment suggest this is a
// Puppeteer configuration file — confirm against the file name.
const join = require("path").join;

// <directory containing this file>/.cache/puppeteer
const cacheDirectory = join(__dirname, ".cache", "puppeteer");

module.exports = { cacheDirectory };
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment