Skip to content

Instantly share code, notes, and snippets.

@VityaSchel
Created October 3, 2024 05:10
Show Gist options
  • Save VityaSchel/81ccf5a160173a5c5e78a520e09f162c to your computer and use it in GitHub Desktop.
Save VityaSchel/81ccf5a160173a5c5e78a520e09f162c to your computer and use it in GitHub Desktop.
Tapochek.net HTML scraper that gets topics
// OPEN https://tapochek.net/viewforum.php?f=910&start=0 FIRST
// THEN RUN THIS SCRIPT:
/**
 * Crawls a paginated forum listing, starting from the page currently open in
 * the browser (`window.location.href`), and collects one record per topic row.
 *
 * Pages are fetched one after another by following the link whose text is
 * 'След.' until no such link is found, a page fails to load, or the link
 * points to a page that was already fetched.
 *
 * The collected array is printed with `console.log` and also returned, so the
 * caller can either copy it from the console or `await start()`.
 *
 * @returns {Promise<Array<{title: string, size: (string|undefined), downloads: (string|undefined)}>>}
 *   Topics gathered so far; partial if a page failed to load.
 */
async function start() {
  const result = []
  // URLs that were already requested. Without this guard, a 'След.' link that
  // points back to an earlier page would make the loop below run forever and
  // keep appending duplicate rows.
  const visited = new Set()

  /**
   * Fetches one listing page, appends its topic rows to `result`, and returns
   * the URL of the following page.
   *
   * @param {string} url - Address of the listing page to fetch.
   * @returns {Promise<string|undefined>} `href` of the 'След.' link, or
   *   `undefined` when the page has none.
   * @throws {Error} When the response status is not OK.
   */
  async function parsePage(url) {
    const response = await fetch(url)
    if (!response.ok) {
      throw new Error(`HTTP error! status: ${response.status}`)
    }

    // Decode the raw bytes as windows-1251 explicitly; response.text() would
    // always decode them as UTF-8.
    const arrayBuffer = await response.arrayBuffer()
    const decodedString = new TextDecoder('windows-1251').decode(arrayBuffer)
    const doc = new DOMParser().parseFromString(decodedString, 'text/html')

    for (const row of doc.querySelectorAll('tbody > tr[id^=tr]')) {
      const title = row.querySelector('a.torTopic')?.textContent.trim()
      // Rows without a topic link, or with a blank one, are skipped.
      if (!title) continue
      result.push({
        title,
        size: row.querySelector('a[href^="./download.php"]')?.textContent?.trim(),
        downloads: row.querySelector('p.med > b')?.textContent?.trim()
      })
    }

    const nextLink = Array.from(doc.querySelectorAll('a[href^=viewforum]'))
      .find(a => a.textContent.trim() === 'След.')
    return nextLink?.href
  }

  let nextPage = window.location.href
  while (nextPage && !visited.has(nextPage)) {
    visited.add(nextPage)
    try {
      nextPage = await parsePage(nextPage)
    } catch (err) {
      // Best effort: report the failure, stop crawling, keep what was collected.
      console.error('Error fetching the page:', err)
      nextPage = null
    }
  }

  console.log(result)
  return result
}
// Run the scraper; start() prints the collected topics with console.log once the crawl ends.
start()
// NOW COPY THE LOGGED ARRAY AS A JSON OBJECT FROM THE BROWSER'S CONSOLE
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment