Last active
July 20, 2025 17:17
-
-
Save KrishGarg/3e0b5aa33f89270be54f7e4a73f0f462 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// scrape.js
const puppeteer = require("puppeteer");
const fs = require("fs");
const path = require("path");

// Start from the home page: requesting the allotment page directly is sometimes denied.
const TARGET_URL = "https://tgeapcet.nic.in/default.aspx";
/**
 * Reads the entries of a <select> element.
 *
 * Executed inside the page through `evaluate()`, so it may only use its
 * arguments and browser globals.
 *
 * @param {string} selector - CSS selector of the <select> element.
 * @param {string} placeholderValue - Option value of the default "--Select--" entry to drop.
 * @returns {{name: string, value: string}[]} Remaining options, in document order.
 */
function readSelectOptions(selector, placeholderValue) {
  return Array.from(document.querySelector(selector).options)
    .map((option) => ({ name: option.text, value: option.value }))
    .filter((option) => option.value !== placeholderValue);
}

/**
 * Converts the rows of the results table into plain records.
 *
 * Executed inside the page through `evaluate()`, so it may only use its
 * arguments and browser globals.
 *
 * @param {string} tableSelector - CSS selector of the results table.
 * @returns {object[]} One record per row that has more than one <td> cell.
 */
function readAllotmentRows(tableSelector) {
  const records = [];
  for (const row of document.querySelectorAll(`${tableSelector} tr`)) {
    const cells = row.querySelectorAll("td");
    // Every row is visited; rows with fewer than two <td> cells (presumably the
    // header row, built from <th>) carry no record and are skipped.
    if (cells.length <= 1) {
      continue;
    }
    const textAt = (index) => cells[index]?.innerText.trim();
    // The keys here should match the table structure
    records.push({
      sno: Number(textAt(0)), // numeric columns are stored as numbers
      hallticketno: textAt(1),
      rank: Number(textAt(2)),
      name: textAt(3),
      sex: textAt(4),
      caste: textAt(5),
      region: textAt(6),
      seatcategory: textAt(7),
    });
  }
  return records;
}

/**
 * Runs an action that triggers a page load and waits for that load.
 *
 * The navigation wait is registered before the action starts, so a fast
 * navigation cannot finish before anyone is listening for it.
 *
 * @param {object} page - Puppeteer page the action operates on.
 * @param {() => Promise<unknown>} action - Triggers the navigation (select, click, ...).
 * @returns {Promise<void>}
 */
async function performAndAwaitNavigation(page, action) {
  await Promise.all([page.waitForNavigation(), action()]);
}

/**
 * Steps one entry back in the tab's history so the next lookup can start over.
 *
 * @param {object} page - Puppeteer page to move back.
 * @returns {Promise<void>}
 */
async function returnToPreviousPage(page) {
  await page.goBack();
  // NOTE(review): goBack() already resolves once its own navigation is done, so
  // this presumably waits for a follow-up load the site performs after going
  // back — confirm against the live site; without one it fails after the
  // default navigation timeout.
  await page.waitForNavigation();
}

/**
 * Scrapes the allotment list of every college/branch combination offered by
 * the allotment page and writes the result to `allotment_data.json` next to
 * this script.
 *
 * The output maps college name -> branch name -> array of row records.
 * A failure for a single college/branch pair is logged and that pair is
 * skipped; failures elsewhere reject the returned promise. The browser is
 * closed in either case.
 *
 * @returns {Promise<void>}
 */
async function scrapeAllotments() {
  console.log("🚀 Starting the scraper...");

  // headless runs the whole browser inside the program, without opening a UI window
  const browser = await puppeteer.launch({ headless: true });

  try {
    const ua =
      "Mozilla/5.0 (Linux; Android 10; K) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Mobile Safari/537.3"; // to mimic a regular browser in headless mode

    const page = await browser.newPage();
    await page.setUserAgent(ua);
    await page.goto(TARGET_URL, { waitUntil: "networkidle2" });
    console.log(`On ${TARGET_URL}`);

    // Inspect the page to confirm these selectors if the site changes
    const collegeSelector = "#MainContent_DropDownList1";
    const branchSelector = "#MainContent_DropDownList2";
    const submitButtonSelector = "#MainContent_btn_allot";
    const resultsTableSelector = "table.sortable";

    // The cutoffs link opens in a new page/tab; start listening before clicking
    const newPagePromise = new Promise((resolve) =>
      browser.once("targetcreated", (target) => resolve(target.page()))
    );
    await page.click("a[href$='college_allotment.aspx']");
    const newPage = await newPagePromise;

    // Wait on page state rather than on a navigation event: the new tab may
    // already have finished loading by the time we get a handle on it.
    await newPage.waitForFunction(
      (selector) =>
        document.readyState === "complete" &&
        document.querySelector(selector) !== null,
      {},
      collegeSelector
    );
    await newPage.setUserAgent(ua);

    // "" is the value of the default "--Select--" college option
    const colleges = await newPage.evaluate(
      readSelectOptions,
      collegeSelector,
      ""
    );

    // object holding all scraped data: college name -> branch name -> records
    const allRecords = {};

    // The site is a little bugged for the first entry: another college has to
    // be selected before the first one works, so the first college goes last.
    const visitOrder = [...colleges.slice(1), ...colleges.slice(0, 1)];

    for (const college of visitOrder) {
      // selecting a college reloads the page
      await performAndAwaitNavigation(newPage, () =>
        newPage.select(collegeSelector, college.value)
      );

      // "0" is the value of the default "--Select--" branch option
      const branches = await newPage.evaluate(
        readSelectOptions,
        branchSelector,
        "0"
      );

      allRecords[college.name] = {};

      for (const branch of branches) {
        try {
          console.log(`Fetching: ${college.name} -> ${branch.name}`);

          // Select college and branch
          await performAndAwaitNavigation(newPage, () =>
            newPage.select(collegeSelector, college.value)
          );
          await newPage.select(branchSelector, branch.value);

          // Submit and wait for the results page
          await performAndAwaitNavigation(newPage, () =>
            newPage.click(submitButtonSelector)
          );

          allRecords[college.name][branch.name] = [];

          // scrape the table inside the browser
          const tableData = await newPage.evaluate(
            readAllotmentRows,
            resultsTableSelector
          );

          if (tableData.length > 0) {
            allRecords[college.name][branch.name].push(...tableData);
          } else {
            // Edge case found during testing: some branches have no allotments,
            // so the table is empty; go back and continue with the next branch.
            await returnToPreviousPage(newPage);
          }
        } catch (error) {
          console.error(
            `Could not fetch data for ${college.name} -> ${branch.name}. Skipping.`,
            error
          );
          // any error, just go back and continue
          await returnToPreviousPage(newPage);
        }
      }
    }

    // write to json
    const outputPath = path.resolve(__dirname, "allotment_data.json");
    fs.writeFileSync(outputPath, JSON.stringify(allRecords, null, 2));
    console.log(`✅ Scraping complete! Data saved to ${outputPath}`);
  } finally {
    // Always release the browser process, including when scraping fails midway
    await browser.close();
  }
}
// Run the scraper. On failure, log the error and exit with a non-zero status
// so shells and schedulers can tell the run did not complete.
scrapeAllotments().catch((error) => {
  console.error(error);
  process.exitCode = 1;
});
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment