import argparse
import csv
import getpass
import sys

import requests
from bs4 import BeautifulSoup

LOGIN_URL = "https://news.ycombinator.com/login"
UPVOTED_URL = "https://news.ycombinator.com/upvoted"
FAVORITES_URL = "https://news.ycombinator.com/favorites"

session = requests.Session()
session.headers.update({"User-Agent": "Mozilla/5.0"})
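# Note: a single shared Session is used so the login cookie set by login()
# (or injected by use_cookie()) is sent automatically on all later requests.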


def login(username, password):
    """Log in with username/password; HN stores the auth cookie on the session."""
    payload = {
        "acct": username,
        "pw": password,
    }
    res = session.post(LOGIN_URL, data=payload)
    if "Bad login" in res.text:
        raise Exception("❌ Login failed. Check your username/password.")
    print("✅ Login successful!")


def use_cookie(cookie_value):
    """Skip the login form by reusing the 'user' cookie from a logged-in browser session."""
    session.cookies.set("user", cookie_value)
    print("🍪 Using provided session cookie.")


def scrape_upvoted(username):
    return scrape_story_list(f"{UPVOTED_URL}?id={username}", "Upvoted")


def scrape_favorites(username):
    return scrape_story_list(f"{FAVORITES_URL}?id={username}", "Favorites", filter_comments=True)


def scrape_story_list(base_url, label, filter_comments=False):
    results = []
    page = 1
    while True:
        url = f"{base_url}&p={page}"
        print(f"Fetching {label} page {page}...")
        res = session.get(url)
        soup = BeautifulSoup(res.text, "html.parser")
        items = soup.select("tr.athing")
        print(f" - Found {len(items)} stories on page {page}")
        if not items:
            break
        for item in items:
            # Optional filtering to skip comments (left disabled: story subtext
            # rows also contain the word "comments", so this check is unreliable)
            # subtext = item.find_next_sibling("tr")
            # if filter_comments and subtext and "comment" in subtext.text.lower():
            #     continue
            title_elem = item.select_one(".titleline > a")
            if not title_elem:
                print(" -- Couldn't find link")
                continue
            results.append({
                "title": title_elem.text.strip(),
                "url": title_elem["href"],
            })
        # if len(items) < 30:
        #     break
        page += 1
    print(f"Found {len(results)} {label.lower()} stories.")
    return results


def save_to_csv(data, filename="hn_stories.csv"):
    with open(filename, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=["title", "url"])
        writer.writeheader()
        writer.writerows(data)
    print(f"💾 Saved to {filename}")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="📰 Hacker News Scraper (Upvotes / Favorites)")
    parser.add_argument("-u", "--username", help="HN username (required)")
    parser.add_argument("-p", "--password", help="HN password (optional, will be prompted)")
    parser.add_argument("-c", "--cookie", help="HN session cookie (alternative to login)")
    parser.add_argument("-f", "--favorites", action="store_true", help="Scrape favorites instead of upvotes")
    parser.add_argument("-o", "--output", default="hn_stories.csv", help="Output CSV file (default: hn_stories.csv)")
    args = parser.parse_args()

    if not args.username:
        print("❌ --username is required.")
        sys.exit(1)

    print("Hacker News Scraper\n")

    try:
        if args.cookie:
            use_cookie(args.cookie)
        else:
            password = args.password or getpass.getpass("HN Password: ")
            login(args.username, password)

        if args.favorites:
            stories = scrape_favorites(args.username)
        else:
            stories = scrape_upvoted(args.username)

        save_to_csv(stories, args.output)
    except Exception as e:
        print(f"❌ Error: {e}")
        sys.exit(1)
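
# Example invocations (assuming the file is saved as hn_scraper.py; the
# filename is illustrative, not part of the gist):
#
#   python hn_scraper.py -u myuser                  # prompts for the password, scrapes upvotes
#   python hn_scraper.py -u myuser -f -o favs.csv   # scrapes favorites into favs.csv
#   python hn_scraper.py -u myuser -c "<cookie>"    # reuses an existing HN session cookie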