import argparse
import csv
import getpass
import sys

import requests
from bs4 import BeautifulSoup

LOGIN_URL = "https://news.ycombinator.com/login"
UPVOTED_URL = "https://news.ycombinator.com/upvoted"
FAVORITES_URL = "https://news.ycombinator.com/favorites"

session = requests.Session()
session.headers.update({"User-Agent": "Mozilla/5.0"})
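# Note: a single shared Session is used so the login cookie set by login()
# (or injected by use_cookie()) is sent automatically on all later requests.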


def login(username, password):
    """Log in with username/password; HN stores the auth cookie on the session."""
    payload = {
        "acct": username,
        "pw": password,
    }
    res = session.post(LOGIN_URL, data=payload)
    if "Bad login" in res.text:
        raise Exception("❌ Login failed. Check your username/password.")
    print("✅ Login successful!")


def use_cookie(cookie_value):
    """Skip the login form by reusing the 'user' cookie from a logged-in browser session."""
    session.cookies.set("user", cookie_value)
    print("🍪 Using provided session cookie.")


def scrape_upvoted(username):
    return scrape_story_list(f"{UPVOTED_URL}?id={username}", "Upvoted")


def scrape_favorites(username):
    return scrape_story_list(f"{FAVORITES_URL}?id={username}", "Favorites", filter_comments=True)


def scrape_story_list(base_url, label, filter_comments=False):
    results = []
    page = 1
    while True:
        url = f"{base_url}&p={page}"
        print(f"Fetching {label} page {page}...")
        res = session.get(url)
        soup = BeautifulSoup(res.text, "html.parser")
        items = soup.select("tr.athing")
        print(f" - Found {len(items)} stories on page {page}")
        if not items:
            break
        for item in items:
            # Optional filtering to skip comments (left disabled: story subtext
            # rows also contain the word "comments", so this check is unreliable)
            # subtext = item.find_next_sibling("tr")
            # if filter_comments and subtext and "comment" in subtext.text.lower():
            #     continue
            title_elem = item.select_one(".titleline > a")
            if not title_elem:
                print(" -- Couldn't find link")
                continue
            results.append({
                "title": title_elem.text.strip(),
                "url": title_elem["href"],
            })
        # if len(items) < 30:
        #     break
        page += 1
    print(f"Found {len(results)} {label.lower()} stories.")
    return results


def save_to_csv(data, filename="hn_stories.csv"):
    with open(filename, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=["title", "url"])
        writer.writeheader()
        writer.writerows(data)
    print(f"💾 Saved to {filename}")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="📰 Hacker News Scraper (Upvotes / Favorites)")
    parser.add_argument("-u", "--username", help="HN username (required)")
    parser.add_argument("-p", "--password", help="HN password (optional, will be prompted)")
    parser.add_argument("-c", "--cookie", help="HN session cookie (alternative to login)")
    parser.add_argument("-f", "--favorites", action="store_true", help="Scrape favorites instead of upvotes")
    parser.add_argument("-o", "--output", default="hn_stories.csv", help="Output CSV file (default: hn_stories.csv)")
    args = parser.parse_args()

    if not args.username:
        print("❌ --username is required.")
        sys.exit(1)

    print("Hacker News Scraper\n")

    try:
        if args.cookie:
            use_cookie(args.cookie)
        else:
            password = args.password or getpass.getpass("HN Password: ")
            login(args.username, password)

        if args.favorites:
            stories = scrape_favorites(args.username)
        else:
            stories = scrape_upvoted(args.username)

        save_to_csv(stories, args.output)
    except Exception as e:
        print(f"❌ Error: {e}")
        sys.exit(1)
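
# Example invocations (assuming the file is saved as hn_scraper.py; the
# filename is illustrative, not part of the gist):
#
#   python hn_scraper.py -u myuser                  # prompts for the password, scrapes upvotes
#   python hn_scraper.py -u myuser -f -o favs.csv   # scrapes favorites into favs.csv
#   python hn_scraper.py -u myuser -c "<cookie>"    # reuses an existing HN session cookie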