Last active
February 6, 2025 20:51
-
-
Save danielrmeyer/57b341c6618161d69e89865ee0c5d000 to your computer and use it in GitHub Desktop.
Scrape Idaho Bills
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# To use, install a newer version of python3 along with the following packages:
# pip install pdfminer pdfminer.six pandas beautifulsoup4
import requests | |
from bs4 import BeautifulSoup | |
import pandas as pd | |
import io | |
import time | |
import datetime | |
from pdfminer.high_level import extract_text | |
from datetime import datetime | |
# def generate_bills_filename(): | |
# current_date = datetime.now().strftime("%m_%d_%Y") | |
# return f"idaho_bills_{current_date}.csv" | |
def write_soup_to_file(soup, filename):
    """Write the prettified HTML of *soup* to *filename* as UTF-8 text."""
    pretty_html = soup.prettify()
    with open(filename, "w", encoding="utf-8") as out_file:
        out_file.write(pretty_html)
def parse_detail_page(detail_url, timeout=30):
    """Fetch a bill's detail page and return the sponsor string.

    Parameters
    ----------
    detail_url : str
        Site-relative path to the bill's detail page (joined onto the
        legislature.idaho.gov base URL).
    timeout : float, optional
        Seconds to wait for the HTTP response (default 30).  The original
        code had no timeout, so a stalled server would hang the scrape.

    Returns
    -------
    str
        The sponsor text with the leading "by " removed, or "" when the
        expected table/row/cell structure is absent (the original raised
        AttributeError/IndexError in that case).

    Raises
    ------
    requests.HTTPError
        If the server returns an error status.
    """
    base_url = "https://legislature.idaho.gov"
    full_url = base_url + detail_url
    resp = requests.get(full_url, timeout=timeout)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")

    bill_table = soup.find("table", class_="bill-table")
    if bill_table is None:
        return ""
    row = bill_table.find("tr")
    if row is None:
        return ""
    cells = row.find_all("td")
    if len(cells) < 3:
        return ""

    # Sponsor lives in the third cell, formatted like "by SMITH".
    sponsor_text = cells[2].get_text(strip=True)
    return sponsor_text.replace("by ", "").strip()
def fetch_and_extract_pdf_text(pdf_url, timeout=60):
    """Download a PDF and return its extracted text via pdfminer.six.

    Parameters
    ----------
    pdf_url : str
        Absolute URL of the PDF to fetch.
    timeout : float, optional
        Seconds to wait for the HTTP response (default 60).  The original
        code had no timeout, so one unresponsive server could hang the
        whole scrape.

    Returns
    -------
    str
        The extracted text, or "" on any download or parse failure
        (failures are printed, not raised, so one bad PDF does not abort
        the batch).
    """
    try:
        pdf_response = requests.get(pdf_url, timeout=timeout)
        pdf_response.raise_for_status()
        # pdfminer wants a file-like object; wrap the in-memory bytes.
        return extract_text(io.BytesIO(pdf_response.content))
    except requests.RequestException as e:
        print(f"Failed to download {pdf_url}: {e}")
        return ""
    except Exception as e:
        # NOTE(review): broad catch is deliberate best-effort here —
        # pdfminer raises a variety of exception types on malformed PDFs.
        print(f"Error parsing PDF {pdf_url}: {e}")
        return ""
def scrape_idaho_legislation(url):
    """Scrape the Idaho legislature session index page for bill metadata.

    Parameters
    ----------
    url : str
        The session legislation index page, e.g.
        "https://legislature.idaho.gov/sessioninfo/2025/legislation/".

    Returns
    -------
    list[list[str]]
        One row per bill: [bill_number, bill_title, status, detail_link,
        pdf_url].  detail_link is "" when the number cell has no anchor
        (the original raised TypeError on link_tag["href"] in that case;
        downstream code already handles "" links).

    Raises
    ------
    requests.HTTPError
        If the index page returns an error status.
    """
    response = requests.get(url)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    # First two mini-data-tables are page furniture, not bills — skip them.
    mini_tables = soup.find_all("table", class_="mini-data-table")[2:]

    results = []
    for table in mini_tables:
        bill_row = table.find("tr", id=lambda x: x and x.startswith("bill"))
        if not bill_row:
            continue
        cells = bill_row.find_all("td")
        if len(cells) < 4:
            continue

        bill_number = cells[0].get_text(strip=True)
        link_tag = cells[0].find("a")
        # Guard: some number cells may lack an anchor; original crashed here.
        detail_link = link_tag["href"] if link_tag else ""
        bill_title = cells[1].get_text(strip=True)
        status = cells[3].get_text(strip=True)
        # NOTE(review): PDF path hard-codes the 2025 session even though
        # `url` is a parameter — confirm before reusing for other sessions.
        pdf_url = f"https://legislature.idaho.gov/wp-content/uploads/sessioninfo/2025/legislation/{bill_number}.pdf"

        results.append([bill_number, bill_title, status, detail_link, pdf_url])
    return results
# --- Script entry: scrape the 2025 session, enrich each bill with its ---
# --- sponsor and full PDF text, then save everything to a CSV.        ---
url = "https://legislature.idaho.gov/sessioninfo/2025/legislation/"
bill_data = scrape_idaho_legislation(url)
bill_df = pd.DataFrame(
    bill_data,
    columns=["bill_number", "bill_title", "bill_status", "detail_link", "pdf_url"],
)

# One detail-page request per bill to look up its sponsor.
sponsors = []
for link in bill_df["detail_link"]:
    sponsor = parse_detail_page(link) if link else ""
    print(sponsor)
    sponsors.append(sponsor)
    time.sleep(0.1)  # crude politeness delay; TODO replace with a real rate limiter
bill_df["sponsor"] = sponsors

# One PDF download + text extraction per bill.
pdf_texts = []
for pdf_url in bill_df["pdf_url"]:
    print(len(pdf_texts))  # progress indicator
    print(pdf_url)
    text = fetch_and_extract_pdf_text(pdf_url)  # TODO add retry with backoff
    pdf_texts.append(text)  # TODO store file to disc after successful download
    time.sleep(0.1)  # TODO replace with rate limiter
bill_df["pdf_texts"] = pdf_texts

# Date-stamp the output name at run time.  The original hard-coded
# "idaho_bill_06_02_2025.csv", so later runs silently overwrote it with a
# stale date; this restores the intent of the commented-out helper above.
output_name = f"idaho_bills_{datetime.now().strftime('%m_%d_%Y')}.csv"
bill_df.to_csv(output_name, index=False)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment