@danielrmeyer
Last active February 6, 2025 20:51
Scrape Idaho Bills
# To use, install a recent version of Python 3 along with the following packages:
#   pip install requests beautifulsoup4 pandas pdfminer.six
import io
import time
from datetime import datetime

import requests
import pandas as pd
from bs4 import BeautifulSoup
from pdfminer.high_level import extract_text

# def generate_bills_filename():
#     current_date = datetime.now().strftime("%m_%d_%Y")
#     return f"idaho_bills_{current_date}.csv"


def write_soup_to_file(soup, filename):
    """Dump prettified HTML to disk -- handy for inspecting a page while debugging selectors."""
    with open(filename, "w", encoding="utf-8") as f:
        f.write(soup.prettify())
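
# Example usage while tuning selectors (illustrative; the script itself never
# calls this helper):
#     write_soup_to_file(soup, "page_dump.html")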


def parse_detail_page(detail_url):
    """Fetch a bill's detail page and return its sponsor string."""
    base_url = "https://legislature.idaho.gov"
    full_url = base_url + detail_url
    resp = requests.get(full_url)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")
    bill_table = soup.find("table", class_="bill-table")
    if bill_table is None:
        return ""
    row = bill_table.find("tr")
    cells = row.find_all("td")
    sponsor_text = cells[2].get_text(strip=True)
    # The cell text starts with "by ", e.g. "by EDUCATION COMMITTEE"; drop it.
    sponsor = sponsor_text.replace("by ", "").strip()
    return sponsor


def fetch_and_extract_pdf_text(pdf_url):
    """
    Given a PDF URL, fetch the PDF and extract its text using pdfminer.six.
    Returns the extracted text as a string, or "" on failure.
    """
    try:
        pdf_response = requests.get(pdf_url)
        pdf_response.raise_for_status()
        # Use pdfminer to extract text straight from the in-memory bytes
        pdf_text = extract_text(io.BytesIO(pdf_response.content))
        return pdf_text
    except requests.RequestException as e:
        print(f"Failed to download {pdf_url}: {e}")
        return ""
    except Exception as e:
        print(f"Error parsing PDF {pdf_url}: {e}")
        return ""


def scrape_idaho_legislation(url):
    response = requests.get(url)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    # Skip the first two mini-data tables; the bill rows live in the later ones.
    mini_tables = soup.find_all("table", class_="mini-data-table")[2:]
    results = []
    for table in mini_tables:
        bill_row = table.find("tr", id=lambda x: x and x.startswith("bill"))
        if not bill_row:
            continue
        cells = bill_row.find_all("td")
        if len(cells) < 4:
            continue
        bill_number = cells[0].get_text(strip=True)
        link_tag = cells[0].find("a")
        detail_link = link_tag["href"] if link_tag else ""
        bill_title = cells[1].get_text(strip=True)
        pdf_url = f"https://legislature.idaho.gov/wp-content/uploads/sessioninfo/2025/legislation/{bill_number}.pdf"
        status = cells[3].get_text(strip=True)
        results.append([bill_number, bill_title, status, detail_link, pdf_url])
    return results
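

# A hedged sketch for the rate-limit TODOs in the loops below: a minimal
# fixed-interval limiter. The class name and the 0.5 s default interval are
# illustrative assumptions, not part of the original gist.
class RateLimiter:
    """Block until at least `interval` seconds have passed since the last call."""

    def __init__(self, interval=0.5):
        self.interval = interval
        self._last = 0.0

    def wait(self):
        elapsed = time.monotonic() - self._last
        if elapsed < self.interval:
            time.sleep(self.interval - elapsed)
        self._last = time.monotonic()


# Usage would be: limiter = RateLimiter(), then limiter.wait() before each request.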


url = "https://legislature.idaho.gov/sessioninfo/2025/legislation/"
bill_data = scrape_idaho_legislation(url)

bill_df = pd.DataFrame(
    bill_data,
    columns=["bill_number", "bill_title", "bill_status", "detail_link", "pdf_url"],
)

sponsors = []
for link in bill_df["detail_link"]:
    sponsor = parse_detail_page(link) if link else ""
    print(sponsor)
    sponsors.append(sponsor)
    time.sleep(0.1)  # TODO: rate limit properly (see RateLimiter sketch above)
bill_df["sponsor"] = sponsors

pdf_texts = []
for pdf_url in bill_df["pdf_url"]:
    print(len(pdf_texts))
    print(pdf_url)
    text = fetch_and_extract_pdf_text(pdf_url)  # TODO: add retry with backoff (see fetch_pdf_with_retry sketch above)
    pdf_texts.append(text)  # TODO: store the file to disk after a successful download (see caching sketch below)
    time.sleep(0.1)  # TODO: replace with a rate limiter
bill_df["pdf_texts"] = pdf_texts
bill_df.to_csv("idaho_bill_06_02_2025.csv", index=False)
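

# A minimal caching sketch for the "store file to disk" TODO above: save each
# PDF locally so interrupted runs can resume without re-downloading. The
# cache_dir default and the helper name fetch_pdf_text_cached are illustrative
# assumptions, not part of the original gist.
import os


def fetch_pdf_text_cached(pdf_url, cache_dir="pdf_cache"):
    """Return the PDF's text, downloading and caching the file only if needed."""
    os.makedirs(cache_dir, exist_ok=True)
    path = os.path.join(cache_dir, pdf_url.rsplit("/", 1)[-1])
    if not os.path.exists(path):
        resp = requests.get(pdf_url)
        resp.raise_for_status()
        with open(path, "wb") as f:
            f.write(resp.content)
    with open(path, "rb") as f:
        return extract_text(f)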