Last active
July 19, 2019 00:52
-
-
Save IanCarrasco/db2cb29f52e3d34434d39209580e2849 to your computer and use it in GitHub Desktop.
Stanford CS224n Scraper
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from io import BytesIO
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from PyPDF2 import PdfFileMerger, PdfFileReader
# Directory listing that holds the CS224n reading PDFs.
ORIGIN = 'https://web.stanford.edu/class/cs224n/readings/'
# Path of the merged PDF written by main().
OUTFILE = 'cs224n_notes.pdf'
# Seconds to wait on each HTTP request before giving up.
REQUEST_TIMEOUT = 30


def select_pdf_links(hrefs, base_url):
    """Return absolute URLs for the hrefs that look like course PDFs.

    An href is kept when it contains both 'pdf' and 'cs'. Missing or
    empty hrefs (anchors without an ``href`` attribute) are skipped.
    Each kept href is resolved against *base_url* with ``urljoin`` so that
    relative, root-relative and absolute hrefs all yield a valid URL.
    """
    return [
        urljoin(base_url, href)
        for href in hrefs
        if href and 'pdf' in href and 'cs' in href
    ]


def download_pdf(url):
    """Download *url* and return it as an in-memory ``PdfFileReader``.

    Raises ``requests.HTTPError`` on a non-success status so that an error
    page is never handed to the PDF parser.
    """
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    return PdfFileReader(BytesIO(response.content))


def main():
    """Scrape the readings page and merge every linked PDF into OUTFILE."""
    # Parse the page with Beautiful Soup; name the parser explicitly so the
    # result does not depend on which optional parsers are installed.
    page = requests.get(ORIGIN, timeout=REQUEST_TIMEOUT)
    page.raise_for_status()
    soup = BeautifulSoup(page.text, 'html.parser')

    # Compile web links for all PDFs in the directory.
    hrefs = [anchor.get('href') for anchor in soup.find_all('a')]
    pdf_links = select_pdf_links(hrefs, ORIGIN)
    if not pdf_links:
        raise SystemExit('No PDF links found at ' + ORIGIN)

    # Download and merge the PDFs in page order, then export the result.
    merger = PdfFileMerger()
    try:
        for link in pdf_links:
            merger.append(download_pdf(link))
        with open(OUTFILE, 'wb') as fout:
            merger.write(fout)
    finally:
        # Release the merger's references to its input streams.
        merger.close()


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment