Generates PDFs from embedded images on Alexander Street Press online textbooks.
from bs4 import BeautifulSoup  # for parsing HTML
import os                      # for managing files
import sys                     # for cleaner stdout
import urllib.request          # for downloading images
from fpdf import FPDF          # for generating PDFs

# page content saved as HTML file
inputfile = 'MUS-17-Tricia-Rose-reading.html'
output = inputfile[:-5] + '.pdf'

# sets up BeautifulSoup HTML parser
body = open(inputfile, encoding='utf8').read()
soup = BeautifulSoup(body, 'html.parser')

# retrieves all images
directory = 'images/'
images = []
if not os.path.exists(directory):
    os.makedirs(directory)
for link in soup.find_all('a'):
    href = str(link.get('href'))
    if 'cdn.alexanderstreet.com' in href:
        filename = 'images/' + href[-24:-18] + href[-4:]
        sys.stdout.write("\rRetrieving image: " + href)
        sys.stdout.flush()
        urllib.request.urlretrieve(href, filename)
        images.append(filename)
print("\nRetrieved %s images." % len(images))

# writes PDF
print()
pdf = FPDF()
for image in images:
    pdf.add_page()
    pdf.image(image, 0, 0, 210, 297)
print("Writing PDF...")
pdf.output(output, 'F')
print("Successfully wrote PDF: " + output)

# cleans up
for image in images:
    os.remove(image)
os.rmdir(directory)
print("Cleaned up.")
Also, if anyone besides me ever uses this: urllib sometimes gets stuck on an image, so just give it a little nudge (read: abort that one retrieval by tapping CTRL-C). This shouldn't affect how the rest of the script runs.
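A hands-off alternative (not part of the gist) would be to set a global socket timeout and retry, so a stalled download aborts itself instead of waiting for a manual CTRL-C. A minimal sketch, assuming the same urllib.request.urlretrieve call as above; the helper name and the 30-second/3-attempt values are arbitrary:

import socket
import urllib.error
import urllib.request

socket.setdefaulttimeout(30)  # urlretrieve honors the default socket timeout, so a stalled transfer now raises

def retrieve_with_retry(href, filename, attempts=3):
    # hypothetical helper: retries a timed-out or failed download before giving up
    for attempt in range(1, attempts + 1):
        try:
            urllib.request.urlretrieve(href, filename)
            return
        except (socket.timeout, urllib.error.URLError):
            if attempt == attempts:
                raise
            print("\nTimed out, retrying (%d/%d): %s" % (attempt, attempts, href))

Swapping this in for the urllib.request.urlretrieve(href, filename) line inside the loop should make the nudging unnecessary.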
If I have to use this again, I'll add some sort of async image retrieval to speed up the process (but it was mostly a one-off since their UI is a garbage fire and I just wanted an offline copy anyway).
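For reference, a minimal sketch of what that could look like with a thread pool (concurrent.futures rather than true async I/O; it reuses the soup object and images/ directory from the script above, the filename slicing is copied verbatim, and the worker count is a guess):

from concurrent.futures import ThreadPoolExecutor
import urllib.request

def download(href):
    # same naming scheme as the main script
    filename = 'images/' + href[-24:-18] + href[-4:]
    urllib.request.urlretrieve(href, filename)
    return filename

hrefs = [str(link.get('href')) for link in soup.find_all('a')
         if 'cdn.alexanderstreet.com' in str(link.get('href'))]

with ThreadPoolExecutor(max_workers=8) as pool:
    images = list(pool.map(download, hrefs))

pool.map keeps the results in the same order as hrefs, so the PDF pages still come out in reading order even though the downloads finish out of order.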