Created
October 19, 2017 08:19
-
-
Save sumeet-bansal/6387768bbf5fc28b9af28e8db2395ba8 to your computer and use it in GitHub Desktop.
Generates PDFs from embedded images on Alexander Street Press online textbooks.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from bs4 import BeautifulSoup # for parsing HTML | |
import os # for managing files | |
import sys # for cleaner stdout | |
import urllib.request # for downloading images | |
from fpdf import FPDF # for generating PDFs | |
# page content saved as HTML file; the PDF is written under the same base name
inputfile = 'MUS-17-Tricia-Rose-reading.html'
output = os.path.splitext(inputfile)[0] + '.pdf'

# seconds to wait on one image before giving up, so a stalled connection
# cannot hang the whole run
DOWNLOAD_TIMEOUT = 30

# sets up BeautifulSoup HTML parser (file is closed as soon as it is read)
with open(inputfile, encoding='utf8') as html_file:
    body = html_file.read()
soup = BeautifulSoup(body, 'html.parser')

# scratch directory for the downloaded images
directory = 'images/'
images = []
os.makedirs(directory, exist_ok=True)

try:
    # retrieves all images
    for link in soup.find_all('a'):
        # anchors without an href yield None; treat them as "no match"
        href = str(link.get('href') or '')
        if 'cdn.alexanderstreet.com' not in href:
            continue

        # local name is built from fixed offsets at the end of the URL plus
        # its last four characters (presumably a page id and the file
        # extension -- confirm against a real CDN link)
        filename = os.path.join(directory, href[-24:-18] + href[-4:])
        sys.stdout.write("\rRetrieving image: " + href)
        sys.stdout.flush()

        # download fully into memory first so a failed or timed-out transfer
        # never leaves a truncated image on disk
        try:
            with urllib.request.urlopen(href, timeout=DOWNLOAD_TIMEOUT) as response:
                data = response.read()
        except OSError as err:  # URLError and socket timeouts are OSErrors
            print("\nSkipping image (" + str(err) + "): " + href)
            continue

        with open(filename, 'wb') as image_file:
            image_file.write(data)
        images.append(filename)
    print("\nRetrieved %s images." % len(images))

    # writes PDF: one page per image, stretched to 210 x 297 (A4 in mm,
    # FPDF's default page size and unit)
    print()
    pdf = FPDF()
    for image in images:
        pdf.add_page()
        pdf.image(image, 0, 0, 210, 297)
    print("Writing PDF...")
    pdf.output(output, 'F')
    print("Successfully wrote PDF: " + output)
finally:
    # cleans up, even when a download or the PDF write failed
    for image in set(images):  # set(): two links may map to one filename
        os.remove(image)
    try:
        os.rmdir(directory)
    except OSError:
        # directory held files this script did not create; leave it alone
        print("Left " + directory + " in place (not empty).")
    print("Cleaned up.")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Also, if anyone besides me ever uses this: `urllib` sometimes gets stuck on images, so just give it a little nudge (read: abort the retrieval by tapping `CTRL-C`)—this shouldn't affect how the rest of the script runs.