Generates PDFs from embedded images on Alexander Street Press online textbooks.
from bs4 import BeautifulSoup  # for parsing HTML
import os                      # for managing files
import sys                     # for cleaner stdout
import urllib.request          # for downloading images
from fpdf import FPDF          # for generating PDFs

# page content saved as HTML file
inputfile = 'MUS-17-Tricia-Rose-reading.html'
output = inputfile[:-5] + '.pdf'

# sets up BeautifulSoup HTML parser
body = open(inputfile, encoding='utf8').read()
soup = BeautifulSoup(body, 'html.parser')

# retrieves all images
directory = 'images/'
images = []
if not os.path.exists(directory):
    os.makedirs(directory)
for link in soup.find_all('a'):
    href = str(link.get('href'))
    if 'cdn.alexanderstreet.com' in href:
        filename = 'images/' + href[-24:-18] + href[-4:]
        sys.stdout.write("\rRetrieving image: " + href)
        sys.stdout.flush()
        urllib.request.urlretrieve(href, filename)
        images.append(filename)
print("\nRetrieved %s images." % len(images))

# writes PDF
print()
pdf = FPDF()
for image in images:
    pdf.add_page()
    pdf.image(image, 0, 0, 210, 297)
print("Writing PDF...")
pdf.output(output, 'F')
print("Successfully wrote PDF: " + output)

# cleans up
for image in images:
    os.remove(image)
os.rmdir(directory)
print("Cleaned up.")
Also, if anyone besides me ever uses this: urllib sometimes gets stuck on an image, so just give it a little nudge (read: abort that one retrieval by tapping CTRL-C). This shouldn't affect how the rest of the script runs.
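A hands-off alternative (not part of the gist) would be to set a global socket timeout and retry, so a stalled download aborts itself instead of waiting for a manual CTRL-C. A minimal sketch, assuming the same urllib.request.urlretrieve call as above; the helper name and the 30-second/3-attempt values are arbitrary:

import socket
import urllib.error
import urllib.request

socket.setdefaulttimeout(30)  # urlretrieve honors the default socket timeout, so a stalled transfer now raises

def retrieve_with_retry(href, filename, attempts=3):
    # hypothetical helper: retries a timed-out or failed download before giving up
    for attempt in range(1, attempts + 1):
        try:
            urllib.request.urlretrieve(href, filename)
            return
        except (socket.timeout, urllib.error.URLError):
            if attempt == attempts:
                raise
            print("\nTimed out, retrying (%d/%d): %s" % (attempt, attempts, href))

Swapping this in for the urllib.request.urlretrieve(href, filename) line inside the loop should make the nudging unnecessary.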
If I have to use this again, I'll add some sort of async image retrieval to speed up the process (but it was mostly a one-off since their UI is a garbage fire and I just wanted an offline copy anyway).
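For reference, a minimal sketch of what that could look like with a thread pool (concurrent.futures rather than true async I/O; it reuses the soup object and images/ directory from the script above, the filename slicing is copied verbatim, and the worker count is a guess):

from concurrent.futures import ThreadPoolExecutor
import urllib.request

def download(href):
    # same naming scheme as the main script
    filename = 'images/' + href[-24:-18] + href[-4:]
    urllib.request.urlretrieve(href, filename)
    return filename

hrefs = [str(link.get('href')) for link in soup.find_all('a')
         if 'cdn.alexanderstreet.com' in str(link.get('href'))]

with ThreadPoolExecutor(max_workers=8) as pool:
    images = list(pool.map(download, hrefs))

pool.map keeps the results in the same order as hrefs, so the PDF pages still come out in reading order even though the downloads finish out of order.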