Download all PDF files from a website
import os
import requests
from urllib.parse import urljoin
from bs4 import BeautifulSoup
import argparse
#%% Examples
# one PDF
# python all_pdf_dl.py -l https://memento.epfl.ch/academic-calendar/ --save-here
# many PDFs
# python all_pdf_dl.py -l https://idsc.ethz.ch/education/lectures/recursive-estimation.html
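# custom save folder via -f (the path below is only an illustration)
# python all_pdf_dl.py -l https://idsc.ethz.ch/education/lectures/recursive-estimation.html -f ~/Documents/pdfs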
#%% TODO
# rewrite as a function [okay]
# add argparse [okay]
# print all file names
# set the save folder
# TODO: download all files
# update running examples [okay]
# TODO: merge files with the same name
# TODO: save files in a subfolder named after the webpage -> convert webpage names to underscore_case
# TODO: extend to other file types using a class
# TODO: download files whose names contain non-Latin characters (see sketch at end of file)
# TODO: add a tqdm progress bar (see sketch at end of file)
# TODO: add file selection
# TODO: add log-in support
# TODO: tidy up according to a style guide
#%% Functions
def all_pdf_download(args):
    """Download every PDF linked from args.link into the chosen folder."""
    base_url = args.link
    if args.save_here:
        folder_path = os.getcwd()
    else:
        folder_path = args.folder_path
        if not os.path.exists(folder_path):
            os.mkdir(folder_path)
    print("====== 1. Set savepath: {} ======".format(folder_path))
    print("====== 2. Start searching ======")
    # A custom User-Agent header helps avoid trivial bot blocking on some sites.
    response = requests.get(base_url, headers={'User-Agent': 'Custom'})
    soup = BeautifulSoup(response.text, "html.parser")
    # Select every anchor whose href ends with ".pdf".
    search_res = soup.select("a[href$='.pdf']")
    print("{} files found!!!".format(len(search_res)))
    print("====== 3. Start downloading ======")
    for counter, link in enumerate(search_res):
        # Name each PDF after the last portion of its link, which is unique here.
        filename = link['href'].split('/')[-1]
        file_save_path = os.path.join(folder_path, filename)
        if args.print_all:
            print("[{}/{}] {}".format(counter + 1, len(search_res), filename))
        # Note: .content buffers the whole file in memory before writing.
        with open(file_save_path, 'wb') as f:
            f.write(requests.get(urljoin(base_url, link['href'])).content)
    print("====== 4. Finished!!! ======")
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Download all PDF files from a website')
    ####################################
    ############ ALL OPTION ############
    ## Main option
    # -l/--link
    parser.add_argument('-l', '--link', required=True, type=str,
                        help='URL of the page to search for PDF links')
    # --print-all
    parser.add_argument('--print-all', dest='print_all', action='store_true',
                        help="print every file name")
    parser.set_defaults(print_all=True)  # printing is on by default
    # --save-here
    parser.add_argument('--save-here', dest='save_here', action='store_true',
                        help="save files in the current working directory")
    parser.set_defaults(save_here=False)
    # -f/--folder_path
    # default: Downloads/ in the user's home directory, via os.path.expanduser('~')
    parser.add_argument('-f', '--folder_path',
                        default=os.path.join(os.path.expanduser('~'), "Downloads"),
                        type=str, help='save files in the given folder')
    ########################################
    ############ PARSING OPTION ############
    args = parser.parse_args()
    all_pdf_download(args)
#%% Reference
# adapted from https://stackoverflow.com/questions/54616638/download-all-pdf-files-from-a-website-using-python
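#%% Sketches for two TODOs above (assumptions, not part of the original script)
# tqdm progress bar: assumes `pip install tqdm`; a hedged variant of the
# download loop in all_pdf_download, not the author's implementation.
def _download_with_progress(search_res, base_url, folder_path):
    """Same loop as all_pdf_download, wrapped in a tqdm progress bar."""
    from tqdm import tqdm
    for link in tqdm(search_res, desc="Downloading", unit="file"):
        filename = link['href'].split('/')[-1]
        with open(os.path.join(folder_path, filename), 'wb') as f:
            f.write(requests.get(urljoin(base_url, link['href'])).content)

# non-Latin file names: percent-encoded hrefs can be decoded with
# urllib.parse.unquote before saving.
def _decode_filename(href):
    """Return the last path segment of href with percent-encoding decoded."""
    from urllib.parse import unquote
    return unquote(href.split('/')[-1])  # 'docs/%C3%A9tude.pdf' -> 'étude.pdf'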