Created
January 13, 2020 09:47
-
-
Save mluerig/3ec0a506afeb5a10439cff314f3786ee to your computer and use it in GitHub Desktop.
paperpile shared folder download pdf crawler
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Paperpile shared-folder PDF crawler.
#
# Scrapes a public Paperpile shared-folder page (and its numbered sub-pages)
# for anchor tags whose href contains "download", then downloads each linked
# PDF into `save_dir` as 1.pdf, 2.pdf, ...
#
# This is not a standalone script, i.e. needs to be run from some IDE.
import os
import re
import urllib.request  # FIX: original used urllib.request without importing it

from bs4 import BeautifulSoup

shared_folder_url = ""  # your shared folder url
save_dir = "D:\\Temp\\pdfs"

# --- Phase 1: collect all download links across the folder's pages ---------
all_urls = []
for page in ["", "/2", "/3", "/4"]:  # if you have multiple pages
    base_url = shared_folder_url + page
    req = urllib.request.urlopen(base_url)
    # FIX: original passed `resp.info()` but `resp` was never defined; the
    # response object is `req`. An explicit "html.parser" is given so bs4
    # does not guess (and warn about) a parser.
    soup = BeautifulSoup(
        req, "html.parser", from_encoding=req.info().get_param("charset")
    )
    # Paperpile download links are relative, so prepend the site origin.
    for link in soup.find_all("a", href=re.compile("download")):
        print(link["href"])
        all_urls.append("https://paperpile.com" + link["href"])

# --- Phase 2: download each PDF, numbered sequentially ---------------------
os.makedirs(save_dir, exist_ok=True)  # FIX: ensure target folder exists
idx = 1
for pdf_link in all_urls:
    print(pdf_link)
    try:
        urllib.request.urlretrieve(pdf_link, os.path.join(save_dir, str(idx) + ".pdf"))
        idx += 1  # only advance the counter on a successful download
    except Exception as ex:
        # Best-effort crawl: report the failure and continue with the next link.
        print(str(ex.__class__.__name__) + " - " + str(ex))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment