Last active
November 25, 2022 00:47
-
-
Save mstevenson/8794ae4e7b23d7f5181e69bb3c9b6756 to your computer and use it in GitHub Desktop.
Download all images from an Internet Archive collection and write the caption to a text file
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import internetarchive as ia | |
from pathlib import Path | |
import argparse | |
import time | |
# Session configuration handed to every internetarchive call in this script
# via ``config=config``.
# NOTE(review): ``secure=False`` presumably makes the library use plain HTTP
# instead of HTTPS -- confirm against the internetarchive configuration docs.
config = {'general': {'secure': False}}
def download_collection(collection_name, output_dir):
    """Download every item found in an Internet Archive collection.

    Runs the search query ``collection:<collection_name>``, creates the
    directory ``<output_dir>/<collection_name>`` and hands each search
    result's identifier to ``download_item`` together with that directory.

    Args:
        collection_name: Collection identifier; used in the search query and
            as the name of the sub-directory that receives the files.
        output_dir: Directory under which the collection directory is
            created. It is created too if it does not exist yet.
    """
    search = ia.search_items(f'collection:{collection_name}', config=config)
    target_dir = Path(output_dir) / collection_name
    # parents=True: without it mkdir raises FileNotFoundError whenever
    # output_dir itself has not been created beforehand.
    target_dir.mkdir(parents=True, exist_ok=True)
    for result in search:
        download_item(result['identifier'], target_dir)
        # Short pause between items so requests are not fired back-to-back.
        time.sleep(0.1)
def download_item(item_id, dir):
    """Download the first JPEG of an item and write its caption next to it.

    The caption file has the image's name with a ``.txt`` suffix and contains
    the item's description, or its title when the description is missing or
    is a list. Items whose title is a list are skipped entirely. Processing
    stops after the first file whose format is ``JPEG``; if the item has no
    such file a message is printed instead.

    Args:
        item_id: Internet Archive item identifier.
        dir: ``pathlib.Path`` of the directory that receives the image and
            the caption file (the name shadows the builtin but is kept for
            interface compatibility).
    """
    item = ia.get_item(item_id, config=config)
    meta = item.metadata
    # .get instead of [] so an item without a title no longer raises KeyError
    # and aborts the whole collection run.
    title = meta.get('title')
    if isinstance(title, list):  # not sure why a list is sometimes returned, so just move on
        return
    if title is None:
        # No title available: fall back to the identifier for the caption.
        title = item_id
    description = meta.get('description', None)
    formats = ['JPEG']
    for file in item.files:
        # File entries without a 'format' key are skipped rather than crashing.
        if file.get('format') not in formats:
            continue
        filename = file['name']
        try:
            item.download(
                files=filename,
                formats=file['format'],
                destdir=dir,
                no_directory=True,
                verbose=True,
                ignore_existing=True,
                retries=2,
            )
        except Exception as e:
            # Best effort: report the failure and give up on this item only.
            print(f'Download failed: {e}')
            return
        caption = dir / Path(filename).with_suffix('.txt')
        with open(caption, 'w', encoding='utf-8') as f:
            if description and not isinstance(description, list):
                f.write(description)
            else:
                f.write(title)
        # Only the first matching image per item is handled.
        return
    print(f'no image for {item_id}')
if __name__ == '__main__':
    # Command-line entry point: takes the collection name and the output
    # directory as positional arguments, then starts the download.
    cli = argparse.ArgumentParser()
    positional_args = (
        ('collection', 'Collection name'),
        ('output_dir', 'Output directory'),
    )
    for arg_name, arg_help in positional_args:
        cli.add_argument(arg_name, help=arg_help)
    args = cli.parse_args()
    print(f'Downloading collection {args.collection} to {args.output_dir}')
    download_collection(args.collection, args.output_dir)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment