Created
November 12, 2021 10:08
-
-
Save hendriks73/238f58b04b979d44c8b8b2e8d2fedb50 to your computer and use it in GitHub Desktop.
Download ISMIR 2021 papers and rename using conference metadata ("speaking" names)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# coding: utf-8 | |
########################################################################### | |
# Download the papers of ISMIR 2021 and save them under "speaking" file names
# derived from the JSON describing the conference.
# | |
# NOTE: Papers will be downloaded to your _current_ directory!!
# | |
# author: Hendrik Schreiber ([email protected]) | |
########################################################################### | |
import json | |
import urllib.request | |
# Adjust these to your needs/preferences: the maximum number of characters
# kept from the sanitized title and subject area before they are truncated.
MAX_TITLE_LEN = 25
MAX_SUBJECT_AREA_LEN = 25
def parse_subject_areas(subject_area):
    """
    Parse a subject area string like "MIR tasks -> Indexing and querying" into
    a pair.

    The string is split at the first "->"; both halves are stripped of
    surrounding whitespace. Without a "->" the second element is ''.

    :param subject_area: subject area string
    :return: pair of subject areas (general area, detail)
    """
    # partition() splits exactly at the two-character separator, so nothing
    # is lost when the arrow is not followed by a space (e.g. "A->B").
    general, _, detail = subject_area.partition('->')
    return general.strip(), detail.strip()
def lastname(author):
    """
    Extract last name, i.e. the text after the last space of the name.

    Surrounding whitespace is ignored. A name without any space is returned
    as is (stripped); an empty or blank name yields ''.

    :param author: author
    :return: lastname
    """
    # Strip first and split the *stripped* string, so leading/trailing
    # whitespace can neither shift the cut position nor end up in the result.
    return author.strip().rsplit(' ', 1)[-1]
def transform_authors(authors):
    """
    Create some author representation.

    The result is the last name of the first author followed by the first
    letter of the last name of every further author. An empty author list
    yields ''.

    :param authors: list of authors
    :return: shortened author string
    """
    if not authors:
        return ''
    last_names = [lastname(author) for author in authors]
    # Skip co-authors whose last name came out empty instead of failing on [0].
    initials = ''.join(name[0] for name in last_names[1:] if name)
    return last_names[0] + initials
def transform_title(title):
    """
    Turn a raw paper title into a file-name fragment.

    The title is sanitized with fix_name(); if the result is longer than
    MAX_TITLE_LEN characters it is cut to that length and a '_' is appended.

    :param title: raw title
    :return: transformed title
    """
    cleaned = fix_name(title)
    if len(cleaned) <= MAX_TITLE_LEN:
        return cleaned
    return cleaned[:MAX_TITLE_LEN] + '_'
def fix_name(name):
    """
    Replace some unwanted chars.

    Spaces and slashes become underscores, colons are removed.

    :param name: name
    :return: fixed name
    """
    # ' ' -> '_', '/' -> '_', ':' -> deleted
    unwanted = str.maketrans(' /', '__', ':')
    return name.translate(unwanted)
def transform_subject_area(subject_area_primary):
    """
    Parse and transform subject areas.

    The string is split with parse_subject_areas(); the detail part is used
    when it is non-empty, otherwise the general part. The chosen part is
    sanitized with fix_name() and, if longer than MAX_SUBJECT_AREA_LEN
    characters, cut to that length with a '_' appended.

    :param subject_area_primary: subject area
    :return: transformed representation
    """
    general, detail = parse_subject_areas(subject_area_primary)
    chosen = fix_name(detail if detail else general)
    if len(chosen) > MAX_SUBJECT_AREA_LEN:
        return chosen[:MAX_SUBJECT_AREA_LEN] + '_'
    return chosen
def transform_year(year):
    """
    Transform year to a string holding (at most) its last two characters.

    :param year: year
    :return: year string
    """
    year_text = str(year)
    return year_text[-2:]
def derive_file_name(paper):
    """
    Based on the JSON representation of a paper, create a file name.

    Reads the 'author', 'title', 'year' and 'extra' entries of the paper;
    a missing 'subject_area_primary' inside 'extra' is treated as ''.

    :param paper: paper JSON object
    :return: (hopefully) valid file name
    """
    raw_authors, raw_title, raw_year = paper['author'], paper['title'], paper['year']
    raw_subject = paper['extra'].get('subject_area_primary', '')

    parts = {
        'subject': transform_subject_area(raw_subject),
        'title': transform_title(raw_title),
        'authors': transform_authors(raw_authors),
        'year': transform_year(raw_year),
    }
    return '{authors}{year}-{subject}-{title}-ISMIR.pdf'.format(**parts)
def derive_all_file_names(accepted_papers):
    """
    Get all new names for accepted papers.

    Fetches the JSON document at the given URL and maps each paper's 'ee'
    entry to the file name derived for that paper.

    :param accepted_papers: accepted papers URL
    :return: dict of URL to new file names
    """
    with urllib.request.urlopen(accepted_papers) as response:
        papers = json.loads(response.read())

    names_by_url = {}
    for paper in papers:
        paper_url = paper['ee']
        names_by_url[paper_url] = derive_file_name(paper)
    return names_by_url
def download_and_rename(url, new_name):
    """
    Download a PDF and store it under the given name.

    The request is sent with an 'Accept: application/pdf' header and a
    'mozilla' user agent; the whole response body is written to the file.

    :param url: URL
    :param new_name: file name
    """
    print('Downloading paper from {}'.format(url))
    headers = {'Accept': 'application/pdf', 'User-Agent': 'mozilla'}
    pdf_request = urllib.request.Request(url, headers=headers)
    with urllib.request.urlopen(pdf_request) as response, open(new_name, 'wb') as target:
        target.write(response.read())
# Script entry point: derive the file names for all accepted papers, then
# download each paper under its new name.
if __name__ == '__main__':
    accepted_papers = 'https://raw.githubusercontent.com/ismir2021/ismir2021.github.io/gh-pages/_data/accepted.json'
    for url, name in derive_all_file_names(accepted_papers).items():
        download_and_rename(url, name)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment