Skip to content

Instantly share code, notes, and snippets.

@hendriks73
Created November 12, 2021 10:08
Show Gist options
  • Save hendriks73/238f58b04b979d44c8b8b2e8d2fedb50 to your computer and use it in GitHub Desktop.
Save hendriks73/238f58b04b979d44c8b8b2e8d2fedb50 to your computer and use it in GitHub Desktop.
Download ISMIR 2021 papers and rename using conference metadata ("speaking" names)
#!/usr/bin/env python
# coding: utf-8
###########################################################################
# Download papers of ISMIR 2021 and save them under "speaking" file names
# derived from the JSON describing the conference.
#
# NOTE: Papers will be downloaded to your _current_ dir!!
#
# author: Hendrik Schreiber ([email protected])
###########################################################################
import json
import urllib.request
# adjust these to your needs/preferences
# max. number of (cleaned) title chars kept in a file name; longer titles are
# cut to this length and get a trailing '_'
MAX_TITLE_LEN = 25
# max. number of (cleaned) subject area chars kept in a file name; longer
# subject areas are cut to this length and get a trailing '_'
MAX_SUBJECT_AREA_LEN = 25
def parse_subject_areas(subject_area):
    """
    Parse a subject area string like "MIR tasks -> Indexing and querying" into
    a pair.

    The string is split at the first "->" and whitespace around both parts is
    removed, so "A -> B" and "A->B" both yield ("A", "B").

    :param subject_area: subject area string
    :return: pair of subject areas; the second item is '' if there is no "->"
    """
    # partition() splits at the first separator and drops exactly the
    # separator itself, so no character following the arrow gets lost
    # (a fixed "+3" offset would eat one when there is no blank after "->").
    primary, _, detail = subject_area.partition('->')
    return primary.strip(), detail.strip()
def lastname(author):
    """
    Extract last name, i.e. the last whitespace-separated word of the name.

    :param author: author (full name)
    :return: lastname; the name itself if it consists of a single word,
        '' if the name is empty or blank
    """
    # split() without arguments ignores leading/trailing whitespace, so the
    # result never carries stray blanks and single-word names do not raise.
    words = author.split()
    return words[-1] if words else ''
def transform_authors(authors):
    """
    Create some author representation: the last name of the first author
    followed by the first letter of every further author's last name.

    :param authors: list of authors
    :return: shortened author string, '' if there are no authors
    """
    if not authors:
        # nothing to abbreviate; avoids an IndexError on authors[0]
        return ''
    # [:1] instead of [0] so that an empty last name simply contributes
    # nothing rather than raising
    initials = ''.join(lastname(author)[:1] for author in authors[1:])
    return lastname(authors[0]) + initials
def transform_title(title):
    """
    Turn a raw title into a file name fragment of bounded length.

    The title is cleaned with fix_name(); if the result is longer than
    MAX_TITLE_LEN chars, it is cut to that length and '_' is appended.

    :param title: raw title
    :return: transformed title
    """
    cleaned = fix_name(title)
    if len(cleaned) <= MAX_TITLE_LEN:
        return cleaned
    return cleaned[:MAX_TITLE_LEN] + '_'
def fix_name(name):
    """
    Replace some unwanted chars: blanks and slashes become underscores,
    colons are dropped.

    :param name: name
    :return: fixed name
    """
    # one translation pass: ' ' and '/' map to '_', ':' is deleted
    unwanted = str.maketrans(' /', '__', ':')
    return name.translate(unwanted)
def transform_subject_area(subject_area_primary):
    """
    Parse and transform subject areas.

    If the parsed subject area has a detail part, that part is used,
    otherwise the general part. The chosen part is cleaned with fix_name()
    and, if longer than MAX_SUBJECT_AREA_LEN chars, cut to that length with
    '_' appended.

    :param subject_area_primary: subject area
    :return: transformed representation
    """
    general, detail = parse_subject_areas(subject_area_primary)
    chosen = fix_name(detail if detail else general)
    if len(chosen) <= MAX_SUBJECT_AREA_LEN:
        return chosen
    return chosen[:MAX_SUBJECT_AREA_LEN] + '_'
def transform_year(year):
    """
    Shorten a year to the last two chars of its string form (2021 -> '21').

    :param year: year
    :return: year string
    """
    as_text = str(year)
    return as_text[-2:]
def derive_file_name(paper):
    """
    Based on the JSON representation of a paper, create a file name of the
    form "<authors><year>-<subject>-<title>-ISMIR.pdf".

    :param paper: paper JSON object; its 'author', 'title', 'year' and
        'extra' entries are read
    :return: (hopefully) valid file name
    """
    authors, title, year = paper['author'], paper['title'], paper['year']
    # the subject area is optional within 'extra'
    subject_area_primary = paper['extra'].get('subject_area_primary', '')
    parts = {
        'subject': transform_subject_area(subject_area_primary),
        'title': transform_title(title),
        'authors': transform_authors(authors),
        'year': transform_year(year),
    }
    return '{authors}{year}-{subject}-{title}-ISMIR.pdf'.format(**parts)
def derive_all_file_names(accepted_papers):
    """
    Fetch the JSON list of accepted papers and compute a new file name for
    each of them.

    :param accepted_papers: accepted papers URL
    :return: dict of URL (the paper's 'ee' entry) to new file names
    """
    with urllib.request.urlopen(accepted_papers) as response:
        papers = json.load(response)
    names_by_url = {}
    for paper in papers:
        paper_url = paper['ee']
        names_by_url[paper_url] = derive_file_name(paper)
    return names_by_url
def download_and_rename(url, new_name):
    """
    Fetch the PDF behind a URL and write its bytes to a file with the given
    name. The URL is printed before the download starts.

    :param url: URL
    :param new_name: file name
    """
    print('Downloading paper from {}'.format(url))
    # ask for a PDF explicitly and send a browser-like user agent
    pdf_headers = {'Accept': 'application/pdf', 'User-Agent': 'mozilla'}
    pdf_request = urllib.request.Request(url, headers=pdf_headers)
    with urllib.request.urlopen(pdf_request) as response, \
            open(new_name, 'wb') as target:
        target.write(response.read())
# script entry point: look up all accepted papers, then download each one
# into the current directory under its derived name
if __name__ == '__main__':
    accepted_papers = 'https://raw.githubusercontent.com/ismir2021/ismir2021.github.io/gh-pages/_data/accepted.json'
    for url, name in derive_all_file_names(accepted_papers).items():
        download_and_rename(url, name)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment