hendriks73 · November 12, 2021 10:08
diff --git a/download_rename.py b/download_rename.py
 #!/usr/bin/env python
 # coding: utf-8

 ###########################################################################
 #   Download the papers of ISMIR 2021 and save them under "speaking" file
 #   names derived from the JSON describing the conference.
 #
 #   NOTE: Papers will be downloaded to your _current_ dir!!
 #
 #   author: Hendrik Schreiber ([email protected])
 ###########################################################################

 import json
 import urllib.request

 # adjust these to your needs/preferences:
 # maximum number of characters kept from the title and from the subject
 # area in a file name; longer values are cut off and get a trailing '_'
 MAX_TITLE_LEN = 25
 MAX_SUBJECT_AREA_LEN = 25


 def parse_subject_areas(subject_area):
    """
    Split a subject area string like "MIR tasks -> Indexing and querying"
    into a (primary, detail) pair.

    The split happens at the first "->". Both parts are stripped of
    surrounding whitespace, so "A->B" and "A -> B" both yield ("A", "B").
    Without a "->" the detail part is the empty string.

    :param subject_area: subject area string
    :return: pair of subject areas
    """
    # partition() consumes exactly the two-character separator, so no
    # character is lost when "->" is not followed by a space
    primary, _, detail = subject_area.partition('->')
    return primary.strip(), detail.strip()


 def lastname(author):
    """
    Extract the last name, i.e. the text after the last space.

    Surrounding whitespace is ignored. A name that contains no space is
    returned as a whole instead of raising ValueError.

    :param author: author
    :return: lastname
    """
    # strip first, so the split position refers to the very string that is
    # split; rpartition() yields the whole name when there is no space
    return author.strip().rpartition(' ')[2]


 def transform_authors(authors):
    """
    Create a short author representation: the last name of the first
    author followed by the first letter of each co-author's last name.

    An empty author list yields an empty string.

    :param authors: list of authors
    :return: shortened author string
    """
    if not authors:
        return ''

    first = lastname(authors[0])
    # [:1] rather than [0], so an empty last name contributes nothing
    # instead of raising IndexError
    initials = ''.join(lastname(author)[:1] for author in authors[1:])
    return first + initials


 def transform_title(title):
    """
    Turn a raw paper title into a file name fragment.

    The title is passed through fix_name(). If the result is longer than
    MAX_TITLE_LEN characters, only the first MAX_TITLE_LEN characters are
    kept and a '_' is appended.

    :param title: raw title
    :return: transformed title
    """
    cleaned = fix_name(title)
    if len(cleaned) <= MAX_TITLE_LEN:
        return cleaned
    return cleaned[:MAX_TITLE_LEN] + '_'


 def fix_name(name):
    """
    Replace or drop characters that are unwanted in a file name.

    Spaces and slashes become underscores, colons are removed.

    :param name: name
    :return: fixed name
    """
    substitutions = str.maketrans({' ': '_', '/': '_', ':': None})
    return name.translate(substitutions)


 def transform_subject_area(subject_area_primary):
    """
    Turn a subject area string into a file name fragment.

    The string is split with parse_subject_areas(); the detail part is
    used when it is non-empty, otherwise the primary part. The chosen part
    is passed through fix_name() and, if longer than MAX_SUBJECT_AREA_LEN
    characters, cut to that length with a '_' appended.

    :param subject_area_primary: subject area
    :return: transformed representation
    """
    primary, detail = parse_subject_areas(subject_area_primary)
    chosen = fix_name(detail if detail else primary)

    if len(chosen) <= MAX_SUBJECT_AREA_LEN:
        return chosen
    return chosen[:MAX_SUBJECT_AREA_LEN] + '_'


 def transform_year(year):
    """
    Shorten a year to the last two characters of its string form,
    e.g. 2021 becomes "21".

    :param year: year
    :return: year string
    """
    year_text = str(year)
    last_two = year_text[-2:]
    return last_two


 def derive_file_name(paper):
    """
    Build the file name for a paper from its JSON representation.

    Reads the 'author', 'title' and 'year' entries of the paper and the
    optional 'subject_area_primary' entry of its 'extra' object (empty
    string when absent).

    :param paper: paper JSON object
    :return: (hopefully) valid file name
    """
    raw_authors, raw_title, raw_year = paper['author'], paper['title'], paper['year']
    raw_subject = paper['extra'].get('subject_area_primary', '')

    subject_part = transform_subject_area(raw_subject)
    title_part = transform_title(raw_title)
    authors_part = transform_authors(raw_authors)
    year_part = transform_year(raw_year)

    # layout: <authors><yy>-<subject>-<title>-ISMIR.pdf
    return '{}{}-{}-{}-ISMIR.pdf'.format(
        authors_part, year_part, subject_part, title_part
    )


 def derive_all_file_names(accepted_papers):
    """
    Fetch the JSON list of accepted papers and derive a file name for
    each of them.

    :param accepted_papers: accepted papers URL
    :return: dict of URL to new file names
    """
    with urllib.request.urlopen(accepted_papers) as response:
        papers = json.loads(response.read())

    names_by_url = {}
    for entry in papers:
        # the 'ee' entry of a paper serves as the key (its URL)
        paper_url = entry['ee']
        names_by_url[paper_url] = derive_file_name(entry)
    return names_by_url


 def download_and_rename(url, new_name):
    """
    Fetch a PDF and write it to a file with the given name.

    The whole response body is read and written in one go; the target
    file is opened in binary write mode.

    :param url: URL
    :param new_name: file name
    """
    print('Downloading paper from {}'.format(url))
    headers = {'Accept': 'application/pdf', 'User-Agent': 'mozilla'}
    pdf_request = urllib.request.Request(url, headers=headers)
    with urllib.request.urlopen(pdf_request) as response, open(new_name, 'wb') as target:
        target.write(response.read())


 # script entry point: derive all file names first, then fetch paper by paper
 if __name__ == '__main__':
    accepted_papers = 'https://raw.githubusercontent.com/ismir2021/ismir2021.github.io/gh-pages/_data/accepted.json'
    for url, name in derive_all_file_names(accepted_papers).items():
        download_and_rename(url, name)
	#!/usr/bin/env python
	# coding: utf-8

	###########################################################################
	# Download the papers of ISMIR 2021 and save them under "speaking" file
	# names derived from the JSON describing the conference.
	#
	# NOTE: Papers will be downloaded to your _current_ dir!!
	#
	# author: Hendrik Schreiber ([email protected])
	###########################################################################

	import json
	import urllib.request

	# adjust these to your needs/preferences:
	# maximum number of characters kept from the title and from the subject
	# area in a file name; longer values are cut off and get a trailing '_'
	MAX_TITLE_LEN = 25
	MAX_SUBJECT_AREA_LEN = 25


	def parse_subject_areas(subject_area):
	    """
	    Split a subject area string like "MIR tasks -> Indexing and querying"
	    into a (primary, detail) pair.

	    The split happens at the first "->". Both parts are stripped of
	    surrounding whitespace, so "A->B" and "A -> B" both yield ("A", "B").
	    Without a "->" the detail part is the empty string.

	    :param subject_area: subject area string
	    :return: pair of subject areas
	    """
	    # partition() consumes exactly the two-character separator, so no
	    # character is lost when "->" is not followed by a space
	    primary, _, detail = subject_area.partition('->')
	    return primary.strip(), detail.strip()


	def lastname(author):
	    """
	    Extract the last name, i.e. the text after the last space.

	    Surrounding whitespace is ignored. A name that contains no space is
	    returned as a whole instead of raising ValueError.

	    :param author: author
	    :return: lastname
	    """
	    # strip first, so the split position refers to the very string that is
	    # split; rpartition() yields the whole name when there is no space
	    return author.strip().rpartition(' ')[2]


	def transform_authors(authors):
	    """
	    Create a short author representation: the last name of the first
	    author followed by the first letter of each co-author's last name.

	    An empty author list yields an empty string.

	    :param authors: list of authors
	    :return: shortened author string
	    """
	    if not authors:
	        return ''

	    first = lastname(authors[0])
	    # [:1] rather than [0], so an empty last name contributes nothing
	    # instead of raising IndexError
	    initials = ''.join(lastname(author)[:1] for author in authors[1:])
	    return first + initials


	def transform_title(title):
	    """
	    Turn a raw paper title into a file name fragment.

	    The title is passed through fix_name(). If the result is longer than
	    MAX_TITLE_LEN characters, only the first MAX_TITLE_LEN characters are
	    kept and a '_' is appended.

	    :param title: raw title
	    :return: transformed title
	    """
	    title = fix_name(title)

	    if len(title) > MAX_TITLE_LEN:
	        title = title[:MAX_TITLE_LEN] + '_'

	    return title


	def fix_name(name):
	    """
	    Replace or drop characters that are unwanted in a file name.

	    Spaces and slashes become underscores, colons are removed.

	    :param name: name
	    :return: fixed name
	    """
	    return name.replace(' ', '_').replace('/', '_').replace(':', '')


	def transform_subject_area(subject_area_primary):
	    """
	    Turn a subject area string into a file name fragment.

	    The string is split with parse_subject_areas(); the detail part is
	    used when it is non-empty, otherwise the primary part. The chosen part
	    is passed through fix_name() and, if longer than MAX_SUBJECT_AREA_LEN
	    characters, cut to that length with a '_' appended.

	    :param subject_area_primary: subject area
	    :return: transformed representation
	    """
	    subject_area, subject_area_detail = parse_subject_areas(subject_area_primary)
	    if subject_area_detail:
	        subject_area = subject_area_detail

	    subject_area = fix_name(subject_area)

	    if len(subject_area) > MAX_SUBJECT_AREA_LEN:
	        subject_area = subject_area[:MAX_SUBJECT_AREA_LEN] + '_'

	    return subject_area


	def transform_year(year):
	    """
	    Shorten a year to the last two characters of its string form,
	    e.g. 2021 becomes "21".

	    :param year: year
	    :return: year string
	    """
	    return str(year)[-2:]


	def derive_file_name(paper):
	    """
	    Build the file name for a paper from its JSON representation.

	    Reads the 'author', 'title' and 'year' entries of the paper and the
	    optional 'subject_area_primary' entry of its 'extra' object (empty
	    string when absent).

	    :param paper: paper JSON object
	    :return: (hopefully) valid file name
	    """
	    authors = paper['author']
	    title = paper['title']
	    year = paper['year']
	    subject_area_primary = paper['extra'].get('subject_area_primary', '')

	    t_subject = transform_subject_area(subject_area_primary)
	    t_title = transform_title(title)
	    t_authors = transform_authors(authors)
	    t_year = transform_year(year)

	    # layout: <authors><yy>-<subject>-<title>-ISMIR.pdf
	    return '{authors}{year}-{subject}-{title}-ISMIR.pdf'.format(
	        authors=t_authors,
	        year=t_year,
	        subject=t_subject,
	        title=t_title
	    )


	def derive_all_file_names(accepted_papers):
	    """
	    Fetch the JSON list of accepted papers and derive a file name for
	    each of them.

	    :param accepted_papers: accepted papers URL
	    :return: dict of URL to new file names
	    """
	    with urllib.request.urlopen(accepted_papers) as f:
	        data = json.loads(f.read())
	    # the 'ee' entry of a paper serves as the key (its URL)
	    return {paper['ee']: derive_file_name(paper) for paper in data}


	def download_and_rename(url, new_name):
	    """
	    Fetch a PDF and write it to a file with the given name.

	    The whole response body is read and written in one go; the target
	    file is opened in binary write mode.

	    :param url: URL
	    :param new_name: file name
	    """
	    print('Downloading paper from {}'.format(url))
	    request = urllib.request.Request(url, headers={'Accept': 'application/pdf', 'User-Agent': 'mozilla'})
	    with urllib.request.urlopen(request) as in_f:
	        with open(new_name, 'wb') as out_f:
	            out_f.write(in_f.read())


	# script entry point: derive all file names first, then fetch paper by paper
	if __name__ == '__main__':
	    accepted_papers = 'https://raw.githubusercontent.com/ismir2021/ismir2021.github.io/gh-pages/_data/accepted.json'
	    url_to_names = derive_all_file_names(accepted_papers)
	    for url, name in url_to_names.items():
	        download_and_rename(url, name)