rlskoeser · May 11, 2021 20:02
diff --git a/genizabibliography_sources_ris2csv.py b/genizabibliography_sources_ris2csv.py
 # pip install pandas rispy
 import pandas as pd
 import rispy

 # download RIS file: https://www.repository.cam.ac.uk/handle/1810/256117

 # parse the RIS export into a list of per-record dicts
 with open('genizahbibliography20160203.txt') as bibfile:
     entries = rispy.load(bibfile)

 print('Loaded %d entries' % len(entries))
 # one dataframe row per RIS record
 df = pd.DataFrame(data=entries)

 # flatten each record's author list into a single '; '-delimited string;
 # a record without any author shows up as NaN instead of a list, so give
 # it an empty string rather than letting str.join raise a TypeError
 df['author_list'] = df.authors.apply(
     lambda names: '; '.join(names) if isinstance(names, (list, tuple)) else ''
 )

 # create copy dataframe, limited to fields consistent per source
 source_df = df[['author_list', 'year', 'short_title', 'title', 'volume', 'place_published']].copy()

 # drop duplicates to get the set of unique sources
 uniq_sources = source_df.drop_duplicates().copy()
 print('%d unique sources' % len(uniq_sources))

 # get a list of shelfmarks from the original dataframe for each source
 def shelfmarks_for_source(row, frame=None):
     """Return the distinct shelfmarks recorded for one source, joined by '; '.

     A record belongs to the source when its ``author_list``, ``title`` and
     ``year`` all agree with ``row``.  A value that is missing on both sides
     counts as agreement: NaN never compares equal to itself, so a plain
     ``==`` would leave every source lacking a year or title without any
     shelfmarks.

     :param row: one source row providing ``author_list``, ``title`` and
         ``year``
     :param frame: dataframe of records to search; it needs those three
         columns plus ``label``.  Defaults to the module-level ``df``.
     :returns: the unique non-missing ``label`` values of the matching
         records joined with '; ' (an empty string when there are none)
     """
     records = df if frame is None else frame

     def same_value(column):
         # null-aware equality between a column and the row's own value
         wanted = row[column]
         if pd.isna(wanted):
             return records[column].isna()
         return records[column] == wanted

     matching = records[same_value('author_list') & same_value('title') & same_value('year')]
     # missing labels are skipped: str.join only accepts strings
     return '; '.join(matching['label'].dropna().unique())

 print('Aggregating shelfmarks ...')
 # call shelfmarks_for_source once per unique source row and keep the
 # result in a new 'shelfmarks' column
 shelfmark_column = uniq_sources.apply(shelfmarks_for_source, axis=1)
 uniq_sources['shelfmarks'] = shelfmark_column

 # write the CSV (no index column) with the joined author string
 # published under the header 'authors'
 export_df = uniq_sources.rename(columns={'author_list': 'authors'})
 export_df.to_csv('genizahbibliography20160203_sources_shelfmarksv2.csv', index=False)
	# pip install pandas rispy
	import pandas as pd
	import rispy

	# download RIS file: https://www.repository.cam.ac.uk/handle/1810/256117

	# parse the RIS export into a list of per-record dicts
	with open('genizahbibliography20160203.txt') as bibfile:
	    entries = rispy.load(bibfile)

	print('Loaded %d entries' % len(entries))
	# one dataframe row per RIS record
	df = pd.DataFrame(data=entries)

	# flatten each record's author list into a single '; '-delimited string;
	# a record without any author shows up as NaN instead of a list, so give
	# it an empty string rather than letting str.join raise a TypeError
	df['author_list'] = df.authors.apply(
	    lambda names: '; '.join(names) if isinstance(names, (list, tuple)) else ''
	)

	# create copy dataframe, limited to fields consistent per source
	source_df = df[['author_list', 'year', 'short_title', 'title', 'volume', 'place_published']].copy()

	# drop duplicates to get the set of unique sources
	uniq_sources = source_df.drop_duplicates().copy()
	print('%d unique sources' % len(uniq_sources))

	# get a list of shelfmarks from the original dataframe for each source
	def shelfmarks_for_source(row, frame=None):
	    """Return the distinct shelfmarks recorded for one source, joined by '; '.

	    A record belongs to the source when its ``author_list``, ``title`` and
	    ``year`` all agree with ``row``.  A value that is missing on both sides
	    counts as agreement: NaN never compares equal to itself, so a plain
	    ``==`` would leave every source lacking a year or title without any
	    shelfmarks.

	    :param row: one source row providing ``author_list``, ``title`` and
	        ``year``
	    :param frame: dataframe of records to search; it needs those three
	        columns plus ``label``.  Defaults to the module-level ``df``.
	    :returns: the unique non-missing ``label`` values of the matching
	        records joined with '; ' (an empty string when there are none)
	    """
	    records = df if frame is None else frame

	    def same_value(column):
	        # null-aware equality between a column and the row's own value
	        wanted = row[column]
	        if pd.isna(wanted):
	            return records[column].isna()
	        return records[column] == wanted

	    matching = records[same_value('author_list') & same_value('title') & same_value('year')]
	    # missing labels are skipped: str.join only accepts strings
	    return '; '.join(matching['label'].dropna().unique())

	print('Aggregating shelfmarks ...')
	# call shelfmarks_for_source once per unique source row and keep the
	# result in a new 'shelfmarks' column
	shelfmark_column = uniq_sources.apply(shelfmarks_for_source, axis=1)
	uniq_sources['shelfmarks'] = shelfmark_column

	# write the CSV (no index column) with the joined author string
	# published under the header 'authors'
	export_df = uniq_sources.rename(columns={'author_list': 'authors'})
	export_df.to_csv('genizahbibliography20160203_sources_shelfmarksv2.csv', index=False)