Python scripts for downloading Leipzig Corpora Languages
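
A quick sketch of how the two classes below fit together (the language code and the resulting archive name are illustrative; the exact archive returned depends on what the Wortschatz service currently lists): LeipzigResourceFinder queries the download service for the archives available for a language, find_one picks the closest match, and Leipzig.download_sentences fetches that archive and returns its sentences.

    finder = LeipzigResourceFinder('fra')
    resource = finder.find_one(size='10K', typ='wikipedia', year='2016')
    # resource is something like 'fra_wikipedia_2016_10K.tar.gz'
    sentences = Leipzig.download_sentences(f'{Leipzig.DOWNLOAD_URL}/{resource}')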
import os
import tarfile
from html.parser import HTMLParser
from io import BytesIO

import requests
class LeipzigResourceFinder(HTMLParser):
    """Find available Leipzig resources for a given language."""

    def __init__(self, code, variant=None):
        """
        Fetch all resources for a given language. Example:

        .. code::

            lcr = LeipzigResourceFinder(code='fra')
            lcr.data[0]   # 'fra_mixed_2009_10K.tar.gz'
            lcr.data[-1]  # 'fra_wikipedia_2010_1M.tar.gz'
            lcr.find_one(size='10K', typ='news', year=2009)  # 'fra_news_2009_10K.tar.gz'
            lcr.find_one(size='10K', typ='news', year=2014)  # 'fra_news_2009_10K.tar.gz' (closest match)

        :param code: the language code, e.g. 'eng' or 'ban'
        :param variant: an optional language variant, e.g. 'ca' for 'eng-ca', 'eu' for 'eng-eu', etc.
        """
        super().__init__()
        # store the language code and the optional variant
        self.code = code
        self.variant = variant
        # list of all resources, plus an index of resources by type, year and size
        self.data = []
        self.index = {k: {} for k in ['type', 'year', 'size']}
        # fetch the resource table from the Wortschatz download service and parse it
        # (handle_starttag collects the download links)
        res = requests.post(
            'http://wortschatz.uni-leipzig.de/download/service',
            data=dict(corpora_language=code, func='table'),
            headers={'X-Requested-With': 'XMLHttpRequest'})
        self.feed(res.text)
    def find_one(self, size='10K', typ='wikipedia', year='2016'):
        """
        Get one resource, giving priority first to size, then resource type and finally year.
        If no exact match is available, the constraints are relaxed in reverse priority order
        (year first, then type, then size) until something matches.

        :param size: one of 10K, 30K, 100K, 1M, 3M
        :param typ: one of news, news-typical, wikipedia, web, mixed
        :param year: a year, e.g. 2016
        :return: a resource in the form `<lang/variant>_<type>_<year>_<size>.tar.gz`, or None (rare!)
        """
        if len(self.data) == 0:
            return None
        # first, try to find an exact match on all three criteria
        sets = [set(self.index[k].get(v, [])) for k, v in zip(['size', 'type', 'year'], (size, typ, year))]
        while len(sets) > 0:
            matches = set.intersection(*sets)
            if len(matches) > 0:
                return matches.pop()
            sets.pop()  # relax the least important constraint (year, then type, then size)
        # fall back to anything of the requested size (may happen if only variants are available)
        for i in range(1, len(self.data) + 1):
            if size in self.data[-i]:
                return self.data[-i]
        # last resort: return anything
        return self.data[-1]
    def handle_starttag(self, tag, attrs):
        # HTMLParser override: collect the corpora download links
        if tag == 'a':
            attrs = dict(attrs)
            cls = attrs.get('class', '')
            if 'link_corpora_download' in cls and 'data-corpora-file' in attrs:
                self._process_resource(attrs.get('data-corpora-file'))

    def _process_resource(self, res):
        self.data.append(res)
        split = res.split('.')[0].split('_')
        if len(split) != 4:  # might happen on some rare occasions, such as Serbo-Croatian hbs_ba_web_2015_30K
            return
        lang, typ, year, size = split
        if self.variant is None:
            if lang != self.code:
                return
        else:
            if lang != self.variant:
                return
        for k, v in zip(['type', 'year', 'size'], [typ, year, size]):
            if v not in self.index[k]:
                self.index[k][v] = []
            self.index[k][v].append(res)
class Leipzig:
    DOWNLOAD_URL = 'http://pcai056.informatik.uni-leipzig.de/downloads/corpora'
    _gist_meta_url = 'https://gist.githubusercontent.com/derlin/917a64e6412de6c503f3f52e0878f919/raw/leipzig_meta.json'

    def __init__(self, version=None):
        url = self._gist_meta_url
        if version is not None:
            url = url.replace('raw', f'raw/{version}')
        try:
            leipzig_meta = requests.get(url).json()
        except Exception as e:
            raise Exception(f'Could not fetch meta from {url}.') from e
        self.code2lang = leipzig_meta['languages']
        self.lang2code = {v: k for k, v in self.code2lang.items()}
    @staticmethod
    def download_sentences(url):
        """Download sentences from a Leipzig resource URL."""
        # get the tar archive
        res = requests.get(url)
        # extract the sentences file from the archive
        tar = tarfile.open(mode='r:gz', fileobj=BytesIO(res.content))
        tar_info = [member for member in tar.getmembers() if member.name.endswith('sentences.txt')][0]
        handle = tar.extractfile(tar_info)
        # read the sentences file: each line is '<index>\t<sentence>', keep only the sentence
        raw_text = handle.read().decode('utf-8')
        return [line.split('\t')[1] for line in raw_text.split('\n') if '\t' in line]
    def download_all(self, download_folder, language_codes=None, filename='{code}.{size}.txt',
                     size='10K', typ='wikipedia', year='2016',
                     normalize_func=lambda t: t, filter_func=lambda t: True):
        """
        Download all resources into a folder. For the size, typ and year arguments, see
        LeipzigResourceFinder.find_one and the usage sketch at the end of this file.

        :param download_folder: the download folder
        :param language_codes: a list of language codes to download; downloads everything if not set
        :param filename: available placeholders are code, size and typ
        :param size: the size to prioritize
        :param typ: the type of resource to prioritize, e.g. 'wikipedia', 'web', etc.
        :param year: the year to prioritize
        :param normalize_func: an optional function called on each sentence
        :param filter_func: an optional filter to exclude sentences
        """
        os.makedirs(download_folder, exist_ok=True)
        if language_codes is None:
            language_codes = list(self.code2lang.keys())
        for code in language_codes:
            variant = None
            if '-' in code:
                code, variant = code.split('-')
            outpath = os.path.join(download_folder, filename.format(code=code, typ=typ, size=size))
            if not os.path.exists(outpath):
                try:
                    print(f'Processing {code} {self.code2lang[code]}...', end=' ', flush=True)
                    resource = LeipzigResourceFinder(code, variant).find_one(size, typ, year)
                    print(resource, end=' ', flush=True)
                    lines = [
                        normalize_func(l)
                        for l in self.download_sentences(f'{self.DOWNLOAD_URL}/{resource}')
                        if len(l) > 0 and not l.isspace() and filter_func(l)
                    ]
                    if len(lines):
                        with open(outpath, 'w', encoding='utf-8') as f:
                            f.write('\n'.join(lines))
                        print(f'{len(lines)} lines. OK')
                    else:
                        print('no line.')
                except Exception as e:
                    print('ERROR', e)
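
# Usage sketch (illustrative, not part of the original script): the output folder,
# the language codes, the length cutoff and the NFC normalization below are assumptions
# showing how normalize_func and filter_func can be plugged into download_all.
if __name__ == '__main__':
    import unicodedata

    leipzig = Leipzig()
    leipzig.download_all(
        'corpora',                      # hypothetical output folder
        language_codes=['eng', 'spa'],  # any codes from leipzig_meta.json
        size='10K', typ='news', year='2016',
        normalize_func=lambda s: unicodedata.normalize('NFC', s).strip(),
        filter_func=lambda s: len(s) >= 20,  # drop very short sentences
    )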
leipzig_meta.json
{
  "download_url": "http://pcai056.informatik.uni-leipzig.de/downloads/corpora",
  "service_url": "http://wortschatz.uni-leipzig.de/download/service",
  "sizes": [
    "10K",
    "30K",
    "100K",
    "300K",
    "1M"
  ],
  "languages": {
    "afr": "Afrikaans",
    "sqi": "Albanian",
    "amh": "Amharic",
    "ara": "Arabic",
    "arg": "Aragonese",
    "hye": "Armenian",
    "asm": "Assamese",
    "ast": "Asturian",
    "aze": "Azerbaijani",
    "ban": "Balinese",
    "bjn": "Banjar",
    "bak": "Bashkir",
    "eus": "Basque",
    "bar": "Bavarian",
    "bel": "Belarusian",
    "ben": "Bengali",
    "bih": "Bihari languages",
    "bik": "Bikol",
    "bpy": "Bishnupriya",
    "bos": "Bosnian",
    "bre": "Breton",
    "bul": "Bulgarian",
    "bua": "Buriat",
    "cat": "Catalan",
    "ceb": "Cebuano",
    "bcl": "Central Bikol",
    "ckb": "Central Kurdish",
    "che": "Chechen",
    "zho": "Chinese",
    "chv": "Chuvash",
    "cos": "Corsican",
    "hrv": "Croatian",
    "ces": "Czech",
    "dan": "Danish",
    "div": "Dhivehi",
    "diq": "Dimli",
    "nld": "Dutch",
    "mhr": "Eastern Mari",
    "arz": "Egyptian Arabic",
    "eml": "Emiliano-Romagnolo",
    "eng": "English",
    "epo": "Esperanto",
    "est": "Estonian",
    "ext": "Extremaduran",
    "fao": "Faroese",
    "hif": "Fiji Hindi",
    "fin": "Finnish",
    "fra": "French",
    "glg": "Galician",
    "lug": "Ganda",
    "kat": "Georgian",
    "deu": "German",
    "glk": "Gilaki",
    "gom": "Goan Konkani",
    "ell": "Greek, Modern",
    "grn": "Guarani",
    "guj": "Gujarati",
    "hat": "Haitian",
    "heb": "Hebrew",
    "hin": "Hindi",
    "hun": "Hungarian",
    "isl": "Icelandic",
    "ido": "Ido",
    "ilo": "Iloko",
    "ind": "Indonesian",
    "ina": "Interlingua",
    "pes": "Iranian Persian",
    "gle": "Irish",
    "ita": "Italian",
    "jpn": "Japanese",
    "jav": "Javanese",
    "kal": "Kalaallisut",
    "kan": "Kannada",
    "krc": "Karachay-Balkar",
    "csb": "Kashubian",
    "kaz": "Kazakh",
    "kin": "Kinyarwanda",
    "koi": "Kiowa",
    "kir": "Kirghiz",
    "kom": "Komi",
    "knn": "Konkani",
    "kor": "Korean",
    "kur": "Kurdish",
    "ksh": "Kölsch",
    "lat": "Latin",
    "lav": "Latvian",
    "lim": "Limburgan",
    "lit": "Lithuanian",
    "lmo": "Lombard",
    "nds": "Low German",
    "dsb": "Lower Sorbian",
    "lus": "Lushai",
    "ltz": "Luxembourgish",
    "mkd": "Macedonian",
    "mad": "Madurese",
    "mlg": "Malagasy",
    "msa": "Malay",
    "mal": "Malayalam",
    "mlt": "Maltese",
    "cmn": "Mandarin Chinese",
    "glv": "Manx",
    "mri": "Maori",
    "mar": "Marathi",
    "mzn": "Mazanderani",
    "nan": "Min Nan Chinese",
    "min": "Minangkabau",
    "xmf": "Mingrelian",
    "mwl": "Mirandese",
    "mon": "Mongolian",
    "nep": "Nepali",
    "new": "Newari",
    "azj": "North Azerbaijani",
    "frr": "Northern Frisian",
    "sme": "Northern Sami",
    "uzn": "Northern Uzbek",
    "nor": "Norwegian",
    "nob": "Norwegian Bokmål",
    "nno": "Norwegian Nynorsk",
    "oci": "Occitan",
    "ori": "Oriya",
    "oss": "Ossetian",
    "pam": "Pampanga",
    "pan": "Panjabi",
    "pap": "Papiamento",
    "nso": "Pedi",
    "fas": "Persian",
    "pfl": "Pfaelzisch",
    "pms": "Piemontese",
    "plt": "Plateau Malagasy",
    "pol": "Polish",
    "por": "Portuguese",
    "pus": "Pushto",
    "que": "Quechua",
    "ron": "Romanian",
    "roh": "Romansh",
    "rus": "Russian",
    "rue": "Rusyn",
    "sgs": "Samogitian",
    "san": "Sanskrit",
    "srd": "Sardinian",
    "sco": "Scots",
    "srp": "Serbian",
    "hbs": "Serbo-Croatian",
    "sna": "Shona",
    "scn": "Sicilian",
    "szl": "Silesian",
    "snd": "Sindhi",
    "sin": "Sinhala",
    "slk": "Slovak",
    "slv": "Slovenian",
    "som": "Somali",
    "sot": "Sotho, Southern",
    "spa": "Spanish",
    "ekk": "Standard Estonian",
    "lvs": "Standard Latvian",
    "sun": "Sundanese",
    "swh": "Swahili",
    "swa": "Swahili",
    "swe": "Swedish",
    "gsw": "Swiss German",
    "tgl": "Tagalog",
    "tgk": "Tajik",
    "tam": "Tamil",
    "tat": "Tatar",
    "tel": "Telugu",
    "tha": "Thai",
    "als": "Tosk Albanian",
    "tso": "Tsonga",
    "tsn": "Tswana",
    "tur": "Turkish",
    "tuk": "Turkmen",
    "tyv": "Tuvinian",
    "udm": "Udmurt",
    "uig": "Uighur",
    "ukr": "Ukrainian",
    "hsb": "Upper Sorbian",
    "urd": "Urdu",
    "uzb": "Uzbek",
    "vec": "Venetian",
    "vie": "Vietnamese",
    "vls": "Vlaams",
    "vol": "Volapük",
    "vro": "Võro",
    "wln": "Walloon",
    "war": "Waray",
    "cym": "Welsh",
    "fry": "Western Frisian",
    "mrj": "Western Mari",
    "pnb": "Western Panjabi",
    "xho": "Xhosa",
    "sah": "Yakut",
    "yid": "Yiddish",
    "yor": "Yoruba",
    "zea": "Zeeuws",
    "zul": "Zulu"
  }
}
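
The Leipzig class above fetches this file over HTTP from the gist. As a minimal sketch (the local path is an assumption), the same mapping can also be loaded from disk; note that reversing it loses duplicate names such as the two Swahili codes (swh and swa).

    import json

    with open('leipzig_meta.json', encoding='utf-8') as f:
        meta = json.load(f)

    code2lang = meta['languages']                     # language code -> language name
    lang2code = {v: k for k, v in code2lang.items()}  # name -> code (duplicate names keep only one code)
    print(code2lang['fra'], lang2code['French'])      # French fra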