Last active
December 21, 2015 14:23
-
-
Save evgenybf/e5689dfa3734dd4a100d to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""
Extracts word transcriptions from "En-En_Oxford Advanced Learners Dictionary.dsl".

The following words have invalid definitions:
    shit, to
"""
import re | |
# When true, parenthesised text is stripped from a definition line before the
# BrE/NAmE markers are searched for and the transcriptions are split.
IGNORE_TEXT_IN_PARENTHESIS = True
# Name of the DSL dictionary file that parse_dsl() reads.
DICTIONARY_DSL = "En-En_Oxford Advanced Learners Dictionary.dsl"
# Encoding used to read the DSL file.
IN_ENCODING = "utf-16"
# Encoding used to write the generated CSV file.
OUT_ENCODING = "utf-8"
# Captures the text of a "[c orange]...[/c]" tag, without leading spaces and
# without trailing commas/spaces; the captures are stored as part of speech.
RE_PSPEECH = re.compile(r"\[c orange\](?:[ ]*)(.*?)(?:[, ]*)\[/c\]")
# Captures the text between the escaped brackets of a
# "[c darkcyan]\[...\][/c]" tag, i.e. one transcription.
RE_TRAN = re.compile(r"\[c darkcyan\](?:\\\[)(.*?)(?:\\\])\[/c\]")
# Matches the "[p]BrE[/p]" marker.
RE_P_BRE = re.compile(r"\[p\]BrE\[/p\]")
# Matches the "[p]AmE[/p]" or "[p]NAmE[/p]" marker.
RE_P_AME = re.compile(r"\[p\]N?AmE\[/p\]")
# Matches one innermost "(...)" group, i.e. one containing no other parentheses.
RE_PARENTHESIS=re.compile(r"\([^()]*\)")
def read_all_lines(filename, encoding):
    """Yield every line of a text file with trailing whitespace removed.

    Args:
        filename: Path of the file to read.
        encoding: Text encoding used to decode the file.

    Yields:
        Each line in file order, with trailing whitespace (including the
        line terminator) stripped. Leading whitespace is preserved.
    """
    # Decode with the caller's encoding rather than a module-level constant,
    # and iterate the file lazily instead of loading every line up front.
    with open(filename, "r", encoding=encoding) as f:
        for line in f:
            yield line.rstrip()
def extract_trs(str):
    """Split the transcriptions found in *str* into BrE and NAmE lists.

    The text is cut at every BrE marker. Within each piece, transcriptions
    before the first AmE/NAmE marker are collected as British and those
    after it as American.

    Returns:
        A ``(trs_bre, trs_ame)`` tuple of lists of transcription strings.
    """
    british, american = [], []
    for chunk in RE_P_BRE.split(str):
        # maxsplit=1 yields either one piece (no AmE marker) or two.
        before_ame, *after_ame = RE_P_AME.split(chunk, 1)
        british += RE_TRAN.findall(before_ame)
        for tail in after_ame:
            american += RE_TRAN.findall(tail)
    return british, american
class DefEntry:
    """Part-of-speech labels and BrE/NAmE transcriptions of one definition line."""

    def __init__(self, pspeech, trs_bre, trs_ame):
        # The arguments are stored as given, without copying or validation.
        self.pspeech, self.trs_bre, self.trs_ame = pspeech, trs_bre, trs_ame
class WordDef:
    """A group of headwords together with the definition entries found for them."""

    def __init__(self):
        # Both lists start empty and are filled in by the parser.
        self.words, self.entries = [], []
def remove_text_parenths(s):
    """Return *s* with all parenthesised text removed, nested groups included.

    Innermost "(...)" groups are deleted repeatedly until a pass changes
    nothing, so "a (b (c) d) e" loses the whole outer group as well.
    """
    stripped = RE_PARENTHESIS.sub("", s)
    while stripped != s:
        s = stripped
        stripped = RE_PARENTHESIS.sub("", s)
    return s
def parse_dsl():
    """Parse the DSL dictionary file and yield one WordDef per headword group.

    Reads DICTIONARY_DSL (decoded with IN_ENCODING) line by line:

    * empty lines and lines starting with '#' are skipped;
    * a line that does not start with a tab is a headword; consecutive
      headwords are collected into the same WordDef;
    * a tab-indented line starting with "[m0]" or "[m1][c red]" is scanned
      for part-of-speech labels and BrE/NAmE transcriptions, which are
      appended to the current WordDef as a DefEntry.

    Warnings are printed when transcriptions present on a line could not be
    attributed to BrE or NAmE.

    Yields:
        WordDef objects with at least one headword; ``entries`` may be empty.
    """
    worddef = WordDef()
    wasworddef = False
    # Number lines from 1 so that warnings match what a text editor shows.
    for lineno, line in enumerate(read_all_lines(DICTIONARY_DSL, IN_ENCODING), 1):
        if not line or line.startswith('#'):
            continue

        if line[0] != '\t':
            # A headword following a definition body starts a new group.
            if wasworddef:
                if worddef.words:
                    yield worddef
                worddef = WordDef()
            worddef.words.append(line)
            wasworddef = False
            continue

        s_line = line.strip('\t')
        if not s_line.startswith(("[m0]", "[m1][c red]")):
            continue

        wasworddef = True
        pspeech = RE_PSPEECH.findall(s_line)
        # Count the transcriptions before any text is stripped, so the
        # warnings below can tell when the BrE/NAmE split dropped some.
        trs_all = RE_TRAN.findall(s_line)
        if IGNORE_TEXT_IN_PARENTHESIS:
            s_line = remove_text_parenths(s_line)

        if RE_P_BRE.search(s_line) or RE_P_AME.search(s_line):
            trs_bre, trs_ame = extract_trs(s_line)
            if len(trs_all) != len(trs_bre) + len(trs_ame):
                # Format the text itself: encoding to bytes first would
                # print an unreadable b'...' repr under Python 3.
                print("WARNING: (%s) extracted too few transcriptions, line %d" % (",".join(worddef.words), lineno))
                print(s_line)
            if trs_bre or trs_ame:
                worddef.entries.append(DefEntry(pspeech, trs_bre, trs_ame))
        elif trs_all:
            print("WARNING: (%s) all transcriptions were left out, line %d" % (",".join(worddef.words), lineno))
            print(s_line)

    # Flush the last group, which has no following headword to trigger it.
    if worddef.words:
        yield worddef
def wrap_tr(l):
    """Return a new list with every item of *l* enclosed in square brackets."""
    wrapped = []
    for transcription in l:
        wrapped.append("[%s]" % transcription)
    return wrapped
def list2str(l):
    """Join the strings of *l* into a single comma-separated string."""
    return str.join(",", l)
def _entry_columns(worddef):
    """Build the BrE, NAmE and combined column texts for one WordDef."""
    headwords = ",".join(worddef.words)
    # A part-of-speech prefix is only added when there are several entries.
    use_prefix = len(worddef.entries) != 1
    l_bre, l_ame, l_all = [], [], []
    for entry in worddef.entries:
        prefix = list2str(entry.pspeech) if use_prefix else ""
        if prefix:
            prefix += ": "
        if len(entry.trs_bre) != len(set(entry.trs_bre)) or len(entry.trs_ame) != len(set(entry.trs_ame)):
            # Format the text itself: encoding to bytes first would print
            # an unreadable b'...' repr under Python 3.
            print("WARNING: (%s) duplicated transcription" % (headwords,))
        s_bre = list2str(wrap_tr(entry.trs_bre))
        s_ame = list2str(wrap_tr(entry.trs_ame))
        if s_bre:
            l_bre.append(prefix + s_bre)
        if s_ame:
            l_ame.append(prefix + s_ame)
        if not (s_bre or s_ame):
            continue
        if s_bre == s_ame:
            # Identical pronunciations are listed once, without a label.
            parts = [s_bre]
        else:
            parts = []
            if s_bre:
                parts.append("BrE " + s_bre)
            if s_ame:
                parts.append("NAmE " + s_ame)
        l_all.append(prefix + " ".join(parts))
    return "; ".join(l_bre), "; ".join(l_ame), "; ".join(l_all)


def gen_csv(fileout):
    """Write the transcriptions produced by parse_dsl() to a CSV file.

    The file is written with OUT_ENCODING in the "excel" dialect. It starts
    with the header ``word,BrE,NAmE,both`` and then holds one row per
    headword of every WordDef that has at least one entry. The number of
    headwords seen (with or without entries) is printed at the end.

    Args:
        fileout: Path of the CSV file to create.
    """
    import csv

    count = 0
    with open(fileout, "w", encoding=OUT_ENCODING, newline='') as f:
        writer = csv.writer(f, dialect="excel")
        writer.writerow(["word", "BrE", "NAmE", "both"])
        for worddef in parse_dsl():
            count += len(worddef.words)
            if not worddef.entries:
                continue
            # The columns are the same for every headword of the group, so
            # build them (and emit any warning) once rather than per word.
            columns = _entry_columns(worddef)
            for word in worddef.words:
                writer.writerow((word,) + columns)
    print("Processed %d word(s)" % count)
# Script entry point: write the extracted transcriptions to
# "en_oxford_adv_dict.csv" when the file is run directly.
if __name__ == "__main__":
    gen_csv("en_oxford_adv_dict.csv")
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""
Adds word transcriptions from a dictionary CSV generated by extracttrs.py to an Anki CSV file.
"""
import csv | |
import re | |
# Delimiters after which the rest of a keyword field is ignored.
RE_DELIM = re.compile(r"[,;]")


def normalize_word(word: str) -> str:
    """Reduce a keyword field to the form used for dictionary lookups.

    Non-breaking spaces become ordinary spaces, everything from the first
    comma or semicolon onwards is dropped, and the remainder is stripped of
    surrounding whitespace and lower-cased.

    Args:
        word: Raw keyword text.

    Returns:
        The normalized keyword.
    """
    # Written as an escape so the non-breaking space stays visible in any
    # editor; a literal one is indistinguishable from a regular space.
    word = word.replace("\u00a0", " ")
    word = RE_DELIM.split(word, 1)[0]
    return word.strip().lower()
def mergetrs(dictcsvfile: str, ankicsvfile: str, outcsvfile: str) -> None:
    """Append dictionary transcriptions to every row of an Anki CSV file.

    File layouts:
        dictcsvfile: a header row followed by ``word,BrE,NAmE,both`` rows.
        ankicsvfile: no header; ``N,keyword,transcription`` rows.

    For each Anki row the keyword (second column) is normalized with
    normalize_word() and looked up among the dictionary words. The matching
    dictionary columns are appended to the row; when there is no match a
    warning is printed and three empty columns are appended instead.

    Args:
        dictcsvfile: Path of the dictionary CSV to read.
        ankicsvfile: Path of the Anki CSV to read.
        outcsvfile: Path of the merged CSV to write.
    """
    words = {}
    # newline='' lets the csv module handle line endings itself, as its
    # documentation requires for files passed to reader/writer.
    with open(dictcsvfile, "r", encoding="utf-8", newline='') as fin:
        r = csv.reader(fin, dialect="excel")
        # Skip the header; the default keeps an empty file from raising
        # StopIteration.
        next(r, None)
        for row in r:
            if not row:
                continue  # blank line
            words[row[0].strip().lower()] = row[1:]

    with open(ankicsvfile, "r", encoding="utf-8", newline='') as fin, \
            open(outcsvfile, "w", encoding="utf-8", newline='') as fout:
        r = csv.reader(fin, dialect="excel")
        w = csv.writer(fout, dialect="excel")
        for row in r:
            if not row:
                continue  # blank line
            # A row without a keyword column is handled as "not found".
            word = normalize_word(row[1]) if len(row) > 1 else ""
            extra = words.get(word, [])
            if not extra:
                print("WARNING: %s not found" % word)
                extra = [""] * 3
            w.writerow(row + extra)
# Script entry point: merge the dictionary transcriptions into both Anki
# exports (cloze and basic) when the file is run directly.
if __name__ == "__main__":
    mergetrs("en_oxford_adv_dict.csv", "4000_eew_cloze.csv", "4000_eew_cloze_with_tr.csv")
    mergetrs("en_oxford_adv_dict.csv", "4000_eew_basic.csv", "4000_eew_basic_with_tr.csv")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment