Last active
May 27, 2018 13:35
-
-
Save evgenybf/34d6a9bf037a3ecb010019e18b3c0bed to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import collections | |
import csv | |
import itertools | |
# Input data: tab-delimited sentence-pair files downloaded from
# http://www.manythings.org/anki/

# First input -- Russian - English, rus-eng.zip (304513).
INPUT_FILE_1 = "rus.txt"

# Second input -- Chinese (Mandarin) - English, cmn-eng.zip (20085).
# Alternative: Spanish - English, spa-eng.zip (118964), i.e. "spa.txt".
INPUT_FILE_2 = "cmn.txt"

# Destination of the merged, tab-delimited result.
OUTPUT_FILE = "out.txt"
def load_phrases_as_dict(filename):
    """Load a tab-separated phrase file into a phrase -> translations map.

    Each record is expected to hold a key phrase and its translation in the
    first two tab-separated columns. Records with fewer than two columns are
    reported and skipped; extra columns are reported and ignored. Both values
    are stripped of surrounding whitespace. Translations that share a key
    phrase are collected in the order they appear in the file.

    Args:
        filename: Path of the UTF-8 encoded, tab-separated input file.

    Returns:
        A ``collections.OrderedDict`` mapping each key phrase to the list of
        its translations, in order of first appearance of the key phrase.
    """
    phrases = collections.OrderedDict()
    with open(filename, "r", encoding="utf-8", newline='') as f:
        # The file is read as plain tab-delimited text rather than quoted CSV:
        # a double quote inside a sentence is ordinary punctuation. With the
        # csv default quoting, a field starting with '"' would lose its quotes
        # and an unbalanced quote would swallow the following tabs/newlines,
        # silently merging several records into one.
        reader = csv.reader(f, delimiter='\t', quoting=csv.QUOTE_NONE)
        for record in reader:
            if len(record) < 2:
                print("Skipped:", record)
                continue
            if len(record) > 2:
                # Not fatal: only the first two columns are used.
                print("Too many elements: {}: {}".format(len(record), record))
            key_phrase, translation = (x.strip() for x in record[:2])
            phrases.setdefault(key_phrase, []).append(translation)
    return phrases
def merge_files(file1, file2, output_file):
    """Join two phrase files on their key phrase and write the result.

    Both inputs are loaded with ``load_phrases_as_dict``. For every key
    phrase of ``file1`` that also has translations in ``file2``, one
    tab-separated row ``(key phrase, translation1, translation2)`` is written
    for each combination of the two translation lists. Key phrases of
    ``file1`` with no counterpart in ``file2`` are only counted.

    Args:
        file1: Path of the first input file; its key order drives the output.
        file2: Path of the second input file.
        output_file: Path of the UTF-8, tab-separated file to (over)write.
    """
    first = load_phrases_as_dict(file1)
    print("Key phrases in file 1:", len(first))
    second = load_phrases_as_dict(file2)
    print("Key phrases in file 2:", len(second))

    missing = 0
    with open(output_file, "w", encoding="utf-8", newline='') as out:
        tsv = csv.writer(out, delimiter='\t')
        for phrase, left in first.items():
            right = second.get(phrase)
            if right:
                # product('ABCD', 'xy') yields Ax Ay Bx By Cx Cy Dx Dy, i.e.
                # every translation of file 1 paired with every one of file 2.
                tsv.writerows(
                    (phrase, a, b) for a, b in itertools.product(left, right)
                )
            else:
                missing += 1
    print("Phrases without translation:", missing)
def main():
    """Merge the two configured input files into the configured output file."""
    merge_files(
        file1=INPUT_FILE_1,
        file2=INPUT_FILE_2,
        output_file=OUTPUT_FILE,
    )


# Run the merge only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment