Last active
November 22, 2024 23:52
-
-
Save aerickt/03871d801534067066d4587cd61144eb to your computer and use it in GitHub Desktop.
basic viet.py for Plover
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# This is a work-in-progress Python dictionary implementation of my Vietnamese steno system:
# https://github.com/aerickt/steno-dictionaries/wiki/Vietnamese-Steno
# Because this dictionary generates words on the fly, it is not (or shouldn't be) necessary to add missing entries.
# That makes it a better implementation of my Vietnamese steno system than the JSON dictionary.
# However, it is still very much a work in progress. In particular, I don't have a great way to test the tone placement system.
# This dictionary also has some minor differences from the viet.json implementation. For words such as "qua", this system
# only treats the q (KW) as a consonant -- you have to treat the "ua" as the vowel. Alternatively, you can treat "qu"
# as a separate consonant (KWR), which then takes a different vowel chord.
# This dictionary does not support fingerspelling yet. A JSON dictionary may be provided for that instead, to keep this
# Python dictionary readable (I'm not sure I can add fingerspelling in a competent way).
import re

# Only consider single stroke outlines
LONGEST_KEY = 1

# Lookup tables mapping each chord (written in steno order) to the letters it
# produces. The empty chord "" maps to "" in every table so that any part of a
# syllable may be omitted.
chord_space = {
    # Define initial consonant chords
    "initial": {
        "S": "s",
        "STKPW": "gi",
        "STPH": "ng",
        "STPHR": "ngh",
        "SR": "v",
        "T": "t",
        "TK": "đ",
        "TKP": "d",
        "TKPW": "g",
        "TKPWH": "gh",
        "TKR": "đr",
        "TP": "ph",
        "TPH": "n",
        "TPHR": "nh",
        "TH": "th",
        "TR": "tr",
        "K": "k",
        "KP": "x",
        "KW": "q",
        "KWR": "qu",
        "KH": "kh",
        "KHR": "ch",
        "KR": "c",
        "P": "p",
        "PW": "b",
        "PH": "m",
        "H": "h",
        "HR": "l",
        "R": "r",
        "": "",
    },
    # Define vowel chords
    "vowel": {
        "A": "a",
        "AO": "ao",
        "AOE": "ưu",
        "AOEU": "oo",
        "AOEUFR": "uyu",
        "AOEUR": "uya",
        "AOU": "oa",
        "AOF": "oai",
        "AOFR": "oă",
        "AOR": "oay",
        "A*R": "ya",
        "AE": "ê",
        "AEU": "êu",
        "AEUR": "yêu",
        "AEF": "iê",
        "AEFR": "uê",
        "AER": "yê",
        "AU": "au",
        "AUF": "ia",
        "AUFR": "uă",
        "AUR": "ưa",
        "AF": "ai",
        "AFR": "ă",
        "AR": "ay",
        "O": "o",
        "O*E": "oe",
        "O*UR": "uơ",
        "OE": "ô",
        "OEU": "uô",
        "OEUF": "uôi",
        "OEUFR": "uâ",
        "OEF": "ôi",
        "OER": "oeo",
        "OU": "ơ",
        "OUF": "ơi",
        "OUFR": "ươi",
        "OUR": "ươ",
        "OF": "oi",
        "OFR": "ươu",
        "OR": "ua",
        "*EUFR": "uây",
        "E": "e",
        "EU": "iêu",
        "EUF": "ưi",
        "EUFR": "âu",
        "EUR": "uyê",
        "EF": "eo",
        "EFR": "â",
        "ER": "ây",
        "U": "u",
        "UF": "ui",
        "UFR": "uy",
        "UR": "ư",
        "-F": "i",
        "-FR": "iu",
        "-R": "y",
        "": "",
    },
    # Define ending consonant chords
    "final": {
        "-P": "p",
        "-PB": "nh",
        "-PBLG": "ch",
        "-PL": "m",
        "-PLG": "mh",
        "-B": "h",
        "-BG": "c",
        "-L": "n",
        "-LG": "ng",
        "-G": "t",
        "": "",
    },
    # Define tone chords. Each value is a combining diacritic that is inserted
    # directly after the letter that carries the tone.
    "tone": {
        # U+0301 COMBINING ACUTE ACCENT. The previously used U+0341 is a
        # deprecated code point that is canonically equivalent to U+0301.
        "-T": "\u0301",
        "-TS": "\u0303",  # U+0303 COMBINING TILDE
        "-S": "\u0300",  # U+0300 COMBINING GRAVE ACCENT (not the deprecated U+0340)
        "-D": "\u0309",  # U+0309 COMBINING HOOK ABOVE
        "-Z": "\u0323",  # U+0323 COMBINING DOT BELOW
        "": "",
    },
}
def lookup(key):
    """Translate a single-stroke outline into a Vietnamese syllable.

    The stroke is split into its initial, vowel, final and tone chords, each
    chord is looked up in ``chord_space``, and the pieces are joined as
    initial + vowel (carrying the tone mark) + final.

    Args:
        key: Sequence of stroke strings. Only outlines of exactly one stroke
            are translated (LONGEST_KEY is 1).

    Returns:
        The generated word as a string.

    Raises:
        KeyError: If the outline is not exactly one stroke, if any chord of
            the stroke is unknown, or if the stroke produces no text at all.
    """
    # An empty outline used to fail with IndexError; report it (and anything
    # longer than one stroke) as "no entry" instead.
    if len(key) != 1:
        raise KeyError(key)

    # Split the stroke into the chord groups: initial, vowel, final and tone
    stroke_chords = split_stroke(key[0])

    # Every chord group must be a known chord, otherwise there is no entry
    for group in ("initial", "vowel", "final", "tone"):
        if stroke_chords[group] not in chord_space[group]:
            raise KeyError(key)

    # The initial and final consonants are constant; only the vowel changes
    # with the tone, so it is built by get_tone_vowel.
    word = (
        chord_space["initial"][stroke_chords["initial"]]
        + get_tone_vowel(
            stroke_chords["vowel"],
            stroke_chords["final"],
            stroke_chords["tone"],
        )
        + chord_space["final"][stroke_chords["final"]]
    )

    # A stroke that matched none of the chord groups (for example one made of
    # keys this layout does not use) must not translate to an empty string.
    if not word:
        raise KeyError(key)
    return word
# One whole stroke in steno order, captured as 7 groups: initial consonants,
# left-hand vowels, star, right-hand vowels, -F/-R, final consonants, tone keys.
_STROKE_PATTERN = re.compile(
    r'(S?T?K?P?W?H?R?)(A?O?)-?(\*?)(E?U?)(F?R?)(P?B?L?G?)(T?S?D?Z?)'
)


# Split a stroke into the 4 groups of interest
def split_stroke(stroke):
    """Split a stroke string into its initial, vowel, final and tone chords.

    Args:
        stroke: A single stroke in steno order, e.g. "STPHEURLT" or "S-T".

    Returns:
        A dict with the keys "initial", "vowel", "final" and "tone". A group
        that is absent from the stroke is "". Vowel, final and tone chords
        that do not start with A, O, E, U or * are prefixed with "-", which
        is how those chords are spelled in ``chord_space``.

    Raises:
        KeyError: If the stroke contains anything the pattern cannot consume
            (for example "#" or digits). Every part of the pattern is
            optional, so a prefix match would silently drop such keys.
    """
    match = _STROKE_PATTERN.fullmatch(stroke)
    if match is None:
        raise KeyError(stroke)
    initial, left_vowels, star, right_vowels, modifiers, final, tone = match.groups()

    # Consolidate the 7 regex groups into the 4 chord groups that are needed
    groups = {
        "initial": initial,
        "vowel": left_vowels + star + right_vowels + modifiers,
        "final": final,
        "tone": tone,
    }

    # Prepend a hyphen to the right-hand groups where needed
    for name in ("vowel", "final", "tone"):
        chord = groups[name]
        if chord != "" and chord[0] not in "AOEU*":
            groups[name] = "-" + chord
    return groups
# Using a vowel chord and a tone chord as input, return the vowel letters with the correct tone diacritic
def get_tone_vowel(vowel_chord, final_chord, tone_chord):
    """Return the letters of a vowel chord with the tone mark inserted.

    Args:
        vowel_chord: Key into ``chord_space["vowel"]`` ("" for no vowel).
        final_chord: The final consonant chord of the stroke. Only whether it
            is empty matters: it decides which letter of an unaccented vowel
            cluster carries the tone.
        tone_chord: Key into ``chord_space["tone"]`` ("" for no tone).

    Returns:
        The vowel letters with the combining tone mark placed directly after
        the letter that carries it, or the bare vowel letters if there is no
        tone.

    Raises:
        KeyError: If ``vowel_chord`` or ``tone_chord`` is not a known chord.
    """
    # Determine the tone character and the vowel letters
    tone = chord_space["tone"][tone_chord]
    vowel_letters = chord_space["vowel"][vowel_chord]

    # Early return if no tone
    if tone == "":
        return vowel_letters

    # tone_position is the index at which the mark is inserted; a combining
    # mark modifies the letter immediately before it.
    if len(vowel_letters) <= 1:
        # One letter: the mark goes right after it. Zero letters: the mark
        # comes first, so it lands on the previous letter of the word
        # (useful for words like gì).
        tone_position = len(vowel_letters)
    else:
        # Letters that already carry a diacritic take priority. The last
        # entry of this tuple that occurs in the vowel wins, so ơ beats ư
        # in ươ.
        accented = [
            vowel_letters.find(letter)
            for letter in ("ă", "â", "ê", "ô", "ư", "ơ")
            if letter in vowel_letters
        ]
        if accented:
            tone_position = accented[-1] + 1
        elif final_chord != "":
            # If an ending consonant is present, place tone on last letter
            tone_position = len(vowel_letters)
        else:
            # If an ending consonant is not present, place tone on second
            # last letter. (This used to be a no-op expression that only
            # worked because the -1 default slices the same way.)
            tone_position = len(vowel_letters) - 1

    # Merge the vowel letters and diacritic in the right position
    return vowel_letters[:tone_position] + tone + vowel_letters[tone_position:]
# Manual smoke test: print the STPHEURL syllable once with each tone chord.
# for i in ["T", "TS", "S", "D", "Z"]:
#     print(lookup(["STPHEURL" + i]))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment