aerickt · November 22, 2024 23:52
diff --git a/viet.py b/viet.py
 # This is a (work-in-progress) python dictionary implementation of my Vietnamese steno system: https://github.com/aerickt/steno-dictionaries/wiki/Vietnamese-Steno
 # As this dictionary generates words on the fly, it is not (or shouldn't be) necessary to add missing entries.
 # Therefore, it is a better implementation of my Vietnamese steno system than the json dictionary.
 # However, this is very work-in-progress. I particularly don't have a great way to test the tone placement system.
 # This dictionary also has some minor differences to the viet.json implementation. For words such as qua, this system
 # only considers the q (KW) as a consonant -- you'll have to consider the ua as a vowel. Alternatively, you can treat qu
 # as a separate initial consonant (KWR) and pair it with a different vowel chord (a instead of ua).
 # This dictionary also does not support fingerspelling yet. However, a json dictionary may be provided instead to keep this
 # python dictionary readable (I'm not sure if I am able to add fingerspelling in a competent way).

 import re

 # Only consider single stroke outlines: lookup() reads nothing but key[0],
 # so an outline handled by this module is never longer than one stroke.
 # NOTE(review): presumably the host steno engine reads LONGEST_KEY to cap how
 # many strokes it passes to lookup() -- confirm against the plugin docs.
 LONGEST_KEY = 1

 # Lookup tables that map each chord of a stroke to the text it produces.
 # There is one table per chord group ("initial", "vowel", "final", "tone").
 # Chords made only of right-hand keys are written with a leading hyphen, which
 # is the form split_stroke() produces. The empty key in every table stands for
 # "this group was not pressed".
 chord_space = {

    # Initial consonant chords (left-hand keys)
    "initial": {
        "S": "s",
        "STKPW": "gi",
        "STPH": "ng",
        "STPHR": "ngh",
        "SR": "v",
        "T": "t",
        "TK": "đ",
        "TKP": "d",
        "TKPW": "g",
        "TKPWH": "gh",
        "TKR": "đr",
        "TP": "ph",
        "TPH": "n",
        "TPHR": "nh",
        "TH": "th",
        "TR": "tr",
        "K": "k",
        "KP": "x",
        "KW": "q",
        "KWR": "qu",
        "KH": "kh",
        "KHR": "ch",
        "KR": "c",
        "P": "p",
        "PW": "b",
        "PH": "m",
        "H": "h",
        "HR": "l",
        "R": "r",
        "": "",
    },

    # Vowel chords (thumb keys, the asterisk, and right-hand F/R)
    "vowel": {
        "A": "a",
        "AO": "ao",
        "AOE": "ưu",
        "AOEU": "oo",
        "AOEUFR": "uyu",
        "AOEUR": "uya",
        "AOU": "oa",
        "AOF": "oai",
        "AOFR": "oă",
        "AOR": "oay",
        "A*R": "ya",
        "AE": "ê",
        "AEU": "êu",
        "AEUR": "yêu",
        "AEF": "iê",
        "AEFR": "uê",
        "AER": "yê",
        "AU": "au",
        "AUF": "ia",
        "AUFR": "uă",
        "AUR": "ưa",
        "AF": "ai",
        "AFR": "ă",
        "AR": "ay",
        "O": "o",
        "O*E": "oe",
        "O*UR": "uơ",
        "OE": "ô",
        "OEU": "uô",
        "OEUF": "uôi",
        "OEUFR": "uâ",
        "OEF": "ôi",
        "OER": "oeo",
        "OU": "ơ",
        "OUF": "ơi",
        "OUFR": "ươi",
        "OUR": "ươ",
        "OF": "oi",
        "OFR": "ươu",
        "OR": "ua",
        "*EUFR": "uây",
        "E": "e",
        "EU": "iêu",
        "EUF": "ưi",
        "EUFR": "âu",
        "EUR": "uyê",
        "EF": "eo",
        "EFR": "â",
        "ER": "ây",
        "U": "u",
        "UF": "ui",
        "UFR": "uy",
        "UR": "ư",
        "-F": "i",
        "-FR": "iu",
        "-R": "y",
        "": "",
    },

    # Ending consonant chords (right-hand P/B/L/G)
    "final": {
        "-P": "p",
        "-PB": "nh",
        "-PBLG": "ch",
        "-PL": "m",
        "-PLG": "mh",
        "-B": "h",
        "-BG": "c",
        "-L": "n",
        "-LG": "ng",
        "-G": "t",
        "": "",
    },

    # Tone chords (right-hand T/S/D/Z). Each value is a combining mark that
    # attaches to the letter placed immediately before it.
    # The acute tone uses U+0301 rather than U+0341: U+0341 is a discouraged
    # tone-mark alias that canonically decomposes to U+0301, and the grave tone
    # already uses the standard U+0300 rather than its alias U+0340.
    "tone": {
        "-T": "\u0301",   # sắc: COMBINING ACUTE ACCENT
        "-TS": "\u0303",  # ngã: COMBINING TILDE
        "-S": "\u0300",   # huyền: COMBINING GRAVE ACCENT
        "-D": "\u0309",   # hỏi: COMBINING HOOK ABOVE
        "-Z": "\u0323",   # nặng: COMBINING DOT BELOW
        "": "",           # ngang: no mark
    },
 }

 def lookup(key):
    """Translate a one-stroke outline into a Vietnamese syllable.

    Args:
        key: Sequence of stroke strings; only ``key[0]`` is read.

    Returns:
        The initial consonant, the vowel letters (with the tone mark merged
        in by get_tone_vowel) and the final consonant, concatenated.

    Raises:
        KeyError: If any of the four chord groups of the stroke has no entry
            in ``chord_space``.
    """
    # Break the stroke into its initial / vowel / final / tone chords
    chords = split_stroke(key[0])

    # Reject the stroke as soon as one group is not a known chord
    group_names = ("initial", "vowel", "final", "tone")
    if any(chords[name] not in chord_space[name] for name in group_names):
        raise KeyError

    # The consonants map straight through the tables; only the vowel needs
    # the tone diacritic merged into it
    onset = chord_space["initial"][chords["initial"]]
    nucleus = get_tone_vowel(chords["vowel"], chords["final"], chords["tone"])
    coda = chord_space["final"][chords["final"]]

    return onset + nucleus + coda

 # Split a stroke into the 4 groups of interest
 def split_stroke(stroke):
    """Split a stroke into its initial, vowel, final and tone chords.

    Args:
        stroke: One stroke in steno order, e.g. ``"STPHEURL"`` or ``"TKPW-FS"``.

    Returns:
        A dict with the keys ``"initial"``, ``"vowel"``, ``"final"`` and
        ``"tone"``. A group that was not pressed is ``""``. Vowel chords that
        start with F or R, and every non-empty final or tone chord, carry a
        leading hyphen so they match the keys used in ``chord_space``.

    Raises:
        KeyError: If the stroke contains anything outside the recognised key
            layout (for example the number bar or digits).
    """
    # Every part of the pattern is optional, so the WHOLE stroke must be
    # consumed. A prefix match would turn "#S" into four empty groups and
    # "K3W" into a plain "K".
    parsed = re.fullmatch(
        r'(S?T?K?P?W?H?R?)(A?O?)-?(\*?)(E?U?)(F?R?)(P?B?L?G?)(T?S?D?Z?)',
        stroke,
    )
    if parsed is None:
        raise KeyError(stroke)

    left, thumbs_left, star, thumbs_right, glide, coda, tone = parsed.groups()

    # Consolidate the 7 regex groups into the 4 chord groups that are needed
    groups = {
        "initial": left,
        "vowel": thumbs_left + star + thumbs_right + glide,
        "final": coda,
        "tone": tone,
    }

    # Chords that begin with a right-hand key are written with a leading hyphen
    for name in ("vowel", "final", "tone"):
        chord = groups[name]
        if chord and chord[0] not in "AOEU*":
            groups[name] = "-" + chord

    return groups

 # Using a vowel chord and a tone chord as input, return the vowel letters with the correct tone diacritic
 def get_tone_vowel(vowel_chord, final_chord, tone_chord):
    """Return the letters of a vowel chord with the tone diacritic merged in.

    Args:
        vowel_chord: Key into ``chord_space["vowel"]``.
        final_chord: Final-consonant chord; only whether it is empty matters.
        tone_chord: Key into ``chord_space["tone"]``.

    Returns:
        The vowel letters with the tone's combining mark inserted right after
        the letter that carries it, or the bare vowel letters when the tone
        chord maps to no mark.

    Raises:
        KeyError: If ``vowel_chord`` or ``tone_chord`` is not in ``chord_space``.
    """
    tone = chord_space["tone"][tone_chord]
    vowel_letters = chord_space["vowel"][vowel_chord]

    # Nothing to merge when there is no tone mark
    if not tone:
        return vowel_letters

    # The tone is a combining mark, so inserting it at index k places it on
    # letter k - 1.
    letter_count = len(vowel_letters)

    if letter_count <= 1:
        # One letter: the mark goes on that letter (index 1).
        # No letters: the mark comes first (index 0), so it lands on the
        # preceding initial consonant (useful for words like gì).
        tone_position = letter_count
    else:
        # Letters that already carry a diacritic attract the tone. When
        # several are present the last one in this list wins, which puts the
        # tone on ơ in the ươ cluster.
        accented = [
            letter
            for letter in ("ă", "â", "ê", "ô", "ư", "ơ")
            if letter in vowel_letters
        ]
        if accented:
            tone_position = vowel_letters.find(accented[-1]) + 1
        elif final_chord != "":
            # An ending consonant follows: tone on the last vowel letter
            tone_position = letter_count
        else:
            # No ending consonant: tone on the second-last vowel letter
            tone_position = letter_count - 1

    # Merge the vowel letters and diacritic in the right position
    return vowel_letters[:tone_position] + tone + vowel_letters[tone_position:]


 # for i in ["T", "TS", "S", "D", "Z"]:
 #     print(lookup(["STPHEURL" + i]))
	# This is a (work-in-progress) python dictionary implementation of my Vietnamese steno system: https://github.com/aerickt/steno-dictionaries/wiki/Vietnamese-Steno
	# As this dictionary generates words on the fly, it is not (or shouldn't be) necessary to add missing entries.
	# Therefore, it is a better implementation of my Vietnamese steno system than the json dictionary.
	# However, this is very work-in-progress. I particularly don't have a great way to test the tone placement system.
	# This dictionary also has some minor differences to the viet.json implementation. For words such as qua, this system
	# only considers the q (KW) as a consonant -- you'll have to consider the ua as a vowel. Alternatively, you can treat qu
	# as a separate initial consonant (KWR) and pair it with a different vowel chord (a instead of ua).
	# This dictionary also does not support fingerspelling yet. However, a json dictionary may be provided instead to keep this
	# python dictionary readable (I'm not sure if I am able to add fingerspelling in a competent way).

	import re

	# Only consider single stroke outlines: lookup() reads nothing but key[0],
	# so an outline handled by this module is never longer than one stroke.
	# NOTE(review): presumably the host steno engine reads LONGEST_KEY to cap how
	# many strokes it passes to lookup() -- confirm against the plugin docs.
	LONGEST_KEY = 1

	# Lookup tables that map each chord of a stroke to the text it produces.
	# There is one table per chord group ("initial", "vowel", "final", "tone").
	# Chords made only of right-hand keys are written with a leading hyphen, which
	# is the form split_stroke() produces. The empty key in every table stands for
	# "this group was not pressed".
	chord_space = {

	    # Initial consonant chords (left-hand keys)
	    "initial": {
	        "S": "s",
	        "STKPW": "gi",
	        "STPH": "ng",
	        "STPHR": "ngh",
	        "SR": "v",
	        "T": "t",
	        "TK": "đ",
	        "TKP": "d",
	        "TKPW": "g",
	        "TKPWH": "gh",
	        "TKR": "đr",
	        "TP": "ph",
	        "TPH": "n",
	        "TPHR": "nh",
	        "TH": "th",
	        "TR": "tr",
	        "K": "k",
	        "KP": "x",
	        "KW": "q",
	        "KWR": "qu",
	        "KH": "kh",
	        "KHR": "ch",
	        "KR": "c",
	        "P": "p",
	        "PW": "b",
	        "PH": "m",
	        "H": "h",
	        "HR": "l",
	        "R": "r",
	        "": "",
	    },

	    # Vowel chords (thumb keys, the asterisk, and right-hand F/R)
	    "vowel": {
	        "A": "a",
	        "AO": "ao",
	        "AOE": "ưu",
	        "AOEU": "oo",
	        "AOEUFR": "uyu",
	        "AOEUR": "uya",
	        "AOU": "oa",
	        "AOF": "oai",
	        "AOFR": "oă",
	        "AOR": "oay",
	        "A*R": "ya",
	        "AE": "ê",
	        "AEU": "êu",
	        "AEUR": "yêu",
	        "AEF": "iê",
	        "AEFR": "uê",
	        "AER": "yê",
	        "AU": "au",
	        "AUF": "ia",
	        "AUFR": "uă",
	        "AUR": "ưa",
	        "AF": "ai",
	        "AFR": "ă",
	        "AR": "ay",
	        "O": "o",
	        "O*E": "oe",
	        "O*UR": "uơ",
	        "OE": "ô",
	        "OEU": "uô",
	        "OEUF": "uôi",
	        "OEUFR": "uâ",
	        "OEF": "ôi",
	        "OER": "oeo",
	        "OU": "ơ",
	        "OUF": "ơi",
	        "OUFR": "ươi",
	        "OUR": "ươ",
	        "OF": "oi",
	        "OFR": "ươu",
	        "OR": "ua",
	        "*EUFR": "uây",
	        "E": "e",
	        "EU": "iêu",
	        "EUF": "ưi",
	        "EUFR": "âu",
	        "EUR": "uyê",
	        "EF": "eo",
	        "EFR": "â",
	        "ER": "ây",
	        "U": "u",
	        "UF": "ui",
	        "UFR": "uy",
	        "UR": "ư",
	        "-F": "i",
	        "-FR": "iu",
	        "-R": "y",
	        "": "",
	    },

	    # Ending consonant chords (right-hand P/B/L/G)
	    "final": {
	        "-P": "p",
	        "-PB": "nh",
	        "-PBLG": "ch",
	        "-PL": "m",
	        "-PLG": "mh",
	        "-B": "h",
	        "-BG": "c",
	        "-L": "n",
	        "-LG": "ng",
	        "-G": "t",
	        "": "",
	    },

	    # Tone chords (right-hand T/S/D/Z). Each value is a combining mark that
	    # attaches to the letter placed immediately before it.
	    # The acute tone uses U+0301 rather than U+0341: U+0341 is a discouraged
	    # tone-mark alias that canonically decomposes to U+0301, and the grave tone
	    # already uses the standard U+0300 rather than its alias U+0340.
	    "tone": {
	        "-T": "\u0301",   # sắc: COMBINING ACUTE ACCENT
	        "-TS": "\u0303",  # ngã: COMBINING TILDE
	        "-S": "\u0300",   # huyền: COMBINING GRAVE ACCENT
	        "-D": "\u0309",   # hỏi: COMBINING HOOK ABOVE
	        "-Z": "\u0323",   # nặng: COMBINING DOT BELOW
	        "": "",           # ngang: no mark
	    },
	}

	def lookup(key):
	    """Translate a one-stroke outline into a Vietnamese syllable.

	    Args:
	        key: Sequence of stroke strings; only ``key[0]`` is read.

	    Returns:
	        The initial consonant, the vowel letters (with the tone mark merged
	        in by get_tone_vowel) and the final consonant, concatenated.

	    Raises:
	        KeyError: If any of the four chord groups of the stroke has no entry
	            in ``chord_space``.
	    """
	    # Break the stroke into its initial / vowel / final / tone chords
	    chords = split_stroke(key[0])

	    # Reject the stroke as soon as one group is not a known chord
	    group_names = ("initial", "vowel", "final", "tone")
	    if any(chords[name] not in chord_space[name] for name in group_names):
	        raise KeyError

	    # The consonants map straight through the tables; only the vowel needs
	    # the tone diacritic merged into it
	    onset = chord_space["initial"][chords["initial"]]
	    nucleus = get_tone_vowel(chords["vowel"], chords["final"], chords["tone"])
	    coda = chord_space["final"][chords["final"]]

	    return onset + nucleus + coda

	# Split a stroke into the 4 groups of interest
	def split_stroke(stroke):
	    """Split a stroke into its initial, vowel, final and tone chords.

	    Args:
	        stroke: One stroke in steno order, e.g. ``"STPHEURL"`` or ``"TKPW-FS"``.

	    Returns:
	        A dict with the keys ``"initial"``, ``"vowel"``, ``"final"`` and
	        ``"tone"``. A group that was not pressed is ``""``. Vowel chords that
	        start with F or R, and every non-empty final or tone chord, carry a
	        leading hyphen so they match the keys used in ``chord_space``.

	    Raises:
	        KeyError: If the stroke contains anything outside the recognised key
	            layout (for example the number bar or digits).
	    """
	    # Every part of the pattern is optional, so the WHOLE stroke must be
	    # consumed. A prefix match would turn "#S" into four empty groups and
	    # "K3W" into a plain "K".
	    parsed = re.fullmatch(
	        r'(S?T?K?P?W?H?R?)(A?O?)-?(\*?)(E?U?)(F?R?)(P?B?L?G?)(T?S?D?Z?)',
	        stroke,
	    )
	    if parsed is None:
	        raise KeyError(stroke)

	    left, thumbs_left, star, thumbs_right, glide, coda, tone = parsed.groups()

	    # Consolidate the 7 regex groups into the 4 chord groups that are needed
	    groups = {
	        "initial": left,
	        "vowel": thumbs_left + star + thumbs_right + glide,
	        "final": coda,
	        "tone": tone,
	    }

	    # Chords that begin with a right-hand key are written with a leading hyphen
	    for name in ("vowel", "final", "tone"):
	        chord = groups[name]
	        if chord and chord[0] not in "AOEU*":
	            groups[name] = "-" + chord

	    return groups

	# Using a vowel chord and a tone chord as input, return the vowel letters with the correct tone diacritic
	def get_tone_vowel(vowel_chord, final_chord, tone_chord):
	    """Return the letters of a vowel chord with the tone diacritic merged in.

	    Args:
	        vowel_chord: Key into ``chord_space["vowel"]``.
	        final_chord: Final-consonant chord; only whether it is empty matters.
	        tone_chord: Key into ``chord_space["tone"]``.

	    Returns:
	        The vowel letters with the tone's combining mark inserted right after
	        the letter that carries it, or the bare vowel letters when the tone
	        chord maps to no mark.

	    Raises:
	        KeyError: If ``vowel_chord`` or ``tone_chord`` is not in ``chord_space``.
	    """
	    tone = chord_space["tone"][tone_chord]
	    vowel_letters = chord_space["vowel"][vowel_chord]

	    # Nothing to merge when there is no tone mark
	    if not tone:
	        return vowel_letters

	    # The tone is a combining mark, so inserting it at index k places it on
	    # letter k - 1.
	    letter_count = len(vowel_letters)

	    if letter_count <= 1:
	        # One letter: the mark goes on that letter (index 1).
	        # No letters: the mark comes first (index 0), so it lands on the
	        # preceding initial consonant (useful for words like gì).
	        tone_position = letter_count
	    else:
	        # Letters that already carry a diacritic attract the tone. When
	        # several are present the last one in this list wins, which puts the
	        # tone on ơ in the ươ cluster.
	        accented = [
	            letter
	            for letter in ("ă", "â", "ê", "ô", "ư", "ơ")
	            if letter in vowel_letters
	        ]
	        if accented:
	            tone_position = vowel_letters.find(accented[-1]) + 1
	        elif final_chord != "":
	            # An ending consonant follows: tone on the last vowel letter
	            tone_position = letter_count
	        else:
	            # No ending consonant: tone on the second-last vowel letter
	            tone_position = letter_count - 1

	    # Merge the vowel letters and diacritic in the right position
	    return vowel_letters[:tone_position] + tone + vowel_letters[tone_position:]


	# for i in ["T", "TS", "S", "D", "Z"]:
	# print(lookup(["STPHEURL" + i]))