Created
June 21, 2024 11:49
-
-
Save M0r13n/3144d244aef4a0a40e8adad9154e5868 to your computer and use it in GitHub Desktop.
word count like using `wc` using async state machine parsing. Inspired by: https://github.com/robertdavidgraham/wc2
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
WAS_SPACE = 0 | |
NEW_LINE = 1 | |
NEW_WORD = 2 | |
WAS_WORD = 3 | |
SPACES = [9,10,11,12,13,32] | |
NEWLINE = 10 | |
def init_table(): | |
# 0 => was space | |
# 1 => was line | |
# 2 => new word | |
# 3 => was word | |
table = [0,] * 1024 | |
# Transitions when not a space | |
for c in range(256): | |
table[WAS_SPACE* 256 + c] = NEW_WORD | |
table[NEW_LINE * 256 + c] = NEW_WORD | |
table[NEW_WORD * 256 + c] = WAS_WORD | |
table[WAS_WORD * 256 + c] = WAS_WORD | |
# Transitions when space | |
for c in SPACES: | |
table[WAS_SPACE* 256 + c] = WAS_SPACE; | |
table[NEW_LINE * 256 + c] = WAS_SPACE; | |
table[NEW_WORD * 256 + c] = WAS_SPACE; | |
table[WAS_WORD * 256 + c] = WAS_SPACE; | |
# Transitions when newline | |
table[WAS_SPACE* 256 + NEWLINE] = NEW_LINE; | |
table[NEW_LINE * 256 + NEWLINE] = NEW_LINE; | |
table[NEW_WORD * 256 + NEWLINE] = NEW_LINE; | |
table[WAS_WORD * 256 + NEWLINE] = NEW_LINE; | |
return table | |
if __name__ == '__main__': | |
# State Machine | |
table = init_table() | |
# Counts: num spaces, num new lines, num new words, num chars of words | |
counts = [0,0,0,0] | |
# Initial state | |
state = WAS_SPACE | |
# Read & parse | |
# TODO: read in chunks for better efficiency | |
with open('./test.txt', 'rb') as fd: | |
for c in fd.read(): | |
state = table[state * 256 + c] | |
counts[state] += 1 | |
print(f'{counts[1]} {counts[2]} {sum(counts)}') |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment