Last active
November 25, 2018 16:30
-
-
Save goophile/e0b5272bc04b5c4c2245161e63c81085 to your computer and use it in GitHub Desktop.
Get a list of words from a text file.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
import os
import re
import shlex
import subprocess
import sys
from collections import Counter, OrderedDict
# Directory that holds this script (symlinks resolved); the word-list files
# below are looked up next to it.
CUR_DIR = os.path.dirname(os.path.realpath(__file__))
# Every character in this string is replaced by a space in read_words() before
# a line is split into words: ASCII punctuation, the digits and the space itself.
MARKS = r""" ,./?;':"|\<>[]{}()-_=+*&^%$#@!~0987654321` """
# Text file of words the reader already knows; main() drops them from its output.
KNOWN_FILE = os.path.join(CUR_DIR, 'known_words.txt')
# Text file of names; main() drops them from its output as well.
NAME_FILE = os.path.join(CUR_DIR, 'names.txt')
def bash_cmd(c):
    """
    Execute ``c`` through ``bash -c`` and report how it went.

    stderr is folded into stdout, so the returned text holds everything the
    command printed. Returns an ``(exit_code, output_text)`` tuple.
    """
    finished = subprocess.run(
        ['bash', '-c', c],
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        universal_newlines=True,
        shell=False,
    )
    return (int(finished.returncode), finished.stdout)
def read_words(file_path, marks=None):
    """
    Count the words of a text file, most frequent first.

    Every character listed in ``marks`` is treated as a separator, the text is
    lower-cased, and words shorter than 3 characters are dropped.

    :param file_path: path of the text file to read (decoded as UTF-8).
    :param marks: string of characters to blank out before splitting;
                  defaults to the module-level MARKS.
    :return: an OrderedDict mapping word -> number of occurrences, ordered by
             descending count. Words with the same count keep the order in
             which they first appear in the file.
    """
    if marks is None:
        marks = MARKS
    # A single translate() pass replaces one str.replace() call per mark per line.
    blank_marks = str.maketrans(marks, ' ' * len(marks))

    counts = Counter()
    # Explicit encoding: the result must not depend on the platform's locale.
    with open(file_path, encoding='utf-8') as f:
        # Stream the file instead of loading every line into memory first.
        for line in f:
            counts.update(
                word
                for word in line.translate(blank_marks).lower().split()
                if len(word) > 2
            )

    # most_common() sorts by count, descending, and is stable for ties.
    return OrderedDict(counts.most_common())
def _parse_wordnet_overview(output):
    """
    Turn the text printed by ``wordnet <word> -over`` into result tuples.

    Returns a list of ``(word_type, basic_form, meanings)`` tuples, one per
    "Overview of <type> <basic form>" section found in ``output``.
    """
    # We only handle 4 types of words here.
    word_types = ('noun', 'verb', 'adj', 'adv')
    numbered_line = re.compile(r'^\d+\. (.*)')

    results = []
    # The leading '\n' lets the first section match whether or not the output
    # itself starts with a newline.
    for group in ('\n' + output).split('\nOverview of '):
        lines = group.splitlines()
        if not lines:
            continue
        header = lines[0].split()
        # Text that is not a "<type> <basic form>" header (leading blank text,
        # an error message, ...) must not be reported as a word.
        if len(header) < 2 or header[0] not in word_types:
            continue
        word_type, basic_form = header[0], header[1]
        # Meanings are the numbered lines of the section: "1. ...", "2. ...".
        meanings = []
        for line in lines[1:]:
            found = numbered_line.match(line)
            if found:
                meanings.append(found.group(1))
        results.append((word_type, basic_form, meanings))
    return results


def search_wordnet(word):
    """
    Look a word up with the ``wordnet`` command line tool.

    WordNet can handle tenses, comparatives and plurals, etc.
    The exit status of the cmd wordnet is the number of matches
    (0 if no match, 255 -- i.e. -1 -- on a database error).
    We only handle 4 types of words here: noun/verb/adj/adv.

    Some output examples:
        1) Overview of noun girl
           The noun girl has 5 senses (first 5 from tagged texts)
        2) Overview of noun walk
           The noun walk has 7 senses (first 6 from tagged texts)
           Overview of verb walk
           The verb walk has 10 senses (first 5 from tagged texts)
        3) Overview of adj able
           The adj able has 4 senses (first 3 from tagged texts)
        4) Overview of adv well
           The adv well has 13 senses (first 8 from tagged texts)

    Return value: None when the word has no match, otherwise a list of tuples.
    Each tuple has the type, basic form (remove tenses/plurals...), and a list
    of all their meanings.
        [
            ('noun', 'basic form', ['first meaning', 'second meaning', ]),
            ('verb', 'basic form', ['first meaning', 'second meaning', 'third meaning']),
            ('adj' , 'basic form', []),
            ('adv' , 'basic form', ['first meaning', ]),
        ]

    Raises Exception when the WordNet database cannot be opened.
    """
    # The command is run through bash, so quote the word: shell metacharacters
    # in it must not be able to change the command.
    rc, output = bash_cmd('wordnet {word} -over'.format(word=shlex.quote(word)))

    # no match
    if rc == 0:
        return None

    # database not found
    if rc == 255:
        print(output)
        raise Exception('Fatal error - cannot open WordNet database')

    return _parse_wordnet_overview(output)
def _test_wordnet():
    """
    Manual smoke test: look up the word 'better' and print every result tuple.
    """
    word = 'better'
    # search_wordnet() returns None when nothing matches; print nothing then
    # instead of failing on iteration over None.
    for return_tuple in search_wordnet(word) or []:
        print('======>>>')
        print(return_tuple)
        print('<<<======')
def main(show_meanings=False):
    """
    Print the new words of the text file named by the first command-line argument.

    Every word of the file is looked up with search_wordnet() to get its basic
    form; basic forms found in KNOWN_FILE or NAME_FILE are dropped, and the
    remaining ones longer than 2 characters are printed one per line, in the
    frequency order of the words that produced them.

    :param show_meanings: when true, additionally print every remaining basic
                          form followed by its meanings.
    """
    if len(sys.argv) < 2:
        # Exit with a usage message rather than a bare IndexError.
        sys.exit('usage: {} TEXT_FILE'.format(sys.argv[0]))

    known_words = read_words(KNOWN_FILE)
    name_words = read_words(NAME_FILE)
    all_words = read_words(sys.argv[1])

    basic_words = OrderedDict()
    for word in all_words:
        search_result = search_wordnet(word)
        if not search_result:
            # word meaning not found
            continue
        for _word_type, basic_form, meanings in search_result:
            # The same basic form can be reported for several word types (and
            # for several inflected words): merge the meanings instead of
            # letting the last result overwrite the earlier ones.
            merged = basic_words.setdefault(basic_form, [])
            merged.extend(m for m in meanings if m not in merged)

    for word in list(known_words) + list(name_words):
        basic_words.pop(word, None)

    new_words = [word for word in basic_words if len(word) > 2]
    for word in new_words:
        print(word)

    if not show_meanings:
        return

    for word in basic_words:
        print('\n===>>> {} <<<===\n'.format(word))
        for meaning in basic_words[word]:
            print(' {}\n'.format(meaning))
# Script entry point: only runs when the file is executed directly.
if __name__ == '__main__':
    # _test_wordnet()  # uncomment for a manual smoke test of the WordNet lookup
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment