Created
March 13, 2015 18:17
-
-
Save drussellmrichie/ee9ebe63610553bd2d27 to your computer and use it in GitHub Desktop.
Lookup tagger from Chp 5 of NLTK book
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Natural Language Toolkit: code_baseline_tagger
# functions from http://www.nltk.org/book/ch05.html
from nltk.corpus import brown | |
import nltk | |
def performance(cfd, wordlist):
    """Score a lookup tagger built from ``wordlist`` on the Brown news text.

    Each word in ``wordlist`` is mapped to ``cfd[word].max()`` and the
    resulting table is used as the model of a ``nltk.UnigramTagger``; words
    missing from the table are handled by a ``nltk.DefaultTagger('NN')``
    backoff.

    Args:
        cfd: Mapping indexed by word whose values expose ``max()``; the
            caller in this file passes a ``nltk.ConditionalFreqDist`` built
            from tagged words, so ``max()`` is the word's most frequent tag.
        wordlist: Iterable of words to include in the lookup table.

    Returns:
        The score reported by the tagger for
        ``brown.tagged_sents(categories='news')``.
    """
    lookup = {word: cfd[word].max() for word in wordlist}
    baseline_tagger = nltk.UnigramTagger(
        model=lookup, backoff=nltk.DefaultTagger('NN')
    )
    gold = brown.tagged_sents(categories='news')
    # Newer NLTK releases deprecate ``evaluate`` in favour of ``accuracy``;
    # prefer the new name and fall back so older releases keep working.
    score = getattr(baseline_tagger, 'accuracy', None)
    if score is None:
        score = baseline_tagger.evaluate
    return score(gold)
def display():
    """Plot lookup-tagger performance against model size.

    Builds lookup taggers from the 1, 2, 4, ..., 2**14 most frequent words
    of the Brown 'news' category, scores each with ``performance``, and
    shows the scores in a pylab line plot.
    """
    import pylab

    # Iterating a FreqDist is not guaranteed to yield words by decreasing
    # frequency (in NLTK 3 it behaves like a Counter), so rank explicitly;
    # otherwise the slices below would not be the most frequent words.
    freq_dist = nltk.FreqDist(brown.words(categories='news'))
    words_by_freq = [word for word, _count in freq_dist.most_common()]

    cfd = nltk.ConditionalFreqDist(brown.tagged_words(categories='news'))

    # Model sizes double at each step: 1, 2, 4, ..., 2**14.
    sizes = 2 ** pylab.arange(15)
    perfs = [performance(cfd, words_by_freq[:size]) for size in sizes]

    pylab.plot(sizes, perfs, '-bo')
    pylab.title('Lookup Tagger Performance with Varying Model Size')
    pylab.xlabel('Model Size')
    pylab.ylabel('Performance')
    pylab.show()
# Run the experiment and show the plot when this file is executed.
display()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment