Generate text with a Character-level Language Model
# Based entirely on http://nbviewer.ipython.org/gist/yoavg/d76121dfde2618422139 by Yoav Goldberg
# inputfile.txt (in the train_char_lm call at the bottom): the corpus you want to learn & generate from
# nletters=1000 (the generate_text default): how many characters you want to generate
# order=10 (in the train_char_lm call): the history length (higher generally gives better
#   output, with diminishing returns above 7ish)
# print(generate_text(lm, 10)): 10 is the order again, but for generation
from collections import defaultdict, Counter
from random import random

def train_char_lm(fname, order=4):
    data = open(fname).read()
    lm = defaultdict(Counter)
    pad = "~" * order
    data = pad + data
    # Count how often each character follows each order-length history.
    for i in range(len(data) - order):
        history, char = data[i:i + order], data[i + order]
        lm[history][char] += 1
    # Turn raw counts into probabilities.
    def normalize(counter):
        s = float(sum(counter.values()))
        return [(c, cnt / s) for c, cnt in counter.items()]
    outlm = {hist: normalize(chars) for hist, chars in lm.items()}
    return outlm

def generate_letter(lm, history, order):
    history = history[-order:]
    dist = lm[history]
    # Roulette-wheel sampling: draw a character in proportion to its probability.
    x = random()
    for c, v in dist:
        x = x - v
        if x <= 0:
            return c
    return c  # fall back to the last character if rounding left x > 0

def generate_text(lm, order, nletters=1000):
    history = "~" * order
    out = []
    for i in range(nletters):
        c = generate_letter(lm, history, order)
        history = history[-order:] + c
        out.append(c)
    return "".join(out)

lm = train_char_lm("inputfile.txt", order=10)
print(generate_text(lm, 10))
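As a quick sanity check (not part of the original gist), here is a minimal sketch that trains on a tiny throwaway corpus and inspects the model structure; the file name tiny.txt and the toy string are assumptions made just for this demo. It shows that train_char_lm maps each history to a list of (character, probability) pairs, which is exactly what the sampling loop in generate_letter consumes.

# Demo only: writes a throwaway corpus file and assumes the functions above are defined.
with open("tiny.txt", "w") as f:
    f.write("abcabd" * 3)

lm = train_char_lm("tiny.txt", order=2)
# "ab" is followed by "c" and "d" equally often in the toy corpus:
print(lm["ab"])                            # [('c', 0.5), ('d', 0.5)] (pair order may vary)
print(generate_text(lm, 2, nletters=20))   # e.g. abcabdabcabcabdabdab

With a corpus this small and an order of 2, the output is nearly a copy of the input; on real text, higher orders trade memorization for fluency, which is why the gist defaults to order=10.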