Generate text with a Character-level Language Model
# Based entirely on http://nbviewer.ipython.org/gist/yoavg/d76121dfde2618422139 by Yoav Goldberg
# inputfile.txt (in the train_char_lm call at the bottom): the corpus you want to learn & generate from
# nletters=1000 (the generate_text default): how many characters you want to generate
# order=10 (in the train_char_lm call): the history length (higher generally gives better
#   output, with diminishing returns above 7ish)
# print(generate_text(lm, 10)): 10 is the order again, but for generation
from collections import defaultdict, Counter
from random import random

def train_char_lm(fname, order=4):
    data = open(fname).read()
    lm = defaultdict(Counter)
    pad = "~" * order
    data = pad + data
    # Count how often each character follows each order-length history.
    for i in range(len(data) - order):
        history, char = data[i:i + order], data[i + order]
        lm[history][char] += 1
    # Turn raw counts into probabilities.
    def normalize(counter):
        s = float(sum(counter.values()))
        return [(c, cnt / s) for c, cnt in counter.items()]
    outlm = {hist: normalize(chars) for hist, chars in lm.items()}
    return outlm

def generate_letter(lm, history, order):
    history = history[-order:]
    dist = lm[history]
    # Roulette-wheel sampling: draw a character in proportion to its probability.
    x = random()
    for c, v in dist:
        x = x - v
        if x <= 0:
            return c
    return c  # fall back to the last character if rounding left x > 0

def generate_text(lm, order, nletters=1000):
    history = "~" * order
    out = []
    for i in range(nletters):
        c = generate_letter(lm, history, order)
        history = history[-order:] + c
        out.append(c)
    return "".join(out)

lm = train_char_lm("inputfile.txt", order=10)
print(generate_text(lm, 10))
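As a quick sanity check (not part of the original gist), here is a minimal sketch that trains on a tiny throwaway corpus and inspects the model structure; the file name tiny.txt and the toy string are assumptions made just for this demo. It shows that train_char_lm maps each history to a list of (character, probability) pairs, which is exactly what the sampling loop in generate_letter consumes.

# Demo only: writes a throwaway corpus file and assumes the functions above are defined.
with open("tiny.txt", "w") as f:
    f.write("abcabd" * 3)

lm = train_char_lm("tiny.txt", order=2)
# "ab" is followed by "c" and "d" equally often in the toy corpus:
print(lm["ab"])                            # [('c', 0.5), ('d', 0.5)] (pair order may vary)
print(generate_text(lm, 2, nletters=20))   # e.g. abcabdabcabcabdabdab

With a corpus this small and an order of 2, the output is nearly a copy of the input; on real text, higher orders trade memorization for fluency, which is why the gist defaults to order=10.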