Memory Network implementation

import cPickle

import numpy as np
import tensorflow as tf

from data_utils_ubuntu import CustomRunner


def position_encoding(sentence_size, embedding_size):
    """
    Position Encoding described in section 4.1 [1]
    """
    encoding = np.ones((embedding_size, sentence_size), dtype=np.float32)
    ls = sentence_size + 1
    le = embedding_size + 1
    for i in range(1, le):
        for j in range(1, ls):
            encoding[i-1, j-1] = (i - (le-1)/2) * (j - (ls-1)/2)
    encoding = 1 + 4 * encoding / embedding_size / sentence_size
    return np.transpose(encoding)
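

# Illustrative sketch, not part of the original gist: the returned matrix has shape
# (sentence_size, embedding_size), so it broadcasts against a batch of embedded,
# padded sentences of shape (batch, sentence_size, embedding_size) before the sum
# over the word axis, which is how it is used in MemN2N._get_network_output below.
# The sizes here (20 words, 64-dimensional embeddings) are arbitrary example values.
#
#   enc = position_encoding(sentence_size=20, embedding_size=64)  # ndarray, shape (20, 64)
#   sentence_vec = np.sum(word_embeddings * enc, axis=1)          # word_embeddings assumed (batch, 20, 64)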


def add_gradient_noise(t, stddev=1e-3, name=None):
    """
    Adds gradient noise as described in http://arxiv.org/abs/1511.06807 [2].
    The input Tensor `t` should be a gradient.
    The output will be `t` + gaussian noise.
    0.001 was said to be a good fixed value for memory networks [2].
    """
    with tf.op_scope([t, stddev], name, "add_gradient_noise") as name:
        t = tf.convert_to_tensor(t, name="t")
        gn = tf.random_normal(tf.shape(t), stddev=stddev)
        return tf.add(t, gn, name=name)
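

# Hedged illustration, not part of the original gist: add_gradient_noise is currently
# unused (the corresponding line in MemN2N.__init__ is commented out), but it would
# slot into the gradient pipeline between clipping and apply_gradients, e.g.:
#
#   grads_and_vars = self._opt.compute_gradients(loss_op)
#   grads_and_vars = [(add_gradient_noise(tf.clip_by_norm(g, self._max_grad_norm)), v)
#                     for g, v in grads_and_vars]
#   train_op = self._opt.apply_gradients(grads_and_vars, name="train_op")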


class MemN2N(object):
    """End-To-End Memory Network."""

    def __init__(self, batch_size, sentence_size, memory_size, embedding_size, vocab_file, train_file, test_file,
                 hops=2,
                 max_grad_norm=10.0,
                 initializer=tf.contrib.layers.xavier_initializer(dtype=tf.float32),
                 encoding=position_encoding,
                 l2=0.02,
                 tensor_board=True,
                 opt='adam',
                 name='MemN2N'):
"""Creates an End-To-End Memory Network | |
Args: | |
batch_size: The size of the batch. | |
sentence_size: The max size of a sentence in the data. All sentences should be padded | |
to this length. If padding is required it should be done with nil one-hot encoding (0). | |
memory_size: The max size of the memory. Since Tensorflow currently does not support jagged arrays | |
all memories must be padded to this length. If padding is required, the extra memories should be | |
empty memories; memories filled with the nil word ([0, 0, 0, ......, 0]). | |
embedding_size: The size of the word embedding. | |
hops: The number of hops. A hop consists of reading and addressing a memory slot. | |
Defaults to `2`. | |
max_grad_norm: Maximum L2 norm clipping value. Defaults to `40.0`. | |
initializer: Weight initializer. Defaults to `tf.random_normal_initializer(stddev=0.1)`. | |
encoding: A function returning a 2D Tensor (sentence_size, embedding_size). Defaults to `position_encoding`. | |
name: Name of the End-To-End Memory Network. Defaults to `MemN2N`. | |
""" | |
        with open(vocab_file, 'rb') as f:
            self.vocab = cPickle.load(f)
        self._vocab_size = len(self.vocab)
        self._batch_size = batch_size
        self._sentence_size = sentence_size
        self._memory_size = memory_size
        self._embedding_size = embedding_size
        self._hops = hops
        self._max_grad_norm = max_grad_norm
        self._init = initializer
        self._name = name
        self._l2 = l2

        with tf.device("/cpu:0"):
            self.train_runner = CustomRunner(batch_size, memory_size, sentence_size, self.vocab, train_file)
            self.test_runner = CustomRunner(batch_size, memory_size, sentence_size, self.vocab, test_file, is_test=True)
        train_story_var, train_question_var, train_answer_var, train_label_var = self.train_runner.get_inputs()
        test_story_var, test_question_var, test_answer_var, test_label_var = self.test_runner.get_inputs()

        self._build_vars()
        self._learning_rate = tf.placeholder(tf.float32)
        self._encoding = tf.constant(encoding(self._sentence_size, self._embedding_size), name="encoding")

        if opt == 'momentum':
            self._opt = tf.train.MomentumOptimizer(self._learning_rate, 0.97, use_nesterov=True)
        elif opt == 'sgd':
            self._opt = tf.train.GradientDescentOptimizer(self._learning_rate)
        else:
            self._opt = tf.train.AdamOptimizer(learning_rate=self._learning_rate, epsilon=1e-8)

        logits = self._get_network_output(train_story_var, train_question_var, train_answer_var)

        # Calculate the binary cross-entropy loss
        train_loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits, tf.to_float(train_label_var)),
                                    name="train_mean_loss")
        reg_loss = self._l2 * tf.add_n(tf.get_collection('reg_loss'))
        loss_op = train_loss + reg_loss

        # gradient pipeline with noise adding
        grads_and_vars = self._opt.compute_gradients(loss_op)
        grads_and_vars = [(tf.clip_by_norm(g, self._max_grad_norm), v) for g, v in grads_and_vars]
        # grads_and_vars = [(add_gradient_noise(g), v) for g, v in grads_and_vars]
        train_op = self._opt.apply_gradients(grads_and_vars, name="train_op")

        test_logits = self._get_network_output(test_story_var, test_question_var, test_answer_var, is_test=True)
        test_probs = tf.sigmoid(test_logits)
        accuracy = tf.reduce_mean(tf.cast(tf.equal(tf.to_int32(tf.argmax(test_probs, 1)), test_label_var), tf.float32))
        accuracy_top2 = tf.reduce_mean(tf.cast(tf.nn.in_top_k(test_logits, test_label_var, k=2), tf.float32))
        accuracy_top5 = tf.reduce_mean(tf.cast(tf.nn.in_top_k(test_logits, test_label_var, k=5), tf.float32))

        if tensor_board:
            tf.scalar_summary("loss", loss_op)
            tf.scalar_summary('accuracy', accuracy)
            tf.scalar_summary('accuracy_top2', accuracy_top2)
            tf.scalar_summary('accuracy_top5', accuracy_top5)
            # Add histograms for trainable variables.
            for var in tf.trainable_variables():
                tf.histogram_summary(var.op.name, var)
            # Add histograms for gradients.
            for grad, var in grads_and_vars:
                if grad is not None:
                    tf.histogram_summary(var.op.name + '/gradients', grad)

        # Summaries
        self.merged_summaries = tf.merge_all_summaries()

        # assign ops
        self.loss_op = loss_op
        self.train_op = train_op
        self.logits_op = logits
        self.test_logits_op = test_logits
        self.test_probs_op = test_probs
        self.accuracy_op = accuracy
        self.accuracy_top2_op = accuracy_top2
        self.accuracy_top5_op = accuracy_top5

        self._sess = None

    def _build_vars(self):
        with tf.variable_scope(self._name + '_variables'):
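            # Q, A, C and W are the question, memory-input, memory-output and answer
            # embedding tables. Each is prefixed with a fixed all-zero "nil word" row so
            # that index 0 (the padding token) always maps to a zero embedding. TA and TC
            # are temporal encodings added to the memory representations, and H is the
            # linear map applied to the query state between hops.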
            nil_word_slot = tf.zeros([1, self._embedding_size])
            Q = tf.get_variable(name="query_embedding", shape=[self._vocab_size, self._embedding_size],
                                initializer=self._init)
            A = tf.get_variable(name="memory_embedding_in", shape=[self._vocab_size, self._embedding_size],
                                initializer=self._init)
            C = tf.get_variable(name="memory_embedding_out", shape=[self._vocab_size, self._embedding_size],
                                initializer=self._init)
            W = tf.get_variable(name="answers_embedding", shape=[self._vocab_size, self._embedding_size],
                                initializer=self._init)
            self.Q = tf.concat(0, [nil_word_slot, Q])
            self.A = tf.concat(0, [nil_word_slot, A])
            self.C = tf.concat(0, [nil_word_slot, C])
            self.W = tf.concat(0, [nil_word_slot, W])
            self.TA = tf.get_variable(shape=[self._memory_size, self._embedding_size], name='TA',
                                      initializer=self._init)
            self.TC = tf.get_variable(shape=[self._memory_size, self._embedding_size], name='TC',
                                      initializer=self._init)
            self.H = tf.get_variable(shape=[self._embedding_size, self._embedding_size], name='H',
                                     initializer=self._init)
            tf.add_to_collection('reg_loss', tf.nn.l2_loss(Q))
            tf.add_to_collection('reg_loss', tf.nn.l2_loss(A))
            tf.add_to_collection('reg_loss', tf.nn.l2_loss(C))
            tf.add_to_collection('reg_loss', tf.nn.l2_loss(W))
            tf.add_to_collection('reg_loss', tf.nn.l2_loss(self.TA))
            tf.add_to_collection('reg_loss', tf.nn.l2_loss(self.TC))
            tf.add_to_collection('reg_loss', tf.nn.l2_loss(self.H))

    def _get_network_output(self, stories, queries, answers, is_test=False):
        with tf.variable_scope(self._name + ('_train' if not is_test else '_test')):
            # TODO remove C, Q and W matrix, use only A embedding matrix
            q_emb = tf.nn.embedding_lookup(self.Q, queries)  # B x S x E
            u = tf.reduce_sum(q_emb * self._encoding, 1)  # B x E
            # dropout on question
            # if not is_test:
            #     u = tf.nn.dropout(u, keep_prob=0.5, name='question_dropout')
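            # Each hop attends over the memory and updates the query state:
            #   p_i = softmax(u . m_i)     attention weights over memory slots
            #   o   = sum_i p_i * c_i      weighted read from the output memory
            #   u  <- (u + o) H            query state for the next hop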
            for _ in range(self._hops):
                m_emb = tf.nn.embedding_lookup(self.A, stories)  # B x M x S x E
                m = tf.reduce_sum(m_emb * self._encoding, 2) + self.TA  # B x M x E
                # hack to get around no reduce_dot
                u_temp = tf.expand_dims(u, 1)  # B x 1 x E
                dotted = tf.reduce_sum(m * u_temp, 2)  # B x M
                # Calculate probabilities: memory cells weights
                probs = tf.nn.softmax(dotted)
                probs_temp = tf.expand_dims(probs, 1)
                c_emb = tf.nn.embedding_lookup(self.C, stories)
                c = tf.reduce_sum(c_emb * self._encoding, 2) + self.TC  # B x M x E
                c_temp = tf.transpose(c, [0, 2, 1])
                o_k = tf.reduce_sum(c_temp * probs_temp, 2)  # B x E
                u = u + o_k
                u = tf.matmul(u, self.H)  # B x E
            a_emb = tf.nn.embedding_lookup(self.W, answers)  # B x S x E or B x A x S x E
            if is_test:
                a = tf.reduce_sum(a_emb * self._encoding, 2)  # B x A x E
                u_temp = tf.expand_dims(u, 1)  # B x 1 x E
                output = tf.reduce_sum(a * u_temp, 2)  # B x A
            else:
                a = tf.reduce_sum(a_emb * self._encoding, 1)  # B x E
                # dropout on answers
                # a = tf.nn.dropout(a, keep_prob=0.5, name='answers_dropout')
                output = tf.reduce_sum(u * a, 1)  # B
        return output

    def set_session(self, session):
        self._sess = session

    def load_model(self, saver, location):
        print 'Init variables'
        init_op = tf.initialize_all_variables()
        self._sess.run(init_op)
        if location is not None:
            print 'Load model'
            saver.restore(self._sess, location)

    def save_model(self, location):
        saver = tf.train.Saver()
        saver.save(self._sess, location, write_meta_graph=False)

    def batch_fit(self, save_summary, lr):
        """Runs one training step on a batch produced by the input queue.

        Args:
            save_summary: If True, also evaluates the merged TensorBoard summaries.
            lr: learning rate (float)

        Returns:
            loss: floating-point number, the loss computed for the batch
            summaries: serialized summaries if `save_summary` is True, otherwise None
        """
        feed_dict = {self._learning_rate: lr}
        if save_summary:
            summaries, loss, logits, _ = self._sess.run([self.merged_summaries, self.loss_op, self.logits_op,
                                                         self.train_op], feed_dict=feed_dict)
        else:
            loss, _ = self._sess.run([self.loss_op, self.train_op], feed_dict=feed_dict)
            summaries = None
        return loss, summaries

    def predict(self):
        """Evaluates one test batch and returns top-1, top-2 and top-5 accuracy."""
        acc, acc_top2, acc_top5, probs = self._sess.run([self.accuracy_op, self.accuracy_top2_op, self.accuracy_top5_op,
                                                         self.test_probs_op])
        return acc, acc_top2, acc_top5
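

# ---------------------------------------------------------------------------
# Minimal usage sketch, not part of the original gist. The file names, sizes
# and hyper-parameters below are placeholder assumptions, and how the
# CustomRunner enqueue threads are started depends on data_utils_ubuntu,
# which is not included here.
#
#   model = MemN2N(batch_size=32, sentence_size=20, memory_size=10,
#                  embedding_size=64, vocab_file='vocab.pkl',
#                  train_file='train.txt', test_file='test.txt')
#   with tf.Session() as sess:
#       model.set_session(sess)
#       model.load_model(tf.train.Saver(), location=None)
#       # start the CustomRunner enqueue threads here (API depends on data_utils_ubuntu)
#       for step in range(1000):
#           loss, _ = model.batch_fit(save_summary=False, lr=0.001)
#       acc, acc_top2, acc_top5 = model.predict()
# ---------------------------------------------------------------------------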