"""Memory Network implementation (End-To-End Memory Network, MemN2N)."""
import cPickle
import numpy as np
import tensorflow as tf
from data_utils_ubuntu import CustomRunner


def position_encoding(sentence_size, embedding_size):
    """
    Position Encoding described in section 4.1 [1]
    """
    encoding = np.ones((embedding_size, sentence_size), dtype=np.float32)
    ls = sentence_size + 1
    le = embedding_size + 1
    for i in range(1, le):
        for j in range(1, ls):
            encoding[i-1, j-1] = (i - (le-1)/2) * (j - (ls-1)/2)
    encoding = 1 + 4 * encoding / embedding_size / sentence_size
    return np.transpose(encoding)
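
# Minimal usage sketch for position_encoding: the returned (sentence_size, embedding_size)
# matrix is broadcast-multiplied with a sentence's word embeddings before they are
# summed into a single vector, so word order affects the result, e.g.
#
#   enc = position_encoding(5, 8)                        # shape (5, 8)
#   # sentence_vec = np.sum(word_embeddings * enc, 0)    # (5, 8) -> (8,)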


def add_gradient_noise(t, stddev=1e-3, name=None):
    """
    Adds gradient noise as described in http://arxiv.org/abs/1511.06807 [2].
    The input Tensor `t` should be a gradient.
    The output will be `t` + gaussian noise.
    0.001 was said to be a good fixed value for memory networks [2].
    """
    with tf.op_scope([t, stddev], name, "add_gradient_noise") as name:
        t = tf.convert_to_tensor(t, name="t")
        gn = tf.random_normal(tf.shape(t), stddev=stddev)
        return tf.add(t, gn, name=name)
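
# Minimal usage sketch for add_gradient_noise: it is meant to wrap each gradient in
# the (gradient, variable) pairs of the optimizer pipeline, e.g.
#
#   grads_and_vars = [(add_gradient_noise(g), v) for g, v in grads_and_vars]
#
# The corresponding line in MemN2N.__init__ below is left commented out, so no
# noise is actually added during training.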


class MemN2N(object):
    """End-To-End Memory Network."""

    def __init__(self, batch_size, sentence_size, memory_size, embedding_size, vocab_file, train_file, test_file,
                 hops=2,
                 max_grad_norm=10.0,
                 initializer=tf.contrib.layers.xavier_initializer(dtype=tf.float32),
                 encoding=position_encoding,
                 l2=0.02,
                 tensor_board=True,
                 opt='adam',
                 name='MemN2N'):
        """Creates an End-To-End Memory Network.

        Args:
            batch_size: The size of the batch.
            sentence_size: The max size of a sentence in the data. All sentences should be padded
                to this length. If padding is required it should be done with nil one-hot encoding (0).
            memory_size: The max size of the memory. Since TensorFlow currently does not support jagged arrays,
                all memories must be padded to this length. If padding is required, the extra memories should be
                empty memories; memories filled with the nil word ([0, 0, 0, ......, 0]).
            embedding_size: The size of the word embedding.
            vocab_file: Path to a pickled vocabulary used by CustomRunner.
            train_file: Path to the training data read by CustomRunner.
            test_file: Path to the test data read by CustomRunner.
            hops: The number of hops. A hop consists of reading and addressing a memory slot.
                Defaults to `2`.
            max_grad_norm: Maximum L2 norm clipping value. Defaults to `10.0`.
            initializer: Weight initializer. Defaults to `tf.contrib.layers.xavier_initializer()`.
            encoding: A function returning a 2D Tensor (sentence_size, embedding_size). Defaults to `position_encoding`.
            l2: L2 regularization weight. Defaults to `0.02`.
            tensor_board: If True, create TensorBoard summaries. Defaults to `True`.
            opt: Optimizer to use: 'adam', 'momentum' or 'sgd'. Defaults to `'adam'`.
            name: Name of the End-To-End Memory Network. Defaults to `MemN2N`.
        """
        with open(vocab_file, 'rb') as f:
            self.vocab = cPickle.load(f)
        self._vocab_size = len(self.vocab)
        self._batch_size = batch_size
        self._sentence_size = sentence_size
        self._memory_size = memory_size
        self._embedding_size = embedding_size
        self._hops = hops
        self._max_grad_norm = max_grad_norm
        self._init = initializer
        self._name = name
        self._l2 = l2

        with tf.device("/cpu:0"):
            self.train_runner = CustomRunner(batch_size, memory_size, sentence_size, self.vocab, train_file)
            self.test_runner = CustomRunner(batch_size, memory_size, sentence_size, self.vocab, test_file, is_test=True)
            train_story_var, train_question_var, train_answer_var, train_label_var = self.train_runner.get_inputs()
            test_story_var, test_question_var, test_answer_var, test_label_var = self.test_runner.get_inputs()

        self._build_vars()
        self._learning_rate = tf.placeholder(tf.float32)
        self._encoding = tf.constant(encoding(self._sentence_size, self._embedding_size), name="encoding")

        if opt == 'momentum':
            self._opt = tf.train.MomentumOptimizer(self._learning_rate, 0.97, use_nesterov=True)
        elif opt == 'sgd':
            self._opt = tf.train.GradientDescentOptimizer(self._learning_rate)
        else:
            self._opt = tf.train.AdamOptimizer(learning_rate=self._learning_rate, epsilon=1e-8)

        logits = self._get_network_output(train_story_var, train_question_var, train_answer_var)

        # Calculate the binary cross-entropy loss
        train_loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits, tf.to_float(train_label_var)),
                                    name="train_mean_loss")
        reg_loss = self._l2 * tf.add_n(tf.get_collection('reg_loss'))
        loss_op = train_loss + reg_loss

        # gradient pipeline with noise adding
        grads_and_vars = self._opt.compute_gradients(loss_op)
        grads_and_vars = [(tf.clip_by_norm(g, self._max_grad_norm), v) for g, v in grads_and_vars]
        # grads_and_vars = [(add_gradient_noise(g), v) for g, v in grads_and_vars]
        train_op = self._opt.apply_gradients(grads_and_vars, name="train_op")

        test_logits = self._get_network_output(test_story_var, test_question_var, test_answer_var, is_test=True)
        test_probs = tf.sigmoid(test_logits)
        accuracy = tf.reduce_mean(tf.cast(tf.equal(tf.to_int32(tf.argmax(test_probs, 1)), test_label_var), tf.float32))
        accuracy_top2 = tf.reduce_mean(tf.cast(tf.nn.in_top_k(test_logits, test_label_var, k=2), tf.float32))
        accuracy_top5 = tf.reduce_mean(tf.cast(tf.nn.in_top_k(test_logits, test_label_var, k=5), tf.float32))

        if tensor_board:
            tf.scalar_summary("loss", loss_op)
            tf.scalar_summary('accuracy', accuracy)
            tf.scalar_summary('accuracy_top2', accuracy_top2)
            tf.scalar_summary('accuracy_top5', accuracy_top5)
            # Add histograms for trainable variables.
            for var in tf.trainable_variables():
                tf.histogram_summary(var.op.name, var)
            # Add histograms for gradients.
            for grad, var in grads_and_vars:
                if grad is not None:
                    tf.histogram_summary(var.op.name + '/gradients', grad)

        # Summaries
        self.merged_summaries = tf.merge_all_summaries()

        # assign ops
        self.loss_op = loss_op
        self.train_op = train_op
        self.logits_op = logits
        self.test_logits_op = test_logits
        self.test_probs_op = test_probs
        self.accuracy_op = accuracy
        self.accuracy_top2_op = accuracy_top2
        self.accuracy_top5_op = accuracy_top5

        self._sess = None

    def _build_vars(self):
        with tf.variable_scope(self._name + '_variables'):
            nil_word_slot = tf.zeros([1, self._embedding_size])
            Q = tf.get_variable(name="query_embedding", shape=[self._vocab_size, self._embedding_size],
                                initializer=self._init)
            A = tf.get_variable(name="memory_embedding_in", shape=[self._vocab_size, self._embedding_size],
                                initializer=self._init)
            C = tf.get_variable(name="memory_embedding_out", shape=[self._vocab_size, self._embedding_size],
                                initializer=self._init)
            W = tf.get_variable(name="answers_embedding", shape=[self._vocab_size, self._embedding_size],
                                initializer=self._init)
            # Prepend a fixed all-zero row so the nil (padding) word always embeds to zeros.
            self.Q = tf.concat(0, [nil_word_slot, Q])
            self.A = tf.concat(0, [nil_word_slot, A])
            self.C = tf.concat(0, [nil_word_slot, C])
            self.W = tf.concat(0, [nil_word_slot, W])
            self.TA = tf.get_variable(shape=[self._memory_size, self._embedding_size], name='TA',
                                      initializer=self._init)
            self.TC = tf.get_variable(shape=[self._memory_size, self._embedding_size], name='TC',
                                      initializer=self._init)
            self.H = tf.get_variable(shape=[self._embedding_size, self._embedding_size], name='H',
                                     initializer=self._init)
            tf.add_to_collection('reg_loss', tf.nn.l2_loss(Q))
            tf.add_to_collection('reg_loss', tf.nn.l2_loss(A))
            tf.add_to_collection('reg_loss', tf.nn.l2_loss(C))
            tf.add_to_collection('reg_loss', tf.nn.l2_loss(W))
            tf.add_to_collection('reg_loss', tf.nn.l2_loss(self.TA))
            tf.add_to_collection('reg_loss', tf.nn.l2_loss(self.TC))
            tf.add_to_collection('reg_loss', tf.nn.l2_loss(self.H))

    def _get_network_output(self, stories, queries, answers, is_test=False):
        with tf.variable_scope(self._name + ('_train' if not is_test else '_test')):
            # TODO remove C, Q and W matrix, use only A embedding matrix
            q_emb = tf.nn.embedding_lookup(self.Q, queries)  # B x S x E
            u = tf.reduce_sum(q_emb * self._encoding, 1)  # B x E
            # dropout on question
            # if not is_test:
            #     u = tf.nn.dropout(u, keep_prob=0.5, name='question_dropout')
            for _ in range(self._hops):
                m_emb = tf.nn.embedding_lookup(self.A, stories)  # B x M x S x E
                m = tf.reduce_sum(m_emb * self._encoding, 2) + self.TA  # B x M x E
                # hack to get around no reduce_dot
                u_temp = tf.expand_dims(u, 1)  # B x 1 x E
                dotted = tf.reduce_sum(m * u_temp, 2)  # B x M
                # Calculate probabilities: memory cells weights
                probs = tf.nn.softmax(dotted)
                probs_temp = tf.expand_dims(probs, 1)
                c_emb = tf.nn.embedding_lookup(self.C, stories)
                c = tf.reduce_sum(c_emb * self._encoding, 2) + self.TC  # B x M x E
                c_temp = tf.transpose(c, [0, 2, 1])
                o_k = tf.reduce_sum(c_temp * probs_temp, 2)  # B x E
                u = u + o_k
                u = tf.matmul(u, self.H)  # B x E
            a_emb = tf.nn.embedding_lookup(self.W, answers)  # B x S x E or B x A x S x E
            if is_test:
                a = tf.reduce_sum(a_emb * self._encoding, 2)  # B x A x E
                u_temp = tf.expand_dims(u, 1)  # B x 1 x E
                output = tf.reduce_sum(a * u_temp, 2)  # B x A
            else:
                a = tf.reduce_sum(a_emb * self._encoding, 1)  # B x E
                # dropout on answers
                # a = tf.nn.dropout(a, keep_prob=0.5, name='answers_dropout')
                output = tf.reduce_sum(u * a, 1)  # B
            return output
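
    # Note on the "reduce_dot" hack above (a sketch, assuming the old batched-matmul
    # API): for m of shape B x M x E and u of shape B x E, the expand_dims /
    # multiply / reduce_sum pattern computes the same B x M attention scores as
    #
    #   dotted = tf.squeeze(tf.batch_matmul(m, tf.expand_dims(u, 2)), [2])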

    def set_session(self, session):
        self._sess = session

    def load_model(self, saver, location):
        print 'Init variables'
        init_op = tf.initialize_all_variables()
        self._sess.run(init_op)
        if location is not None:
            print 'Load model'
            saver.restore(self._sess, location)

    def save_model(self, location):
        saver = tf.train.Saver()
        saver.save(self._sess, location, write_meta_graph=False)

    def batch_fit(self, save_summary, lr):
        """Runs the training algorithm over one batch read from the input queue.

        Args:
            save_summary: If True, also evaluate the merged TensorBoard summaries.
            lr: learning rate (float)

        Returns:
            loss: floating-point number, the loss computed for the batch
            summaries: the merged summaries if `save_summary` is True, else None
        """
        feed_dict = {self._learning_rate: lr}
        if save_summary:
            summaries, loss, logits, _ = self._sess.run([self.merged_summaries, self.loss_op, self.logits_op,
                                                         self.train_op], feed_dict=feed_dict)
        else:
            loss, _ = self._sess.run([self.loss_op, self.train_op], feed_dict=feed_dict)
            summaries = None
        return loss, summaries

    def predict(self):
        """Evaluates one test batch and returns top-1, top-2 and top-5 accuracy."""
        acc, acc_top2, acc_top5, probs = self._sess.run([self.accuracy_op, self.accuracy_top2_op, self.accuracy_top5_op,
                                                         self.test_probs_op])
        return acc, acc_top2, acc_top5
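

# Minimal usage sketch (hyper-parameters and file paths below are placeholders; the
# CustomRunner input threads from data_utils_ubuntu must also be started, which is
# not shown here).
if __name__ == '__main__':
    model = MemN2N(batch_size=32, sentence_size=50, memory_size=10, embedding_size=64,
                   vocab_file='vocab.pkl',    # placeholder path
                   train_file='train.txt',    # placeholder path
                   test_file='test.txt')      # placeholder path
    saver = tf.train.Saver()
    with tf.Session() as sess:
        model.set_session(sess)
        model.load_model(saver, location=None)  # initialize variables, no checkpoint to restore
        for step in range(1000):
            loss, _ = model.batch_fit(save_summary=False, lr=0.001)
            if step % 100 == 0:
                print 'step %d, loss %f' % (step, loss)
        acc, acc_top2, acc_top5 = model.predict()
        print 'accuracy: %f (top-2: %f, top-5: %f)' % (acc, acc_top2, acc_top5)
        model.save_model('memn2n.ckpt')  # placeholder path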