Memory Network implementation

import cPickle

import numpy as np
import tensorflow as tf

from data_utils_ubuntu import CustomRunner


def position_encoding(sentence_size, embedding_size):
    """
    Position Encoding described in section 4.1 [1]
    """
    encoding = np.ones((embedding_size, sentence_size), dtype=np.float32)
    ls = sentence_size + 1
    le = embedding_size + 1
    for i in range(1, le):
        for j in range(1, ls):
            encoding[i-1, j-1] = (i - (le-1)/2) * (j - (ls-1)/2)
    encoding = 1 + 4 * encoding / embedding_size / sentence_size
    return np.transpose(encoding)
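

# Illustrative sketch, not part of the original gist: the returned matrix has shape
# (sentence_size, embedding_size), so it broadcasts against a batch of embedded,
# padded sentences of shape (batch, sentence_size, embedding_size) before the sum
# over the word axis, which is how it is used in MemN2N._get_network_output below.
# The sizes here (20 words, 64-dimensional embeddings) are arbitrary example values.
#
#   enc = position_encoding(sentence_size=20, embedding_size=64)  # ndarray, shape (20, 64)
#   sentence_vec = np.sum(word_embeddings * enc, axis=1)          # word_embeddings assumed (batch, 20, 64)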


def add_gradient_noise(t, stddev=1e-3, name=None):
    """
    Adds gradient noise as described in http://arxiv.org/abs/1511.06807 [2].
    The input Tensor `t` should be a gradient.
    The output will be `t` + gaussian noise.
    0.001 was said to be a good fixed value for memory networks [2].
    """
    with tf.op_scope([t, stddev], name, "add_gradient_noise") as name:
        t = tf.convert_to_tensor(t, name="t")
        gn = tf.random_normal(tf.shape(t), stddev=stddev)
        return tf.add(t, gn, name=name)
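

# Hedged illustration, not part of the original gist: add_gradient_noise is currently
# unused (the corresponding line in MemN2N.__init__ is commented out), but it would
# slot into the gradient pipeline between clipping and apply_gradients, e.g.:
#
#   grads_and_vars = self._opt.compute_gradients(loss_op)
#   grads_and_vars = [(add_gradient_noise(tf.clip_by_norm(g, self._max_grad_norm)), v)
#                     for g, v in grads_and_vars]
#   train_op = self._opt.apply_gradients(grads_and_vars, name="train_op")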


class MemN2N(object):
    """End-To-End Memory Network."""

    def __init__(self, batch_size, sentence_size, memory_size, embedding_size, vocab_file, train_file, test_file,
                 hops=2,
                 max_grad_norm=10.0,
                 initializer=tf.contrib.layers.xavier_initializer(dtype=tf.float32),
                 encoding=position_encoding,
                 l2=0.02,
                 tensor_board=True,
                 opt='adam',
                 name='MemN2N'):
"""Creates an End-To-End Memory Network | |
Args: | |
batch_size: The size of the batch. | |
sentence_size: The max size of a sentence in the data. All sentences should be padded | |
to this length. If padding is required it should be done with nil one-hot encoding (0). | |
memory_size: The max size of the memory. Since Tensorflow currently does not support jagged arrays | |
all memories must be padded to this length. If padding is required, the extra memories should be | |
empty memories; memories filled with the nil word ([0, 0, 0, ......, 0]). | |
embedding_size: The size of the word embedding. | |
hops: The number of hops. A hop consists of reading and addressing a memory slot. | |
Defaults to `2`. | |
max_grad_norm: Maximum L2 norm clipping value. Defaults to `40.0`. | |
initializer: Weight initializer. Defaults to `tf.random_normal_initializer(stddev=0.1)`. | |
encoding: A function returning a 2D Tensor (sentence_size, embedding_size). Defaults to `position_encoding`. | |
name: Name of the End-To-End Memory Network. Defaults to `MemN2N`. | |
""" | |
        with open(vocab_file, 'rb') as f:
            self.vocab = cPickle.load(f)
        self._vocab_size = len(self.vocab)
        self._batch_size = batch_size
        self._sentence_size = sentence_size
        self._memory_size = memory_size
        self._embedding_size = embedding_size
        self._hops = hops
        self._max_grad_norm = max_grad_norm
        self._init = initializer
        self._name = name
        self._l2 = l2

        with tf.device("/cpu:0"):
            self.train_runner = CustomRunner(batch_size, memory_size, sentence_size, self.vocab, train_file)
            self.test_runner = CustomRunner(batch_size, memory_size, sentence_size, self.vocab, test_file, is_test=True)
        train_story_var, train_question_var, train_answer_var, train_label_var = self.train_runner.get_inputs()
        test_story_var, test_question_var, test_answer_var, test_label_var = self.test_runner.get_inputs()

        self._build_vars()
        self._learning_rate = tf.placeholder(tf.float32)
        self._encoding = tf.constant(encoding(self._sentence_size, self._embedding_size), name="encoding")

        if opt == 'momentum':
            self._opt = tf.train.MomentumOptimizer(self._learning_rate, 0.97, use_nesterov=True)
        elif opt == 'sgd':
            self._opt = tf.train.GradientDescentOptimizer(self._learning_rate)
        else:
            self._opt = tf.train.AdamOptimizer(learning_rate=self._learning_rate, epsilon=1e-8)

        logits = self._get_network_output(train_story_var, train_question_var, train_answer_var)

        # Calculate the binary cross-entropy loss
        train_loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits, tf.to_float(train_label_var)),
                                    name="train_mean_loss")
        reg_loss = self._l2 * tf.add_n(tf.get_collection('reg_loss'))
        loss_op = train_loss + reg_loss

        # gradient pipeline with noise adding
        grads_and_vars = self._opt.compute_gradients(loss_op)
        grads_and_vars = [(tf.clip_by_norm(g, self._max_grad_norm), v) for g, v in grads_and_vars]
        # grads_and_vars = [(add_gradient_noise(g), v) for g, v in grads_and_vars]
        train_op = self._opt.apply_gradients(grads_and_vars, name="train_op")

        test_logits = self._get_network_output(test_story_var, test_question_var, test_answer_var, is_test=True)
        test_probs = tf.sigmoid(test_logits)
        accuracy = tf.reduce_mean(tf.cast(tf.equal(tf.to_int32(tf.argmax(test_probs, 1)), test_label_var), tf.float32))
        accuracy_top2 = tf.reduce_mean(tf.cast(tf.nn.in_top_k(test_logits, test_label_var, k=2), tf.float32))
        accuracy_top5 = tf.reduce_mean(tf.cast(tf.nn.in_top_k(test_logits, test_label_var, k=5), tf.float32))

        if tensor_board:
            tf.scalar_summary("loss", loss_op)
            tf.scalar_summary('accuracy', accuracy)
            tf.scalar_summary('accuracy_top2', accuracy_top2)
            tf.scalar_summary('accuracy_top5', accuracy_top5)
            # Add histograms for trainable variables.
            for var in tf.trainable_variables():
                tf.histogram_summary(var.op.name, var)
            # Add histograms for gradients.
            for grad, var in grads_and_vars:
                if grad is not None:
                    tf.histogram_summary(var.op.name + '/gradients', grad)

        # Summaries
        self.merged_summaries = tf.merge_all_summaries()

        # assign ops
        self.loss_op = loss_op
        self.train_op = train_op
        self.logits_op = logits
        self.test_logits_op = test_logits
        self.test_probs_op = test_probs
        self.accuracy_op = accuracy
        self.accuracy_top2_op = accuracy_top2
        self.accuracy_top5_op = accuracy_top5

        self._sess = None

    def _build_vars(self):
        with tf.variable_scope(self._name + '_variables'):
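            # Q, A, C and W are the question, memory-input, memory-output and answer
            # embedding tables. Each is prefixed with a fixed all-zero "nil word" row so
            # that index 0 (the padding token) always maps to a zero embedding. TA and TC
            # are temporal encodings added to the memory representations, and H is the
            # linear map applied to the query state between hops.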
            nil_word_slot = tf.zeros([1, self._embedding_size])
            Q = tf.get_variable(name="query_embedding", shape=[self._vocab_size, self._embedding_size],
                                initializer=self._init)
            A = tf.get_variable(name="memory_embedding_in", shape=[self._vocab_size, self._embedding_size],
                                initializer=self._init)
            C = tf.get_variable(name="memory_embedding_out", shape=[self._vocab_size, self._embedding_size],
                                initializer=self._init)
            W = tf.get_variable(name="answers_embedding", shape=[self._vocab_size, self._embedding_size],
                                initializer=self._init)
            self.Q = tf.concat(0, [nil_word_slot, Q])
            self.A = tf.concat(0, [nil_word_slot, A])
            self.C = tf.concat(0, [nil_word_slot, C])
            self.W = tf.concat(0, [nil_word_slot, W])
            self.TA = tf.get_variable(shape=[self._memory_size, self._embedding_size], name='TA',
                                      initializer=self._init)
            self.TC = tf.get_variable(shape=[self._memory_size, self._embedding_size], name='TC',
                                      initializer=self._init)
            self.H = tf.get_variable(shape=[self._embedding_size, self._embedding_size], name='H',
                                     initializer=self._init)
            tf.add_to_collection('reg_loss', tf.nn.l2_loss(Q))
            tf.add_to_collection('reg_loss', tf.nn.l2_loss(A))
            tf.add_to_collection('reg_loss', tf.nn.l2_loss(C))
            tf.add_to_collection('reg_loss', tf.nn.l2_loss(W))
            tf.add_to_collection('reg_loss', tf.nn.l2_loss(self.TA))
            tf.add_to_collection('reg_loss', tf.nn.l2_loss(self.TC))
            tf.add_to_collection('reg_loss', tf.nn.l2_loss(self.H))

    def _get_network_output(self, stories, queries, answers, is_test=False):
        with tf.variable_scope(self._name + ('_train' if not is_test else '_test')):
            # TODO remove C, Q and W matrix, use only A embedding matrix
            q_emb = tf.nn.embedding_lookup(self.Q, queries)  # B x S x E
            u = tf.reduce_sum(q_emb * self._encoding, 1)  # B x E
            # dropout on question
            # if not is_test:
            #     u = tf.nn.dropout(u, keep_prob=0.5, name='question_dropout')
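            # Each hop attends over the memory and updates the query state:
            #   p_i = softmax(u . m_i)     attention weights over memory slots
            #   o   = sum_i p_i * c_i      weighted read from the output memory
            #   u  <- (u + o) H            query state for the next hop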
            for _ in range(self._hops):
                m_emb = tf.nn.embedding_lookup(self.A, stories)  # B x M x S x E
                m = tf.reduce_sum(m_emb * self._encoding, 2) + self.TA  # B x M x E
                # hack to get around no reduce_dot
                u_temp = tf.expand_dims(u, 1)  # B x 1 x E
                dotted = tf.reduce_sum(m * u_temp, 2)  # B x M
                # Calculate probabilities: memory cells weights
                probs = tf.nn.softmax(dotted)
                probs_temp = tf.expand_dims(probs, 1)
                c_emb = tf.nn.embedding_lookup(self.C, stories)
                c = tf.reduce_sum(c_emb * self._encoding, 2) + self.TC  # B x M x E
                c_temp = tf.transpose(c, [0, 2, 1])
                o_k = tf.reduce_sum(c_temp * probs_temp, 2)  # B x E
                u = u + o_k
                u = tf.matmul(u, self.H)  # B x E
            a_emb = tf.nn.embedding_lookup(self.W, answers)  # B x S x E or B x A x S x E
            if is_test:
                a = tf.reduce_sum(a_emb * self._encoding, 2)  # B x A x E
                u_temp = tf.expand_dims(u, 1)  # B x 1 x E
                output = tf.reduce_sum(a * u_temp, 2)  # B x A
            else:
                a = tf.reduce_sum(a_emb * self._encoding, 1)  # B x E
                # dropout on answers
                # a = tf.nn.dropout(a, keep_prob=0.5, name='answers_dropout')
                output = tf.reduce_sum(u * a, 1)  # B
        return output

    def set_session(self, session):
        self._sess = session

    def load_model(self, saver, location):
        print 'Init variables'
        init_op = tf.initialize_all_variables()
        self._sess.run(init_op)
        if location is not None:
            print 'Load model'
            saver.restore(self._sess, location)

    def save_model(self, location):
        saver = tf.train.Saver()
        saver.save(self._sess, location, write_meta_graph=False)

    def batch_fit(self, save_summary, lr):
        """Runs one training step on a batch produced by the input queue.

        Args:
            save_summary: If True, also evaluates the merged TensorBoard summaries.
            lr: learning rate (float)

        Returns:
            loss: floating-point number, the loss computed for the batch
            summaries: serialized summaries if `save_summary` is True, otherwise None
        """
        feed_dict = {self._learning_rate: lr}
        if save_summary:
            summaries, loss, logits, _ = self._sess.run([self.merged_summaries, self.loss_op, self.logits_op,
                                                         self.train_op], feed_dict=feed_dict)
        else:
            loss, _ = self._sess.run([self.loss_op, self.train_op], feed_dict=feed_dict)
            summaries = None
        return loss, summaries

    def predict(self):
        """Evaluates one test batch and returns top-1, top-2 and top-5 accuracy."""
        acc, acc_top2, acc_top5, probs = self._sess.run([self.accuracy_op, self.accuracy_top2_op, self.accuracy_top5_op,
                                                         self.test_probs_op])
        return acc, acc_top2, acc_top5
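

# ---------------------------------------------------------------------------
# Minimal usage sketch, not part of the original gist. The file names, sizes
# and hyper-parameters below are placeholder assumptions, and how the
# CustomRunner enqueue threads are started depends on data_utils_ubuntu,
# which is not included here.
#
#   model = MemN2N(batch_size=32, sentence_size=20, memory_size=10,
#                  embedding_size=64, vocab_file='vocab.pkl',
#                  train_file='train.txt', test_file='test.txt')
#   with tf.Session() as sess:
#       model.set_session(sess)
#       model.load_model(tf.train.Saver(), location=None)
#       # start the CustomRunner enqueue threads here (API depends on data_utils_ubuntu)
#       for step in range(1000):
#           loss, _ = model.batch_fit(save_summary=False, lr=0.001)
#       acc, acc_top2, acc_top5 = model.predict()
# ---------------------------------------------------------------------------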