Keras Layer that implements an Attention mechanism, with a context/query vector, for temporal data. Supports Masking. Follows the work of Yang et al. [https://www.cs.cmu.edu/~diyiy/docs/naacl16.pdf] "Hierarchical Attention Networks for Document Classification"
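For reference, a paraphrase of the attention equations from the Yang et al. paper (not part of the original gist): for each RNN hidden state h_t,

    u_t = \tanh(W_w h_t + b_w)
    \alpha_t = \frac{\exp(u_t^\top u_w)}{\sum_{t'} \exp(u_{t'}^\top u_w)}
    s = \sum_t \alpha_t h_t

where u_w is the trainable context vector and s is the per-sample sequence summary. The implementation below uses a single-column projection kernel with per-step bias and context weights, a simplification of the paper's formulation.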
from keras import backend as K
from keras import initializers, regularizers, constraints
from keras.engine.topology import Layer


class AttentionWithContext(Layer):
    """
    Attention operation, with a context/query vector, for temporal data.
    Supports Masking.
    Follows the work of Yang et al. [https://www.cs.cmu.edu/~diyiy/docs/naacl16.pdf]
    "Hierarchical Attention Networks for Document Classification"
    by using a context vector to assist the attention.

    # Input shape
        3D tensor with shape: `(samples, steps, features)`.
    # Output shape
        2D tensor with shape: `(samples, features)`.

    Just put it on top of an RNN layer (GRU/LSTM/SimpleRNN) with return_sequences=True.
    The dimensions are inferred from the output shape of the RNN.

    Example:
        model.add(LSTM(64, return_sequences=True))
        model.add(AttentionWithContext())
    """

    def __init__(self, init='glorot_uniform',
                 kernel_regularizer=None, bias_regularizer=None,
                 kernel_constraint=None, bias_constraint=None, **kwargs):
        self.supports_masking = True
        self.init = initializers.get(init)
        self.kernel_initializer = self.init  # use the requested initializer
        self.kernel_regularizer = regularizers.get(kernel_regularizer)
        self.bias_regularizer = regularizers.get(bias_regularizer)
        self.kernel_constraint = constraints.get(kernel_constraint)
        self.bias_constraint = constraints.get(bias_constraint)
        super(AttentionWithContext, self).__init__(**kwargs)

    def build(self, input_shape):
        # input_shape: (samples, steps, features)
        self.kernel = self.add_weight(shape=(input_shape[-1], 1),
                                      initializer=self.kernel_initializer,
                                      name='{}_W'.format(self.name),
                                      regularizer=self.kernel_regularizer,
                                      constraint=self.kernel_constraint)
        self.b = self.add_weight(shape=(input_shape[1],),
                                 initializer='zeros',
                                 name='{}_b'.format(self.name),
                                 regularizer=self.bias_regularizer,
                                 constraint=self.bias_constraint)
        self.u = self.add_weight(shape=(input_shape[1],),
                                 initializer=self.kernel_initializer,
                                 name='{}_u'.format(self.name),
                                 regularizer=self.kernel_regularizer,
                                 constraint=self.kernel_constraint)
        self.built = True

    def compute_mask(self, input, mask=None):
        # The temporal dimension is summed out, so do not pass the mask on.
        return None

    def call(self, x, mask=None):
        # Shape comments assume x has shape (x, 40, 300):
        # a batch of sequences with 40 steps and 300 features.
        # (x, 40, 300) x (300, 1)
        multData = K.dot(x, self.kernel)    # (x, 40, 1)
        multData = K.squeeze(multData, -1)  # (x, 40)
        multData = multData + self.b        # (x, 40) + (40,)
        multData = K.tanh(multData)         # (x, 40)

        multData = multData * self.u        # (x, 40) * (40,) => (x, 40)
        multData = K.exp(multData)          # (x, 40)

        # apply mask after the exp; will be re-normalized next
        if mask is not None:
            mask = K.cast(mask, K.floatx())  # (x, 40)
            multData = mask * multData       # (x, 40) * (x, 40)

        # In some cases, especially in the early stages of training, the sum may be
        # almost zero and this results in NaNs. A workaround is to add a very small
        # positive number epsilon to the sum.
        multData /= K.cast(K.sum(multData, axis=1, keepdims=True) + K.epsilon(), K.floatx())
        multData = K.expand_dims(multData)   # (x, 40, 1)

        weighted_input = x * multData        # (x, 40, 300)
        return K.sum(weighted_input, axis=1)  # (x, 300)

    def compute_output_shape(self, input_shape):
        return (input_shape[0], input_shape[-1])
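A minimal end-to-end usage sketch based on the example in the docstring; the vocabulary size, sequence length, and class count below are hypothetical placeholders:

from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense

max_len, vocab_size, num_classes = 40, 20000, 5  # hypothetical placeholders

model = Sequential()
# mask_zero=True produces a mask that AttentionWithContext consumes in call()
model.add(Embedding(vocab_size, 300, input_length=max_len, mask_zero=True))
model.add(LSTM(64, return_sequences=True))   # (samples, 40, 64)
model.add(AttentionWithContext())            # (samples, 64)
model.add(Dense(num_classes, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])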
@rmdort, I like your pedagogical comments on the tensor shape changes. They are really helpful for better understanding the math behind the context attention.
I don't know why, but I am getting a dimension error.
Code:
def generate_model(output_len, chars=None):
    """Generate the model"""
    print('Building model...')
    chars = chars or CHARS
    in_out_neurons = CONFIG.max_input_len
    hidden_neurons = CONFIG.hidden_size
    model = Sequential()
    model.add(recurrent.GRU(512, input_shape=(128, 100),
                            return_sequences=True,
                            kernel_initializer=CONFIG.initialization, activation='linear'))
    model.add(AttentionWithContext())
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])
    return model
and the error is
_________________________________________________________________
Layer (type)                 Output Shape              Param #
=================================================================
gru_1 (GRU)                  (None, 128, 512)          941568
_________________________________________________________________
attention_with_context_1 (At (None, 512)               263168
=================================================================
Total params: 1,204,736
Trainable params: 1,204,736
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/500
Traceback (most recent call last):
File "test.py", line 580, in <module>
train_speller()
File "test.py", line 482, in train_speller
itarative_train(model)
File "test.py", line 467, in itarative_train
class_weight=None, max_queue_size=10, workers=1)
File "/ssd/anaconda3/lib/python3.6/site-packages/keras/legacy/interfaces.py", line 91, in wrapper
return func(*args, **kwargs)
File "/ssd/anaconda3/lib/python3.6/site-packages/keras/models.py", line 1315, in fit_generator
initial_epoch=initial_epoch)
File "/ssd/anaconda3/lib/python3.6/site-packages/keras/legacy/interfaces.py", line 91, in wrapper
return func(*args, **kwargs)
File "/ssd/anaconda3/lib/python3.6/site-packages/keras/engine/training.py", line 2230, in fit_generator
class_weight=class_weight)
File "/ssd/anaconda3/lib/python3.6/site-packages/keras/engine/training.py", line 1877, in train_on_batch
class_weight=class_weight)
File "/ssd/anaconda3/lib/python3.6/site-packages/keras/engine/training.py", line 1480, in _standardize_user_data
exception_prefix='target')
File "/ssd/anaconda3/lib/python3.6/site-packages/keras/engine/training.py", line 113, in _standardize_input_data
'with shape ' + str(data_shape))
ValueError: Error when checking target: expected attention_with_context_1 to have 2 dimensions, but got array with shape (64, 128, 100)
Any idea?
Since my target data is 3-dimensional anyway, I tried to change the compute_output_shape return (line 81 of the gist) to

    return (input_shape[0], input_shape[1], input_shape[2])

but then I get a different error and the model does not compile.
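For context, AttentionWithContext sums over the steps axis, so it outputs a 2-D tensor of shape (samples, features) and the training targets must be 2-D as well. A minimal sketch of a model whose targets match, where num_classes is a hypothetical placeholder:

from keras.models import Sequential
from keras.layers import GRU, Dense

num_classes = 100  # hypothetical number of output classes

model = Sequential()
model.add(GRU(512, input_shape=(128, 100), return_sequences=True))
model.add(AttentionWithContext())                    # output: (None, 512)
model.add(Dense(num_classes, activation='softmax'))  # targets: (samples, num_classes)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])

If a per-timestep (3-D) target is really required, this layer is not a good fit, since collapsing the time dimension is its whole purpose.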
How can I add this attention layer in the functional API?
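A sketch of the same layer used with the functional API; the input dimensions and class count here are hypothetical placeholders:

from keras.models import Model
from keras.layers import Input, LSTM, Dense

inputs = Input(shape=(128, 100))              # (steps, features), placeholder values
x = LSTM(64, return_sequences=True)(inputs)   # (None, 128, 64)
x = AttentionWithContext()(x)                 # (None, 64)
outputs = Dense(10, activation='softmax')(x)  # 10 is a placeholder class count
model = Model(inputs=inputs, outputs=outputs)
model.compile(loss='categorical_crossentropy', optimizer='adam')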
@rmdort Despite adding the epsilon, I am still getting NaN loss at the start of training. Any recommended debugging paths?