This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Stratified sampling of the data such that the distribution of the grouping column in the sample
# is almost the same as in the original data
def group_sampler(group_data, total_df_len, n_samples):
    """Draw a random sample from one group, sized by the group's share of the data.

    The number of rows requested is ``ceil(len(group_data) / total_df_len * n_samples)``,
    capped at the number of rows the group actually has.

    Args:
        group_data: Object with ``len()`` and a pandas-style ``sample(n=...)`` method
            (e.g. the per-group DataFrame handed over by ``groupby(...).apply``).
        total_df_len: Row count of the full, ungrouped data. Must be non-zero.
        n_samples: Target sample size across all groups.

    Returns:
        Whatever ``group_data.sample`` returns for the computed ``n``.
    """
    group_len = len(group_data)
    wanted = int(np.ceil((group_len / total_df_len) * n_samples))
    # Sampling is without replacement, so asking for more rows than the group
    # holds would raise; fall back to taking the whole group in that case.
    return group_data.sample(n=min(wanted, group_len))
# Pre-bind the full-frame length and the overall target of 200 rows, so the
# sampler can be handed to groupby(...).apply, which passes only the group.
group_sampler_200 = partial(group_sampler, total_df_len=len(filtered_cells), n_samples=200)
# Apply the sampler defined on the line above; the previous name,
# cell_group_sampler_200, is not defined anywhere in this snippet.
filtered_200_cells = filtered_cells.groupby('group_column', as_index=False).apply(group_sampler_200)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def reduce_mem_usage(df, verbose=True): | |
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64'] | |
start_mem = df.memory_usage().sum() / 1024**2 | |
for col in df.columns: | |
col_type = df[col].dtypes | |
if col_type in numerics: | |
c_min = df[col].min() | |
c_max = df[col].max() | |
if str(col_type)[:3] == 'int': | |
if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max: |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
class RNNEncoder(nn.Module): | |
def __init__(self, rnn_num_layers=1, input_feature_len=1, sequence_len=168, hidden_size=100, bidirectional=False, device='cpu', rnn_dropout=0.2): | |
super().__init__() | |
self.sequence_len = sequence_len | |
self.hidden_size = hidden_size | |
self.input_feature_len = input_feature_len | |
self.num_layers = rnn_num_layers | |
self.rnn_directions = 2 if bidirectional else 1 | |
self.gru = nn.GRU( | |
num_layers=rnn_num_layers, |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
class StoreItemDataset(Dataset):
    """Dataset holding categorical/numeric column configuration for sequence data.

    Only the constructor is visible here; it records the column lists and
    initialises empty containers for category metadata.
    """

    def __init__(self, cat_columns=None, num_columns=None, embed_vector_size=None, decoder_input=True, ohe_cat_columns=False):
        """Store the column configuration.

        Args:
            cat_columns: Names of categorical columns; ``None`` means no columns.
            num_columns: Names of numeric columns; ``None`` means no columns.
            embed_vector_size: Optional mapping stored as ``cat_embed_vector_size``;
                an empty dict is used when ``None``.
            decoder_input: Stored as ``pass_decoder_input``.
            ohe_cat_columns: Accepted for interface compatibility.
                NOTE(review): not used in the visible part of the constructor.
        """
        super().__init__()
        self.sequence_data = None
        # Fresh lists per instance: a `[]` default would be shared by every
        # instance created without an explicit argument.
        self.cat_columns = [] if cat_columns is None else cat_columns
        self.num_columns = [] if num_columns is None else num_columns
        self.cat_classes = {}
        self.cat_embed_shape = []
        self.cat_embed_vector_size = embed_vector_size if embed_vector_size is not None else {}
        self.pass_decoder_input = decoder_input
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
class EncoderDecoderWrapper(nn.Module):
    """Module that bundles an encoder and a decoder cell with run settings.

    Only the constructor is visible here; it registers the two sub-modules and
    stores the remaining arguments as plain attributes.
    """

    def __init__(self, encoder, decoder_cell, output_size=3, teacher_forcing=0.3, sequence_len=336, decoder_input=True, device='cpu'):
        """Store the sub-modules and configuration.

        Args:
            encoder: Encoder module, stored as ``self.encoder``.
            decoder_cell: Decoder module, stored as ``self.decoder_cell``.
            output_size: Stored as ``self.output_size``.
            teacher_forcing: Stored as ``self.teacher_forcing``
                (presumably a probability; confirm against the forward pass).
            sequence_len: Stored as ``self.sequence_length`` (note the different
                attribute name).
            decoder_input: Stored as ``self.decoder_input``.
            device: Stored as ``self.device``.
        """
        super().__init__()
        self.encoder = encoder
        self.decoder_cell = decoder_cell
        self.output_size = output_size
        self.teacher_forcing = teacher_forcing
        self.sequence_length = sequence_len
        self.decoder_input = decoder_input
        self.device = device
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
class DecoderCell(nn.Module):
    """Decoder step built from a GRU cell, a linear output layer and dropout.

    Only the constructor is visible here.
    """

    def __init__(self, input_feature_len, hidden_size, dropout=0.2):
        """Create the sub-modules.

        Args:
            input_feature_len: Input size of the GRU cell.
            hidden_size: Hidden size of the GRU cell and input width of the
                output layer.
            dropout: Probability passed to ``nn.Dropout``.
        """
        super().__init__()
        self.decoder_rnn_cell = nn.GRUCell(
            input_size=input_feature_len,
            hidden_size=hidden_size,
        )
        # Projects the hidden state down to a single output value.
        self.out = nn.Linear(hidden_size, 1)
        self.attention = False
        self.dropout = nn.Dropout(dropout)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
class RNNEncoder(nn.Module): | |
def __init__(self, rnn_num_layers=1, input_feature_len=1, sequence_len=168, hidden_size=100, bidirectional=False, device='cpu', rnn_dropout=0.2): | |
super().__init__() | |
self.sequence_len = sequence_len | |
self.hidden_size = hidden_size | |
self.input_feature_len = input_feature_len | |
self.num_layers = rnn_num_layers | |
self.rnn_directions = 2 if bidirectional else 1 | |
self.gru = nn.GRU( | |
num_layers=rnn_num_layers, |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def get_similar_images_annoy(img_index, n=12):
    """Look up the nearest neighbours of one item in the Annoy index ``t``.

    Reads the module-level ``img_repr_df`` and ``t`` and prints the elapsed
    lookup time in milliseconds.

    Args:
        img_index: Positional row in ``img_repr_df``, also used as the item id
            in ``t``.
        n: Number of neighbours to return. The default of 12 matches the
            previously hard-coded request for 13 items minus the first hit.

    Returns:
        Tuple ``(base_img_id, base_label, neighbours)``, where the first two are
        columns 0 and 2 of the query row and ``neighbours`` is the slice of
        ``img_repr_df`` for the neighbour positions.
    """
    start = time.time()
    base_img_id, base_label = img_repr_df.iloc[img_index, [0, 2]]
    # Request one extra item because the first hit is discarded below
    # (presumably it is the query item itself; confirm for the metric in use).
    similar_img_ids = t.get_nns_by_item(img_index, n + 1)
    end = time.time()
    print(f'{(end - start) * 1000} ms')
    return base_img_id, base_label, img_repr_df.iloc[similar_img_ids[1:]]
# Run the Annoy lookup for the item at position 212693 and unpack the returned
# (image id, label, neighbours frame) triple.
base_image, base_label, similar_images_df = get_similar_images_annoy(212693)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from annoy import AnnoyIndex

# Vector length is read from the first row by position: items are added below
# by enumeration order, so a label-based `[0]` lookup would be inconsistent
# (or fail) whenever the frame's index is not 0-based.
feature_dim = len(img_repr_df['img_repr'].iloc[0])
t = AnnoyIndex(feature_dim, metric='euclidean')
# Item ids are positional, so they line up with `img_repr_df.iloc[...]`.
for i, vector in enumerate(img_repr_df['img_repr']):
    t.add_item(i, vector)
# NOTE(review): `inference_data.c` is passed as the build argument (the number
# of trees in Annoy's API); confirm this value is the intended tree count.
_ = t.build(inference_data.c)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def get_similar_images(img_index, n=10):
    """Find the rows most similar to one row by brute-force cosine similarity.

    Reads the module-level ``img_repr_df`` and prints the elapsed time in
    seconds.

    Args:
        img_index: Positional row in ``img_repr_df`` to use as the query.
        n: Number of similar rows to return. Previously ignored: the slice was
            hard-coded to 10 results.

    Returns:
        Tuple ``(base_img_id, base_label, similar_rows)``, where the first two
        are columns 0 and 2 of the query row and ``similar_rows`` holds the
        selected rows ordered from most to least similar.
    """
    start = time.time()
    base_img_id, base_vector, base_label = img_repr_df.iloc[img_index, [0, 1, 2]]
    # `cosine` returns a distance, so 1 - distance gives the similarity.
    cosine_similarity = 1 - img_repr_df['img_repr'].apply(lambda x: cosine(x, base_vector))
    # Ascending argsort: take the last n + 1 positions, drop the very last one
    # (the top match, presumably the query row itself), then reverse so the
    # most similar row comes first.
    similar_img_ids = np.argsort(cosine_similarity)[-(n + 1):-1][::-1]
    end = time.time()
    print(f'{end - start} secs')
    return base_img_id, base_label, img_repr_df.iloc[similar_img_ids]
NewerOlder