gautham20’s gists

gautham20 / equal_group_sampling.py

Last active October 14, 2020 16:47

Stratified sampling of data in such a way that the distribution of the grouped column in the sample is almost the same as in the original data

	# Stratified sampling of data in such a way that the distribution of the grouped column in the sample
	# is almost the same as in the original data

	def group_sampler(group_data, total_df_len, n_samples):
	    """Draw a random sample from one group, sized by the group's share of the data.

	    Args:
	        group_data: Rows belonging to a single group; must support ``len()``
	            and ``.sample(n=...)``.
	        total_df_len: Number of rows in the full dataset the group came from.
	        n_samples: Target size of the overall sample across all groups.

	    Returns:
	        ``group_data.sample(n=k)`` where ``k`` is
	        ``ceil(len(group_data) / total_df_len * n_samples)``, capped at
	        ``len(group_data)``.
	    """
	    group_len = len(group_data)
	    # Round up so that a non-empty group always contributes at least one row.
	    proportional = int(np.ceil((group_len / total_df_len) * n_samples))
	    # Sampling without replacement cannot return more rows than the group
	    # holds, so cap the request instead of letting .sample() raise.
	    return group_data.sample(n=min(proportional, group_len))

	# Pre-bind the dataset size and the overall target of 200 rows so the sampler
	# can be handed to groupby().apply(), which passes only the group itself.
	group_sampler_200 = partial(group_sampler, total_df_len=len(filtered_cells), n_samples=200)

	# Apply the sampler defined just above to every group of 'group_column'.
	filtered_200_cells = filtered_cells.groupby('group_column', as_index=False).apply(group_sampler_200)

gautham20 / reduce_mem_usage.py

Created October 9, 2020 10:47

	def reduce_mem_usage(df, verbose=True):
	numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
	start_mem = df.memory_usage().sum() / 1024**2
	for col in df.columns:
	col_type = df[col].dtypes
	if col_type in numerics:
	c_min = df[col].min()
	c_max = df[col].max()
	if str(col_type)[:3] == 'int':
	if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:

gautham20 / RNNEncoder.py

Created June 8, 2020 22:32

	class RNNEncoder(nn.Module):
	def __init__(self, rnn_num_layers=1, input_feature_len=1, sequence_len=168, hidden_size=100, bidirectional=False, device='cpu', rnn_dropout=0.2):
	super().__init__()
	self.sequence_len = sequence_len
	self.hidden_size = hidden_size
	self.input_feature_len = input_feature_len
	self.num_layers = rnn_num_layers
	self.rnn_directions = 2 if bidirectional else 1
	self.gru = nn.GRU(
	num_layers=rnn_num_layers,

gautham20 / StoreItemDataset.py

Created June 7, 2020 18:13

	class StoreItemDataset(Dataset):
	    """Dataset holding column configuration for categorical and numeric features.

	    Only the constructor is visible in this snippet; it stores the column
	    lists and initialises empty containers for category metadata.
	    """

	    def __init__(self, cat_columns=None, num_columns=None, embed_vector_size=None, decoder_input=True, ohe_cat_columns=False):
	        """Store the column configuration.

	        Args:
	            cat_columns: List of categorical column names; a fresh empty list
	                is used when omitted.
	            num_columns: List of numeric column names; a fresh empty list is
	                used when omitted.
	            embed_vector_size: Optional mapping stored as
	                ``cat_embed_vector_size``; an empty dict is used when omitted.
	            decoder_input: Flag stored as ``pass_decoder_input``.
	            ohe_cat_columns: Accepted for interface compatibility; not used by
	                the code visible here.
	        """
	        super().__init__()
	        self.sequence_data = None
	        # Build new lists per instance: a literal [] default would be shared
	        # by every instance constructed without these arguments.
	        self.cat_columns = [] if cat_columns is None else cat_columns
	        self.num_columns = [] if num_columns is None else num_columns
	        self.cat_classes = {}
	        self.cat_embed_shape = []
	        self.cat_embed_vector_size = embed_vector_size if embed_vector_size is not None else {}
	        self.pass_decoder_input = decoder_input

gautham20 / encoderdecoderwrapper.py

Created June 6, 2020 22:14

	class EncoderDecoderWrapper(nn.Module):
	    """Bundle an encoder and a decoder cell together with decoding settings.

	    Only the constructor is visible in this snippet; it records its arguments
	    as attributes. Note that ``sequence_len`` is stored as ``sequence_length``.
	    """

	    def __init__(self, encoder, decoder_cell, output_size=3, teacher_forcing=0.3, sequence_len=336, decoder_input=True, device='cpu'):
	        super().__init__()

	        # The two components, assigned encoder first and decoder cell second.
	        self.encoder, self.decoder_cell = encoder, decoder_cell

	        # Plain configuration values.
	        self.output_size, self.teacher_forcing = output_size, teacher_forcing
	        self.sequence_length, self.decoder_input = sequence_len, decoder_input
	        self.device = device

gautham20 / decoder_cell.py

Created June 6, 2020 22:05

	class DecoderCell(nn.Module):
	    """GRU cell followed by a linear layer with one output, plus dropout.

	    Only the constructor is visible in this snippet.
	    """

	    def __init__(self, input_feature_len, hidden_size, dropout=0.2):
	        super().__init__()
	        # Recurrent cell: input_feature_len features in, hidden_size state.
	        self.decoder_rnn_cell = nn.GRUCell(input_size=input_feature_len, hidden_size=hidden_size)
	        # Maps the hidden state to a single output value.
	        self.out = nn.Linear(in_features=hidden_size, out_features=1)
	        # Attention flag, off by default.
	        self.attention = False
	        self.dropout = nn.Dropout(p=dropout)

gautham20 / RNNEncoder.py

Created June 6, 2020 20:59

	class RNNEncoder(nn.Module):
	def __init__(self, rnn_num_layers=1, input_feature_len=1, sequence_len=168, hidden_size=100, bidirectional=False, device='cpu', rnn_dropout=0.2):
	super().__init__()
	self.sequence_len = sequence_len
	self.hidden_size = hidden_size
	self.input_feature_len = input_feature_len
	self.num_layers = rnn_num_layers
	self.rnn_directions = 2 if bidirectional else 1
	self.gru = nn.GRU(
	num_layers=rnn_num_layers,

gautham20 / annoy_query.py

Created July 13, 2019 08:22

	def get_similar_images_annoy(img_index, n=12):
	    """Look up the nearest neighbours of one image through the Annoy index ``t``.

	    Prints the elapsed lookup time in milliseconds.

	    Args:
	        img_index: Row position in ``img_repr_df``, also used as the item id
	            in the index ``t``.
	        n: Number of neighbours to return. Defaults to 12, the count the
	            previously hard-coded query of 13 items produced.

	    Returns:
	        A tuple ``(base_img_id, base_label, neighbours)`` where the first two
	        come from columns 0 and 2 of the base row and ``neighbours`` is the
	        ``img_repr_df`` rows of the neighbours.
	    """
	    start = time.time()
	    base_img_id, base_label = img_repr_df.iloc[img_index, [0, 2]]
	    # Request one extra item: the first hit is dropped below because it is
	    # expected to be the query image itself.
	    similar_img_ids = t.get_nns_by_item(img_index, n + 1)
	    end = time.time()
	    print(f'{(end - start) * 1000} ms')
	    return base_img_id, base_label, img_repr_df.iloc[similar_img_ids[1:]]

	# Example query: fetch the neighbours of the image at row position 212693.
	# The first returned element is the base image's id (bound here as base_image).
	base_image, base_label, similar_images_df = get_similar_images_annoy(212693)

gautham20 / annoy_build_index.py

Created July 13, 2019 08:21

	from annoy import AnnoyIndex

	# Vector length is read from the first stored representation. Use positional
	# access (.iloc) so this works even when the frame's index does not contain
	# the label 0; the rest of the code also addresses rows by position.
	feature_dim = len(img_repr_df['img_repr'].iloc[0])
	t = AnnoyIndex(feature_dim, metric='euclidean')

	# Item ids are row positions, so query results can be passed to .iloc.
	for i, vector in enumerate(img_repr_df['img_repr']):
	    t.add_item(i, vector)

	# inference_data.c is passed as build()'s argument (the number of trees in
	# Annoy's API) -- NOTE(review): presumably the class count; confirm.
	_ = t.build(inference_data.c)

gautham20 / similar_images_naive.py

Created July 13, 2019 08:00

	def get_similar_images(img_index, n=10):
	    """Find the images most similar to one image by exhaustive cosine similarity.

	    Prints the elapsed time in seconds.

	    Args:
	        img_index: Row position of the base image in ``img_repr_df``.
	        n: Number of similar images to return.

	    Returns:
	        A tuple ``(base_img_id, base_label, similar_rows)`` where
	        ``similar_rows`` holds the ``n`` selected rows of ``img_repr_df``,
	        most similar first.
	    """
	    start = time.time()
	    base_img_id, base_vector, base_label = img_repr_df.iloc[img_index, [0, 1, 2]]
	    cosine_similarity = 1 - img_repr_df['img_repr'].apply(lambda x: cosine(x, base_vector))
	    # argsort is ascending, so the most similar rows sit at the end. The very
	    # last one is skipped because it is expected to be the base image itself.
	    ranked_positions = np.argsort(cosine_similarity.values)
	    # Honour n here; the slice used to be fixed at ten results.
	    similar_img_ids = ranked_positions[-(n + 1):-1][::-1]
	    end = time.time()
	    print(f'{end - start} secs')
	    return base_img_id, base_label, img_repr_df.iloc[similar_img_ids]

Gautham Kumaran gautham20