Skip to content

Commit 915f783

Browse files
committed
- added visualization of embeddings
- small bug fixes
1 parent 547bf48 commit 915f783

File tree

4 files changed

+58
-17
lines changed

4 files changed

+58
-17
lines changed

data/base_data_loader.py

Lines changed: 33 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
import numpy as np
55
import sugartensor as tf
66
from abc import abstractclassmethod
7+
from tensorflow.contrib.tensorboard.plugins import projector
78

89
from data.preprocessors.kaggle_preprocessor import KagglePreprocessor
910

@@ -24,10 +25,14 @@ class BaseDataLoader(object):
2425
DEFAULT_VOCABULARY_SIZE = 50000
2526
DEFAULT_PRETRAINED_EMBEDDINGS = 'data/embeddings/glove.6B.300d.txt'
2627

28+
DEFAULT_META_DATA_FILE = 'metadata.tsv'
29+
DEFAULT_METADATA_DIR = 'asset/train/'
30+
2731
def __init__(self, record_defaults, field_delim, data_column, bucket_boundaries, file_names,
2832
skip_header_lines=_DEFAULT_SKIP_HEADER_LINES,
2933
num_threads=_num_threads, batch_size=_batch_size, min_after_dequeue=_min_after_dequeue,
30-
capacity=_capacity, used_for_test_data=False, name=_name):
34+
capacity=_capacity, used_for_test_data=False, meta_file=DEFAULT_META_DATA_FILE,
35+
save_dir=DEFAULT_METADATA_DIR, name=_name):
3136
self.__file_names = file_names
3237
self.__field_delim = field_delim
3338
self.__record_defaults = record_defaults
@@ -42,6 +47,8 @@ def __init__(self, record_defaults, field_delim, data_column, bucket_boundaries,
4247
self._capacity = capacity
4348
self._name = name
4449

50+
self.meta_file = meta_file
51+
self.save_dir = save_dir
4552
self.table = None
4653
self.num_threads = num_threads
4754
self.vocabulary_size = 0
@@ -210,7 +217,7 @@ def preload_embeddings(self, embed_dim, file_name=DEFAULT_PRETRAINED_EMBEDDINGS)
210217

211218
if word in dictionary:
212219
mapped_words = mapped_words + 1
213-
pre_trained_emb[dictionary[word]] = row[1:]
220+
pre_trained_emb[dictionary[word]-1] = row[1:]
214221
del missing_words[word]
215222

216223
print('Mapped words to pre-trained embeddings: %d' % mapped_words)
@@ -221,3 +228,27 @@ def preload_embeddings(self, embed_dim, file_name=DEFAULT_PRETRAINED_EMBEDDINGS)
221228
print('Loaded pre-trained embeddings')
222229

223230
return pre_trained_emb
231+
232+
def visualize_embeddings(self, sess, tensor, name):
233+
"""
234+
Visualises an embedding tensor in TensorBoard
235+
236+
:param sess: Tensorflow session object
237+
:param tensor: The embedding tensor to be visualized
238+
:param name: Name of the tensor
239+
"""
240+
241+
# create the directory if it does not exist
242+
if not tf.os.path.exists(self.save_dir):
243+
tf.os.makedirs(self.save_dir)
244+
245+
# summary writer
246+
summary_writer = tf.summary.FileWriter(self.save_dir, graph=tf.get_default_graph())
247+
248+
# embedding visualizer
249+
config = projector.ProjectorConfig()
250+
emb = config.embeddings.add()
251+
emb.tensor_name = name # tensor
252+
emb.metadata_path = tf.os.path.join(self.save_dir, self.meta_file) # metadata file
253+
print(tf.os.path.abspath(emb.metadata_path))
254+
projector.visualize_embeddings(summary_writer, config)

data/kaggle_loader.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,9 @@ class KaggleLoader(BaseDataLoader):
1111
TSV_DELIM = '\t'
1212
DATA_COLUMN = 'review'
1313

14+
DEFAULT_META_DATA_FILE = 'metadata_labeledTrainData.tsv'
15+
DEFAULT_METADATA_DIR = 'data/datasets/kaggle_popcorn_challenge/'
16+
1417
def __init__(self, bucket_boundaries, file_names, *args, **kwargs):
1518
self.__file_preprocessor = None
1619

@@ -22,7 +25,8 @@ def __init__(self, bucket_boundaries, file_names, *args, **kwargs):
2225
data_column = KaggleLoader.DATA_COLUMN
2326

2427
super().__init__(record_defaults, self.field_delim, data_column, bucket_boundaries, file_names, *args,
25-
skip_header_lines=skip_header_lines, **kwargs)
28+
skip_header_lines=skip_header_lines, meta_file=KaggleLoader.DEFAULT_META_DATA_FILE,
29+
save_dir=KaggleLoader.DEFAULT_METADATA_DIR, **kwargs)
2630

2731
self.source, self.target = self.get_data()
2832

model/model.py

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -13,13 +13,13 @@
1313
GLOVE_6B_200d_EMBEDDINGS = 'glove.6B.200d.txt'
1414
GLOVE_6B_300d_EMBEDDINGS = 'glove.6B.300d.txt'
1515

16-
embedding_dim = 100 # embedding dimension
17-
latent_dim = 64 # hidden layer dimension
18-
num_blocks = 1 # dilated blocks
16+
embedding_dim = 300 # 300 # embedding dimension
17+
latent_dim = 64 # 256 # hidden layer dimension
18+
num_blocks = 1 # 2 # dilated blocks
1919
reg_type = 'l2' # type of regularization used
20-
default_dout = 0.2 # define the default dropout rate
20+
default_dout = 0.5 # define the default dropout rate
2121
use_pre_trained_embeddings = True # whether to use pre-trained embedding vectors
22-
pre_trained_embeddings_file = EMBEDDINGS_DIR + GLOVE_6B_100d_EMBEDDINGS # the location of the pre-trained embeddings
22+
pre_trained_embeddings_file = EMBEDDINGS_DIR + GLOVE_6B_300d_EMBEDDINGS # the location of the pre-trained embeddings
2323

2424

2525
# residual block
@@ -64,11 +64,11 @@ def classifier(x, num_classes, voca_size, test=False):
6464
# loop dilated causal conv block
6565
for i in range(num_blocks):
6666
res = (res
67-
.sg_res_block(size=3, block=i, rate=1, causal=False, is_first=True)
68-
.sg_res_block(size=3, block=i, rate=2, causal=False)
69-
.sg_res_block(size=3, block=i, rate=4, causal=False)
70-
.sg_res_block(size=3, block=i, rate=8, causal=False)
71-
.sg_res_block(size=3, block=i, rate=16, causal=False))
67+
.sg_res_block(size=8, block=i, rate=1, causal=False, is_first=True)
68+
.sg_res_block(size=8, block=i, rate=2, causal=False)
69+
.sg_res_block(size=8, block=i, rate=4, causal=False)
70+
.sg_res_block(size=5, block=i, rate=8, causal=False)
71+
.sg_res_block(size=5, block=i, rate=16, causal=False))
7272

7373
in_dim = res.get_shape().as_list()[-1]
7474
res = res.sg_conv1d(size=1, dim=in_dim, dout=dropout, bn=True, regularizer=reg_type, name='conv_dout_final')

train.py

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44

55
__author__ = 'georgi.val.stoyan0v@gmail.com'
66

7-
BATCH_SIZE = 32
7+
BATCH_SIZE = 64
88

99
BUCKETS = [100, 170, 240, 290, 340]
1010
DATA_FILE = ['data/datasets/kaggle_popcorn_challenge/labeledTrainData.tsv']
@@ -21,13 +21,18 @@
2121

2222
# setup embeddings, preload pre-trained embeddings if needed
2323
emb = None
24+
embedding_name = 'emb'
25+
2426
if use_pre_trained_embeddings:
2527
embedding_matrix = data.preload_embeddings(embedding_dim, pre_trained_embeddings_file)
26-
emb = init_custom_embeddings(name='emb_x', embeddings_matrix=embedding_matrix)
28+
emb = init_custom_embeddings(name=embedding_name, embeddings_matrix=embedding_matrix, trainable=True)
2729
else:
28-
emb = tf.sg_emb(name='emb', voca_size=data.vocabulary_size, dim=embedding_dim)
30+
emb = tf.sg_emb(name=embedding_name, voca_size=data.vocabulary_size, dim=embedding_dim)
31+
32+
data.visualize_embeddings(sess, emb, embedding_name)
2933

3034

35+
# setup the model for training and validation. Enable multi-GPU support
3136
@tf.sg_parallel
3237
def get_train_loss(opt):
3338
with tf.sg_context(name='model'):
@@ -59,7 +64,8 @@ def get_val_metrics(opt):
5964

6065
return acc, val_loss
6166

67+
6268
# train
63-
classifier_train(sess=sess, log_interval=50, lr=1e-3, loss=get_train_loss(input=x, target=y),
69+
classifier_train(sess=sess, log_interval=50, lr=1e-3, loss=get_train_loss(input=x, target=y)[0],
6470
eval_metric=get_val_metrics(input=val_x, target=val_y)[0],
6571
ep_size=data.num_batches, max_ep=10, early_stop=False, data=data)

0 commit comments

Comments
 (0)