- added visualization of embeddings

randomrandom · randomrandom · commit 915f78316441 · 2017-06-30T07:56:01.000+03:00
- small bug fixes
diff --git a/data/base_data_loader.py b/data/base_data_loader.py
@@ -4,6 +4,7 @@
 import numpy as np
 import sugartensor as tf
 from abc import abstractclassmethod
+from tensorflow.contrib.tensorboard.plugins import projector
 
 from data.preprocessors.kaggle_preprocessor import KagglePreprocessor
 
@@ -24,10 +25,14 @@ class BaseDataLoader(object):
     DEFAULT_VOCABULARY_SIZE = 50000
     DEFAULT_PRETRAINED_EMBEDDINGS = 'data/embeddings/glove.6B.300d.txt'
 
+    DEFAULT_META_DATA_FILE = 'metadata.tsv'
+    DEFAULT_METADATA_DIR = 'asset/train/'
+
     def __init__(self, record_defaults, field_delim, data_column, bucket_boundaries, file_names,
                  skip_header_lines=_DEFAULT_SKIP_HEADER_LINES,
                  num_threads=_num_threads, batch_size=_batch_size, min_after_dequeue=_min_after_dequeue,
-                 capacity=_capacity, used_for_test_data=False, name=_name):
+                 capacity=_capacity, used_for_test_data=False, meta_file=DEFAULT_META_DATA_FILE,
+                 save_dir=DEFAULT_METADATA_DIR, name=_name):
         self.__file_names = file_names
         self.__field_delim = field_delim
         self.__record_defaults = record_defaults
@@ -42,6 +47,8 @@ def __init__(self, record_defaults, field_delim, data_column, bucket_boundaries,
         self._capacity = capacity
         self._name = name
 
+        self.meta_file = meta_file
+        self.save_dir = save_dir
         self.table = None
         self.num_threads = num_threads
         self.vocabulary_size = 0
@@ -210,7 +217,7 @@ def preload_embeddings(self, embed_dim, file_name=DEFAULT_PRETRAINED_EMBEDDINGS)
 
                 if word in dictionary:
                     mapped_words = mapped_words + 1
-                    pre_trained_emb[dictionary[word]] = row[1:]
+                    pre_trained_emb[dictionary[word]-1] = row[1:]
                     del missing_words[word]
 
             print('Mapped words to pre-trained embeddings: %d' % mapped_words)
@@ -221,3 +228,27 @@ def preload_embeddings(self, embed_dim, file_name=DEFAULT_PRETRAINED_EMBEDDINGS)
         print('Loaded pre-trained embeddings')
 
         return pre_trained_emb
+
+    def visualize_embeddings(self, sess, tensor, name):
+        """
+        Visualises an embedding tensor in TensorBoard
+
+        :param sess: Tensorflow session object
+        :param tensor: The embedding tensor to be visualized
+        :param name: Name of the tensor
+        """
+
+        # make the directory if it does not exist
+        if not tf.os.path.exists(self.save_dir):
+            tf.os.makedirs(self.save_dir)
+
+        # summary writer
+        summary_writer = tf.summary.FileWriter(self.save_dir, graph=tf.get_default_graph())
+
+        # embedding visualizer
+        config = projector.ProjectorConfig()
+        emb = config.embeddings.add()
+        emb.tensor_name = name  # tensor
+        emb.metadata_path = tf.os.path.join(self.save_dir, self.meta_file)  # metadata file
+        print(tf.os.path.abspath(emb.metadata_path))
+        projector.visualize_embeddings(summary_writer, config)
diff --git a/data/kaggle_loader.py b/data/kaggle_loader.py
@@ -11,6 +11,9 @@ class KaggleLoader(BaseDataLoader):
     TSV_DELIM = '\t'
     DATA_COLUMN = 'review'
 
+    DEFAULT_META_DATA_FILE = 'metadata_labeledTrainData.tsv'
+    DEFAULT_METADATA_DIR = 'data/datasets/kaggle_popcorn_challenge/'
+
     def __init__(self, bucket_boundaries, file_names, *args, **kwargs):
         self.__file_preprocessor = None
 
@@ -22,7 +25,8 @@ def __init__(self, bucket_boundaries, file_names, *args, **kwargs):
         data_column = KaggleLoader.DATA_COLUMN
 
         super().__init__(record_defaults, self.field_delim, data_column, bucket_boundaries, file_names, *args,
-                         skip_header_lines=skip_header_lines, **kwargs)
+                         skip_header_lines=skip_header_lines, meta_file=KaggleLoader.DEFAULT_META_DATA_FILE,
+                         save_dir=KaggleLoader.DEFAULT_METADATA_DIR, **kwargs)
 
         self.source, self.target = self.get_data()
 
diff --git a/model/model.py b/model/model.py
@@ -13,13 +13,13 @@
 GLOVE_6B_200d_EMBEDDINGS = 'glove.6B.200d.txt'
 GLOVE_6B_300d_EMBEDDINGS = 'glove.6B.300d.txt'
 
-embedding_dim = 100  # embedding dimension
-latent_dim = 64  # hidden layer dimension
-num_blocks = 1  # dilated blocks
+embedding_dim = 300  # 300 # embedding dimension
+latent_dim = 64  # 256 # hidden layer dimension
+num_blocks = 1  # 2 # dilated blocks
 reg_type = 'l2'  # type of regularization used
-default_dout = 0.2  # define the default dropout rate
+default_dout = 0.5  # define the default dropout rate
 use_pre_trained_embeddings = True  # whether to use pre-trained embedding vectors
-pre_trained_embeddings_file = EMBEDDINGS_DIR + GLOVE_6B_100d_EMBEDDINGS  # the location of the pre-trained embeddings
+pre_trained_embeddings_file = EMBEDDINGS_DIR + GLOVE_6B_300d_EMBEDDINGS  # the location of the pre-trained embeddings
 
 
 # residual block
@@ -64,11 +64,11 @@ def classifier(x, num_classes, voca_size, test=False):
         # loop dilated causal conv block
         for i in range(num_blocks):
             res = (res
-                   .sg_res_block(size=3, block=i, rate=1, causal=False, is_first=True)
-                   .sg_res_block(size=3, block=i, rate=2, causal=False)
-                   .sg_res_block(size=3, block=i, rate=4, causal=False)
-                   .sg_res_block(size=3, block=i, rate=8, causal=False)
-                   .sg_res_block(size=3, block=i, rate=16, causal=False))
+                   .sg_res_block(size=8, block=i, rate=1, causal=False, is_first=True)
+                   .sg_res_block(size=8, block=i, rate=2, causal=False)
+                   .sg_res_block(size=8, block=i, rate=4, causal=False)
+                   .sg_res_block(size=5, block=i, rate=8, causal=False)
+                   .sg_res_block(size=5, block=i, rate=16, causal=False))
 
         in_dim = res.get_shape().as_list()[-1]
         res = res.sg_conv1d(size=1, dim=in_dim, dout=dropout, bn=True, regularizer=reg_type, name='conv_dout_final')
diff --git a/train.py b/train.py
@@ -4,7 +4,7 @@
 
 __author__ = 'georgi.val.stoyan0v@gmail.com'
 
-BATCH_SIZE = 32
+BATCH_SIZE = 64
 
 BUCKETS = [100, 170, 240, 290, 340]
 DATA_FILE = ['data/datasets/kaggle_popcorn_challenge/labeledTrainData.tsv']
@@ -21,13 +21,18 @@
 
 # setup embeddings, preload pre-trained embeddings if needed
 emb = None
+embedding_name = 'emb'
+
 if use_pre_trained_embeddings:
     embedding_matrix = data.preload_embeddings(embedding_dim, pre_trained_embeddings_file)
-    emb = init_custom_embeddings(name='emb_x', embeddings_matrix=embedding_matrix)
+    emb = init_custom_embeddings(name=embedding_name, embeddings_matrix=embedding_matrix, trainable=True)
 else:
-    emb = tf.sg_emb(name='emb', voca_size=data.vocabulary_size, dim=embedding_dim)
+    emb = tf.sg_emb(name=embedding_name, voca_size=data.vocabulary_size, dim=embedding_dim)
+
+data.visualize_embeddings(sess, emb, embedding_name)
 
 
+# setup the model for training and validation. Enable multi-GPU support
 @tf.sg_parallel
 def get_train_loss(opt):
     with tf.sg_context(name='model'):
@@ -59,7 +64,8 @@ def get_val_metrics(opt):
 
         return acc, val_loss
 
+
 # train
-classifier_train(sess=sess, log_interval=50, lr=1e-3, loss=get_train_loss(input=x, target=y),
+classifier_train(sess=sess, log_interval=50, lr=1e-3, loss=get_train_loss(input=x, target=y)[0],
                  eval_metric=get_val_metrics(input=val_x, target=val_y)[0],
                  ep_size=data.num_batches, max_ep=10, early_stop=False, data=data)