@@ -66,11 +66,36 @@ def __getitem__(self, item):
6666 return self .wv [item ] if index not in self .constants .values () else np .zeros ((self .vector_size ,))
6767
def infer_vector(self, items, agg="mean", **kwargs) -> list:
    """
    Get one sentence embedding per item by aggregating its token embeddings.

    Parameters
    ----------
    items: list
        the token lists after tokenizer processing, one entry per sentence
    agg: str
        name of the numpy reduction applied over the token axis
        (``axis=0``), e.g. "mean", "sum" or "max"; a dotted name such as
        "linalg.norm" is resolved attribute by attribute starting at ``np``
    kwargs:
        forwarded unchanged to ``self.infer_tokens``

    Return
    ------
    vector: list
        [array(), ..., array()], one vector per entry of ``items``; an entry
        without token vectors yields a zero vector of length
        ``self.vector_size``

    Raises
    ------
    AttributeError
        if ``agg`` does not name an attribute reachable from ``np``
    """
    # Resolve the reduction once with getattr instead of eval(): the name is
    # only ever used for attribute lookup, so a caller-supplied string can no
    # longer execute arbitrary code.
    reducer = np
    for attribute in str(agg).split("."):
        reducer = getattr(reducer, attribute)

    token_vectors = self.infer_tokens(items, **kwargs)
    return [
        reducer(vectors, axis=0) if vectors else np.zeros(self.vector_size)
        for vectors in token_vectors
    ]
7284
def infer_tokens(self, items, **kwargs) -> list:
    """
    Get the token embeddings of every item with the word2vec model.

    Parameters
    ----------
    items: list
        the token lists after tokenizer processing, one entry per sentence
    kwargs:
        accepted for interface compatibility; not used here

    Return
    ------
    vector: list
        [[array(), ..., array()], [...], [...]], one inner list per entry
        of ``items``, built by calling the model with that entry's tokens
    """
    embedded = []
    for tokens in items:
        # The instance is callable: it receives the tokens as positional
        # arguments and its result is materialised into a list.
        embedded.append(list(self(*tokens)))
    return embedded
75100
76101
@@ -95,6 +120,19 @@ def __init__(self, filepath):
95120 self .dictionary = corpora .Dictionary .load (filepath )
96121
97122 def infer_vector (self , item , return_vec = False ):
123+ """
124+ get Bow vector
125+
126+ Parameters
127+ ----------
128+ item: list
129+ the tokens after tokenizer processing
130+
131+ Return
132+ ------
133+ vector: list
134+ [array(), ..., array()]
135+ """
98136 item = self .dictionary .doc2bow (item )
99137 if not return_vec :
100138 return item # return dic as default
@@ -121,6 +159,19 @@ def __init__(self, filepath):
121159 self .dictionary = corpora .Dictionary .load (dictionary_path )
122160
123161 def infer_vector (self , item , return_vec = False ):
162+ """
163+ get Tf-idf vector
164+
165+ Parameters
166+ ----------
167+ item: list
168+ the tokens after tokenizer processing
169+
170+ Return
171+ ------
172+ vector: list
173+ [array(), ..., array()]
174+ """
124175 dic_item = self .dictionary .doc2bow (item )
125176 tfidf_item = self .tfidf_model [dic_item ]
126177 # return dic as default
@@ -181,7 +232,25 @@ def vector_size(self):
181232 return self .d2v .vector_size
182233
def infer_vector(self, items, *args, **kwargs) -> list:
    """
    Get one vector per item with the D2V model.

    Parameters
    ----------
    items: list
        the token lists after tokenizer processing, one entry per document
    args, kwargs:
        accepted for interface compatibility; not used here

    Return
    ------
    vector: list
        one result of calling the model per entry of ``items``
        (presumably [array(), ..., array()]; the call itself is defined
        outside this method)
    """
    vectors = []
    for tokens in items:
        # The instance is callable and receives the whole token list.
        vectors.append(self(tokens))
    return vectors
185249
def infer_tokens(self, item, *args, **kwargs) -> ...:
    """
    Get token embeddings with the D2V model (not supported).

    Every call raises, whatever arguments are passed.

    Raises
    ------
    NotImplementedError
        always
    """
    raise NotImplementedError()
0 commit comments