1- import copy
1+ import copy
22import functools
33
44import numpy as np
@@ -139,9 +139,9 @@ def test_featurevalidator_fitontypeA_transformtypeB(input_data_featuretest):
139139 if isinstance (input_data_featuretest , pd .DataFrame ):
140140 pytest .skip ("Column order change in pandas is not supported" )
141141 elif isinstance (input_data_featuretest , np .ndarray ):
142- complementary_type = validator .numpy_to_pandas (input_data_featuretest )
142+ complementary_type = validator .numpy_array_to_pandas (input_data_featuretest )
143143 elif isinstance (input_data_featuretest , list ):
144- complementary_type , _ = validator .list_to_pandas (input_data_featuretest )
144+ complementary_type , _ = validator .list_to_dataframe (input_data_featuretest )
145145 elif sparse .issparse (input_data_featuretest ):
146146 complementary_type = sparse .csr_matrix (input_data_featuretest .todense ())
147147 else :
@@ -167,128 +167,10 @@ def test_featurevalidator_get_columns_to_encode():
167167 for col in df .columns :
168168 df [col ] = df [col ].astype (col )
169169
170- < << << << HEAD
171170 transformed_columns , feature_types = validator ._get_columns_to_encode (df )
172171
173172 assert transformed_columns == ['category' , 'bool' ]
174173 assert feature_types == ['numerical' , 'numerical' , 'categorical' , 'categorical' ]
175- == == == =
176- validator .fit (df )
177-
178- categorical_columns , numerical_columns , feat_type = validator ._get_columns_info (df )
179-
180- assert numerical_columns == ['int' , 'float' ]
181- assert categorical_columns == ['category' , 'bool' ]
182- assert feat_type == ['numerical' , 'numerical' , 'categorical' , 'categorical' ]
183-
184-
185- def feature_validator_remove_nan_catcolumns (df_train : pd .DataFrame , df_test : pd .DataFrame ,
186- ans_train : np .ndarray , ans_test : np .ndarray ) - > None :
187- validator = TabularFeatureValidator ()
188- validator .fit (df_train )
189- transformed_df_train = validator .transform (df_train )
190- transformed_df_test = validator .transform (df_test )
191-
192- assert np .array_equal (transformed_df_train , ans_train )
193- assert np .array_equal (transformed_df_test , ans_test )
194-
195-
196- def test_feature_validator_remove_nan_catcolumns ():
197- """
198- Make sure categorical columns that have only nan values are removed.
199- Transform performs the following:
200- * simple imputation for both
201- * scaling for numerical
202- * one-hot encoding for categorical
203- For example,
204- data = [
205- {'A': 1, 'B': np.nan, 'C': np.nan},
206- {'A': np.nan, 'B': 3, 'C': np.nan},
207- {'A': 2, 'B': np.nan, 'C': np.nan}
208- ]
209- and suppose all the columns are categorical,
210- then
211- * `A` in {np.nan, 1, 2}
212- * `B` in {np.nan, 3}
213- * `C` in {np.nan} <=== it will be dropped.
214-
215- So in the column A,
216- * np.nan ==> [1, 0, 0]
217- * 1 ==> [0, 1, 0]
218- * 2 ==> [0, 0, 1]
219- in the column B,
220- * np.nan ==> [1, 0]
221- * 3 ==> [0, 1]
222- Therefore, by concatenating,
223- * {'A': 1, 'B': np.nan, 'C': np.nan} ==> [0, 1, 0, 1, 0]
224- * {'A': np.nan, 'B': 3, 'C': np.nan} ==> [1, 0, 0, 0, 1]
225- * {'A': 2, 'B': np.nan, 'C': np.nan} ==> [0, 0, 1, 1, 0]
226- """
227- # First case, there exist null columns (B and C) in the train set
228- # and one of those columns (C) is not all null in the test set.
229-
230- df_train = pd .DataFrame (
231- [
232- {'A' : 1 , 'B' : np .nan , 'C' : np .nan },
233- {'A' : np .nan , 'C' : np .nan },
234- {'A' : 1 }
235- ],
236- dtype = 'category' ,
237- )
238- ans_train = np .array ([[0 , 1 ], [1 , 0 ], [0 , 1 ]], dtype = np .float64 )
239- df_test = pd .DataFrame (
240- [
241- {'A' : np .nan , 'B' : np .nan , 'C' : 5 },
242- {'A' : np .nan , 'C' : np .nan },
243- {'A' : 1 }
244- ],
245- dtype = 'category' ,
246- )
247- ans_test = np .array ([[1 , 0 ], [1 , 0 ], [0 , 1 ]], dtype = np .float64 )
248- feature_validator_remove_nan_catcolumns (df_train , df_test , ans_train , ans_test )
249-
250- # Second case, there exist null columns (B and C) in the training set and
251- # the same columns (B and C) are null in the test set.
252- df_train = pd .DataFrame (
253- [
254- {'A' : 1 , 'B' : np .nan , 'C' : np .nan },
255- {'A' : np .nan , 'C' : np .nan },
256- {'A' : 1 }
257- ],
258- dtype = 'category' ,
259- )
260- ans_train = np .array ([[0 , 1 ], [1 , 0 ], [0 , 1 ]], dtype = np .float64 )
261- df_test = pd .DataFrame (
262- [
263- {'A' : np .nan , 'B' : np .nan , 'C' : np .nan },
264- {'A' : np .nan , 'C' : np .nan },
265- {'A' : 1 }
266- ],
267- dtype = 'category' ,
268- )
269- ans_test = np .array ([[1 , 0 ], [1 , 0 ], [0 , 1 ]], dtype = np .float64 )
270- feature_validator_remove_nan_catcolumns (df_train , df_test , ans_train , ans_test )
271-
272- # Third case, there exist no null columns in the training set and
273- # null columns exist in the test set.
274- df_train = pd .DataFrame (
275- [
276- {'A' : 1 , 'B' : 1 },
277- {'A' : 2 , 'B' : 2 }
278- ],
279- dtype = 'category' ,
280- )
281- ans_train = np .array ([[1 , 0 , 1 , 0 ], [0 , 1 , 0 , 1 ]], dtype = np .float64 )
282- df_test = pd .DataFrame (
283- [
284- {'A' : np .nan , 'B' : np .nan },
285- {'A' : np .nan , 'B' : np .nan }
286- ],
287- dtype = 'category' ,
288- )
289- ans_test = np .array ([[0 , 0 , 0 , 0 ], [0 , 0 , 0 , 0 ]], dtype = np .float64 )
290- feature_validator_remove_nan_catcolumns (df_train , df_test , ans_train , ans_test )
291- >> >> >> > Bug fixes (#249)
292174
293175
294176def test_features_unsupported_calls_are_raised ():
@@ -529,6 +411,7 @@ def test_comparator():
529411 assert ans == feat_type
530412
531413
414+ < << << << HEAD
532415@pytest .fixture
533416def input_data_feature_feat_types (request ):
534417 if request .param == 'pandas_categoricalonly' :
@@ -648,6 +531,8 @@ def test_feature_validator_get_columns_to_encode_error_feat_type(input_data_feat
648531 with pytest .raises (ValueError , match = r"Expected type of features to be in .*" ):
649532 validator ._validate_feat_types (X )
650533
534+ == == == =
535+ >> >> >> > [FIX ] Passing checks (#298)
651536def test_feature_validator_imbalanced_data ():
652537
653538 # Null columns in the train split but not necessarily in the test split
@@ -670,16 +555,15 @@ def test_feature_validator_imbalanced_data():
670555 validator .fit (X_train )
671556
672557 train_feature_types = copy .deepcopy (validator .feat_type )
673- assert train_feature_types == ['numerical' ]
558+ assert train_feature_types == ['numerical' , 'numerical' , 'numerical' , 'numerical' ]
674559 # validator will throw an error if the column types are not the same
675560 transformed_X_test = validator .transform (X_test )
676561 transformed_X_test = pd .DataFrame (transformed_X_test )
677- assert sorted (validator .all_nan_columns ) == sorted (['A' , 'C' , 'D' ])
678- # as there are no categorical columns, we can make such an
679- # assertion. We only expect to drop the all nan columns
680- total_all_nan_columns = len (validator .all_nan_columns )
681- total_columns = len (validator .column_order )
682- assert total_columns - total_all_nan_columns == len (transformed_X_test .columns )
562+ null_columns = []
563+ for column in transformed_X_test .columns :
564+ if transformed_X_test [column ].isna ().all ():
565+ null_columns .append (column )
566+ assert null_columns == [0 , 2 , 3 ]
683567
684568 # Columns that are not entirely null in the train split but are
685569 # completely null in the test split.
@@ -698,12 +582,12 @@ def test_feature_validator_imbalanced_data():
698582 X_test = pd .DataFrame .from_dict (test_features )
699583 validator = TabularFeatureValidator ()
700584 validator .fit (X_train )
701-
702585 train_feature_types = copy .deepcopy (validator .feat_type )
703586 assert train_feature_types == ['categorical' , 'numerical' , 'numerical' ]
704587
705588 transformed_X_test = validator .transform (X_test )
706589 transformed_X_test = pd .DataFrame (transformed_X_test )
590+ << < << << HEAD
707591 assert not len (validator .all_nan_columns )
708592
709593
@@ -733,3 +617,11 @@ def test_comparator():
733617 )
734618 assert ans == feat_type
735619>> > >> >> Bug fixes (#249)
620+ == == == =
621+ null_columns = []
622+ for column in transformed_X_test .columns :
623+ if transformed_X_test [column ].isna ().all ():
624+ null_columns .append (column )
625+
626+ assert null_columns == [1 ]
627+ >> >> > >> [FIX ] Passing checks (#298)
0 commit comments