@@ -205,7 +205,6 @@ def test_featurevalidator_supported_types(input_data_featuretest):
205205 assert sparse .issparse (transformed_X )
206206 else :
207207 assert isinstance (transformed_X , np .ndarray )
208- assert np .shape (input_data_featuretest ) == np .shape (transformed_X )
209208 assert np .issubdtype (transformed_X .dtype , np .number )
210209 assert validator ._is_fitted
211210
@@ -238,11 +237,10 @@ def test_featurevalidator_categorical_nan(input_data_featuretest):
238237 validator .fit (input_data_featuretest )
239238 transformed_X = validator .transform (input_data_featuretest )
240239 assert any (pd .isna (input_data_featuretest ))
241- categories_ = validator .column_transformer .named_transformers_ [ 'categorical_pipeline' ]. \
242- named_steps ['ordinalencoder ' ].categories_
240+ categories_ = validator .column_transformer .\
241+ named_transformers_ [ 'categorical_pipeline' ]. named_steps ['onehotencoder ' ].categories_
243242 assert any (('0' in categories ) or (0 in categories ) or ('missing_value' in categories ) for categories in
244243 categories_ )
245- assert np .shape (input_data_featuretest ) == np .shape (transformed_X )
246244 assert np .issubdtype (transformed_X .dtype , np .number )
247245 assert validator ._is_fitted
248246 assert isinstance (transformed_X , np .ndarray )
@@ -295,7 +293,6 @@ def test_featurevalidator_fitontypeA_transformtypeB(input_data_featuretest):
295293 else :
296294 raise ValueError (type (input_data_featuretest ))
297295 transformed_X = validator .transform (complementary_type )
298- assert np .shape (input_data_featuretest ) == np .shape (transformed_X )
299296 assert np .issubdtype (transformed_X .dtype , np .number )
300297 assert validator ._is_fitted
301298
@@ -315,12 +312,6 @@ def test_featurevalidator_get_columns_to_encode():
315312 for col in df .columns :
316313 df [col ] = df [col ].astype (col )
317314
318- < << << << HEAD
319- transformed_columns , feature_types = validator ._get_columns_to_encode (df )
320-
321- assert transformed_columns == ['category' , 'bool' ]
322- assert feature_types == ['numerical' , 'numerical' , 'categorical' , 'categorical' ]
323- == == == =
324315 validator .fit (df )
325316
326317 categorical_columns , numerical_columns , feat_type = validator ._get_columns_info (df )
@@ -436,7 +427,6 @@ def test_feature_validator_remove_nan_catcolumns():
436427 )
437428 ans_test = np .array ([[0 , 0 , 0 , 0 ], [0 , 0 , 0 , 0 ]], dtype = np .float64 )
438429 feature_validator_remove_nan_catcolumns (df_train , df_test , ans_train , ans_test )
439- >> >> >> > Bug fixes (#249)
440430
441431
442432def test_features_unsupported_calls_are_raised ():
@@ -446,36 +436,29 @@ def test_features_unsupported_calls_are_raised():
446436 expected
447437 """
448438 validator = TabularFeatureValidator ()
449- with pytest .raises (ValueError , match = r"AutoPyTorch does not support time " ):
439+ with pytest .raises (TypeError , match = r".*?Convert the time information to a numerical value " ):
450440 validator .fit (
451441 pd .DataFrame ({'datetime' : [pd .Timestamp ('20180310' )]})
452442 )
443+ validator = TabularFeatureValidator ()
453444 with pytest .raises (ValueError , match = r"AutoPyTorch only supports.*yet, the provided input" ):
454445 validator .fit ({'input1' : 1 , 'input2' : 2 })
455- with pytest .raises (ValueError , match = r"has unsupported dtype string" ):
446+ validator = TabularFeatureValidator ()
447+ with pytest .raises (TypeError , match = r".*?but input column A has an invalid type `string`.*" ):
456448 validator .fit (pd .DataFrame ([{'A' : 1 , 'B' : 2 }], dtype = 'string' ))
449+ validator = TabularFeatureValidator ()
457450 with pytest .raises (ValueError , match = r"The feature dimensionality of the train and test" ):
458451 validator .fit (X_train = np .array ([[1 , 2 , 3 ], [4 , 5 , 6 ]]),
459452 X_test = np .array ([[1 , 2 , 3 , 4 ], [4 , 5 , 6 , 7 ]]),
460453 )
454+ validator = TabularFeatureValidator ()
461455 with pytest .raises (ValueError , match = r"Cannot call transform on a validator that is not fit" ):
462456 validator .transform (np .array ([[1 , 2 , 3 ], [4 , 5 , 6 ]]))
463457
464458
465459@pytest .mark .parametrize (
466460 'input_data_featuretest' ,
467461 (
468- 'numpy_numericalonly_nonan' ,
469- 'numpy_numericalonly_nan' ,
470- 'pandas_numericalonly_nonan' ,
471- 'pandas_numericalonly_nan' ,
472- 'list_numericalonly_nonan' ,
473- 'list_numericalonly_nan' ,
474- # Category in numpy is handled via feat_type
475- 'numpy_categoricalonly_nonan' ,
476- 'numpy_mixed_nonan' ,
477- 'numpy_categoricalonly_nan' ,
478- 'numpy_mixed_nan' ,
479462 'sparse_bsr_nonan' ,
480463 'sparse_bsr_nan' ,
481464 'sparse_coo_nonan' ,
@@ -513,7 +496,7 @@ def test_no_column_transformer_created(input_data_featuretest):
513496)
514497def test_column_transformer_created (input_data_featuretest ):
515498 """
516- This test ensures an encoder is created if categorical data is provided
499+ This test ensures a column transformer is created if categorical data is provided
517500 """
518501 validator = TabularFeatureValidator ()
519502 validator .fit (input_data_featuretest )
@@ -522,7 +505,7 @@ def test_column_transformer_created(input_data_featuretest):
522505
523506 # Make sure that the encoded features are actually encoded. Categorical columns are at
524507 # the start after transformation. In our fixtures, this is also honored prior encode
525- transformed_columns , feature_types = validator ._get_columns_to_encode (input_data_featuretest )
508+ cat_columns , _ , feature_types = validator ._get_columns_info (input_data_featuretest )
526509
527510 # At least one categorical
528511 assert 'categorical' in validator .feat_type
@@ -531,20 +514,13 @@ def test_column_transformer_created(input_data_featuretest):
531514 if np .any ([pd .api .types .is_numeric_dtype (input_data_featuretest [col ]
532515 ) for col in input_data_featuretest .columns ]):
533516 assert 'numerical' in validator .feat_type
534- for i , feat_type in enumerate (feature_types ):
535- if 'numerical' in feat_type :
536- np .testing .assert_array_equal (
537- transformed_X [:, i ],
538- input_data_featuretest [input_data_featuretest .columns [i ]].to_numpy ()
539- )
540- elif 'categorical' in feat_type :
541- np .testing .assert_array_equal (
542- transformed_X [:, i ],
543- # Expect always 0, 1... because we use a ordinal encoder
544- np .array ([0 , 1 ])
545- )
546- else :
547- raise ValueError (feat_type )
517+ # we expect this input to be the fixture 'pandas_mixed_nan'
518+ np .testing .assert_array_equal (transformed_X , np .array ([[1. , 0. , - 1. ], [0. , 1. , 1. ]]))
519+ else :
520+ np .testing .assert_array_equal (transformed_X , np .array ([[1. , 0. , 1. , 0. ], [0. , 1. , 0. , 1. ]]))
521+
522+ if not all ([feat_type in ['numerical' , 'categorical' ] for feat_type in feature_types ]):
523+ raise ValueError ("Expected only numerical and categorical feature types" )
548524
549525
550526def test_no_new_category_after_fit ():
@@ -576,13 +552,12 @@ def test_unknown_encode_value():
576552 x ['c' ].cat .add_categories (['NA' ], inplace = True )
577553 x .loc [0 , 'c' ] = 'NA' # unknown value
578554 x_t = validator .transform (x )
579- # The first row should have a -1 as we added a new categorical there
580- expected_row = [- 1 , - 41 , - 3 , - 987.2 ]
555+ # The first row should start with 0, 0: we added a
556+ # new category there, and the one-hot encoder marks
557+ # an unknown category as all zeros in the encoded columns
558+ expected_row = [0.0 , 0.0 , - 0.5584294383572701 , 0.5000000000000004 , - 1.5136598016833485 ]
581559 assert expected_row == x_t [0 ].tolist ()
582560
583- # Notice how there is only one column 'c' to encode
584- assert validator .categories == [list (range (2 )) for i in range (1 )]
585-
586561
587562# Actual checks for the features
588563@pytest .mark .parametrize (
@@ -634,19 +609,20 @@ def test_feature_validator_new_data_after_fit(
634609 assert sparse .issparse (transformed_X )
635610 else :
636611 assert isinstance (transformed_X , np .ndarray )
637- assert np .shape (X_test ) == np .shape (transformed_X )
638612
639613 # And then check proper error messages
640614 if train_data_type == 'pandas' :
641615 old_dtypes = copy .deepcopy (validator .dtypes )
642616 validator .dtypes = ['dummy' for dtype in X_train .dtypes ]
643- with pytest .raises (ValueError , match = r"Changing the dtype of the features after fit" ):
617+ with pytest .raises (ValueError ,
618+ match = r"The dtype of the features must not be changed after fit" ):
644619 transformed_X = validator .transform (X_test )
645620 validator .dtypes = old_dtypes
646621 if test_data_type == 'pandas' :
647622 columns = X_test .columns .tolist ()
648623 X_test = X_test [reversed (columns )]
649- with pytest .raises (ValueError , match = r"Changing the column order of the features" ):
624+ with pytest .raises (ValueError ,
625+ match = r"The column order of the features must not be changed after fit" ):
650626 transformed_X = validator .transform (X_test )
651627
652628
0 commit comments