1414from sklearn .exceptions import NotFittedError
1515from sklearn .impute import SimpleImputer
1616from sklearn .pipeline import make_pipeline
17- from sklearn .preprocessing import OneHotEncoder , StandardScaler
17+ from sklearn .preprocessing import OrdinalEncoder
1818
1919from autoPyTorch .data .base_feature_validator import BaseFeatureValidator , SUPPORTED_FEAT_TYPES
2020
2121
def _create_column_transformer(
    preprocessors: Dict[str, List[BaseEstimator]],
    categorical_columns: List[str],
) -> ColumnTransformer:
    """
    Build the column transformer that encodes the categorical columns.

    Args:
        preprocessors (Dict[str, List[BaseEstimator]]):
            Dictionary of preprocessor lists; only the list stored under
            the 'categorical' key is used here.
        categorical_columns (List[str]):
            List of names of categorical columns

    Returns:
        ColumnTransformer
    """
    # Chain the categorical preprocessors in the order they are listed.
    categorical_steps = preprocessors['categorical']
    categorical_pipeline = make_pipeline(*categorical_steps)

    # Columns not named in `categorical_columns` are passed through
    # unchanged rather than dropped.
    transformers = [
        ('categorical_pipeline', categorical_pipeline, categorical_columns),
    ]
    return ColumnTransformer(transformers, remainder='passthrough')
5647
5748
def get_tabular_preprocessors() -> Dict[str, List[BaseEstimator]]:
    """
    Create the dictionary of preprocessors applied to tabular features.

    Only the 'categorical' key is populated: a constant-strategy imputer
    followed by an ordinal encoder configured with
    handle_unknown='use_encoded_value' and unknown_value=-1.

    Returns:
        Dict[str, List[BaseEstimator]]
    """
    # Categorical preprocessors, in the order they must be applied:
    # impute first, then encode.
    categorical_steps: List[BaseEstimator] = [
        SimpleImputer(strategy='constant', copy=False),
        OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1),
    ]

    return {'categorical': categorical_steps}
8066
@@ -161,31 +147,47 @@ def _fit(
161147
162148 X = cast (pd .DataFrame , X )
163149
164- self .all_nan_columns = set ([column for column in X .columns if X [column ].isna ().all ()])
150+ all_nan_columns = X .columns [X .isna ().all ()]
151+ for col in all_nan_columns :
152+ X [col ] = pd .to_numeric (X [col ])
153+
154+ # Handle objects if possible
155+ exist_object_columns = has_object_columns (X .dtypes .values )
156+ if exist_object_columns :
157+ X = self .infer_objects (X )
165158
166- categorical_columns , numerical_columns , feat_type = self ._get_columns_info (X )
159+ self .dtypes = [dt .name for dt in X .dtypes ] # Also note this change in self.dtypes
160+ self .all_nan_columns = set (all_nan_columns )
167161
168- self .enc_columns = categorical_columns
162+ self .enc_columns , self . feat_type = self . _get_columns_info ( X )
169163
170- preprocessors = get_tabular_preprocessors ()
171- self .column_transformer = _create_column_transformer (
172- preprocessors = preprocessors ,
173- numerical_columns = numerical_columns ,
174- categorical_columns = categorical_columns ,
175- )
164+ if len (self .enc_columns ) > 0 :
176165
177- # Mypy redefinition
178- assert self .column_transformer is not None
179- self .column_transformer .fit (X )
166+ preprocessors = get_tabular_preprocessors ()
167+ self .column_transformer = _create_column_transformer (
168+ preprocessors = preprocessors ,
169+ categorical_columns = self .enc_columns ,
170+ )
180171
181- # The column transformer reorders the feature types
182- # therefore, we need to change the order of columns as well
183- # This means categorical columns are shifted to the left
172+ # Mypy redefinition
173+ assert self . column_transformer is not None
174+ self . column_transformer . fit ( X )
184175
185- self .feat_type = sorted (
186- feat_type ,
187- key = functools .cmp_to_key (self ._comparator )
188- )
176+ # The column transformer moves categorical columns before all numerical columns
177+ # therefore, we need to sort categorical columns so that it complies this change
178+
179+ self .feat_type = sorted (
180+ self .feat_type ,
181+ key = functools .cmp_to_key (self ._comparator )
182+ )
183+
184+ encoded_categories = self .column_transformer .\
185+ named_transformers_ ['categorical_pipeline' ].\
186+ named_steps ['ordinalencoder' ].categories_
187+ self .categories = [
188+ list (range (len (cat )))
189+ for cat in encoded_categories
190+ ]
189191
190192 # differently to categorical_columns and numerical_columns,
191193 # this saves the index of the column.
@@ -265,6 +267,23 @@ def transform(
265267 if hasattr (X , "iloc" ) and not scipy .sparse .issparse (X ):
266268 X = cast (Type [pd .DataFrame ], X )
267269
270+ if self .all_nan_columns is None :
271+ raise ValueError ('_fit must be called before calling transform' )
272+
273+ for col in list (self .all_nan_columns ):
274+ X [col ] = np .nan
275+ X [col ] = pd .to_numeric (X [col ])
276+
277+ if len (self .categorical_columns ) > 0 :
278+ # when some categorical columns are not all nan in the training set
279+ # but they are all nan in the testing or validation set
280+ # we change those columns to `object` dtype
281+ # to ensure that these columns are changed to appropriate dtype
282+ # in self.infer_objects
283+ all_nan_cat_cols = set (X [self .enc_columns ].columns [X [self .enc_columns ].isna ().all ()])
284+ dtype_dict = {col : 'object' for col in self .enc_columns if col in all_nan_cat_cols }
285+ X = X .astype (dtype_dict )
286+
268287 # Check the data here so we catch problems on new test data
269288 self ._check_data (X )
270289
@@ -273,11 +292,6 @@ def transform(
273292 # We need to convert the column in test data to
274293 # object otherwise the test column is interpreted as float
275294 if self .column_transformer is not None :
276- if len (self .categorical_columns ) > 0 :
277- categorical_columns = self .column_transformer .transformers_ [0 ][- 1 ]
278- for column in categorical_columns :
279- if X [column ].isna ().all ():
280- X [column ] = X [column ].astype ('object' )
281295 X = self .column_transformer .transform (X )
282296
283297 # Sparse related transformations
@@ -361,7 +375,6 @@ def _check_data(
361375 self .column_order = column_order
362376
363377 dtypes = [dtype .name for dtype in X .dtypes ]
364-
365378 diff_cols = X .columns [[s_dtype != dtype for s_dtype , dtype in zip (self .dtypes , dtypes )]]
366379 if len (self .dtypes ) == 0 :
367380 self .dtypes = dtypes
@@ -373,7 +386,7 @@ def _check_data(
373386 def _get_columns_info (
374387 self ,
375388 X : pd .DataFrame ,
376- ) -> Tuple [List [str ], List [str ], List [ str ] ]:
389+ ) -> Tuple [List [str ], List [str ]]:
377390 """
378391 Return the columns to be encoded from a pandas dataframe
379392
@@ -392,15 +405,12 @@ def _get_columns_info(
392405 """
393406
394407 # Register if a column needs encoding
395- numerical_columns = []
396408 categorical_columns = []
397409 # Also, register the feature types for the estimator
398410 feat_type = []
399411
400412 # Make sure each column is a valid type
401413 for i , column in enumerate (X .columns ):
402- if self .all_nan_columns is not None and column in self .all_nan_columns :
403- continue
404414 column_dtype = self .dtypes [i ]
405415 err_msg = "Valid types are `numerical`, `categorical` or `boolean`, " \
406416 "but input column {} has an invalid type `{}`." .format (column , column_dtype )
@@ -411,7 +421,6 @@ def _get_columns_info(
411421 # TypeError: data type not understood in certain pandas types
412422 elif is_numeric_dtype (column_dtype ):
413423 feat_type .append ('numerical' )
414- numerical_columns .append (column )
415424 elif column_dtype == 'object' :
416425 # TODO verify how would this happen when we always convert the object dtypes to category
417426 raise TypeError (
@@ -437,7 +446,7 @@ def _get_columns_info(
437446 "before feeding it to AutoPyTorch." .format (err_msg )
438447 )
439448
440- return categorical_columns , numerical_columns , feat_type
449+ return categorical_columns , feat_type
441450
442451 def list_to_pandas (
443452 self ,
@@ -507,22 +516,26 @@ def infer_objects(self, X: pd.DataFrame) -> pd.DataFrame:
507516 pd.DataFrame
508517 """
509518 if hasattr (self , 'object_dtype_mapping' ):
510- # Mypy does not process the has attr. This dict is defined below
511- for key , dtype in self . object_dtype_mapping . items (): # type: ignore[has-type]
512- # honor the training data types
513- try :
514- X [ key ] = X [ key ]. astype ( dtype . name )
515- except Exception as e :
516- # Try inference if possible
517- self . logger . warning ( f'Casting the column { key } to { dtype } caused the exception { e } ' )
518- pass
519+ # honor the training data types
520+ try :
521+ # Mypy does not process the has attr.
522+ X = X . astype ( self . object_dtype_mapping ) # type: ignore[has-type]
523+ except Exception as e :
524+ # Try inference if possible
525+ self . logger . warning ( f'Casting the columns to training dtypes ' # type: ignore[has-type]
526+ f' { self . object_dtype_mapping } caused the exception { e } ' )
527+ pass
519528 else :
520- # Calling for the first time to infer the categories
521- X = X .infer_objects ()
522- for column , data_type in zip (X .columns , X .dtypes ):
523- if not is_numeric_dtype (data_type ):
524- X [column ] = X [column ].astype ('category' )
525-
529+ if len (self .dtypes ) != 0 :
530+ # when train data has no object dtype, but test does
531+ # we prioritise the datatype given in training data
532+ dtype_dict = {col : dtype for col , dtype in zip (X .columns , self .dtypes )}
533+ X = X .astype (dtype_dict )
534+ else :
535+ # Calling for the first time to infer the categories
536+ X = X .infer_objects ()
537+ dtype_dict = {col : 'category' for col , dtype in zip (X .columns , X .dtypes ) if not is_numeric_dtype (dtype )}
538+ X = X .astype (dtype_dict )
526539 # only numerical attributes and categories
527540 self .object_dtype_mapping = {column : data_type for column , data_type in zip (X .columns , X .dtypes )}
528541
0 commit comments