1919from sklearn .exceptions import NotFittedError
2020from sklearn .impute import SimpleImputer
2121from sklearn .pipeline import make_pipeline
22- from sklearn .preprocessing import OneHotEncoder , StandardScaler
22+ from sklearn .preprocessing import OrdinalEncoder
2323
2424from autoPyTorch .data .base_feature_validator import BaseFeatureValidator , SupportedFeatTypes
2525from autoPyTorch .data .utils import (
3232
3333def _create_column_transformer (
3434 preprocessors : Dict [str , List [BaseEstimator ]],
35- numerical_columns : List [str ],
3635 categorical_columns : List [str ],
3736) -> ColumnTransformer :
3837 """
@@ -43,49 +42,36 @@ def _create_column_transformer(
4342 Args:
4443 preprocessors (Dict[str, List[BaseEstimator]]):
4544 Dictionary containing list of numerical and categorical preprocessors.
46- numerical_columns (List[str]):
47- List of names of numerical columns
4845 categorical_columns (List[str]):
4946 List of names of categorical columns
5047
5148 Returns:
5249 ColumnTransformer
5350 """
5451
55- numerical_pipeline = 'drop'
56- categorical_pipeline = 'drop'
57- if len (numerical_columns ) > 0 :
58- numerical_pipeline = make_pipeline (* preprocessors ['numerical' ])
59- if len (categorical_columns ) > 0 :
60- categorical_pipeline = make_pipeline (* preprocessors ['categorical' ])
52+ categorical_pipeline = make_pipeline (* preprocessors ['categorical' ])
6153
6254 return ColumnTransformer ([
63- ('categorical_pipeline' , categorical_pipeline , categorical_columns ),
64- ('numerical_pipeline' , numerical_pipeline , numerical_columns )],
65- remainder = 'drop'
55+ ('categorical_pipeline' , categorical_pipeline , categorical_columns )],
56+ remainder = 'passthrough'
6657 )
6758
6859
6960def get_tabular_preprocessors () -> Dict [str , List [BaseEstimator ]]:
7061 """
7162 This function creates a Dictionary containing a list
7263 of categorical preprocessors
73-
7464 Returns:
7565 Dict[str, List[BaseEstimator]]
7666 """
7767 preprocessors : Dict [str , List [BaseEstimator ]] = dict ()
7868
7969 # Categorical Preprocessors
80- onehot_encoder = OneHotEncoder (categories = 'auto' , sparse = False , handle_unknown = 'ignore' )
70+ ordinal_encoder = OrdinalEncoder (handle_unknown = 'use_encoded_value' ,
71+ unknown_value = - 1 )
8172 categorical_imputer = SimpleImputer (strategy = 'constant' , copy = False )
8273
83- # Numerical Preprocessors
84- numerical_imputer = SimpleImputer (strategy = 'median' , copy = False )
85- standard_scaler = StandardScaler (with_mean = True , with_std = True , copy = False )
86-
87- preprocessors ['categorical' ] = [categorical_imputer , onehot_encoder ]
88- preprocessors ['numerical' ] = [numerical_imputer , standard_scaler ]
74+ preprocessors ['categorical' ] = [categorical_imputer , ordinal_encoder ]
8975
9076 return preprocessors
9177
@@ -180,31 +166,47 @@ def _fit(
180166 if hasattr (X , "iloc" ) and not issparse (X ):
181167 X = cast (pd .DataFrame , X )
182168
183- self .all_nan_columns = set ([column for column in X .columns if X [column ].isna ().all ()])
169+ all_nan_columns = X .columns [X .isna ().all ()]
170+ for col in all_nan_columns :
171+ X [col ] = pd .to_numeric (X [col ])
172+
173+ # Handle objects if possible
174+ exist_object_columns = has_object_columns (X .dtypes .values )
175+ if exist_object_columns :
176+ X = self .infer_objects (X )
184177
185- categorical_columns , numerical_columns , feat_type = self ._get_columns_info (X )
178+ self .dtypes = [dt .name for dt in X .dtypes ] # Also note this change in self.dtypes
179+ self .all_nan_columns = set (all_nan_columns )
186180
187- self .enc_columns = categorical_columns
181+ self .enc_columns , self . feat_type = self . _get_columns_info ( X )
188182
189- preprocessors = get_tabular_preprocessors ()
190- self .column_transformer = _create_column_transformer (
191- preprocessors = preprocessors ,
192- numerical_columns = numerical_columns ,
193- categorical_columns = categorical_columns ,
194- )
183+ if len (self .enc_columns ) > 0 :
195184
196- # Mypy redefinition
197- assert self .column_transformer is not None
198- self .column_transformer .fit (X )
185+ preprocessors = get_tabular_preprocessors ()
186+ self .column_transformer = _create_column_transformer (
187+ preprocessors = preprocessors ,
188+ categorical_columns = self .enc_columns ,
189+ )
199190
200- # The column transformer reorders the feature types
201- # therefore, we need to change the order of columns as well
202- # This means categorical columns are shifted to the left
191+ # Mypy redefinition
192+ assert self . column_transformer is not None
193+ self . column_transformer . fit ( X )
203194
204- self .feat_type = sorted (
205- feat_type ,
206- key = functools .cmp_to_key (self ._comparator )
207- )
195+ # The column transformer moves categorical columns before all numerical columns;
196+ # therefore, we need to sort the categorical columns so that they comply with this change
197+
198+ self .feat_type = sorted (
199+ self .feat_type ,
200+ key = functools .cmp_to_key (self ._comparator )
201+ )
202+
203+ encoded_categories = self .column_transformer .\
204+ named_transformers_ ['categorical_pipeline' ].\
205+ named_steps ['ordinalencoder' ].categories_
206+ self .categories = [
207+ list (range (len (cat )))
208+ for cat in encoded_categories
209+ ]
208210
209211 # differently to categorical_columns and numerical_columns,
210212 # this saves the index of the column.
@@ -289,6 +291,23 @@ def transform(
289291 X = cast (Type [pd .DataFrame ], X )
290292> >> >> >> [FIX ] Tests after rebase of `reg_cocktails` (#359)
291293
294+ if self .all_nan_columns is None :
295+ raise ValueError ('_fit must be called before calling transform' )
296+
297+ for col in list (self .all_nan_columns ):
298+ X [col ] = np .nan
299+ X [col ] = pd .to_numeric (X [col ])
300+
301+ if len (self .categorical_columns ) > 0 :
302+ # When some categorical columns are not all-NaN in the training set
303+ # but are all-NaN in the testing or validation set,
304+ # we change those columns to `object` dtype
305+ # to ensure that these columns are converted to the appropriate dtype
306+ # in self.infer_objects.
307+ all_nan_cat_cols = set (X [self .enc_columns ].columns [X [self .enc_columns ].isna ().all ()])
308+ dtype_dict = {col : 'object' for col in self .enc_columns if col in all_nan_cat_cols }
309+ X = X .astype (dtype_dict )
310+
292311 # Check the data here so we catch problems on new test data
293312 self ._check_data (X )
294313
@@ -297,11 +316,6 @@ def transform(
297316 # We need to convert the column in test data to
298317 # object otherwise the test column is interpreted as float
299318 if self .column_transformer is not None :
300- if len (self .categorical_columns ) > 0 :
301- categorical_columns = self .column_transformer .transformers_ [0 ][- 1 ]
302- for column in categorical_columns :
303- if X [column ].isna ().all ():
304- X [column ] = X [column ].astype ('object' )
305319 X = self .column_transformer .transform (X )
306320
307321 # Sparse related transformations
@@ -416,7 +430,6 @@ def _check_data(
416430 self .column_order = column_order
417431
418432 dtypes = [dtype .name for dtype in X .dtypes ]
419-
420433 diff_cols = X .columns [[s_dtype != dtype for s_dtype , dtype in zip (self .dtypes , dtypes )]]
421434 if len (self .dtypes ) == 0 :
422435 self .dtypes = dtypes
@@ -428,7 +441,7 @@ def _check_data(
428441 def _get_columns_info (
429442 self ,
430443 X : pd .DataFrame ,
431- ) - > Tuple [List [str ], List [str ], List [ str ] ]:
444+ ) -> Tuple [List [str ], List [str ]]:
432445 """
433446 Return the columns to be encoded from a pandas dataframe
434447
@@ -447,15 +460,12 @@ def _get_columns_info(
447460 """
448461
449462 # Register if a column needs encoding
450- numerical_columns = []
451463 categorical_columns = []
452464 # Also, register the feature types for the estimator
453465 feat_type = []
454466
455467 # Make sure each column is a valid type
456468 for i , column in enumerate (X .columns ):
457- if self .all_nan_columns is not None and column in self .all_nan_columns :
458- continue
459469 column_dtype = self .dtypes [i ]
460470 err_msg = "Valid types are `numerical`, `categorical` or `boolean`, " \
461471 "but input column {} has an invalid type `{}`." .format (column , column_dtype )
@@ -466,7 +476,6 @@ def _get_columns_info(
466476 # TypeError: data type not understood in certain pandas types
467477 elif is_numeric_dtype (column_dtype ):
468478 feat_type .append ('numerical' )
469- numerical_columns .append (column )
470479 elif column_dtype == 'object' :
471480 # TODO verify how would this happen when we always convert the object dtypes to category
472481 raise TypeError (
@@ -492,7 +501,7 @@ def _get_columns_info(
492501 "before feeding it to AutoPyTorch." .format (err_msg )
493502 )
494503
495- return categorical_columns , numerical_columns , feat_type
504+ return categorical_columns , feat_type
496505
497506 def list_to_pandas (
498507 self ,
@@ -562,22 +571,26 @@ def infer_objects(self, X: pd.DataFrame) -> pd.DataFrame:
562571 pd.DataFrame
563572 """
564573 if hasattr (self , 'object_dtype_mapping' ):
565- # Mypy does not process the has attr. This dict is defined below
566- for key , dtype in self . object_dtype_mapping . items (): # type: ignore[has-type]
567- # honor the training data types
568- try :
569- X [ key ] = X [ key ]. astype ( dtype . name )
570- except Exception as e :
571- # Try inference if possible
572- self . logger . warning ( f'Casting the column { key } to { dtype } caused the exception { e } ' )
573- pass
574+ # honor the training data types
575+ try :
576+ # Mypy does not process the has attr.
577+ X = X . astype ( self . object_dtype_mapping ) # type: ignore[has-type]
578+ except Exception as e :
579+ # Try inference if possible
580+ self . logger . warning ( f'Casting the columns to training dtypes ' # type: ignore[has-type]
581+ f' { self . object_dtype_mapping } caused the exception { e } ' )
582+ pass
574583 else :
575- # Calling for the first time to infer the categories
576- X = X .infer_objects ()
577- for column , data_type in zip (X .columns , X .dtypes ):
578- if not is_numeric_dtype (data_type ):
579- X [column ] = X [column ].astype ('category' )
580-
584+ if len (self .dtypes ) != 0 :
585+ # when train data has no object dtype, but test does
586+ # we prioritise the datatype given in training data
587+ dtype_dict = {col : dtype for col , dtype in zip (X .columns , self .dtypes )}
588+ X = X .astype (dtype_dict )
589+ else :
590+ # Calling for the first time to infer the categories
591+ X = X .infer_objects ()
592+ dtype_dict = {col : 'category' for col , dtype in zip (X .columns , X .dtypes ) if not is_numeric_dtype (dtype )}
593+ X = X .astype (dtype_dict )
581594 # only numerical attributes and categories
582595 self .object_dtype_mapping = {column : data_type for column , data_type in zip (X .columns , X .dtypes )}
583596
0 commit comments