Skip to content

Commit 1f6351e

Browse files
committed
fix error with pytorch embeddings
1 parent 49d49c2 commit 1f6351e

File tree

6 files changed

+46
-23
lines changed

6 files changed

+46
-23
lines changed

autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/TabularColumnTransformer.py

Lines changed: 24 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,9 @@ def __init__(self, random_state: Optional[Union[np.random.RandomState, int]] = N
2323
self.preprocessor: Optional[ColumnTransformer] = None
2424
self.add_fit_requirements([
2525
FitRequirement('numerical_columns', (List,), user_defined=True, dataset_property=True),
26-
FitRequirement('categorical_columns', (List,), user_defined=True, dataset_property=True)])
26+
FitRequirement('categorical_columns', (List,), user_defined=True, dataset_property=True),
27+
FitRequirement('encode_columns', (List,), user_defined=True, dataset_property=False),
28+
FitRequirement('embed_columns', (List,), user_defined=True, dataset_property=False)])
2729

2830

2931
def get_column_transformer(self) -> ColumnTransformer:
@@ -53,17 +55,32 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> "TabularColumnTransformer":
5355
self.check_requirements(X, y)
5456

5557
preprocessors = get_tabular_preprocessers(X)
58+
5659
column_transformers: List[Tuple[str, BaseEstimator, List[int]]] = []
60+
61+
numerical_pipeline = 'passthrough'
62+
categorical_pipeline = 'passthrough'
63+
encode_pipeline = 'passthrough'
64+
5765
if len(preprocessors['numerical']) > 0:
5866
numerical_pipeline = make_pipeline(*preprocessors['numerical'])
59-
column_transformers.append(
60-
('numerical_pipeline', numerical_pipeline, X['dataset_properties']['numerical_columns'])
61-
)
67+
68+
column_transformers.append(
69+
('numerical_pipeline', numerical_pipeline, X['dataset_properties']['numerical_columns'])
70+
)
6271
if len(preprocessors['categorical']) > 0:
6372
categorical_pipeline = make_pipeline(*preprocessors['categorical'])
64-
column_transformers.append(
65-
('categorical_pipeline', categorical_pipeline, X['dataset_properties']['categorical_columns'])
66-
)
73+
74+
column_transformers.append(
75+
('categorical_pipeline', categorical_pipeline, X['dataset_properties']['categorical_columns'])
76+
)
77+
78+
if len(preprocessors['encode']) > 0:
79+
encode_pipeline = make_pipeline(*preprocessors['encode'])
80+
81+
column_transformers.append(
82+
('encode_pipeline', encode_pipeline, X['encode_columns'])
83+
)
6784

6885
# in case the preprocessing steps are disabled
6986
# i.e, NoEncoder for categorical, we want to

autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/base_tabular_preprocessing.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -14,19 +14,19 @@ class autoPyTorchTabularPreprocessingComponent(autoPyTorchPreprocessingComponent
1414
def __init__(self) -> None:
1515
super().__init__()
1616
self.preprocessor: Union[Dict[str, Optional[BaseEstimator]], BaseEstimator] = dict(
17-
numerical=None, categorical=None)
17+
numerical=None, encode=None, categorical=None)
1818

1919
def get_preprocessor_dict(self) -> Dict[str, BaseEstimator]:
2020
"""
21-
Returns early_preprocessor dictionary containing the sklearn numerical
22-
and categorical early_preprocessor with "numerical" and "categorical"
23-
keys. May contain None for a key if early_preprocessor does not
21+
Returns early_preprocessor dictionary containing the sklearn numerical,
22+
categorical and encode early_preprocessor with "numerical", "categorical"
23+
"encode" keys. May contain None for a key if early_preprocessor does not
2424
handle the datatype defined by key
2525
2626
Returns:
2727
Dict[str, BaseEstimator]: early_preprocessor dictionary
2828
"""
29-
if (self.preprocessor['numerical'] and self.preprocessor['categorical']) is None:
29+
if (self.preprocessor['numerical'] and self.preprocessor['categorical'] and self.preprocessor['encode']) is None:
3030
raise AttributeError("{} can't return early_preprocessor dict without fitting first"
3131
.format(self.__class__.__name__))
3232
return self.preprocessor

autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/column_splitting/ColumnSplitter.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
import logging
2+
import time
13
from typing import Any, Dict, List, Optional, Union
24

35
from ConfigSpace.configuration_space import ConfigurationSpace
@@ -12,6 +14,7 @@
1214
from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.base_tabular_preprocessing import \
1315
autoPyTorchTabularPreprocessingComponent
1416
from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter
17+
from autoPyTorch.utils.logging_ import get_named_client_logger
1518

1619

1720
class ColumnSplitter(autoPyTorchTabularPreprocessingComponent):

autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/encoding/OneHotEncoder.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -20,12 +20,12 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEncoder:
2020

2121
self.check_requirements(X, y)
2222

23-
self.preprocessor['categorical'] = OHE(
24-
# It is safer to have the OHE produce a 0 array than to crash a good configuration
25-
categories='auto',
26-
sparse=False,
27-
handle_unknown='ignore',
28-
dtype=np.float32)
23+
if self._has_encode_columns(X):
24+
self.preprocessor['encode'] = OHE(
25+
# It is safer to have the OHE produce a 0 array than to crash a good configuration
26+
sparse=False,
27+
handle_unknown='ignore',
28+
dtype=np.float32)
2929
return self
3030

3131
@staticmethod

autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/encoding/base_encoder.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,11 @@ class BaseEncoder(autoPyTorchTabularPreprocessingComponent):
1313
def __init__(self) -> None:
1414
super().__init__()
1515
self.add_fit_requirements([
16-
FitRequirement('categorical_columns', (List,), user_defined=True, dataset_property=True), ])
16+
FitRequirement('encode_columns', (List,), user_defined=True, dataset_property=False)])
17+
18+
@staticmethod
19+
def _has_encode_columns(X: Dict[str, Any]):
20+
return len(X.get('encode_columns', [])) > 0
1721

1822
def transform(self, X: Dict[str, Any]) -> Dict[str, Any]:
1923
"""
@@ -24,8 +28,5 @@ def transform(self, X: Dict[str, Any]) -> Dict[str, Any]:
2428
Returns:
2529
(Dict[str, Any]): the updated 'X' dictionary
2630
"""
27-
if self.preprocessor['numerical'] is None and self.preprocessor['categorical'] is None:
28-
raise ValueError("cant call transform on {} without fitting first."
29-
.format(self.__class__.__name__))
3031
X.update({'encoder': self.preprocessor})
3132
return X

autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/utils.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,13 +21,15 @@ def get_tabular_preprocessers(X: Dict[str, Any]) -> Dict[str, List[BaseEstimator
2121
Returns:
2222
(Dict[str, List[BaseEstimator]]): dictionary with list of numerical and categorical preprocessors
2323
"""
24-
preprocessor: Dict[str, List[BaseEstimator]] = dict(numerical=list(), categorical=list())
24+
preprocessor: Dict[str, List[BaseEstimator]] = dict(numerical=list(), categorical=list(), encode=list())
2525
for key, value in X.items():
2626
if isinstance(value, dict):
2727
# as each preprocessor is child of BaseEstimator
2828
if 'numerical' in value and isinstance(value['numerical'], BaseEstimator):
2929
preprocessor['numerical'].append(value['numerical'])
3030
if 'categorical' in value and isinstance(value['categorical'], BaseEstimator):
3131
preprocessor['categorical'].append(value['categorical'])
32+
if 'encode' in value and isinstance(value['encode'], BaseEstimator):
33+
preprocessor['encode'].append(value['encode'])
3234

3335
return preprocessor

0 commit comments

Comments (0)