Skip to content

Commit 1f6351e

Browse files
committed
fix error with pytorch embeddings
1 parent 49d49c2 commit 1f6351e

File tree

6 files changed

+46
-23
lines changed

6 files changed

+46
-23
lines changed

autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/TabularColumnTransformer.py

Lines changed: 24 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,9 @@ def __init__(self, random_state: Optional[Union[np.random.RandomState, int]] = N
2323
self.preprocessor: Optional[ColumnTransformer] = None
2424
self.add_fit_requirements([
2525
FitRequirement('numerical_columns', (List,), user_defined=True, dataset_property=True),
26-
FitRequirement('categorical_columns', (List,), user_defined=True, dataset_property=True)])
26+
FitRequirement('categorical_columns', (List,), user_defined=True, dataset_property=True),
27+
FitRequirement('encode_columns', (List,), user_defined=True, dataset_property=False),
28+
FitRequirement('embed_columns', (List,), user_defined=True, dataset_property=False)])
2729

2830

2931
def get_column_transformer(self) -> ColumnTransformer:
@@ -53,17 +55,32 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> "TabularColumnTransformer":
5355
self.check_requirements(X, y)
5456

5557
preprocessors = get_tabular_preprocessers(X)
58+
5659
column_transformers: List[Tuple[str, BaseEstimator, List[int]]] = []
60+
61+
numerical_pipeline = 'passthrough'
62+
categorical_pipeline = 'passthrough'
63+
encode_pipeline = 'passthrough'
64+
5765
if len(preprocessors['numerical']) > 0:
5866
numerical_pipeline = make_pipeline(*preprocessors['numerical'])
59-
column_transformers.append(
60-
('numerical_pipeline', numerical_pipeline, X['dataset_properties']['numerical_columns'])
61-
)
67+
68+
column_transformers.append(
69+
('numerical_pipeline', numerical_pipeline, X['dataset_properties']['numerical_columns'])
70+
)
6271
if len(preprocessors['categorical']) > 0:
6372
categorical_pipeline = make_pipeline(*preprocessors['categorical'])
64-
column_transformers.append(
65-
('categorical_pipeline', categorical_pipeline, X['dataset_properties']['categorical_columns'])
66-
)
73+
74+
column_transformers.append(
75+
('categorical_pipeline', categorical_pipeline, X['dataset_properties']['categorical_columns'])
76+
)
77+
78+
if len(preprocessors['encode']) > 0:
79+
encode_pipeline = make_pipeline(*preprocessors['encode'])
80+
81+
column_transformers.append(
82+
('encode_pipeline', encode_pipeline, X['encode_columns'])
83+
)
6784

6885
# in case the preprocessing steps are disabled
6986
# i.e, NoEncoder for categorical, we want to

autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/base_tabular_preprocessing.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -14,19 +14,19 @@ class autoPyTorchTabularPreprocessingComponent(autoPyTorchPreprocessingComponent
1414
def __init__(self) -> None:
1515
super().__init__()
1616
self.preprocessor: Union[Dict[str, Optional[BaseEstimator]], BaseEstimator] = dict(
17-
numerical=None, categorical=None)
17+
numerical=None, encode=None, categorical=None)
1818

1919
def get_preprocessor_dict(self) -> Dict[str, BaseEstimator]:
2020
"""
21-
Returns early_preprocessor dictionary containing the sklearn numerical
22-
and categorical early_preprocessor with "numerical" and "categorical"
23-
keys. May contain None for a key if early_preprocessor does not
21+
Returns early_preprocessor dictionary containing the sklearn numerical,
22+
categorical and encode early_preprocessor with "numerical", "categorical"
23+
"encode" keys. May contain None for a key if early_preprocessor does not
2424
handle the datatype defined by key
2525
2626
Returns:
2727
Dict[str, BaseEstimator]: early_preprocessor dictionary
2828
"""
29-
if (self.preprocessor['numerical'] and self.preprocessor['categorical']) is None:
29+
if (self.preprocessor['numerical'] and self.preprocessor['categorical'] and self.preprocessor['encode']) is None:
3030
raise AttributeError("{} can't return early_preprocessor dict without fitting first"
3131
.format(self.__class__.__name__))
3232
return self.preprocessor

autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/column_splitting/ColumnSplitter.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
import logging
2+
import time
13
from typing import Any, Dict, List, Optional, Union
24

35
from ConfigSpace.configuration_space import ConfigurationSpace
@@ -12,6 +14,7 @@
1214
from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.base_tabular_preprocessing import \
1315
autoPyTorchTabularPreprocessingComponent
1416
from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter
17+
from autoPyTorch.utils.logging_ import get_named_client_logger
1518

1619

1720
class ColumnSplitter(autoPyTorchTabularPreprocessingComponent):

autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/encoding/OneHotEncoder.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -20,12 +20,12 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEncoder:
2020

2121
self.check_requirements(X, y)
2222

23-
self.preprocessor['categorical'] = OHE(
24-
# It is safer to have the OHE produce a 0 array than to crash a good configuration
25-
categories='auto',
26-
sparse=False,
27-
handle_unknown='ignore',
28-
dtype=np.float32)
23+
if self._has_encode_columns(X):
24+
self.preprocessor['encode'] = OHE(
25+
# It is safer to have the OHE produce a 0 array than to crash a good configuration
26+
sparse=False,
27+
handle_unknown='ignore',
28+
dtype=np.float32)
2929
return self
3030

3131
@staticmethod

autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/encoding/base_encoder.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,11 @@ class BaseEncoder(autoPyTorchTabularPreprocessingComponent):
1313
def __init__(self) -> None:
1414
super().__init__()
1515
self.add_fit_requirements([
16-
FitRequirement('categorical_columns', (List,), user_defined=True, dataset_property=True), ])
16+
FitRequirement('encode_columns', (List,), user_defined=True, dataset_property=False)])
17+
18+
@staticmethod
19+
def _has_encode_columns(X: Dict[str, Any]):
20+
return len(X.get('encode_columns', [])) > 0
1721

1822
def transform(self, X: Dict[str, Any]) -> Dict[str, Any]:
1923
"""
@@ -24,8 +28,5 @@ def transform(self, X: Dict[str, Any]) -> Dict[str, Any]:
2428
Returns:
2529
(Dict[str, Any]): the updated 'X' dictionary
2630
"""
27-
if self.preprocessor['numerical'] is None and self.preprocessor['categorical'] is None:
28-
raise ValueError("cant call transform on {} without fitting first."
29-
.format(self.__class__.__name__))
3031
X.update({'encoder': self.preprocessor})
3132
return X

autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/utils.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,13 +21,15 @@ def get_tabular_preprocessers(X: Dict[str, Any]) -> Dict[str, List[BaseEstimator
2121
Returns:
2222
(Dict[str, List[BaseEstimator]]): dictionary with list of numerical and categorical preprocessors
2323
"""
24-
preprocessor: Dict[str, List[BaseEstimator]] = dict(numerical=list(), categorical=list())
24+
preprocessor: Dict[str, List[BaseEstimator]] = dict(numerical=list(), categorical=list(), encode=list())
2525
for key, value in X.items():
2626
if isinstance(value, dict):
2727
# as each preprocessor is child of BaseEstimator
2828
if 'numerical' in value and isinstance(value['numerical'], BaseEstimator):
2929
preprocessor['numerical'].append(value['numerical'])
3030
if 'categorical' in value and isinstance(value['categorical'], BaseEstimator):
3131
preprocessor['categorical'].append(value['categorical'])
32+
if 'encode' in value and isinstance(value['encode'], BaseEstimator):
33+
preprocessor['encode'].append(value['encode'])
3234

3335
return preprocessor

0 commit comments

Comments (0)