Skip to content

Commit 392f07a

Browse files
[FIX] Passing checks (#298)
* Initial fix for all tests passing locally py=3.8
* fix bug in tests
* fix bug in test for data
* debugging error in dummy forward pass
* debug try -2
* catch runtime error in ci
* catch runtime error in ci
* add better debug test setup
* debug some more
* run this test only
* remove sum backward
* remove inplace in inception block
* undo silly change
* Enable all tests
* fix flake
* fix bug in test setup
* remove anomaly detection
* minor changes to comments
* Apply suggestions from code review

  Co-authored-by: nabenabe0928 <47781922+nabenabe0928@users.noreply.github.com>
* Address comments from Shuhei
* revert change leading to bug
* fix flake
* change comment position in feature validator
* Add documentation for _is_datasets_consistent
* address comments from arlind
* case when all nans in test

Co-authored-by: nabenabe0928 <47781922+nabenabe0928@users.noreply.github.com>
1 parent 6543316 commit 392f07a

File tree

18 files changed

+109
-200
lines changed

18 files changed

+109
-200
lines changed

autoPyTorch/api/base_task.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1773,7 +1773,7 @@ def fit_ensemble(
17731773
Args:
17741774
optimize_metric (str): name of the metric that is used to
17751775
evaluate a pipeline. if not specified, value passed to search will be used
1776-
precision (int), (default=32): Numeric precision used when loading
1776+
precision (Optional[int]): Numeric precision used when loading
17771777
ensemble data. Can be either 16, 32 or 64.
17781778
ensemble_nbest (Optional[int]):
17791779
only consider the ensemble_nbest models to build the ensemble.
@@ -1816,6 +1816,7 @@ def fit_ensemble(
18161816
"Please call the `search()` method of {} prior to "
18171817
"fit_ensemble().".format(self.__class__.__name__))
18181818

1819+
precision = precision if precision is not None else self.precision
18191820
if precision not in [16, 32, 64]:
18201821
raise ValueError("precision must be one of 16, 32, 64 but got {}".format(precision))
18211822

@@ -1866,7 +1867,7 @@ def fit_ensemble(
18661867
manager = self._init_ensemble_builder(
18671868
time_left_for_ensembles=time_left_for_ensemble,
18681869
optimize_metric=self.opt_metric if optimize_metric is None else optimize_metric,
1869-
precision=self.precision if precision is None else precision,
1870+
precision=precision,
18701871
ensemble_size=ensemble_size,
18711872
ensemble_nbest=ensemble_nbest,
18721873
)

autoPyTorch/data/tabular_feature_validator.py

Lines changed: 36 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -139,6 +139,7 @@ def _comparator(cmp1: str, cmp2: str) -> int:
139139
if cmp1 not in choices or cmp2 not in choices:
140140
raise ValueError('The comparator for the column order only accepts {}, '
141141
'but got {} and {}'.format(choices, cmp1, cmp2))
142+
142143
idx1, idx2 = choices.index(cmp1), choices.index(cmp2)
143144
return idx1 - idx2
144145

@@ -284,13 +285,12 @@ def transform(
284285
# having a value for a categorical column.
285286
# We need to convert the column in test data to
286287
# object otherwise the test column is interpreted as float
287-
if len(self.categorical_columns) > 0:
288-
categorical_columns = self.column_transformer.transformers_[0][-1]
289-
for column in categorical_columns:
290-
if X[column].isna().all():
291-
X[column] = X[column].astype('object')
292-
293288
if self.column_transformer is not None:
289+
if len(self.categorical_columns) > 0:
290+
categorical_columns = self.column_transformer.transformers_[0][-1]
291+
for column in categorical_columns:
292+
if X[column].isna().all():
293+
X[column] = X[column].astype('object')
294294
X = self.column_transformer.transform(X)
295295

296296
# Sparse related transformations
@@ -379,16 +379,11 @@ def _check_data(
379379
self.column_order = column_order
380380

381381
dtypes = [dtype.name for dtype in X.dtypes]
382-
dtypes_diff = [s_dtype != dtype for s_dtype, dtype in zip(self.dtypes, dtypes)]
382+
383+
diff_cols = X.columns[[s_dtype != dtype for s_dtype, dtype in zip(self.dtypes, dtypes)]]
383384
if len(self.dtypes) == 0:
384385
self.dtypes = dtypes
385-
elif (
386-
any(dtypes_diff) # the dtypes of some columns are different in train and test dataset
387-
and self.all_nan_columns is not None # Ignore all_nan_columns is None
388-
and len(set(X.columns[dtypes_diff]).difference(self.all_nan_columns)) != 0
389-
):
390-
# The dtypes can be different if and only if the column belongs
391-
# to all_nan_columns as these columns would be imputed.
386+
elif not self._is_datasets_consistent(diff_cols, X):
392387
raise ValueError("The dtype of the features must not be changed after fit(), but"
393388
" the dtypes of some columns are different between training ({}) and"
394389
" test ({}) datasets.".format(self.dtypes, dtypes))
@@ -619,6 +614,33 @@ def infer_objects(self, X: pd.DataFrame) -> pd.DataFrame:
619614

620615
return X
621616

617+
def _is_datasets_consistent(self, diff_cols: List[Union[int, str]], X: pd.DataFrame) -> bool:
618+
"""
619+
Check the consistency of dtypes between training and test datasets.
620+
The dtypes can be different if the column belongs to `self.all_nan_columns`
621+
(list of column names with all nans in training data) or if the column is
622+
all nan as these columns would be imputed.
623+
624+
Args:
625+
diff_cols (List[bool]):
626+
The column labels that have different dtypes.
627+
X (pd.DataFrame):
628+
A validation or test dataset to be compared with the training dataset
629+
Returns:
630+
_ (bool): Whether the training and test datasets are consistent.
631+
"""
632+
if self.all_nan_columns is None:
633+
if len(diff_cols) == 0:
634+
return True
635+
else:
636+
return all(X[diff_cols].isna().all())
637+
638+
# dtype is different ==> the column in at least either of train or test datasets must be all NaN
639+
# inconsistent <==> dtype is different and the col in both train and test is not all NaN
640+
inconsistent_cols = list(set(diff_cols) - self.all_nan_columns)
641+
642+
return len(inconsistent_cols) == 0 or all(X[inconsistent_cols].isna().all())
643+
622644

623645
def has_object_columns(
624646
feature_types: pd.Series,

autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/encoding/NoEncoder.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ def transform(self, X: Dict[str, Any]) -> Dict[str, Any]:
4040
Returns:
4141
(Dict[str, Any]): the updated 'X' dictionary
4242
"""
43-
X.update({'encoder': self.preprocessor})
43+
# X.update({'encoder': self.preprocessor})
4444
return X
4545

4646
@staticmethod

autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/scaling/NoScaler.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ def transform(self, X: Dict[str, Any]) -> Dict[str, Any]:
4343
Returns:
4444
np.ndarray: Transformed features
4545
"""
46-
X.update({'scaler': self.preprocessor})
46+
# X.update({'scaler': self.preprocessor})
4747
return X
4848

4949
@staticmethod

autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,6 @@
1-
<<<<<<< HEAD
21
import copy
32
from typing import Any, Dict, List, Optional, Tuple, Union
4-
=======
5-
# import copy
6-
from typing import Any, Dict, Optional, Tuple
7-
>>>>>>> Bug fixes (#249)
3+
84

95
import numpy as np
106

@@ -40,6 +36,7 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator:
4036
self.feature_shapes = feature_shapes
4137
else:
4238
self.feature_shapes = X['dataset_properties']['feature_shapes']
39+
4340
return self
4441

4542
def transform(self, X: Dict[str, Any]) -> Dict[str, Any]:

autoPyTorch/pipeline/components/training/trainer/AdversarialTrainer.py

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -109,11 +109,7 @@ def train_step(self, data: np.ndarray, targets: np.ndarray) -> Tuple[float, torc
109109
loss = loss_func(self.criterion, original_outputs, adversarial_outputs)
110110
loss.backward()
111111
self.optimizer.step()
112-
if self.scheduler:
113-
if 'ReduceLROnPlateau' in self.scheduler.__class__.__name__:
114-
self.scheduler.step(loss)
115-
else:
116-
self.scheduler.step()
112+
117113
# only passing the original outputs since we do not care about
118114
# the adversarial performance.
119115
return loss.item(), original_outputs

autoPyTorch/pipeline/components/training/trainer/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -282,6 +282,7 @@ def fit(self, X: Dict[str, Any], y: Any = None, **kwargs: Any) -> autoPyTorchCom
282282
y=y,
283283
**kwargs
284284
)
285+
285286
# Add snapshots to base network to enable
286287
# predicting with snapshot ensemble
287288
self.choice: autoPyTorchComponent = cast(autoPyTorchComponent, self.choice)

examples/40_advanced/40_advanced/example_custom_configuration_space.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,7 @@ def get_search_space_updates():
5959
value_range=['shake-shake'],
6060
default_value='shake-shake')
6161
updates.append(node_name='network_backbone',
62-
hyperparameter='ResNetBackbone:shake_shake_method',
62+
hyperparameter='ResNetBackbone:shake_shake_update_func',
6363
value_range=['M3'],
6464
default_value='M3'
6565
)

test/test_data/test_feature_validator.py

Lines changed: 21 additions & 129 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
import copy
1+
import copy
22
import functools
33

44
import numpy as np
@@ -139,9 +139,9 @@ def test_featurevalidator_fitontypeA_transformtypeB(input_data_featuretest):
139139
if isinstance(input_data_featuretest, pd.DataFrame):
140140
pytest.skip("Column order change in pandas is not supported")
141141
elif isinstance(input_data_featuretest, np.ndarray):
142-
complementary_type = validator.numpy_to_pandas(input_data_featuretest)
142+
complementary_type = validator.numpy_array_to_pandas(input_data_featuretest)
143143
elif isinstance(input_data_featuretest, list):
144-
complementary_type, _ = validator.list_to_pandas(input_data_featuretest)
144+
complementary_type, _ = validator.list_to_dataframe(input_data_featuretest)
145145
elif sparse.issparse(input_data_featuretest):
146146
complementary_type = sparse.csr_matrix(input_data_featuretest.todense())
147147
else:
@@ -167,128 +167,10 @@ def test_featurevalidator_get_columns_to_encode():
167167
for col in df.columns:
168168
df[col] = df[col].astype(col)
169169

170-
<<<<<<< HEAD
171170
transformed_columns, feature_types = validator._get_columns_to_encode(df)
172171

173172
assert transformed_columns == ['category', 'bool']
174173
assert feature_types == ['numerical', 'numerical', 'categorical', 'categorical']
175-
=======
176-
validator.fit(df)
177-
178-
categorical_columns, numerical_columns, feat_type = validator._get_columns_info(df)
179-
180-
assert numerical_columns == ['int', 'float']
181-
assert categorical_columns == ['category', 'bool']
182-
assert feat_type == ['numerical', 'numerical', 'categorical', 'categorical']
183-
184-
185-
def feature_validator_remove_nan_catcolumns(df_train: pd.DataFrame, df_test: pd.DataFrame,
186-
ans_train: np.ndarray, ans_test: np.ndarray) -> None:
187-
validator = TabularFeatureValidator()
188-
validator.fit(df_train)
189-
transformed_df_train = validator.transform(df_train)
190-
transformed_df_test = validator.transform(df_test)
191-
192-
assert np.array_equal(transformed_df_train, ans_train)
193-
assert np.array_equal(transformed_df_test, ans_test)
194-
195-
196-
def test_feature_validator_remove_nan_catcolumns():
197-
"""
198-
Make sure categorical columns that have only nan values are removed.
199-
Transform performs the folloing:
200-
* simple imputation for both
201-
* scaling for numerical
202-
* one-hot encoding for categorical
203-
For example,
204-
data = [
205-
{'A': 1, 'B': np.nan, 'C': np.nan},
206-
{'A': np.nan, 'B': 3, 'C': np.nan},
207-
{'A': 2, 'B': np.nan, 'C': np.nan}
208-
]
209-
and suppose all the columns are categorical,
210-
then
211-
* `A` in {np.nan, 1, 2}
212-
* `B` in {np.nan, 3}
213-
* `C` in {np.nan} <=== it will be dropped.
214-
215-
So in the column A,
216-
* np.nan ==> [1, 0, 0]
217-
* 1 ==> [0, 1, 0]
218-
* 2 ==> [0, 0, 1]
219-
in the column B,
220-
* np.nan ==> [1, 0]
221-
* 3 ==> [0, 1]
222-
Therefore, by concatenating,
223-
* {'A': 1, 'B': np.nan, 'C': np.nan} ==> [0, 1, 0, 1, 0]
224-
* {'A': np.nan, 'B': 3, 'C': np.nan} ==> [1, 0, 0, 0, 1]
225-
* {'A': 2, 'B': np.nan, 'C': np.nan} ==> [0, 0, 1, 1, 0]
226-
"""
227-
# First case, there exist null columns (B and C) in the train set
228-
# and a same column (C) are not all null for the test set.
229-
230-
df_train = pd.DataFrame(
231-
[
232-
{'A': 1, 'B': np.nan, 'C': np.nan},
233-
{'A': np.nan, 'C': np.nan},
234-
{'A': 1}
235-
],
236-
dtype='category',
237-
)
238-
ans_train = np.array([[0, 1], [1, 0], [0, 1]], dtype=np.float64)
239-
df_test = pd.DataFrame(
240-
[
241-
{'A': np.nan, 'B': np.nan, 'C': 5},
242-
{'A': np.nan, 'C': np.nan},
243-
{'A': 1}
244-
],
245-
dtype='category',
246-
)
247-
ans_test = np.array([[1, 0], [1, 0], [0, 1]], dtype=np.float64)
248-
feature_validator_remove_nan_catcolumns(df_train, df_test, ans_train, ans_test)
249-
250-
# Second case, there exist null columns (B and C) in the training set and
251-
# the same columns (B and C) are null in the test set.
252-
df_train = pd.DataFrame(
253-
[
254-
{'A': 1, 'B': np.nan, 'C': np.nan},
255-
{'A': np.nan, 'C': np.nan},
256-
{'A': 1}
257-
],
258-
dtype='category',
259-
)
260-
ans_train = np.array([[0, 1], [1, 0], [0, 1]], dtype=np.float64)
261-
df_test = pd.DataFrame(
262-
[
263-
{'A': np.nan, 'B': np.nan, 'C': np.nan},
264-
{'A': np.nan, 'C': np.nan},
265-
{'A': 1}
266-
],
267-
dtype='category',
268-
)
269-
ans_test = np.array([[1, 0], [1, 0], [0, 1]], dtype=np.float64)
270-
feature_validator_remove_nan_catcolumns(df_train, df_test, ans_train, ans_test)
271-
272-
# Third case, there exist no null columns in the training set and
273-
# null columns exist in the test set.
274-
df_train = pd.DataFrame(
275-
[
276-
{'A': 1, 'B': 1},
277-
{'A': 2, 'B': 2}
278-
],
279-
dtype='category',
280-
)
281-
ans_train = np.array([[1, 0, 1, 0], [0, 1, 0, 1]], dtype=np.float64)
282-
df_test = pd.DataFrame(
283-
[
284-
{'A': np.nan, 'B': np.nan},
285-
{'A': np.nan, 'B': np.nan}
286-
],
287-
dtype='category',
288-
)
289-
ans_test = np.array([[0, 0, 0, 0], [0, 0, 0, 0]], dtype=np.float64)
290-
feature_validator_remove_nan_catcolumns(df_train, df_test, ans_train, ans_test)
291-
>>>>>>> Bug fixes (#249)
292174

293175

294176
def test_features_unsupported_calls_are_raised():
@@ -529,6 +411,7 @@ def test_comparator():
529411
assert ans == feat_type
530412

531413

414+
<<<<<<< HEAD
532415
@pytest.fixture
533416
def input_data_feature_feat_types(request):
534417
if request.param == 'pandas_categoricalonly':
@@ -648,6 +531,8 @@ def test_feature_validator_get_columns_to_encode_error_feat_type(input_data_feat
648531
with pytest.raises(ValueError, match=r"Expected type of features to be in .*"):
649532
validator._validate_feat_types(X)
650533

534+
=======
535+
>>>>>>> [FIX] Passing checks (#298)
651536
def test_feature_validator_imbalanced_data():
652537

653538
# Null columns in the train split but not necessarily in the test split
@@ -670,16 +555,15 @@ def test_feature_validator_imbalanced_data():
670555
validator.fit(X_train)
671556

672557
train_feature_types = copy.deepcopy(validator.feat_type)
673-
assert train_feature_types == ['numerical']
558+
assert train_feature_types == ['numerical', 'numerical', 'numerical', 'numerical']
674559
# validator will throw an error if the column types are not the same
675560
transformed_X_test = validator.transform(X_test)
676561
transformed_X_test = pd.DataFrame(transformed_X_test)
677-
assert sorted(validator.all_nan_columns) == sorted(['A', 'C', 'D'])
678-
# as there are no categorical columns, we can make such an
679-
# assertion. We only expect to drop the all nan columns
680-
total_all_nan_columns = len(validator.all_nan_columns)
681-
total_columns = len(validator.column_order)
682-
assert total_columns - total_all_nan_columns == len(transformed_X_test.columns)
562+
null_columns = []
563+
for column in transformed_X_test.columns:
564+
if transformed_X_test[column].isna().all():
565+
null_columns.append(column)
566+
assert null_columns == [0, 2, 3]
683567

684568
# Columns with not all null values in the train split and
685569
# completely null on the test split.
@@ -698,12 +582,12 @@ def test_feature_validator_imbalanced_data():
698582
X_test = pd.DataFrame.from_dict(test_features)
699583
validator = TabularFeatureValidator()
700584
validator.fit(X_train)
701-
702585
train_feature_types = copy.deepcopy(validator.feat_type)
703586
assert train_feature_types == ['categorical', 'numerical', 'numerical']
704587

705588
transformed_X_test = validator.transform(X_test)
706589
transformed_X_test = pd.DataFrame(transformed_X_test)
590+
<<<<<<< HEAD
707591
assert not len(validator.all_nan_columns)
708592

709593

@@ -733,3 +617,11 @@ def test_comparator():
733617
)
734618
assert ans == feat_type
735619
>>>>>>> Bug fixes (#249)
620+
=======
621+
null_columns = []
622+
for column in transformed_X_test.columns:
623+
if transformed_X_test[column].isna().all():
624+
null_columns.append(column)
625+
626+
assert null_columns == [1]
627+
>>>>>>> [FIX] Passing checks (#298)

0 commit comments

Comments
 (0)