Skip to content

Commit 33b2223

Browse files
authored
[FIX] Tests after rebase of reg_cocktails (#359)
* update requirements * update requirements * resolve remaining conflicts and fix flake and mypy * Fix remaining tests and examples * fix failing checks * fix flake
1 parent 87ee242 commit 33b2223

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

43 files changed

+348
-1099
lines changed

autoPyTorch/api/base_task.py

Lines changed: 50 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@
2727

2828
import pandas as pd
2929

30-
from smac.runhistory.runhistory import DataOrigin, RunHistory, RunInfo, RunValue
30+
from smac.runhistory.runhistory import DataOrigin, RunHistory
3131
from smac.stats.stats import Stats
3232
from smac.tae import StatusType
3333

@@ -238,7 +238,7 @@ def __init__(
238238
" HyperparameterSearchSpaceUpdates got {}".format(type(self.search_space_updates)))
239239

240240
@abstractmethod
241-
def build_pipeline(self, dataset_properties: Dict[str, Any]) -> BasePipeline:
241+
def build_pipeline(self, dataset_properties: Dict[str, BaseDatasetPropertiesType]) -> BasePipeline:
242242
"""
243243
Build pipeline according to current task
244244
and for the passed dataset properties
@@ -486,11 +486,16 @@ def _load_models(self) -> bool:
486486
raise ValueError("Resampling strategy is needed to determine what models to load")
487487
self.ensemble_ = self._backend.load_ensemble(self.seed)
488488

489-
if isinstance(self._disable_file_output, List):
490-
disabled_file_outputs = self._disable_file_output
489+
# TODO: remove this code after `fit_pipeline` is rebased.
490+
if hasattr(self, '_disable_file_output'):
491+
if isinstance(self._disable_file_output, List):
492+
disabled_file_outputs = self._disable_file_output
493+
disable_file_output = False
494+
elif isinstance(self._disable_file_output, bool):
495+
disable_file_output = self._disable_file_output
496+
disabled_file_outputs = []
497+
else:
491498
disable_file_output = False
492-
elif isinstance(self._disable_file_output, bool):
493-
disable_file_output = self._disable_file_output
494499
disabled_file_outputs = []
495500

496501
# If no ensemble is loaded, try to get the best performing model
@@ -794,18 +799,15 @@ def run_traditional_ml(
794799
learning algorithm runs over the time limit.
795800
"""
796801
assert self._logger is not None # for mypy compliancy
797-
if STRING_TO_TASK_TYPES[self.task_type] in REGRESSION_TASKS:
798-
self._logger.warning("Traditional Pipeline is not enabled for regression. Skipping...")
799-
else:
800-
traditional_task_name = 'runTraditional'
801-
self._stopwatch.start_task(traditional_task_name)
802-
elapsed_time = self._stopwatch.wall_elapsed(current_task_name)
803-
time_for_traditional = int(runtime_limit - elapsed_time)
804-
self._do_traditional_prediction(
805-
func_eval_time_limit_secs=func_eval_time_limit_secs,
806-
time_left=time_for_traditional,
807-
)
808-
self._stopwatch.stop_task(traditional_task_name)
802+
traditional_task_name = 'runTraditional'
803+
self._stopwatch.start_task(traditional_task_name)
804+
elapsed_time = self._stopwatch.wall_elapsed(current_task_name)
805+
time_for_traditional = int(runtime_limit - elapsed_time)
806+
self._do_traditional_prediction(
807+
func_eval_time_limit_secs=func_eval_time_limit_secs,
808+
time_left=time_for_traditional,
809+
)
810+
self._stopwatch.stop_task(traditional_task_name)
809811

810812
def _search(
811813
self,
@@ -1165,22 +1167,7 @@ def _search(
11651167
self._logger.info("Starting Shutdown")
11661168

11671169
if proc_ensemble is not None:
1168-
self._results_manager.ensemble_performance_history = list(proc_ensemble.history)
1169-
1170-
if len(proc_ensemble.futures) > 0:
1171-
# Also add ensemble runs that did not finish within smac time
1172-
# and add them into the ensemble history
1173-
self._logger.info("Ensemble script still running, waiting for it to finish.")
1174-
result = proc_ensemble.futures.pop().result()
1175-
if result:
1176-
ensemble_history, _, _, _ = result
1177-
self._results_manager.ensemble_performance_history.extend(ensemble_history)
1178-
self._logger.info("Ensemble script finished, continue shutdown.")
1179-
1180-
# save the ensemble performance history file
1181-
if len(self.ensemble_performance_history) > 0:
1182-
pd.DataFrame(self.ensemble_performance_history).to_json(
1183-
os.path.join(self._backend.internals_directory, 'ensemble_history.json'))
1170+
self._collect_results_ensemble(proc_ensemble)
11841171

11851172
if load_models:
11861173
self._logger.info("Loading models...")
@@ -1321,7 +1308,7 @@ def fit(self,
13211308
exclude=self.exclude_components,
13221309
search_space_updates=self.search_space_updates)
13231310
dataset_properties = dataset.get_dataset_properties(dataset_requirements)
1324-
self._backend.replace_datamanager(dataset)
1311+
self._backend.save_datamanager(dataset)
13251312

13261313
# build pipeline
13271314
pipeline = self.build_pipeline(dataset_properties)
@@ -1339,7 +1326,6 @@ def fit(self,
13391326
self._clean_logger()
13401327
return pipeline
13411328

1342-
13431329
def fit_ensemble(
13441330
self,
13451331
optimize_metric: Optional[str] = None,
@@ -1418,7 +1404,7 @@ def fit_ensemble(
14181404
ensemble_fit_task_name = 'EnsembleFit'
14191405
self._stopwatch.start_task(ensemble_fit_task_name)
14201406
if enable_traditional_pipeline:
1421-
if func_eval_time_limit_secs is None or func_eval_time_limit_secs > time_for_task:
1407+
if func_eval_time_limit_secs > time_for_task:
14221408
self._logger.warning(
14231409
'Time limit for a single run is higher than total time '
14241410
'limit. Capping the limit for a single run to the total '
@@ -1459,12 +1445,8 @@ def fit_ensemble(
14591445
)
14601446

14611447
manager.build_ensemble(self._dask_client)
1462-
future = manager.futures.pop()
1463-
result = future.result()
1464-
if result is None:
1465-
raise ValueError("Errors occurred while building the ensemble - please"
1466-
" check the log file and command line output for error messages.")
1467-
self.ensemble_performance_history, _, _, _ = result
1448+
if manager is not None:
1449+
self._collect_results_ensemble(manager)
14681450

14691451
if load_models:
14701452
self._load_models()
@@ -1542,6 +1524,31 @@ def _init_ensemble_builder(
15421524

15431525
return proc_ensemble
15441526

1527+
def _collect_results_ensemble(
1528+
self,
1529+
manager: EnsembleBuilderManager
1530+
) -> None:
1531+
1532+
if self._logger is None:
1533+
raise ValueError("logger should be initialized to fit ensemble")
1534+
1535+
self._results_manager.ensemble_performance_history = list(manager.history)
1536+
1537+
if len(manager.futures) > 0:
1538+
# Also add ensemble runs that did not finish within smac time
1539+
# and add them into the ensemble history
1540+
self._logger.info("Ensemble script still running, waiting for it to finish.")
1541+
result = manager.futures.pop().result()
1542+
if result:
1543+
ensemble_history, _, _, _ = result
1544+
self._results_manager.ensemble_performance_history.extend(ensemble_history)
1545+
self._logger.info("Ensemble script finished, continue shutdown.")
1546+
1547+
# save the ensemble performance history file
1548+
if len(self.ensemble_performance_history) > 0:
1549+
pd.DataFrame(self.ensemble_performance_history).to_json(
1550+
os.path.join(self._backend.internals_directory, 'ensemble_history.json'))
1551+
15451552
def predict(
15461553
self,
15471554
X_test: np.ndarray,

autoPyTorch/api/tabular_classification.py

Lines changed: 21 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
TASK_TYPES_TO_STRING,
1414
)
1515
from autoPyTorch.data.tabular_validator import TabularInputValidator
16+
from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType
1617
from autoPyTorch.datasets.resampling_strategy import (
1718
CrossValTypes,
1819
HoldoutValTypes,
@@ -109,7 +110,7 @@ def __init__(
109110
task_type=TASK_TYPES_TO_STRING[TABULAR_CLASSIFICATION],
110111
)
111112

112-
def build_pipeline(self, dataset_properties: Dict[str, Any]) -> TabularClassificationPipeline:
113+
def build_pipeline(self, dataset_properties: Dict[str, BaseDatasetPropertiesType]) -> TabularClassificationPipeline:
113114
"""
114115
Build pipeline according to current task and for the passed dataset properties
115116
@@ -120,16 +121,7 @@ def build_pipeline(self, dataset_properties: Dict[str, Any]) -> TabularClassific
120121
TabularClassificationPipeline:
121122
Pipeline compatible with the given dataset properties.
122123
"""
123-
124-
def build_pipeline(self, dataset_properties: Dict[str, Any],
125-
include_components: Optional[Dict] = None,
126-
exclude_components: Optional[Dict] = None,
127-
search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None
128-
) -> TabularClassificationPipeline:
129-
return TabularClassificationPipeline(dataset_properties=dataset_properties,
130-
include=include_components,
131-
exclude=exclude_components,
132-
search_space_updates=search_space_updates)
124+
return TabularClassificationPipeline(dataset_properties=dataset_properties)
133125

134126
def search(
135127
self,
@@ -281,6 +273,18 @@ def search(
281273
self
282274
283275
"""
276+
if dataset_name is None:
277+
dataset_name = str(uuid.uuid1(clock_seq=os.getpid()))
278+
279+
# we have to create a logger at this point for the validator
280+
self._logger = self._get_logger(dataset_name)
281+
282+
# Create a validator object to make sure that the data provided by
283+
# the user matches the autopytorch requirements
284+
self.InputValidator = TabularInputValidator(
285+
is_classification=True,
286+
logger_port=self._logger_port,
287+
)
284288

285289
# Fit an input validator to check the provided data
286290
# Also, an encoder is fit to both train and test data,
@@ -303,9 +307,9 @@ def search(
303307
'(CrossValTypes, HoldoutValTypes), but got {}'.format(self.resampling_strategy)
304308
)
305309

306-
307310
if self.dataset is None:
308311
raise ValueError("`dataset` in {} must be initialized, but got None".format(self.__class__.__name__))
312+
309313
return self._search(
310314
dataset=self.dataset,
311315
optimize_metric=optimize_metric,
@@ -345,24 +349,24 @@ def predict(
345349
raise ValueError("predict() is only supported after calling search. Kindly call first "
346350
"the estimator fit() method.")
347351

348-
X_test = self.input_validator.feature_validator.transform(X_test)
352+
X_test = self.InputValidator.feature_validator.transform(X_test)
349353
predicted_probabilities = super().predict(X_test, batch_size=batch_size,
350354
n_jobs=n_jobs)
351355

352-
if self.input_validator.target_validator.is_single_column_target():
356+
if self.InputValidator.target_validator.is_single_column_target():
353357
predicted_indexes = np.argmax(predicted_probabilities, axis=1)
354358
else:
355359
predicted_indexes = (predicted_probabilities > 0.5).astype(int)
356360

357361
# Allow to predict in the original domain -- that is, the user is not interested
358362
# in our encoded values
359-
return self.input_validator.target_validator.inverse_transform(predicted_indexes)
363+
return self.InputValidator.target_validator.inverse_transform(predicted_indexes)
360364

361365
def predict_proba(self,
362366
X_test: Union[np.ndarray, pd.DataFrame, List],
363367
batch_size: Optional[int] = None, n_jobs: int = 1) -> np.ndarray:
364-
if self.input_validator is None or not self.input_validator._is_fitted:
368+
if self.InputValidator is None or not self.InputValidator._is_fitted:
365369
raise ValueError("predict() is only supported after calling search. Kindly call first "
366370
"the estimator fit() method.")
367-
X_test = self.input_validator.feature_validator.transform(X_test)
371+
X_test = self.InputValidator.feature_validator.transform(X_test)
368372
return super().predict(X_test, batch_size=batch_size, n_jobs=n_jobs)

autoPyTorch/api/tabular_regression.py

Lines changed: 14 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
TASK_TYPES_TO_STRING
1414
)
1515
from autoPyTorch.data.tabular_validator import TabularInputValidator
16+
from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType
1617
from autoPyTorch.datasets.resampling_strategy import (
1718
CrossValTypes,
1819
HoldoutValTypes,
@@ -81,9 +82,9 @@ def __init__(
8182
delete_output_folder_after_terminate: bool = True,
8283
include_components: Optional[Dict] = None,
8384
exclude_components: Optional[Dict] = None,
84-
resampling_strategy:Union[CrossValTypes,
85-
HoldoutValTypes,
86-
NoResamplingStrategyTypes] = HoldoutValTypes.holdout_validation,
85+
resampling_strategy: Union[CrossValTypes,
86+
HoldoutValTypes,
87+
NoResamplingStrategyTypes] = HoldoutValTypes.holdout_validation,
8788
resampling_strategy_args: Optional[Dict[str, Any]] = None,
8889
backend: Optional[Backend] = None,
8990
search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None
@@ -109,7 +110,7 @@ def __init__(
109110
task_type=TASK_TYPES_TO_STRING[TABULAR_REGRESSION],
110111
)
111112

112-
def build_pipeline(self, dataset_properties: Dict[str, Any]) -> TabularRegressionPipeline:
113+
def build_pipeline(self, dataset_properties: Dict[str, BaseDatasetPropertiesType]) -> TabularRegressionPipeline:
113114
"""
114115
Build pipeline according to current task and for the passed dataset properties
115116
@@ -272,6 +273,11 @@ def search(
272273
self
273274
274275
"""
276+
if dataset_name is None:
277+
dataset_name = str(uuid.uuid1(clock_seq=os.getpid()))
278+
279+
# we have to create a logger at this point for the validator
280+
self._logger = self._get_logger(dataset_name)
275281

276282
# Create a validator object to make sure that the data provided by
277283
# the user matches the autopytorch requirements
@@ -301,9 +307,9 @@ def search(
301307
'(CrossValTypes, HoldoutValTypes), but got {}'.format(self.resampling_strategy)
302308
)
303309

304-
305310
if self.dataset is None:
306311
raise ValueError("`dataset` in {} must be initialized, but got None".format(self.__class__.__name__))
312+
307313
return self._search(
308314
dataset=self.dataset,
309315
optimize_metric=optimize_metric,
@@ -329,14 +335,14 @@ def predict(
329335
batch_size: Optional[int] = None,
330336
n_jobs: int = 1
331337
) -> np.ndarray:
332-
if self.input_validator is None or not self.input_validator._is_fitted:
338+
if self.InputValidator is None or not self.InputValidator._is_fitted:
333339
raise ValueError("predict() is only supported after calling search. Kindly call first "
334340
"the estimator fit() method.")
335341

336-
X_test = self.input_validator.feature_validator.transform(X_test)
342+
X_test = self.InputValidator.feature_validator.transform(X_test)
337343
predicted_values = super().predict(X_test, batch_size=batch_size,
338344
n_jobs=n_jobs)
339345

340346
# Allow to predict in the original domain -- that is, the user is not interested
341347
# in our encoded values
342-
return self.input_validator.target_validator.inverse_transform(predicted_values)
348+
return self.InputValidator.target_validator.inverse_transform(predicted_values)

autoPyTorch/data/base_target_validator.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -98,7 +98,6 @@ def fit(
9898
np.shape(y_test)
9999
))
100100
if isinstance(y_train, pd.DataFrame):
101-
y_train = cast(pd.DataFrame, y_train)
102101
y_test = cast(pd.DataFrame, y_test)
103102
if y_train.columns.tolist() != y_test.columns.tolist():
104103
raise ValueError(

autoPyTorch/data/tabular_feature_validator.py

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
import functools
2-
from typing import Dict, List, Optional, Tuple, Union, cast
2+
from typing import Dict, List, Optional, Tuple, Type, Union, cast
33

44
import numpy as np
55

@@ -263,7 +263,7 @@ def transform(
263263
X = self.numpy_to_pandas(X)
264264

265265
if hasattr(X, "iloc") and not scipy.sparse.issparse(X):
266-
X = cast(pd.DataFrame, X)
266+
X = cast(Type[pd.DataFrame], X)
267267

268268
# Check the data here so we catch problems on new test data
269269
self._check_data(X)
@@ -391,9 +391,6 @@ def _get_columns_info(
391391
Type of each column numerical/categorical
392392
"""
393393

394-
if len(self.transformed_columns) > 0 and self.feat_type is not None:
395-
return self.transformed_columns, self.feat_type
396-
397394
# Register if a column needs encoding
398395
numerical_columns = []
399396
categorical_columns = []

autoPyTorch/data/tabular_target_validator.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
from typing import List, Optional, Union, cast
1+
from typing import List, Optional, cast
22

33
import numpy as np
44

autoPyTorch/datasets/resampling_strategy.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -92,7 +92,10 @@ class NoResamplingStrategyTypes(IntEnum):
9292
RESAMPLING_STRATEGIES = [CrossValTypes, HoldoutValTypes, NoResamplingStrategyTypes]
9393

9494

95-
DEFAULT_RESAMPLING_PARAMETERS: Dict[Union[HoldoutValTypes, CrossValTypes], Dict[str, Any]] = {
95+
DEFAULT_RESAMPLING_PARAMETERS: Dict[Union[CrossValTypes,
96+
HoldoutValTypes,
97+
NoResamplingStrategyTypes],
98+
Dict[str, Any]] = {
9699
HoldoutValTypes.holdout_validation: {
97100
'val_share': 0.33,
98101
},
@@ -117,7 +120,7 @@ class NoResamplingStrategyTypes(IntEnum):
117120
NoResamplingStrategyTypes.shuffle_no_resampling: {
118121
'shuffle': True
119122
}
120-
} # type: Dict[Union[HoldoutValTypes, CrossValTypes, NoResamplingStrategyTypes], Dict[str, Any]]
123+
}
121124

122125

123126
class HoldOutFuncs():

0 commit comments

Comments
 (0)