Commit 45076ee

Run history traditional (#121)
* In progress, issue with failed traditional
* Working traditional classifiers
* Addressed comments from Francisco
* Changed test loop in test_api
* Add .autopytorch runs back again
* Addressed comments, better documentation and dict for run history
* Fix flake
* Fix tests and add additional run info for crossval
* Fix tests for train evaluator and api
* Addressed comments
* Addressed comments
* Addressed comments from Shuhei, removed deleting from additional_info
1 parent 68fc77f commit 45076ee
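The core of this change: results from the traditional (non-neural) classifiers are now recorded in their own SMAC RunHistory and merged into the API's main run history. Below is a minimal sketch of the SMAC calls involved, assuming a made-up hyperparameter space (the space and values are not taken from this commit):

# Minimal sketch of merging two SMAC run histories, as this commit does for
# the traditional classifiers. The hyperparameter space here is hypothetical.
from ConfigSpace import ConfigurationSpace
from ConfigSpace.hyperparameters import UniformFloatHyperparameter
from smac.runhistory.runhistory import DataOrigin, RunHistory
from smac.tae import StatusType

cs = ConfigurationSpace()
cs.add_hyperparameter(UniformFloatHyperparameter("lr", 1e-4, 1e-1))

traditional = RunHistory()
traditional.add(config=cs.sample_configuration(), cost=0.21, time=12.3,
                status=StatusType.SUCCESS, seed=1)

api_history = RunHistory()
# EXTERNAL_SAME_INSTANCES marks the merged runs as external data gathered on
# the same instances, keeping them distinct from SMAC's own (INTERNAL) runs.
api_history.update(traditional, DataOrigin.EXTERNAL_SAME_INSTANCES)
print(len(api_history.data))  # 1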

File tree: 6 files changed (+70, -34 lines)

* autoPyTorch/api/base_task.py
* autoPyTorch/evaluation/abstract_evaluator.py
* autoPyTorch/evaluation/tae.py
* autoPyTorch/evaluation/train_evaluator.py
* test/test_api/test_api.py
* test/test_evaluation/test_train_evaluator.py

autoPyTorch/api/base_task.py

Lines changed: 18 additions & 3 deletions
@@ -25,7 +25,7 @@

 import pandas as pd

-from smac.runhistory.runhistory import RunHistory
+from smac.runhistory.runhistory import DataOrigin, RunHistory
 from smac.stats.stats import Stats
 from smac.tae import StatusType

@@ -173,7 +173,7 @@ def __init__(
         self._dataset_requirements: Optional[List[FitRequirement]] = None
         self._metric: Optional[autoPyTorchMetric] = None
         self._logger: Optional[PicklableClientLogger] = None
-        self.run_history: Optional[RunHistory] = None
+        self.run_history: RunHistory = RunHistory()
         self.trajectory: Optional[List] = None
         self.dataset_name: Optional[str] = None
         self.cv_models_: Dict = {}

@@ -582,6 +582,10 @@ def _do_traditional_prediction(self, num_run: int, time_left: int, func_eval_tim
         assert self._logger is not None
         assert self._dask_client is not None

+        self._logger.info("Starting to create traditional classifier predictions.")
+
+        # Initialise run history for the traditional classifiers
+        run_history = RunHistory()
         memory_limit = self._memory_limit
         if memory_limit is not None:
             memory_limit = int(math.ceil(memory_limit))

@@ -651,6 +655,11 @@ def _do_traditional_prediction(self, num_run: int, time_left: int, func_eval_tim
             if status == StatusType.SUCCESS:
                 self._logger.info(
                     f"Fitting {cls} took {runtime}s, performance:{cost}/{additional_info}")
+                configuration = additional_info['pipeline_configuration']
+                origin = additional_info['configuration_origin']
+                run_history.add(config=configuration, cost=cost,
+                                time=runtime, status=status, seed=self.seed,
+                                origin=origin)
             else:
                 if additional_info.get('exitcode') == -6:
                     self._logger.error(

@@ -677,6 +686,11 @@ def _do_traditional_prediction(self, num_run: int, time_left: int, func_eval_tim
                     "Please consider increasing the run time to further improve performance.")
                 break

+        self._logger.debug("Run history traditional: {}".format(run_history))
+        # add run history of traditional to api run history
+        self.run_history.update(run_history, DataOrigin.EXTERNAL_SAME_INSTANCES)
+        run_history.save_json(os.path.join(self._backend.internals_directory, 'traditional_run_history.json'),
+                              save_external=True)
         return num_run

     def _search(

@@ -947,8 +961,9 @@ def _search(
             search_space_updates=self.search_space_updates
         )
         try:
-            self.run_history, self.trajectory, budget_type = \
+            run_history, self.trajectory, budget_type = \
                 _proc_smac.run_smbo()
+            self.run_history.update(run_history, DataOrigin.INTERNAL)
             trajectory_filename = os.path.join(
                 self._backend.get_smac_output_directory_for_run(self.seed),
                 'trajectory.json')
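The hunks above also persist the traditional run history to traditional_run_history.json via save_json(..., save_external=True), so the externally tagged runs are written out as well. A minimal sketch of reading such a file back, assuming a placeholder path and configuration space (both must match what was used when saving):

# Hedged sketch: reloading a saved run history. 'cs' is a placeholder here;
# in practice it must be the ConfigurationSpace the runs were created with.
from ConfigSpace import ConfigurationSpace
from smac.runhistory.runhistory import RunHistory

cs = ConfigurationSpace()  # placeholder space
rh = RunHistory()
rh.load_json("traditional_run_history.json", cs)  # placeholder path
for run_key, run_value in rh.data.items():
    # RunHistory.data maps RunKey -> RunValue
    print(run_key.config_id, run_value.cost, run_value.status)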

autoPyTorch/evaluation/abstract_evaluator.py

Lines changed: 18 additions & 8 deletions
@@ -84,8 +84,8 @@ def __init__(self, config: str,
         configuration_space = self.pipeline.get_hyperparameter_search_space()
         default_configuration = configuration_space.get_default_configuration().get_dictionary()
         default_configuration['model_trainer:tabular_classifier:classifier'] = config
-        configuration = Configuration(configuration_space, default_configuration)
-        self.pipeline.set_hyperparameters(configuration)
+        self.configuration = Configuration(configuration_space, default_configuration)
+        self.pipeline.set_hyperparameters(self.configuration)

     def fit(self, X: Dict[str, Any], y: Any,
             sample_weight: Optional[np.ndarray] = None) -> object:

@@ -102,8 +102,18 @@ def predict(self, X: Union[np.ndarray, pd.DataFrame],
     def estimator_supports_iterative_fit(self) -> bool:  # pylint: disable=R0201
         return False

-    def get_additional_run_info(self) -> None:  # pylint: disable=R0201
-        return None
+    def get_additional_run_info(self) -> Dict[str, Any]:  # pylint: disable=R0201
+        """
+        Can be used to return additional info for the run.
+        Returns:
+            Dict[str, Any]:
+            Currently contains
+                1. pipeline_configuration: the configuration of the pipeline, i.e, the traditional model used
+                2. trainer_configuration: the parameters for the traditional model used.
+                   Can be found in autoPyTorch/pipeline/components/setup/traditional_ml/classifier_configs
+        """
+        return {'pipeline_configuration': self.configuration,
+                'trainer_configuration': self.pipeline.named_steps['model_trainer'].choice.model.get_config()}

     def get_pipeline_representation(self) -> Dict[str, str]:
         return self.pipeline.get_pipeline_representation()

@@ -163,8 +173,8 @@ def predict(self, X: Union[np.ndarray, pd.DataFrame],
     def estimator_supports_iterative_fit(self) -> bool:  # pylint: disable=R0201
         return False

-    def get_additional_run_info(self) -> None:  # pylint: disable=R0201
-        return None
+    def get_additional_run_info(self) -> Dict:  # pylint: disable=R0201
+        return {}

     def get_pipeline_representation(self) -> Dict[str, str]:
         return {

@@ -219,8 +229,8 @@ def predict(self, X: Union[np.ndarray, pd.DataFrame],
     def estimator_supports_iterative_fit(self) -> bool:  # pylint: disable=R0201
         return False

-    def get_additional_run_info(self) -> None:  # pylint: disable=R0201
-        return None
+    def get_additional_run_info(self) -> Dict:  # pylint: disable=R0201
+        return {}

     @staticmethod
     def get_default_pipeline_options() -> Dict[str, Any]:
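With this change, all evaluator pipelines share a dict-based get_additional_run_info() contract: the traditional pipeline reports its configuration details, while the dummy pipelines return {} rather than None. A minimal sketch of that contract and of a defensive caller, with made-up class names and values:

# Hedged sketch of the dict-based contract; all names/values are illustrative.
from typing import Any, Dict

class FakeTraditionalPipeline:
    def get_additional_run_info(self) -> Dict[str, Any]:
        return {"pipeline_configuration": "lgb_classifier",   # made-up value
                "trainer_configuration": {"num_leaves": 31}}  # made-up value

class FakeDummyPipeline:
    def get_additional_run_info(self) -> Dict:
        return {}  # nothing extra to report, but still a dict

def collect_additional_info(pipeline: Any) -> Dict[str, Any]:
    # Defensive access mirroring the guard added in train_evaluator.py
    getter = getattr(pipeline, "get_additional_run_info", None)
    return (getter() or {}) if callable(getter) else {}

print(collect_additional_info(FakeTraditionalPipeline()))
print(collect_additional_info(FakeDummyPipeline()))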

autoPyTorch/evaluation/tae.py

Lines changed: 1 addition & 1 deletion
@@ -57,7 +57,7 @@ def fit_predict_try_except_decorator(
 def get_cost_of_crash(metric: autoPyTorchMetric) -> float:
     # The metric must always be defined to extract optimum/worst
     if not isinstance(metric, autoPyTorchMetric):
-        raise ValueError("The metric must be stricly be an instance of autoPyTorchMetric")
+        raise ValueError("The metric must be strictly be an instance of autoPyTorchMetric")

     # Autopytorch optimizes the err. This function translates
     # worst_possible_result to be a minimization problem.

autoPyTorch/evaluation/train_evaluator.py

Lines changed: 4 additions & 1 deletion
@@ -143,6 +143,8 @@ def fit_predict_and_loss(self) -> None:
             # weights for opt_losses.
             opt_fold_weights = [np.NaN] * self.num_folds

+            additional_run_info = {}
+
             for i, (train_split, test_split) in enumerate(self.splits):

                 pipeline = self.pipelines[i]

@@ -178,7 +180,8 @@ def fit_predict_and_loss(self) -> None:
                 # number of optimization data points for this fold.
                 # Used for weighting the average.
                 opt_fold_weights[i] = len(train_split)
-
+                additional_run_info.update(pipeline.get_additional_run_info() if hasattr(
+                    pipeline, 'get_additional_run_info') and pipeline.get_additional_run_info() is not None else {})
             # Compute weights of each fold based on the number of samples in each
             # fold.
             train_fold_weights = [w / sum(train_fold_weights)
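Because the per-fold info is merged with dict.update, later folds overwrite earlier ones on duplicate keys. A toy illustration, with stand-in fold dicts in place of real pipelines:

# Toy illustration: per-fold info accumulated via dict.update; on duplicate
# keys the last fold's value wins. The fold list stands in for pipelines.
folds = [
    {"pipeline_configuration": "fold-0 config"},
    {"pipeline_configuration": "fold-1 config"},  # overwrites fold-0's entry
]
additional_run_info: dict = {}
for info in folds:
    additional_run_info.update(info or {})
print(additional_run_info)  # {'pipeline_configuration': 'fold-1 config'}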

test/test_api/test_api.py

Lines changed: 27 additions & 19 deletions
@@ -14,6 +14,8 @@
 import sklearn.datasets
 from sklearn.ensemble import VotingClassifier, VotingRegressor

+from smac.runhistory.runhistory import RunHistory
+
 import torch

 from autoPyTorch.api.tabular_classification import TabularClassificationTask

@@ -104,17 +106,20 @@ def test_tabular_classification(openml_id, resampling_strategy, backend):

     # Search for an existing run key in disc. A individual model might have
     # a timeout and hence was not written to disc
+    successful_num_run = None
+    SUCCESS = False
     for i, (run_key, value) in enumerate(estimator.run_history.data.items()):
-        if 'SUCCESS' not in str(value.status):
-            continue
-
-        run_key_model_run_dir = estimator._backend.get_numrun_directory(
-            estimator.seed, run_key.config_id + 1, run_key.budget)
-        if os.path.exists(run_key_model_run_dir):
-            # Runkey config id is different from the num_run
-            # more specifically num_run = config_id + 1(dummy)
+        if 'SUCCESS' in str(value.status):
+            run_key_model_run_dir = estimator._backend.get_numrun_directory(
+                estimator.seed, run_key.config_id + 1, run_key.budget)
             successful_num_run = run_key.config_id + 1
-            break
+            if os.path.exists(run_key_model_run_dir):
+                # Runkey config id is different from the num_run
+                # more specifically num_run = config_id + 1(dummy)
+                SUCCESS = True
+                break
+
+    assert SUCCESS, f"Successful run was not properly saved for num_run: {successful_num_run}"

     if resampling_strategy == HoldoutValTypes.holdout_validation:
         model_file = os.path.join(run_key_model_run_dir,

@@ -272,17 +277,20 @@ def test_tabular_regression(openml_name, resampling_strategy, backend):

     # Search for an existing run key in disc. A individual model might have
     # a timeout and hence was not written to disc
+    successful_num_run = None
+    SUCCESS = False
     for i, (run_key, value) in enumerate(estimator.run_history.data.items()):
-        if 'SUCCESS' not in str(value.status):
-            continue
-
-        run_key_model_run_dir = estimator._backend.get_numrun_directory(
-            estimator.seed, run_key.config_id + 1, run_key.budget)
-        if os.path.exists(run_key_model_run_dir):
-            # Runkey config id is different from the num_run
-            # more specifically num_run = config_id + 1(dummy)
+        if 'SUCCESS' in str(value.status):
+            run_key_model_run_dir = estimator._backend.get_numrun_directory(
+                estimator.seed, run_key.config_id + 1, run_key.budget)
             successful_num_run = run_key.config_id + 1
-            break
+            if os.path.exists(run_key_model_run_dir):
+                # Runkey config id is different from the num_run
+                # more specifically num_run = config_id + 1(dummy)
+                SUCCESS = True
+                break
+
+    assert SUCCESS, f"Successful run was not properly saved for num_run: {successful_num_run}"

     if resampling_strategy == HoldoutValTypes.holdout_validation:
         model_file = os.path.join(run_key_model_run_dir,

@@ -384,7 +392,7 @@ def test_tabular_input_support(openml_id, backend):
     estimator._do_dummy_prediction = unittest.mock.MagicMock()

     with unittest.mock.patch.object(AutoMLSMBO, 'run_smbo') as AutoMLSMBOMock:
-        AutoMLSMBOMock.return_value = ({}, {}, 'epochs')
+        AutoMLSMBOMock.return_value = (RunHistory(), {}, 'epochs')
         estimator.search(
             X_train=X_train, y_train=y_train,
             X_test=X_test, y_test=y_test,
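Both test loops walk estimator.run_history.data, which maps RunKey (config_id, instance_id, seed, budget) to RunValue (cost, time, status, ...). A minimal sketch of that access pattern against a freshly built history; the "dummy run" offset comment comes from the tests above, the space and values are made up:

# Hedged sketch of scanning RunHistory.data the way the tests above do.
from ConfigSpace import ConfigurationSpace
from ConfigSpace.hyperparameters import UniformFloatHyperparameter
from smac.runhistory.runhistory import RunHistory
from smac.tae import StatusType

cs = ConfigurationSpace()
cs.add_hyperparameter(UniformFloatHyperparameter("lr", 1e-4, 1e-1))
rh = RunHistory()
rh.add(config=cs.sample_configuration(), cost=0.3, time=1.0,
       status=StatusType.SUCCESS, seed=1)

for run_key, run_value in rh.data.items():
    if run_value.status == StatusType.SUCCESS:
        # num_run offsets config_id by one to account for the dummy run
        num_run = run_key.config_id + 1
        print(num_run, run_value.cost)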

test/test_evaluation/test_train_evaluator.py

Lines changed: 2 additions & 2 deletions
@@ -50,8 +50,8 @@ def __init__(self):
     def predict_proba(self, X, batch_size=None):
         return np.tile([0.6, 0.4], (len(X), 1))

-    def get_additional_run_info(self) -> None:
-        return None
+    def get_additional_run_info(self):
+        return {}


 class TestTrainEvaluator(BaseEvaluatorTest, unittest.TestCase):
