1+ import os
2+ import uuid
13from typing import Any , Callable , Dict , List , Optional , Union
24
35import numpy as np
@@ -86,10 +88,6 @@ def __init__(
8688 task_type = TASK_TYPES_TO_STRING [TABULAR_CLASSIFICATION ],
8789 )
8890
89- # Create a validator object to make sure that the data provided by
90- # the user matches the autopytorch requirements
91- self .InputValidator = TabularInputValidator (is_classification = True )
92-
9391 def _get_required_dataset_properties (self , dataset : BaseDataset ) -> Dict [str , Any ]:
9492 if not isinstance (dataset , TabularDataset ):
9593 raise ValueError ("Dataset is incompatible for the given task,: {}" .format (
@@ -105,24 +103,25 @@ def build_pipeline(self, dataset_properties: Dict[str, Any]) -> TabularClassific
105103 return TabularClassificationPipeline (dataset_properties = dataset_properties )
106104
107105 def search (
108- self ,
109- optimize_metric : str ,
110- X_train : Optional [Union [List , pd .DataFrame , np .ndarray ]] = None ,
111- y_train : Optional [Union [List , pd .DataFrame , np .ndarray ]] = None ,
112- X_test : Optional [Union [List , pd .DataFrame , np .ndarray ]] = None ,
113- y_test : Optional [Union [List , pd .DataFrame , np .ndarray ]] = None ,
114- budget_type : Optional [str ] = None ,
115- budget : Optional [float ] = None ,
116- total_walltime_limit : int = 100 ,
117- func_eval_time_limit : int = 60 ,
118- traditional_per_total_budget : float = 0.1 ,
119- memory_limit : Optional [int ] = 4096 ,
120- smac_scenario_args : Optional [Dict [str , Any ]] = None ,
121- get_smac_object_callback : Optional [Callable ] = None ,
122- all_supported_metrics : bool = True ,
123- precision : int = 32 ,
124- disable_file_output : List = [],
125- load_models : bool = True ,
106+ self ,
107+ optimize_metric : str ,
108+ X_train : Optional [Union [List , pd .DataFrame , np .ndarray ]] = None ,
109+ y_train : Optional [Union [List , pd .DataFrame , np .ndarray ]] = None ,
110+ X_test : Optional [Union [List , pd .DataFrame , np .ndarray ]] = None ,
111+ y_test : Optional [Union [List , pd .DataFrame , np .ndarray ]] = None ,
112+ dataset_name : Optional [str ] = None ,
113+ budget_type : Optional [str ] = None ,
114+ budget : Optional [float ] = None ,
115+ total_walltime_limit : int = 100 ,
116+ func_eval_time_limit : int = 60 ,
117+ traditional_per_total_budget : float = 0.1 ,
118+ memory_limit : Optional [int ] = 4096 ,
119+ smac_scenario_args : Optional [Dict [str , Any ]] = None ,
120+ get_smac_object_callback : Optional [Callable ] = None ,
121+ all_supported_metrics : bool = True ,
122+ precision : int = 32 ,
123+ disable_file_output : List = [],
124+ load_models : bool = True ,
126125 ) -> 'BaseTask' :
127126 """
128127 Search for the best pipeline configuration for the given dataset.
@@ -133,9 +132,8 @@ def search(
133132 Args:
134133 X_train, y_train, X_test, y_test: Union[np.ndarray, List, pd.DataFrame]
135134 A pair of features (X_train) and targets (y_train) used to fit a
136- pipeline. Additionally, a holdout of this paris (X_test, y_test) can
135+ pipeline. Additionally, a holdout of these pairs (X_test, y_test) can
137136 be provided to track the generalization performance of each stage.
138- Providing X_train, y_train and dataset together is not supported.
139137 optimize_metric (str): name of the metric that is used to
140138 evaluate a pipeline.
141139 budget_type (Optional[str]):
@@ -189,6 +187,18 @@ def search(
189187 self
190188
191189 """
190+ if dataset_name is None :
191+ dataset_name = str (uuid .uuid1 (clock_seq = os .getpid ()))
192+
193+ # we have to create a logger at this point for the validator
194+ self ._logger = self ._get_logger (dataset_name )
195+
196+ # Create a validator object to make sure that the data provided by
197+ # the user matches the autopytorch requirements
198+ self .InputValidator = TabularInputValidator (
199+ is_classification = True ,
200+ logger_port = self ._logger_port ,
201+ )
192202
193203 # Fit a input validator to check the provided data
194204 # Also, an encoder is fit to both train and test data,
@@ -227,7 +237,7 @@ def predict(
227237 n_jobs : int = 1
228238 ) -> np .ndarray :
229239 if self .InputValidator is None or not self .InputValidator ._is_fitted :
230- raise ValueError ("predict() is only supported after calling fit . Kindly call first "
240+ raise ValueError ("predict() is only supported after calling search . Kindly call first "
231241 "the estimator fit() method." )
232242
233243 X_test = self .InputValidator .feature_validator .transform (X_test )
@@ -247,7 +257,7 @@ def predict_proba(self,
247257 X_test : Union [np .ndarray , pd .DataFrame , List ],
248258 batch_size : Optional [int ] = None , n_jobs : int = 1 ) -> np .ndarray :
249259 if self .InputValidator is None or not self .InputValidator ._is_fitted :
250- raise ValueError ("predict() is only supported after calling fit . Kindly call first "
260+ raise ValueError ("predict() is only supported after calling search . Kindly call first "
251261 "the estimator fit() method." )
252262 X_test = self .InputValidator .feature_validator .transform (X_test )
253263 return super ().predict (X_test , batch_size = batch_size , n_jobs = n_jobs )
0 commit comments