Comment fixes

franchuterivera · franchuterivera · commit a717d6084381 · 2021-02-11T00:41:41.000+01:00
diff --git a/autoPyTorch/api/base_task.py b/autoPyTorch/api/base_task.py
@@ -9,6 +9,7 @@
 import time
 import typing
 import unittest.mock
+import uuid
 import warnings
 from abc import abstractmethod
 from typing import Any, Callable, Dict, List, Optional, Union, cast
@@ -707,7 +708,8 @@ def _search(
         dataset_properties = dataset.get_dataset_properties(dataset_requirements)
         self._stopwatch.start_task(experiment_task_name)
         self.dataset_name = dataset.dataset_name
-        self._logger = self._get_logger(self.dataset_name)
+        if self._logger is None:
+            self._logger = self._get_logger(self.dataset_name)
         self._all_supported_metrics = all_supported_metrics
         self._disable_file_output = disable_file_output
         self._memory_limit = memory_limit
@@ -907,8 +909,11 @@ def refit(
         Returns:
             self
         """
+        if self.dataset_name is None:
+            self.dataset_name = str(uuid.uuid1(clock_seq=os.getpid()))
 
-        self._logger = self._get_logger(dataset.dataset_name)
+        if self._logger is None:
+            self._logger = self._get_logger(self.dataset_name)
 
         dataset_requirements = get_dataset_requirements(
             info=self._get_required_dataset_properties(dataset))
@@ -974,7 +979,11 @@ def fit(self,
         Returns:
             (BasePipeline): fitted pipeline
         """
-        self._logger = self._get_logger(dataset.dataset_name)
+        if self.dataset_name is None:
+            self.dataset_name = str(uuid.uuid1(clock_seq=os.getpid()))
+
+        if self._logger is None:
+            self._logger = self._get_logger(self.dataset_name)
 
         # get dataset properties
         dataset_requirements = get_dataset_requirements(
diff --git a/autoPyTorch/api/tabular_classification.py b/autoPyTorch/api/tabular_classification.py
@@ -1,3 +1,5 @@
+import os
+import uuid
 from typing import Any, Callable, Dict, List, Optional, Union
 
 import numpy as np
@@ -86,10 +88,6 @@ def __init__(
             task_type=TASK_TYPES_TO_STRING[TABULAR_CLASSIFICATION],
         )
 
-        # Create a validator object to make sure that the data provided by
-        # the user matches the autopytorch requirements
-        self.InputValidator = TabularInputValidator(is_classification=True)
-
     def _get_required_dataset_properties(self, dataset: BaseDataset) -> Dict[str, Any]:
         if not isinstance(dataset, TabularDataset):
             raise ValueError("Dataset is incompatible for the given task,: {}".format(
@@ -105,24 +103,25 @@ def build_pipeline(self, dataset_properties: Dict[str, Any]) -> TabularClassific
         return TabularClassificationPipeline(dataset_properties=dataset_properties)
 
     def search(
-            self,
-            optimize_metric: str,
-            X_train: Optional[Union[List, pd.DataFrame, np.ndarray]] = None,
-            y_train: Optional[Union[List, pd.DataFrame, np.ndarray]] = None,
-            X_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None,
-            y_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None,
-            budget_type: Optional[str] = None,
-            budget: Optional[float] = None,
-            total_walltime_limit: int = 100,
-            func_eval_time_limit: int = 60,
-            traditional_per_total_budget: float = 0.1,
-            memory_limit: Optional[int] = 4096,
-            smac_scenario_args: Optional[Dict[str, Any]] = None,
-            get_smac_object_callback: Optional[Callable] = None,
-            all_supported_metrics: bool = True,
-            precision: int = 32,
-            disable_file_output: List = [],
-            load_models: bool = True,
+        self,
+        optimize_metric: str,
+        X_train: Optional[Union[List, pd.DataFrame, np.ndarray]] = None,
+        y_train: Optional[Union[List, pd.DataFrame, np.ndarray]] = None,
+        X_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None,
+        y_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None,
+        dataset_name: Optional[str] = None,
+        budget_type: Optional[str] = None,
+        budget: Optional[float] = None,
+        total_walltime_limit: int = 100,
+        func_eval_time_limit: int = 60,
+        traditional_per_total_budget: float = 0.1,
+        memory_limit: Optional[int] = 4096,
+        smac_scenario_args: Optional[Dict[str, Any]] = None,
+        get_smac_object_callback: Optional[Callable] = None,
+        all_supported_metrics: bool = True,
+        precision: int = 32,
+        disable_file_output: List = [],
+        load_models: bool = True,
     ) -> 'BaseTask':
         """
         Search for the best pipeline configuration for the given dataset.
@@ -133,9 +132,8 @@ def search(
         Args:
             X_train, y_train, X_test, y_test: Union[np.ndarray, List, pd.DataFrame]
                 A pair of features (X_train) and targets (y_train) used to fit a
-                pipeline. Additionally, a holdout of this paris (X_test, y_test) can
+                pipeline. Additionally, a holdout pair (X_test, y_test) can
                 be provided to track the generalization performance of each stage.
-                Providing X_train, y_train and dataset together is not supported.
             optimize_metric (str): name of the metric that is used to
                 evaluate a pipeline.
             budget_type (Optional[str]):
@@ -189,6 +187,18 @@ def search(
             self
 
         """
+        if dataset_name is None:
+            dataset_name = str(uuid.uuid1(clock_seq=os.getpid()))
+
+        # We have to create a logger at this point for the validator
+        self._logger = self._get_logger(dataset_name)
+
+        # Create a validator object to make sure that the data provided by
+        # the user matches the autopytorch requirements
+        self.InputValidator = TabularInputValidator(
+            is_classification=True,
+            logger_port=self._logger_port,
+        )
 
         # Fit a input validator to check the provided data
         # Also, an encoder is fit to both train and test data,
@@ -227,7 +237,7 @@ def predict(
             n_jobs: int = 1
     ) -> np.ndarray:
         if self.InputValidator is None or not self.InputValidator._is_fitted:
-            raise ValueError("predict() is only supported after calling fit. Kindly call first "
+            raise ValueError("predict() is only supported after calling search. Kindly call first "
                              "the estimator fit() method.")
 
         X_test = self.InputValidator.feature_validator.transform(X_test)
@@ -247,7 +257,7 @@ def predict_proba(self,
                       X_test: Union[np.ndarray, pd.DataFrame, List],
                       batch_size: Optional[int] = None, n_jobs: int = 1) -> np.ndarray:
         if self.InputValidator is None or not self.InputValidator._is_fitted:
-            raise ValueError("predict() is only supported after calling fit. Kindly call first "
+            raise ValueError("predict() is only supported after calling search. Kindly call first "
                              "the estimator fit() method.")
         X_test = self.InputValidator.feature_validator.transform(X_test)
         return super().predict(X_test, batch_size=batch_size, n_jobs=n_jobs)
diff --git a/autoPyTorch/data/base_feature_validator.py b/autoPyTorch/data/base_feature_validator.py
@@ -39,7 +39,7 @@ class BaseFeatureValidator(BaseEstimator):
             Host a encoder object if the data requires transformation (for example,
             if provided a categorical column in a pandas DataFrame)
         enc_columns (typing.List[str])
-            List of columns that where encoded.
+            List of columns that were encoded.
     """
     def __init__(self,
                  logger: typing.Optional[typing.Union[PicklableClientLogger, logging.Logger
diff --git a/autoPyTorch/data/tabular_feature_validator.py b/autoPyTorch/data/tabular_feature_validator.py
@@ -72,6 +72,7 @@ def _fit(
 
                 # The column transformer reoders the feature types - we therefore need to change
                 # it as well
+                # This means columns are shifted to the right
                 def comparator(cmp1: str, cmp2: str) -> int:
                     if (
                         cmp1 == 'categorical' and cmp2 == 'categorical'
diff --git a/autoPyTorch/datasets/tabular_dataset.py b/autoPyTorch/datasets/tabular_dataset.py
@@ -81,17 +81,16 @@ def __init__(self,
         # dataset.
         # TODO: Consider moving the validator to the pipeline itself when we
         # move to using the fit_params on scikit learn 0.24
-        self.validator = validator
-        if self.validator is None:
+        if validator is None:
             raise ValueError("A feature validator is required to build a tabular pipeline")
 
-        X, Y = self.validator.transform(X, Y)
+        X, Y = validator.transform(X, Y)
         if X_test is not None:
-            X_test, Y_test = self.validator.transform(X_test, Y_test)
-        self.categorical_columns = self.validator.feature_validator.categorical_columns
-        self.numerical_columns = self.validator.feature_validator.numerical_columns
-        self.num_features = self.validator.feature_validator.num_features
-        self.categories = self.validator.feature_validator.categories
+            X_test, Y_test = validator.transform(X_test, Y_test)
+        self.categorical_columns = validator.feature_validator.categorical_columns
+        self.numerical_columns = validator.feature_validator.numerical_columns
+        self.num_features = validator.feature_validator.num_features
+        self.categories = validator.feature_validator.categories
 
         super().__init__(train_tensors=(X, Y), test_tensors=(X_test, Y_test), shuffle=shuffle,
                          resampling_strategy=resampling_strategy,
@@ -122,8 +121,3 @@ def get_required_dataset_info(self) -> Dict[str, Any]:
             'task_type': self.task_type
         })
         return info
-
-    def __getstate__(self) -> Dict[str, Any]:
-        # Make pickable!
-        self.validator = None
-        return self.__dict__
diff --git a/autoPyTorch/pipeline/base_pipeline.py b/autoPyTorch/pipeline/base_pipeline.py
@@ -111,7 +111,7 @@ def fit(self, X: Dict[str, Any], y: Optional[np.ndarray] = None,
         """Fit the selected algorithm to the training data.
         Arguments:
             X (typing.Dict):
-            A fit dictionary so that contains information to fit a pipeline
+            A fit dictionary that contains information to fit a pipeline
             TODO: Use fit_params support from 0.24 scikit learn version instead
             y (None):
             Used for Compatibility, but it has no funciton in out fit strategy