
Commit b7a0897

nabenabe0928 authored and ravinkohli committed
[refactor] Address Shuhei's comments
[fix] Fix Flake8 issues
[refactor] Address Shuhei's comments
1 parent 107750a commit b7a0897

File tree

11 files changed: +116 additions, −111 deletions


autoPyTorch/evaluation/tae.py

Lines changed: 1 addition & 2 deletions
@@ -186,8 +186,7 @@ def __init__(
         else:
             raise ValueError("resampling strategy must be in "
                              "(HoldoutValTypes, CrossValTypes, NoResamplingStrategyTypes), "
-                             "but got {}.".format(self.resampling_strategy)
-                             )
+                             "but got {}.".format(self.resampling_strategy))
 
         self.worst_possible_result = cost_for_crash

autoPyTorch/evaluation/train_evaluator.py

Lines changed: 0 additions & 1 deletion
@@ -156,7 +156,6 @@ def __init__(self, backend: Backend, queue: Queue,
             'resampling_strategy, but got {}'.format(self.datamanager.resampling_strategy)
         )
 
-
         self.splits = self.datamanager.splits
         if self.splits is None:
             raise AttributeError("Must have called create_splits on {}".format(self.datamanager.__class__.__name__))

autoPyTorch/pipeline/components/training/trainer/AdversarialTrainer.py

Lines changed: 13 additions & 10 deletions
@@ -37,7 +37,11 @@ def __init__(
 
         Args:
             epsilon (float): The perturbation magnitude.
-
+
+        References:
+            Explaining and Harnessing Adversarial Examples
+            Ian J. Goodfellow et al.
+            https://arxiv.org/pdf/1412.6572.pdf
         """
         super().__init__(random_state=random_state,
                          weighted_loss=weighted_loss,
@@ -96,10 +100,10 @@ def train_step(self, data: np.ndarray, targets: np.ndarray) -> Tuple[float, torch.Tensor]:
         # training
         self.optimizer.zero_grad()
         original_outputs = self.model(original_data)
-        adversarial_output = self.model(adversarial_data)
+        adversarial_outputs = self.model(adversarial_data)
 
         loss_func = self.criterion_preparation(**criterion_kwargs)
-        loss = loss_func(self.criterion, original_outputs, adversarial_output)
+        loss = loss_func(self.criterion, original_outputs, adversarial_outputs)
         loss.backward()
         self.optimizer.step()
         if self.scheduler:
@@ -125,6 +129,9 @@ def fgsm_attack(
 
         Returns:
             adv_data (np.ndarray): the adversarial examples.
+
+        References:
+            https://pytorch.org/tutorials/beginner/fgsm_tutorial.html#fgsm-attack
         """
         data_copy = deepcopy(data)
         data_copy = data_copy.float().to(self.device)
@@ -159,7 +166,7 @@ def get_hyperparameter_search_space(
         dataset_properties: Optional[Dict] = None,
         weighted_loss: HyperparameterSearchSpace = HyperparameterSearchSpace(
             hyperparameter="weighted_loss",
-            value_range=[True, False],
+            value_range=(True, False),
             default_value=True),
         la_steps: HyperparameterSearchSpace = HyperparameterSearchSpace(
             hyperparameter="la_steps",
@@ -196,9 +203,7 @@ def get_hyperparameter_search_space(
 
         add_hyperparameter(cs, epsilon, UniformFloatHyperparameter)
         add_hyperparameter(cs, use_stochastic_weight_averaging, CategoricalHyperparameter)
-        snapshot_ensemble_flag = False
-        if any(use_snapshot_ensemble.value_range):
-            snapshot_ensemble_flag = True
+        snapshot_ensemble_flag = any(use_snapshot_ensemble.value_range)
 
         use_snapshot_ensemble = get_hyperparameter(use_snapshot_ensemble, CategoricalHyperparameter)
         cs.add_hyperparameter(use_snapshot_ensemble)
@@ -209,9 +214,7 @@ def get_hyperparameter_search_space(
         cond = EqualsCondition(se_lastk, use_snapshot_ensemble, True)
         cs.add_condition(cond)
 
-        lookahead_flag = False
-        if any(use_lookahead_optimizer.value_range):
-            lookahead_flag = True
+        lookahead_flag = any(use_lookahead_optimizer.value_range)
 
         use_lookahead_optimizer = get_hyperparameter(use_lookahead_optimizer, CategoricalHyperparameter)
         cs.add_hyperparameter(use_lookahead_optimizer)
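
For reference, the fgsm_attack method documented above follows the fast gradient sign method from the linked PyTorch tutorial. A minimal standalone sketch of the technique, with an illustrative signature rather than the trainer's actual one:

import torch

def fgsm_attack(data: torch.Tensor, epsilon: float, gradient: torch.Tensor) -> torch.Tensor:
    # Step in the direction that increases the loss, scaled by epsilon.
    perturbed = data + epsilon * gradient.sign()
    # Clamp back into the valid input range, as in the tutorial.
    return torch.clamp(perturbed, 0, 1)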

autoPyTorch/pipeline/components/training/trainer/GridCutMixTrainer.py

Lines changed: 10 additions & 8 deletions
@@ -26,14 +26,15 @@ def data_preparation(self, X: np.ndarray, y: np.ndarray,
             np.ndarray: that processes data
             typing.Dict[str, np.ndarray]: arguments to the criterion function
         """
-        beta = 1.0
-        lam = self.random_state.beta(beta, beta)
-        batch_size, channel, W, H = X.size()
-        index = torch.randperm(batch_size).cuda() if X.is_cuda else torch.randperm(batch_size)
+        alpha, beta = 1.0, 1.0
+        lam = self.random_state.beta(alpha, beta)
+        batch_size, _, W, H = X.shape
+        device = torch.device('cuda' if X.is_cuda else 'cpu')
+        batch_indices = torch.randperm(batch_size).to(device)
 
         r = self.random_state.rand(1)
         if beta <= 0 or r > self.alpha:
-            return X, {'y_a': y, 'y_b': y[index], 'lam': 1}
+            return X, {'y_a': y, 'y_b': y[batch_indices], 'lam': 1}
 
         # Draw parameters of a random bounding box
         # Where to cut basically
@@ -47,12 +48,13 @@ def data_preparation(self, X: np.ndarray, y: np.ndarray,
         bbx2 = np.clip(cx + cut_w // 2, 0, W)
         bby2 = np.clip(cy + cut_h // 2, 0, H)
 
-        X[:, :, bbx1:bbx2, bby1:bby2] = X[index, :, bbx1:bbx2, bby1:bby2]
+        X[:, :, bbx1:bbx2, bby1:bby2] = X[batch_indices, :, bbx1:bbx2, bby1:bby2]
 
         # Adjust lam
-        lam = 1 - ((bbx2 - bbx1) * (bby2 - bby1) / (X.size()[-1] * X.size()[-2]))
+        pixel_size = W * H
+        lam = 1 - ((bbx2 - bbx1) * (bby2 - bby1) / pixel_size)
 
-        y_a, y_b = y, y[index]
+        y_a, y_b = y, y[batch_indices]
 
         return X, {'y_a': y_a, 'y_b': y_b, 'lam': lam}
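
The {'y_a', 'y_b', 'lam'} dict returned here is passed as criterion kwargs (compare criterion_preparation in the AdversarialTrainer hunk above). In standard CutMix/MixUp training the loss is interpolated between the two label sets; a sketch under that assumption, with the illustrative name mixed_criterion:

import torch

def mixed_criterion(criterion: torch.nn.Module, outputs: torch.Tensor,
                    y_a: torch.Tensor, y_b: torch.Tensor, lam: float) -> torch.Tensor:
    # Weight each label set by the fraction of the input it still occupies.
    return lam * criterion(outputs, y_a) + (1 - lam) * criterion(outputs, y_b)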

autoPyTorch/pipeline/components/training/trainer/RowCutMixTrainer.py

Lines changed: 17 additions & 11 deletions
@@ -26,25 +26,31 @@ def data_preparation(self, X: np.ndarray, y: np.ndarray,
             np.ndarray: that processes data
             typing.Dict[str, np.ndarray]: arguments to the criterion function
         """
-        beta = 1.0
-        lam = self.random_state.beta(beta, beta)
-        batch_size = X.size()[0]
-        index = torch.randperm(batch_size).cuda() if X.is_cuda else torch.randperm(batch_size)
+        alpha, beta = 1.0, 1.0
+        lam = self.random_state.beta(alpha, beta)
+        batch_size = X.shape[0]
+        device = torch.device('cuda' if X.is_cuda else 'cpu')
+        batch_indices = torch.randperm(batch_size).to(device)
 
         r = self.random_state.rand(1)
         if beta <= 0 or r > self.alpha:
-            return X, {'y_a': y, 'y_b': y[index], 'lam': 1}
+            return X, {'y_a': y, 'y_b': y[batch_indices], 'lam': 1}
 
-        size = X.shape[1]
-        indices = torch.tensor(self.random_state.choice(range(1, size), max(1, np.int32(size * lam)),
-                                                        replace=False))
+        row_size = X.shape[1]
+        row_indices = torch.tensor(
+            self.random_state.choice(
+                range(1, row_size),
+                max(1, int(row_size * lam)),
+                replace=False
+            )
+        )
 
-        X[:, indices] = X[index, :][:, indices]
+        X[:, row_indices] = X[batch_indices, :][:, row_indices]
 
         # Adjust lam
-        lam = 1 - ((len(indices)) / (X.size()[1]))
+        lam = 1 - len(row_indices) / X.shape[1]
 
-        y_a, y_b = y, y[index]
+        y_a, y_b = y, y[batch_indices]
 
         return X, {'y_a': y_a, 'y_b': y_b, 'lam': lam}
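
A standalone illustration of the row-wise CutMix logic above, on made-up data: a random subset of feature columns is swapped between shuffled rows, and lam is reset to the fraction of untouched features.

import numpy as np
import torch

rng = np.random.RandomState(0)
X = torch.arange(12, dtype=torch.float32).reshape(3, 4)  # 3 samples, 4 features
lam = rng.beta(1.0, 1.0)
batch_indices = torch.randperm(X.shape[0])
n_swap = max(1, int(X.shape[1] * lam))
swap_cols = torch.tensor(rng.choice(range(1, X.shape[1]), n_swap, replace=False))
X[:, swap_cols] = X[batch_indices, :][:, swap_cols]  # mix rows column-wise
lam = 1 - len(swap_cols) / X.shape[1]  # fraction of features left intact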

autoPyTorch/pipeline/components/training/trainer/RowCutOutTrainer.py

Lines changed: 10 additions & 13 deletions
@@ -9,7 +9,9 @@
 
 
 class RowCutOutTrainer(CutOut, BaseTrainerComponent):
+    # 0 is non-informative in image data
     NUMERICAL_VALUE = 0
+    # -1 is conceptually equivalent to 0 in an image, i.e. 0-pad
     CATEGORICAL_VALUE = -1
 
     def data_preparation(self, X: np.ndarray, y: np.ndarray,
@@ -36,23 +38,18 @@ def data_preparation(self, X: np.ndarray, y: np.ndarray,
             lam = 1
             return X, {'y_a': y_a, 'y_b': y_b, 'lam': lam}
 
-        size = X.shape[1]
-        indices = self.random_state.choice(range(1, size), max(1, np.int32(size * self.patch_ratio)),
-                                           replace=False)
+        row_size = X.shape[1]
+        row_indices = self.random_state.choice(range(1, row_size), max(1, int(row_size * self.patch_ratio)),
+                                               replace=False)
 
         if not isinstance(self.numerical_columns, typing.Iterable):
-            raise ValueError("{} requires numerical columns information of {}"
-                             "to prepare data got {}.".format(self.__class__.__name__,
-                                                              typing.Iterable,
-                                                              self.numerical_columns))
+            raise ValueError("numerical_columns in {} must be iterable, "
+                             "but got {}.".format(self.__class__.__name__,
+                                                  self.numerical_columns))
+
         numerical_indices = torch.tensor(self.numerical_columns)
-        categorical_indices = torch.tensor([index for index in indices if index not in self.numerical_columns])
+        categorical_indices = torch.tensor([idx for idx in row_indices if idx not in self.numerical_columns])
 
-        # We use an ordinal encoder on the categorical columns of tabular data
-        # -1 is the conceptual equivalent to 0 in a image, that does not
-        # have color as a feature and hence the network has to learn to deal
-        # without this data. For numerical columns we use 0 to cutout the features
-        # similar to the effect that setting 0 as a pixel value in an image.
         X[:, categorical_indices.long()] = self.CATEGORICAL_VALUE
         X[:, numerical_indices.long()] = self.NUMERICAL_VALUE
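
To make the fill-value constants concrete, a toy example (illustrative tensors; the categorical column is assumed ordinally encoded, so -1 lies outside every valid category code, while 0 blanks a numerical feature much like a zeroed pixel):

import torch

X = torch.tensor([[2.0, 0.5, 1.0],
                  [1.0, 0.3, 2.0]])    # column 0 categorical, columns 1-2 numerical
categorical_indices = torch.tensor([0])
numerical_indices = torch.tensor([1, 2])
X[:, categorical_indices.long()] = -1  # CATEGORICAL_VALUE: no such category
X[:, numerical_indices.long()] = 0     # NUMERICAL_VALUE: feature cut out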

autoPyTorch/pipeline/components/training/trainer/__init__.py

Lines changed: 10 additions & 11 deletions
@@ -413,12 +413,13 @@ def _fit(self, X: Dict[str, Any], y: Any = None, **kwargs: Any) -> 'TrainerChoice':
 
         if self.choice.use_stochastic_weight_averaging and self.choice.swa_updated:
             # update batch norm statistics
-            swa_utils.update_bn(X['train_data_loader'], self.choice.swa_model.double())
+            swa_utils.update_bn(loader=X['train_data_loader'], model=self.choice.swa_model.double())
+
             # change model
             update_model_state_dict_from_swa(X['network'], self.choice.swa_model.state_dict())
         if self.choice.use_snapshot_ensemble:
             for model in self.choice.model_snapshots:
-                swa_utils.update_bn(X['train_data_loader'], model.double())
+                swa_utils.update_bn(loader=X['train_data_loader'], model=model.double())
 
         # wrap up -- add score if not evaluating every epoch
         if not self.eval_valid_each_epoch(X):
@@ -490,13 +491,10 @@ def early_stop_handler(self, X: Dict[str, Any]) -> bool:
         if self.checkpoint_dir is None:
             self.checkpoint_dir = tempfile.mkdtemp(dir=X['backend'].temporary_directory)
 
+        target_metrics = 'val_loss'
         if X['val_indices'] is None:
-            if X['X_test'] is not None:
-                epochs_since_best = self.run_summary.get_last_epoch() - self.run_summary.get_best_epoch('test_loss')
-            else:
-                epochs_since_best = self.run_summary.get_last_epoch() - self.run_summary.get_best_epoch('train_loss')
-        else:
-            epochs_since_best = self.run_summary.get_last_epoch() - self.run_summary.get_best_epoch()
+            target_metrics = 'test_loss' if X['X_test'] is not None else 'train_loss'
+        epochs_since_best = self.run_summary.get_last_epoch() - self.run_summary.get_best_epoch(target_metrics)
 
         # Save the checkpoint if there is a new best epoch
         best_path = os.path.join(self.checkpoint_dir, 'best.pth')
@@ -626,11 +624,12 @@ def __str__(self) -> str:
     def _get_search_space_updates(self, prefix: Optional[str] = None) -> Dict[str, HyperparameterSearchSpace]:
         """Get the search space updates with the given prefix
 
-        Keyword Arguments:
-            prefix {str} -- Only return search space updates with given prefix (default: {None})
+        Args:
+            prefix (Optional[str]): Only return search space updates with given prefix
 
         Returns:
-            dict -- Mapping of search space updates. Keys don't contain the prefix.
+            Dict[str, HyperparameterSearchSpace]:
+                Mapping of search space updates. Keys don't contain the prefix.
         """
         updates = super()._get_search_space_updates(prefix=prefix)
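
For context on the keyword arguments introduced above: torch.optim.swa_utils.update_bn(loader, model) re-runs the data through the averaged model to recompute BatchNorm running statistics, which weight averaging alone does not maintain. A minimal sketch with a toy model and loader:

import torch
from torch.optim import swa_utils

model = torch.nn.Sequential(torch.nn.Linear(4, 8), torch.nn.BatchNorm1d(8))
swa_model = swa_utils.AveragedModel(model)
loader = torch.utils.data.DataLoader(
    torch.utils.data.TensorDataset(torch.randn(32, 4)), batch_size=8)
swa_model.update_parameters(model)                   # fold current weights into the running average
swa_utils.update_bn(loader=loader, model=swa_model)  # refresh BN statistics for the averaged weights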

autoPyTorch/pipeline/components/training/trainer/base_trainer.py

Lines changed: 9 additions & 12 deletions
@@ -26,7 +26,7 @@
 from autoPyTorch.pipeline.components.training.base_training import autoPyTorchTrainingComponent
 from autoPyTorch.pipeline.components.training.metrics.metrics import CLASSIFICATION_METRICS, REGRESSION_METRICS
 from autoPyTorch.pipeline.components.training.metrics.utils import calculate_score
-from autoPyTorch.pipeline.components.training.trainer.utils import Lookahead, swa_average_function
+from autoPyTorch.pipeline.components.training.trainer.utils import Lookahead, swa_update
 from autoPyTorch.utils.common import FitRequirement, HyperparameterSearchSpace, add_hyperparameter, get_hyperparameter
 from autoPyTorch.utils.implementations import get_loss_weight_strategy
 
@@ -214,7 +214,7 @@ def __init__(self, weighted_loss: bool = False,
                  use_snapshot_ensemble: bool = True,
                  se_lastk: int = 3,
                  use_lookahead_optimizer: bool = True,
-                 random_state: Optional[Union[np.random.RandomState, int]] = None,
+                 random_state: Optional[np.random.RandomState] = None,
                  swa_model: Optional[torch.nn.Module] = None,
                  model_snapshots: Optional[List[torch.nn.Module]] = None,
                  **lookahead_config: Any) -> None:
@@ -275,13 +275,14 @@ def prepare(
 
         # in case we are using swa, maintain an averaged model,
         if self.use_stochastic_weight_averaging:
-            self.swa_model = swa_utils.AveragedModel(self.model, avg_fn=swa_average_function)
+            self.swa_model = swa_utils.AveragedModel(self.model, avg_fn=swa_update)
 
         # in case we are using se or swa, initialise budget_threshold to know when to start swa or se
         self._budget_threshold = 0
         if self.use_stochastic_weight_averaging or self.use_snapshot_ensemble:
-            assert budget_tracker.max_epochs is not None, "Can only use stochastic weight averaging or snapshot " \
-                                                          "ensemble when budget is epochs"
+            if budget_tracker.max_epochs is None:
+                raise ValueError("Budget for stochastic weight averaging or snapshot ensemble must be `epoch`.")
+
             self._budget_threshold = int(0.75 * budget_tracker.max_epochs)
 
         # in case we are using se, initialise list to store model snapshots
@@ -576,7 +577,7 @@ def get_hyperparameter_search_space(
         dataset_properties: Optional[Dict] = None,
         weighted_loss: HyperparameterSearchSpace = HyperparameterSearchSpace(
             hyperparameter="weighted_loss",
-            value_range=[True, False],
+            value_range=(True, False),
             default_value=True),
         la_steps: HyperparameterSearchSpace = HyperparameterSearchSpace(
             hyperparameter="la_steps",
@@ -608,9 +609,7 @@ def get_hyperparameter_search_space(
         cs = ConfigurationSpace()
 
         add_hyperparameter(cs, use_stochastic_weight_averaging, CategoricalHyperparameter)
-        snapshot_ensemble_flag = False
-        if any(use_snapshot_ensemble.value_range):
-            snapshot_ensemble_flag = True
+        snapshot_ensemble_flag = any(use_snapshot_ensemble.value_range)
 
         use_snapshot_ensemble = get_hyperparameter(use_snapshot_ensemble, CategoricalHyperparameter)
         cs.add_hyperparameter(use_snapshot_ensemble)
@@ -621,9 +620,7 @@ def get_hyperparameter_search_space(
         cond = EqualsCondition(se_lastk, use_snapshot_ensemble, True)
         cs.add_condition(cond)
 
-        lookahead_flag = False
-        if any(use_lookahead_optimizer.value_range):
-            lookahead_flag = True
+        lookahead_flag = any(use_lookahead_optimizer.value_range)
 
         use_lookahead_optimizer = get_hyperparameter(use_lookahead_optimizer, CategoricalHyperparameter)
         cs.add_hyperparameter(use_lookahead_optimizer)
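
A note on the flag simplification repeated in these hunks: value_range holds the candidate values of the boolean hyperparameter, so any() enables the flag exactly when True is among them.

# any() over a boolean value_range:
assert any((True, False))  # True is a candidate -> flag enabled
assert not any((False,))   # only False allowed -> flag disabled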
