[fix] Bring back the data generator shuffle

nabenabe0928 · nabenabe0928 · commit 8c9b89568b41 · 2021-05-19T14:08:00.000+09:00
diff --git a/autoPyTorch/datasets/base_dataset.py b/autoPyTorch/datasets/base_dataset.py
@@ -71,6 +71,7 @@ def __init__(
         test_tensors: Optional[BaseDatasetInputType] = None,
         resampling_strategy: Union[CrossValTypes, HoldoutValTypes] = HoldoutValTypes.holdout_validation,
         resampling_strategy_args: Optional[Dict[str, Any]] = None,
+        shuffle: Optional[bool] = True,
         seed: Optional[int] = 42,
         train_transforms: Optional[torchvision.transforms.Compose] = None,
         val_transforms: Optional[torchvision.transforms.Compose] = None,
@@ -91,7 +92,7 @@ def __init__(
             resampling_strategy_args (Optional[Dict[str, Any]]):
                 arguments required for the chosen resampling strategy.
                 The details are provided in autoPytorch/datasets/resampling_strategy.py
-            shuffle:  Whether to shuffle the data when performing splits
+            shuffle (Optional[bool]), (default=True): Whether to shuffle the data before performing splits
             seed (int), (default=1): seed to be used for reproducibility.
             train_transforms (Optional[torchvision.transforms.Compose]):
                 Additional Transforms to be applied to the training data
@@ -107,12 +108,14 @@ def __init__(
             type_check(train_tensors, val_tensors)
         self.train_tensors, self.val_tensors, self.test_tensors = train_tensors, val_tensors, test_tensors
         self.random_state = np.random.RandomState(seed=seed)
+        self.shuffle = shuffle
+
         self.resampling_strategy = resampling_strategy
         self.resampling_strategy_args: Dict[str, Any] = {}
         if resampling_strategy_args is not None:
             self.resampling_strategy_args = resampling_strategy_args
 
-        self.shuffle = self.resampling_strategy_args.get('shuffle', False)
+        self.shuffle_split = self.resampling_strategy_args.get('shuffle', False)
         self.is_stratify = self.resampling_strategy_args.get('stratify', False)
 
         self.task_type: Optional[str] = None
@@ -195,7 +198,7 @@ def __len__(self) -> int:
         return self.train_tensors[0].shape[0]
 
     def _get_indices(self) -> np.ndarray:
-        return np.arange(len(self))
+        return self.random_state.permutation(len(self)) if self.shuffle else np.arange(len(self))
 
     def _process_resampling_strategy_args(self) -> None:
         if not any(isinstance(self.resampling_strategy, val_type)
@@ -238,7 +241,7 @@ def get_splits_from_resampling_strategy(self) -> List[Tuple[List[int], List[int]
             return self.resampling_strategy(
                 random_state=self.random_state,
                 val_share=val_share,
-                shuffle=self.shuffle,
+                shuffle=self.shuffle_split,
                 indices=self._get_indices(),
                 labels_to_stratify=labels_to_stratify
             )
@@ -248,7 +251,7 @@ def get_splits_from_resampling_strategy(self) -> List[Tuple[List[int], List[int]
             return self.resampling_strategy(
                 random_state=self.random_state,
                 num_splits=num_splits,
-                shuffle=self.shuffle,
+                shuffle=self.shuffle_split,
                 indices=self._get_indices(),
                 labels_to_stratify=labels_to_stratify
             )
diff --git a/autoPyTorch/datasets/image_dataset.py b/autoPyTorch/datasets/image_dataset.py
@@ -45,7 +45,7 @@ class ImageDataset(BaseDataset):
         resampling_strategy_args (Optional[Dict[str, Any]]):
             arguments required for the chosen resampling strategy.
             The details are provided in autoPytorch/datasets/resampling_strategy.py
-        shuffle:  Whether to shuffle the data when performing splits
+        shuffle (Optional[bool]), (default=True): Whether to shuffle the data before performing splits
         seed (int), (default=1): seed to be used for reproducibility.
         train_transforms (Optional[torchvision.transforms.Compose]):
             Additional Transforms to be applied to the training data
@@ -58,6 +58,7 @@ def __init__(self,
                  test: Optional[IMAGE_DATASET_INPUT] = None,
                  resampling_strategy: Union[CrossValTypes, HoldoutValTypes] = HoldoutValTypes.holdout_validation,
                  resampling_strategy_args: Optional[Dict[str, Any]] = None,
+                 shuffle: Optional[bool] = True,
                  seed: Optional[int] = 42,
                  train_transforms: Optional[torchvision.transforms.Compose] = None,
                  val_transforms: Optional[torchvision.transforms.Compose] = None,
@@ -70,7 +71,7 @@ def __init__(self,
             test = _create_image_dataset(data=test)
         self.mean, self.std = _calc_mean_std(train=train)
 
-        super().__init__(train_tensors=train, val_tensors=val, test_tensors=test,
+        super().__init__(train_tensors=train, val_tensors=val, test_tensors=test, shuffle=shuffle,
                          resampling_strategy=resampling_strategy, resampling_strategy_args=resampling_strategy_args,
                          seed=seed,
                          train_transforms=train_transforms,
diff --git a/autoPyTorch/datasets/tabular_dataset.py b/autoPyTorch/datasets/tabular_dataset.py
@@ -50,7 +50,7 @@ class TabularDataset(BaseDataset):
             resampling_strategy_args (Optional[Dict[str, Any]]):
                 arguments required for the chosen resampling strategy.
                 The details are provided in autoPytorch/datasets/resampling_strategy.py
-            shuffle:  Whether to shuffle the data when performing splits
+            shuffle (Optional[bool]), (default=True): Whether to shuffle the data before performing splits
             seed (int), (default=1): seed to be used for reproducibility.
             train_transforms (Optional[torchvision.transforms.Compose]):
                 Additional Transforms to be applied to the training data.
@@ -68,6 +68,7 @@ def __init__(self,
                  Y_test: Optional[Union[np.ndarray, pd.DataFrame]] = None,
                  resampling_strategy: Union[CrossValTypes, HoldoutValTypes] = HoldoutValTypes.holdout_validation,
                  resampling_strategy_args: Optional[Dict[str, Any]] = None,
+                 shuffle: Optional[bool] = True,
                  seed: Optional[int] = 42,
                  train_transforms: Optional[torchvision.transforms.Compose] = None,
                  val_transforms: Optional[torchvision.transforms.Compose] = None,
@@ -90,7 +91,7 @@ def __init__(self,
         self.num_features = validator.feature_validator.num_features
         self.categories = validator.feature_validator.categories
 
-        super().__init__(train_tensors=(X, Y), test_tensors=(X_test, Y_test),
+        super().__init__(train_tensors=(X, Y), test_tensors=(X_test, Y_test), shuffle=shuffle,
                          resampling_strategy=resampling_strategy,
                          resampling_strategy_args=resampling_strategy_args,
                          seed=seed, train_transforms=train_transforms,
diff --git a/autoPyTorch/datasets/time_series_dataset.py b/autoPyTorch/datasets/time_series_dataset.py
@@ -41,6 +41,7 @@ def __init__(self,
                  val: Optional[TIME_SERIES_FORECASTING_INPUT] = None,
                  resampling_strategy: Union[CrossValTypes, HoldoutValTypes] = HoldoutValTypes.holdout_validation,
                  resampling_strategy_args: Optional[Dict[str, Any]] = None,
+                 shuffle: Optional[bool] = False,
                  seed: Optional[int] = 42,
                  train_transforms: Optional[torchvision.transforms.Compose] = None,
                  val_transforms: Optional[torchvision.transforms.Compose] = None,
@@ -68,7 +69,7 @@ def __init__(self,
                                                           target_variables=target_variables,
                                                           sequence_length=sequence_length,
                                                           n_steps=n_steps)
-        super().__init__(train_tensors=train, val_tensors=val,
+        super().__init__(train_tensors=train, val_tensors=val, shuffle=shuffle,
                          resampling_strategy=resampling_strategy, resampling_strategy_args=resampling_strategy_args,
                          seed=seed,
                          train_transforms=train_transforms,
@@ -128,17 +129,15 @@ def __init__(self,
         _check_time_series_inputs(train=train,
                                   val=val,
                                   task_type="time_series_classification")
-        resampling_strategy_args = {'shuffle': True}
-        super().__init__(train_tensors=train, val_tensors=val, resampling_strategy_args=resampling_strategy_args)
+        super().__init__(train_tensors=train, val_tensors=val, shuffle=True)
 
 
 class TimeSeriesRegressionDataset(BaseDataset):
     def __init__(self, train: Tuple[np.ndarray, np.ndarray], val: Optional[Tuple[np.ndarray, np.ndarray]] = None):
         _check_time_series_inputs(train=train,
                                   val=val,
                                   task_type="time_series_regression")
-        resampling_strategy_args = {'shuffle': True}
-        super().__init__(train_tensors=train, val_tensors=val, resampling_strategy_args=resampling_strategy_args)
+        super().__init__(train_tensors=train, val_tensors=val, shuffle=True)
 
 
 def _check_time_series_inputs(task_type: str,