Skip to content

Commit 4bf86b0

Browse files
authored
Merge pull request #82 from franchuterivera/refactor_development_network_performance
Make sure the performance of pipeline is at least 0.8
2 parents 36e7db6 + 00313c8 commit 4bf86b0

File tree

11 files changed

Lines changed: 84 additions & 28 deletions

11 files changed

Lines changed: 84 additions & 28 deletions

autoPyTorch/api/base_task.py

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -216,7 +216,7 @@ def set_pipeline_config(
216216
then sets them to the current pipeline
217217
configuration.
218218
Args:
219-
**pipeline_config_kwargs: Valid config options include "job_id",
219+
**pipeline_config_kwargs: Valid config options include "num_run",
220220
"device", "budget_type", "epochs", "runtime", "torch_num_threads",
221221
"early_stopping", "use_tensorboard_logger", "use_pynisher",
222222
"metrics_during_training"
@@ -923,7 +923,7 @@ def refit(
923923
'train_indices': dataset.splits[split_id][0],
924924
'val_indices': dataset.splits[split_id][1],
925925
'split_id': split_id,
926-
'job_id': 0
926+
'num_run': 0
927927
})
928928
X.update({**self.pipeline_options, **budget_config})
929929
if self.models_ is None or len(self.models_) == 0 or self.ensemble_ is None:
@@ -996,7 +996,7 @@ def fit(self,
996996
'train_indices': dataset.splits[split_id][0],
997997
'val_indices': dataset.splits[split_id][1],
998998
'split_id': split_id,
999-
'job_id': 0
999+
'num_run': 0
10001000
})
10011001
X.update({**self.pipeline_options, **budget_config})
10021002

autoPyTorch/evaluation/train_evaluator.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -258,7 +258,7 @@ def _fit_and_predict(self, pipeline: BaseEstimator, fold: int, train_indices: Un
258258
X = {'train_indices': train_indices,
259259
'val_indices': test_indices,
260260
'split_id': fold,
261-
'job_id': self.num_run,
261+
'num_run': self.num_run,
262262
**self.fit_dictionary} # fit dictionary
263263
y = None
264264
fit_and_suppress_warnings(self.logger, pipeline, X, y)

autoPyTorch/pipeline/base_pipeline.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -480,7 +480,7 @@ def get_additional_run_info(self) -> Dict:
480480
@staticmethod
481481
def get_default_pipeline_options() -> Dict[str, Any]:
482482
return {
483-
'job_id': '1',
483+
'num_run': 0,
484484
'device': 'cpu',
485485
'budget_type': 'epochs',
486486
'epochs': 5,

autoPyTorch/pipeline/components/setup/lr_scheduler/base_scheduler_choice.py

Lines changed: 8 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -138,13 +138,14 @@ def get_hyperparameter_search_space(
138138
raise ValueError("No scheduler found")
139139

140140
if default is None:
141-
defaults = ['no_LRScheduler',
142-
'LambdaLR',
143-
'StepLR',
144-
'ExponentialLR',
145-
'CosineAnnealingLR',
146-
'ReduceLROnPlateau'
147-
]
141+
defaults = [
142+
'ReduceLROnPlateau',
143+
'CosineAnnealingLR',
144+
'no_LRScheduler',
145+
'LambdaLR',
146+
'StepLR',
147+
'ExponentialLR',
148+
]
148149
for default_ in defaults:
149150
if default_ in available_schedulers:
150151
default = default_

autoPyTorch/pipeline/components/training/trainer/base_trainer.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -118,7 +118,7 @@ def get_best_epoch(self, loss_type: str = 'val_loss') -> int:
118118
[self.performance_tracker[loss_type][e] for e in range(1, len(
119119
self.performance_tracker[loss_type]) + 1
120120
)]
121-
)
121+
) + 1 # Epochs start at 1
122122

123123
def get_last_epoch(self) -> int:
124124
if 'train_loss' not in self.performance_tracker:

autoPyTorch/pipeline/components/training/trainer/base_trainer_choice.py

Lines changed: 28 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,7 @@
11
import collections
22
import logging.handlers
33
import os
4+
import tempfile
45
import time
56
from typing import Any, Dict, List, Optional, Tuple, cast
67

@@ -66,7 +67,7 @@ def __init__(self,
6667
self.writer = None # type: Optional[SummaryWriter]
6768
self._fit_requirements: Optional[List[FitRequirement]] = [
6869
FitRequirement("lr_scheduler", (_LRScheduler,), user_defined=False, dataset_property=False),
69-
FitRequirement("job_id", (str,), user_defined=False, dataset_property=False),
70+
FitRequirement("num_run", (int,), user_defined=False, dataset_property=False),
7071
FitRequirement(
7172
"optimizer", (Optimizer,), user_defined=False, dataset_property=False),
7273
FitRequirement("train_data_loader",
@@ -75,6 +76,7 @@ def __init__(self,
7576
FitRequirement("val_data_loader",
7677
(torch.utils.data.DataLoader,),
7778
user_defined=False, dataset_property=False)]
79+
self.checkpoint_dir = None # type: Optional[str]
7880

7981
def get_fit_requirements(self) -> Optional[List[FitRequirement]]:
8082
return self._fit_requirements
@@ -185,7 +187,7 @@ def fit(self, X: Dict[str, Any], y: Any = None, **kwargs: Any) -> autoPyTorchCom
185187

186188
# Setup the logger
187189
self.logger = get_named_client_logger(
188-
name=X['job_id'],
190+
name=X['num_run'],
189191
# Log to a user provided port else to the default logging port
190192
port=X['logger_port'
191193
] if 'logger_port' in X else logging.handlers.DEFAULT_TCP_LOGGING_PORT,
@@ -369,8 +371,29 @@ def early_stop_handler(self, X: Dict[str, Any]) -> bool:
369371
bool: If true, training should be stopped
370372
"""
371373
assert self.run_summary is not None
372-
epochs_since_best = self.run_summary.get_best_epoch() - self.run_summary.get_last_epoch()
374+
375+
# Allow to disable early stopping
376+
if X['early_stopping'] is None or X['early_stopping'] < 0:
377+
return False
378+
379+
# Store the best weights seen so far:
380+
if self.checkpoint_dir is None:
381+
self.checkpoint_dir = tempfile.mkdtemp(dir=X['backend'].temporary_directory)
382+
383+
epochs_since_best = self.run_summary.get_last_epoch() - self.run_summary.get_best_epoch()
384+
385+
# Save the checkpoint if there is a new best epoch
386+
best_path = os.path.join(self.checkpoint_dir, 'best.pth')
387+
if epochs_since_best == 0:
388+
torch.save(X['network'].state_dict(), best_path)
389+
373390
if epochs_since_best > X['early_stopping']:
391+
self.logger.debug(f" Early stopped model {X['num_run']} on epoch {self.run_summary.get_best_epoch()}")
392+
# We will stop the training. Load the last best performing weights
393+
X['network'].load_state_dict(torch.load(best_path))
394+
395+
# Let the tempfile module clean the temp dir
396+
self.checkpoint_dir = None
374397
return True
375398

376399
return False
@@ -458,8 +481,8 @@ def check_requirements(self, X: Dict[str, Any], y: Any = None) -> None:
458481
X['budget_type']
459482
))
460483

461-
if 'job_id' not in X:
462-
raise ValueError('To fit a trainer, expected fit dictionary to have a job_id')
484+
if 'num_run' not in X:
485+
raise ValueError('To fit a trainer, expected fit dictionary to have a num_run')
463486

464487
for config_option in ["torch_num_threads", 'device']:
465488
if config_option not in X:

examples/example_tabular_classification.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,16 @@
33
Tabular Classification
44
======================
55
"""
6+
import os
7+
import tempfile as tmp
68
import typing
79
import warnings
810

11+
os.environ['JOBLIB_TEMP_FOLDER'] = tmp.gettempdir()
12+
os.environ['OMP_NUM_THREADS'] = '1'
13+
os.environ['OPENBLAS_NUM_THREADS'] = '1'
14+
os.environ['MKL_NUM_THREADS'] = '1'
15+
916
warnings.simplefilter(action='ignore', category=UserWarning)
1017
warnings.simplefilter(action='ignore', category=FutureWarning)
1118

test/conftest.py

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -181,7 +181,7 @@ def fit_dictionary_numerical_only(backend):
181181
'X_train': X,
182182
'y_train': y,
183183
'dataset_properties': dataset_properties,
184-
'job_id': 'example_tabular_classification_1',
184+
'num_run': np.random.randint(50),
185185
'device': 'cpu',
186186
'budget_type': 'epochs',
187187
'epochs': 1,
@@ -220,7 +220,7 @@ def fit_dictionary_categorical_only(backend):
220220
'X_train': X,
221221
'y_train': y,
222222
'dataset_properties': dataset_properties,
223-
'job_id': 'example_tabular_classification_1',
223+
'num_run': np.random.randint(50),
224224
'device': 'cpu',
225225
'budget_type': 'epochs',
226226
'epochs': 1,
@@ -262,7 +262,7 @@ def fit_dictionary_num_and_categorical(backend):
262262
'X_train': X,
263263
'y_train': y,
264264
'dataset_properties': dataset_properties,
265-
'job_id': 'example_tabular_classification_1',
265+
'num_run': np.random.randint(50),
266266
'device': 'cpu',
267267
'budget_type': 'epochs',
268268
'epochs': 1,

test/test_pipeline/components/test_setup_networks.py

Lines changed: 29 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -32,6 +32,12 @@ def test_pipeline_fit(self, fit_dictionary, backbone, head):
3232
assert backbone == config.get('network_backbone:__choice__', None)
3333
assert head == config.get('network_head:__choice__', None)
3434
pipeline.set_hyperparameters(config)
35+
36+
# Need more epochs to make sure validation performance is met
37+
fit_dictionary['epochs'] = 100
38+
# Early stop to the best configuration seen
39+
fit_dictionary['early_stopping'] = 50
40+
3541
pipeline.fit(fit_dictionary)
3642

3743
# To make sure we fitted the model, there should be a
@@ -44,9 +50,28 @@ def test_pipeline_fit(self, fit_dictionary, backbone, head):
4450
assert run_summary.total_parameter_count > 0
4551
assert 'accuracy' in run_summary.performance_tracker['train_metrics'][1]
4652

47-
# Commented out the next line as some pipelines are not
48-
# achieving this accuracy with default configuration and 10 epochs
49-
# To be added once we fix the search space
50-
# assert run_summary.performance_tracker['val_metrics'][fit_dictionary['epochs']]['accuracy'] >= 0.8
53+
# Make sure default pipeline achieves a good score for dummy datasets
54+
epoch2loss = run_summary.performance_tracker['val_loss']
55+
best_loss = min(list(epoch2loss.values()))
56+
epoch_where_best = list(epoch2loss.keys())[list(epoch2loss.values()).index(best_loss)]
57+
score = run_summary.performance_tracker['val_metrics'][epoch_where_best]['accuracy']
58+
59+
assert score >= 0.8, run_summary.performance_tracker['val_metrics']
60+
61+
# Check that early stopping happened, if it did
62+
63+
# We should not stop before patience
64+
assert run_summary.get_last_epoch() >= fit_dictionary['early_stopping']
65+
66+
# we should not be greater than max allowed epoch
67+
assert run_summary.get_last_epoch() <= fit_dictionary['epochs']
68+
69+
# every trained epoch has a val metric
70+
assert run_summary.get_last_epoch() == max(list(run_summary.performance_tracker['train_metrics'].keys()))
71+
72+
epochs_since_best = run_summary.get_last_epoch() - run_summary.get_best_epoch()
73+
if epochs_since_best >= fit_dictionary['early_stopping']:
74+
assert run_summary.get_best_epoch() == epoch_where_best
75+
5176
# Make sure a network was fit
5277
assert isinstance(pipeline.named_steps['network'].get_network(), torch.nn.Module)

test/test_pipeline/components/test_setup_preprocessing_node.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -46,7 +46,7 @@ def test_tabular_preprocess(self):
4646
val_indices=[6, 7, 8, 9],
4747
dataset_properties=dataset_properties,
4848
# Training configuration
49-
job_id='test',
49+
num_run=15,
5050
device='cpu',
5151
budget_type='epochs',
5252
epochs=10,
@@ -82,7 +82,7 @@ def test_tabular_no_preprocess(self):
8282
val_indices=[6, 7, 8, 9],
8383
dataset_properties=dataset_properties,
8484
# Training configuration
85-
job_id='test',
85+
num_run=16,
8686
device='cpu',
8787
budget_type='epochs',
8888
epochs=10,

0 commit comments

Comments
 (0)