
Commit 3f51d8b

[Fix] budget allocation to enable runtime/epoch as budget (#271)
* FIX_177
* Fix unit test
1 parent 54aab63 commit 3f51d8b

File tree

11 files changed: +282 -96 lines changed


autoPyTorch/api/base_task.py

Lines changed: 63 additions & 32 deletions
@@ -202,7 +202,6 @@ def __init__(
         self._multiprocessing_context = 'forkserver'
         if self.n_jobs == 1:
             self._multiprocessing_context = 'fork'
-            self._dask_client = SingleThreadedClient()

         self.InputValidator: Optional[BaseInputValidator] = None

@@ -698,8 +697,9 @@ def _search(
         self,
         optimize_metric: str,
         dataset: BaseDataset,
-        budget_type: Optional[str] = None,
-        budget: Optional[float] = None,
+        budget_type: str = 'epochs',
+        min_budget: int = 5,
+        max_budget: int = 50,
         total_walltime_limit: int = 100,
         func_eval_time_limit_secs: Optional[int] = None,
         enable_traditional_pipeline: bool = True,
@@ -728,13 +728,36 @@ def _search(
                Providing X_train, y_train and dataset together is not supported.
            optimize_metric (str): name of the metric that is used to
                evaluate a pipeline.
-            budget_type (Optional[str]):
+            budget_type (str):
                Type of budget to be used when fitting the pipeline.
-                Either 'epochs' or 'runtime'. If not provided, uses
-                the default in the pipeline config ('epochs')
-            budget (Optional[float]):
-                Budget to fit a single run of the pipeline. If not
-                provided, uses the default in the pipeline config
+                It can be one of:
+                + 'epochs': The training of each pipeline will be terminated after
+                  a number of epochs have passed. This number of epochs is determined
+                  by the min_budget/max_budget arguments of this method.
+                + 'runtime': The training of each pipeline will be terminated after
+                  a number of seconds have passed. This number of seconds is determined
+                  by the min_budget/max_budget arguments of this method. The overall
+                  fitting time of a pipeline is controlled by func_eval_time_limit_secs.
+                  'runtime' only controls the time allocated to train a pipeline; it does
+                  not consider the overall time it takes to create a pipeline (data loading
+                  and preprocessing, other i/o operations, etc.).
+                budget_type determines the units of min_budget/max_budget: with
+                budget_type=='epochs' they refer to epochs, with budget_type=='runtime'
+                they refer to seconds.
+            min_budget (int):
+                Auto-PyTorch uses `Hyperband <https://arxiv.org/abs/1603.06560>`_ to
+                trade off resources between running many pipelines at min_budget and
+                running the top-performing pipelines at max_budget.
+                min_budget states the minimum resource allocation a pipeline should have
+                so that we can compare and quickly discard badly performing models.
+                For example, if budget_type is epochs and min_budget=5, then every
+                pipeline is run for at least 5 epochs before performance comparison.
+            max_budget (int):
+                Auto-PyTorch uses `Hyperband <https://arxiv.org/abs/1603.06560>`_ to
+                trade off resources between running many pipelines at min_budget and
+                running the top-performing pipelines at max_budget.
+                max_budget states the maximum resource allocation a pipeline is going
+                to be run with. For example, if budget_type is epochs and max_budget=50,
+                then pipeline training will be terminated after 50 epochs.
            total_walltime_limit (int), (default=100): Time limit
                in seconds for the search of appropriate models.
                By increasing this value, autopytorch has a higher
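
To make the min_budget/max_budget trade-off concrete, here is a minimal sketch (not Auto-PyTorch code) of the successive-halving rungs a Hyperband-style intensifier could derive from these bounds; the halving factor eta=3 is an assumption, and SMAC chooses the real schedule internally:

def hyperband_rungs(min_budget: int, max_budget: int, eta: int = 3) -> list:
    """Budgets at which pipelines are compared and the weakest discarded."""
    rungs, budget = [], float(min_budget)
    while budget < max_budget:
        rungs.append(round(budget))
        budget *= eta  # promote survivors to an eta-times larger budget
    rungs.append(max_budget)
    return rungs

print(hyperband_rungs(min_budget=5, max_budget=50))  # -> [5, 15, 45, 50]

With budget_type='epochs' these rungs are epochs; with budget_type='runtime' they are seconds.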
@@ -843,23 +866,27 @@ def _search(

         self.search_space = self.get_search_space(dataset)

-        budget_config: Dict[str, Union[float, str]] = {}
-        if budget_type is not None and budget is not None:
-            budget_config['budget_type'] = budget_type
-            budget_config[budget_type] = budget
-        elif budget_type is not None or budget is not None:
-            raise ValueError(
-                "budget type was not specified in budget_config"
-            )
+        # Incorporate the budget into the pipeline config
+        if budget_type not in ('epochs', 'runtime'):
+            raise ValueError("Budget type must be one of ('epochs', 'runtime'),"
+                             f" yet {budget_type} was provided")
+        self.pipeline_options['budget_type'] = budget_type
+
+        # Here the budget is set to max because the SMAC intensifier can be:
+        # Hyperband: in this case the budget is determined on the fly and overwritten
+        #     by ExecuteTaFuncWithQueue
+        # SimpleIntensifier (and others): in this case, we use max_budget as a target
+        #     budget, and hence the line below is honored
+        self.pipeline_options[budget_type] = max_budget

         if self.task_type is None:
             raise ValueError("Cannot interpret task type from the dataset")

         # If no dask client was provided, we create one, so that we can
         # start an ensemble process in parallel to smbo optimize
-        if (
-            dask_client is None and (self.ensemble_size > 0 or self.n_jobs > 1)
-        ):
+        if self.n_jobs == 1:
+            self._dask_client = SingleThreadedClient()
+        elif dask_client is None:
             self._create_dask_client()
         else:
             self._dask_client = dask_client
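
For intuition, after this block the pipeline options carry the budget type and the budget ceiling; a sketch with assumed values:

# Assumed call: _search(budget_type='runtime', min_budget=30, max_budget=600)
# pipeline_options would then conceptually contain:
pipeline_options = {
    'budget_type': 'runtime',  # units of the budget
    'runtime': 600,            # max_budget; Hyperband overwrites this per run
}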
@@ -878,7 +905,7 @@ def _search(

         # Make sure that at least 2 models are created for the ensemble process
         num_models = time_left_for_modelfit // func_eval_time_limit_secs
-        if num_models < 2:
+        if num_models < 2 and self.ensemble_size > 0:
             func_eval_time_limit_secs = time_left_for_modelfit // 2
             self._logger.warning(
                 "Capping the func_eval_time_limit_secs to {} to have "
@@ -978,7 +1005,9 @@ def _search(
            all_supported_metrics=self._all_supported_metrics,
            smac_scenario_args=smac_scenario_args,
            get_smac_object_callback=get_smac_object_callback,
-            pipeline_config={**self.pipeline_options, **budget_config},
+            pipeline_config=self.pipeline_options,
+            min_budget=min_budget,
+            max_budget=max_budget,
            ensemble_callback=proc_ensemble,
            logger_port=self._logger_port,
            # We do not increase the num_run here, this is something
@@ -1046,7 +1075,6 @@ def _search(
     def refit(
         self,
         dataset: BaseDataset,
-        budget_config: Dict[str, Union[int, str]] = {},
         split_id: int = 0
     ) -> "BaseTask":
         """
@@ -1058,14 +1086,16 @@ def refit(
        This method fits all models found during a call to fit on the data
        given. This method may also be used together with holdout to avoid
        only using 66% of the training data to fit the final model.
+
+        Refit uses the estimator's pipeline_config attribute, with which the
+        user can interact via the get_pipeline_config()/set_pipeline_config()
+        methods.
+
        Args:
            dataset: (Dataset)
                The argument that will provide the dataset splits. It can either
                be a dictionary with the splits, or the dataset object which can
                generate the splits based on different restrictions.
-            budget_config: (Optional[Dict[str, Union[int, str]]])
-                can contain keys from 'budget_type' and the budget
-                specified using 'epochs' or 'runtime'.
            split_id: (int)
                split id to fit on.
        Returns:
@@ -1096,7 +1126,7 @@ def refit(
                'split_id': split_id,
                'num_run': self._backend.get_next_num_run(),
            })
-        X.update({**self.pipeline_options, **budget_config})
+        X.update(self.pipeline_options)
        if self.models_ is None or len(self.models_) == 0 or self.ensemble_ is None:
            self._load_models()
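A hedged usage sketch of the new refit flow; estimator and dataset are assumed to exist, and the keyword form of set_pipeline_config() is an assumption based on the docstring above:

# Instead of passing a budget_config dict, adjust the estimator's pipeline
# config and then refit:
estimator.set_pipeline_config(budget_type='epochs', epochs=50)  # keys assumed
print(estimator.get_pipeline_config())  # inspect the effective options
estimator.refit(dataset=dataset)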

@@ -1120,21 +1150,22 @@ def refit(

     def fit(self,
             dataset: BaseDataset,
-            budget_config: Dict[str, Union[int, str]] = {},
             pipeline_config: Optional[Configuration] = None,
             split_id: int = 0) -> BasePipeline:
        """
        Fit a pipeline on the given task for the budget.
        A pipeline configuration can be specified; if None,
        the default is used.
+
+        Fit uses the estimator's pipeline_config attribute, with which the
+        user can interact via the get_pipeline_config()/set_pipeline_config()
+        methods.
+
        Args:
            dataset: (Dataset)
                The argument that will provide the dataset splits. It can either
                be a dictionary with the splits, or the dataset object which can
                generate the splits based on different restrictions.
-            budget_config: (Optional[Dict[str, Union[int, str]]])
-                can contain keys from 'budget_type' and the budget
-                specified using 'epochs' or 'runtime'.
            split_id: (int) (default=0)
                split id to fit on.
            pipeline_config: (Optional[Configuration])
@@ -1175,7 +1206,7 @@ def fit(self,
                'split_id': split_id,
                'num_run': self._backend.get_next_num_run(),
            })
-        X.update({**self.pipeline_options, **budget_config})
+        X.update(self.pipeline_options)

        fit_and_suppress_warnings(self._logger, pipeline, X, y=None)

autoPyTorch/api/tabular_classification.py

Lines changed: 36 additions & 11 deletions
@@ -110,8 +110,9 @@ def search(
        X_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None,
        y_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None,
        dataset_name: Optional[str] = None,
-        budget_type: Optional[str] = None,
-        budget: Optional[float] = None,
+        budget_type: str = 'epochs',
+        min_budget: int = 5,
+        max_budget: int = 50,
        total_walltime_limit: int = 100,
        func_eval_time_limit_secs: Optional[int] = None,
        enable_traditional_pipeline: bool = True,
@@ -137,15 +138,38 @@ def search(
                be provided to track the generalization performance of each stage.
            optimize_metric (str):
                name of the metric that is used to evaluate a pipeline.
-            budget_type (Optional[str]):
+            budget_type (str):
                Type of budget to be used when fitting the pipeline.
-                Either 'epochs' or 'runtime'. If not provided, uses
-                the default in the pipeline config ('epochs')
-            budget (Optional[float]):
-                Budget to fit a single run of the pipeline. If not
-                provided, uses the default in the pipeline config
-            total_walltime_limit (int), (default=100):
-                Time limit in seconds for the search of appropriate models.
+                It can be one of:
+                + 'epochs': The training of each pipeline will be terminated after
+                  a number of epochs have passed. This number of epochs is determined
+                  by the min_budget/max_budget arguments of this method.
+                + 'runtime': The training of each pipeline will be terminated after
+                  a number of seconds have passed. This number of seconds is determined
+                  by the min_budget/max_budget arguments of this method. The overall
+                  fitting time of a pipeline is controlled by func_eval_time_limit_secs.
+                  'runtime' only controls the time allocated to train a pipeline; it does
+                  not consider the overall time it takes to create a pipeline (data loading
+                  and preprocessing, other i/o operations, etc.).
+                budget_type determines the units of min_budget/max_budget: with
+                budget_type=='epochs' they refer to epochs, with budget_type=='runtime'
+                they refer to seconds.
+            min_budget (int):
+                Auto-PyTorch uses `Hyperband <https://arxiv.org/abs/1603.06560>`_ to
+                trade off resources between running many pipelines at min_budget and
+                running the top-performing pipelines at max_budget.
+                min_budget states the minimum resource allocation a pipeline should have
+                so that we can compare and quickly discard badly performing models.
+                For example, if budget_type is epochs and min_budget=5, then every
+                pipeline is run for at least 5 epochs before performance comparison.
+            max_budget (int):
+                Auto-PyTorch uses `Hyperband <https://arxiv.org/abs/1603.06560>`_ to
+                trade off resources between running many pipelines at min_budget and
+                running the top-performing pipelines at max_budget.
+                max_budget states the maximum resource allocation a pipeline is going
+                to be run with. For example, if budget_type is epochs and max_budget=50,
+                then pipeline training will be terminated after 50 epochs.
+            total_walltime_limit (int), (default=100): Time limit
+                in seconds for the search of appropriate models.
                By increasing this value, autopytorch has a higher
                chance of finding better models.
            func_eval_time_limit_secs (int), (default=None):
@@ -234,7 +258,8 @@ def search(
            dataset=self.dataset,
            optimize_metric=optimize_metric,
            budget_type=budget_type,
-            budget=budget,
+            min_budget=min_budget,
+            max_budget=max_budget,
            total_walltime_limit=total_walltime_limit,
            func_eval_time_limit_secs=func_eval_time_limit_secs,
            enable_traditional_pipeline=enable_traditional_pipeline,
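
A minimal usage sketch of the changed public API; the data arrays and metric value are assumptions for illustration:

from autoPyTorch.api.tabular_classification import TabularClassificationTask

api = TabularClassificationTask()
api.search(
    X_train=X_train, y_train=y_train,  # assumed pre-loaded feature/label arrays
    optimize_metric='accuracy',
    budget_type='epochs',              # budgets measured in training epochs
    min_budget=5,                      # Hyperband's smallest allocation
    max_budget=50,                     # allocation for the top pipelines
    total_walltime_limit=300,
    func_eval_time_limit_secs=60,
)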

autoPyTorch/api/tabular_regression.py

Lines changed: 51 additions & 26 deletions
@@ -53,23 +53,23 @@ class TabularRegressionTask(BaseTask):
    """

    def __init__(
-            self,
-            seed: int = 1,
-            n_jobs: int = 1,
-            logging_config: Optional[Dict] = None,
-            ensemble_size: int = 50,
-            ensemble_nbest: int = 50,
-            max_models_on_disc: int = 50,
-            temporary_directory: Optional[str] = None,
-            output_directory: Optional[str] = None,
-            delete_tmp_folder_after_terminate: bool = True,
-            delete_output_folder_after_terminate: bool = True,
-            include_components: Optional[Dict] = None,
-            exclude_components: Optional[Dict] = None,
-            resampling_strategy: Union[CrossValTypes, HoldoutValTypes] = HoldoutValTypes.holdout_validation,
-            resampling_strategy_args: Optional[Dict[str, Any]] = None,
-            backend: Optional[Backend] = None,
-            search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None
+        self,
+        seed: int = 1,
+        n_jobs: int = 1,
+        logging_config: Optional[Dict] = None,
+        ensemble_size: int = 50,
+        ensemble_nbest: int = 50,
+        max_models_on_disc: int = 50,
+        temporary_directory: Optional[str] = None,
+        output_directory: Optional[str] = None,
+        delete_tmp_folder_after_terminate: bool = True,
+        delete_output_folder_after_terminate: bool = True,
+        include_components: Optional[Dict] = None,
+        exclude_components: Optional[Dict] = None,
+        resampling_strategy: Union[CrossValTypes, HoldoutValTypes] = HoldoutValTypes.holdout_validation,
+        resampling_strategy_args: Optional[Dict[str, Any]] = None,
+        backend: Optional[Backend] = None,
+        search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None
    ):
        super().__init__(
            seed=seed,
@@ -102,8 +102,9 @@ def search(
        X_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None,
        y_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None,
        dataset_name: Optional[str] = None,
-        budget_type: Optional[str] = None,
-        budget: Optional[float] = None,
+        budget_type: str = 'epochs',
+        min_budget: int = 5,
+        max_budget: int = 50,
        total_walltime_limit: int = 100,
        func_eval_time_limit_secs: Optional[int] = None,
        enable_traditional_pipeline: bool = True,
@@ -129,13 +130,36 @@ def search(
                be provided to track the generalization performance of each stage.
            optimize_metric (str): name of the metric that is used to
                evaluate a pipeline.
-            budget_type (Optional[str]):
+            budget_type (str):
                Type of budget to be used when fitting the pipeline.
-                Either 'epochs' or 'runtime'. If not provided, uses
-                the default in the pipeline config ('epochs')
-            budget (Optional[float]):
-                Budget to fit a single run of the pipeline. If not
-                provided, uses the default in the pipeline config
+                It can be one of:
+                + 'epochs': The training of each pipeline will be terminated after
+                  a number of epochs have passed. This number of epochs is determined
+                  by the min_budget/max_budget arguments of this method.
+                + 'runtime': The training of each pipeline will be terminated after
+                  a number of seconds have passed. This number of seconds is determined
+                  by the min_budget/max_budget arguments of this method. The overall
+                  fitting time of a pipeline is controlled by func_eval_time_limit_secs.
+                  'runtime' only controls the time allocated to train a pipeline; it does
+                  not consider the overall time it takes to create a pipeline (data loading
+                  and preprocessing, other i/o operations, etc.).
+                budget_type determines the units of min_budget/max_budget: with
+                budget_type=='epochs' they refer to epochs, with budget_type=='runtime'
+                they refer to seconds.
+            min_budget (int):
+                Auto-PyTorch uses `Hyperband <https://arxiv.org/abs/1603.06560>`_ to
+                trade off resources between running many pipelines at min_budget and
+                running the top-performing pipelines at max_budget.
+                min_budget states the minimum resource allocation a pipeline should have
+                so that we can compare and quickly discard badly performing models.
+                For example, if budget_type is epochs and min_budget=5, then every
+                pipeline is run for at least 5 epochs before performance comparison.
+            max_budget (int):
+                Auto-PyTorch uses `Hyperband <https://arxiv.org/abs/1603.06560>`_ to
+                trade off resources between running many pipelines at min_budget and
+                running the top-performing pipelines at max_budget.
+                max_budget states the maximum resource allocation a pipeline is going
+                to be run with. For example, if budget_type is epochs and max_budget=50,
+                then pipeline training will be terminated after 50 epochs.
            total_walltime_limit (int), (default=100): Time limit
                in seconds for the search of appropriate models.
                By increasing this value, autopytorch has a higher
@@ -227,7 +251,8 @@ def search(
            dataset=self.dataset,
            optimize_metric=optimize_metric,
            budget_type=budget_type,
-            budget=budget,
+            min_budget=min_budget,
+            max_budget=max_budget,
            total_walltime_limit=total_walltime_limit,
            func_eval_time_limit_secs=func_eval_time_limit_secs,
            enable_traditional_pipeline=enable_traditional_pipeline,
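
The same change on the regression side; a sketch with a runtime budget, again with assumed data and metric:

from autoPyTorch.api.tabular_regression import TabularRegressionTask

api = TabularRegressionTask()
api.search(
    X_train=X_train, y_train=y_train,  # assumed pre-loaded arrays
    optimize_metric='r2',
    budget_type='runtime',             # budgets now measured in seconds
    min_budget=30,                     # train every pipeline for at least 30 s
    max_budget=300,                    # top pipelines trained for up to 300 s
)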
