@@ -202,7 +202,6 @@ def __init__(
202
202
self ._multiprocessing_context = 'forkserver'
203
203
if self .n_jobs == 1 :
204
204
self ._multiprocessing_context = 'fork'
205
- self ._dask_client = SingleThreadedClient ()
206
205
207
206
self .InputValidator : Optional [BaseInputValidator ] = None
208
207
@@ -698,8 +697,9 @@ def _search(
698
697
self ,
699
698
optimize_metric : str ,
700
699
dataset : BaseDataset ,
701
- budget_type : Optional [str ] = None ,
702
- budget : Optional [float ] = None ,
700
+ budget_type : str = 'epochs' ,
701
+ min_budget : int = 5 ,
702
+ max_budget : int = 50 ,
703
703
total_walltime_limit : int = 100 ,
704
704
func_eval_time_limit_secs : Optional [int ] = None ,
705
705
enable_traditional_pipeline : bool = True ,
@@ -728,13 +728,36 @@ def _search(
728
728
Providing X_train, y_train and dataset together is not supported.
729
729
optimize_metric (str): name of the metric that is used to
730
730
evaluate a pipeline.
731
- budget_type (Optional[ str] ):
731
+ budget_type (str):
732
732
Type of budget to be used when fitting the pipeline.
733
- Either 'epochs' or 'runtime'. If not provided, uses
734
- the default in the pipeline config ('epochs')
735
- budget (Optional[float]):
736
- Budget to fit a single run of the pipeline. If not
737
- provided, uses the default in the pipeline config
733
+ It can be one of:
734
+ + 'epochs': The training of each pipeline will be terminated after
735
+ a number of epochs have passed. This number of epochs is determined by the
736
+ min_budget/max_budget arguments of this method.
737
+ + 'runtime': The training of each pipeline will be terminated after
738
+ a number of seconds have passed. This number of seconds is determined by the
739
+ min_budget/max_budget arguments of this method. The overall fitting time of a pipeline is
740
+ controlled by func_eval_time_limit_secs. 'runtime' only controls the allocated
741
+ time to train a pipeline, but it does not consider the overall time it takes
742
+ to create a pipeline (data loading and preprocessing, other i/o operations, etc.).
743
+ budget_type will determine the units of min_budget/max_budget. If budget_type=='epochs'
744
+ is used, min_budget will refer to epochs whereas if budget_type=='runtime' then
745
+ min_budget will refer to seconds.
746
+ min_budget (int):
747
+ Auto-PyTorch uses `Hyperband <https://arxiv.org/abs/1603.06560>`_ to
748
+ trade-off resources between running many pipelines at min_budget and
749
+ running the top performing pipelines on max_budget.
750
+ min_budget states the minimum resource allocation a pipeline should have
751
+ so that we can compare and quickly discard bad performing models.
752
+ For example, if the budget_type is epochs, and min_budget=5, then we will
753
+ run every pipeline to a minimum of 5 epochs before performance comparison.
754
+ max_budget (int):
755
+ Auto-PyTorch uses `Hyperband <https://arxiv.org/abs/1603.06560>`_ to
756
+ trade-off resources between running many pipelines at min_budget and
757
+ running the top performing pipelines on max_budget.
758
+ max_budget states the maximum resource allocation a pipeline is going to
759
+ be run with. For example, if the budget_type is epochs, and max_budget=50,
760
+ then the pipeline training will be terminated after 50 epochs.
738
761
total_walltime_limit (int), (default=100): Time limit
739
762
in seconds for the search of appropriate models.
740
763
By increasing this value, autopytorch has a higher
@@ -843,23 +866,27 @@ def _search(
843
866
844
867
self .search_space = self .get_search_space (dataset )
845
868
846
- budget_config : Dict [str , Union [float , str ]] = {}
847
- if budget_type is not None and budget is not None :
848
- budget_config ['budget_type' ] = budget_type
849
- budget_config [budget_type ] = budget
850
- elif budget_type is not None or budget is not None :
851
- raise ValueError (
852
- "budget type was not specified in budget_config"
853
- )
869
+ # Incorporate budget to pipeline config
870
+ if budget_type not in ('epochs' , 'runtime' ):
871
+ raise ValueError ("Budget type must be one ('epochs', 'runtime')"
872
+ f" yet { budget_type } was provided" )
873
+ self .pipeline_options ['budget_type' ] = budget_type
874
+
875
+ # Here the budget is set to max because the SMAC intensifier can be:
876
+ # Hyperband: in this case the budget is determined on the fly and overwritten
877
+ # by the ExecuteTaFuncWithQueue
878
+ # SimpleIntensifier (and others): in this case, we use max_budget as a target
879
+ # budget, and hence the below line is honored
880
+ self .pipeline_options [budget_type ] = max_budget
854
881
855
882
if self .task_type is None :
856
883
raise ValueError ("Cannot interpret task type from the dataset" )
857
884
858
885
# If no dask client was provided, we create one, so that we can
859
886
# start an ensemble process in parallel to smbo optimize
860
- if (
861
- dask_client is None and ( self .ensemble_size > 0 or self . n_jobs > 1 )
862
- ) :
887
+ if self . n_jobs == 1 :
888
+ self ._dask_client = SingleThreadedClient ( )
889
+ elif dask_client is None :
863
890
self ._create_dask_client ()
864
891
else :
865
892
self ._dask_client = dask_client
@@ -878,7 +905,7 @@ def _search(
878
905
879
906
# Make sure that at least 2 models are created for the ensemble process
880
907
num_models = time_left_for_modelfit // func_eval_time_limit_secs
881
- if num_models < 2 :
908
+ if num_models < 2 and self . ensemble_size > 0 :
882
909
func_eval_time_limit_secs = time_left_for_modelfit // 2
883
910
self ._logger .warning (
884
911
"Capping the func_eval_time_limit_secs to {} to have "
@@ -978,7 +1005,9 @@ def _search(
978
1005
all_supported_metrics = self ._all_supported_metrics ,
979
1006
smac_scenario_args = smac_scenario_args ,
980
1007
get_smac_object_callback = get_smac_object_callback ,
981
- pipeline_config = {** self .pipeline_options , ** budget_config },
1008
+ pipeline_config = self .pipeline_options ,
1009
+ min_budget = min_budget ,
1010
+ max_budget = max_budget ,
982
1011
ensemble_callback = proc_ensemble ,
983
1012
logger_port = self ._logger_port ,
984
1013
# We do not increase the num_run here, this is something
@@ -1046,7 +1075,6 @@ def _search(
1046
1075
def refit (
1047
1076
self ,
1048
1077
dataset : BaseDataset ,
1049
- budget_config : Dict [str , Union [int , str ]] = {},
1050
1078
split_id : int = 0
1051
1079
) -> "BaseTask" :
1052
1080
"""
@@ -1058,14 +1086,16 @@ def refit(
1058
1086
This method fits all models found during a call to fit on the data
1059
1087
given. This method may also be used together with holdout to avoid
1060
1088
only using 66% of the training data to fit the final model.
1089
+
1090
+ Refit uses the estimator pipeline_config attribute, which the user
1091
+ can interact with via the get_pipeline_config()/set_pipeline_config()
1092
+ methods.
1093
+
1061
1094
Args:
1062
1095
dataset: (Dataset)
1063
1096
The argument that will provide the dataset splits. It can either
1064
1097
be a dictionary with the splits, or the dataset object which can
1065
1098
generate the splits based on different restrictions.
1066
- budget_config: (Optional[Dict[str, Union[int, str]]])
1067
- can contain keys from 'budget_type' and the budget
1068
- specified using 'epochs' or 'runtime'.
1069
1099
split_id: (int)
1070
1100
split id to fit on.
1071
1101
Returns:
@@ -1096,7 +1126,7 @@ def refit(
1096
1126
'split_id' : split_id ,
1097
1127
'num_run' : self ._backend .get_next_num_run (),
1098
1128
})
1099
- X .update ({ ** self .pipeline_options , ** budget_config } )
1129
+ X .update (self .pipeline_options )
1100
1130
if self .models_ is None or len (self .models_ ) == 0 or self .ensemble_ is None :
1101
1131
self ._load_models ()
1102
1132
@@ -1120,21 +1150,22 @@ def refit(
1120
1150
1121
1151
def fit (self ,
1122
1152
dataset : BaseDataset ,
1123
- budget_config : Dict [str , Union [int , str ]] = {},
1124
1153
pipeline_config : Optional [Configuration ] = None ,
1125
1154
split_id : int = 0 ) -> BasePipeline :
1126
1155
"""
1127
1156
Fit a pipeline on the given task for the budget.
1128
1157
A pipeline configuration can be specified; if None,
1129
1158
the default is used.
1159
+
1160
+ Fit uses the estimator pipeline_config attribute, which the user
1161
+ can interact with via the get_pipeline_config()/set_pipeline_config()
1162
+ methods.
1163
+
1130
1164
Args:
1131
1165
dataset: (Dataset)
1132
1166
The argument that will provide the dataset splits. It can either
1133
1167
be a dictionary with the splits, or the dataset object which can
1134
1168
generate the splits based on different restrictions.
1135
- budget_config: (Optional[Dict[str, Union[int, str]]])
1136
- can contain keys from 'budget_type' and the budget
1137
- specified using 'epochs' or 'runtime'.
1138
1169
split_id: (int) (default=0)
1139
1170
split id to fit on.
1140
1171
pipeline_config: (Optional[Configuration])
@@ -1175,7 +1206,7 @@ def fit(self,
1175
1206
'split_id' : split_id ,
1176
1207
'num_run' : self ._backend .get_next_num_run (),
1177
1208
})
1178
- X .update ({ ** self .pipeline_options , ** budget_config } )
1209
+ X .update (self .pipeline_options )
1179
1210
1180
1211
fit_and_suppress_warnings (self ._logger , pipeline , X , y = None )
1181
1212
0 commit comments