
Commit a044a19

Merge branch 'refactor_development_regularization_cocktails' into cocktail_fixes
2 parents 1488978 + 42a7676 commit a044a19

25 files changed: +224 −150 lines

autoPyTorch/api/base_task.py

Lines changed: 4 additions & 4 deletions
@@ -254,7 +254,7 @@ def get_dataset(self,
                     NoResamplingStrategyTypes]] = None,
                 resampling_strategy_args: Optional[Dict[str, Any]] = None,
                 dataset_name: Optional[str] = None,
-                return_only: Optional[bool] = False
+                update_dataset_attribute: Optional[bool] = True
                 ) -> BaseDataset:
         raise NotImplementedError("Function called on BaseTask, this can only be called by "
                                   "specific task which is a child of the BaseTask")
@@ -276,7 +276,7 @@ def set_pipeline_config(
             None
         """
         unknown_keys = []
-        for option, value in pipeline_config_kwargs.items():
+        for option in pipeline_config_kwargs.keys():
             if option in self.pipeline_options.keys():
                 pass
             else:
@@ -587,7 +587,7 @@ def _do_dummy_prediction(self) -> None:
             all_supported_metrics=self._all_supported_metrics
         )

-        status, cost, runtime, additional_info = ta.run(num_run, cutoff=self._time_for_task)
+        status, _, _, additional_info = ta.run(num_run, cutoff=self._time_for_task)
         if status == StatusType.SUCCESS:
             self._logger.info("Finished creating dummy predictions.")
         else:
@@ -1263,7 +1263,7 @@ def fit_pipeline(self,
                              resampling_strategy=resampling_strategy,
                              resampling_strategy_args=resampling_strategy_args,
                              dataset_name=dataset_name,
-                             return_only=True)
+                             update_dataset_attribute=False)

         # TAE expects each configuration to have a config_id.
         # For fitting a pipeline as it is not part of the
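The `return_only` flag is renamed across the API, with its polarity inverted. A minimal usage sketch of the new flag, assuming the tabular classification task and the toy data below (both illustrative, not part of this commit):

import numpy as np
from autoPyTorch.api.tabular_classification import TabularClassificationTask

X_train = np.random.random((100, 4))
y_train = np.random.randint(0, 2, size=100)
api = TabularClassificationTask()

# Old: return_only=True  -> new: update_dataset_attribute=False
# builds and returns a dataset without touching the estimator's state
dataset = api.get_dataset(X_train=X_train, y_train=y_train,
                          update_dataset_attribute=False)

# Old: return_only=False -> new: update_dataset_attribute=True (the default)
# additionally stores api.dataset and api.input_validator as a side effect
dataset = api.get_dataset(X_train=X_train, y_train=y_train)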

autoPyTorch/api/tabular_classification.py

Lines changed: 19 additions & 17 deletions
@@ -136,7 +136,7 @@ def get_dataset(self,
                     NoResamplingStrategyTypes]] = None,
                 resampling_strategy_args: Optional[Dict[str, Any]] = None,
                 dataset_name: Optional[str] = None,
-                return_only: Optional[bool] = False
+                update_dataset_attribute: Optional[bool] = True
                 ) -> BaseDataset:

         if dataset_name is None:
@@ -148,27 +148,27 @@ def get_dataset(self,

         # Create a validator object to make sure that the data provided by
         # the user matches the autopytorch requirements
-        InputValidator = TabularInputValidator(
+        input_validator = TabularInputValidator(
             is_classification=True,
             logger_port=self._logger_port,
         )

         # Fit a input validator to check the provided data
         # Also, an encoder is fit to both train and test data,
         # to prevent unseen categories during inference
-        InputValidator.fit(X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test)
+        input_validator.fit(X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test)

         dataset = TabularDataset(
             X=X_train, Y=y_train,
             X_test=X_test, Y_test=y_test,
-            validator=InputValidator,
+            validator=input_validator,
             resampling_strategy=resampling_strategy,
             resampling_strategy_args=resampling_strategy_args,
             dataset_name=dataset_name,
             seed=self.seed
         )
-        if not return_only:
-            self.InputValidator = InputValidator
+        if update_dataset_attribute:
+            self.input_validator = input_validator
             self.dataset = dataset

         return dataset
@@ -206,7 +206,7 @@ def search(
                 pipeline. Additionally, a holdout of this pairs (X_test, y_test) can
                 be provided to track the generalization performance of each stage.
             dataset_name (Optional[str]):
-                Name of the dayaset, if None, random value is used
+                Name of the dayaset, if None, time hashed value is used
             optimize_metric (str): name of the metric that is used to
                 evaluate a pipeline.
             budget_type (Optional[str]):
@@ -269,10 +269,12 @@ def search(

         """

-        assert isinstance(self.resampling_strategy, (CrossValTypes, HoldoutValTypes)), \
-            "Val Split is required for HPO search. " \
-            "Expected 'self.resampling_strategy' in" \
-            " '(CrossValTypes, HoldoutValTypes) got {}".format(self.resampling_strategy)
+        if not isinstance(self.resampling_strategy, (CrossValTypes, HoldoutValTypes)):
+            raise ValueError(
+                'Hyperparameter optimization requires a validation split. '
+                'Expected `self.resampling_strategy` to be either '
+                '(CrossValTypes, HoldoutValTypes), but got {}'.format(self.resampling_strategy)
+            )

         self.get_dataset(X_train=X_train,
                          y_train=y_train,
@@ -305,28 +307,28 @@ def predict(
                 batch_size: Optional[int] = None,
                 n_jobs: int = 1
                 ) -> np.ndarray:
-        if self.InputValidator is None or not self.InputValidator._is_fitted:
+        if self.input_validator is None or not self.input_validator._is_fitted:
             raise ValueError("predict() is only supported after calling search. Kindly call first "
                              "the estimator fit() method.")

-        X_test = self.InputValidator.feature_validator.transform(X_test)
+        X_test = self.input_validator.feature_validator.transform(X_test)
         predicted_probabilities = super().predict(X_test, batch_size=batch_size,
                                                   n_jobs=n_jobs)

-        if self.InputValidator.target_validator.is_single_column_target():
+        if self.input_validator.target_validator.is_single_column_target():
             predicted_indexes = np.argmax(predicted_probabilities, axis=1)
         else:
             predicted_indexes = (predicted_probabilities > 0.5).astype(int)

         # Allow to predict in the original domain -- that is, the user is not interested
         # in our encoded values
-        return self.InputValidator.target_validator.inverse_transform(predicted_indexes)
+        return self.input_validator.target_validator.inverse_transform(predicted_indexes)

     def predict_proba(self,
                       X_test: Union[np.ndarray, pd.DataFrame, List],
                       batch_size: Optional[int] = None, n_jobs: int = 1) -> np.ndarray:
-        if self.InputValidator is None or not self.InputValidator._is_fitted:
+        if self.input_validator is None or not self.input_validator._is_fitted:
             raise ValueError("predict() is only supported after calling search. Kindly call first "
                              "the estimator fit() method.")
-        X_test = self.InputValidator.feature_validator.transform(X_test)
+        X_test = self.input_validator.feature_validator.transform(X_test)
         return super().predict(X_test, batch_size=batch_size, n_jobs=n_jobs)
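The rewritten predict() decodes probabilities differently depending on the target layout; a standalone numpy sketch of the two branches (toy probabilities, not from the commit):

import numpy as np

# Single-column target (multiclass): take the most probable class per row.
probs = np.array([[0.1, 0.7, 0.2],
                  [0.6, 0.3, 0.1]])
print(np.argmax(probs, axis=1))       # [1 0]

# Multi-column target (e.g. multilabel): threshold each column at 0.5.
probs = np.array([[0.9, 0.2, 0.6],
                  [0.1, 0.8, 0.4]])
print((probs > 0.5).astype(int))      # [[1 0 1], [0 1 0]]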

autoPyTorch/api/tabular_regression.py

Lines changed: 15 additions & 13 deletions
@@ -128,7 +128,7 @@ def get_dataset(self,
                     NoResamplingStrategyTypes]] = None,
                 resampling_strategy_args: Optional[Dict[str, Any]] = None,
                 dataset_name: Optional[str] = None,
-                return_only: Optional[bool] = False
+                update_dataset_attribute: Optional[bool] = True
                 ) -> BaseDataset:

         if dataset_name is None:
@@ -140,27 +140,27 @@ def get_dataset(self,

         # Create a validator object to make sure that the data provided by
         # the user matches the autopytorch requirements
-        InputValidator = TabularInputValidator(
+        input_validator = TabularInputValidator(
             is_classification=False,
             logger_port=self._logger_port,
         )

         # Fit a input validator to check the provided data
         # Also, an encoder is fit to both train and test data,
         # to prevent unseen categories during inference
-        InputValidator.fit(X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test)
+        input_validator.fit(X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test)

         dataset = TabularDataset(
             X=X_train, Y=y_train,
             X_test=X_test, Y_test=y_test,
-            validator=InputValidator,
+            validator=input_validator,
             resampling_strategy=resampling_strategy,
             resampling_strategy_args=resampling_strategy_args,
             dataset_name=dataset_name,
             seed=self.seed
         )
-        if not return_only:
-            self.InputValidator = InputValidator
+        if update_dataset_attribute:
+            self.input_validator = input_validator
             self.dataset = dataset

         return dataset
@@ -255,10 +255,12 @@ def search(

         """

-        assert isinstance(self.resampling_strategy, (CrossValTypes, HoldoutValTypes)), \
-            "Val Split is required for HPO search. " \
-            "Expected 'self.resampling_strategy' in" \
-            " '(CrossValTypes, HoldoutValTypes) got {}".format(self.resampling_strategy)
+        if not isinstance(self.resampling_strategy, (CrossValTypes, HoldoutValTypes)):
+            raise ValueError(
+                'Hyperparameter optimization requires a validation split. '
+                'Expected `self.resampling_strategy` to be either '
+                '(CrossValTypes, HoldoutValTypes), but got {}'.format(self.resampling_strategy)
+            )

         self.get_dataset(X_train=X_train,
                          y_train=y_train,
@@ -291,14 +293,14 @@ def predict(
                 batch_size: Optional[int] = None,
                 n_jobs: int = 1
                 ) -> np.ndarray:
-        if self.InputValidator is None or not self.InputValidator._is_fitted:
+        if self.input_validator is None or not self.input_validator._is_fitted:
             raise ValueError("predict() is only supported after calling search. Kindly call first "
                              "the estimator fit() method.")

-        X_test = self.InputValidator.feature_validator.transform(X_test)
+        X_test = self.input_validator.feature_validator.transform(X_test)
         predicted_values = super().predict(X_test, batch_size=batch_size,
                                            n_jobs=n_jobs)

         # Allow to predict in the original domain -- that is, the user is not interested
         # in our encoded values
-        return self.InputValidator.target_validator.inverse_transform(predicted_values)
+        return self.input_validator.target_validator.inverse_transform(predicted_values)
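Both search() methods (and the evaluators further down) replace assert with an explicit raise. A minimal sketch of why this matters: assert statements are stripped when Python runs with -O, while a raised ValueError always fires. The helpers below are hypothetical, not part of the commit:

def check_with_assert(ok: bool) -> None:
    assert ok, "silently skipped under `python -O`"

def check_with_raise(ok: bool) -> None:
    if not ok:
        raise ValueError("raised regardless of interpreter flags")

check_with_raise(True)     # passes
# check_with_raise(False)  # -> ValueError, even under `python -O`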

autoPyTorch/data/tabular_feature_validator.py

Lines changed: 1 addition & 1 deletion
@@ -492,7 +492,7 @@ def infer_objects(self, X: pd.DataFrame) -> pd.DataFrame:
                 X[key] = X[key].astype(dtype.name)
             except Exception as e:
                 # Try inference if possible
-                self.logger.warning(f"Tried to cast column {key} to {dtype} caused {e}")
+                self.logger.warning(f'Casting the column {key} to {dtype} caused the exception {e}')
                 pass
         else:
             # Calling for the first time to infer the categories
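For reference, a self-contained reproduction of the cast-and-warn pattern the reworded message belongs to (toy column and dtype, standard logging in place of the class logger):

import logging
import pandas as pd

logger = logging.getLogger(__name__)
X = pd.DataFrame({'col': ['1', '2', 'not-a-number']})

try:
    X['col'] = X['col'].astype('int64')
except Exception as e:
    # Mirrors the new message: name the operation, then the exception.
    logger.warning(f"Casting the column col to int64 caused the exception {e}")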

autoPyTorch/evaluation/abstract_evaluator.py

Lines changed: 3 additions & 2 deletions
@@ -368,6 +368,8 @@ def __init__(self, backend: Backend,
         self.additional_metrics = get_metrics(dataset_properties=self.dataset_properties,
                                               all_supported_metrics=all_supported_metrics)

+        # See autoPyTorch/pipeline/components/base_component.py::autoPyTorchComponent for more details
+        # about fit_dictionary
         self.fit_dictionary: Dict[str, Any] = {'dataset_properties': self.dataset_properties}
         self._init_params = init_params
         self.fit_dictionary.update({
@@ -380,8 +382,7 @@ def __init__(self, backend: Backend,
         })

         # Update fit dictionary with metrics passed to the evaluator
-        metrics_dict: Dict[str, List[str]] = {'additional_metrics': []}
-        metrics_dict['additional_metrics'].append(self.metric.name)
+        metrics_dict: Dict[str, List[str]] = {'additional_metrics': [self.metric.name]}
         if all_supported_metrics:
             assert self.additional_metrics is not None
             for metric in self.additional_metrics:
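The metrics_dict change folds a two-step build into a list literal; the two forms are equivalent:

metric_name = 'accuracy'  # stand-in for self.metric.name

# Before: empty list, then append the primary metric.
metrics_dict = {'additional_metrics': []}
metrics_dict['additional_metrics'].append(metric_name)

# After: the same dictionary in one expression.
metrics_dict = {'additional_metrics': [metric_name]}
assert metrics_dict == {'additional_metrics': ['accuracy']}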

autoPyTorch/evaluation/fit_evaluator.py

Lines changed: 6 additions & 4 deletions
@@ -58,10 +58,12 @@ def __init__(self, backend: Backend, queue: Queue,
                          pipeline_config=pipeline_config,
                          search_space_updates=search_space_updates
                          )
-        assert isinstance(self.datamanager.resampling_strategy, NoResamplingStrategyTypes),\
-            "This Evaluator is used for fitting a pipeline on the whole dataset. " \
-            "Expected 'self.resampling_strategy' to be" \
-            " 'NoResamplingStrategyTypes' got {}".format(self.datamanager.resampling_strategy)
+        if not isinstance(self.datamanager.resampling_strategy, NoResamplingStrategyTypes):
+            raise ValueError(
+                "FitEvaluator needs to be fitted on the whole dataset and resampling_strategy "
+                "must be `NoResamplingStrategyTypes`, but got {}".format(
+                    self.datamanager.resampling_strategy
+                ))

         self.splits = self.datamanager.splits
         self.Y_target: Optional[np.ndarray] = None
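The same guard, pulled out as a standalone function for clarity (a hypothetical helper, not part of the commit; the import path is the one this branch appears to use). FitEvaluator trains on the full dataset, so any holdout/CV strategy indicates a caller error:

from autoPyTorch.datasets.resampling_strategy import NoResamplingStrategyTypes

def check_fit_evaluator_strategy(resampling_strategy) -> None:
    if not isinstance(resampling_strategy, NoResamplingStrategyTypes):
        raise ValueError(
            "FitEvaluator needs to be fitted on the whole dataset and "
            "resampling_strategy must be `NoResamplingStrategyTypes`, "
            "but got {}".format(resampling_strategy)
        )

check_fit_evaluator_strategy("holdout")  # -> ValueError: not a NoResamplingStrategyTypes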

autoPyTorch/evaluation/tae.py

Lines changed: 4 additions & 4 deletions
@@ -174,10 +174,9 @@ def __init__(
         elif isinstance(self.resampling_strategy, NoResamplingStrategyTypes):
             eval_function = autoPyTorch.evaluation.fit_evaluator.eval_function
         else:
-            raise ValueError("Unknown resampling strategy specified."
-                             "Expected resampling strategy to be in "
-                             "'(HoldoutValTypes, CrossValTypes, NoResamplingStrategyTypes)"
-                             "got {}".format(self.resampling_strategy))
+            raise ValueError("resampling strategy must be in "
+                             "(HoldoutValTypes, CrossValTypes, NoResamplingStrategyTypes), "
+                             "but got {}.".format(self.resampling_strategy))

         self.worst_possible_result = cost_for_crash

@@ -319,6 +318,7 @@ def run(
         info: typing.Optional[typing.List[RunValue]]
         additional_run_info: typing.Dict[str, typing.Any]
         try:
+            # By default, self.ta is fit_predict_try_except_decorator
             obj = pynisher.enforce_limits(**pynisher_arguments)(self.ta)
             obj(**obj_kwargs)
         except Exception as e:
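For context, a minimal sketch of the pynisher wrapping used above, against the pre-1.0 pynisher API this codebase depends on (the limits shown are illustrative; the TAE assembles pynisher_arguments from its own configuration):

import pynisher

def target_function(x: int) -> int:
    return 2 * x

# enforce_limits returns a wrapper that runs the target in a subprocess and
# aborts it if the wall-clock or memory limit is exceeded.
safe_target = pynisher.enforce_limits(wall_time_in_s=30, mem_in_mb=1024)(target_function)
result = safe_target(21)  # 42 on success, None if a limit was hit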

autoPyTorch/evaluation/train_evaluator.py

Lines changed: 8 additions & 5 deletions
@@ -71,11 +71,12 @@ def __init__(self, backend: Backend, queue: Queue,
                          pipeline_config=pipeline_config,
                          search_space_updates=search_space_updates
                          )
-        assert isinstance(self.datamanager.resampling_strategy, (CrossValTypes, HoldoutValTypes)),\
-            "This Evaluator is used for HPO Search. " \
-            "Val Split is required for HPO search. " \
-            "Expected 'self.resampling_strategy' in" \
-            " '(CrossValTypes, HoldoutValTypes)' got {}".format(self.datamanager.resampling_strategy)
+
+        if not isinstance(self.datamanager.resampling_strategy, (CrossValTypes, HoldoutValTypes)):
+            raise ValueError(
+                'TrainEvaluator expect to have (CrossValTypes, HoldoutValTypes) as '
+                'resampling_strategy, but got {}'.format(self.datamanager.resampling_strategy)
+            )

         self.splits = self.datamanager.splits
         if self.splits is None:
@@ -271,6 +272,8 @@ def _fit_and_predict(self, pipeline: BaseEstimator, fold: int, train_indices: Un

         self.indices[fold] = ((train_indices, test_indices))

+        # See autoPyTorch/pipeline/components/base_component.py::autoPyTorchComponent for more details
+        # about fit_dictionary
         X = {'train_indices': train_indices,
              'val_indices': test_indices,
              'split_id': fold,
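A trimmed illustration of the fit dictionary the new comments point to: a plain dict that pipeline components read their inputs from. The keys shown are the ones visible in this hunk plus dataset_properties from abstract_evaluator.py above; anything beyond that is an assumption:

X = {
    'train_indices': [0, 1, 2, 3],  # toy fold indices
    'val_indices': [4, 5],
    'split_id': 0,
    'dataset_properties': {'task_type': 'tabular_classification'},
}
train_idx = X['train_indices']  # components look up their inputs by key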

autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/utils.py

Lines changed: 4 additions & 0 deletions
@@ -12,8 +12,12 @@ def get_tabular_preprocessers(X: Dict[str, Any]) -> Dict[str, List[BaseEstimator
     Creates a dictionary with two keys,
     numerical- containing list of numerical preprocessors
     categorical- containing list of categorical preprocessors
+
     Args:
         X: fit dictionary
+            See autoPyTorch/pipeline/components/base_component.py::autoPyTorchComponent for more details
+            about fit_dictionary
+
     Returns:
         (Dict[str, List[BaseEstimator]]): dictionary with list of numerical and categorical preprocessors
     """

autoPyTorch/pipeline/components/setup/network_backbone/ShapedResNetBackbone.py

Lines changed: 10 additions & 10 deletions
@@ -31,11 +31,11 @@ def build_backbone(self, input_shape: Tuple[int, ...]) -> None:

         # use the get_shaped_neuron_counts to update the number of units
         neuron_counts = get_shaped_neuron_counts(
-            self.config['resnet_shape'],
-            in_features,
-            out_features,
-            self.config['max_units'],
-            self.config['num_groups'] + 2,
+            shape=self.config['resnet_shape'],
+            in_feat=in_features,
+            out_feat=out_features,
+            max_neurons=self.config['max_units'],
+            layer_count=self.config['num_groups'] + 2,
         )[:-1]
         self.config.update(
             {"num_units_%d" % (i): num for i, num in enumerate(neuron_counts)}
@@ -46,11 +46,11 @@ def build_backbone(self, input_shape: Tuple[int, ...]) -> None:
         # nr of units for the architecture, since, it is mostly implemented for the
         # output layer, which is part of the head and not of the backbone.
         dropout_shape = get_shaped_neuron_counts(
-            self.config['dropout_shape'],
-            0,
-            0,
-            self.config["max_dropout"],
-            self.config['num_groups'] + 1,
+            shape=self.config['dropout_shape'],
+            in_feat=0,
+            out_feat=0,
+            max_neurons=self.config["max_dropout"],
+            layer_count=self.config['num_groups'] + 1,
         )[:-1]

         self.config.update(
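Switching to keyword arguments makes these call sites self-documenting and immune to parameter reordering; a generic sketch of the failure mode being avoided (the stub below only borrows the keyword names from the diff):

def get_counts(shape, in_feat, out_feat, max_neurons, layer_count):
    # stub with the same parameter names as get_shaped_neuron_counts
    return [max_neurons] * layer_count

# Positional call: breaks silently if the parameter order ever changes.
get_counts('funnel', 10, 2, 128, 5)

# Keyword call, as in the diff: explicit and order-independent.
get_counts(shape='funnel', in_feat=10, out_feat=2, max_neurons=128, layer_count=5)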

autoPyTorch/pipeline/components/setup/network_backbone/utils.py

Lines changed: 19 additions & 0 deletions
@@ -33,6 +33,13 @@ def get_output_shape(network: torch.nn.Module, input_shape: typing.Tuple[int, ..


 class ShakeShakeFunction(Function):
+    """
+    References:
+        Title: Shake-Shake regularization
+        Authors: Xavier Gastaldi
+        URL: https://arxiv.org/pdf/1705.07485.pdf
+        Github URL: https://github.com/hysts/pytorch_shake_shake/blob/master/functions/shake_shake_function.py
+    """
     @staticmethod
     def forward(
         ctx: typing.Any,  # No typing for AutogradContext
@@ -65,6 +72,18 @@ def backward(ctx: typing.Any,


 class ShakeDropFunction(Function):
+    """
+    References:
+        Title: ShakeDrop Regularization for Deep Residual Learning
+        Authors: Yoshihiro Yamada et. al.
+        URL: https://arxiv.org/pdf/1802.02375.pdf
+
+        Title: ShakeDrop Regularization
+        Authors: Yoshihiro Yamada et. al.
+        URL: https://openreview.net/pdf?id=S1NHaMW0b
+
+        Github URL: https://github.com/owruby/shake-drop_pytorch/blob/master/models/shakedrop.py
+    """
     @staticmethod
     def forward(ctx: typing.Any,
                 x: torch.Tensor,
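The newly referenced papers mix residual branches with random weights at training time; a minimal standalone sketch of the shake-shake forward idea (illustrative only, not the autograd Function defined in this file):

import torch

branch1 = torch.randn(8, 16)  # output of residual branch 1
branch2 = torch.randn(8, 16)  # output of residual branch 2
alpha = torch.rand(8, 1)      # fresh random convex weight per sample
out = alpha * branch1 + (1.0 - alpha) * branch2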
