From eade3871d8bde91da79b64950cb8d3419a7f4e3c Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Tue, 3 Aug 2021 18:35:46 +0200 Subject: [PATCH 01/54] preprocess inside data validator --- autoPyTorch/data/tabular_feature_validator.py | 234 +++++++----------- .../TabularColumnTransformer.py | 16 +- .../encoding/base_encoder.py | 2 +- .../imputation/base_imputer.py | 2 +- .../scaling/base_scaler.py | 2 +- .../base_network_embedding.py | 33 +-- 6 files changed, 120 insertions(+), 169 deletions(-) diff --git a/autoPyTorch/data/tabular_feature_validator.py b/autoPyTorch/data/tabular_feature_validator.py index 4c8a8fbc2..69ff55fa5 100644 --- a/autoPyTorch/data/tabular_feature_validator.py +++ b/autoPyTorch/data/tabular_feature_validator.py @@ -9,14 +9,62 @@ import scipy.sparse import sklearn.utils -from sklearn import preprocessing + from sklearn.base import BaseEstimator from sklearn.compose import ColumnTransformer from sklearn.exceptions import NotFittedError +from sklearn.pipeline import make_pipeline +from sklearn.preprocessing import OneHotEncoder +from sklearn.impute import SimpleImputer +from sklearn.preprocessing import StandardScaler from autoPyTorch.data.base_feature_validator import BaseFeatureValidator, SUPPORTED_FEAT_TYPES +def _create_column_transformer(preprocessors: typing.Dict, numerical_columns, categorical_columns): + numerical_pipeline = 'drop' + categorical_pipeline = 'drop' + if len(numerical_columns) > 0: + numerical_pipeline = make_pipeline(*preprocessors['numerical']) + if len(categorical_columns) > 0: + categorical_pipeline = make_pipeline(*preprocessors['categorical']) + + return ColumnTransformer([ + ('categorical_pipeline', categorical_pipeline, categorical_columns), + ('numerical_pipeline', numerical_pipeline, numerical_columns)], + remainder='passthrough' + ) + + +def get_tabular_preprocessors(): + preprocessors = dict() + preprocessors['numerical'] = list() + preprocessors['categorical'] = list() + + preprocessors['categorical'].append(SimpleImputer(strategy='constant', + # Train data is numpy + # as of this point, where + # Ordinal Encoding is using + # for categorical. Only + # Numbers are allowed + # fill_value='!missing!', + fill_value=-1, + copy=False)) + + # preprocessors['categorical'].append(("ordinal-encoder", OrdinalEncoder( + # handle_unknown='use_encoded_value', + # unknown_value=-1))) + preprocessors['categorical'].append(OneHotEncoder( + categories='auto', + sparse=False, + handle_unknown='ignore')) + preprocessors['numerical'].append(SimpleImputer(strategy='median', + copy=False)) + preprocessors['numerical'].append(StandardScaler(with_mean=True, with_std=True, copy=False)) + + return preprocessors + + class TabularFeatureValidator(BaseFeatureValidator): def _fit( self, @@ -43,73 +91,50 @@ def _fit( if hasattr(X, "iloc") and not scipy.sparse.issparse(X): X = typing.cast(pd.DataFrame, X) - # Treat a column with all instances a NaN as numerical - # This will prevent doing encoding to a categorical column made completely - # out of nan values -- which will trigger a fail, as encoding is not supported - # with nan values. 
- # Columns that are completely made of NaN values are provided to the pipeline - # so that later stages decide how to handle them - - # Clear whatever null column markers we had previously - self.null_columns.clear() - if np.any(pd.isnull(X)): - for column in X.columns: - if X[column].isna().all(): - self.null_columns.add(column) - X[column] = pd.to_numeric(X[column]) - # Also note this change in self.dtypes - if len(self.dtypes) != 0: - self.dtypes[list(X.columns).index(column)] = X[column].dtype if not X.select_dtypes(include='object').empty: X = self.infer_objects(X) self._check_data(X) - self.enc_columns, self.feat_type = self._get_columns_to_encode(X) - - if len(self.enc_columns) > 0: - X = self.impute_nan_in_categories(X) - - self.encoder = ColumnTransformer( - [ - ("encoder", - preprocessing.OrdinalEncoder( - handle_unknown='use_encoded_value', - unknown_value=-1, - ), self.enc_columns)], - remainder="passthrough" - ) + categorical_columns, numerical_columns, feat_type = self._get_columns_info(X) + + preprocessors = get_tabular_preprocessors() + self.column_transformer = _create_column_transformer(preprocessors=preprocessors, + numerical_columns=numerical_columns, + categorical_columns=categorical_columns) + + # Mypy redefinition + assert self.column_transformer is not None + self.column_transformer.fit(X) + + # The column transformer reoders the feature types - we therefore need to change + # it as well + # This means columns are shifted to the right + def comparator(cmp1: str, cmp2: str) -> int: + if ( + cmp1 == 'categorical' and cmp2 == 'categorical' + or cmp1 == 'numerical' and cmp2 == 'numerical' + ): + return 0 + elif cmp1 == 'categorical' and cmp2 == 'numerical': + return -1 + elif cmp1 == 'numerical' and cmp2 == 'categorical': + return 1 + else: + raise ValueError((cmp1, cmp2)) - # Mypy redefinition - assert self.encoder is not None - self.encoder.fit(X) - - # The column transformer reoders the feature types - we therefore need to change - # it as well - # This means columns are shifted to the right - def comparator(cmp1: str, cmp2: str) -> int: - if ( - cmp1 == 'categorical' and cmp2 == 'categorical' - or cmp1 == 'numerical' and cmp2 == 'numerical' - ): - return 0 - elif cmp1 == 'categorical' and cmp2 == 'numerical': - return -1 - elif cmp1 == 'numerical' and cmp2 == 'categorical': - return 1 - else: - raise ValueError((cmp1, cmp2)) - - self.feat_type = sorted( - self.feat_type, - key=functools.cmp_to_key(comparator) - ) + self.feat_type = sorted( + feat_type, + key=functools.cmp_to_key(comparator) + ) + if len(categorical_columns) > 0: + print(self.column_transformer.named_transformers_['categorical_pipeline'].named_steps) self.categories = [ # We fit an ordinal encoder, where all categorical # columns are shifted to the left list(range(len(cat))) - for cat in self.encoder.transformers_[0][1].categories_ + for cat in self.column_transformer.named_transformers_['categorical_pipeline'].named_steps['onehotencoder'].categories_ ] for i, type_ in enumerate(self.feat_type): @@ -151,23 +176,6 @@ def transform( if hasattr(X, "iloc") and not scipy.sparse.issparse(X): X = typing.cast(pd.DataFrame, X) - # If we had null columns in our fit call and we made them numeric, then: - # - If the columns are null even in transform, apply the same procedure. - # - Otherwise, substitute the values with np.NaN and then make the columns numeric. - # If the column is null here, but it was not in fit, it does not matter. 
- for column in self.null_columns: - # The column is not null, make it null since it was null in fit. - if not X[column].isna().all(): - X[column] = np.NaN - X[column] = pd.to_numeric(X[column]) - - # for the test set, if we have columns with only null values - # they will probably have a numeric type. If these columns were not - # with only null values in the train set, they should be converted - # to the type that they had during fitting. - for column in X.columns: - if X[column].isna().all(): - X[column] = X[column].astype(self.dtypes[list(X.columns).index(column)]) # Also remove the object dtype for new data if not X.select_dtypes(include='object').empty: @@ -177,10 +185,7 @@ def transform( self._check_data(X) # We also need to fillna on the transformation # in case test data is provided - X = self.impute_nan_in_categories(X) - - if self.encoder is not None: - X = self.encoder.transform(X) + X = self.column_transformer.transform(X) # Sparse related transformations # Not all sparse format support index sorting @@ -254,7 +259,7 @@ def _check_data( # Define the column to be encoded here as the feature validator is fitted once # per estimator - enc_columns, _ = self._get_columns_to_encode(X) + # enc_columns, _ = self._get_columns_to_encode(X) column_order = [column for column in X.columns] if len(self.column_order) > 0: @@ -279,10 +284,10 @@ def _check_data( else: self.dtypes = dtypes - def _get_columns_to_encode( + def _get_columns_info( self, X: pd.DataFrame, - ) -> typing.Tuple[typing.List[str], typing.List[str]]: + ) -> typing.Tuple[typing.List[str], typing.List[str], typing.List[str]]: """ Return the columns to be encoded from a pandas dataframe @@ -297,8 +302,8 @@ def _get_columns_to_encode( Type of each column numerical/categorical """ # Register if a column needs encoding - enc_columns = [] - + numerical_columns = [] + categorical_columns = [] # Also, register the feature types for the estimator feat_type = [] @@ -306,7 +311,7 @@ def _get_columns_to_encode( for i, column in enumerate(X.columns): if X[column].dtype.name in ['category', 'bool']: - enc_columns.append(column) + categorical_columns.append(column) feat_type.append('categorical') # Move away from np.issubdtype as it causes # TypeError: data type not understood in certain pandas types @@ -348,7 +353,8 @@ def _get_columns_to_encode( ) else: feat_type.append('numerical') - return enc_columns, feat_type + numerical_columns.append(column) + return categorical_columns, numerical_columns, feat_type def list_to_dataframe( self, @@ -432,60 +438,4 @@ def infer_objects(self, X: pd.DataFrame) -> pd.DataFrame: X[column] = X[column].astype('category') self.object_dtype_mapping = {column: X[column].dtype for column in X.columns} self.logger.debug(f"Infer Objects: {self.object_dtype_mapping}") - return X - - def impute_nan_in_categories(self, X: pd.DataFrame) -> pd.DataFrame: - """ - impute missing values before encoding, - remove once sklearn natively supports - it in ordinal encoding. Sklearn issue: - "https://github.com/scikit-learn/scikit-learn/issues/17123)" - - Arguments: - X (pd.DataFrame): - data to be interpreted. 
- - Returns: - pd.DataFrame - """ - - # To be on the safe side, map always to the same missing - # value per column - if not hasattr(self, 'dict_nancol_to_missing'): - self.dict_missing_value_per_col: typing.Dict[str, typing.Any] = {} - - # First make sure that we do not alter the type of the column which cause: - # TypeError: '<' not supported between instances of 'int' and 'str' - # in the encoding - for column in self.enc_columns: - if X[column].isna().any(): - if column not in self.dict_missing_value_per_col: - try: - float(X[column].dropna().values[0]) - can_cast_as_number = True - except Exception: - can_cast_as_number = False - if can_cast_as_number: - # In this case, we expect to have a number as category - # it might be string, but its value represent a number - missing_value: typing.Union[str, int] = '-1' if isinstance(X[column].dropna().values[0], - str) else -1 - else: - missing_value = 'Missing!' - - # Make sure this missing value is not seen before - # Do this check for categorical columns - # else modify the value - if hasattr(X[column], 'cat'): - while missing_value in X[column].cat.categories: - if isinstance(missing_value, str): - missing_value += '0' - else: - missing_value += missing_value - self.dict_missing_value_per_col[column] = missing_value - - # Convert the frame in place - X[column].cat.add_categories([self.dict_missing_value_per_col[column]], - inplace=True) - X.fillna({column: self.dict_missing_value_per_col[column]}, inplace=True) - return X + return X \ No newline at end of file diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/TabularColumnTransformer.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/TabularColumnTransformer.py index e1e08e94e..46feb0ac4 100644 --- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/TabularColumnTransformer.py +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/TabularColumnTransformer.py @@ -48,14 +48,14 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> "TabularColumnTransformer": "TabularColumnTransformer": an instance of self """ self.check_requirements(X, y) - numerical_pipeline = 'drop' - categorical_pipeline = 'drop' - - preprocessors = get_tabular_preprocessers(X) - if len(X['dataset_properties']['numerical_columns']): - numerical_pipeline = make_pipeline(*preprocessors['numerical']) - if len(X['dataset_properties']['categorical_columns']): - categorical_pipeline = make_pipeline(*preprocessors['categorical']) + numerical_pipeline = 'passthrough' + categorical_pipeline = 'passthrough' + + # preprocessors = get_tabular_preprocessers(X) + # if len(X['dataset_properties']['numerical_columns']): + # numerical_pipeline = make_pipeline(*preprocessors['numerical']) + # if len(X['dataset_properties']['categorical_columns']): + # categorical_pipeline = make_pipeline(*preprocessors['categorical']) self.preprocessor = ColumnTransformer([ ('numerical_pipeline', numerical_pipeline, X['dataset_properties']['numerical_columns']), diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/encoding/base_encoder.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/encoding/base_encoder.py index eadc0a188..9829cadcd 100644 --- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/encoding/base_encoder.py +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/encoding/base_encoder.py @@ -28,5 +28,5 @@ def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: if 
self.preprocessor['numerical'] is None and self.preprocessor['categorical'] is None: raise ValueError("cant call transform on {} without fitting first." .format(self.__class__.__name__)) - X.update({'encoder': self.preprocessor}) + # X.update({'encoder': self.preprocessor}) return X diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/imputation/base_imputer.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/imputation/base_imputer.py index b65f3c229..ac0648481 100644 --- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/imputation/base_imputer.py +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/imputation/base_imputer.py @@ -29,5 +29,5 @@ def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: if self.preprocessor['numerical'] is None and self.preprocessor['categorical'] is None: raise ValueError("cant call transform on {} without fitting first." .format(self.__class__.__name__)) - X.update({'imputer': self.preprocessor}) + # X.update({'imputer': self.preprocessor}) return X diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/scaling/base_scaler.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/scaling/base_scaler.py index 39834dd2b..270fac246 100644 --- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/scaling/base_scaler.py +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/scaling/base_scaler.py @@ -28,5 +28,5 @@ def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: if self.preprocessor['numerical'] is None and self.preprocessor['categorical'] is None: raise ValueError("cant call transform on {} without fitting first." .format(self.__class__.__name__)) - X.update({'scaler': self.preprocessor}) + # X.update({'scaler': self.preprocessor}) return X diff --git a/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py b/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py index 5ae2880ed..42cbc62bb 100644 --- a/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py +++ b/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py @@ -33,19 +33,20 @@ def build_embedding(self, num_input_features: np.ndarray, num_numerical_features def _get_args(self, X: Dict[str, Any]) -> Tuple[int, np.ndarray]: # Feature preprocessors can alter numerical columns - if len(X['dataset_properties']['numerical_columns']) == 0: - num_numerical_columns = 0 - else: - X_train = copy.deepcopy(X['backend'].load_datamanager().train_tensors[0][:2]) - - numerical_column_transformer = X['tabular_transformer'].preprocessor. \ - named_transformers_['numerical_pipeline'] - num_numerical_columns = numerical_column_transformer.transform( - X_train[:, X['dataset_properties']['numerical_columns']]).shape[1] - num_input_features = np.zeros((num_numerical_columns + len(X['dataset_properties']['categorical_columns'])), - dtype=int) - categories = X['dataset_properties']['categories'] - - for i, category in enumerate(categories): - num_input_features[num_numerical_columns + i, ] = len(category) - return num_numerical_columns, num_input_features + # if len(X['dataset_properties']['numerical_columns']) == 0: + # num_numerical_columns = 0 + # else: + # X_train = copy.deepcopy(X['backend'].load_datamanager().train_tensors[0][:2]) + # + # numerical_column_transformer = X['tabular_transformer'].preprocessor. 
\ + # named_transformers_['numerical_pipeline'] + # num_numerical_columns = numerical_column_transformer.transform( + # X_train[:, X['dataset_properties']['numerical_columns']]).shape[1] + # num_input_features = np.zeros((num_numerical_columns + len(X['dataset_properties']['categorical_columns'])), + # dtype=int) + # categories = X['dataset_properties']['categories'] + # + # for i, category in enumerate(categories): + # num_input_features[num_numerical_columns + i, ] = len(category) + # return num_numerical_columns, num_input_features + return None, None \ No newline at end of file From b76b05e57ff0d383c2e0e0b602386407ea8c391b Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Tue, 3 Aug 2021 18:40:32 +0200 Subject: [PATCH 02/54] add time debug statements --- .../tabular_preprocessing/TabularColumnTransformer.py | 7 ++++++- .../pipeline/components/training/trainer/base_trainer.py | 7 +++++++ .../components/training/trainer/base_trainer_choice.py | 4 +++- .../pipeline/components/training/trainer/cutout_utils.py | 2 ++ .../pipeline/components/training/trainer/mixup_utils.py | 2 ++ 5 files changed, 20 insertions(+), 2 deletions(-) diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/TabularColumnTransformer.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/TabularColumnTransformer.py index 46feb0ac4..5fcf5cfb5 100644 --- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/TabularColumnTransformer.py +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/TabularColumnTransformer.py @@ -4,7 +4,7 @@ from sklearn.compose import ColumnTransformer from sklearn.pipeline import make_pipeline - +import time import torch from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.base_tabular_preprocessing import ( @@ -23,6 +23,7 @@ def __init__(self, random_state: Optional[Union[np.random.RandomState, int]] = N self.add_fit_requirements([ FitRequirement('numerical_columns', (List,), user_defined=True, dataset_property=True), FitRequirement('categorical_columns', (List,), user_defined=True, dataset_property=True)]) + self.fit_time = None def get_column_transformer(self) -> ColumnTransformer: """ @@ -47,6 +48,8 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> "TabularColumnTransformer": Returns: "TabularColumnTransformer": an instance of self """ + start_time = time.time() + self.check_requirements(X, y) numerical_pipeline = 'passthrough' categorical_pipeline = 'passthrough' @@ -71,6 +74,8 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> "TabularColumnTransformer": X_train = X['backend'].load_datamanager().train_tensors[0] self.preprocessor.fit(X_train) + self.fit_time = time.time() - start_time + return self def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: diff --git a/autoPyTorch/pipeline/components/training/trainer/base_trainer.py b/autoPyTorch/pipeline/components/training/trainer/base_trainer.py index b77bb729a..934c6c315 100644 --- a/autoPyTorch/pipeline/components/training/trainer/base_trainer.py +++ b/autoPyTorch/pipeline/components/training/trainer/base_trainer.py @@ -221,6 +221,8 @@ def __init__(self, weighted_loss: int = 0, self.add_fit_requirements([ FitRequirement("is_cyclic_scheduler", (bool,), user_defined=False, dataset_property=False), ]) + self.batch_fit_times = [] + self.data_loading_times = [] def prepare( self, @@ -363,12 +365,16 @@ def train_epoch(self, train_loader: torch.utils.data.DataLoader, epoch: int, outputs_data = list() targets_data = list() + 
batch_load_start_time = time.time() for step, (data, targets) in enumerate(train_loader): + self.data_loading_times.append(time.time() - batch_load_start_time) + batch_train_start = time.time() if self.budget_tracker.is_max_time_reached(): break loss, outputs = self.train_step(data, targets) + self.batch_fit_times.append(time.time() - batch_train_start) # save for metric evaluation outputs_data.append(outputs.detach().cpu()) targets_data.append(targets.detach().cpu()) @@ -383,6 +389,7 @@ def train_epoch(self, train_loader: torch.utils.data.DataLoader, epoch: int, loss, epoch * len(train_loader) + step, ) + batch_load_start_time = time.time() if self.scheduler: if 'ReduceLROnPlateau' in self.scheduler.__class__.__name__: diff --git a/autoPyTorch/pipeline/components/training/trainer/base_trainer_choice.py b/autoPyTorch/pipeline/components/training/trainer/base_trainer_choice.py index 502445c14..27c64461e 100755 --- a/autoPyTorch/pipeline/components/training/trainer/base_trainer_choice.py +++ b/autoPyTorch/pipeline/components/training/trainer/base_trainer_choice.py @@ -77,6 +77,7 @@ def __init__(self, (torch.utils.data.DataLoader,), user_defined=False, dataset_property=False)] self.checkpoint_dir = None # type: Optional[str] + self.fit_time = None def get_fit_requirements(self) -> Optional[List[FitRequirement]]: return self._fit_requirements @@ -263,6 +264,7 @@ def fit(self, X: Dict[str, Any], y: Any = None, **kwargs: Any) -> autoPyTorchCom Returns: A instance of self """ + start_time = time.time() # Make sure that the prerequisites are there self.check_requirements(X, y) @@ -285,7 +287,7 @@ def fit(self, X: Dict[str, Any], y: Any = None, **kwargs: Any) -> autoPyTorchCom self.choice: autoPyTorchComponent = cast(autoPyTorchComponent, self.choice) if self.choice.use_snapshot_ensemble: X['network_snapshots'].extend(self.choice.model_snapshots) - + self.fit_time = time.time() - start_time return self.choice def _fit(self, X: Dict[str, Any], y: Any = None, **kwargs: Any) -> 'TrainerChoice': diff --git a/autoPyTorch/pipeline/components/training/trainer/cutout_utils.py b/autoPyTorch/pipeline/components/training/trainer/cutout_utils.py index c7feb2214..c58546a4c 100644 --- a/autoPyTorch/pipeline/components/training/trainer/cutout_utils.py +++ b/autoPyTorch/pipeline/components/training/trainer/cutout_utils.py @@ -53,6 +53,8 @@ def __init__(self, patch_ratio: float, self.lookahead_config = lookahead_config self.patch_ratio = patch_ratio self.cutout_prob = cutout_prob + self.batch_fit_times = [] + self.data_loading_times = [] def criterion_preparation(self, y_a: np.ndarray, y_b: np.ndarray = None, lam: float = 1.0 ) -> Callable: diff --git a/autoPyTorch/pipeline/components/training/trainer/mixup_utils.py b/autoPyTorch/pipeline/components/training/trainer/mixup_utils.py index a2325b91c..b1cf37972 100644 --- a/autoPyTorch/pipeline/components/training/trainer/mixup_utils.py +++ b/autoPyTorch/pipeline/components/training/trainer/mixup_utils.py @@ -51,6 +51,8 @@ def __init__(self, alpha: float, f'{Lookahead.__name__}:la_alpha': 0.6} self.lookahead_config = lookahead_config self.alpha = alpha + self.batch_fit_times = [] + self.data_loading_times = [] def criterion_preparation(self, y_a: np.ndarray, y_b: np.ndarray = None, lam: float = 1.0 ) -> Callable: From bbf9b07f6d48a1fd1441a4992f86c068c11b197a Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Tue, 3 Aug 2021 19:50:09 +0200 Subject: [PATCH 03/54] Add fixes for categorical data --- autoPyTorch/data/tabular_feature_validator.py | 76 +++++++++++++++++-- 1 
file changed, 68 insertions(+), 8 deletions(-) diff --git a/autoPyTorch/data/tabular_feature_validator.py b/autoPyTorch/data/tabular_feature_validator.py index 69ff55fa5..16185817b 100644 --- a/autoPyTorch/data/tabular_feature_validator.py +++ b/autoPyTorch/data/tabular_feature_validator.py @@ -16,7 +16,7 @@ from sklearn.pipeline import make_pipeline from sklearn.preprocessing import OneHotEncoder from sklearn.impute import SimpleImputer -from sklearn.preprocessing import StandardScaler +from sklearn.preprocessing import StandardScaler, OrdinalEncoder from autoPyTorch.data.base_feature_validator import BaseFeatureValidator, SUPPORTED_FEAT_TYPES @@ -41,6 +41,10 @@ def get_tabular_preprocessors(): preprocessors['numerical'] = list() preprocessors['categorical'] = list() + preprocessors['categorical'].append(OneHotEncoder( + categories='auto', + sparse=False, + handle_unknown='ignore')) preprocessors['categorical'].append(SimpleImputer(strategy='constant', # Train data is numpy # as of this point, where @@ -51,13 +55,10 @@ def get_tabular_preprocessors(): fill_value=-1, copy=False)) - # preprocessors['categorical'].append(("ordinal-encoder", OrdinalEncoder( - # handle_unknown='use_encoded_value', - # unknown_value=-1))) - preprocessors['categorical'].append(OneHotEncoder( - categories='auto', - sparse=False, - handle_unknown='ignore')) + preprocessors['categorical'].append(OrdinalEncoder( + handle_unknown='use_encoded_value', + unknown_value=-1)) + preprocessors['numerical'].append(SimpleImputer(strategy='median', copy=False)) preprocessors['numerical'].append(StandardScaler(with_mean=True, with_std=True, copy=False)) @@ -98,6 +99,9 @@ def _fit( self._check_data(X) categorical_columns, numerical_columns, feat_type = self._get_columns_info(X) + self.enc_columns = categorical_columns + if len(categorical_columns) >= 0: + X = self.impute_nan_in_categories(X) preprocessors = get_tabular_preprocessors() self.column_transformer = _create_column_transformer(preprocessors=preprocessors, numerical_columns=numerical_columns, @@ -185,6 +189,8 @@ def transform( self._check_data(X) # We also need to fillna on the transformation # in case test data is provided + if len(self.categorical_columns) >= 0: + X = self.impute_nan_in_categories(X) X = self.column_transformer.transform(X) # Sparse related transformations @@ -438,4 +444,58 @@ def infer_objects(self, X: pd.DataFrame) -> pd.DataFrame: X[column] = X[column].astype('category') self.object_dtype_mapping = {column: X[column].dtype for column in X.columns} self.logger.debug(f"Infer Objects: {self.object_dtype_mapping}") + return X + + def impute_nan_in_categories(self, X: pd.DataFrame, categorical_columns=None) -> pd.DataFrame: + """ + impute missing values before encoding, + remove once sklearn natively supports + it in ordinal encoding. Sklearn issue: + "https://github.com/scikit-learn/scikit-learn/issues/17123)" + Arguments: + X (pd.DataFrame): + data to be interpreted. 
+ Returns: + pd.DataFrame + """ + + # To be on the safe side, map always to the same missing + # value per column + if not hasattr(self, 'dict_nancol_to_missing'): + self.dict_missing_value_per_col: typing.Dict[str, typing.Any] = {} + + # First make sure that we do not alter the type of the column which cause: + # TypeError: '<' not supported between instances of 'int' and 'str' + # in the encoding + for column in self.enc_columns: + if X[column].isna().any(): + if column not in self.dict_missing_value_per_col: + try: + float(X[column].dropna().values[0]) + can_cast_as_number = True + except Exception: + can_cast_as_number = False + if can_cast_as_number: + # In this case, we expect to have a number as category + # it might be string, but its value represent a number + missing_value: typing.Union[str, int] = '-1' if isinstance(X[column].dropna().values[0], + str) else -1 + else: + missing_value = 'Missing!' + + # Make sure this missing value is not seen before + # Do this check for categorical columns + # else modify the value + if hasattr(X[column], 'cat'): + while missing_value in X[column].cat.categories: + if isinstance(missing_value, str): + missing_value += '0' + else: + missing_value += missing_value + self.dict_missing_value_per_col[column] = missing_value + + # Convert the frame in place + X[column].cat.add_categories([self.dict_missing_value_per_col[column]], + inplace=True) + X.fillna({column: self.dict_missing_value_per_col[column]}, inplace=True) return X \ No newline at end of file From 99d74077fc9a145c1dd9b518fff439589f8b013c Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Thu, 5 Aug 2021 16:00:21 +0200 Subject: [PATCH 04/54] add fit_ensemble --- autoPyTorch/api/base_task.py | 164 ++++++++++++++++++++++++++++++----- autoPyTorch/utils/backend.py | 4 +- 2 files changed, 146 insertions(+), 22 deletions(-) diff --git a/autoPyTorch/api/base_task.py b/autoPyTorch/api/base_task.py index 11b0de273..c2e220875 100644 --- a/autoPyTorch/api/base_task.py +++ b/autoPyTorch/api/base_task.py @@ -189,6 +189,9 @@ def __init__( self.trajectory: Optional[List] = None self.dataset_name: Optional[str] = None self.cv_models_: Dict = {} + self.precision: Optional[int] = None + self.opt_metric: Optional[str] = None + self.dataset: Optional[BaseDataset] = None # By default try to use the TCP logging port or get a new port self._logger_port = logging.handlers.DEFAULT_TCP_LOGGING_PORT @@ -936,26 +939,12 @@ def _search( self._logger.info("Starting ensemble") ensemble_task_name = 'ensemble' self._stopwatch.start_task(ensemble_task_name) - proc_ensemble = EnsembleBuilderManager( - start_time=time.time(), - time_left_for_ensembles=time_left_for_ensembles, - backend=copy.deepcopy(self._backend), - dataset_name=str(dataset.dataset_name), - output_type=STRING_TO_OUTPUT_TYPES[dataset.output_type], - task_type=STRING_TO_TASK_TYPES[self.task_type], - metrics=[self._metric], - opt_metric=optimize_metric, - ensemble_size=self.ensemble_size, - ensemble_nbest=self.ensemble_nbest, - max_models_on_disc=self.max_models_on_disc, - seed=self.seed, - max_iterations=None, - read_at_most=sys.maxsize, - ensemble_memory_limit=self._memory_limit, - random_state=self.seed, - precision=precision, - logger_port=self._logger_port, - ) + proc_ensemble = self._init_ensemble_builder(time_left_for_ensembles=time_left_for_ensembles, + ensemble_size=self.ensemble_size, + ensemble_nbest=self.ensemble_nbest, + precision=precision, + optimize_metric=self.opt_metric + ) self._stopwatch.stop_task(ensemble_task_name) # ==> Run SMAC @@ 
-1333,6 +1322,141 @@ def fit_pipeline(self, return fitted_pipeline, run_info, run_value, dataset + def fit_ensemble( + self, + ensemble_nbest: int = 50, + ensemble_size: int = 50, + precision: int = 32, + load_models: bool = True + ) -> 'BaseTask': + """ + Enables post-hoc fitting of the ensemble after the `search()` + method is finished. This method creates an ensemble using all + the models stored on disk during the smbo run + Args: + ensemble_nbest (Optional[int]): + only consider the ensemble_nbest models to build the ensemble. + If None, uses the value stored in class attribute `ensemble_nbest`. + ensemble_size (int) (default=50): + Number of models added to the ensemble built by + Ensemble selection from libraries of models. + Models are drawn with replacement. + precision (int), (default=32): Numeric precision used when loading + ensemble data. Can be either 16, 32 or 64. + Returns: + self + """ + # Make sure that input is valid + if self.dataset is None or self.opt_metric is None: + raise ValueError("fit_ensemble() can only be called after `search()`. " + "Please call the `search()` method of {} prior to " + "fit_ensemble().".format(self.__class__.__name__)) + + if self._logger is None: + self._logger = self._get_logger(self.dataset.dataset_name) + + # Create a client if needed + if self._dask_client is None: + self._create_dask_client() + else: + self._is_dask_client_internally_created = False + + manager = self._init_ensemble_builder( + time_left_for_ensembles=self._time_for_task, + optimize_metric=self.opt_metric, + precision=precision, + ensemble_size=ensemble_size, + ensemble_nbest=ensemble_nbest, + ) + + manager.build_ensemble(self._dask_client) + future = manager.futures.pop() + result = future.result() + if result is None: + raise ValueError("Errors occurred while building the ensemble - please" + " check the log file and command line output for error messages.") + self.ensemble_performance_history, _, _, _ = result + + if load_models: + self._load_models() + if self._logger is not None: + self._logger.info("Closing the dask infrastructure") + self._close_dask_client() + self._logger.info("Finished closing the dask infrastructure") + + # Clean up the logger + self._logger.info("Starting to clean up the logger") + self._clean_logger() + else: + self._close_dask_client() + + return self + + def _init_ensemble_builder( + self, + time_left_for_ensembles: float, + optimize_metric: str, + ensemble_nbest: int, + ensemble_size: int, + precision: int = 32, + ) -> EnsembleBuilderManager: + """ + Initializes an `EnsembleBuilderManager`. + Args: + time_left_for_ensembles (float): + Time (in seconds) allocated to building the ensemble + optimize_metric (str): + Name of the metric to optimize the ensemble. + ensemble_nbest (int): + only consider the ensemble_nbest models to build the ensemble. + ensemble_size (int): + Number of models added to the ensemble built by + Ensemble selection from libraries of models. + Models are drawn with replacement. + precision (int), (default=32): Numeric precision used when loading + ensemble data. Can be either 16, 32 or 64. + Returns: + EnsembleBuilderManager + """ + if self._logger is None: + raise ValueError("logger should be initialized to fit ensemble") + if self.dataset is None: + raise ValueError("ensemble can only be initialised after or during `search()`. 
" + "Please call the `search()` method of {}.".format(self.__class__.__name__)) + + self._logger.info("Starting ensemble") + ensemble_task_name = 'ensemble' + self._stopwatch.start_task(ensemble_task_name) + + # Use the current thread to start the ensemble builder process + # The function ensemble_builder_process will internally create a ensemble + # builder in the provide dask client + required_dataset_properties = {'task_type': self.task_type, + 'output_type': self.dataset.output_type} + proc_ensemble = EnsembleBuilderManager( + start_time=time.time(), + time_left_for_ensembles=time_left_for_ensembles, + backend=copy.deepcopy(self._backend), + dataset_name=str(self.dataset.dataset_name), + output_type=STRING_TO_OUTPUT_TYPES[self.dataset.output_type], + task_type=STRING_TO_TASK_TYPES[self.task_type], + metrics=[self._metric] if self._metric is not None else get_metrics( + dataset_properties=required_dataset_properties, names=[optimize_metric]), + opt_metric=optimize_metric, + ensemble_size=ensemble_size, + ensemble_nbest=ensemble_nbest, + max_models_on_disc=self.max_models_on_disc, + seed=self.seed, + max_iterations=None, + read_at_most=sys.maxsize, + ensemble_memory_limit=self._memory_limit, + random_state=self.seed, + precision=precision, + logger_port=self._logger_port, + ) + self._stopwatch.stop_task(ensemble_task_name) + return proc_ensemble + def predict( self, X_test: np.ndarray, diff --git a/autoPyTorch/utils/backend.py b/autoPyTorch/utils/backend.py index c9681adb3..713c7d572 100644 --- a/autoPyTorch/utils/backend.py +++ b/autoPyTorch/utils/backend.py @@ -205,12 +205,12 @@ def temporary_directory(self) -> str: def _make_internals_directory(self) -> None: try: - os.makedirs(self.internals_directory) + os.makedirs(self.internals_directory, exist_ok=True) except Exception as e: if self._logger is not None: self._logger.debug("_make_internals_directory: %s" % e) try: - os.makedirs(self.get_runs_directory()) + os.makedirs(self.get_runs_directory(), exist_ok=True) except Exception as e: if self._logger is not None: self._logger.debug("_make_internals_directory: %s" % e) From 814477455752a762f86fccb7e955377416d8a92a Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Tue, 31 Aug 2021 11:54:11 +0200 Subject: [PATCH 05/54] add arlind fix for swa and se --- .../components/training/trainer/base_trainer.py | 10 ++++++++-- .../components/training/trainer/base_trainer_choice.py | 5 +++-- requirements.txt | 6 +++--- 3 files changed, 14 insertions(+), 7 deletions(-) diff --git a/autoPyTorch/pipeline/components/training/trainer/base_trainer.py b/autoPyTorch/pipeline/components/training/trainer/base_trainer.py index 934c6c315..188504da3 100644 --- a/autoPyTorch/pipeline/components/training/trainer/base_trainer.py +++ b/autoPyTorch/pipeline/components/training/trainer/base_trainer.py @@ -320,8 +320,14 @@ def on_epoch_end(self, X: Dict[str, Any], epoch: int) -> bool: if self.use_snapshot_ensemble: assert self.model_snapshots is not None, "model snapshots container can't be " \ "none when snapshot ensembling is enabled" - model_copy = deepcopy(self.swa_model) if self.use_stochastic_weight_averaging \ - else deepcopy(self.model) + if epoch == self.budget_tracker.max_epochs: + if self.use_stochastic_weight_averaging: + model_copy = deepcopy(self.swa_model) + else: + model_copy = deepcopy(self.model) + else: + model_copy = deepcopy(self.model) + assert model_copy is not None model_copy.cpu() self.model_snapshots.append(model_copy) diff --git 
a/autoPyTorch/pipeline/components/training/trainer/base_trainer_choice.py b/autoPyTorch/pipeline/components/training/trainer/base_trainer_choice.py index 27c64461e..2dcb8fe16 100755 --- a/autoPyTorch/pipeline/components/training/trainer/base_trainer_choice.py +++ b/autoPyTorch/pipeline/components/training/trainer/base_trainer_choice.py @@ -410,8 +410,9 @@ def _fit(self, X: Dict[str, Any], y: Any = None, **kwargs: Any) -> 'TrainerChoic # change model update_model_state_dict_from_swa(X['network'], self.choice.swa_model.state_dict()) if self.choice.use_snapshot_ensemble: - for model in self.choice.model_snapshots: - swa_utils.update_bn(X['train_data_loader'], model.double()) + swa_utils.update_bn(X['train_data_loader'], model.double()) + # we update only the last network which pertains to the stochastic weight averaging model + swa_utils.update_bn(X['train_data_loader'], self.choice.model_snapshots[-1].double()) # wrap up -- add score if not evaluating every epoch if not self.eval_valid_each_epoch(X): diff --git a/requirements.txt b/requirements.txt index c79104461..2195e64b4 100755 --- a/requirements.txt +++ b/requirements.txt @@ -1,10 +1,10 @@ pandas -torch -torchvision +torch<=1.8 +torchvision<=0.9 tensorboard scikit-learn>=0.24.0,<0.25.0 numpy -scipy +scipy==1.6.3 lockfile imgaug>=0.4.0 ConfigSpace>=0.4.14,<0.5 From 06ad6584746f5852dd4d31a3c7706ef44a218159 Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Mon, 6 Sep 2021 18:26:43 +0200 Subject: [PATCH 06/54] fix bug in trainer choice fit --- .../pipeline/components/training/trainer/base_trainer_choice.py | 1 - 1 file changed, 1 deletion(-) diff --git a/autoPyTorch/pipeline/components/training/trainer/base_trainer_choice.py b/autoPyTorch/pipeline/components/training/trainer/base_trainer_choice.py index 2dcb8fe16..7119df201 100755 --- a/autoPyTorch/pipeline/components/training/trainer/base_trainer_choice.py +++ b/autoPyTorch/pipeline/components/training/trainer/base_trainer_choice.py @@ -410,7 +410,6 @@ def _fit(self, X: Dict[str, Any], y: Any = None, **kwargs: Any) -> 'TrainerChoic # change model update_model_state_dict_from_swa(X['network'], self.choice.swa_model.state_dict()) if self.choice.use_snapshot_ensemble: - swa_utils.update_bn(X['train_data_loader'], model.double()) # we update only the last network which pertains to the stochastic weight averaging model swa_utils.update_bn(X['train_data_loader'], self.choice.model_snapshots[-1].double()) From 1942279d1ec5aabfcc2d2127ddc3bd9dfd056293 Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Wed, 8 Sep 2021 13:16:20 +0200 Subject: [PATCH 07/54] fix ensemble bug --- autoPyTorch/api/base_task.py | 35 +++++++++++++++++++++++------------ 1 file changed, 23 insertions(+), 12 deletions(-) diff --git a/autoPyTorch/api/base_task.py b/autoPyTorch/api/base_task.py index c2e220875..6ed0559e0 100644 --- a/autoPyTorch/api/base_task.py +++ b/autoPyTorch/api/base_task.py @@ -397,6 +397,7 @@ def _clean_logger(self) -> None: self.logging_server.join(timeout=5) self.logging_server.terminate() del self.stop_logging_server + self._logger = None def _create_dask_client(self) -> None: """ @@ -491,6 +492,23 @@ def _load_models(self) -> bool: return True + def _cleanup(self) -> None: + """ + Closes the different servers created during api search. 
+ Returns: + None + """ + if self._logger is not None: + self._logger.info("Closing the dask infrastructure") + self._close_dask_client() + self._logger.info("Finished closing the dask infrastructure") + + # Clean up the logger + self._logger.info("Starting to clean up the logger") + self._clean_logger() + else: + self._close_dask_client() + def _load_best_individual_model(self) -> SingleBest: """ In case of failure during ensemble building, @@ -923,6 +941,8 @@ def _search( self._stopwatch.stop_task(traditional_task_name) # ============> Starting ensemble + self.precision = precision + self.opt_metric = optimize_metric elapsed_time = self._stopwatch.wall_elapsed(self.dataset_name) time_left_for_ensembles = max(0, total_walltime_limit - elapsed_time) proc_ensemble = None @@ -1024,18 +1044,12 @@ def _search( pd.DataFrame(self.ensemble_performance_history).to_json( os.path.join(self._backend.internals_directory, 'ensemble_history.json')) - self._logger.info("Closing the dask infrastructure") - self._close_dask_client() - self._logger.info("Finished closing the dask infrastructure") - if load_models: self._logger.info("Loading models...") self._load_models() self._logger.info("Finished loading models...") - # Clean up the logger - self._logger.info("Starting to clean up the logger") - self._clean_logger() + self._cleanup() return self @@ -1506,7 +1520,7 @@ def predict( predictions = self.ensemble_.predict(all_predictions) - self._clean_logger() + self._cleanup() return predictions @@ -1543,10 +1557,7 @@ def __getstate__(self) -> Dict[str, Any]: return self.__dict__ def __del__(self) -> None: - # Clean up the logger - self._clean_logger() - - self._close_dask_client() + self._cleanup() # When a multiprocessing work is done, the # objects are deleted. We don't want to delete run areas From 2dc88500566ac67ad30018fcba00a4e7e62d1cb3 Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Wed, 8 Sep 2021 16:48:58 +0200 Subject: [PATCH 08/54] Correct bug in cleanup --- autoPyTorch/api/base_task.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/autoPyTorch/api/base_task.py b/autoPyTorch/api/base_task.py index 6ed0559e0..19951a3a5 100644 --- a/autoPyTorch/api/base_task.py +++ b/autoPyTorch/api/base_task.py @@ -498,7 +498,7 @@ def _cleanup(self) -> None: Returns: None """ - if self._logger is not None: + if hasattr(self, '_logger') and self._logger is not None: self._logger.info("Closing the dask infrastructure") self._close_dask_client() self._logger.info("Finished closing the dask infrastructure") From 06d80d471898b41b59104a3bc92800f02f275b54 Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Thu, 16 Sep 2021 14:44:10 +0200 Subject: [PATCH 09/54] Cleanup for removing time debug statements --- .../tabular_preprocessing/TabularColumnTransformer.py | 3 --- .../pipeline/components/training/trainer/base_trainer.py | 5 ----- .../components/training/trainer/base_trainer_choice.py | 3 --- .../pipeline/components/training/trainer/cutout_utils.py | 2 -- .../pipeline/components/training/trainer/mixup_utils.py | 2 -- 5 files changed, 15 deletions(-) diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/TabularColumnTransformer.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/TabularColumnTransformer.py index 5fcf5cfb5..c7ca61e09 100644 --- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/TabularColumnTransformer.py +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/TabularColumnTransformer.py @@ -23,7 +23,6 @@ 
def __init__(self, random_state: Optional[Union[np.random.RandomState, int]] = N self.add_fit_requirements([ FitRequirement('numerical_columns', (List,), user_defined=True, dataset_property=True), FitRequirement('categorical_columns', (List,), user_defined=True, dataset_property=True)]) - self.fit_time = None def get_column_transformer(self) -> ColumnTransformer: """ @@ -48,7 +47,6 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> "TabularColumnTransformer": Returns: "TabularColumnTransformer": an instance of self """ - start_time = time.time() self.check_requirements(X, y) numerical_pipeline = 'passthrough' @@ -74,7 +72,6 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> "TabularColumnTransformer": X_train = X['backend'].load_datamanager().train_tensors[0] self.preprocessor.fit(X_train) - self.fit_time = time.time() - start_time return self diff --git a/autoPyTorch/pipeline/components/training/trainer/base_trainer.py b/autoPyTorch/pipeline/components/training/trainer/base_trainer.py index 188504da3..6040f32e9 100644 --- a/autoPyTorch/pipeline/components/training/trainer/base_trainer.py +++ b/autoPyTorch/pipeline/components/training/trainer/base_trainer.py @@ -371,16 +371,12 @@ def train_epoch(self, train_loader: torch.utils.data.DataLoader, epoch: int, outputs_data = list() targets_data = list() - batch_load_start_time = time.time() for step, (data, targets) in enumerate(train_loader): - self.data_loading_times.append(time.time() - batch_load_start_time) - batch_train_start = time.time() if self.budget_tracker.is_max_time_reached(): break loss, outputs = self.train_step(data, targets) - self.batch_fit_times.append(time.time() - batch_train_start) # save for metric evaluation outputs_data.append(outputs.detach().cpu()) targets_data.append(targets.detach().cpu()) @@ -395,7 +391,6 @@ def train_epoch(self, train_loader: torch.utils.data.DataLoader, epoch: int, loss, epoch * len(train_loader) + step, ) - batch_load_start_time = time.time() if self.scheduler: if 'ReduceLROnPlateau' in self.scheduler.__class__.__name__: diff --git a/autoPyTorch/pipeline/components/training/trainer/base_trainer_choice.py b/autoPyTorch/pipeline/components/training/trainer/base_trainer_choice.py index 7119df201..a344e92ce 100755 --- a/autoPyTorch/pipeline/components/training/trainer/base_trainer_choice.py +++ b/autoPyTorch/pipeline/components/training/trainer/base_trainer_choice.py @@ -77,7 +77,6 @@ def __init__(self, (torch.utils.data.DataLoader,), user_defined=False, dataset_property=False)] self.checkpoint_dir = None # type: Optional[str] - self.fit_time = None def get_fit_requirements(self) -> Optional[List[FitRequirement]]: return self._fit_requirements @@ -264,7 +263,6 @@ def fit(self, X: Dict[str, Any], y: Any = None, **kwargs: Any) -> autoPyTorchCom Returns: A instance of self """ - start_time = time.time() # Make sure that the prerequisites are there self.check_requirements(X, y) @@ -287,7 +285,6 @@ def fit(self, X: Dict[str, Any], y: Any = None, **kwargs: Any) -> autoPyTorchCom self.choice: autoPyTorchComponent = cast(autoPyTorchComponent, self.choice) if self.choice.use_snapshot_ensemble: X['network_snapshots'].extend(self.choice.model_snapshots) - self.fit_time = time.time() - start_time return self.choice def _fit(self, X: Dict[str, Any], y: Any = None, **kwargs: Any) -> 'TrainerChoice': diff --git a/autoPyTorch/pipeline/components/training/trainer/cutout_utils.py b/autoPyTorch/pipeline/components/training/trainer/cutout_utils.py index c58546a4c..c7feb2214 100644 --- 
a/autoPyTorch/pipeline/components/training/trainer/cutout_utils.py +++ b/autoPyTorch/pipeline/components/training/trainer/cutout_utils.py @@ -53,8 +53,6 @@ def __init__(self, patch_ratio: float, self.lookahead_config = lookahead_config self.patch_ratio = patch_ratio self.cutout_prob = cutout_prob - self.batch_fit_times = [] - self.data_loading_times = [] def criterion_preparation(self, y_a: np.ndarray, y_b: np.ndarray = None, lam: float = 1.0 ) -> Callable: diff --git a/autoPyTorch/pipeline/components/training/trainer/mixup_utils.py b/autoPyTorch/pipeline/components/training/trainer/mixup_utils.py index b1cf37972..a2325b91c 100644 --- a/autoPyTorch/pipeline/components/training/trainer/mixup_utils.py +++ b/autoPyTorch/pipeline/components/training/trainer/mixup_utils.py @@ -51,8 +51,6 @@ def __init__(self, alpha: float, f'{Lookahead.__name__}:la_alpha': 0.6} self.lookahead_config = lookahead_config self.alpha = alpha - self.batch_fit_times = [] - self.data_loading_times = [] def criterion_preparation(self, y_a: np.ndarray, y_b: np.ndarray = None, lam: float = 1.0 ) -> Callable: From d8b553aa2262825786440c26779e1f39b142d6e5 Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Mon, 20 Sep 2021 15:55:20 +0200 Subject: [PATCH 10/54] ablation for adversarial --- .../components/training/trainer/AdversarialTrainer.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/autoPyTorch/pipeline/components/training/trainer/AdversarialTrainer.py b/autoPyTorch/pipeline/components/training/trainer/AdversarialTrainer.py index c5a536dd0..36d586919 100644 --- a/autoPyTorch/pipeline/components/training/trainer/AdversarialTrainer.py +++ b/autoPyTorch/pipeline/components/training/trainer/AdversarialTrainer.py @@ -189,12 +189,17 @@ def get_hyperparameter_search_space( default_value=3), epsilon: HyperparameterSearchSpace = HyperparameterSearchSpace( hyperparameter="epsilon", - value_range=(0.05, 0.2), - default_value=0.2), + value_range=(0.001, 0.15), + default_value=0.007, + log=True), ) -> ConfigurationSpace: cs = ConfigurationSpace() + epsilon = HyperparameterSearchSpace(hyperparameter="epsilon", + value_range=(0.007, 0.007), + default_value=0.007) add_hyperparameter(cs, epsilon, UniformFloatHyperparameter) + add_hyperparameter(cs, use_stochastic_weight_averaging, CategoricalHyperparameter) snapshot_ensemble_flag = False if any(use_snapshot_ensemble.value_range): From 34712b3b2d2c7a5ab810f795cf80abcf4090adb4 Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Tue, 21 Sep 2021 13:08:54 +0200 Subject: [PATCH 11/54] shuffle false in dataloader --- .../components/training/data_loader/base_data_loader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/autoPyTorch/pipeline/components/training/data_loader/base_data_loader.py b/autoPyTorch/pipeline/components/training/data_loader/base_data_loader.py index 5b8e445ac..8dff86052 100644 --- a/autoPyTorch/pipeline/components/training/data_loader/base_data_loader.py +++ b/autoPyTorch/pipeline/components/training/data_loader/base_data_loader.py @@ -112,7 +112,7 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> torch.utils.data.DataLoader: self.train_data_loader = torch.utils.data.DataLoader( train_dataset, batch_size=min(self.batch_size, len(train_dataset)), - shuffle=True, + shuffle=False, num_workers=X.get('num_workers', 0), pin_memory=X.get('pin_memory', True), drop_last=X.get('drop_last', True), From 49f40dc2715f9e40ec38455e073c5116e2ee2b1a Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Tue, 21 Sep 2021 14:44:48 +0200 Subject: [PATCH 
12/54] drop last false in dataloader --- .../components/training/data_loader/base_data_loader.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/autoPyTorch/pipeline/components/training/data_loader/base_data_loader.py b/autoPyTorch/pipeline/components/training/data_loader/base_data_loader.py index 8dff86052..7302ac6f5 100644 --- a/autoPyTorch/pipeline/components/training/data_loader/base_data_loader.py +++ b/autoPyTorch/pipeline/components/training/data_loader/base_data_loader.py @@ -112,10 +112,10 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> torch.utils.data.DataLoader: self.train_data_loader = torch.utils.data.DataLoader( train_dataset, batch_size=min(self.batch_size, len(train_dataset)), - shuffle=False, + shuffle=True, num_workers=X.get('num_workers', 0), pin_memory=X.get('pin_memory', True), - drop_last=X.get('drop_last', True), + drop_last=X.get('drop_last', False), collate_fn=custom_collate_fn, ) From f4ea158a4c5611137a2522dfa4237b32b4ca1941 Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Thu, 23 Sep 2021 15:39:25 +0200 Subject: [PATCH 13/54] fix bug for validation set, and cutout and cutmix --- autoPyTorch/api/base_task.py | 2 +- .../pipeline/components/training/trainer/RowCutMixTrainer.py | 2 +- .../pipeline/components/training/trainer/RowCutOutTrainer.py | 2 +- autoPyTorch/utils/backend.py | 5 +++++ 4 files changed, 8 insertions(+), 3 deletions(-) diff --git a/autoPyTorch/api/base_task.py b/autoPyTorch/api/base_task.py index 19951a3a5..14aa6ab83 100644 --- a/autoPyTorch/api/base_task.py +++ b/autoPyTorch/api/base_task.py @@ -1244,7 +1244,7 @@ def fit_pipeline(self, dataset_requirements = get_dataset_requirements( info=self._get_required_dataset_properties(dataset)) dataset_properties = dataset.get_dataset_properties(dataset_requirements) - self._backend.save_datamanager(dataset) + self._backend.replace_datamanager(dataset) if self._logger is None: self._logger = self._get_logger(dataset.dataset_name) diff --git a/autoPyTorch/pipeline/components/training/trainer/RowCutMixTrainer.py b/autoPyTorch/pipeline/components/training/trainer/RowCutMixTrainer.py index 20d02c793..f1b606046 100644 --- a/autoPyTorch/pipeline/components/training/trainer/RowCutMixTrainer.py +++ b/autoPyTorch/pipeline/components/training/trainer/RowCutMixTrainer.py @@ -36,7 +36,7 @@ def data_preparation(self, X: np.ndarray, y: np.ndarray, return X, {'y_a': y, 'y_b': y[index], 'lam': 1} size = X.shape[1] - indices = torch.tensor(self.random_state.choice(range(1, size), max(1, np.int32(size * lam)), + indices = torch.tensor(self.random_state.choice(range(size), max(1, np.int32(size * lam)), replace=False)) X[:, indices] = X[index, :][:, indices] diff --git a/autoPyTorch/pipeline/components/training/trainer/RowCutOutTrainer.py b/autoPyTorch/pipeline/components/training/trainer/RowCutOutTrainer.py index c09603523..d7bd23f4e 100644 --- a/autoPyTorch/pipeline/components/training/trainer/RowCutOutTrainer.py +++ b/autoPyTorch/pipeline/components/training/trainer/RowCutOutTrainer.py @@ -37,7 +37,7 @@ def data_preparation(self, X: np.ndarray, y: np.ndarray, return X, {'y_a': y_a, 'y_b': y_b, 'lam': lam} size = X.shape[1] - indices = self.random_state.choice(range(1, size), max(1, np.int32(size * self.patch_ratio)), + indices = self.random_state.choice(range(size), max(1, np.int32(size * self.patch_ratio)), replace=False) """if not isinstance(self.numerical_columns, typing.Iterable): diff --git a/autoPyTorch/utils/backend.py b/autoPyTorch/utils/backend.py index 713c7d572..7a7399a9f 100644 --- 
a/autoPyTorch/utils/backend.py +++ b/autoPyTorch/utils/backend.py @@ -328,6 +328,11 @@ def load_datamanager(self) -> BaseDataset: with open(filepath, 'rb') as fh: return pickle.load(fh) + def replace_datamanager(self, datamanager: BaseDataset): + warnings.warn("Original dataset will be overwritten with the provided dataset") + os.remove(self._get_datamanager_pickle_filename()) + self.save_datamanager(datamanager=datamanager) + def get_runs_directory(self) -> str: return os.path.join(self.internals_directory, 'runs') From 209a4e82a362cb0edf15432bafb8a526f7c19b3e Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Fri, 24 Sep 2021 12:36:54 +0200 Subject: [PATCH 14/54] shuffle = False --- .../components/training/data_loader/base_data_loader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/autoPyTorch/pipeline/components/training/data_loader/base_data_loader.py b/autoPyTorch/pipeline/components/training/data_loader/base_data_loader.py index 7302ac6f5..bf0f23fa6 100644 --- a/autoPyTorch/pipeline/components/training/data_loader/base_data_loader.py +++ b/autoPyTorch/pipeline/components/training/data_loader/base_data_loader.py @@ -112,7 +112,7 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> torch.utils.data.DataLoader: self.train_data_loader = torch.utils.data.DataLoader( train_dataset, batch_size=min(self.batch_size, len(train_dataset)), - shuffle=True, + shuffle=False, num_workers=X.get('num_workers', 0), pin_memory=X.get('pin_memory', True), drop_last=X.get('drop_last', False), From 8fb0bc2c5c7b4f98e95785293b4da86e8a58f214 Mon Sep 17 00:00:00 2001 From: Ravin Kohli <13005107+ravinkohli@users.noreply.github.com> Date: Thu, 30 Sep 2021 16:06:57 +0200 Subject: [PATCH 15/54] Shake Shake updates (#287) * To test locally * fix bug in trainer choice fit * fix ensemble bug * Correct bug in cleanup * To test locally * Cleanup for removing time debug statements * ablation for adversarial * shuffle false in dataloader * drop last false in dataloader * fix bug for validation set, and cutout and cutmix * To test locally * shuffle = False * To test locally * updates to search space * updates to search space * update branch with search space * undo search space update * fix bug in shake shake flag * limit to shake-even * restrict to even even * Add even even and others for shake-drop also * fix bug in passing alpha beta method * restrict to only even even * fix silly bug: * remove imputer and ordinal encoder for categorical transformer in feature validator * Address comments from shuhei --- autoPyTorch/data/tabular_feature_validator.py | 30 +++++++++---------- autoPyTorch/pipeline/base_pipeline.py | 5 ++-- .../setup/network_backbone/ResNetBackbone.py | 24 +++++++++++---- .../network_backbone/ShapedResNetBackbone.py | 17 +++++++++-- .../setup/network_backbone/utils.py | 28 ++++++++++++++--- .../setup/optimizer/AdamWOptimizer.py | 4 +-- .../training/data_loader/base_data_loader.py | 2 +- .../example_custom_configuration_space.py | 11 ++++++- 8 files changed, 88 insertions(+), 33 deletions(-) diff --git a/autoPyTorch/data/tabular_feature_validator.py b/autoPyTorch/data/tabular_feature_validator.py index 16185817b..28d64a4b1 100644 --- a/autoPyTorch/data/tabular_feature_validator.py +++ b/autoPyTorch/data/tabular_feature_validator.py @@ -41,26 +41,26 @@ def get_tabular_preprocessors(): preprocessors['numerical'] = list() preprocessors['categorical'] = list() + # preprocessors['categorical'].append(SimpleImputer(strategy='constant', + # # Train data is numpy + # # as of this point, where + # 
# Ordinal Encoding is using + # # for categorical. Only + # # Numbers are allowed + # # fill_value='!missing!', + # fill_value=-1, + # copy=False)) + + # preprocessors['categorical'].append(OrdinalEncoder( + # handle_unknown='use_encoded_value', + # unknown_value=-1)) + preprocessors['categorical'].append(OneHotEncoder( categories='auto', sparse=False, handle_unknown='ignore')) - preprocessors['categorical'].append(SimpleImputer(strategy='constant', - # Train data is numpy - # as of this point, where - # Ordinal Encoding is using - # for categorical. Only - # Numbers are allowed - # fill_value='!missing!', - fill_value=-1, - copy=False)) - - preprocessors['categorical'].append(OrdinalEncoder( - handle_unknown='use_encoded_value', - unknown_value=-1)) - preprocessors['numerical'].append(SimpleImputer(strategy='median', - copy=False)) + copy=False)) preprocessors['numerical'].append(StandardScaler(with_mean=True, with_std=True, copy=False)) return preprocessors diff --git a/autoPyTorch/pipeline/base_pipeline.py b/autoPyTorch/pipeline/base_pipeline.py index 842f63271..80d59a68f 100644 --- a/autoPyTorch/pipeline/base_pipeline.py +++ b/autoPyTorch/pipeline/base_pipeline.py @@ -451,12 +451,13 @@ def _check_search_space_updates(self, include: Optional[Dict[str, Any]], continue raise ValueError("Unknown hyperparameter for component {}. " "Expected update hyperparameter " - "to be in {} got {}".format(node.__class__.__name__, + "to be in {} got {}. choice is {}".format(node.__class__.__name__, component. get_hyperparameter_search_space( dataset_properties=self.dataset_properties). get_hyperparameter_names(), - split_hyperparameter[1])) + split_hyperparameter[1], + component.__name__)) else: if update.hyperparameter not in node.get_hyperparameter_search_space( dataset_properties=self.dataset_properties): diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/ResNetBackbone.py b/autoPyTorch/pipeline/components/setup/network_backbone/ResNetBackbone.py index 069ca4679..10f509741 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/ResNetBackbone.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/ResNetBackbone.py @@ -139,6 +139,14 @@ def get_hyperparameter_search_space( value_range=(True, False), default_value=True, ), + shake_alpha_beta_method: HyperparameterSearchSpace = HyperparameterSearchSpace( + hyperparameter="shake_alpha_beta_method", + value_range=('shake-shake', + 'shake-even', + 'even-even', + 'M3'), + default_value='shake-shake', + ), use_shake_drop: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="use_shake_drop", value_range=(True, False), default_value=True, @@ -180,9 +188,8 @@ def get_hyperparameter_search_space( if skip_connection_flag: - shake_drop_prob_flag = False - if 'shake-drop' in multi_branch_choice.value_range: - shake_drop_prob_flag = True + shake_shake_flag = 'shake-shake' in multi_branch_choice.value_range + shake_drop_prob_flag = 'shake-drop' in multi_branch_choice.value_range mb_choice = get_hyperparameter(multi_branch_choice, CategoricalHyperparameter) cs.add_hyperparameter(mb_choice) @@ -192,6 +199,10 @@ def get_hyperparameter_search_space( shake_drop_prob = get_hyperparameter(max_shake_drop_probability, UniformFloatHyperparameter) cs.add_hyperparameter(shake_drop_prob) cs.add_condition(CS.EqualsCondition(shake_drop_prob, mb_choice, "shake-drop")) + if shake_shake_flag or shake_drop_prob_flag: + method = get_hyperparameter(shake_alpha_beta_method, CategoricalHyperparameter) + cs.add_hyperparameter(method) + 
cs.add_condition(CS.InCondition(method, mb_choice, ["shake-shake", "shake-drop"])) # It is the upper bound of the nr of groups, # since the configuration will actually be sampled. @@ -327,11 +338,14 @@ def forward(self, x: torch.FloatTensor) -> torch.FloatTensor: if self.config["multi_branch_choice"] == 'shake-shake': x1 = self.layers(x) x2 = self.shake_shake_layers(x) - alpha, beta = shake_get_alpha_beta(self.training, x.is_cuda) + alpha, beta = shake_get_alpha_beta(is_training=self.training, + is_cuda=x.is_cuda, + method=self.config['shake_alpha_beta_method']) x = shake_shake(x1, x2, alpha, beta) elif self.config["multi_branch_choice"] == 'shake-drop': x = self.layers(x) - alpha, beta = shake_get_alpha_beta(self.training, x.is_cuda) + alpha, beta = shake_get_alpha_beta(self.training, x.is_cuda, + method=self.config['shake_alpha_beta_method']) bl = shake_drop_get_bl( self.block_index, 1 - self.config["max_shake_drop_probability"], diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/ShapedResNetBackbone.py b/autoPyTorch/pipeline/components/setup/network_backbone/ShapedResNetBackbone.py index e0867cdd3..12c6d4e74 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/ShapedResNetBackbone.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/ShapedResNetBackbone.py @@ -145,6 +145,14 @@ def get_hyperparameter_search_space( # type: ignore[override] 'stairs'), default_value='funnel', ), + shake_alpha_beta_method: HyperparameterSearchSpace = HyperparameterSearchSpace( + hyperparameter="shake_alpha_beta_method", + value_range=('shake-shake', + 'shake-even', + 'even-even', + 'M3'), + default_value='shake-shake', + ), max_shake_drop_probability: HyperparameterSearchSpace = HyperparameterSearchSpace( hyperparameter="max_shake_drop_probability", value_range=(0, 1), @@ -188,9 +196,8 @@ def get_hyperparameter_search_space( # type: ignore[override] if skip_connection_flag: - shake_drop_prob_flag = False - if 'shake-drop' in multi_branch_choice.value_range: - shake_drop_prob_flag = True + shake_shake_flag = 'shake-shake' in multi_branch_choice.value_range + shake_drop_prob_flag = 'shake-drop' in multi_branch_choice.value_range mb_choice = get_hyperparameter(multi_branch_choice, CategoricalHyperparameter) cs.add_hyperparameter(mb_choice) @@ -200,5 +207,9 @@ def get_hyperparameter_search_space( # type: ignore[override] shake_drop_prob = get_hyperparameter(max_shake_drop_probability, UniformFloatHyperparameter) cs.add_hyperparameter(shake_drop_prob) cs.add_condition(CS.EqualsCondition(shake_drop_prob, mb_choice, "shake-drop")) + if shake_shake_flag or shake_drop_prob_flag: + method = get_hyperparameter(shake_alpha_beta_method, CategoricalHyperparameter) + cs.add_hyperparameter(method) + cs.add_condition(CS.InCondition(method, mb_choice, ["shake-shake", "shake-drop"])) return cs diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/utils.py b/autoPyTorch/pipeline/components/setup/network_backbone/utils.py index ef19beac8..9a1f9dd4e 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/utils.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/utils.py @@ -92,15 +92,35 @@ def backward(ctx: typing.Any, shake_drop = ShakeDropFunction.apply -def shake_get_alpha_beta(is_training: bool, is_cuda: bool - ) -> typing.Tuple[torch.tensor, torch.tensor]: +def shake_get_alpha_beta( + is_training: bool, + is_cuda: bool, + method: str +) -> typing.Tuple[torch.tensor, torch.tensor]: + """ + The methods used in this function have been introduced in 
'ShakeShake Regularisation' + https://arxiv.org/abs/1705.07485. The names have been taken from the paper as well. + """ if not is_training: result = (torch.FloatTensor([0.5]), torch.FloatTensor([0.5])) return result if not is_cuda else (result[0].cuda(), result[1].cuda()) # TODO implement other update methods - alpha = torch.rand(1) - beta = torch.rand(1) + if method == 'even-even': + alpha = torch.FloatTensor([0.5]) + else: + alpha = torch.rand(1) + + if method == 'shake-shake': + beta = torch.rand(1) + elif method in ['shake-even', 'even-even']: + beta = torch.FloatTensor([0.5]) + elif method == 'M3': + beta = torch.FloatTensor( + [torch.rand(1)*(0.5 - alpha)*alpha if alpha < 0.5 else torch.rand(1)*(alpha - 0.5)*alpha] + ) + else: + raise ValueError("Unknown method for ShakeShakeRegularisation in NetworkBackbone") if is_cuda: alpha = alpha.cuda() diff --git a/autoPyTorch/pipeline/components/setup/optimizer/AdamWOptimizer.py b/autoPyTorch/pipeline/components/setup/optimizer/AdamWOptimizer.py index 4d11c3026..a415ff1c6 100644 --- a/autoPyTorch/pipeline/components/setup/optimizer/AdamWOptimizer.py +++ b/autoPyTorch/pipeline/components/setup/optimizer/AdamWOptimizer.py @@ -95,9 +95,9 @@ def get_hyperparameter_search_space( default_value=True, ), weight_decay: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="weight_decay", - value_range=(1E-7, 0.1), + value_range=(1E-5, 0.1), default_value=1E-4, - log=True), + log=False), ) -> ConfigurationSpace: cs = ConfigurationSpace() diff --git a/autoPyTorch/pipeline/components/training/data_loader/base_data_loader.py b/autoPyTorch/pipeline/components/training/data_loader/base_data_loader.py index bf0f23fa6..7302ac6f5 100644 --- a/autoPyTorch/pipeline/components/training/data_loader/base_data_loader.py +++ b/autoPyTorch/pipeline/components/training/data_loader/base_data_loader.py @@ -112,7 +112,7 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> torch.utils.data.DataLoader: self.train_data_loader = torch.utils.data.DataLoader( train_dataset, batch_size=min(self.batch_size, len(train_dataset)), - shuffle=False, + shuffle=True, num_workers=X.get('num_workers', 0), pin_memory=X.get('pin_memory', True), drop_last=X.get('drop_last', False), diff --git a/examples/tabular/40_advanced/example_custom_configuration_space.py b/examples/tabular/40_advanced/example_custom_configuration_space.py index 6a3764b94..b95ceeaa5 100644 --- a/examples/tabular/40_advanced/example_custom_configuration_space.py +++ b/examples/tabular/40_advanced/example_custom_configuration_space.py @@ -54,6 +54,15 @@ def get_search_space_updates(): hyperparameter='ResNetBackbone:dropout', value_range=[0, 0.5], default_value=0.2) + updates.append(node_name='network_backbone', + hyperparameter='ResNetBackbone:multi_branch_choice', + value_range=['shake-shake'], + default_value='shake-shake') + updates.append(node_name='network_backbone', + hyperparameter='ResNetBackbone:shake_shake_method', + value_range=['M3'], + default_value='M3' + ) return updates @@ -74,7 +83,7 @@ def get_search_space_updates(): # ================================================== api = TabularClassificationTask( search_space_updates=get_search_space_updates(), - include_components={'network_backbone': ['MLPBackbone', 'ResNetBackbone'], + include_components={'network_backbone': ['ResNetBackbone'], 'encoder': ['OneHotEncoder']} ) From 064e4a93eb0c116611140d0ca21e094d7a91d7a6 Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Thu, 30 Sep 2021 18:36:42 +0200 Subject: [PATCH 16/54] fix issues with ensemble 
fitting post hoc --- autoPyTorch/api/base_task.py | 106 +++++++++++++++--- autoPyTorch/ensemble/singlebest_ensemble.py | 5 +- .../example_posthoc_ensemble_fit.py | 81 +++++++++++++ 3 files changed, 173 insertions(+), 19 deletions(-) create mode 100644 examples/tabular/40_advanced/example_posthoc_ensemble_fit.py diff --git a/autoPyTorch/api/base_task.py b/autoPyTorch/api/base_task.py index 14aa6ab83..c90306f3a 100644 --- a/autoPyTorch/api/base_task.py +++ b/autoPyTorch/api/base_task.py @@ -887,7 +887,7 @@ def _search( # If no dask client was provided, we create one, so that we can # start a ensemble process in parallel to smbo optimize if ( - self._dask_client is None and (self.ensemble_size > 0 or self.n_jobs is not None and self.n_jobs > 1) + self._dask_client is None and (self.ensemble_size > 0 or self.n_jobs > 1) ): self._create_dask_client() else: @@ -916,14 +916,16 @@ def _search( ) # ============> Run dummy predictions - dummy_task_name = 'runDummy' - self._stopwatch.start_task(dummy_task_name) - self._do_dummy_prediction() - self._stopwatch.stop_task(dummy_task_name) + # We only want to run dummy predictions in case we want to build an ensemble + if self.ensemble_size > 0: + dummy_task_name = 'runDummy' + self._stopwatch.start_task(dummy_task_name) + self._do_dummy_prediction() + self._stopwatch.stop_task(dummy_task_name) # ============> Run traditional ml - - if enable_traditional_pipeline: + # We only want to run traditional predictions in case we want to build an ensemble + if enable_traditional_pipeline and self.ensemble_size > 0: if STRING_TO_TASK_TYPES[self.task_type] in REGRESSION_TASKS: self._logger.warning("Traditional Pipeline is not enabled for regression. Skipping...") else: @@ -1341,7 +1343,10 @@ def fit_ensemble( ensemble_nbest: int = 50, ensemble_size: int = 50, precision: int = 32, - load_models: bool = True + load_models: bool = True, + time_for_task: int = 100, + func_eval_time_limit_secs: Optional[int] = None, + enable_traditional_pipeline: bool = True ) -> 'BaseTask': """ Enables post-hoc fitting of the ensemble after the `search()` @@ -1357,6 +1362,30 @@ def fit_ensemble( Models are drawn with replacement. precision (int), (default=32): Numeric precision used when loading ensemble data. Can be either 16, 32 or 64. + enable_traditional_pipeline (bool), (default=True): + We fit traditional machine learning algorithms + (LightGBM, CatBoost, RandomForest, ExtraTrees, KNN, SVM) + prior building PyTorch Neural Networks. You can disable this + feature by turning this flag to False. All machine learning + algorithms that are fitted during search() are considered for + ensemble building. + load_models (bool), (default=True): Whether to load the + models after fitting AutoPyTorch. + time_for_task (int), (default=100): Time limit + in seconds for the search of appropriate models. + By increasing this value, autopytorch has a higher + chance of finding better models. + func_eval_time_limit_secs (int), (default=None): Time limit + for a single call to the machine learning model. + Model fitting will be terminated if the machine + learning algorithm runs over the time limit. Set + this value high enough so that typical machine + learning algorithms can be fit on the training + data. + When set to None, this time will automatically be set to + total_walltime_limit // 2 to allow enough time to fit + at least 2 individual machine learning algorithms. + Set to np.inf in case no time limit is desired. 
Returns: self """ @@ -1375,8 +1404,55 @@ def fit_ensemble( else: self._is_dask_client_internally_created = False + ensemble_fit_task_name = 'EnsembleFit' + self._stopwatch.start_task(ensemble_fit_task_name) + if enable_traditional_pipeline: + if func_eval_time_limit_secs is None or func_eval_time_limit_secs > time_for_task: + self._logger.warning( + 'Time limit for a single run is higher than total time ' + 'limit. Capping the limit for a single run to the total ' + 'time given to Ensemble fit (%f)' % time_for_task + ) + func_eval_time_limit_secs = time_for_task + + # Make sure that at least 2 models are created for the ensemble process + num_models = time_for_task // func_eval_time_limit_secs + if num_models < 2: + func_eval_time_limit_secs = time_for_task // 2 + self._logger.warning( + "Capping the func_eval_time_limit_secs to {} to have " + "time for a least 2 models to ensemble.".format( + func_eval_time_limit_secs + ) + ) + # We only want to run dummy predictions in case we want to build an ensemble + dummy_task_name = 'runDummy' + self._stopwatch.start_task(dummy_task_name) + self._do_dummy_prediction() + self._stopwatch.stop_task(dummy_task_name) + + # ============> Run traditional ml + # We only want to run traditional predictions in case we want to build an ensemble + if enable_traditional_pipeline and self.ensemble_size > 0: + if STRING_TO_TASK_TYPES[self.task_type] in REGRESSION_TASKS: + self._logger.warning("Traditional Pipeline is not enabled for regression. Skipping...") + else: + traditional_task_name = 'runTraditional' + self._stopwatch.start_task(traditional_task_name) + elapsed_time = self._stopwatch.wall_elapsed(ensemble_fit_task_name) + time_for_traditional = int( + time_for_task - elapsed_time + ) + self._do_traditional_prediction( + func_eval_time_limit_secs=func_eval_time_limit_secs, + time_left=time_for_traditional, + ) + self._stopwatch.stop_task(traditional_task_name) + + elapsed_time = self._stopwatch.wall_elapsed(ensemble_fit_task_name) + time_left_for_ensemble = int(time_for_task - elapsed_time) manager = self._init_ensemble_builder( - time_left_for_ensembles=self._time_for_task, + time_left_for_ensembles=time_left_for_ensemble, optimize_metric=self.opt_metric, precision=precision, ensemble_size=ensemble_size, @@ -1393,16 +1469,10 @@ def fit_ensemble( if load_models: self._load_models() - if self._logger is not None: - self._logger.info("Closing the dask infrastructure") - self._close_dask_client() - self._logger.info("Finished closing the dask infrastructure") - # Clean up the logger - self._logger.info("Starting to clean up the logger") - self._clean_logger() - else: - self._close_dask_client() + self._stopwatch.stop_task(ensemble_fit_task_name) + + self._cleanup() return self diff --git a/autoPyTorch/ensemble/singlebest_ensemble.py b/autoPyTorch/ensemble/singlebest_ensemble.py index c6fbaf576..6f82cbdf4 100644 --- a/autoPyTorch/ensemble/singlebest_ensemble.py +++ b/autoPyTorch/ensemble/singlebest_ensemble.py @@ -3,7 +3,7 @@ import numpy as np -from smac.runhistory.runhistory import RunHistory +from smac.runhistory.runhistory import RunHistory, StatusType from autoPyTorch.ensemble.abstract_ensemble import AbstractEnsemble from autoPyTorch.pipeline.base_pipeline import BasePipeline @@ -49,6 +49,9 @@ def get_identifiers_from_run_history(self) -> List[Tuple[int, int, float]]: for run_key in self.run_history.data.keys(): run_value = self.run_history.data[run_key] + if run_value.status == StatusType.CRASHED: + continue + score = self.metric._optimum - 
(self.metric._sign * run_value.cost) if (score > best_model_score and self.metric._sign > 0) \ diff --git a/examples/tabular/40_advanced/example_posthoc_ensemble_fit.py b/examples/tabular/40_advanced/example_posthoc_ensemble_fit.py new file mode 100644 index 000000000..b9383b2a6 --- /dev/null +++ b/examples/tabular/40_advanced/example_posthoc_ensemble_fit.py @@ -0,0 +1,81 @@ +""" +===================================================== +Tabular Classification with Post-Hoc Ensemble Fitting +===================================================== + +The following example shows how to fit a sample classification model +and create an ensemble post-hoc with AutoPyTorch +""" +import os +import tempfile as tmp +import warnings + +os.environ['JOBLIB_TEMP_FOLDER'] = tmp.gettempdir() +os.environ['OMP_NUM_THREADS'] = '1' +os.environ['OPENBLAS_NUM_THREADS'] = '1' +os.environ['MKL_NUM_THREADS'] = '1' + +warnings.simplefilter(action='ignore', category=UserWarning) +warnings.simplefilter(action='ignore', category=FutureWarning) + +import sklearn.datasets +import sklearn.model_selection + +from autoPyTorch.api.tabular_classification import TabularClassificationTask + + +if __name__ == '__main__': + + ############################################################################ + # Data Loading + # ============ + X, y = sklearn.datasets.fetch_openml(data_id=40981, return_X_y=True, as_frame=True) + X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split( + X, + y, + random_state=42, + ) + + ############################################################################ + # Build and fit a classifier + # ========================== + api = TabularClassificationTask( + ensemble_size=0, + seed=42, + ) + + ############################################################################ + # Search for the best neural network + # ================================== + api.search( + X_train=X_train, + y_train=y_train, + X_test=X_test.copy(), + y_test=y_test.copy(), + optimize_metric='accuracy', + total_walltime_limit=250, + func_eval_time_limit_secs=50 + ) + + ############################################################################ + # Print the final performance of the incumbent neural network + # =========================================================== + print(api.run_history, api.trajectory) + y_pred = api.predict(X_test) + score = api.score(y_pred, y_test) + print(score) + + ############################################################################ + # Fit an ensemble with the neural networks fitted during the search + # ================================================================= + + api.fit_ensemble(ensemble_size=5, + # Set the enable_traditional_pipeline=True + # to also include traditional models + # in the ensemble + enable_traditional_pipeline=False) + # Print the final ensemble built by AutoPyTorch + y_pred = api.predict(X_test) + score = api.score(y_pred, y_test) + print(score) + print(api.show_models()) From ed48dab3537676a096124378d8fe8eb170b909e4 Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Thu, 30 Sep 2021 19:14:16 +0200 Subject: [PATCH 17/54] Address comments on the PR --- autoPyTorch/api/base_task.py | 20 ++++-- autoPyTorch/data/tabular_feature_validator.py | 69 +++++++++++-------- 2 files changed, 54 insertions(+), 35 deletions(-) diff --git a/autoPyTorch/api/base_task.py b/autoPyTorch/api/base_task.py index c90306f3a..4d784b6c2 100644 --- a/autoPyTorch/api/base_task.py +++ b/autoPyTorch/api/base_task.py @@ -842,6 +842,8 @@ def _search( raise ValueError("Incompatible 
dataset entered for current task," "expected dataset to have task type :{} got " ":{}".format(self.task_type, dataset.task_type)) + if precision not in [16, 32, 64]: + raise ValueError("precision must be one of 16, 32, 64. Got {}".format(precision)) # Initialise information needed for the experiment experiment_task_name: str = 'runSearch' @@ -1340,19 +1342,24 @@ def fit_pipeline(self, def fit_ensemble( self, + optimize_metric: Optional[str] = None, + precision: Optional[int] = None, ensemble_nbest: int = 50, ensemble_size: int = 50, - precision: int = 32, load_models: bool = True, time_for_task: int = 100, func_eval_time_limit_secs: Optional[int] = None, - enable_traditional_pipeline: bool = True + enable_traditional_pipeline: bool = True, ) -> 'BaseTask': """ Enables post-hoc fitting of the ensemble after the `search()` method is finished. This method creates an ensemble using all the models stored on disk during the smbo run Args: + optimize_metric (str): name of the metric that is used to + evaluate a pipeline. if not specified, value passed to search will be used + precision (int), (default=32): Numeric precision used when loading + ensemble data. Can be either 16, 32 or 64. ensemble_nbest (Optional[int]): only consider the ensemble_nbest models to build the ensemble. If None, uses the value stored in class attribute `ensemble_nbest`. @@ -1360,8 +1367,6 @@ def fit_ensemble( Number of models added to the ensemble built by Ensemble selection from libraries of models. Models are drawn with replacement. - precision (int), (default=32): Numeric precision used when loading - ensemble data. Can be either 16, 32 or 64. enable_traditional_pipeline (bool), (default=True): We fit traditional machine learning algorithms (LightGBM, CatBoost, RandomForest, ExtraTrees, KNN, SVM) @@ -1395,6 +1400,9 @@ def fit_ensemble( "Please call the `search()` method of {} prior to " "fit_ensemble().".format(self.__class__.__name__)) + if precision not in [16, 32, 64]: + raise ValueError("precision must be one of 16, 32, 64. 
Got {}".format(precision)) + if self._logger is None: self._logger = self._get_logger(self.dataset.dataset_name) @@ -1453,8 +1461,8 @@ def fit_ensemble( time_left_for_ensemble = int(time_for_task - elapsed_time) manager = self._init_ensemble_builder( time_left_for_ensembles=time_left_for_ensemble, - optimize_metric=self.opt_metric, - precision=precision, + optimize_metric=self.opt_metric if optimize_metric is None else optimize_metric, + precision=self.precision if precision is None else precision, ensemble_size=ensemble_size, ensemble_nbest=ensemble_nbest, ) diff --git a/autoPyTorch/data/tabular_feature_validator.py b/autoPyTorch/data/tabular_feature_validator.py index 28d64a4b1..e4ff2a179 100644 --- a/autoPyTorch/data/tabular_feature_validator.py +++ b/autoPyTorch/data/tabular_feature_validator.py @@ -1,5 +1,5 @@ import functools -import typing +from typing import Any, Dict, List, Optional, Tuple, Union, cast import numpy as np @@ -16,12 +16,31 @@ from sklearn.pipeline import make_pipeline from sklearn.preprocessing import OneHotEncoder from sklearn.impute import SimpleImputer -from sklearn.preprocessing import StandardScaler, OrdinalEncoder +from sklearn.preprocessing import StandardScaler from autoPyTorch.data.base_feature_validator import BaseFeatureValidator, SUPPORTED_FEAT_TYPES -def _create_column_transformer(preprocessors: typing.Dict, numerical_columns, categorical_columns): +def _create_column_transformer( + preprocessors: Dict[str, List[BaseEstimator]], + numerical_columns: List[str], + categorical_columns: List[str] +) -> ColumnTransformer: + """ + Given a dictionary of preprocessors, this function + creates a sklearn column transformer with appropriate + columns associated with their preprocessors. + Args: + preprocessors (Dict[str, List]): + Dictionary containing list of numerical and categorical preprocessors. + numerical_columns (List[int]): + List of names of numerical columns + categorical_columns (List[int]): + List of names of categorical columns + + Returns: + ColumnTransformer + """ numerical_pipeline = 'drop' categorical_pipeline = 'drop' if len(numerical_columns) > 0: @@ -36,25 +55,17 @@ def _create_column_transformer(preprocessors: typing.Dict, numerical_columns, ca ) -def get_tabular_preprocessors(): +def get_tabular_preprocessors() -> Dict[str, List[BaseEstimator]]: + """ + This function creates a Dictionary containing list + of numerical and categorical preprocessors + Returns: + + """ preprocessors = dict() preprocessors['numerical'] = list() preprocessors['categorical'] = list() - # preprocessors['categorical'].append(SimpleImputer(strategy='constant', - # # Train data is numpy - # # as of this point, where - # # Ordinal Encoding is using - # # for categorical. 
Only - # # Numbers are allowed - # # fill_value='!missing!', - # fill_value=-1, - # copy=False)) - - # preprocessors['categorical'].append(OrdinalEncoder( - # handle_unknown='use_encoded_value', - # unknown_value=-1)) - preprocessors['categorical'].append(OneHotEncoder( categories='auto', sparse=False, @@ -91,7 +102,7 @@ def _fit( X = self.numpy_array_to_pandas(X) if hasattr(X, "iloc") and not scipy.sparse.issparse(X): - X = typing.cast(pd.DataFrame, X) + X = cast(pd.DataFrame, X) if not X.select_dtypes(include='object').empty: X = self.infer_objects(X) @@ -179,7 +190,7 @@ def transform( X = self.numpy_array_to_pandas(X) if hasattr(X, "iloc") and not scipy.sparse.issparse(X): - X = typing.cast(pd.DataFrame, X) + X = cast(pd.DataFrame, X) # Also remove the object dtype for new data if not X.select_dtypes(include='object').empty: @@ -257,7 +268,7 @@ def _check_data( # Then for Pandas, we do not support Nan in categorical columns if hasattr(X, "iloc"): # If entered here, we have a pandas dataframe - X = typing.cast(pd.DataFrame, X) + X = cast(pd.DataFrame, X) # Handle objects if possible if not X.select_dtypes(include='object').empty: @@ -293,7 +304,7 @@ def _check_data( def _get_columns_info( self, X: pd.DataFrame, - ) -> typing.Tuple[typing.List[str], typing.List[str], typing.List[str]]: + ) -> Tuple[List[str], List[str], List[str]]: """ Return the columns to be encoded from a pandas dataframe @@ -365,8 +376,8 @@ def _get_columns_info( def list_to_dataframe( self, X_train: SUPPORTED_FEAT_TYPES, - X_test: typing.Optional[SUPPORTED_FEAT_TYPES] = None, - ) -> typing.Tuple[pd.DataFrame, typing.Optional[pd.DataFrame]]: + X_test: Optional[SUPPORTED_FEAT_TYPES] = None, + ) -> Tuple[pd.DataFrame, Optional[pd.DataFrame]]: """ Converts a list to a pandas DataFrame. In this process, column types are inferred. @@ -376,7 +387,7 @@ def list_to_dataframe( X_train (SUPPORTED_FEAT_TYPES): A set of features that are going to be validated (type and dimensionality checks) and a encoder fitted in the case the data needs encoding - X_test (typing.Optional[SUPPORTED_FEAT_TYPES]): + X_test (Optional[SUPPORTED_FEAT_TYPES]): A hold out set of data used for checking Returns: pd.DataFrame: @@ -398,9 +409,9 @@ def list_to_dataframe( X_test = pd.DataFrame(data=X_test).infer_objects() return X_train, X_test + @staticmethod def numpy_array_to_pandas( - self, - X: np.ndarray, + X: np.ndarray, ) -> pd.DataFrame: """ Converts a numpy array to pandas for type inference @@ -462,7 +473,7 @@ def impute_nan_in_categories(self, X: pd.DataFrame, categorical_columns=None) -> # To be on the safe side, map always to the same missing # value per column if not hasattr(self, 'dict_nancol_to_missing'): - self.dict_missing_value_per_col: typing.Dict[str, typing.Any] = {} + self.dict_missing_value_per_col: Dict[str, Any] = {} # First make sure that we do not alter the type of the column which cause: # TypeError: '<' not supported between instances of 'int' and 'str' @@ -478,7 +489,7 @@ def impute_nan_in_categories(self, X: pd.DataFrame, categorical_columns=None) -> if can_cast_as_number: # In this case, we expect to have a number as category # it might be string, but its value represent a number - missing_value: typing.Union[str, int] = '-1' if isinstance(X[column].dropna().values[0], + missing_value: Union[str, int] = '-1' if isinstance(X[column].dropna().values[0], str) else -1 else: missing_value = 'Missing!' 
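
[Illustrative sketch, not taken from any commit above; the column names and toy data are invented for this example.] The two validator patches above drop the ordinal-encoder path and build preprocessing from get_tabular_preprocessors() and _create_column_transformer(): the categorical branch keeps only the one-hot encoder (the constant-fill imputer and ordinal encoder are commented out), while the numerical branch applies median imputation followed by standard scaling. Assembled by hand with the same scikit-learn calls that appear in the diff, the construction looks roughly like this:

import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Toy frame: one categorical and one numerical column (names invented).
X = pd.DataFrame({"color": ["red", "blue", "red"],
                  "age": [23.0, None, 41.0]})

# Categorical branch: only the one-hot encoder remains after the refactoring.
# `sparse=False` matches the 2021-era scikit-learn used in these patches
# (newer releases rename the keyword to `sparse_output`).
categorical_pipeline = make_pipeline(
    OneHotEncoder(categories='auto', sparse=False, handle_unknown='ignore'))

# Numerical branch: median imputation followed by standardisation.
numerical_pipeline = make_pipeline(
    SimpleImputer(strategy='median', copy=False),
    StandardScaler(with_mean=True, with_std=True, copy=False))

column_transformer = ColumnTransformer(
    [('categorical_pipeline', categorical_pipeline, ['color']),
     ('numerical_pipeline', numerical_pipeline, ['age'])],
    remainder='passthrough')

# Categorical features come out on the left, numerical on the right.
print(column_transformer.fit_transform(X))

Because the categorical pipeline is listed first in the ColumnTransformer, the transformed array places the one-hot columns before the scaled numerical ones; this is exactly why the comparator in TabularFeatureValidator._fit sorts feat_type so that 'categorical' precedes 'numerical'.
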
From 9cdfb64ec3fe9a138de2eb93db5cebead6b91a20 Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Thu, 30 Sep 2021 19:55:28 +0200 Subject: [PATCH 18/54] Fix flake and mypy errors --- autoPyTorch/api/base_task.py | 2 +- autoPyTorch/api/tabular_classification.py | 1 + autoPyTorch/api/tabular_regression.py | 1 + autoPyTorch/data/tabular_feature_validator.py | 44 +++++++++---------- autoPyTorch/datasets/base_dataset.py | 10 ++++- autoPyTorch/pipeline/base_pipeline.py | 15 ++++--- .../TabularColumnTransformer.py | 6 +-- .../setup/network_backbone/utils.py | 2 +- .../base_network_embedding.py | 44 +++++++++---------- .../training/trainer/RowCutOutTrainer.py | 7 +-- .../training/trainer/base_trainer.py | 2 - .../pipeline/tabular_classification.py | 8 +--- autoPyTorch/utils/backend.py | 2 +- 13 files changed, 72 insertions(+), 72 deletions(-) diff --git a/autoPyTorch/api/base_task.py b/autoPyTorch/api/base_task.py index 4d784b6c2..27b7cbbb1 100644 --- a/autoPyTorch/api/base_task.py +++ b/autoPyTorch/api/base_task.py @@ -1348,7 +1348,7 @@ def fit_ensemble( ensemble_size: int = 50, load_models: bool = True, time_for_task: int = 100, - func_eval_time_limit_secs: Optional[int] = None, + func_eval_time_limit_secs: int = 50, enable_traditional_pipeline: bool = True, ) -> 'BaseTask': """ diff --git a/autoPyTorch/api/tabular_classification.py b/autoPyTorch/api/tabular_classification.py index 1a73d8625..ae2d53ef9 100644 --- a/autoPyTorch/api/tabular_classification.py +++ b/autoPyTorch/api/tabular_classification.py @@ -275,6 +275,7 @@ def search( y_test=y_test, dataset_name=dataset_name) + assert self.dataset is not None, "Something went wrong, expected dataset to be initialised" return self._search( dataset=self.dataset, optimize_metric=optimize_metric, diff --git a/autoPyTorch/api/tabular_regression.py b/autoPyTorch/api/tabular_regression.py index e7fb919bd..0236d861f 100644 --- a/autoPyTorch/api/tabular_regression.py +++ b/autoPyTorch/api/tabular_regression.py @@ -261,6 +261,7 @@ def search( y_test=y_test, dataset_name=dataset_name) + assert self.dataset is not None, "Something went wrong, expected dataset to be initialised" return self._search( dataset=self.dataset, optimize_metric=optimize_metric, diff --git a/autoPyTorch/data/tabular_feature_validator.py b/autoPyTorch/data/tabular_feature_validator.py index e4ff2a179..698e92438 100644 --- a/autoPyTorch/data/tabular_feature_validator.py +++ b/autoPyTorch/data/tabular_feature_validator.py @@ -9,35 +9,32 @@ import scipy.sparse import sklearn.utils - from sklearn.base import BaseEstimator from sklearn.compose import ColumnTransformer from sklearn.exceptions import NotFittedError -from sklearn.pipeline import make_pipeline -from sklearn.preprocessing import OneHotEncoder from sklearn.impute import SimpleImputer -from sklearn.preprocessing import StandardScaler +from sklearn.pipeline import make_pipeline +from sklearn.preprocessing import OneHotEncoder, StandardScaler from autoPyTorch.data.base_feature_validator import BaseFeatureValidator, SUPPORTED_FEAT_TYPES def _create_column_transformer( - preprocessors: Dict[str, List[BaseEstimator]], - numerical_columns: List[str], - categorical_columns: List[str] + preprocessors: Dict[str, List[BaseEstimator]], + numerical_columns: List[str], + categorical_columns: List[str] ) -> ColumnTransformer: """ - Given a dictionary of preprocessors, this function - creates a sklearn column transformer with appropriate - columns associated with their preprocessors. 
+ Given a dictionary of preprocessors, this function + creates a sklearn column transformer with appropriate + columns associated with their preprocessors. Args: - preprocessors (Dict[str, List]): + preprocessors (Dict[str, List]): Dictionary containing list of numerical and categorical preprocessors. numerical_columns (List[int]): List of names of numerical columns categorical_columns (List[int]): List of names of categorical columns - Returns: ColumnTransformer """ @@ -57,12 +54,12 @@ def _create_column_transformer( def get_tabular_preprocessors() -> Dict[str, List[BaseEstimator]]: """ - This function creates a Dictionary containing list + This function creates a Dictionary containing list of numerical and categorical preprocessors Returns: - + Dict[str, List[BaseEstimator]] """ - preprocessors = dict() + preprocessors: Dict[str, List[BaseEstimator]] = dict() preprocessors['numerical'] = list() preprocessors['categorical'] = list() @@ -144,12 +141,12 @@ def comparator(cmp1: str, cmp2: str) -> int: ) if len(categorical_columns) > 0: - print(self.column_transformer.named_transformers_['categorical_pipeline'].named_steps) self.categories = [ # We fit an ordinal encoder, where all categorical # columns are shifted to the left list(range(len(cat))) - for cat in self.column_transformer.named_transformers_['categorical_pipeline'].named_steps['onehotencoder'].categories_ + for cat in self.column_transformer.named_transformers_[ + 'categorical_pipeline'].named_steps['onehotencoder'].categories_ ] for i, type_ in enumerate(self.feat_type): @@ -284,7 +281,7 @@ def _check_data( raise ValueError("Changing the column order of the features after fit() is " "not supported. Fit() method was called with " "{} whereas the new features have {} as type".format(self.column_order, - column_order,) + column_order, ) ) else: self.column_order = column_order @@ -411,7 +408,7 @@ def list_to_dataframe( @staticmethod def numpy_array_to_pandas( - X: np.ndarray, + X: np.ndarray, ) -> pd.DataFrame: """ Converts a numpy array to pandas for type inference @@ -457,7 +454,9 @@ def infer_objects(self, X: pd.DataFrame) -> pd.DataFrame: self.logger.debug(f"Infer Objects: {self.object_dtype_mapping}") return X - def impute_nan_in_categories(self, X: pd.DataFrame, categorical_columns=None) -> pd.DataFrame: + def impute_nan_in_categories(self, + X: pd.DataFrame + ) -> pd.DataFrame: """ impute missing values before encoding, remove once sklearn natively supports @@ -489,8 +488,7 @@ def impute_nan_in_categories(self, X: pd.DataFrame, categorical_columns=None) -> if can_cast_as_number: # In this case, we expect to have a number as category # it might be string, but its value represent a number - missing_value: Union[str, int] = '-1' if isinstance(X[column].dropna().values[0], - str) else -1 + missing_value: Union[str, int] = '-1' if isinstance(X[column].dropna().values[0], str) else -1 else: missing_value = 'Missing!' 
@@ -509,4 +507,4 @@ def impute_nan_in_categories(self, X: pd.DataFrame, categorical_columns=None) -> X[column].cat.add_categories([self.dict_missing_value_per_col[column]], inplace=True) X.fillna({column: self.dict_missing_value_per_col[column]}, inplace=True) - return X \ No newline at end of file + return X diff --git a/autoPyTorch/datasets/base_dataset.py b/autoPyTorch/datasets/base_dataset.py index 8cb951977..cf67e1a95 100644 --- a/autoPyTorch/datasets/base_dataset.py +++ b/autoPyTorch/datasets/base_dataset.py @@ -330,13 +330,19 @@ def get_dataset_for_training(self, split_id: int, train: bool, subset: int = 0) to provide training data to fit a pipeline Args: - split (int): The desired subset of the dataset to split and use + split_id (int): which split id to get from the splits + train (bool): whether the train or valid transforms are to be applied + subset (int, default=0): 0 is for train_indices, 1 is for valid_indices Returns: + Dataset: the reduced dataset to be used for testing """ # Subset creates a dataset. Splits is a (train_indices, test_indices) tuple - return TransformSubset(self, self.splits[split_id][subset], train=train) + assert split_id <= len(self.splits), "Expected split id to be less than length of splits" + indices = self.splits[split_id][subset] + assert indices is not None, "Trying to get subset when it does not exist" + return TransformSubset(self, indices, train=train) def replace_data(self, X_train: BaseDatasetInputType, X_test: Optional[BaseDatasetInputType]) -> 'BaseDataset': diff --git a/autoPyTorch/pipeline/base_pipeline.py b/autoPyTorch/pipeline/base_pipeline.py index 80d59a68f..4697345f4 100644 --- a/autoPyTorch/pipeline/base_pipeline.py +++ b/autoPyTorch/pipeline/base_pipeline.py @@ -451,13 +451,14 @@ def _check_search_space_updates(self, include: Optional[Dict[str, Any]], continue raise ValueError("Unknown hyperparameter for component {}. " "Expected update hyperparameter " - "to be in {} got {}. choice is {}".format(node.__class__.__name__, - component. - get_hyperparameter_search_space( - dataset_properties=self.dataset_properties). - get_hyperparameter_names(), - split_hyperparameter[1], - component.__name__)) + "to be in {} got {}." 
+ " component is {}".format(node.__class__.__name__, + component.get_hyperparameter_search_space( + dataset_properties=self.dataset_properties + ).get_hyperparameter_names(), + split_hyperparameter[1], + component.__name__) + ) else: if update.hyperparameter not in node.get_hyperparameter_search_space( dataset_properties=self.dataset_properties): diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/TabularColumnTransformer.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/TabularColumnTransformer.py index c7ca61e09..e513b8729 100644 --- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/TabularColumnTransformer.py +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/TabularColumnTransformer.py @@ -3,14 +3,14 @@ import numpy as np from sklearn.compose import ColumnTransformer -from sklearn.pipeline import make_pipeline -import time +# from sklearn.pipeline import make_pipeline + import torch from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.base_tabular_preprocessing import ( autoPyTorchTabularPreprocessingComponent ) -from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.utils import get_tabular_preprocessers +# from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.utils import get_tabular_preprocessers from autoPyTorch.utils.common import FitRequirement, subsampler diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/utils.py b/autoPyTorch/pipeline/components/setup/network_backbone/utils.py index 9a1f9dd4e..d10d15dca 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/utils.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/utils.py @@ -117,7 +117,7 @@ def shake_get_alpha_beta( beta = torch.FloatTensor([0.5]) elif method == 'M3': beta = torch.FloatTensor( - [torch.rand(1)*(0.5 - alpha)*alpha if alpha < 0.5 else torch.rand(1)*(alpha - 0.5)*alpha] + [torch.rand(1) * (0.5 - alpha) * alpha if alpha < 0.5 else torch.rand(1) * (alpha - 0.5) * alpha] ) else: raise ValueError("Unknown method for ShakeShakeRegularisation in NetworkBackbone") diff --git a/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py b/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py index 42cbc62bb..18028cddd 100644 --- a/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py +++ b/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py @@ -1,5 +1,5 @@ -import copy -from typing import Any, Dict, Optional, Tuple +# import copy +from typing import Any, Dict, Optional # , Tuple import numpy as np @@ -30,23 +30,23 @@ def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: def build_embedding(self, num_input_features: np.ndarray, num_numerical_features: int) -> nn.Module: raise NotImplementedError - - def _get_args(self, X: Dict[str, Any]) -> Tuple[int, np.ndarray]: - # Feature preprocessors can alter numerical columns - # if len(X['dataset_properties']['numerical_columns']) == 0: - # num_numerical_columns = 0 - # else: - # X_train = copy.deepcopy(X['backend'].load_datamanager().train_tensors[0][:2]) - # - # numerical_column_transformer = X['tabular_transformer'].preprocessor. 
\ - # named_transformers_['numerical_pipeline'] - # num_numerical_columns = numerical_column_transformer.transform( - # X_train[:, X['dataset_properties']['numerical_columns']]).shape[1] - # num_input_features = np.zeros((num_numerical_columns + len(X['dataset_properties']['categorical_columns'])), - # dtype=int) - # categories = X['dataset_properties']['categories'] - # - # for i, category in enumerate(categories): - # num_input_features[num_numerical_columns + i, ] = len(category) - # return num_numerical_columns, num_input_features - return None, None \ No newline at end of file + # + # def _get_args(self, X: Dict[str, Any]) -> Tuple[int, np.ndarray]: + # # Feature preprocessors can alter numerical columns + # # if len(X['dataset_properties']['numerical_columns']) == 0: + # # num_numerical_columns = 0 + # # else: + # # X_train = copy.deepcopy(X['backend'].load_datamanager().train_tensors[0][:2]) + # # + # # numerical_column_transformer = X['tabular_transformer'].preprocessor. \ + # # named_transformers_['numerical_pipeline'] + # # num_numerical_columns = numerical_column_transformer.transform( + # # X_train[:, X['dataset_properties']['numerical_columns']]).shape[1] + # # num_input_features = np.zeros((num_numerical_columns + len(X['dataset_properties']['categorical_columns'])), + # # dtype=int) + # # categories = X['dataset_properties']['categories'] + # # + # # for i, category in enumerate(categories): + # # num_input_features[num_numerical_columns + i, ] = len(category) + # # return num_numerical_columns, num_input_features + # return None, None diff --git a/autoPyTorch/pipeline/components/training/trainer/RowCutOutTrainer.py b/autoPyTorch/pipeline/components/training/trainer/RowCutOutTrainer.py index d7bd23f4e..e04728f4b 100644 --- a/autoPyTorch/pipeline/components/training/trainer/RowCutOutTrainer.py +++ b/autoPyTorch/pipeline/components/training/trainer/RowCutOutTrainer.py @@ -2,7 +2,7 @@ import numpy as np -import torch +# import torch from autoPyTorch.pipeline.components.training.trainer.base_trainer import BaseTrainerComponent from autoPyTorch.pipeline.components.training.trainer.cutout_utils import CutOut @@ -40,14 +40,15 @@ def data_preparation(self, X: np.ndarray, y: np.ndarray, indices = self.random_state.choice(range(size), max(1, np.int32(size * self.patch_ratio)), replace=False) - """if not isinstance(self.numerical_columns, typing.Iterable): + """ + if not isinstance(self.numerical_columns, typing.Iterable): raise ValueError("{} requires numerical columns information of {}" "to prepare data got {}.".format(self.__class__.__name__, typing.Iterable, self.numerical_columns)) numerical_indices = torch.tensor(self.numerical_columns) categorical_indices = torch.tensor([index for index in indices if index not in self.numerical_columns]) - + # We use an ordinal encoder on the categorical columns of tabular data # -1 is the conceptual equivalent to 0 in a image, that does not # have color as a feature and hence the network has to learn to deal diff --git a/autoPyTorch/pipeline/components/training/trainer/base_trainer.py b/autoPyTorch/pipeline/components/training/trainer/base_trainer.py index 6040f32e9..85ba39c04 100644 --- a/autoPyTorch/pipeline/components/training/trainer/base_trainer.py +++ b/autoPyTorch/pipeline/components/training/trainer/base_trainer.py @@ -221,8 +221,6 @@ def __init__(self, weighted_loss: int = 0, self.add_fit_requirements([ FitRequirement("is_cyclic_scheduler", (bool,), user_defined=False, dataset_property=False), ]) - self.batch_fit_times = [] - 
self.data_loading_times = [] def prepare( self, diff --git a/autoPyTorch/pipeline/tabular_classification.py b/autoPyTorch/pipeline/tabular_classification.py index d19fc7215..c1901eb26 100644 --- a/autoPyTorch/pipeline/tabular_classification.py +++ b/autoPyTorch/pipeline/tabular_classification.py @@ -7,7 +7,6 @@ import numpy as np -import sklearn.preprocessing from sklearn.base import ClassifierMixin import torch @@ -91,7 +90,7 @@ def _predict_proba(self, X: np.ndarray) -> np.ndarray: loader = self.named_steps['data_loader'].get_loader(X=X) pred = self.named_steps['network'].predict(loader) if isinstance(self.dataset_properties['output_shape'], int): - return pred + return pred else: all_proba = [] @@ -140,11 +139,6 @@ def predict_proba(self, X: np.ndarray, batch_size: Optional[int] = None) -> np.n pred_prob = self.predict_proba(X[batch_from:batch_to], batch_size=None) y[batch_from:batch_to] = pred_prob.astype(np.float32) - # Neural networks might not be fit to produce a [0-1] output - # For instance, after small number of epochs. - # y = np.clip(y, 0, 1) - # y = sklearn.preprocessing.normalize(y, axis=1, norm='l1') - return y def _get_hyperparameter_search_space(self, diff --git a/autoPyTorch/utils/backend.py b/autoPyTorch/utils/backend.py index 7a7399a9f..667e6abd9 100644 --- a/autoPyTorch/utils/backend.py +++ b/autoPyTorch/utils/backend.py @@ -328,7 +328,7 @@ def load_datamanager(self) -> BaseDataset: with open(filepath, 'rb') as fh: return pickle.load(fh) - def replace_datamanager(self, datamanager: BaseDataset): + def replace_datamanager(self, datamanager: BaseDataset) -> None: warnings.warn("Original dataset will be overwritten with the provided dataset") os.remove(self._get_datamanager_pickle_filename()) self.save_datamanager(datamanager=datamanager) From 6bd43004d8f2c0ef555690c4155f826264174d03 Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Mon, 4 Oct 2021 12:19:25 +0200 Subject: [PATCH 19/54] Address comments from PR #286 --- autoPyTorch/datasets/base_dataset.py | 6 ++-- autoPyTorch/pipeline/base_pipeline.py | 30 +++++++++---------- .../training/data_loader/base_data_loader.py | 2 +- autoPyTorch/utils/backend.py | 12 +++++++- 4 files changed, 30 insertions(+), 20 deletions(-) diff --git a/autoPyTorch/datasets/base_dataset.py b/autoPyTorch/datasets/base_dataset.py index cf67e1a95..c1b09d4a9 100644 --- a/autoPyTorch/datasets/base_dataset.py +++ b/autoPyTorch/datasets/base_dataset.py @@ -322,7 +322,7 @@ def create_holdout_val_split( self.random_state, val_share, self._get_indices(), **kwargs) return train, val - def get_dataset_for_training(self, split_id: int, train: bool, subset: int = 0) -> Dataset: + def get_dataset_for_training(self, split_id: int, train: bool) -> Dataset: """ The above split methods employ the Subset to internally subsample the whole dataset. @@ -331,8 +331,7 @@ def get_dataset_for_training(self, split_id: int, train: bool, subset: int = 0) Args: split_id (int): which split id to get from the splits - train (bool): whether the train or valid transforms are to be applied - subset (int, default=0): 0 is for train_indices, 1 is for valid_indices + train (bool): whether the dataset is required for training or evaluating. Returns: @@ -340,6 +339,7 @@ def get_dataset_for_training(self, split_id: int, train: bool, subset: int = 0) """ # Subset creates a dataset. 
Splits is a (train_indices, test_indices) tuple assert split_id <= len(self.splits), "Expected split id to be less than length of splits" + subset = int(not train) indices = self.splits[split_id][subset] assert indices is not None, "Trying to get subset when it does not exist" return TransformSubset(self, indices, train=train) diff --git a/autoPyTorch/pipeline/base_pipeline.py b/autoPyTorch/pipeline/base_pipeline.py index 4697345f4..205da414a 100644 --- a/autoPyTorch/pipeline/base_pipeline.py +++ b/autoPyTorch/pipeline/base_pipeline.py @@ -21,8 +21,9 @@ get_match_array ) from autoPyTorch.utils.common import FitRequirement -from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates - +from autoPyTorch.utils.hyperparameter_search_space_update import ( + HyperparameterSearchSpaceUpdates +) class BasePipeline(Pipeline): """Base class for all pipeline objects. @@ -425,7 +426,7 @@ def _check_search_space_updates(self, include: Optional[Dict[str, Any]], if choice in exclude[update.node_name]: raise ValueError("Found {} in exclude".format(choice)) if choice not in components.keys(): - raise ValueError("Unknown hyperparameter for choice {}. " + raise ValueError("Unknown component choice for node {}. " "Expected update hyperparameter " "to be in {} got {}".format(node.__class__.__name__, components.keys(), choice)) @@ -433,8 +434,8 @@ def _check_search_space_updates(self, include: Optional[Dict[str, Any]], # needs to be updated is in components of the # choice module elif split_hyperparameter[0] not in components.keys(): - raise ValueError("Unknown hyperparameter for choice {}. " - "Expected update hyperparameter " + raise ValueError("Unknown component choice for node {}. " + "Expected update component " "to be in {} got {}".format(node.__class__.__name__, components.keys(), split_hyperparameter[0])) else: @@ -449,15 +450,14 @@ def _check_search_space_updates(self, include: Optional[Dict[str, Any]], component.get_hyperparameter_search_space( dataset_properties=self.dataset_properties).get_hyperparameter_names()]): continue - raise ValueError("Unknown hyperparameter for component {}. " - "Expected update hyperparameter " - "to be in {} got {}." - " component is {}".format(node.__class__.__name__, - component.get_hyperparameter_search_space( - dataset_properties=self.dataset_properties - ).get_hyperparameter_names(), - split_hyperparameter[1], - component.__name__) + raise ValueError("Unknown hyperparameter for component {} of node {}. Expected update hyperparameter " + "to be in {} got {}.".format(component.__name__, + node.__class__.__name__, + component.get_hyperparameter_search_space( + dataset_properties=self.dataset_properties + ).get_hyperparameter_names(), + split_hyperparameter[1] + ) ) else: if update.hyperparameter not in node.get_hyperparameter_search_space( @@ -466,7 +466,7 @@ def _check_search_space_updates(self, include: Optional[Dict[str, Any]], node.get_hyperparameter_search_space( dataset_properties=self.dataset_properties).get_hyperparameter_names()]): continue - raise ValueError("Unknown hyperparameter for component {}. " + raise ValueError("Unknown hyperparameter for node {}. " "Expected update hyperparameter " "to be in {} got {}".format(node.__class__.__name__, node. 
diff --git a/autoPyTorch/pipeline/components/training/data_loader/base_data_loader.py b/autoPyTorch/pipeline/components/training/data_loader/base_data_loader.py index 7302ac6f5..15d568002 100644 --- a/autoPyTorch/pipeline/components/training/data_loader/base_data_loader.py +++ b/autoPyTorch/pipeline/components/training/data_loader/base_data_loader.py @@ -120,7 +120,7 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> torch.utils.data.DataLoader: ) if X['val_indices'] is not None: - val_dataset = datamanager.get_dataset_for_training(split_id=X['split_id'], train=False, subset=1) + val_dataset = datamanager.get_dataset_for_training(split_id=X['split_id'], train=False) self.val_data_loader = torch.utils.data.DataLoader( val_dataset, batch_size=min(self.batch_size, len(val_dataset)), diff --git a/autoPyTorch/utils/backend.py b/autoPyTorch/utils/backend.py index 667e6abd9..8baba0367 100644 --- a/autoPyTorch/utils/backend.py +++ b/autoPyTorch/utils/backend.py @@ -17,6 +17,7 @@ from autoPyTorch.ensemble.abstract_ensemble import AbstractEnsemble from autoPyTorch.pipeline.base_pipeline import BasePipeline from autoPyTorch.utils.logging_ import PicklableClientLogger, get_named_client_logger +from torch.utils import data __all__ = [ 'Backend' @@ -329,8 +330,17 @@ def load_datamanager(self) -> BaseDataset: return pickle.load(fh) def replace_datamanager(self, datamanager: BaseDataset) -> None: + """ + This function is called to replace the old datamanager with a datamanager + in case it is required. + + Args: + datamanager (BaseDataset): the new datamanager to replace the old. + """ warnings.warn("Original dataset will be overwritten with the provided dataset") - os.remove(self._get_datamanager_pickle_filename()) + datamanager_pickle_file = self._get_datamanager_pickle_filename() + if os.path.exists(datamanager_pickle_file): + os.remove(datamanager_pickle_file) self.save_datamanager(datamanager=datamanager) def get_runs_directory(self) -> str: From 9c0c47b2af7226be2f2e910b271a51fe98e97089 Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Mon, 4 Oct 2021 12:22:23 +0200 Subject: [PATCH 20/54] fix bug in embedding --- .../base_network_embedding.py | 42 +++++++++---------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py b/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py index 18028cddd..d516c4e84 100644 --- a/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py +++ b/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py @@ -1,5 +1,5 @@ # import copy -from typing import Any, Dict, Optional # , Tuple +from typing import Any, Dict, Optional, Tuple import numpy as np @@ -30,23 +30,23 @@ def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: def build_embedding(self, num_input_features: np.ndarray, num_numerical_features: int) -> nn.Module: raise NotImplementedError - # - # def _get_args(self, X: Dict[str, Any]) -> Tuple[int, np.ndarray]: - # # Feature preprocessors can alter numerical columns - # # if len(X['dataset_properties']['numerical_columns']) == 0: - # # num_numerical_columns = 0 - # # else: - # # X_train = copy.deepcopy(X['backend'].load_datamanager().train_tensors[0][:2]) - # # - # # numerical_column_transformer = X['tabular_transformer'].preprocessor. 
\ - # # named_transformers_['numerical_pipeline'] - # # num_numerical_columns = numerical_column_transformer.transform( - # # X_train[:, X['dataset_properties']['numerical_columns']]).shape[1] - # # num_input_features = np.zeros((num_numerical_columns + len(X['dataset_properties']['categorical_columns'])), - # # dtype=int) - # # categories = X['dataset_properties']['categories'] - # # - # # for i, category in enumerate(categories): - # # num_input_features[num_numerical_columns + i, ] = len(category) - # # return num_numerical_columns, num_input_features - # return None, None + + def _get_args(self, X: Dict[str, Any]) -> Tuple[None, None]: # Tuple[int, np.ndarray]: + # Feature preprocessors can alter numerical columns + # if len(X['dataset_properties']['numerical_columns']) == 0: + # num_numerical_columns = 0 + # else: + # X_train = copy.deepcopy(X['backend'].load_datamanager().train_tensors[0][:2]) + # + # numerical_column_transformer = X['tabular_transformer'].preprocessor. \ + # named_transformers_['numerical_pipeline'] + # num_numerical_columns = numerical_column_transformer.transform( + # X_train[:, X['dataset_properties']['numerical_columns']]).shape[1] + # num_input_features = np.zeros((num_numerical_columns + len(X['dataset_properties']['categorical_columns'])), + # dtype=int) + # categories = X['dataset_properties']['categories'] + # + # for i, category in enumerate(categories): + # num_input_features[num_numerical_columns + i, ] = len(category) + # return num_numerical_columns, num_input_features + return None, None From e83800451c48e1bb56aa2f8eb4b793f7d9cd5651 Mon Sep 17 00:00:00 2001 From: Ravin Kohli <13005107+ravinkohli@users.noreply.github.com> Date: Mon, 4 Oct 2021 12:24:48 +0200 Subject: [PATCH 21/54] Update autoPyTorch/api/tabular_classification.py Co-authored-by: nabenabe0928 <47781922+nabenabe0928@users.noreply.github.com> --- autoPyTorch/api/tabular_classification.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/autoPyTorch/api/tabular_classification.py b/autoPyTorch/api/tabular_classification.py index ae2d53ef9..06d2aacb1 100644 --- a/autoPyTorch/api/tabular_classification.py +++ b/autoPyTorch/api/tabular_classification.py @@ -275,7 +275,8 @@ def search( y_test=y_test, dataset_name=dataset_name) - assert self.dataset is not None, "Something went wrong, expected dataset to be initialised" + if self.dataset is None: + raise ValueError("`dataset` in {} must be initialized, but got None".format(self.__class__.__name__)) return self._search( dataset=self.dataset, optimize_metric=optimize_metric, From 893a15dca0bba22fa34a08c88c5a7fe78c4ca074 Mon Sep 17 00:00:00 2001 From: Ravin Kohli <13005107+ravinkohli@users.noreply.github.com> Date: Mon, 4 Oct 2021 12:25:13 +0200 Subject: [PATCH 22/54] Update autoPyTorch/datasets/base_dataset.py Co-authored-by: nabenabe0928 <47781922+nabenabe0928@users.noreply.github.com> --- autoPyTorch/datasets/base_dataset.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/autoPyTorch/datasets/base_dataset.py b/autoPyTorch/datasets/base_dataset.py index c1b09d4a9..db18f0315 100644 --- a/autoPyTorch/datasets/base_dataset.py +++ b/autoPyTorch/datasets/base_dataset.py @@ -338,7 +338,8 @@ def get_dataset_for_training(self, split_id: int, train: bool) -> Dataset: Dataset: the reduced dataset to be used for testing """ # Subset creates a dataset. 
Splits is a (train_indices, test_indices) tuple - assert split_id <= len(self.splits), "Expected split id to be less than length of splits" + if split_id >= len(self.splits): # old version: split_id > len(self.splits) + raise IndexError("split_id out of range, got split_id={} (>= num_splits={})".format(split_id, len(self.splits))) subset = int(not train) indices = self.splits[split_id][subset] assert indices is not None, "Trying to get subset when it does not exist" From ed0602c9888d04ad6ebc3f4b26fc8841f4ee0306 Mon Sep 17 00:00:00 2001 From: Ravin Kohli <13005107+ravinkohli@users.noreply.github.com> Date: Mon, 4 Oct 2021 12:25:20 +0200 Subject: [PATCH 23/54] Update autoPyTorch/datasets/base_dataset.py Co-authored-by: nabenabe0928 <47781922+nabenabe0928@users.noreply.github.com> --- autoPyTorch/datasets/base_dataset.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/autoPyTorch/datasets/base_dataset.py b/autoPyTorch/datasets/base_dataset.py index db18f0315..22163031c 100644 --- a/autoPyTorch/datasets/base_dataset.py +++ b/autoPyTorch/datasets/base_dataset.py @@ -342,7 +342,8 @@ def get_dataset_for_training(self, split_id: int, train: bool) -> Dataset: raise IndexError("split_id out of range, got split_id={} (>= num_splits={})".format(split_id, len(self.splits))) subset = int(not train) indices = self.splits[split_id][subset] - assert indices is not None, "Trying to get subset when it does not exist" + if indices is None: + raise ValueError("Specified fold (or subset) does not exist") return TransformSubset(self, indices, train=train) def replace_data(self, X_train: BaseDatasetInputType, From 224c69ea1cb42f160c41a6808fe655444c5d01a2 Mon Sep 17 00:00:00 2001 From: Ravin Kohli <13005107+ravinkohli@users.noreply.github.com> Date: Mon, 4 Oct 2021 12:25:46 +0200 Subject: [PATCH 24/54] Update autoPyTorch/pipeline/components/training/trainer/base_trainer.py Co-authored-by: nabenabe0928 <47781922+nabenabe0928@users.noreply.github.com> --- .../pipeline/components/training/trainer/base_trainer.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/autoPyTorch/pipeline/components/training/trainer/base_trainer.py b/autoPyTorch/pipeline/components/training/trainer/base_trainer.py index 85ba39c04..60bf7a69b 100644 --- a/autoPyTorch/pipeline/components/training/trainer/base_trainer.py +++ b/autoPyTorch/pipeline/components/training/trainer/base_trainer.py @@ -318,11 +318,9 @@ def on_epoch_end(self, X: Dict[str, Any], epoch: int) -> bool: if self.use_snapshot_ensemble: assert self.model_snapshots is not None, "model snapshots container can't be " \ "none when snapshot ensembling is enabled" - if epoch == self.budget_tracker.max_epochs: - if self.use_stochastic_weight_averaging: - model_copy = deepcopy(self.swa_model) - else: - model_copy = deepcopy(self.model) + is_last_epoch = (epoch == self.budget_tracker.max_epochs) + if is_last_epoch and self.use_stochastic_weight_averaging: + model_copy = deepcopy(self.swa_model) else: model_copy = deepcopy(self.model) From e61c1a31eea82519e16a2024eb808a3fb1633b05 Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Mon, 4 Oct 2021 12:28:37 +0200 Subject: [PATCH 25/54] Address comments from shuhei --- autoPyTorch/api/tabular_regression.py | 3 ++- autoPyTorch/pipeline/tabular_classification.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/autoPyTorch/api/tabular_regression.py b/autoPyTorch/api/tabular_regression.py index 0236d861f..8742549af 100644 --- a/autoPyTorch/api/tabular_regression.py +++ 
b/autoPyTorch/api/tabular_regression.py @@ -261,7 +261,8 @@ def search( y_test=y_test, dataset_name=dataset_name) - assert self.dataset is not None, "Something went wrong, expected dataset to be initialised" + if self.dataset is None: + raise ValueError("`dataset` in {} must be initialized, but got None".format(self.__class__.__name__)) return self._search( dataset=self.dataset, optimize_metric=optimize_metric, diff --git a/autoPyTorch/pipeline/tabular_classification.py b/autoPyTorch/pipeline/tabular_classification.py index c1901eb26..b059c783c 100644 --- a/autoPyTorch/pipeline/tabular_classification.py +++ b/autoPyTorch/pipeline/tabular_classification.py @@ -102,7 +102,7 @@ def _predict_proba(self, X: np.ndarray) -> np.ndarray: proba_k /= normalizer all_proba.append(proba_k) - return all_proba + return np.ndarray(all_proba) def predict_proba(self, X: np.ndarray, batch_size: Optional[int] = None) -> np.ndarray: """predict_proba. From 3d47afa6d9afe9b5a5af27b10e971f07e2a45ec4 Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Mon, 4 Oct 2021 15:50:39 +0200 Subject: [PATCH 26/54] adress comments from shuhei --- .../training/trainer/AdversarialTrainer.py | 2 +- .../training/trainer/RowCutMixTrainer.py | 15 ++++----- .../training/trainer/RowCutOutTrainer.py | 33 ++++--------------- 3 files changed, 14 insertions(+), 36 deletions(-) diff --git a/autoPyTorch/pipeline/components/training/trainer/AdversarialTrainer.py b/autoPyTorch/pipeline/components/training/trainer/AdversarialTrainer.py index 36d586919..709ee197f 100644 --- a/autoPyTorch/pipeline/components/training/trainer/AdversarialTrainer.py +++ b/autoPyTorch/pipeline/components/training/trainer/AdversarialTrainer.py @@ -150,7 +150,7 @@ def get_properties(dataset_properties: Optional[Dict[str, Any]] = None 'shortname': 'AdversarialTrainer', 'name': 'AdversarialTrainer', 'handles_tabular': True, - 'handles_image': False, + 'handles_image': True, 'handles_time_series': False, } diff --git a/autoPyTorch/pipeline/components/training/trainer/RowCutMixTrainer.py b/autoPyTorch/pipeline/components/training/trainer/RowCutMixTrainer.py index f1b606046..53500741b 100644 --- a/autoPyTorch/pipeline/components/training/trainer/RowCutMixTrainer.py +++ b/autoPyTorch/pipeline/components/training/trainer/RowCutMixTrainer.py @@ -1,4 +1,4 @@ -import typing +from typing import Any, Dict, Optional, Tuple, Union import numpy as np @@ -11,7 +11,7 @@ class RowCutMixTrainer(MixUp, BaseTrainerComponent): def data_preparation(self, X: np.ndarray, y: np.ndarray, - ) -> typing.Tuple[np.ndarray, typing.Dict[str, np.ndarray]]: + ) -> Tuple[np.ndarray, Dict[str, np.ndarray]]: """ Depending on the trainer choice, data fed to the network might be pre-processed on a different way. 
That is, in standard training we provide the data to the @@ -28,29 +28,28 @@ def data_preparation(self, X: np.ndarray, y: np.ndarray, """ beta = 1.0 lam = self.random_state.beta(beta, beta) - batch_size = X.size()[0] + batch_size, n_columns = np.shape(X) index = torch.randperm(batch_size).cuda() if X.is_cuda else torch.randperm(batch_size) r = self.random_state.rand(1) if beta <= 0 or r > self.alpha: return X, {'y_a': y, 'y_b': y[index], 'lam': 1} - size = X.shape[1] - indices = torch.tensor(self.random_state.choice(range(size), max(1, np.int32(size * lam)), + indices = torch.tensor(self.random_state.choice(range(batch_size), max(1, np.int32(n_columns * lam)), replace=False)) X[:, indices] = X[index, :][:, indices] # Adjust lam - lam = 1 - ((len(indices)) / (X.size()[1])) + lam = 1 - ((len(indices)) / (n_columns)) y_a, y_b = y, y[index] return X, {'y_a': y_a, 'y_b': y_b, 'lam': lam} @staticmethod - def get_properties(dataset_properties: typing.Optional[typing.Dict[str, typing.Any]] = None - ) -> typing.Dict[str, typing.Union[str, bool]]: + def get_properties(dataset_properties: Optional[Dict[str, Any]] = None + ) -> Dict[str, Union[str, bool]]: return { 'shortname': 'RowCutMixTrainer', 'name': 'MixUp Regularized with Cutoff Tabular Trainer', diff --git a/autoPyTorch/pipeline/components/training/trainer/RowCutOutTrainer.py b/autoPyTorch/pipeline/components/training/trainer/RowCutOutTrainer.py index e04728f4b..fffc35476 100644 --- a/autoPyTorch/pipeline/components/training/trainer/RowCutOutTrainer.py +++ b/autoPyTorch/pipeline/components/training/trainer/RowCutOutTrainer.py @@ -1,19 +1,15 @@ -import typing +from typing import Any, Dict, Optional, Tuple, Union import numpy as np -# import torch - from autoPyTorch.pipeline.components.training.trainer.base_trainer import BaseTrainerComponent from autoPyTorch.pipeline.components.training.trainer.cutout_utils import CutOut class RowCutOutTrainer(CutOut, BaseTrainerComponent): - NUMERICAL_VALUE = 0 - CATEGORICAL_VALUE = -1 def data_preparation(self, X: np.ndarray, y: np.ndarray, - ) -> typing.Tuple[np.ndarray, typing.Dict[str, np.ndarray]]: + ) -> Tuple[np.ndarray, Dict[str, np.ndarray]]: """ Depending on the trainer choice, data fed to the network might be pre-processed on a different way. That is, in standard training we provide the data to the @@ -26,7 +22,7 @@ def data_preparation(self, X: np.ndarray, y: np.ndarray, Returns: np.ndarray: that processes data - typing.Dict[str, np.ndarray]: arguments to the criterion function + Dict[str, np.ndarray]: arguments to the criterion function """ r = self.random_state.rand(1) @@ -36,27 +32,10 @@ def data_preparation(self, X: np.ndarray, y: np.ndarray, lam = 1 return X, {'y_a': y_a, 'y_b': y_b, 'lam': lam} - size = X.shape[1] + size: int = np.shape(X)[1] indices = self.random_state.choice(range(size), max(1, np.int32(size * self.patch_ratio)), replace=False) - """ - if not isinstance(self.numerical_columns, typing.Iterable): - raise ValueError("{} requires numerical columns information of {}" - "to prepare data got {}.".format(self.__class__.__name__, - typing.Iterable, - self.numerical_columns)) - numerical_indices = torch.tensor(self.numerical_columns) - categorical_indices = torch.tensor([index for index in indices if index not in self.numerical_columns]) - - # We use an ordinal encoder on the categorical columns of tabular data - # -1 is the conceptual equivalent to 0 in a image, that does not - # have color as a feature and hence the network has to learn to deal - # without this data. 
For numerical columns we use 0 to cutout the features - # similar to the effect that setting 0 as a pixel value in an image. - X[:, categorical_indices.long()] = self.CATEGORICAL_VALUE - X[:, numerical_indices.long()] = self.NUMERICAL_VALUE - """ X[:, indices] = 0 lam = 1 y_a = y @@ -64,8 +43,8 @@ def data_preparation(self, X: np.ndarray, y: np.ndarray, return X, {'y_a': y_a, 'y_b': y_b, 'lam': lam} @staticmethod - def get_properties(dataset_properties: typing.Optional[typing.Dict[str, typing.Any]] = None - ) -> typing.Dict[str, typing.Union[str, bool]]: + def get_properties(dataset_properties: Optional[Dict[str, Any]] = None + ) -> Dict[str, Union[str, bool]]: return { 'shortname': 'RowCutOutTrainer', 'name': 'RowCutOutTrainer', From b41734617e014be1fff3cd28765ddcb274cf116c Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Mon, 4 Oct 2021 15:56:12 +0200 Subject: [PATCH 27/54] fix flake and mypy --- autoPyTorch/api/tabular_classification.py | 2 +- autoPyTorch/datasets/base_dataset.py | 5 +++-- autoPyTorch/pipeline/base_pipeline.py | 4 +++- .../setup/network_embedding/base_network_embedding.py | 2 +- autoPyTorch/utils/backend.py | 5 ++--- 5 files changed, 10 insertions(+), 8 deletions(-) diff --git a/autoPyTorch/api/tabular_classification.py b/autoPyTorch/api/tabular_classification.py index 06d2aacb1..7be504f6d 100644 --- a/autoPyTorch/api/tabular_classification.py +++ b/autoPyTorch/api/tabular_classification.py @@ -276,7 +276,7 @@ def search( dataset_name=dataset_name) if self.dataset is None: - raise ValueError("`dataset` in {} must be initialized, but got None".format(self.__class__.__name__)) + raise ValueError("`dataset` in {} must be initialized, but got None".format(self.__class__.__name__)) return self._search( dataset=self.dataset, optimize_metric=optimize_metric, diff --git a/autoPyTorch/datasets/base_dataset.py b/autoPyTorch/datasets/base_dataset.py index 22163031c..f041be5ec 100644 --- a/autoPyTorch/datasets/base_dataset.py +++ b/autoPyTorch/datasets/base_dataset.py @@ -339,11 +339,12 @@ def get_dataset_for_training(self, split_id: int, train: bool) -> Dataset: """ # Subset creates a dataset. Splits is a (train_indices, test_indices) tuple if split_id >= len(self.splits): # old version: split_id > len(self.splits) - raise IndexError("split_id out of range, got split_id={} (>= num_splits={})".format(split_id, len(self.splits))) + raise IndexError("split_id out of range, got split_id={}" + " (>= num_splits={})".format(split_id, len(self.splits))) subset = int(not train) indices = self.splits[split_id][subset] if indices is None: - raise ValueError("Specified fold (or subset) does not exist") + raise ValueError("Specified fold (or subset) does not exist") return TransformSubset(self, indices, train=train) def replace_data(self, X_train: BaseDatasetInputType, diff --git a/autoPyTorch/pipeline/base_pipeline.py b/autoPyTorch/pipeline/base_pipeline.py index 205da414a..51db438e8 100644 --- a/autoPyTorch/pipeline/base_pipeline.py +++ b/autoPyTorch/pipeline/base_pipeline.py @@ -25,6 +25,7 @@ HyperparameterSearchSpaceUpdates ) + class BasePipeline(Pipeline): """Base class for all pipeline objects. Notes @@ -450,7 +451,8 @@ def _check_search_space_updates(self, include: Optional[Dict[str, Any]], component.get_hyperparameter_search_space( dataset_properties=self.dataset_properties).get_hyperparameter_names()]): continue - raise ValueError("Unknown hyperparameter for component {} of node {}. 
Expected update hyperparameter " + raise ValueError("Unknown hyperparameter for component {} of node {}." + " Expected update hyperparameter " "to be in {} got {}.".format(component.__name__, node.__class__.__name__, component.get_hyperparameter_search_space( diff --git a/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py b/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py index d516c4e84..6feac0fba 100644 --- a/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py +++ b/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py @@ -30,7 +30,7 @@ def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: def build_embedding(self, num_input_features: np.ndarray, num_numerical_features: int) -> nn.Module: raise NotImplementedError - + def _get_args(self, X: Dict[str, Any]) -> Tuple[None, None]: # Tuple[int, np.ndarray]: # Feature preprocessors can alter numerical columns # if len(X['dataset_properties']['numerical_columns']) == 0: diff --git a/autoPyTorch/utils/backend.py b/autoPyTorch/utils/backend.py index 8baba0367..85160af42 100644 --- a/autoPyTorch/utils/backend.py +++ b/autoPyTorch/utils/backend.py @@ -17,7 +17,6 @@ from autoPyTorch.ensemble.abstract_ensemble import AbstractEnsemble from autoPyTorch.pipeline.base_pipeline import BasePipeline from autoPyTorch.utils.logging_ import PicklableClientLogger, get_named_client_logger -from torch.utils import data __all__ = [ 'Backend' @@ -331,8 +330,8 @@ def load_datamanager(self) -> BaseDataset: def replace_datamanager(self, datamanager: BaseDataset) -> None: """ - This function is called to replace the old datamanager with a datamanager - in case it is required. + This function is called to replace the old datamanager with a datamanager + in case it is required. Args: datamanager (BaseDataset): the new datamanager to replace the old. 
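Several patches in this series iterate on the row-wise CutMix and CutOut logic of the tabular trainers (pick a random subset of columns, swap or zero them, and re-derive the mixing coefficient). For orientation, a rough, self-contained NumPy sketch of that column-swap step follows; it is illustrative only, not the project's trainer API, and the name `row_cutmix` and its signature are made up here.

# Standalone sketch of row-wise CutMix for a tabular batch (illustration only).
import numpy as np

def row_cutmix(X, y, rng=None):
    if rng is None:
        rng = np.random.RandomState(0)
    batch_size, n_columns = X.shape
    lam = rng.beta(1.0, 1.0)                   # initial mixing coefficient
    shuffled = rng.permutation(batch_size)     # row pairing within the batch

    # swap a random subset of columns with the values from the shuffled rows
    n_cut = max(1, int(n_columns * lam))
    cut_cols = rng.choice(n_columns, n_cut, replace=False)
    X_mixed = X.copy()
    X_mixed[:, cut_cols] = X[shuffled][:, cut_cols]

    # exactly lam * n_columns columns cannot always be cut, so re-derive lam
    # from the fraction of columns that were left untouched
    lam = 1 - n_cut / n_columns
    return X_mixed, {'y_a': y, 'y_b': y[shuffled], 'lam': lam}

The returned y_a, y_b and lam are then typically combined in the loss as lam * loss(pred, y_a) + (1 - lam) * loss(pred, y_b). RowCutOutTrainer is the analogous variant that zeroes the selected columns instead of copying them from the shuffled rows.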
From 23541592261cd35234f838e1c89753d9ec621fdb Mon Sep 17 00:00:00 2001 From: Ravin Kohli <13005107+ravinkohli@users.noreply.github.com> Date: Mon, 4 Oct 2021 16:12:53 +0200 Subject: [PATCH 28/54] Update autoPyTorch/pipeline/components/training/trainer/RowCutMixTrainer.py Co-authored-by: nabenabe0928 <47781922+nabenabe0928@users.noreply.github.com> --- .../pipeline/components/training/trainer/RowCutMixTrainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/autoPyTorch/pipeline/components/training/trainer/RowCutMixTrainer.py b/autoPyTorch/pipeline/components/training/trainer/RowCutMixTrainer.py index 53500741b..6b4f7b343 100644 --- a/autoPyTorch/pipeline/components/training/trainer/RowCutMixTrainer.py +++ b/autoPyTorch/pipeline/components/training/trainer/RowCutMixTrainer.py @@ -41,7 +41,7 @@ def data_preparation(self, X: np.ndarray, y: np.ndarray, X[:, indices] = X[index, :][:, indices] # Adjust lam - lam = 1 - ((len(indices)) / (n_columns)) + lam = 1 - (len(indices) / n_columns) y_a, y_b = y, y[index] From 7e59f4de8564a36b63f7cae0281e45836d50e3a0 Mon Sep 17 00:00:00 2001 From: Ravin Kohli <13005107+ravinkohli@users.noreply.github.com> Date: Mon, 4 Oct 2021 16:13:19 +0200 Subject: [PATCH 29/54] Update autoPyTorch/pipeline/tabular_classification.py Co-authored-by: nabenabe0928 <47781922+nabenabe0928@users.noreply.github.com> --- autoPyTorch/pipeline/tabular_classification.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/autoPyTorch/pipeline/tabular_classification.py b/autoPyTorch/pipeline/tabular_classification.py index b059c783c..926d6308c 100644 --- a/autoPyTorch/pipeline/tabular_classification.py +++ b/autoPyTorch/pipeline/tabular_classification.py @@ -102,7 +102,7 @@ def _predict_proba(self, X: np.ndarray) -> np.ndarray: proba_k /= normalizer all_proba.append(proba_k) - return np.ndarray(all_proba) + return np.array(all_proba) def predict_proba(self, X: np.ndarray, batch_size: Optional[int] = None) -> np.ndarray: """predict_proba. 
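A short note on the np.ndarray to np.array change in the hunk above: np.ndarray is the low-level array constructor and interprets its first positional argument as a shape, so it never builds an array from the given data, whereas np.array does. A quick illustration:

import numpy as np

np.array([2, 3])    # array([2, 3]): an array built from the data
np.ndarray([2, 3])  # an *uninitialised* array of shape (2, 3): the list is read as a shape

Returning np.array(all_proba) therefore converts the collected probability arrays into a single ndarray, which is what callers of _predict_proba expect.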
From 7ab5d267e576042328207e42d9f6e51497b641aa Mon Sep 17 00:00:00 2001 From: Ravin Kohli <13005107+ravinkohli@users.noreply.github.com> Date: Mon, 4 Oct 2021 16:48:30 +0200 Subject: [PATCH 30/54] Update autoPyTorch/pipeline/components/setup/network_backbone/utils.py Co-authored-by: nabenabe0928 <47781922+nabenabe0928@users.noreply.github.com> --- .../pipeline/components/setup/network_backbone/utils.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/utils.py b/autoPyTorch/pipeline/components/setup/network_backbone/utils.py index d10d15dca..609c364aa 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/utils.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/utils.py @@ -116,8 +116,10 @@ def shake_get_alpha_beta( elif method in ['shake-even', 'even-even']: beta = torch.FloatTensor([0.5]) elif method == 'M3': + # Table 4 in the paper `Shake-Shake regularization` + rnd = torch.rand(1) beta = torch.FloatTensor( - [torch.rand(1) * (0.5 - alpha) * alpha if alpha < 0.5 else torch.rand(1) * (alpha - 0.5) * alpha] + [rnd * (0.5 - alpha) + alpha if alpha < 0.5 else rnd * (alpha - 0.5) + 0.5] ) else: raise ValueError("Unknown method for ShakeShakeRegularisation in NetworkBackbone") From 0032834f41f698eee6514e308f9b3d64d37dfe48 Mon Sep 17 00:00:00 2001 From: Ravin Kohli <13005107+ravinkohli@users.noreply.github.com> Date: Mon, 4 Oct 2021 17:47:43 +0200 Subject: [PATCH 31/54] Update autoPyTorch/pipeline/components/setup/network_backbone/utils.py Co-authored-by: nabenabe0928 <47781922+nabenabe0928@users.noreply.github.com> --- autoPyTorch/pipeline/components/setup/network_backbone/utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/utils.py b/autoPyTorch/pipeline/components/setup/network_backbone/utils.py index 609c364aa..b0675eb14 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/utils.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/utils.py @@ -100,6 +100,7 @@ def shake_get_alpha_beta( """ The methods used in this function have been introduced in 'ShakeShake Regularisation' https://arxiv.org/abs/1705.07485. The names have been taken from the paper as well. 
+ Currently, this function supports `even-even`, `shake-even` and `shake-shake` """ if not is_training: result = (torch.FloatTensor([0.5]), torch.FloatTensor([0.5])) From 90ce40c07c6aa2f9cdef98d0801f0b511366f3b5 Mon Sep 17 00:00:00 2001 From: Ravin Kohli <13005107+ravinkohli@users.noreply.github.com> Date: Mon, 4 Oct 2021 17:47:59 +0200 Subject: [PATCH 32/54] Update autoPyTorch/pipeline/components/setup/network_backbone/utils.py Co-authored-by: nabenabe0928 <47781922+nabenabe0928@users.noreply.github.com> --- autoPyTorch/pipeline/components/setup/network_backbone/utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/utils.py b/autoPyTorch/pipeline/components/setup/network_backbone/utils.py index b0675eb14..fefe85e7a 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/utils.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/utils.py @@ -107,6 +107,7 @@ def shake_get_alpha_beta( return result if not is_cuda else (result[0].cuda(), result[1].cuda()) # TODO implement other update methods + # alpha is the weight ratio for the forward pass and beta is that for the backward pass if method == 'even-even': alpha = torch.FloatTensor([0.5]) else: From f51d2390ec23cf5c2b23b295689806496b411146 Mon Sep 17 00:00:00 2001 From: Ravin Kohli <13005107+ravinkohli@users.noreply.github.com> Date: Mon, 4 Oct 2021 17:49:16 +0200 Subject: [PATCH 33/54] Apply suggestions from code review Co-authored-by: nabenabe0928 <47781922+nabenabe0928@users.noreply.github.com> --- autoPyTorch/pipeline/base_pipeline.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/autoPyTorch/pipeline/base_pipeline.py b/autoPyTorch/pipeline/base_pipeline.py index 51db438e8..7d4fd17a9 100644 --- a/autoPyTorch/pipeline/base_pipeline.py +++ b/autoPyTorch/pipeline/base_pipeline.py @@ -429,7 +429,7 @@ def _check_search_space_updates(self, include: Optional[Dict[str, Any]], if choice not in components.keys(): raise ValueError("Unknown component choice for node {}. " "Expected update hyperparameter " - "to be in {} got {}".format(node.__class__.__name__, + "to be in {}, but got {}".format(node.__class__.__name__, components.keys(), choice)) # check if the component whose hyperparameter # needs to be updated is in components of the @@ -437,7 +437,7 @@ def _check_search_space_updates(self, include: Optional[Dict[str, Any]], elif split_hyperparameter[0] not in components.keys(): raise ValueError("Unknown component choice for node {}. " "Expected update component " - "to be in {} got {}".format(node.__class__.__name__, + "to be in {}, but got {}".format(node.__class__.__name__, components.keys(), split_hyperparameter[0])) else: # check if hyperparameter is in the search space of the component @@ -453,7 +453,7 @@ def _check_search_space_updates(self, include: Optional[Dict[str, Any]], continue raise ValueError("Unknown hyperparameter for component {} of node {}." " Expected update hyperparameter " - "to be in {} got {}.".format(component.__name__, + "to be in {}, but got {}.".format(component.__name__, node.__class__.__name__, component.get_hyperparameter_search_space( dataset_properties=self.dataset_properties @@ -470,7 +470,7 @@ def _check_search_space_updates(self, include: Optional[Dict[str, Any]], continue raise ValueError("Unknown hyperparameter for node {}. " "Expected update hyperparameter " - "to be in {} got {}".format(node.__class__.__name__, + "to be in {}, but got {}".format(node.__class__.__name__, node. 
get_hyperparameter_search_space( dataset_properties=self.dataset_properties). From 42e6b5ae631d0075b7a64e2dfd018ea0eb46ab55 Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Mon, 4 Oct 2021 22:57:21 +0200 Subject: [PATCH 34/54] increase threads_per_worker --- autoPyTorch/api/base_task.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/autoPyTorch/api/base_task.py b/autoPyTorch/api/base_task.py index 27b7cbbb1..189919013 100644 --- a/autoPyTorch/api/base_task.py +++ b/autoPyTorch/api/base_task.py @@ -412,7 +412,7 @@ def _create_dask_client(self) -> None: dask.distributed.LocalCluster( n_workers=self.n_jobs, processes=True, - threads_per_worker=1, + threads_per_worker=2, # We use the temporal directory to save the # dask workers, because deleting workers # more time than deleting backend directories From f79a4fc895aa18403bdcdc89226893918d9278a4 Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Tue, 5 Oct 2021 12:11:15 +0200 Subject: [PATCH 35/54] fix bug in rowcutmix --- .../pipeline/components/training/trainer/RowCutMixTrainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/autoPyTorch/pipeline/components/training/trainer/RowCutMixTrainer.py b/autoPyTorch/pipeline/components/training/trainer/RowCutMixTrainer.py index 6b4f7b343..9ab76ed59 100644 --- a/autoPyTorch/pipeline/components/training/trainer/RowCutMixTrainer.py +++ b/autoPyTorch/pipeline/components/training/trainer/RowCutMixTrainer.py @@ -35,7 +35,7 @@ def data_preparation(self, X: np.ndarray, y: np.ndarray, if beta <= 0 or r > self.alpha: return X, {'y_a': y, 'y_b': y[index], 'lam': 1} - indices = torch.tensor(self.random_state.choice(range(batch_size), max(1, np.int32(n_columns * lam)), + indices = torch.tensor(self.random_state.choice(range(n_columns), max(1, np.int32(n_columns * lam)), replace=False)) X[:, indices] = X[index, :][:, indices] From 6d9f99f3c71732e5a03be64505f32a63f2b5e6c9 Mon Sep 17 00:00:00 2001 From: Arlind Kadra Date: Fri, 8 Oct 2021 17:30:52 +0200 Subject: [PATCH 36/54] Enhancement for the tabular validator. 
(#291) * Initial try at an enhancement for the tabular validator * Adding a few type annotations * Fixing bugs in implementation * Adding wrongly deleted code part during rebase * Fix bug in _get_args * Fix bug in _get_args * Addressing Shuhei's comments * Address Shuhei's comments * Refactoring code * Refactoring code * Typos fix and additional comments * Replace nan in categoricals with simple imputer * Remove unused function * add comment * Update autoPyTorch/data/tabular_feature_validator.py Co-authored-by: nabenabe0928 <47781922+nabenabe0928@users.noreply.github.com> * Update autoPyTorch/data/tabular_feature_validator.py Co-authored-by: nabenabe0928 <47781922+nabenabe0928@users.noreply.github.com> * Adding unit test for only nall columns in the tabular feature categorical evaluator * fix bug in remove all nan columns * Bug fix for making tests run by arlind * fix flake errors in feature validator * made typing code uniform * Apply suggestions from code review Co-authored-by: nabenabe0928 <47781922+nabenabe0928@users.noreply.github.com> * address comments from shuhei * address comments from shuhei (2) Co-authored-by: Ravin Kohli Co-authored-by: Ravin Kohli <13005107+ravinkohli@users.noreply.github.com> Co-authored-by: nabenabe0928 <47781922+nabenabe0928@users.noreply.github.com> --- autoPyTorch/data/base_feature_validator.py | 73 +++-- autoPyTorch/data/base_target_validator.py | 40 +-- autoPyTorch/data/tabular_feature_validator.py | 269 ++++++++---------- test/test_data/test_feature_validator.py | 108 ++++++- 4 files changed, 290 insertions(+), 200 deletions(-) diff --git a/autoPyTorch/data/base_feature_validator.py b/autoPyTorch/data/base_feature_validator.py index 0106a3aa8..9ed46d6e6 100644 --- a/autoPyTorch/data/base_feature_validator.py +++ b/autoPyTorch/data/base_feature_validator.py @@ -1,5 +1,5 @@ import logging -import typing +from typing import List, Optional, Set, Tuple, Union import numpy as np @@ -12,8 +12,8 @@ from autoPyTorch.utils.logging_ import PicklableClientLogger -SUPPORTED_FEAT_TYPES = typing.Union[ - typing.List, +SUPPORTED_FEAT_TYPES = Union[ + List, pd.DataFrame, np.ndarray, scipy.sparse.bsr_matrix, @@ -35,43 +35,44 @@ class BaseFeatureValidator(BaseEstimator): List of the column types found by this estimator during fit. data_type (str): Class name of the data type provided during fit. - encoder (typing.Optional[BaseEstimator]) + encoder (Optional[BaseEstimator]) Host a encoder object if the data requires transformation (for example, if provided a categorical column in a pandas DataFrame) - enc_columns (typing.List[str]) + enc_columns (List[str]) List of columns that were encoded. 
""" def __init__(self, - logger: typing.Optional[typing.Union[PicklableClientLogger, logging.Logger - ]] = None, + logger: Optional[Union[PicklableClientLogger, logging.Logger + ] + ] = None, ) -> None: # Register types to detect unsupported data format changes - self.feat_type = None # type: typing.Optional[typing.List[str]] - self.data_type = None # type: typing.Optional[type] - self.dtypes = [] # type: typing.List[str] - self.column_order = [] # type: typing.List[str] + self.feat_type: Optional[List[str]] = None + self.data_type: Optional[type] = None + self.dtypes: List[str] = [] + self.column_order: List[str] = [] - self.encoder = None # type: typing.Optional[BaseEstimator] - self.enc_columns = [] # type: typing.List[str] + self.encoder: Optional[BaseEstimator] = None + self.enc_columns: List[str] = [] - self.logger: typing.Union[ + self.logger: Union[ PicklableClientLogger, logging.Logger ] = logger if logger is not None else logging.getLogger(__name__) # Required for dataset properties - self.num_features = None # type: typing.Optional[int] - self.categories = [] # type: typing.List[typing.List[int]] - self.categorical_columns: typing.List[int] = [] - self.numerical_columns: typing.List[int] = [] - # column identifiers may be integers or strings - self.null_columns: typing.Set[str] = set() + self.num_features: Optional[int] = None + self.categories: List[List[int]] = [] + self.categorical_columns: List[int] = [] + self.numerical_columns: List[int] = [] + + self.all_nan_columns: Optional[Set[Union[int, str]]] = None self._is_fitted = False def fit( self, X_train: SUPPORTED_FEAT_TYPES, - X_test: typing.Optional[SUPPORTED_FEAT_TYPES] = None, + X_test: Optional[SUPPORTED_FEAT_TYPES] = None, ) -> BaseEstimator: """ Validates and fit a categorical encoder (if needed) to the features. @@ -82,7 +83,7 @@ def fit( X_train (SUPPORTED_FEAT_TYPES): A set of features that are going to be validated (type and dimensionality checks) and a encoder fitted in the case the data needs encoding - X_test (typing.Optional[SUPPORTED_FEAT_TYPES]): + X_test (Optional[SUPPORTED_FEAT_TYPES]): A hold out set of data used for checking """ @@ -122,6 +123,7 @@ def _fit( self: The fitted base estimator """ + raise NotImplementedError() def _check_data( @@ -136,6 +138,7 @@ def _check_data( A set of features that are going to be validated (type and dimensionality checks) and a encoder fitted in the case the data needs encoding """ + raise NotImplementedError() def transform( @@ -152,4 +155,30 @@ def transform( np.ndarray: The transformed array """ + + raise NotImplementedError() + + def list_to_dataframe( + self, + X_train: SUPPORTED_FEAT_TYPES, + X_test: Optional[SUPPORTED_FEAT_TYPES] = None, + ) -> Tuple[pd.DataFrame, Optional[pd.DataFrame]]: + """ + Converts a list to a pandas DataFrame. In this process, column types are inferred. 
+ + If test data is provided, we proactively match it to train data + + Arguments: + X_train (SUPPORTED_FEAT_TYPES): + A set of features that are going to be validated (type and dimensionality + checks) and a encoder fitted in the case the data needs encoding + X_test (Optional[SUPPORTED_FEAT_TYPES]): + A hold out set of data used for checking + Returns: + pd.DataFrame: + transformed train data from list to pandas DataFrame + pd.DataFrame: + transformed test data from list to pandas DataFrame + """ + raise NotImplementedError() diff --git a/autoPyTorch/data/base_target_validator.py b/autoPyTorch/data/base_target_validator.py index dba9c19e3..0fb318476 100644 --- a/autoPyTorch/data/base_target_validator.py +++ b/autoPyTorch/data/base_target_validator.py @@ -1,5 +1,5 @@ import logging -import typing +from typing import List, Optional, Union, cast import numpy as np @@ -12,8 +12,8 @@ from autoPyTorch.utils.logging_ import PicklableClientLogger -SUPPORTED_TARGET_TYPES = typing.Union[ - typing.List, +SUPPORTED_TARGET_TYPES = Union[ + List, pd.Series, pd.DataFrame, np.ndarray, @@ -35,39 +35,39 @@ class BaseTargetValidator(BaseEstimator): is_classification (bool): A bool that indicates if the validator should operate in classification mode. During classification, the targets are encoded. - encoder (typing.Optional[BaseEstimator]): + encoder (Optional[BaseEstimator]): Host a encoder object if the data requires transformation (for example, if provided a categorical column in a pandas DataFrame) - enc_columns (typing.List[str]) + enc_columns (List[str]) List of columns that where encoded """ def __init__(self, is_classification: bool = False, - logger: typing.Optional[typing.Union[PicklableClientLogger, logging.Logger + logger: Optional[Union[PicklableClientLogger, logging.Logger ]] = None, ) -> None: self.is_classification = is_classification - self.data_type = None # type: typing.Optional[type] + self.data_type: Optional[type] = None - self.encoder = None # type: typing.Optional[BaseEstimator] + self.encoder: Optional[BaseEstimator] = None - self.out_dimensionality = None # type: typing.Optional[int] - self.type_of_target = None # type: typing.Optional[str] + self.out_dimensionality: Optional[int] = None + self.type_of_target: Optional[str] = None - self.logger: typing.Union[ + self.logger: Union[ PicklableClientLogger, logging.Logger ] = logger if logger is not None else logging.getLogger(__name__) # Store the dtype for remapping to correct type - self.dtype = None # type: typing.Optional[type] + self.dtype: Optional[type] = None self._is_fitted = False def fit( self, y_train: SUPPORTED_TARGET_TYPES, - y_test: typing.Optional[SUPPORTED_TARGET_TYPES] = None, + y_test: Optional[SUPPORTED_TARGET_TYPES] = None, ) -> BaseEstimator: """ Validates and fit a categorical encoder (if needed) to the targets @@ -76,7 +76,7 @@ def fit( Arguments: y_train (SUPPORTED_TARGET_TYPES) A set of targets set aside for training - y_test (typing.Union[SUPPORTED_TARGET_TYPES]) + y_test (Union[SUPPORTED_TARGET_TYPES]) A hold out set of data used of the targets. It is also used to fit the categories of the encoder. 
""" @@ -95,8 +95,8 @@ def fit( np.shape(y_test) )) if isinstance(y_train, pd.DataFrame): - y_train = typing.cast(pd.DataFrame, y_train) - y_test = typing.cast(pd.DataFrame, y_test) + y_train = cast(pd.DataFrame, y_train) + y_test = cast(pd.DataFrame, y_test) if y_train.columns.tolist() != y_test.columns.tolist(): raise ValueError( "Train and test targets must both have the same columns, yet " @@ -127,21 +127,21 @@ def fit( def _fit( self, y_train: SUPPORTED_TARGET_TYPES, - y_test: typing.Optional[SUPPORTED_TARGET_TYPES] = None, + y_test: Optional[SUPPORTED_TARGET_TYPES] = None, ) -> BaseEstimator: """ Arguments: y_train (SUPPORTED_TARGET_TYPES) The labels of the current task. They are going to be encoded in case of classification - y_test (typing.Optional[SUPPORTED_TARGET_TYPES]) + y_test (Optional[SUPPORTED_TARGET_TYPES]) A holdout set of labels """ raise NotImplementedError() def transform( self, - y: typing.Union[SUPPORTED_TARGET_TYPES], + y: Union[SUPPORTED_TARGET_TYPES], ) -> np.ndarray: """ Arguments: @@ -162,7 +162,7 @@ def inverse_transform( Revert any encoding transformation done on a target array Arguments: - y (typing.Union[np.ndarray, pd.DataFrame, pd.Series]): + y (Union[np.ndarray, pd.DataFrame, pd.Series]): Target array to be transformed back to original form before encoding Returns: np.ndarray: diff --git a/autoPyTorch/data/tabular_feature_validator.py b/autoPyTorch/data/tabular_feature_validator.py index 698e92438..3f939bc98 100644 --- a/autoPyTorch/data/tabular_feature_validator.py +++ b/autoPyTorch/data/tabular_feature_validator.py @@ -1,5 +1,5 @@ import functools -from typing import Any, Dict, List, Optional, Tuple, Union, cast +from typing import Dict, List, Optional, Tuple, cast import numpy as np @@ -22,7 +22,7 @@ def _create_column_transformer( preprocessors: Dict[str, List[BaseEstimator]], numerical_columns: List[str], - categorical_columns: List[str] + categorical_columns: List[str], ) -> ColumnTransformer: """ Given a dictionary of preprocessors, this function @@ -38,6 +38,7 @@ def _create_column_transformer( Returns: ColumnTransformer """ + numerical_pipeline = 'drop' categorical_pipeline = 'drop' if len(numerical_columns) > 0: @@ -48,7 +49,7 @@ def _create_column_transformer( return ColumnTransformer([ ('categorical_pipeline', categorical_pipeline, categorical_columns), ('numerical_pipeline', numerical_pipeline, numerical_columns)], - remainder='passthrough' + remainder='drop' ) @@ -60,21 +61,23 @@ def get_tabular_preprocessors() -> Dict[str, List[BaseEstimator]]: Dict[str, List[BaseEstimator]] """ preprocessors: Dict[str, List[BaseEstimator]] = dict() - preprocessors['numerical'] = list() - preprocessors['categorical'] = list() - preprocessors['categorical'].append(OneHotEncoder( - categories='auto', - sparse=False, - handle_unknown='ignore')) - preprocessors['numerical'].append(SimpleImputer(strategy='median', - copy=False)) - preprocessors['numerical'].append(StandardScaler(with_mean=True, with_std=True, copy=False)) + # Categorical Preprocessors + onehot_encoder = OneHotEncoder(categories='auto', sparse=False, handle_unknown='ignore') + categorical_imputer = SimpleImputer(strategy='constant', copy=False) + + # Numerical Preprocessors + numerical_imputer = SimpleImputer(strategy='median', copy=False) + standard_scaler = StandardScaler(with_mean=True, with_std=True, copy=False) + + preprocessors['categorical'] = [categorical_imputer, onehot_encoder] + preprocessors['numerical'] = [numerical_imputer, standard_scaler] return preprocessors class 
TabularFeatureValidator(BaseFeatureValidator): + def _fit( self, X: SUPPORTED_FEAT_TYPES, @@ -96,24 +99,29 @@ def _fit( # The final output of a validator is a numpy array. But pandas # gives us information about the column dtype if isinstance(X, np.ndarray): + X = self.numpy_array_to_pandas(X) + # Replace the data type from the previously saved type. + self.data_type = type(X) + # save all the information about the column order and data types + self._check_data(X) if hasattr(X, "iloc") and not scipy.sparse.issparse(X): + X = cast(pd.DataFrame, X) - if not X.select_dtypes(include='object').empty: - X = self.infer_objects(X) + self.all_nan_columns = set([column for column in X.columns if X[column].isna().all()]) - self._check_data(X) categorical_columns, numerical_columns, feat_type = self._get_columns_info(X) self.enc_columns = categorical_columns - if len(categorical_columns) >= 0: - X = self.impute_nan_in_categories(X) + preprocessors = get_tabular_preprocessors() - self.column_transformer = _create_column_transformer(preprocessors=preprocessors, - numerical_columns=numerical_columns, - categorical_columns=categorical_columns) + self.column_transformer = _create_column_transformer( + preprocessors=preprocessors, + numerical_columns=numerical_columns, + categorical_columns=categorical_columns, + ) # Mypy redefinition assert self.column_transformer is not None @@ -140,15 +148,8 @@ def comparator(cmp1: str, cmp2: str) -> int: key=functools.cmp_to_key(comparator) ) - if len(categorical_columns) > 0: - self.categories = [ - # We fit an ordinal encoder, where all categorical - # columns are shifted to the left - list(range(len(cat))) - for cat in self.column_transformer.named_transformers_[ - 'categorical_pipeline'].named_steps['onehotencoder'].categories_ - ] - + # differently to categorical_columns and numerical_columns, + # this saves the index of the column. for i, type_ in enumerate(self.feat_type): if 'numerical' in type_: self.numerical_columns.append(i) @@ -156,7 +157,8 @@ def comparator(cmp1: str, cmp2: str) -> int: self.categorical_columns.append(i) # Lastly, store the number of features - self.num_features = np.shape(X)[1] + self.num_features = len(X.columns) + return self def transform( @@ -189,16 +191,19 @@ def transform( if hasattr(X, "iloc") and not scipy.sparse.issparse(X): X = cast(pd.DataFrame, X) - # Also remove the object dtype for new data - if not X.select_dtypes(include='object').empty: - X = self.infer_objects(X) - # Check the data here so we catch problems on new test data self._check_data(X) - # We also need to fillna on the transformation - # in case test data is provided - if len(self.categorical_columns) >= 0: - X = self.impute_nan_in_categories(X) + + # in case of test data being all none and train data + # having a value for a categorical column. 
+ # We need to convert the column in test data to + # object otherwise the test column is interpreted as float + if len(self.categorical_columns) > 0: + categorical_columns = self.column_transformer.transformers_[0][-1] + for column in categorical_columns: + if X[column].isna().all(): + X[column] = X[column].astype('object') + X = self.column_transformer.transform(X) # Sparse related transformations @@ -268,13 +273,13 @@ def _check_data( X = cast(pd.DataFrame, X) # Handle objects if possible - if not X.select_dtypes(include='object').empty: + exist_object_columns = has_object_columns(X.dtypes.values) + if exist_object_columns: X = self.infer_objects(X) # Define the column to be encoded here as the feature validator is fitted once # per estimator # enc_columns, _ = self._get_columns_to_encode(X) - column_order = [column for column in X.columns] if len(self.column_order) > 0: if self.column_order != column_order: @@ -288,13 +293,21 @@ def _check_data( dtypes = [dtype.name for dtype in X.dtypes] if len(self.dtypes) > 0: - if self.dtypes != dtypes: - raise ValueError("Changing the dtype of the features after fit() is " - "not supported. Fit() method was called with " - "{} whereas the new features have {} as type".format(self.dtypes, - dtypes, - ) - ) + dtypes_diff = [s_dtype != dtype for s_dtype, dtype in zip(self.dtypes, dtypes)] + if any(dtypes_diff): + if self.all_nan_columns is not None and len(self.all_nan_columns) > 0: + if len(set(X.columns[dtypes_diff]).difference(self.all_nan_columns)) != 0: + # we expect the dtypes to only be different if the column belongs + # to all_nan_columns as these columns would be imputed. if there is + # a value in the test set for a column in all_nan_columns, pandas + # does not recognise the dtype of the test column properly + raise ValueError("Changing the dtype of the features after fit() is " + "not supported. The dtype of some columns are different " + "between training and test datasets. Fit() method was called with " + "{} whereas the new features have {} as type".format(self.dtypes, + dtypes, + ) + ) else: self.dtypes = dtypes @@ -310,8 +323,10 @@ def _get_columns_info( A set of features that are going to be validated (type and dimensionality checks) and a encoder fitted in the case the data needs encoding Returns: - enc_columns (List[str]): - Columns to encode, if any + categorical_columns: (List[str]) + List of the names of categorical columns. + numerical_columns: (List[str]) + List of the names of numerical columns. feat_type: Type of each column numerical/categorical """ @@ -323,51 +338,44 @@ def _get_columns_info( # Make sure each column is a valid type for i, column in enumerate(X.columns): - if X[column].dtype.name in ['category', 'bool']: - + if self.all_nan_columns is not None and column in self.all_nan_columns: + continue + column_dtype = self.dtypes[i] + err_msg = "Valid types are `numerical`, `categorical` or `boolean`, " \ + "but input Column {} has an invalid type `{}`.".format(column, column_dtype) + if column_dtype in ['category', 'bool']: categorical_columns.append(column) feat_type.append('categorical') # Move away from np.issubdtype as it causes # TypeError: data type not understood in certain pandas types - elif not is_numeric_dtype(X[column]): - if X[column].dtype.name == 'object': - raise ValueError( - "Input Column {} has invalid type object. " - "Cast it to a valid dtype before using it in AutoPyTorch. " - "Valid types are numerical, categorical or boolean. 
" - "You can cast it to a valid dtype using " - "pandas.Series.astype ." - "If working with string objects, the following " - "tutorial illustrates how to work with text data: " - "https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html".format( - # noqa: E501 - column, - ) - ) - elif pd.core.dtypes.common.is_datetime_or_timedelta_dtype( - X[column].dtype - ): - raise ValueError( - "AutoPyTorch does not support time and/or date datatype as given " - "in column {}. Please convert the time information to a numerical value " - "first. One example on how to do this can be found on " - "https://stats.stackexchange.com/questions/311494/".format( - column, - ) - ) - else: - raise ValueError( - "Input Column {} has unsupported dtype {}. " - "Supported column types are categorical/bool/numerical dtypes. " - "Make sure your data is formatted in a correct way, " - "before feeding it to AutoPyTorch.".format( - column, - X[column].dtype.name, - ) - ) - else: + elif is_numeric_dtype(column_dtype): feat_type.append('numerical') numerical_columns.append(column) + elif column_dtype == 'object': + # TODO verify how would this happen when we always convert the object dtypes to category + raise ValueError( + "{} Cast it to a valid dtype before feeding it to AutoPyTorch. " + "You can cast it to a valid dtype using pandas.Series.astype." + "If you are working with string objects, the following " + "tutorial illustrates how to work with text data: " + "https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html".format( + # noqa: E501 + err_msg, + ) + ) + elif pd.core.dtypes.common.is_datetime_or_timedelta_dtype(column_dtype): + raise ValueError( + "{} Convert the time information to a numerical value" + " before feeding it to AutoPyTorch. " + "One example of the conversion can be found on " + "https://stats.stackexchange.com/questions/311494/".format(err_msg) + ) + else: + raise ValueError( + "{} Make sure your data is formatted in a correct way" + "before feeding it to AutoPyTorch.".format(err_msg) + ) + return categorical_columns, numerical_columns, feat_type def list_to_dataframe( @@ -394,7 +402,7 @@ def list_to_dataframe( """ # If a list was provided, it will be converted to pandas - X_train = pd.DataFrame(data=X_train).infer_objects() + X_train = pd.DataFrame(data=X_train).convert_dtypes() self.logger.warning("The provided feature types to AutoPyTorch are of type list." "Features have been interpreted as: {}".format([(col, t) for col, t in zip(X_train.columns, X_train.dtypes)])) @@ -403,7 +411,8 @@ def list_to_dataframe( self.logger.warning("Train features are a list while the provided test data" "is {}. 
X_test will be casted as DataFrame.".format(type(X_test)) ) - X_test = pd.DataFrame(data=X_test).infer_objects() + X_test = pd.DataFrame(data=X_test).convert_dtypes() + return X_train, X_test @staticmethod @@ -446,65 +455,33 @@ def infer_objects(self, X: pd.DataFrame) -> pd.DataFrame: self.logger.warning(f"Tried to cast column {key} to {dtype} caused {e}") pass else: + # Calling for the first time to infer the categories X = X.infer_objects() - for column in X.columns: - if not is_numeric_dtype(X[column]): + for column, data_type in zip(X.columns, X.dtypes): + if not is_numeric_dtype(data_type): X[column] = X[column].astype('category') - self.object_dtype_mapping = {column: X[column].dtype for column in X.columns} + + # only numerical attributes and categories + self.object_dtype_mapping = {column: data_type for column, data_type in zip(X.columns, X.dtypes)} + self.logger.debug(f"Infer Objects: {self.object_dtype_mapping}") + return X - def impute_nan_in_categories(self, - X: pd.DataFrame - ) -> pd.DataFrame: - """ - impute missing values before encoding, - remove once sklearn natively supports - it in ordinal encoding. Sklearn issue: - "https://github.com/scikit-learn/scikit-learn/issues/17123)" - Arguments: - X (pd.DataFrame): - data to be interpreted. - Returns: - pd.DataFrame - """ - # To be on the safe side, map always to the same missing - # value per column - if not hasattr(self, 'dict_nancol_to_missing'): - self.dict_missing_value_per_col: Dict[str, Any] = {} - - # First make sure that we do not alter the type of the column which cause: - # TypeError: '<' not supported between instances of 'int' and 'str' - # in the encoding - for column in self.enc_columns: - if X[column].isna().any(): - if column not in self.dict_missing_value_per_col: - try: - float(X[column].dropna().values[0]) - can_cast_as_number = True - except Exception: - can_cast_as_number = False - if can_cast_as_number: - # In this case, we expect to have a number as category - # it might be string, but its value represent a number - missing_value: Union[str, int] = '-1' if isinstance(X[column].dropna().values[0], str) else -1 - else: - missing_value = 'Missing!' - - # Make sure this missing value is not seen before - # Do this check for categorical columns - # else modify the value - if hasattr(X[column], 'cat'): - while missing_value in X[column].cat.categories: - if isinstance(missing_value, str): - missing_value += '0' - else: - missing_value += missing_value - self.dict_missing_value_per_col[column] = missing_value - - # Convert the frame in place - X[column].cat.add_categories([self.dict_missing_value_per_col[column]], - inplace=True) - X.fillna({column: self.dict_missing_value_per_col[column]}, inplace=True) - return X +def has_object_columns( + feature_types: pd.Series, +) -> bool: + """ + Indicate whether on a Series of dtypes for a Pandas DataFrame + there exists one or more object columns. + + Arguments: + feature_types (pd.Series): + The feature types for a DataFrame. + Returns: + bool: + True if the DataFrame dtypes contain an object column, False + otherwise. 
+ """ + return np.dtype('O') in feature_types diff --git a/test/test_data/test_feature_validator.py b/test/test_data/test_feature_validator.py index f9ba2855e..535023cd2 100644 --- a/test/test_data/test_feature_validator.py +++ b/test/test_data/test_feature_validator.py @@ -317,6 +317,93 @@ def test_featurevalidator_get_columns_to_encode(): assert feature_types == ['numerical', 'numerical', 'categorical', 'categorical'] +def test_featurevalidator_remove_nan_catcolumns(): + """ + Make sure categorical columns that have only nan values are removed. + """ + # First case, there exist null columns in the train set + # and the same columns are not all null for the test set. + validator = TabularFeatureValidator() + + df_train = pd.DataFrame( + [ + {'A': 1, 'B': np.nan, 'C': np.nan}, + {'A': np.nan, 'C': np.nan}, + {'A': 1} + ], + dtype='category', + ) + df_test = pd.DataFrame( + [ + {'A': np.nan, 'B': np.nan, 'C': 5}, + {'A': np.nan, 'C': np.nan}, + {'A': 1} + ], + dtype='category', + ) + + validator.fit(df_train) + transformed_df_train = validator.transform(df_train) + transformed_df_test = validator.transform(df_test) + + assert np.array_equal(transformed_df_train, np.array([[0, 1], [1, 0], [0, 1]], dtype=float)) + assert np.array_equal(transformed_df_test, np.array([[1, 0], [1, 0], [0, 1]], dtype=float)) + + # Second case, there exist null columns in the training set and the same + # are null in the test set. + validator = TabularFeatureValidator() + + df_train = pd.DataFrame( + [ + {'A': 1, 'B': np.nan, 'C': np.nan}, + {'A': np.nan, 'C': np.nan}, + {'A': 1} + ], + dtype='category', + ) + df_test = pd.DataFrame( + [ + {'A': np.nan, 'B': np.nan, 'C': np.nan}, + {'A': np.nan, 'C': np.nan}, + {'A': 1} + ], + dtype='category', + ) + + validator.fit(df_train) + transformed_df_train = validator.transform(df_train) + transformed_df_test = validator.transform(df_test) + + assert np.array_equal(transformed_df_train, np.array([[0, 1], [1, 0], [0, 1]], dtype=float)) + assert np.array_equal(transformed_df_test, np.array([[1, 0], [1, 0], [0, 1]], dtype=float)) + + # Third case, there exist no null columns in the training set and a + # few null columns exist in the test set. 
+ validator = TabularFeatureValidator() + + df_train = pd.DataFrame( + [ + {'A': 1, 'B': 1}, + {'A': 2, 'B': 2} + ], + dtype='category', + ) + df_test = pd.DataFrame( + [ + {'A': np.nan, 'B': np.nan}, + {'A': np.nan, 'B': np.nan} + ], + dtype='category', + ) + + validator.fit(df_train) + transformed_df_train = validator.transform(df_train) + transformed_df_test = validator.transform(df_test) + + assert np.array_equal(transformed_df_train, np.array([[1, 0, 1, 0], [0, 1, 0, 1]], dtype=float)) + assert np.array_equal(transformed_df_test, np.array([[0, 0, 0, 0], [0, 0, 0, 0]], dtype=float)) + + def test_features_unsupported_calls_are_raised(): """ Makes sure we raise a proper message to the user, @@ -550,15 +637,16 @@ def test_feature_validator_imbalanced_data(): validator.fit(X_train) train_feature_types = copy.deepcopy(validator.feat_type) - assert train_feature_types == ['numerical', 'numerical', 'numerical', 'numerical'] + assert train_feature_types == ['numerical'] # validator will throw an error if the column types are not the same transformed_X_test = validator.transform(X_test) transformed_X_test = pd.DataFrame(transformed_X_test) - null_columns = [] - for column in transformed_X_test.columns: - if transformed_X_test[column].isna().all(): - null_columns.append(column) - assert null_columns == [0, 2, 3] + assert sorted(validator.all_nan_columns) == sorted(['A', 'C', 'D']) + # as there are no categorical columns, we can make such an + # assertion. We only expect to drop the all nan columns + total_all_nan_columns = len(validator.all_nan_columns) + total_columns = len(validator.column_order) + assert total_columns - total_all_nan_columns == len(transformed_X_test.columns) # Columns with not all null values in the train split and # completely null on the test split. 
@@ -577,14 +665,10 @@ def test_feature_validator_imbalanced_data(): X_test = pd.DataFrame.from_dict(test_features) validator = TabularFeatureValidator() validator.fit(X_train) + train_feature_types = copy.deepcopy(validator.feat_type) assert train_feature_types == ['categorical', 'numerical', 'numerical'] transformed_X_test = validator.transform(X_test) transformed_X_test = pd.DataFrame(transformed_X_test) - null_columns = [] - for column in transformed_X_test.columns: - if transformed_X_test[column].isna().all(): - null_columns.append(column) - - assert null_columns == [1] + assert not len(validator.all_nan_columns) From 96614099ee598dd823bf6fcfc86c10358eaab0b3 Mon Sep 17 00:00:00 2001 From: Ravin Kohli <13005107+ravinkohli@users.noreply.github.com> Date: Mon, 11 Oct 2021 15:05:10 +0200 Subject: [PATCH 37/54] Apply suggestions from code review Co-authored-by: nabenabe0928 <47781922+nabenabe0928@users.noreply.github.com> --- autoPyTorch/data/tabular_feature_validator.py | 31 +++++++++---------- .../training/trainer/RowCutMixTrainer.py | 17 +++++----- 2 files changed, 24 insertions(+), 24 deletions(-) diff --git a/autoPyTorch/data/tabular_feature_validator.py b/autoPyTorch/data/tabular_feature_validator.py index 3f939bc98..e5944da19 100644 --- a/autoPyTorch/data/tabular_feature_validator.py +++ b/autoPyTorch/data/tabular_feature_validator.py @@ -127,21 +127,18 @@ def _fit( assert self.column_transformer is not None self.column_transformer.fit(X) - # The column transformer reoders the feature types - we therefore need to change - # it as well - # This means columns are shifted to the right + # The column transformer reorders the feature types + # therefore, we need to change the order of columns as well + # This means categorical columns are shifted to the right def comparator(cmp1: str, cmp2: str) -> int: - if ( - cmp1 == 'categorical' and cmp2 == 'categorical' - or cmp1 == 'numerical' and cmp2 == 'numerical' - ): - return 0 - elif cmp1 == 'categorical' and cmp2 == 'numerical': - return -1 - elif cmp1 == 'numerical' and cmp2 == 'categorical': - return 1 - else: - raise ValueError((cmp1, cmp2)) + """ Order so that categorical columns come right and numerical columns come left """ + choices = ['categorical', 'numerical'] + if cmp1 not in choices or cmp2 not in choices: + raise ValueError('The comparator for the column order only accepts {}, ' + 'but got {} and {}'.format(choices, cmp1, cmp2)) + + idx1, idx2 = choices.index(cmp1), choices.index(cmp2) + return idx1 - idx2 self.feat_type = sorted( feat_type, @@ -353,7 +350,7 @@ def _get_columns_info( numerical_columns.append(column) elif column_dtype == 'object': # TODO verify how would this happen when we always convert the object dtypes to category - raise ValueError( + raise TypeError( "{} Cast it to a valid dtype before feeding it to AutoPyTorch. " "You can cast it to a valid dtype using pandas.Series.astype." "If you are working with string objects, the following " @@ -364,14 +361,14 @@ def _get_columns_info( ) ) elif pd.core.dtypes.common.is_datetime_or_timedelta_dtype(column_dtype): - raise ValueError( + raise TypeError( "{} Convert the time information to a numerical value" " before feeding it to AutoPyTorch. 
" "One example of the conversion can be found on " "https://stats.stackexchange.com/questions/311494/".format(err_msg) ) else: - raise ValueError( + raise TypeError( "{} Make sure your data is formatted in a correct way" "before feeding it to AutoPyTorch.".format(err_msg) ) diff --git a/autoPyTorch/pipeline/components/training/trainer/RowCutMixTrainer.py b/autoPyTorch/pipeline/components/training/trainer/RowCutMixTrainer.py index 9ab76ed59..409b07e9d 100644 --- a/autoPyTorch/pipeline/components/training/trainer/RowCutMixTrainer.py +++ b/autoPyTorch/pipeline/components/training/trainer/RowCutMixTrainer.py @@ -29,21 +29,24 @@ def data_preparation(self, X: np.ndarray, y: np.ndarray, beta = 1.0 lam = self.random_state.beta(beta, beta) batch_size, n_columns = np.shape(X) - index = torch.randperm(batch_size).cuda() if X.is_cuda else torch.randperm(batch_size) + # shuffled_indices: Shuffled version of torch.arange(batch_size) + shuffled_indices = torch.randperm(batch_size).cuda() if X.is_cuda else torch.randperm(batch_size) r = self.random_state.rand(1) if beta <= 0 or r > self.alpha: - return X, {'y_a': y, 'y_b': y[index], 'lam': 1} + return X, {'y_a': y, 'y_b': y[shuffled_indices], 'lam': 1} - indices = torch.tensor(self.random_state.choice(range(n_columns), max(1, np.int32(n_columns * lam)), + cut_column_indices = torch.tensor(self.random_state.choice(range(n_columns), max(1, np.int32(n_columns * lam)), replace=False)) - X[:, indices] = X[index, :][:, indices] + # Replace the values in `cut_indices` columns with + # the values from `permed_indices` + X[:, cut_indices] = X[shuffled_indices, :][:, cut_column_indices] - # Adjust lam - lam = 1 - (len(indices) / n_columns) + # Since we cannot cut exactly `lam x 100 %` of rows, we need to adjust the `lam` + lam = 1 - (len(cut_column_indices) / n_columns) - y_a, y_b = y, y[index] + y_a, y_b = y, y[shuffled_indices] return X, {'y_a': y_a, 'y_b': y_b, 'lam': lam} From 36cb3c4438fa07a4f60ea0515b38b5bc14c36953 Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Mon, 11 Oct 2021 15:05:26 +0200 Subject: [PATCH 38/54] resolve code issues with new versions --- .../normalise/ImageNormalizer.py | 8 ++++---- .../normalise/NoNormalizer.py | 8 ++++---- .../setup/network_backbone/ResNetBackbone.py | 18 ++++++++++------- .../network_backbone/ShapedResNetBackbone.py | 14 ++++++++----- .../setup/network_backbone/utils.py | 20 +++++++++---------- autoPyTorch/utils/common.py | 2 +- requirements.txt | 8 ++++---- 7 files changed, 43 insertions(+), 35 deletions(-) diff --git a/autoPyTorch/pipeline/components/preprocessing/image_preprocessing/normalise/ImageNormalizer.py b/autoPyTorch/pipeline/components/preprocessing/image_preprocessing/normalise/ImageNormalizer.py index 4327d6346..a3be8fa79 100644 --- a/autoPyTorch/pipeline/components/preprocessing/image_preprocessing/normalise/ImageNormalizer.py +++ b/autoPyTorch/pipeline/components/preprocessing/image_preprocessing/normalise/ImageNormalizer.py @@ -2,7 +2,7 @@ import numpy as np -import torch.tensor +import torch from autoPyTorch.pipeline.components.preprocessing.image_preprocessing.normalise.base_normalizer import BaseNormalizer @@ -30,16 +30,16 @@ def fit(self, X: Dict[str, Any], y: Optional[Any] = None) -> "ImageNormalizer": self.std = X['dataset_properties']['std'] return self - def __call__(self, X: Union[np.ndarray, torch.tensor]) -> Union[np.ndarray, torch.tensor]: + def __call__(self, X: Union[np.ndarray, torch.Tensor]) -> Union[np.ndarray, torch.Tensor]: """ Makes the autoPyTorchPreprocessingComponent Callable. 
Calling the component calls the transform function of the underlying early_preprocessor and returns the transformed array. Args: - X (Union[np.ndarray, torch.tensor]): input data tensor + X (Union[np.ndarray, torch.Tensor]): input data tensor Returns: - Union[np.ndarray, torch.tensor]: Transformed data tensor + Union[np.ndarray, torch.Tensor]: Transformed data tensor """ X = (X - self.mean) / self.std return X diff --git a/autoPyTorch/pipeline/components/preprocessing/image_preprocessing/normalise/NoNormalizer.py b/autoPyTorch/pipeline/components/preprocessing/image_preprocessing/normalise/NoNormalizer.py index 7aeb83a9c..b36a50f4e 100644 --- a/autoPyTorch/pipeline/components/preprocessing/image_preprocessing/normalise/NoNormalizer.py +++ b/autoPyTorch/pipeline/components/preprocessing/image_preprocessing/normalise/NoNormalizer.py @@ -2,7 +2,7 @@ import numpy as np -import torch.tensor +import torch from autoPyTorch.pipeline.components.preprocessing.image_preprocessing.normalise.base_normalizer import ( BaseNormalizer @@ -34,16 +34,16 @@ def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: X.update({'normalise': self}) return X - def __call__(self, X: Union[np.ndarray, torch.tensor]) -> Union[np.ndarray, torch.tensor]: + def __call__(self, X: Union[np.ndarray, torch.Tensor]) -> Union[np.ndarray, torch.Tensor]: """ Makes the autoPyTorchPreprocessingComponent Callable. Calling the component calls the transform function of the underlying early_preprocessor and returns the transformed array. Args: - X (Union[np.ndarray, torch.tensor]): input data tensor + X (Union[np.ndarray, torch.Tensor]): input data tensor Returns: - Union[np.ndarray, torch.tensor]: Transformed data tensor + Union[np.ndarray, torch.Tensor]: Transformed data tensor """ return X diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/ResNetBackbone.py b/autoPyTorch/pipeline/components/setup/network_backbone/ResNetBackbone.py index 10f509741..4a7893f94 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/ResNetBackbone.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/ResNetBackbone.py @@ -139,8 +139,8 @@ def get_hyperparameter_search_space( value_range=(True, False), default_value=True, ), - shake_alpha_beta_method: HyperparameterSearchSpace = HyperparameterSearchSpace( - hyperparameter="shake_alpha_beta_method", + shake_shake_update_func: HyperparameterSearchSpace = HyperparameterSearchSpace( + hyperparameter="shake_shake_update_func", value_range=('shake-shake', 'shake-even', 'even-even', @@ -195,14 +195,18 @@ def get_hyperparameter_search_space( cs.add_hyperparameter(mb_choice) cs.add_condition(CS.EqualsCondition(mb_choice, use_sc, True)) + shake_shake_update_func_conditional: List[str] = list() if shake_drop_prob_flag: shake_drop_prob = get_hyperparameter(max_shake_drop_probability, UniformFloatHyperparameter) cs.add_hyperparameter(shake_drop_prob) cs.add_condition(CS.EqualsCondition(shake_drop_prob, mb_choice, "shake-drop")) - if shake_shake_flag or shake_drop_prob_flag: - method = get_hyperparameter(shake_alpha_beta_method, CategoricalHyperparameter) + shake_shake_update_func_conditional.append('shake-drop') + if shake_shake_flag: + shake_shake_update_func_conditional.append('shake-shake') + if len(shake_shake_update_func_conditional) > 0: + method = get_hyperparameter(shake_shake_update_func, CategoricalHyperparameter) cs.add_hyperparameter(method) - cs.add_condition(CS.InCondition(method, mb_choice, ["shake-shake", "shake-drop"])) + cs.add_condition(CS.InCondition(method, 
mb_choice, shake_shake_update_func_conditional)) # It is the upper bound of the nr of groups, # since the configuration will actually be sampled. @@ -340,12 +344,12 @@ def forward(self, x: torch.FloatTensor) -> torch.FloatTensor: x2 = self.shake_shake_layers(x) alpha, beta = shake_get_alpha_beta(is_training=self.training, is_cuda=x.is_cuda, - method=self.config['shake_alpha_beta_method']) + method=self.config['shake_shake_update_func']) x = shake_shake(x1, x2, alpha, beta) elif self.config["multi_branch_choice"] == 'shake-drop': x = self.layers(x) alpha, beta = shake_get_alpha_beta(self.training, x.is_cuda, - method=self.config['shake_alpha_beta_method']) + method=self.config['shake_shake_update_func']) bl = shake_drop_get_bl( self.block_index, 1 - self.config["max_shake_drop_probability"], diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/ShapedResNetBackbone.py b/autoPyTorch/pipeline/components/setup/network_backbone/ShapedResNetBackbone.py index 12c6d4e74..f9ad4e6e0 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/ShapedResNetBackbone.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/ShapedResNetBackbone.py @@ -145,8 +145,8 @@ def get_hyperparameter_search_space( # type: ignore[override] 'stairs'), default_value='funnel', ), - shake_alpha_beta_method: HyperparameterSearchSpace = HyperparameterSearchSpace( - hyperparameter="shake_alpha_beta_method", + shake_shake_update_func: HyperparameterSearchSpace = HyperparameterSearchSpace( + hyperparameter="shake_shake_update_func", value_range=('shake-shake', 'shake-even', 'even-even', @@ -203,13 +203,17 @@ def get_hyperparameter_search_space( # type: ignore[override] cs.add_hyperparameter(mb_choice) cs.add_condition(CS.EqualsCondition(mb_choice, use_sc, True)) + shake_shake_update_func_conditional: List[str] = list() if shake_drop_prob_flag: shake_drop_prob = get_hyperparameter(max_shake_drop_probability, UniformFloatHyperparameter) cs.add_hyperparameter(shake_drop_prob) cs.add_condition(CS.EqualsCondition(shake_drop_prob, mb_choice, "shake-drop")) - if shake_shake_flag or shake_drop_prob_flag: - method = get_hyperparameter(shake_alpha_beta_method, CategoricalHyperparameter) + shake_shake_update_func_conditional.append('shake-drop') + if shake_shake_flag: + shake_shake_update_func_conditional.append('shake-shake') + if len(shake_shake_update_func_conditional) > 0: + method = get_hyperparameter(shake_shake_update_func, CategoricalHyperparameter) cs.add_hyperparameter(method) - cs.add_condition(CS.InCondition(method, mb_choice, ["shake-shake", "shake-drop"])) + cs.add_condition(CS.InCondition(method, mb_choice, shake_shake_update_func_conditional)) return cs diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/utils.py b/autoPyTorch/pipeline/components/setup/network_backbone/utils.py index fefe85e7a..315badb5a 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/utils.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/utils.py @@ -37,8 +37,8 @@ def forward( ctx: typing.Any, # No typing for AutogradContext x1: torch.Tensor, x2: torch.Tensor, - alpha: torch.tensor, - beta: torch.tensor, + alpha: torch.Tensor, + beta: torch.Tensor, ) -> torch.Tensor: ctx.save_for_backward(x1, x2, alpha, beta) @@ -66,10 +66,10 @@ def backward(ctx: typing.Any, class ShakeDropFunction(Function): @staticmethod def forward(ctx: typing.Any, - x: torch.tensor, - alpha: torch.tensor, - beta: torch.tensor, - bl: torch.tensor, + x: torch.Tensor, + alpha: torch.Tensor, + beta: 
torch.Tensor, + bl: torch.Tensor, ) -> torch.Tensor: ctx.save_for_backward(x, alpha, beta, bl) @@ -96,7 +96,7 @@ def shake_get_alpha_beta( is_training: bool, is_cuda: bool, method: str -) -> typing.Tuple[torch.tensor, torch.tensor]: +) -> typing.Tuple[torch.Tensor, torch.Tensor]: """ The methods used in this function have been introduced in 'ShakeShake Regularisation' https://arxiv.org/abs/1705.07485. The names have been taken from the paper as well. @@ -139,14 +139,14 @@ def shake_drop_get_bl( num_blocks: int, is_training: bool, is_cuda: bool -) -> torch.tensor: +) -> torch.Tensor: pl = 1 - ((block_index + 1) / num_blocks) * (1 - min_prob_no_shake) if is_training: # Move to torch.randn(1) for reproducibility - bl = torch.tensor(1.0) if torch.randn(1) <= pl else torch.tensor(0.0) + bl = torch.Tensor(1.0) if torch.randn(1) <= pl else torch.Tensor(0.0) else: - bl = torch.tensor(pl) + bl = torch.Tensor(pl) if is_cuda: bl = bl.cuda() diff --git a/autoPyTorch/utils/common.py b/autoPyTorch/utils/common.py index 98bd20a68..13543b5fc 100644 --- a/autoPyTorch/utils/common.py +++ b/autoPyTorch/utils/common.py @@ -96,7 +96,7 @@ def replace_prefix_in_config_dict(config: Dict[str, Any], prefix: str, replace: k.startswith(prefix)} -def custom_collate_fn(batch: List) -> List[Optional[torch.tensor]]: +def custom_collate_fn(batch: List) -> List[Optional[torch.Tensor]]: """ In the case of not providing a y tensor, in a dataset of form {X, y}, y would be None. diff --git a/requirements.txt b/requirements.txt index 2195e64b4..f4a913789 100755 --- a/requirements.txt +++ b/requirements.txt @@ -1,16 +1,16 @@ pandas -torch<=1.8 -torchvision<=0.9 +torch +torchvision tensorboard scikit-learn>=0.24.0,<0.25.0 numpy -scipy==1.6.3 +scipy lockfile imgaug>=0.4.0 ConfigSpace>=0.4.14,<0.5 pynisher>=0.6.3 pyrfr>=0.7,<0.9 -smac>=0.13.1,<0.14 +smac dask distributed>=2.2.0 catboost From 6953ee72d09863fe5a838c5c38a00775cae4266b Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Mon, 11 Oct 2021 15:11:03 +0200 Subject: [PATCH 39/54] Address comments from shuhei --- autoPyTorch/api/base_task.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/autoPyTorch/api/base_task.py b/autoPyTorch/api/base_task.py index 189919013..71fa82ded 100644 --- a/autoPyTorch/api/base_task.py +++ b/autoPyTorch/api/base_task.py @@ -840,10 +840,10 @@ def _search( if self.task_type != dataset.task_type: raise ValueError("Incompatible dataset entered for current task," - "expected dataset to have task type :{} got " + "expected dataset to have task type :{} but got " ":{}".format(self.task_type, dataset.task_type)) if precision not in [16, 32, 64]: - raise ValueError("precision must be one of 16, 32, 64. 
Got {}".format(precision)) + raise ValueError("precision must be one of 16, 32, 64 but got {}".format(precision)) # Initialise information needed for the experiment experiment_task_name: str = 'runSearch' @@ -1429,7 +1429,7 @@ def fit_ensemble( func_eval_time_limit_secs = time_for_task // 2 self._logger.warning( "Capping the func_eval_time_limit_secs to {} to have " - "time for a least 2 models to ensemble.".format( + "time for at least 2 models to ensemble.".format( func_eval_time_limit_secs ) ) From 4b7e75f6ecf815059e42076e93e1e1b519a2c4be Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Mon, 11 Oct 2021 17:32:36 +0200 Subject: [PATCH 40/54] make run_traditional_ml function --- autoPyTorch/api/base_task.py | 74 ++++++++++++++++++++---------------- 1 file changed, 42 insertions(+), 32 deletions(-) diff --git a/autoPyTorch/api/base_task.py b/autoPyTorch/api/base_task.py index 71fa82ded..623ffb1db 100644 --- a/autoPyTorch/api/base_task.py +++ b/autoPyTorch/api/base_task.py @@ -9,6 +9,7 @@ import tempfile import time import typing +from typing_extensions import runtime import unittest.mock import warnings from abc import abstractmethod @@ -746,6 +747,37 @@ def _do_traditional_prediction(self, time_left: int, func_eval_time_limit_secs: save_external=True) return + def run_traditional_ml( + self, + current_task_name: str, + runtime_limit: int, + func_eval_time_limit_secs: int) -> None: + """ + This function can be used to run the suite of traditional machine + learning models during the current task (for e.g, ensemble fit, search) + + Args: + current_task_name (str): name of the current task, + runtime_limit (int): time limit for fitting traditional models, + func_eval_time_limit_secs (int): Time limit + for a single call to the machine learning model. + Model fitting will be terminated if the machine + learning algorithm runs over the time limit. + """ + assert self._logger is not None # for mypy compliancy + if STRING_TO_TASK_TYPES[self.task_type] in REGRESSION_TASKS: + self._logger.warning("Traditional Pipeline is not enabled for regression. Skipping...") + else: + traditional_task_name = 'runTraditional' + self._stopwatch.start_task(traditional_task_name) + elapsed_time = self._stopwatch.wall_elapsed(current_task_name) + time_for_traditional = int(runtime_limit - elapsed_time) + self._do_traditional_prediction( + func_eval_time_limit_secs=func_eval_time_limit_secs, + time_left=time_for_traditional, + ) + self._stopwatch.stop_task(traditional_task_name) + def _search( self, optimize_metric: str, @@ -927,22 +959,12 @@ def _search( # ============> Run traditional ml # We only want to run traditional predictions in case we want to build an ensemble + # We want time for at least 1 Neural network in SMAC if enable_traditional_pipeline and self.ensemble_size > 0: - if STRING_TO_TASK_TYPES[self.task_type] in REGRESSION_TASKS: - self._logger.warning("Traditional Pipeline is not enabled for regression. 
Skipping...") - else: - traditional_task_name = 'runTraditional' - self._stopwatch.start_task(traditional_task_name) - elapsed_time = self._stopwatch.wall_elapsed(self.dataset_name) - # We want time for at least 1 Neural network in SMAC - time_for_traditional = int( - self._time_for_task - elapsed_time - func_eval_time_limit_secs - ) - self._do_traditional_prediction( - func_eval_time_limit_secs=func_eval_time_limit_secs, - time_left=time_for_traditional, - ) - self._stopwatch.stop_task(traditional_task_name) + traditional_runtime_limit = int(self._time_for_task - func_eval_time_limit_secs) + self.run_traditional_ml(current_task_name=self.dataset_name, + runtime_limit=traditional_runtime_limit, + func_eval_time_limit_secs=func_eval_time_limit_secs) # ============> Starting ensemble self.precision = precision @@ -1433,29 +1455,17 @@ def fit_ensemble( func_eval_time_limit_secs ) ) - # We only want to run dummy predictions in case we want to build an ensemble + # ============> Run Dummy predictions dummy_task_name = 'runDummy' self._stopwatch.start_task(dummy_task_name) self._do_dummy_prediction() self._stopwatch.stop_task(dummy_task_name) # ============> Run traditional ml - # We only want to run traditional predictions in case we want to build an ensemble - if enable_traditional_pipeline and self.ensemble_size > 0: - if STRING_TO_TASK_TYPES[self.task_type] in REGRESSION_TASKS: - self._logger.warning("Traditional Pipeline is not enabled for regression. Skipping...") - else: - traditional_task_name = 'runTraditional' - self._stopwatch.start_task(traditional_task_name) - elapsed_time = self._stopwatch.wall_elapsed(ensemble_fit_task_name) - time_for_traditional = int( - time_for_task - elapsed_time - ) - self._do_traditional_prediction( - func_eval_time_limit_secs=func_eval_time_limit_secs, - time_left=time_for_traditional, - ) - self._stopwatch.stop_task(traditional_task_name) + if enable_traditional_pipeline: + self.run_traditional_ml(current_task_name=ensemble_fit_task_name, + runtime_limit=time_for_task, + func_eval_time_limit_secs=func_eval_time_limit_secs) elapsed_time = self._stopwatch.wall_elapsed(ensemble_fit_task_name) time_left_for_ensemble = int(time_for_task - elapsed_time) From cce21a6328916828d596496a76f398460fd2e1b6 Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Mon, 11 Oct 2021 18:11:17 +0200 Subject: [PATCH 41/54] implement suggestion from shuhei and fix bug in rowcutmixtrainer --- .../setup/network_backbone/utils.py | 28 ++++++++----------- .../training/trainer/RowCutMixTrainer.py | 2 +- 2 files changed, 13 insertions(+), 17 deletions(-) diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/utils.py b/autoPyTorch/pipeline/components/setup/network_backbone/utils.py index 315badb5a..6aab99449 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/utils.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/utils.py @@ -93,9 +93,9 @@ def backward(ctx: typing.Any, def shake_get_alpha_beta( - is_training: bool, - is_cuda: bool, - method: str + is_training: bool, + is_cuda: bool, + method: str ) -> typing.Tuple[torch.Tensor, torch.Tensor]: """ The methods used in this function have been introduced in 'ShakeShake Regularisation' @@ -108,15 +108,11 @@ def shake_get_alpha_beta( # TODO implement other update methods # alpha is the weight ratio for the forward pass and beta is that for the backward pass - if method == 'even-even': - alpha = torch.FloatTensor([0.5]) - else: - alpha = torch.rand(1) - - if method == 'shake-shake': - beta = 
torch.rand(1) - elif method in ['shake-even', 'even-even']: + alpha = torch.FloatTensor([0.5]) if method.startswith('even') else torch.rand(1) + if method.endswith('even'): beta = torch.FloatTensor([0.5]) + elif method.endswith('shake'): + beta = torch.rand(1) elif method == 'M3': # Table 4 in the paper `Shake-Shake regularization` rnd = torch.rand(1) @@ -134,11 +130,11 @@ def shake_get_alpha_beta( def shake_drop_get_bl( - block_index: int, - min_prob_no_shake: float, - num_blocks: int, - is_training: bool, - is_cuda: bool + block_index: int, + min_prob_no_shake: float, + num_blocks: int, + is_training: bool, + is_cuda: bool ) -> torch.Tensor: pl = 1 - ((block_index + 1) / num_blocks) * (1 - min_prob_no_shake) diff --git a/autoPyTorch/pipeline/components/training/trainer/RowCutMixTrainer.py b/autoPyTorch/pipeline/components/training/trainer/RowCutMixTrainer.py index 409b07e9d..1c8a78d38 100644 --- a/autoPyTorch/pipeline/components/training/trainer/RowCutMixTrainer.py +++ b/autoPyTorch/pipeline/components/training/trainer/RowCutMixTrainer.py @@ -41,7 +41,7 @@ def data_preparation(self, X: np.ndarray, y: np.ndarray, # Replace the values in `cut_indices` columns with # the values from `permed_indices` - X[:, cut_indices] = X[shuffled_indices, :][:, cut_column_indices] + X[:, cut_column_indices] = X[shuffled_indices, :][:, cut_column_indices] # Since we cannot cut exactly `lam x 100 %` of rows, we need to adjust the `lam` lam = 1 - (len(cut_column_indices) / n_columns) From 4b5db0de51e59bd1770cf9d95d3f0e9830a1dd7e Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Mon, 11 Oct 2021 18:25:13 +0200 Subject: [PATCH 42/54] fix return type docstring --- autoPyTorch/data/tabular_feature_validator.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/autoPyTorch/data/tabular_feature_validator.py b/autoPyTorch/data/tabular_feature_validator.py index e5944da19..9f25956e2 100644 --- a/autoPyTorch/data/tabular_feature_validator.py +++ b/autoPyTorch/data/tabular_feature_validator.py @@ -320,11 +320,11 @@ def _get_columns_info( A set of features that are going to be validated (type and dimensionality checks) and a encoder fitted in the case the data needs encoding Returns: - categorical_columns: (List[str]) + categorical_columns (List[str]) List of the names of categorical columns. - numerical_columns: (List[str]) + numerical_columns (List[str]) List of the names of numerical columns. - feat_type: + feat_type (List[str]) Type of each column numerical/categorical """ # Register if a column needs encoding From 80f1c1e3c84bf53207930f19f960b0428eac2f02 Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Tue, 12 Oct 2021 00:24:05 +0200 Subject: [PATCH 43/54] add better documentation and fix bug in shake_drop_get_bl --- autoPyTorch/api/base_task.py | 2 +- autoPyTorch/data/tabular_feature_validator.py | 11 ++++------- .../setup/network_backbone/utils.py | 19 ++++++++++++++++++- .../training/trainer/RowCutMixTrainer.py | 5 +++-- 4 files changed, 26 insertions(+), 11 deletions(-) diff --git a/autoPyTorch/api/base_task.py b/autoPyTorch/api/base_task.py index 623ffb1db..23a3bb854 100644 --- a/autoPyTorch/api/base_task.py +++ b/autoPyTorch/api/base_task.py @@ -1423,7 +1423,7 @@ def fit_ensemble( "fit_ensemble().".format(self.__class__.__name__)) if precision not in [16, 32, 64]: - raise ValueError("precision must be one of 16, 32, 64. 
Got {}".format(precision)) + raise ValueError("precision must be one of 16, 32, 64 but got {}".format(precision)) if self._logger is None: self._logger = self._get_logger(self.dataset.dataset_name) diff --git a/autoPyTorch/data/tabular_feature_validator.py b/autoPyTorch/data/tabular_feature_validator.py index 9f25956e2..7305f9de7 100644 --- a/autoPyTorch/data/tabular_feature_validator.py +++ b/autoPyTorch/data/tabular_feature_validator.py @@ -29,11 +29,11 @@ def _create_column_transformer( creates a sklearn column transformer with appropriate columns associated with their preprocessors. Args: - preprocessors (Dict[str, List]): + preprocessors (Dict[str, List[BaseEstimator]]): Dictionary containing list of numerical and categorical preprocessors. - numerical_columns (List[int]): + numerical_columns (List[str]): List of names of numerical columns - categorical_columns (List[int]): + categorical_columns (List[str]): List of names of categorical columns Returns: ColumnTransformer @@ -135,7 +135,7 @@ def comparator(cmp1: str, cmp2: str) -> int: choices = ['categorical', 'numerical'] if cmp1 not in choices or cmp2 not in choices: raise ValueError('The comparator for the column order only accepts {}, ' - 'but got {} and {}'.format(choices, cmp1, cmp2)) + 'but got {} and {}'.format(choices, cmp1, cmp2)) idx1, idx2 = choices.index(cmp1), choices.index(cmp2) return idx1 - idx2 @@ -274,9 +274,6 @@ def _check_data( if exist_object_columns: X = self.infer_objects(X) - # Define the column to be encoded here as the feature validator is fitted once - # per estimator - # enc_columns, _ = self._get_columns_to_encode(X) column_order = [column for column in X.columns] if len(self.column_order) > 0: if self.column_order != column_order: diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/utils.py b/autoPyTorch/pipeline/components/setup/network_backbone/utils.py index 6aab99449..7b5287062 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/utils.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/utils.py @@ -136,11 +136,28 @@ def shake_drop_get_bl( is_training: bool, is_cuda: bool ) -> torch.Tensor: + """ + The sampling of Bernoulli random variable + based on Eq. (4) in the paper + Args: + block_index (int): The index of the block from the input layer + min_prob_no_shake (float): The initial shake probability + num_blocks (int): The total number of building blocks + is_training (bool): Whether it is training + is_cuda (bool): Whether the tensor is on CUDA + Returns: + bl (torch.Tensor): a Bernoulli random variable in {0, 1} + Reference: + ShakeDrop Regularization for Deep Residual Learning + Yoshihiro Yamada et. al. 
(2020) + paper: https://arxiv.org/pdf/1802.02375.pdf + implementation: https://github.com/imenurok/ShakeDrop + """ pl = 1 - ((block_index + 1) / num_blocks) * (1 - min_prob_no_shake) if is_training: # Move to torch.randn(1) for reproducibility - bl = torch.Tensor(1.0) if torch.randn(1) <= pl else torch.Tensor(0.0) + bl = torch.Tensor(1.0) if torch.rand(1) <= pl else torch.Tensor(0.0) else: bl = torch.Tensor(pl) diff --git a/autoPyTorch/pipeline/components/training/trainer/RowCutMixTrainer.py b/autoPyTorch/pipeline/components/training/trainer/RowCutMixTrainer.py index 1c8a78d38..00012c711 100644 --- a/autoPyTorch/pipeline/components/training/trainer/RowCutMixTrainer.py +++ b/autoPyTorch/pipeline/components/training/trainer/RowCutMixTrainer.py @@ -36,8 +36,9 @@ def data_preparation(self, X: np.ndarray, y: np.ndarray, if beta <= 0 or r > self.alpha: return X, {'y_a': y, 'y_b': y[shuffled_indices], 'lam': 1} - cut_column_indices = torch.tensor(self.random_state.choice(range(n_columns), max(1, np.int32(n_columns * lam)), - replace=False)) + cut_column_indices = torch.tensor(self.random_state.choice(range(n_columns), + max(1, np.int32(n_columns * lam)), + replace=False)) # Replace the values in `cut_indices` columns with # the values from `permed_indices` From dc01cd3b9b669f0e81f1c0e77f869317a20bbd54 Mon Sep 17 00:00:00 2001 From: Ravin Kohli <13005107+ravinkohli@users.noreply.github.com> Date: Tue, 12 Oct 2021 10:26:50 +0200 Subject: [PATCH 44/54] Apply suggestions from code review Co-authored-by: nabenabe0928 <47781922+nabenabe0928@users.noreply.github.com> --- .../setup/network_backbone/utils.py | 2 +- test/test_data/test_feature_validator.py | 62 ++++++++----------- 2 files changed, 28 insertions(+), 36 deletions(-) diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/utils.py b/autoPyTorch/pipeline/components/setup/network_backbone/utils.py index 7b5287062..ea0a3c9d0 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/utils.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/utils.py @@ -100,7 +100,7 @@ def shake_get_alpha_beta( """ The methods used in this function have been introduced in 'ShakeShake Regularisation' https://arxiv.org/abs/1705.07485. The names have been taken from the paper as well. - Currently, this function supports `even-even`, `shake-even` and `shake-shake` + Currently, this function supports `even-even`, `shake-even`, `shake-shake` and `M3`. """ if not is_training: result = (torch.FloatTensor([0.5]), torch.FloatTensor([0.5])) diff --git a/test/test_data/test_feature_validator.py b/test/test_data/test_feature_validator.py index 535023cd2..c7b817e0f 100644 --- a/test/test_data/test_feature_validator.py +++ b/test/test_data/test_feature_validator.py @@ -317,14 +317,23 @@ def test_featurevalidator_get_columns_to_encode(): assert feature_types == ['numerical', 'numerical', 'categorical', 'categorical'] -def test_featurevalidator_remove_nan_catcolumns(): +def feature_validator_remove_nan_catcolumns(df_train: pd.DataFrame, df_test: pd.DataFrame, + ans_train: np.ndarray, ans_test: np.ndarray) -> None: + validator = TabularFeatureValidator() + validator.fit(df_train) + transformed_df_train = validator.transform(df_train) + transformed_df_test = validator.transform(df_test) + + assert np.array_equal(transformed_df_train, ans_train) + assert np.array_equal(transformed_df_test, ans_test) + + +def test_feature_validator_remove_nan_catcolumns(): """ Make sure categorical columns that have only nan values are removed. 
""" - # First case, there exist null columns in the train set - # and the same columns are not all null for the test set. - validator = TabularFeatureValidator() - + # First case, there exist null columns (B and C) in the train set + # and a same column (C) are not all null for the test set. df_train = pd.DataFrame( [ {'A': 1, 'B': np.nan, 'C': np.nan}, @@ -333,6 +342,7 @@ def test_featurevalidator_remove_nan_catcolumns(): ], dtype='category', ) + ans_train = np.array([[0, 1], [1, 0], [0, 1]], dtype=np.float64) df_test = pd.DataFrame( [ {'A': np.nan, 'B': np.nan, 'C': 5}, @@ -341,18 +351,11 @@ def test_featurevalidator_remove_nan_catcolumns(): ], dtype='category', ) + ans_test = np.array([[1, 0], [1, 0], [0, 1]], dtype=np.float64) + feature_validator_remove_nan_catcolumns(df_train, df_test, ans_train, ans_test) - validator.fit(df_train) - transformed_df_train = validator.transform(df_train) - transformed_df_test = validator.transform(df_test) - - assert np.array_equal(transformed_df_train, np.array([[0, 1], [1, 0], [0, 1]], dtype=float)) - assert np.array_equal(transformed_df_test, np.array([[1, 0], [1, 0], [0, 1]], dtype=float)) - - # Second case, there exist null columns in the training set and the same - # are null in the test set. - validator = TabularFeatureValidator() - + # Second case, there exist null columns (B and C) in the training set and + # the same columns (B and C) are null in the test set. df_train = pd.DataFrame( [ {'A': 1, 'B': np.nan, 'C': np.nan}, @@ -361,6 +364,7 @@ def test_featurevalidator_remove_nan_catcolumns(): ], dtype='category', ) + ans_train = np.array([[0, 1], [1, 0], [0, 1]], dtype=np.float64) df_test = pd.DataFrame( [ {'A': np.nan, 'B': np.nan, 'C': np.nan}, @@ -369,18 +373,11 @@ def test_featurevalidator_remove_nan_catcolumns(): ], dtype='category', ) + ans_test = np.array([[1, 0], [1, 0], [0, 1]], dtype=np.float64) + feature_validator_remove_nan_catcolumns(df_train, df_test, ans_train, ans_test) - validator.fit(df_train) - transformed_df_train = validator.transform(df_train) - transformed_df_test = validator.transform(df_test) - - assert np.array_equal(transformed_df_train, np.array([[0, 1], [1, 0], [0, 1]], dtype=float)) - assert np.array_equal(transformed_df_test, np.array([[1, 0], [1, 0], [0, 1]], dtype=float)) - - # Third case, there exist no null columns in the training set and a - # few null columns exist in the test set. - validator = TabularFeatureValidator() - + # Third case, there exist no null columns in the training set and + # null columns exist in the test set. 
df_train = pd.DataFrame( [ {'A': 1, 'B': 1}, @@ -388,6 +385,7 @@ def test_featurevalidator_remove_nan_catcolumns(): ], dtype='category', ) + ans_train = np.array([[1, 0, 1, 0], [0, 1, 0, 1]], dtype=np.float64) df_test = pd.DataFrame( [ {'A': np.nan, 'B': np.nan}, @@ -395,14 +393,8 @@ def test_featurevalidator_remove_nan_catcolumns(): ], dtype='category', ) - - validator.fit(df_train) - transformed_df_train = validator.transform(df_train) - transformed_df_test = validator.transform(df_test) - - assert np.array_equal(transformed_df_train, np.array([[1, 0, 1, 0], [0, 1, 0, 1]], dtype=float)) - assert np.array_equal(transformed_df_test, np.array([[0, 0, 0, 0], [0, 0, 0, 0]], dtype=float)) - + ans_test = np.array([[0, 0, 0, 0], [0, 0, 0, 0]], dtype=np.float64) + feature_validator_remove_nan_catcolumns(df_train, df_test, ans_train, ans_test) def test_features_unsupported_calls_are_raised(): """ From f0c2aa04b9521d59a371d263596dc573f3b4339d Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Tue, 12 Oct 2021 10:53:40 +0200 Subject: [PATCH 45/54] add test for comparator and other improvements based on PR comments --- autoPyTorch/api/base_task.py | 8 ++--- autoPyTorch/data/base_target_validator.py | 6 ++-- autoPyTorch/data/tabular_feature_validator.py | 34 +++++++++++++------ autoPyTorch/pipeline/base_pipeline.py | 31 +++++++++-------- .../training/trainer/RowCutMixTrainer.py | 2 +- test/test_data/test_feature_validator.py | 30 ++++++++++++++++ 6 files changed, 79 insertions(+), 32 deletions(-) diff --git a/autoPyTorch/api/base_task.py b/autoPyTorch/api/base_task.py index 23a3bb854..a85695801 100644 --- a/autoPyTorch/api/base_task.py +++ b/autoPyTorch/api/base_task.py @@ -9,7 +9,6 @@ import tempfile import time import typing -from typing_extensions import runtime import unittest.mock import warnings from abc import abstractmethod @@ -751,13 +750,14 @@ def run_traditional_ml( self, current_task_name: str, runtime_limit: int, - func_eval_time_limit_secs: int) -> None: + func_eval_time_limit_secs: int + ) -> None: """ This function can be used to run the suite of traditional machine - learning models during the current task (for e.g, ensemble fit, search) + learning models during the current task (for e.g, ensemble fit, search) Args: - current_task_name (str): name of the current task, + current_task_name (str): name of the current task, runtime_limit (int): time limit for fitting traditional models, func_eval_time_limit_secs (int): Time limit for a single call to the machine learning model. 
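The run_traditional_ml helper refactored above drives the traditional-model warm start from both _search and fit_ensemble. The standalone sketch below is not part of the patch series; it only condenses, with illustrative names, the time-budgeting arithmetic those call sites use: reserve one function-evaluation slot so SMAC can still fit at least one neural network, then subtract whatever the stopwatch has already consumed in the current task.

def traditional_ml_budget(total_task_time: int,
                          elapsed_in_task: float,
                          func_eval_time_limit_secs: int) -> int:
    # Reserve one evaluation slot for a neural network, as done before
    # calling run_traditional_ml() from the search path.
    runtime_limit = int(total_task_time - func_eval_time_limit_secs)
    # run_traditional_ml() then subtracts the wall-clock time already spent
    # in the current task before handing the rest to the traditional models.
    return int(runtime_limit - elapsed_in_task)


# e.g. a 300s task, 20s already elapsed, 50s per model evaluation -> 230s left
print(traditional_ml_budget(300, 20.0, 50))
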
diff --git a/autoPyTorch/data/base_target_validator.py b/autoPyTorch/data/base_target_validator.py index 0fb318476..c88dc5e9b 100644 --- a/autoPyTorch/data/base_target_validator.py +++ b/autoPyTorch/data/base_target_validator.py @@ -43,8 +43,10 @@ class BaseTargetValidator(BaseEstimator): """ def __init__(self, is_classification: bool = False, - logger: Optional[Union[PicklableClientLogger, logging.Logger - ]] = None, + logger: Optional[Union[PicklableClientLogger, + logging.Logger + ] + ] = None, ) -> None: self.is_classification = is_classification diff --git a/autoPyTorch/data/tabular_feature_validator.py b/autoPyTorch/data/tabular_feature_validator.py index 7305f9de7..ba0687c13 100644 --- a/autoPyTorch/data/tabular_feature_validator.py +++ b/autoPyTorch/data/tabular_feature_validator.py @@ -78,6 +78,29 @@ def get_tabular_preprocessors() -> Dict[str, List[BaseEstimator]]: class TabularFeatureValidator(BaseFeatureValidator): + @staticmethod + def _comparator(cmp1: str, cmp2: str) -> int: + """Order so that categorical columns come right and numerical columns come left + + Args: + cmp1 (str): First variable to compare + cmp2 (str): Second variable to compare + + Raises: + ValueError: if the values of the variables to compare + are not in 'categorical' or 'numerical' + + Returns: + int: either [0, -1, 1] + """ + choices = ['categorical', 'numerical'] + if cmp1 not in choices or cmp2 not in choices: + raise ValueError('The comparator for the column order only accepts {}, ' + 'but got {} and {}'.format(choices, cmp1, cmp2)) + + idx1, idx2 = choices.index(cmp1), choices.index(cmp2) + return idx1 - idx2 + def _fit( self, X: SUPPORTED_FEAT_TYPES, @@ -130,19 +153,10 @@ def _fit( # The column transformer reorders the feature types # therefore, we need to change the order of columns as well # This means categorical columns are shifted to the right - def comparator(cmp1: str, cmp2: str) -> int: - """ Order so that categorical columns come right and numerical columns come left """ - choices = ['categorical', 'numerical'] - if cmp1 not in choices or cmp2 not in choices: - raise ValueError('The comparator for the column order only accepts {}, ' - 'but got {} and {}'.format(choices, cmp1, cmp2)) - - idx1, idx2 = choices.index(cmp1), choices.index(cmp2) - return idx1 - idx2 self.feat_type = sorted( feat_type, - key=functools.cmp_to_key(comparator) + key=functools.cmp_to_key(self._comparator) ) # differently to categorical_columns and numerical_columns, diff --git a/autoPyTorch/pipeline/base_pipeline.py b/autoPyTorch/pipeline/base_pipeline.py index 7d4fd17a9..d98be9bd4 100644 --- a/autoPyTorch/pipeline/base_pipeline.py +++ b/autoPyTorch/pipeline/base_pipeline.py @@ -400,6 +400,7 @@ def _check_search_space_updates(self, include: Optional[Dict[str, Any]], raise ValueError("Unknown node name. Expected update node name to be in {} " "got {}".format(self.named_steps.keys(), update.node_name)) node = self.named_steps[update.node_name] + node_name = node.__class__.__name__ # if node is a choice module if hasattr(node, 'get_components'): split_hyperparameter = update.hyperparameter.split(':') @@ -429,16 +430,16 @@ def _check_search_space_updates(self, include: Optional[Dict[str, Any]], if choice not in components.keys(): raise ValueError("Unknown component choice for node {}. 
" "Expected update hyperparameter " - "to be in {}, but got {}".format(node.__class__.__name__, - components.keys(), choice)) + "to be in {}, but got {}".format(node_name, + components.keys(), choice)) # check if the component whose hyperparameter # needs to be updated is in components of the # choice module elif split_hyperparameter[0] not in components.keys(): raise ValueError("Unknown component choice for node {}. " "Expected update component " - "to be in {}, but got {}".format(node.__class__.__name__, - components.keys(), split_hyperparameter[0])) + "to be in {}, but got {}".format(node_name, + components.keys(), split_hyperparameter[0])) else: # check if hyperparameter is in the search space of the component component = components[split_hyperparameter[0]] @@ -451,15 +452,15 @@ def _check_search_space_updates(self, include: Optional[Dict[str, Any]], component.get_hyperparameter_search_space( dataset_properties=self.dataset_properties).get_hyperparameter_names()]): continue + component_hyperparameters = component.get_hyperparameter_search_space( + dataset_properties=self.dataset_properties).get_hyperparameter_names() raise ValueError("Unknown hyperparameter for component {} of node {}." " Expected update hyperparameter " "to be in {}, but got {}.".format(component.__name__, - node.__class__.__name__, - component.get_hyperparameter_search_space( - dataset_properties=self.dataset_properties - ).get_hyperparameter_names(), - split_hyperparameter[1] - ) + node_name, + component_hyperparameters, + split_hyperparameter[1] + ) ) else: if update.hyperparameter not in node.get_hyperparameter_search_space( @@ -468,13 +469,13 @@ def _check_search_space_updates(self, include: Optional[Dict[str, Any]], node.get_hyperparameter_search_space( dataset_properties=self.dataset_properties).get_hyperparameter_names()]): continue + node_hyperparameters = node.get_hyperparameter_search_space( + dataset_properties=self.dataset_properties).get_hyperparameter_names() raise ValueError("Unknown hyperparameter for node {}. " "Expected update hyperparameter " - "to be in {}, but got {}".format(node.__class__.__name__, - node. - get_hyperparameter_search_space( - dataset_properties=self.dataset_properties). 
- get_hyperparameter_names(), update.hyperparameter)) + "to be in {}, but got {}".format(node_name, + node_hyperparameters, + update.hyperparameter)) def _get_pipeline_steps(self, dataset_properties: Optional[Dict[str, Any]] ) -> List[Tuple[str, autoPyTorchChoice]]: diff --git a/autoPyTorch/pipeline/components/training/trainer/RowCutMixTrainer.py b/autoPyTorch/pipeline/components/training/trainer/RowCutMixTrainer.py index 00012c711..f85cf253f 100644 --- a/autoPyTorch/pipeline/components/training/trainer/RowCutMixTrainer.py +++ b/autoPyTorch/pipeline/components/training/trainer/RowCutMixTrainer.py @@ -29,7 +29,7 @@ def data_preparation(self, X: np.ndarray, y: np.ndarray, beta = 1.0 lam = self.random_state.beta(beta, beta) batch_size, n_columns = np.shape(X) - # shuffled_indices: Shuffled version of torch.arange(batch_size) + # shuffled_indices: Shuffled version of torch.arange(batch_size) shuffled_indices = torch.randperm(batch_size).cuda() if X.is_cuda else torch.randperm(batch_size) r = self.random_state.rand(1) diff --git a/test/test_data/test_feature_validator.py b/test/test_data/test_feature_validator.py index c7b817e0f..54570b7a8 100644 --- a/test/test_data/test_feature_validator.py +++ b/test/test_data/test_feature_validator.py @@ -1,4 +1,5 @@ import copy +import functools import numpy as np @@ -331,6 +332,11 @@ def feature_validator_remove_nan_catcolumns(df_train: pd.DataFrame, df_test: pd. def test_feature_validator_remove_nan_catcolumns(): """ Make sure categorical columns that have only nan values are removed. + The ans arrays contain the final output after calling transform on + datasets, this includes fitting and transforming a column transformer + containing simple imputation for both categorical and numerical + columns, scaling for numerical columns and one hot encoding for + categorical columns. """ # First case, there exist null columns (B and C) in the train set # and a same column (C) are not all null for the test set. 
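The docstring added in the hunk above summarises what the validator's default transform does: imputation for all columns, scaling for numerical columns, and one-hot encoding for categorical columns. The snippet below is only a self-contained approximation of that behaviour with plain scikit-learn (within the version range pinned in requirements.txt); the column names, data, and imputation strategies are illustrative assumptions, and it does not reproduce the all-NaN-column handling exercised by these tests.

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Illustrative imputation choices; the validator's actual strategies may differ.
# Categorical values are numeric here, so a constant fill of -1 is safe to encode.
categorical_pipeline = make_pipeline(
    SimpleImputer(strategy='constant', fill_value=-1),
    OneHotEncoder(categories='auto', sparse=False, handle_unknown='ignore'),
)
numerical_pipeline = make_pipeline(
    SimpleImputer(strategy='median'),
    StandardScaler(),
)

df = pd.DataFrame({'A': [1.0, np.nan, 2.0],   # treated as categorical
                   'B': [1.5, 3.0, np.nan]})  # treated as numerical
transformer = ColumnTransformer([
    ('categorical_pipeline', categorical_pipeline, ['A']),
    ('numerical_pipeline', numerical_pipeline, ['B']),
])
# Three one-hot columns for A (categories -1, 1, 2) followed by the scaled B column.
print(transformer.fit_transform(df))
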
@@ -396,6 +402,7 @@ def test_feature_validator_remove_nan_catcolumns(): ans_test = np.array([[0, 0, 0, 0], [0, 0, 0, 0]], dtype=np.float64) feature_validator_remove_nan_catcolumns(df_train, df_test, ans_train, ans_test) + def test_features_unsupported_calls_are_raised(): """ Makes sure we raise a proper message to the user, @@ -664,3 +671,26 @@ def test_feature_validator_imbalanced_data(): transformed_X_test = validator.transform(X_test) transformed_X_test = pd.DataFrame(transformed_X_test) assert not len(validator.all_nan_columns) + + +def test_comparator(): + numerical = 'numerical' + categorical = 'categorical' + + validator = TabularFeatureValidator + + feat_type = [numerical, categorical] * 10 + ans = [categorical] * 10 + [numerical] * 10 + feat_type = sorted( + feat_type, + key=functools.cmp_to_key(validator._comparator) + ) + assert ans == feat_type + + feat_type = [numerical] * 10 + [categorical] * 10 + ans = [categorical] * 10 + [numerical] * 10 + feat_type = sorted( + feat_type, + key=functools.cmp_to_key(validator._comparator) + ) + assert ans == feat_type From 57111e9c9479fa982974d145b1da5f6c3d71e8d3 Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Tue, 12 Oct 2021 11:02:56 +0200 Subject: [PATCH 46/54] fix bug in test --- test/test_data/test_feature_validator.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/test/test_data/test_feature_validator.py b/test/test_data/test_feature_validator.py index 54570b7a8..ae9b7102c 100644 --- a/test/test_data/test_feature_validator.py +++ b/test/test_data/test_feature_validator.py @@ -312,10 +312,13 @@ def test_featurevalidator_get_columns_to_encode(): for col in df.columns: df[col] = df[col].astype(col) - enc_columns, feature_types = validator._get_columns_to_encode(df) + validator.fit(df) - assert enc_columns == ['category', 'bool'] - assert feature_types == ['numerical', 'numerical', 'categorical', 'categorical'] + categorical_columns, numerical_columns, feat_type = validator._get_columns_info(df) + + assert numerical_columns == ['int', 'float'] + assert categorical_columns == ['category', 'bool'] + assert feat_type == ['numerical', 'numerical', 'categorical', 'categorical'] def feature_validator_remove_nan_catcolumns(df_train: pd.DataFrame, df_test: pd.DataFrame, From 153878f9170230d069566adbe6bfb946bb49c4fb Mon Sep 17 00:00:00 2001 From: nabenabe0928 Date: Tue, 12 Oct 2021 19:49:11 +0200 Subject: [PATCH 47/54] [fix] Fix the condition in the raising error of all_nan_columns --- autoPyTorch/data/tabular_feature_validator.py | 40 ++++++++----------- 1 file changed, 17 insertions(+), 23 deletions(-) diff --git a/autoPyTorch/data/tabular_feature_validator.py b/autoPyTorch/data/tabular_feature_validator.py index ba0687c13..e20a29e6f 100644 --- a/autoPyTorch/data/tabular_feature_validator.py +++ b/autoPyTorch/data/tabular_feature_validator.py @@ -291,33 +291,27 @@ def _check_data( column_order = [column for column in X.columns] if len(self.column_order) > 0: if self.column_order != column_order: - raise ValueError("Changing the column order of the features after fit() is " - "not supported. 
Fit() method was called with " - "{} whereas the new features have {} as type".format(self.column_order, - column_order, ) - ) + raise ValueError("The column order of the features must not be changed after fit(), but" + " the column order are different between training ({}) and" + " test ({}) datasets.".format(self.column_order, column_order)) else: self.column_order = column_order dtypes = [dtype.name for dtype in X.dtypes] - if len(self.dtypes) > 0: - dtypes_diff = [s_dtype != dtype for s_dtype, dtype in zip(self.dtypes, dtypes)] - if any(dtypes_diff): - if self.all_nan_columns is not None and len(self.all_nan_columns) > 0: - if len(set(X.columns[dtypes_diff]).difference(self.all_nan_columns)) != 0: - # we expect the dtypes to only be different if the column belongs - # to all_nan_columns as these columns would be imputed. if there is - # a value in the test set for a column in all_nan_columns, pandas - # does not recognise the dtype of the test column properly - raise ValueError("Changing the dtype of the features after fit() is " - "not supported. The dtype of some columns are different " - "between training and test datasets. Fit() method was called with " - "{} whereas the new features have {} as type".format(self.dtypes, - dtypes, - ) - ) - else: + + dtypes_diff = [s_dtype != dtype for s_dtype, dtype in zip(self.dtypes, dtypes)] + if len(self.dtypes) == 0: self.dtypes = dtypes + elif ( + any(dtypes_diff) # the dtypes of some columns are different in train and test dataset + and self.all_nan_columns is not None # Ignore all_nan_columns is None + and len(set(X.columns[dtypes_diff]).difference(self.all_nan_columns)) != 0 + ): + # The dtypes can be different if and only if the column belongs + # to all_nan_columns as these columns would be imputed. 
+ raise ValueError("The dtype of the features must not be changed after fit(), but" + " the dtypes of some columns are different between training ({}) and" + " test ({}) datasets.".format(self.dtypes, dtypes)) def _get_columns_info( self, @@ -350,7 +344,7 @@ def _get_columns_info( continue column_dtype = self.dtypes[i] err_msg = "Valid types are `numerical`, `categorical` or `boolean`, " \ - "but input Column {} has an invalid type `{}`.".format(column, column_dtype) + "but input column {} has an invalid type `{}`.".format(column, column_dtype) if column_dtype in ['category', 'bool']: categorical_columns.append(column) feat_type.append('categorical') From 64862fef618cc08ce5310bcfcf093e72637d756f Mon Sep 17 00:00:00 2001 From: nabenabe0928 Date: Tue, 12 Oct 2021 19:52:50 +0200 Subject: [PATCH 48/54] [refactor] Unite name conventions of numpy array and pandas dataframe --- autoPyTorch/data/base_feature_validator.py | 4 ++-- autoPyTorch/data/tabular_feature_validator.py | 10 +++++----- test/test_data/test_feature_validator.py | 4 ++-- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/autoPyTorch/data/base_feature_validator.py b/autoPyTorch/data/base_feature_validator.py index 9ed46d6e6..420f0808c 100644 --- a/autoPyTorch/data/base_feature_validator.py +++ b/autoPyTorch/data/base_feature_validator.py @@ -89,7 +89,7 @@ def fit( # If a list was provided, it will be converted to pandas if isinstance(X_train, list): - X_train, X_test = self.list_to_dataframe(X_train, X_test) + X_train, X_test = self.list_to_pandas(X_train, X_test) self._check_data(X_train) @@ -158,7 +158,7 @@ def transform( raise NotImplementedError() - def list_to_dataframe( + def list_to_pandas( self, X_train: SUPPORTED_FEAT_TYPES, X_test: Optional[SUPPORTED_FEAT_TYPES] = None, diff --git a/autoPyTorch/data/tabular_feature_validator.py b/autoPyTorch/data/tabular_feature_validator.py index e20a29e6f..5f3bdc787 100644 --- a/autoPyTorch/data/tabular_feature_validator.py +++ b/autoPyTorch/data/tabular_feature_validator.py @@ -123,7 +123,7 @@ def _fit( # gives us information about the column dtype if isinstance(X, np.ndarray): - X = self.numpy_array_to_pandas(X) + X = self.numpy_to_pandas(X) # Replace the data type from the previously saved type. 
self.data_type = type(X) # save all the information about the column order and data types @@ -194,10 +194,10 @@ def transform( # If a list was provided, it will be converted to pandas if isinstance(X, list): - X, _ = self.list_to_dataframe(X) + X, _ = self.list_to_pandas(X) if isinstance(X, np.ndarray): - X = self.numpy_array_to_pandas(X) + X = self.numpy_to_pandas(X) if hasattr(X, "iloc") and not scipy.sparse.issparse(X): X = cast(pd.DataFrame, X) @@ -380,7 +380,7 @@ def _get_columns_info( return categorical_columns, numerical_columns, feat_type - def list_to_dataframe( + def list_to_pandas( self, X_train: SUPPORTED_FEAT_TYPES, X_test: Optional[SUPPORTED_FEAT_TYPES] = None, @@ -418,7 +418,7 @@ def list_to_dataframe( return X_train, X_test @staticmethod - def numpy_array_to_pandas( + def numpy_to_pandas( X: np.ndarray, ) -> pd.DataFrame: """ diff --git a/test/test_data/test_feature_validator.py b/test/test_data/test_feature_validator.py index ae9b7102c..2b388f2e3 100644 --- a/test/test_data/test_feature_validator.py +++ b/test/test_data/test_feature_validator.py @@ -284,9 +284,9 @@ def test_featurevalidator_fitontypeA_transformtypeB(input_data_featuretest): if isinstance(input_data_featuretest, pd.DataFrame): pytest.skip("Column order change in pandas is not supported") elif isinstance(input_data_featuretest, np.ndarray): - complementary_type = validator.numpy_array_to_pandas(input_data_featuretest) + complementary_type = validator.numpy_to_pandas(input_data_featuretest) elif isinstance(input_data_featuretest, list): - complementary_type, _ = validator.list_to_dataframe(input_data_featuretest) + complementary_type, _ = validator.list_to_pandas(input_data_featuretest) elif sparse.issparse(input_data_featuretest): complementary_type = sparse.csr_matrix(input_data_featuretest.todense()) else: From 410c7fe39afc25f6b0894398ad6dcfa1f031c1df Mon Sep 17 00:00:00 2001 From: nabenabe0928 Date: Tue, 12 Oct 2021 20:39:42 +0200 Subject: [PATCH 49/54] [doc] Add the description about the tabular feature transformation --- test/test_data/test_feature_validator.py | 33 ++++++++++++++++++++---- 1 file changed, 28 insertions(+), 5 deletions(-) diff --git a/test/test_data/test_feature_validator.py b/test/test_data/test_feature_validator.py index 2b388f2e3..3a70549b0 100644 --- a/test/test_data/test_feature_validator.py +++ b/test/test_data/test_feature_validator.py @@ -335,14 +335,37 @@ def feature_validator_remove_nan_catcolumns(df_train: pd.DataFrame, df_test: pd. def test_feature_validator_remove_nan_catcolumns(): """ Make sure categorical columns that have only nan values are removed. - The ans arrays contain the final output after calling transform on - datasets, this includes fitting and transforming a column transformer - containing simple imputation for both categorical and numerical - columns, scaling for numerical columns and one hot encoding for - categorical columns. + Transform performs the folloing: + * simple imputation for both + * scaling for numerical + * one-hot encoding for categorical + For example, + data = [ + {'A': 1, 'B': np.nan, 'C': np.nan}, + {'A': np.nan, 'B': 3, 'C': np.nan}, + {'A': 2, 'B': np.nan, 'C': np.nan} + ] + and suppose all the columns are categorical, + then + * `A` in {np.nan, 1, 2} + * `B` in {np.nan, 3} + * `C` in {np.nan} <=== it will be dropped. 
+ + So in the column A, + * np.nan ==> [1, 0, 0] + * 1 ==> [0, 1, 0] + * 2 ==> [0, 0, 1] + in the column B, + * np.nan ==> [1, 0] + * 3 ==> [0, 1] + Therefore, by concatenating, + * {'A': 1, 'B': np.nan, 'C': np.nan} ==> [0, 1, 0, 1, 0] + * {'A': np.nan, 'B': 3, 'C': np.nan} ==> [1, 0, 0, 0, 1] + * {'A': 2, 'B': np.nan, 'C': np.nan} ==> [0, 0, 1, 1, 0] """ # First case, there exist null columns (B and C) in the train set # and a same column (C) are not all null for the test set. + df_train = pd.DataFrame( [ {'A': 1, 'B': np.nan, 'C': np.nan}, From baa7ab87fe64cf6d8e1f9638ff029147e6b2a310 Mon Sep 17 00:00:00 2001 From: nabenabe0928 Date: Tue, 12 Oct 2021 21:01:45 +0200 Subject: [PATCH 50/54] [doc] Add the description of the tabular feature transformation --- autoPyTorch/data/tabular_feature_validator.py | 35 +++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/autoPyTorch/data/tabular_feature_validator.py b/autoPyTorch/data/tabular_feature_validator.py index 5f3bdc787..ae449fa14 100644 --- a/autoPyTorch/data/tabular_feature_validator.py +++ b/autoPyTorch/data/tabular_feature_validator.py @@ -188,6 +188,41 @@ def transform( Return: np.ndarray: The transformed array + + Note: + The default transform performs the folloing: + * simple imputation for both + * scaling for numerical + * one-hot encoding for categorical + For example, here is a simple case + of which all the columns are categorical. + data = [ + {'A': 1, 'B': np.nan, 'C': np.nan}, + {'A': np.nan, 'B': 3, 'C': np.nan}, + {'A': 2, 'B': np.nan, 'C': np.nan} + ] + and suppose all the columns are categorical, + then + * `A` in {np.nan, 1, 2} + * `B` in {np.nan, 3} + * `C` in {np.nan} <=== it will be dropped. + + So in the column A, + * np.nan ==> [1, 0, 0] (always the index 0) + * 1 ==> [0, 1, 0] + * 2 ==> [0, 0, 1] + in the column B, + * np.nan ==> [1, 0] + * 3 ==> [0, 1] + Therefore, by concatenating, + * {'A': 1, 'B': np.nan, 'C': np.nan} ==> [0, 1, 0, 1, 0] + * {'A': np.nan, 'B': 3, 'C': np.nan} ==> [1, 0, 0, 0, 1] + * {'A': 2, 'B': np.nan, 'C': np.nan} ==> [0, 0, 1, 1, 0] + ==> [ + [0, 1, 0, 1, 0], + [1, 0, 0, 0, 1], + [0, 0, 1, 1, 0] + ] """ if not self._is_fitted: raise NotFittedError("Cannot call transform on a validator that is not fitted") From e1eb8547ac45e0747527a3d33c307837eb21e9ab Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Wed, 20 Oct 2021 11:31:56 +0200 Subject: [PATCH 51/54] address comments from arlind --- autoPyTorch/data/tabular_feature_validator.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/autoPyTorch/data/tabular_feature_validator.py b/autoPyTorch/data/tabular_feature_validator.py index ae449fa14..62f571037 100644 --- a/autoPyTorch/data/tabular_feature_validator.py +++ b/autoPyTorch/data/tabular_feature_validator.py @@ -55,7 +55,7 @@ def _create_column_transformer( def get_tabular_preprocessors() -> Dict[str, List[BaseEstimator]]: """ - This function creates a Dictionary containing list + This function creates a Dictionary containing a list of numerical and categorical preprocessors Returns: Dict[str, List[BaseEstimator]] @@ -80,7 +80,7 @@ class TabularFeatureValidator(BaseFeatureValidator): @staticmethod def _comparator(cmp1: str, cmp2: str) -> int: - """Order so that categorical columns come right and numerical columns come left + """Order so that categorical columns come left and numerical columns come right Args: cmp1 (str): First variable to compare @@ -97,7 +97,6 @@ def _comparator(cmp1: str, cmp2: str) -> int: if cmp1 not in choices or cmp2 not in choices: 
raise ValueError('The comparator for the column order only accepts {}, ' 'but got {} and {}'.format(choices, cmp1, cmp2)) - idx1, idx2 = choices.index(cmp1), choices.index(cmp2) return idx1 - idx2 @@ -152,7 +151,7 @@ def _fit( # The column transformer reorders the feature types # therefore, we need to change the order of columns as well - # This means categorical columns are shifted to the right + # This means categorical columns are shifted to the left self.feat_type = sorted( feat_type, From 4545fdbf08b8abc62df1a6a3d8c54c4737ea6170 Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Wed, 20 Oct 2021 11:41:26 +0200 Subject: [PATCH 52/54] address comments from arlind --- autoPyTorch/data/tabular_feature_validator.py | 1 + 1 file changed, 1 insertion(+) diff --git a/autoPyTorch/data/tabular_feature_validator.py b/autoPyTorch/data/tabular_feature_validator.py index 62f571037..669576b9c 100644 --- a/autoPyTorch/data/tabular_feature_validator.py +++ b/autoPyTorch/data/tabular_feature_validator.py @@ -57,6 +57,7 @@ def get_tabular_preprocessors() -> Dict[str, List[BaseEstimator]]: """ This function creates a Dictionary containing a list of numerical and categorical preprocessors + Returns: Dict[str, List[BaseEstimator]] """ From 8519a48191d58fef95645c14c375a4ccf94687cb Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Wed, 20 Oct 2021 11:50:34 +0200 Subject: [PATCH 53/54] change to as_tensor and address comments from arlind --- .../pipeline/components/setup/network_backbone/utils.py | 5 +++-- .../pipeline/components/training/trainer/RowCutMixTrainer.py | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/utils.py b/autoPyTorch/pipeline/components/setup/network_backbone/utils.py index ea0a3c9d0..96390d003 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/utils.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/utils.py @@ -28,6 +28,7 @@ def get_output_shape(network: torch.nn.Module, input_shape: typing.Tuple[int, .. 
placeholder = torch.randn((2, *input_shape), dtype=torch.float) with torch.no_grad(): output = network(placeholder) + return tuple(output.shape[1:]) @@ -157,9 +158,9 @@ def shake_drop_get_bl( if is_training: # Move to torch.randn(1) for reproducibility - bl = torch.Tensor(1.0) if torch.rand(1) <= pl else torch.Tensor(0.0) + bl = torch.as_tensor(1.0) if torch.rand(1) <= pl else torch.as_tensor(0.0) else: - bl = torch.Tensor(pl) + bl = torch.as_tensor(pl) if is_cuda: bl = bl.cuda() diff --git a/autoPyTorch/pipeline/components/training/trainer/RowCutMixTrainer.py b/autoPyTorch/pipeline/components/training/trainer/RowCutMixTrainer.py index f85cf253f..67de50108 100644 --- a/autoPyTorch/pipeline/components/training/trainer/RowCutMixTrainer.py +++ b/autoPyTorch/pipeline/components/training/trainer/RowCutMixTrainer.py @@ -36,7 +36,7 @@ def data_preparation(self, X: np.ndarray, y: np.ndarray, if beta <= 0 or r > self.alpha: return X, {'y_a': y, 'y_b': y[shuffled_indices], 'lam': 1} - cut_column_indices = torch.tensor(self.random_state.choice(range(n_columns), + cut_column_indices = torch.as_tensor(self.random_state.choice(range(n_columns), max(1, np.int32(n_columns * lam)), replace=False)) From 2c3a525ac6fb6fbcb066ef051dfce69ab9472d66 Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Wed, 20 Oct 2021 11:56:43 +0200 Subject: [PATCH 54/54] correct description for functions in data module --- autoPyTorch/data/base_feature_validator.py | 10 ++++----- autoPyTorch/data/base_target_validator.py | 8 +++---- autoPyTorch/data/base_validator.py | 4 ++-- autoPyTorch/data/tabular_feature_validator.py | 22 +++++++++---------- autoPyTorch/data/tabular_target_validator.py | 8 +++---- 5 files changed, 26 insertions(+), 26 deletions(-) diff --git a/autoPyTorch/data/base_feature_validator.py b/autoPyTorch/data/base_feature_validator.py index 420f0808c..a7cab5913 100644 --- a/autoPyTorch/data/base_feature_validator.py +++ b/autoPyTorch/data/base_feature_validator.py @@ -79,7 +79,7 @@ def fit( The supported data types are List, numpy arrays and pandas DataFrames. 
CSR sparse data types are also supported - Arguments: + Args: X_train (SUPPORTED_FEAT_TYPES): A set of features that are going to be validated (type and dimensionality checks) and a encoder fitted in the case the data needs encoding @@ -115,7 +115,7 @@ def _fit( X: SUPPORTED_FEAT_TYPES, ) -> BaseEstimator: """ - Arguments: + Args: X (SUPPORTED_FEAT_TYPES): A set of features that are going to be validated (type and dimensionality checks) and a encoder fitted in the case the data needs encoding @@ -133,7 +133,7 @@ def _check_data( """ Feature dimensionality and data type checks - Arguments: + Args: X (SUPPORTED_FEAT_TYPES): A set of features that are going to be validated (type and dimensionality checks) and a encoder fitted in the case the data needs encoding @@ -146,7 +146,7 @@ def transform( X: SUPPORTED_FEAT_TYPES, ) -> np.ndarray: """ - Arguments: + Args: X_train (SUPPORTED_FEAT_TYPES): A set of features, whose categorical features are going to be transformed @@ -168,7 +168,7 @@ def list_to_pandas( If test data is provided, we proactively match it to train data - Arguments: + Args: X_train (SUPPORTED_FEAT_TYPES): A set of features that are going to be validated (type and dimensionality checks) and a encoder fitted in the case the data needs encoding diff --git a/autoPyTorch/data/base_target_validator.py b/autoPyTorch/data/base_target_validator.py index c88dc5e9b..f191e985b 100644 --- a/autoPyTorch/data/base_target_validator.py +++ b/autoPyTorch/data/base_target_validator.py @@ -75,7 +75,7 @@ def fit( Validates and fit a categorical encoder (if needed) to the targets The supported data types are List, numpy arrays and pandas DataFrames. - Arguments: + Args: y_train (SUPPORTED_TARGET_TYPES) A set of targets set aside for training y_test (Union[SUPPORTED_TARGET_TYPES]) @@ -132,7 +132,7 @@ def _fit( y_test: Optional[SUPPORTED_TARGET_TYPES] = None, ) -> BaseEstimator: """ - Arguments: + Args: y_train (SUPPORTED_TARGET_TYPES) The labels of the current task. They are going to be encoded in case of classification @@ -146,7 +146,7 @@ def transform( y: Union[SUPPORTED_TARGET_TYPES], ) -> np.ndarray: """ - Arguments: + Args: y (SUPPORTED_TARGET_TYPES) A set of targets that are going to be encoded if the current task is classification @@ -163,7 +163,7 @@ def inverse_transform( """ Revert any encoding transformation done on a target array - Arguments: + Args: y (Union[np.ndarray, pd.DataFrame, pd.Series]): Target array to be transformed back to original form before encoding Returns: diff --git a/autoPyTorch/data/base_validator.py b/autoPyTorch/data/base_validator.py index 7528d56ab..4ef54c665 100644 --- a/autoPyTorch/data/base_validator.py +++ b/autoPyTorch/data/base_validator.py @@ -58,7 +58,7 @@ def fit( + Checks for dimensionality as well as missing values are performed. + If performing a classification task, the data is going to be encoded - Arguments: + Args: X_train (SUPPORTED_FEAT_TYPES): A set of features that are going to be validated (type and dimensionality checks). 
If this data contains categorical columns, an encoder is going to @@ -102,7 +102,7 @@ def transform( """ Transform the given target or features to a numpy array - Arguments: + Args: X (SUPPORTED_FEAT_TYPES): A set of features to transform y (typing.Optional[SUPPORTED_TARGET_TYPES]): diff --git a/autoPyTorch/data/tabular_feature_validator.py b/autoPyTorch/data/tabular_feature_validator.py index 669576b9c..62bd0b465 100644 --- a/autoPyTorch/data/tabular_feature_validator.py +++ b/autoPyTorch/data/tabular_feature_validator.py @@ -110,7 +110,7 @@ def _fit( features (from categorical for example) to a numerical value that further stages will be able to use - Arguments: + Args: X (SUPPORTED_FEAT_TYPES): A set of features that are going to be validated (type and dimensionality checks) and a encoder fitted in the case the data needs encoding @@ -180,7 +180,7 @@ def transform( Validates and fit a categorical encoder (if needed) to the features. The supported data types are List, numpy arrays and pandas DataFrames. - Arguments: + Args: X_train (SUPPORTED_FEAT_TYPES): A set of features, whose categorical features are going to be transformed @@ -279,7 +279,7 @@ def _check_data( """ Feature dimensionality and data type checks - Arguments: + Args: X (SUPPORTED_FEAT_TYPES): A set of features that are going to be validated (type and dimensionality checks) and a encoder fitted in the case the data needs encoding @@ -355,7 +355,7 @@ def _get_columns_info( """ Return the columns to be encoded from a pandas dataframe - Arguments: + Args: X (pd.DataFrame) A set of features that are going to be validated (type and dimensionality checks) and a encoder fitted in the case the data needs encoding @@ -425,7 +425,7 @@ def list_to_pandas( If test data is provided, we proactively match it to train data - Arguments: + Args: X_train (SUPPORTED_FEAT_TYPES): A set of features that are going to be validated (type and dimensionality checks) and a encoder fitted in the case the data needs encoding @@ -459,7 +459,7 @@ def numpy_to_pandas( """ Converts a numpy array to pandas for type inference - Arguments: + Args: X (np.ndarray): data to be interpreted. @@ -474,7 +474,7 @@ def infer_objects(self, X: pd.DataFrame) -> pd.DataFrame: This has to be done once, so the test and train data are treated equally - Arguments: + Args: X (pd.DataFrame): data to be interpreted. @@ -512,12 +512,12 @@ def has_object_columns( """ Indicate whether on a Series of dtypes for a Pandas DataFrame there exists one or more object columns. + + Args: + feature_types (pd.Series): The feature types for a DataFrame. - Arguments: - feature_types (pd.Series): - The feature types for a DataFrame. Returns: - bool: + bool: True if the DataFrame dtypes contain an object column, False otherwise. """ diff --git a/autoPyTorch/data/tabular_target_validator.py b/autoPyTorch/data/tabular_target_validator.py index 239791768..7cbd88c38 100644 --- a/autoPyTorch/data/tabular_target_validator.py +++ b/autoPyTorch/data/tabular_target_validator.py @@ -28,7 +28,7 @@ def _fit( It does so by also using the classes from the test data, to prevent encoding errors - Arguments: + Args: y_train (SUPPORTED_TARGET_TYPES) The labels of the current task. They are going to be encoded in case of classification @@ -100,7 +100,7 @@ def transform( Validates and fit a categorical encoder (if needed) to the features. The supported data types are List, numpy arrays and pandas DataFrames. 
- Arguments: + Args: y (SUPPORTED_TARGET_TYPES) A set of targets that are going to be encoded if the current task is classification @@ -152,7 +152,7 @@ def inverse_transform( """ Revert any encoding transformation done on a target array - Arguments: + Args: y (typing.Union[np.ndarray, pd.DataFrame, pd.Series]): Target array to be transformed back to original form before encoding Returns: @@ -189,7 +189,7 @@ def _check_data( """ Perform dimensionality and data type checks on the targets - Arguments: + Args: y (typing.Union[np.ndarray, pd.DataFrame, pd.Series]): A set of features whose dimensionality and data type is going to be checked """
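
Note on the first hunk above: it completes a shape-inference helper that pushes a small random placeholder batch through the network under torch.no_grad() and strips the batch dimension from the result. A minimal sketch of that pattern, with the wrapper signature and the toy network assumed for illustration rather than copied from the repository:

    import torch
    from torch import nn
    from typing import Tuple

    def get_output_shape(network: nn.Module, input_shape: Tuple[int, ...]) -> Tuple[int, ...]:
        # Run a throw-away batch of two random samples through the network
        # and report the per-sample output shape (batch dimension stripped).
        placeholder = torch.randn((2, *input_shape), dtype=torch.float)
        with torch.no_grad():
            output = network(placeholder)
        return tuple(output.shape[1:])

    # Illustrative usage with a toy network that is not part of autoPyTorch.
    net = nn.Sequential(nn.Linear(10, 32), nn.ReLU(), nn.Linear(32, 4))
    print(get_output_shape(net, (10,)))  # (4,)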
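
Note on the shake_drop_get_bl and RowCutMixTrainer hunks: they swap the legacy torch.Tensor constructor for torch.as_tensor. The legacy constructor treats a bare number as a size rather than a value, so it is not a reliable way to build the scalar gate tensor, while torch.as_tensor wraps the given value and reuses the memory of an existing numpy array when the dtype already matches. A short sketch of the difference; pl and rng below are made-up illustration values:

    import numpy as np
    import torch

    # Legacy constructor: the argument is interpreted as a size, not a value.
    uninitialised = torch.Tensor(3)
    print(uninitialised.shape)   # torch.Size([3]), contents are arbitrary

    # as_tensor wraps the value itself, giving the 0-dim gate tensor shake-drop needs.
    pl = 0.5
    bl = torch.as_tensor(1.0) if torch.rand(1) <= pl else torch.as_tensor(0.0)
    print(bl)                    # tensor(1.) or tensor(0.)

    # For CutMix, as_tensor converts the sampled column indices without copying
    # when the numpy dtype already matches.
    rng = np.random.RandomState(0)
    cut_column_indices = torch.as_tensor(rng.choice(range(10), size=3, replace=False))
    print(cut_column_indices)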
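
The remaining hunks in this patch move the data-module docstrings from an Arguments: heading to Google-style Args: sections. For reference, a hypothetical method documented in that style; the names and types are illustrative, not copied from the patch:

    def transform(self, X: "SUPPORTED_FEAT_TYPES") -> "np.ndarray":
        """Transform the validated features into a numeric array.

        Args:
            X (SUPPORTED_FEAT_TYPES):
                A set of features whose categorical columns are encoded.

        Returns:
            np.ndarray:
                The transformed array.
        """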