Skip to content

Commit 359b4c9

Browse files
committed
Initial try at an enhancement for the tabular validator
1 parent 9cdfb64 commit 359b4c9

File tree

2 files changed

+116
-41
lines changed

2 files changed

+116
-41
lines changed

autoPyTorch/data/base_feature_validator.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -122,6 +122,7 @@ def _fit(
122122
self:
123123
The fitted base estimator
124124
"""
125+
125126
raise NotImplementedError()
126127

127128
def _check_data(
@@ -136,6 +137,7 @@ def _check_data(
136137
A set of features that are going to be validated (type and dimensionality
137138
checks) and an encoder fitted in the case the data needs encoding
138139
"""
140+
139141
raise NotImplementedError()
140142

141143
def transform(
@@ -152,4 +154,30 @@ def transform(
152154
np.ndarray:
153155
The transformed array
154156
"""
157+
158+
raise NotImplementedError()
159+
160+
def list_to_dataframe(
    self,
    X_train: SUPPORTED_FEAT_TYPES,
    X_test: typing.Optional[SUPPORTED_FEAT_TYPES] = None,
) -> typing.Tuple[pd.DataFrame, typing.Optional[pd.DataFrame]]:
    """
    Turn list-based features into pandas DataFrames, inferring the column
    types along the way.

    When test data is supplied as well, it is proactively matched to the
    train data.

    Arguments:
        X_train (SUPPORTED_FEAT_TYPES):
            The training features to convert into a pandas DataFrame
        X_test (typing.Optional[SUPPORTED_FEAT_TYPES]):
            A hold out set of data used for checking
    Returns:
        pd.DataFrame:
            train data converted from list to pandas DataFrame
        typing.Optional[pd.DataFrame]:
            test data converted from list to pandas DataFrame
    """
    # The base validator only declares this hook; this version always raises.
    raise NotImplementedError()

autoPyTorch/data/tabular_feature_validator.py

Lines changed: 88 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import functools
22
from typing import Any, Dict, List, Optional, Tuple, Union, cast
33

4+
45
import numpy as np
56

67
import pandas as pd
@@ -38,6 +39,7 @@ def _create_column_transformer(
3839
Returns:
3940
ColumnTransformer
4041
"""
42+
4143
numerical_pipeline = 'drop'
4244
categorical_pipeline = 'drop'
4345
if len(numerical_columns) > 0:
@@ -63,18 +65,25 @@ def get_tabular_preprocessors() -> Dict[str, List[BaseEstimator]]:
6365
preprocessors['numerical'] = list()
6466
preprocessors['categorical'] = list()
6567

66-
preprocessors['categorical'].append(OneHotEncoder(
67-
categories='auto',
68-
sparse=False,
69-
handle_unknown='ignore'))
70-
preprocessors['numerical'].append(SimpleImputer(strategy='median',
71-
copy=False))
72-
preprocessors['numerical'].append(StandardScaler(with_mean=True, with_std=True, copy=False))
68+
preprocessors['categorical'].append(
69+
OneHotEncoder(
70+
categories='auto',
71+
sparse=False,
72+
handle_unknown='ignore',
73+
)
74+
)
75+
preprocessors['numerical'].append(
76+
SimpleImputer(
77+
strategy='median',
78+
copy=False,
79+
)
80+
)
7381

7482
return preprocessors
7583

7684

7785
class TabularFeatureValidator(BaseFeatureValidator):
86+
7887
def _fit(
7988
self,
8089
X: SUPPORTED_FEAT_TYPES,
@@ -96,24 +105,27 @@ def _fit(
96105
# The final output of a validator is a numpy array. But pandas
97106
# gives us information about the column dtype
98107
if isinstance(X, np.ndarray):
108+
99109
X = self.numpy_array_to_pandas(X)
110+
# Replace the data type from the previously saved type.
111+
self.data_type = type(X)
112+
# save all the information about the column order and data types
113+
self._check_data(X)
100114

101115
if hasattr(X, "iloc") and not scipy.sparse.issparse(X):
102-
X = cast(pd.DataFrame, X)
103-
104-
if not X.select_dtypes(include='object').empty:
105-
X = self.infer_objects(X)
106116

107-
self._check_data(X)
117+
X = cast(pd.DataFrame, X)
108118
categorical_columns, numerical_columns, feat_type = self._get_columns_info(X)
109119

110120
self.enc_columns = categorical_columns
111121
if len(categorical_columns) >= 0:
112122
X = self.impute_nan_in_categories(X)
113123
preprocessors = get_tabular_preprocessors()
114-
self.column_transformer = _create_column_transformer(preprocessors=preprocessors,
115-
numerical_columns=numerical_columns,
116-
categorical_columns=categorical_columns)
124+
self.column_transformer = _create_column_transformer(
125+
preprocessors=preprocessors,
126+
numerical_columns=numerical_columns,
127+
categorical_columns=categorical_columns,
128+
)
117129

118130
# Mypy redefinition
119131
assert self.column_transformer is not None
@@ -142,21 +154,24 @@ def comparator(cmp1: str, cmp2: str) -> int:
142154

143155
if len(categorical_columns) > 0:
144156
self.categories = [
145-
# We fit an ordinal encoder, where all categorical
157+
# We fit a one-hot encoder, where all categorical
146158
# columns are shifted to the left
147159
list(range(len(cat)))
148160
for cat in self.column_transformer.named_transformers_[
149161
'categorical_pipeline'].named_steps['onehotencoder'].categories_
150162
]
151163

164+
# unlike categorical_columns and numerical_columns,
165+
# this saves the index of the column.
152166
for i, type_ in enumerate(self.feat_type):
153167
if 'numerical' in type_:
154168
self.numerical_columns.append(i)
155169
else:
156170
self.categorical_columns.append(i)
157171

158172
# Lastly, store the number of features
159-
self.num_features = np.shape(X)[1]
173+
self.num_features = len(X.columns)
174+
160175
return self
161176

162177
def transform(
@@ -189,10 +204,6 @@ def transform(
189204
if hasattr(X, "iloc") and not scipy.sparse.issparse(X):
190205
X = cast(pd.DataFrame, X)
191206

192-
# Also remove the object dtype for new data
193-
if not X.select_dtypes(include='object').empty:
194-
X = self.infer_objects(X)
195-
196207
# Check the data here so we catch problems on new test data
197208
self._check_data(X)
198209
# We also need to fillna on the transformation
@@ -268,13 +279,13 @@ def _check_data(
268279
X = cast(pd.DataFrame, X)
269280

270281
# Handle objects if possible
271-
if not X.select_dtypes(include='object').empty:
282+
object_columns_indicator = has_object_columns(X.dtypes)
283+
if object_columns_indicator:
272284
X = self.infer_objects(X)
273285

274286
# Define the column to be encoded here as the feature validator is fitted once
275287
# per estimator
276288
# enc_columns, _ = self._get_columns_to_encode(X)
277-
278289
column_order = [column for column in X.columns]
279290
if len(self.column_order) > 0:
280291
if self.column_order != column_order:
@@ -310,8 +321,10 @@ def _get_columns_info(
310321
A set of features that are going to be validated (type and dimensionality
311322
checks) and an encoder fitted in the case the data needs encoding
312323
Returns:
313-
enc_columns (List[str]):
314-
Columns to encode, if any
324+
categorical_columns: (List[str])
325+
Categorical columns.
326+
numerical_columns: (List[str])
327+
Numerical columns.
315328
feat_type:
316329
Type of each column numerical/categorical
317330
"""
@@ -323,14 +336,15 @@ def _get_columns_info(
323336

324337
# Make sure each column is a valid type
325338
for i, column in enumerate(X.columns):
326-
if X[column].dtype.name in ['category', 'bool']:
327-
339+
column_dtype = self.dtypes[i]
340+
if column_dtype.name in ['category', 'bool']:
328341
categorical_columns.append(column)
329342
feat_type.append('categorical')
330343
# Move away from np.issubdtype as it causes
331344
# TypeError: data type not understood in certain pandas types
332-
elif not is_numeric_dtype(X[column]):
333-
if X[column].dtype.name == 'object':
345+
elif not is_numeric_dtype(column_dtype):
346+
# TODO verify how this could happen when we always convert the object dtypes to category
347+
if column_dtype.name == 'object':
334348
raise ValueError(
335349
"Input Column {} has invalid type object. "
336350
"Cast it to a valid dtype before using it in AutoPyTorch. "
@@ -345,7 +359,7 @@ def _get_columns_info(
345359
)
346360
)
347361
elif pd.core.dtypes.common.is_datetime_or_timedelta_dtype(
348-
X[column].dtype
362+
column_dtype
349363
):
350364
raise ValueError(
351365
"AutoPyTorch does not support time and/or date datatype as given "
@@ -362,7 +376,7 @@ def _get_columns_info(
362376
"Make sure your data is formatted in a correct way, "
363377
"before feeding it to AutoPyTorch.".format(
364378
column,
365-
X[column].dtype.name,
379+
column_dtype.name,
366380
)
367381
)
368382
else:
@@ -394,7 +408,7 @@ def list_to_dataframe(
394408
"""
395409

396410
# If a list was provided, it will be converted to pandas
397-
X_train = pd.DataFrame(data=X_train).infer_objects()
411+
X_train = pd.DataFrame(data=X_train).convert_dtypes()
398412
self.logger.warning("The provided feature types to AutoPyTorch are of type list."
399413
"Features have been interpreted as: {}".format([(col, t) for col, t in
400414
zip(X_train.columns, X_train.dtypes)]))
@@ -403,7 +417,8 @@ def list_to_dataframe(
403417
self.logger.warning("Train features are a list while the provided test data"
404418
"is {}. X_test will be casted as DataFrame.".format(type(X_test))
405419
)
406-
X_test = pd.DataFrame(data=X_test).infer_objects()
420+
X_test = pd.DataFrame(data=X_test).convert_dtypes()
421+
407422
return X_train, X_test
408423

409424
@staticmethod
@@ -446,17 +461,21 @@ def infer_objects(self, X: pd.DataFrame) -> pd.DataFrame:
446461
self.logger.warning(f"Tried to cast column {key} to {dtype} caused {e}")
447462
pass
448463
else:
464+
# Calling for the first time to infer the categories
449465
X = X.infer_objects()
450-
for column in X.columns:
451-
if not is_numeric_dtype(X[column]):
466+
# initial data types
467+
data_types = X.dtypes
468+
for index, column in enumerate(X.columns):
469+
if not is_numeric_dtype(data_types[index]):
452470
X[column] = X[column].astype('category')
453-
self.object_dtype_mapping = {column: X[column].dtype for column in X.columns}
471+
# only numerical attributes and categories
472+
data_types = X.dtypes
473+
self.object_dtype_mapping = {column: data_types[index] for index, column in enumerate(X.columns)}
454474
self.logger.debug(f"Infer Objects: {self.object_dtype_mapping}")
475+
455476
return X
456477

457-
def impute_nan_in_categories(self,
458-
X: pd.DataFrame
459-
) -> pd.DataFrame:
478+
def impute_nan_in_categories(self, X: pd.DataFrame) -> pd.DataFrame:
460479
"""
461480
impute missing values before encoding,
462481
remove once sklearn natively supports
@@ -481,14 +500,16 @@ def impute_nan_in_categories(self,
481500
if X[column].isna().any():
482501
if column not in self.dict_missing_value_per_col:
483502
try:
484-
float(X[column].dropna().values[0])
503+
first_value = X[column].dropna().values[0]
504+
float(first_value)
485505
can_cast_as_number = True
486506
except Exception:
487507
can_cast_as_number = False
488508
if can_cast_as_number:
489509
# In this case, we expect to have a number as category
490510
# it might be string, but its value represent a number
491-
missing_value: Union[str, int] = '-1' if isinstance(X[column].dropna().values[0], str) else -1
511+
512+
missing_value: Union[str, int] = '-1' if isinstance(first_value, str) else -1
492513
else:
493514
missing_value = 'Missing!'
494515

@@ -507,4 +528,30 @@ def impute_nan_in_categories(self,
507528
X[column].cat.add_categories([self.dict_missing_value_per_col[column]],
508529
inplace=True)
509530
X.fillna({column: self.dict_missing_value_per_col[column]}, inplace=True)
531+
510532
return X
533+
534+
def has_object_columns(
    feature_types: pd.Series,
) -> bool:
    """
    Indicate whether a Series of dtypes taken from a pandas DataFrame
    contains one or more object columns.

    Arguments:
    ----------
        feature_types: pd.Series
            The feature types (dtypes) of a DataFrame, e.g. ``X.dtypes``.
    Returns:
    --------
        bool
            True if the DataFrame dtypes contain an object column, False
            otherwise (including when there are no columns at all).
    """
    # any() stops at the first object dtype, so no intermediate list of
    # booleans has to be built and searched afterwards.
    return any(
        pd.api.types.is_object_dtype(feature_type)
        for feature_type in feature_types
    )

0 commit comments

Comments
 (0)