Skip to content

Commit ba516e9

Browse files
committed
[refactor] Separate some processes
1 parent 8e73972 commit ba516e9

File tree

1 file changed

+90
-68
lines changed

1 file changed

+90
-68
lines changed

autoPyTorch/data/tabular_feature_validator.py

Lines changed: 90 additions & 68 deletions
Original file line numberDiff line numberDiff line change
@@ -188,6 +188,29 @@ def _convert_all_nan_columns_to_numeric(self, X: pd.DataFrame, fit: bool = False
188188

189189
return X
190190

191+
def _encode_categories(self, X: pd.DataFrame) -> None:
192+
preprocessors = get_tabular_preprocessors()
193+
self.column_transformer = _create_column_transformer(
194+
preprocessors=preprocessors,
195+
categorical_columns=self.enc_columns,
196+
)
197+
198+
assert self.column_transformer is not None # Mypy redefinition
199+
self.column_transformer.fit(X)
200+
201+
# The column transformer moves categoricals to the left side
202+
self.feat_type = sorted(self.feat_type, key=functools.cmp_to_key(self._comparator))
203+
204+
encoded_categories = self.column_transformer.\
205+
named_transformers_['categorical_pipeline'].\
206+
named_steps['ordinalencoder'].categories_
207+
208+
# An ordinal encoder for each categorical column
209+
self.categories = [
210+
list(range(len(cat)))
211+
for cat in encoded_categories
212+
]
213+
191214
def _fit(self, X: SupportedFeatTypes) -> BaseEstimator:
192215
"""
193216
In case input data is a pandas DataFrame, this utility encodes the user provided
@@ -215,44 +238,15 @@ def _fit(self, X: SupportedFeatTypes) -> BaseEstimator:
215238
self.enc_columns, self.feat_type = self._get_columns_to_encode(X)
216239

217240
assert self.feat_type is not None
218-
219241
if len(self.enc_columns) > 0:
220-
221-
preprocessors = get_tabular_preprocessors()
222-
self.column_transformer = _create_column_transformer(
223-
preprocessors=preprocessors,
224-
categorical_columns=self.enc_columns,
225-
)
226-
227-
# Mypy redefinition
228-
assert self.column_transformer is not None
229-
self.column_transformer.fit(X)
230-
231-
# The column transformer reorders the feature types
232-
# therefore, we need to change the order of columns as well
233-
# This means categorical columns are shifted to the left
234-
self.feat_type = sorted(
235-
self.feat_type,
236-
key=functools.cmp_to_key(self._comparator)
237-
)
238-
239-
encoded_categories = self.column_transformer.\
240-
named_transformers_['categorical_pipeline'].\
241-
named_steps['ordinalencoder'].categories_
242-
self.categories = [
243-
# We fit an ordinal encoder, where all categorical
244-
# columns are shifted to the left
245-
list(range(len(cat)))
246-
for cat in encoded_categories
247-
]
242+
self._encode_categories(X)
248243

249244
for i, type_ in enumerate(self.feat_type):
250245
if 'numerical' in type_:
251246
self.numerical_columns.append(i)
252247
else:
253248
self.categorical_columns.append(i)
254249

255-
# Lastly, store the number of features
256250
self.num_features = np.shape(X)[1]
257251
return self
258252

@@ -269,6 +263,41 @@ def transform(self, X: SupportedFeatTypes) -> Union[np.ndarray, spmatrix, pd.Dat
269263
Return:
270264
np.ndarray:
271265
The transformed array
266+
267+
Note:
268+
The default transform performs the following:
269+
* simple imputation for both
270+
* scaling for numerical
271+
* one-hot encoding for categorical
272+
For example, here is a simple case
273+
of which all the columns are categorical.
274+
data = [
275+
{'A': 1, 'B': np.nan, 'C': np.nan},
276+
{'A': np.nan, 'B': 3, 'C': np.nan},
277+
{'A': 2, 'B': np.nan, 'C': np.nan}
278+
]
279+
and suppose all the columns are categorical,
280+
then
281+
* `A` in {np.nan, 1, 2}
282+
* `B` in {np.nan, 3}
283+
* `C` in {np.nan} <=== it will be dropped.
284+
285+
So in the column A,
286+
* np.nan ==> [1, 0, 0] (always the index 0)
287+
* 1 ==> [0, 1, 0]
288+
* 2 ==> [0, 0, 1]
289+
in the column B,
290+
* np.nan ==> [1, 0]
291+
* 3 ==> [0, 1]
292+
Therefore, by concatenating,
293+
* {'A': 1, 'B': np.nan, 'C': np.nan} ==> [0, 1, 0, 1, 0]
294+
* {'A': np.nan, 'B': 3, 'C': np.nan} ==> [1, 0, 0, 0, 1]
295+
* {'A': 2, 'B': np.nan, 'C': np.nan} ==> [0, 0, 1, 1, 0]
296+
==> [
297+
[0, 1, 0, 1, 0],
298+
[1, 0, 0, 0, 1],
299+
[0, 0, 1, 1, 0]
300+
]
272301
"""
273302
if not self._is_fitted:
274303
raise NotFittedError("Cannot call transform on a validator that is not fitted")
@@ -287,14 +316,6 @@ def transform(self, X: SupportedFeatTypes) -> Union[np.ndarray, spmatrix, pd.Dat
287316

288317
# Pandas related transformations
289318
if hasattr(X, "iloc") and self.column_transformer is not None:
290-
if np.any(pd.isnull(X)):
291-
# After above check it means that if there is a NaN
292-
# the whole column must be NaN
293-
# Make sure it is numerical and let the pipeline handle it
294-
for column in X.columns:
295-
if X[column].isna().all():
296-
X[column] = pd.to_numeric(X[column])
297-
298319
X = self.column_transformer.transform(X)
299320

300321
# Sparse related transformations
@@ -303,17 +324,15 @@ def transform(self, X: SupportedFeatTypes) -> Union[np.ndarray, spmatrix, pd.Dat
303324
X.sort_indices()
304325

305326
try:
306-
X = sklearn.utils.check_array(
307-
X,
308-
force_all_finite=False,
309-
accept_sparse='csr'
310-
)
327+
X = sklearn.utils.check_array(X, force_all_finite=False, accept_sparse='csr')
311328
except Exception as e:
312-
self.logger.exception(f"Conversion failed for input {X.dtypes} {X}"
313-
"This means AutoPyTorch was not able to properly "
314-
"Extract the dtypes of the provided input features. "
315-
"Please try to manually cast it to a supported "
316-
"numerical or categorical values.")
329+
self.logger.exception(
330+
f"Conversion failed for input {X.dtypes} {X}"
331+
"This means AutoPyTorch was not able to properly "
332+
"Extract the dtypes of the provided input features. "
333+
"Please try to manually cast it to a supported "
334+
"numerical or categorical values."
335+
)
317336
raise e
318337

319338
X = self._compress_dataset(X)
@@ -327,7 +346,6 @@ def _compress_dataset(self, X: DatasetCompressionInputType) -> DatasetCompressio
327346
the testing data is converted to the same dtype as
328347
the training data.
329348
330-
331349
Args:
332350
X (DatasetCompressionInputType):
333351
Dataset
@@ -509,27 +527,31 @@ def infer_objects(self, X: pd.DataFrame) -> pd.DataFrame:
509527
pd.DataFrame
510528
"""
511529
if hasattr(self, 'object_dtype_mapping'):
512-
# Mypy does not process the has attr. This dict is defined below
513-
for key, dtype in self.object_dtype_mapping.items(): # type: ignore[has-type]
514-
if 'int' in dtype.name:
515-
# In the case train data was interpreted as int
516-
# and test data was interpreted as float, because of 0.0
517-
# for example, honor training data
518-
X[key] = X[key].applymap(np.int64)
519-
else:
520-
try:
521-
X[key] = X[key].astype(dtype.name)
522-
except Exception as e:
523-
# Try inference if possible
524-
self.logger.warning(f"Tried to cast column {key} to {dtype} caused {e}")
525-
pass
530+
# honor the training data types
531+
try:
532+
# Mypy does not process the has attr.
533+
X = X.astype(self.object_dtype_mapping) # type: ignore[has-type]
534+
except Exception as e:
535+
# Try inference if possible
536+
self.logger.warning(f'Casting the columns to training dtypes ' # type: ignore[has-type]
537+
f'{self.object_dtype_mapping} caused the exception {e}')
538+
pass
526539
else:
527-
X = X.infer_objects()
528-
for column in X.columns:
529-
if not is_numeric_dtype(X[column]):
530-
X[column] = X[column].astype('category')
531-
self.object_dtype_mapping = {column: X[column].dtype for column in X.columns}
540+
if len(self.dtypes) != 0:
541+
# when train data has no object dtype, but test does
542+
# we prioritise the datatype given in training data
543+
dtype_dict = {col: dtype for col, dtype in zip(X.columns, self.dtypes)}
544+
X = X.astype(dtype_dict)
545+
else:
546+
# Calling for the first time to infer the categories
547+
X = X.infer_objects()
548+
dtype_dict = {col: 'category' for col, dtype in zip(X.columns, X.dtypes) if not is_numeric_dtype(dtype)}
549+
X = X.astype(dtype_dict)
550+
# only numerical attributes and categories
551+
self.object_dtype_mapping = {column: data_type for column, data_type in zip(X.columns, X.dtypes)}
552+
532553
self.logger.debug(f"Infer Objects: {self.object_dtype_mapping}")
554+
533555
return X
534556

535557

0 commit comments

Comments
 (0)