1
1
import functools
2
2
from typing import Any , Dict , List , Optional , Tuple , Union , cast
3
3
4
+
4
5
import numpy as np
5
6
6
7
import pandas as pd
@@ -38,6 +39,7 @@ def _create_column_transformer(
38
39
Returns:
39
40
ColumnTransformer
40
41
"""
42
+
41
43
numerical_pipeline = 'drop'
42
44
categorical_pipeline = 'drop'
43
45
if len (numerical_columns ) > 0 :
@@ -63,18 +65,25 @@ def get_tabular_preprocessors() -> Dict[str, List[BaseEstimator]]:
63
65
preprocessors ['numerical' ] = list ()
64
66
preprocessors ['categorical' ] = list ()
65
67
66
- preprocessors ['categorical' ].append (OneHotEncoder (
67
- categories = 'auto' ,
68
- sparse = False ,
69
- handle_unknown = 'ignore' ))
70
- preprocessors ['numerical' ].append (SimpleImputer (strategy = 'median' ,
71
- copy = False ))
72
- preprocessors ['numerical' ].append (StandardScaler (with_mean = True , with_std = True , copy = False ))
68
+ preprocessors ['categorical' ].append (
69
+ OneHotEncoder (
70
+ categories = 'auto' ,
71
+ sparse = False ,
72
+ handle_unknown = 'ignore' ,
73
+ )
74
+ )
75
+ preprocessors ['numerical' ].append (
76
+ SimpleImputer (
77
+ strategy = 'median' ,
78
+ copy = False ,
79
+ )
80
+ )
73
81
74
82
return preprocessors
75
83
76
84
77
85
class TabularFeatureValidator (BaseFeatureValidator ):
86
+
78
87
def _fit (
79
88
self ,
80
89
X : SUPPORTED_FEAT_TYPES ,
@@ -96,24 +105,27 @@ def _fit(
96
105
# The final output of a validator is a numpy array. But pandas
97
106
# gives us information about the column dtype
98
107
if isinstance (X , np .ndarray ):
108
+
99
109
X = self .numpy_array_to_pandas (X )
110
+ # Replace the data type from the previously saved type.
111
+ self .data_type = type (X )
112
+ # save all the information about the column order and data types
113
+ self ._check_data (X )
100
114
101
115
if hasattr (X , "iloc" ) and not scipy .sparse .issparse (X ):
102
- X = cast (pd .DataFrame , X )
103
-
104
- if not X .select_dtypes (include = 'object' ).empty :
105
- X = self .infer_objects (X )
106
116
107
- self . _check_data ( X )
117
+ X = cast ( pd . DataFrame , X )
108
118
categorical_columns , numerical_columns , feat_type = self ._get_columns_info (X )
109
119
110
120
self .enc_columns = categorical_columns
111
121
if len (categorical_columns ) >= 0 :
112
122
X = self .impute_nan_in_categories (X )
113
123
preprocessors = get_tabular_preprocessors ()
114
- self .column_transformer = _create_column_transformer (preprocessors = preprocessors ,
115
- numerical_columns = numerical_columns ,
116
- categorical_columns = categorical_columns )
124
+ self .column_transformer = _create_column_transformer (
125
+ preprocessors = preprocessors ,
126
+ numerical_columns = numerical_columns ,
127
+ categorical_columns = categorical_columns ,
128
+ )
117
129
118
130
# Mypy redefinition
119
131
assert self .column_transformer is not None
@@ -142,21 +154,24 @@ def comparator(cmp1: str, cmp2: str) -> int:
142
154
143
155
if len (categorical_columns ) > 0 :
144
156
self .categories = [
145
- # We fit an ordinal encoder, where all categorical
157
+ # We fit a one-hot encoder, where all categorical
146
158
# columns are shifted to the left
147
159
list (range (len (cat )))
148
160
for cat in self .column_transformer .named_transformers_ [
149
161
'categorical_pipeline' ].named_steps ['onehotencoder' ].categories_
150
162
]
151
163
164
+ # Unlike categorical_columns and numerical_columns,
165
+ # this saves the index of the column.
152
166
for i , type_ in enumerate (self .feat_type ):
153
167
if 'numerical' in type_ :
154
168
self .numerical_columns .append (i )
155
169
else :
156
170
self .categorical_columns .append (i )
157
171
158
172
# Lastly, store the number of features
159
- self .num_features = np .shape (X )[1 ]
173
+ self .num_features = len (X .columns )
174
+
160
175
return self
161
176
162
177
def transform (
@@ -189,10 +204,6 @@ def transform(
189
204
if hasattr (X , "iloc" ) and not scipy .sparse .issparse (X ):
190
205
X = cast (pd .DataFrame , X )
191
206
192
- # Also remove the object dtype for new data
193
- if not X .select_dtypes (include = 'object' ).empty :
194
- X = self .infer_objects (X )
195
-
196
207
# Check the data here so we catch problems on new test data
197
208
self ._check_data (X )
198
209
# We also need to fillna on the transformation
@@ -268,13 +279,13 @@ def _check_data(
268
279
X = cast (pd .DataFrame , X )
269
280
270
281
# Handle objects if possible
271
- if not X .select_dtypes (include = 'object' ).empty :
282
+ object_columns_indicator = has_object_columns (X .dtypes )
283
+ if object_columns_indicator :
272
284
X = self .infer_objects (X )
273
285
274
286
# Define the column to be encoded here as the feature validator is fitted once
275
287
# per estimator
276
288
# enc_columns, _ = self._get_columns_to_encode(X)
277
-
278
289
column_order = [column for column in X .columns ]
279
290
if len (self .column_order ) > 0 :
280
291
if self .column_order != column_order :
@@ -310,8 +321,10 @@ def _get_columns_info(
310
321
A set of features that are going to be validated (type and dimensionality
311
322
checks) and a encoder fitted in the case the data needs encoding
312
323
Returns:
313
- enc_columns (List[str]):
314
- Columns to encode, if any
324
+ categorical_columns: (List[str])
325
+ Categorical columns.
326
+ numerical_columns: (List[str])
327
+ Numerical columns.
315
328
feat_type:
316
329
Type of each column numerical/categorical
317
330
"""
@@ -323,14 +336,15 @@ def _get_columns_info(
323
336
324
337
# Make sure each column is a valid type
325
338
for i , column in enumerate (X .columns ):
326
- if X [ column ]. dtype . name in [ 'category' , 'bool' ]:
327
-
339
+ column_dtype = self . dtypes [ i ]
340
+ if column_dtype . name in [ 'category' , 'bool' ]:
328
341
categorical_columns .append (column )
329
342
feat_type .append ('categorical' )
330
343
# Move away from np.issubdtype as it causes
331
344
# TypeError: data type not understood in certain pandas types
332
- elif not is_numeric_dtype (X [column ]):
333
- if X [column ].dtype .name == 'object' :
345
+ elif not is_numeric_dtype (column_dtype ):
346
+ # TODO verify how would this happen when we always convert the object dtypes to category
347
+ if column_dtype .name == 'object' :
334
348
raise ValueError (
335
349
"Input Column {} has invalid type object. "
336
350
"Cast it to a valid dtype before using it in AutoPyTorch. "
@@ -345,7 +359,7 @@ def _get_columns_info(
345
359
)
346
360
)
347
361
elif pd .core .dtypes .common .is_datetime_or_timedelta_dtype (
348
- X [ column ]. dtype
362
+ column_dtype
349
363
):
350
364
raise ValueError (
351
365
"AutoPyTorch does not support time and/or date datatype as given "
@@ -362,7 +376,7 @@ def _get_columns_info(
362
376
"Make sure your data is formatted in a correct way, "
363
377
"before feeding it to AutoPyTorch." .format (
364
378
column ,
365
- X [ column ]. dtype .name ,
379
+ column_dtype .name ,
366
380
)
367
381
)
368
382
else :
@@ -394,7 +408,7 @@ def list_to_dataframe(
394
408
"""
395
409
396
410
# If a list was provided, it will be converted to pandas
397
- X_train = pd .DataFrame (data = X_train ).infer_objects ()
411
+ X_train = pd .DataFrame (data = X_train ).convert_dtypes ()
398
412
self .logger .warning ("The provided feature types to AutoPyTorch are of type list."
399
413
"Features have been interpreted as: {}" .format ([(col , t ) for col , t in
400
414
zip (X_train .columns , X_train .dtypes )]))
@@ -403,7 +417,8 @@ def list_to_dataframe(
403
417
self .logger .warning ("Train features are a list while the provided test data"
404
418
"is {}. X_test will be casted as DataFrame." .format (type (X_test ))
405
419
)
406
- X_test = pd .DataFrame (data = X_test ).infer_objects ()
420
+ X_test = pd .DataFrame (data = X_test ).convert_dtypes ()
421
+
407
422
return X_train , X_test
408
423
409
424
@staticmethod
@@ -446,17 +461,21 @@ def infer_objects(self, X: pd.DataFrame) -> pd.DataFrame:
446
461
self .logger .warning (f"Tried to cast column { key } to { dtype } caused { e } " )
447
462
pass
448
463
else :
464
+ # Calling for the first time to infer the categories
449
465
X = X .infer_objects ()
450
- for column in X .columns :
451
- if not is_numeric_dtype (X [column ]):
466
+ # initial data types
467
+ data_types = X .dtypes
468
+ for index , column in enumerate (X .columns ):
469
+ if not is_numeric_dtype (data_types [index ]):
452
470
X [column ] = X [column ].astype ('category' )
453
- self .object_dtype_mapping = {column : X [column ].dtype for column in X .columns }
471
+ # only numerical attributes and categories
472
+ data_types = X .dtypes
473
+ self .object_dtype_mapping = {column : data_types [index ] for index , column in enumerate (X .columns )}
454
474
self .logger .debug (f"Infer Objects: { self .object_dtype_mapping } " )
475
+
455
476
return X
456
477
457
- def impute_nan_in_categories (self ,
458
- X : pd .DataFrame
459
- ) -> pd .DataFrame :
478
+ def impute_nan_in_categories (self , X : pd .DataFrame ) -> pd .DataFrame :
460
479
"""
461
480
impute missing values before encoding,
462
481
remove once sklearn natively supports
@@ -481,14 +500,16 @@ def impute_nan_in_categories(self,
481
500
if X [column ].isna ().any ():
482
501
if column not in self .dict_missing_value_per_col :
483
502
try :
484
- float (X [column ].dropna ().values [0 ])
503
+ first_value = X [column ].dropna ().values [0 ]
504
+ float (first_value )
485
505
can_cast_as_number = True
486
506
except Exception :
487
507
can_cast_as_number = False
488
508
if can_cast_as_number :
489
509
# In this case, we expect to have a number as category
490
510
# it might be string, but its value represent a number
491
- missing_value : Union [str , int ] = '-1' if isinstance (X [column ].dropna ().values [0 ], str ) else - 1
511
+
512
+ missing_value : Union [str , int ] = '-1' if isinstance (first_value , str ) else - 1
492
513
else :
493
514
missing_value = 'Missing!'
494
515
@@ -507,4 +528,30 @@ def impute_nan_in_categories(self,
507
528
X [column ].cat .add_categories ([self .dict_missing_value_per_col [column ]],
508
529
inplace = True )
509
530
X .fillna ({column : self .dict_missing_value_per_col [column ]}, inplace = True )
531
+
510
532
return X
533
+
534
def has_object_columns(
    feature_types: pd.Series,
) -> bool:
    """
    Indicate whether a Series of dtypes taken from a pandas DataFrame
    contains at least one object column.

    Arguments:
    ----------
    feature_types: pd.Series
        The feature types (dtypes) of a DataFrame, e.g. ``X.dtypes``.
    Returns:
    --------
    bool
        True if at least one of the dtypes is an object dtype, False
        otherwise (including when ``feature_types`` is empty).
    """
    # any() stops at the first object dtype found, instead of building a
    # complete list of flags and then searching that list for True.
    return any(
        pd.api.types.is_object_dtype(feature_type)
        for feature_type in feature_types
    )
0 commit comments