@@ -189,6 +189,29 @@ def _convert_all_nan_columns_to_numeric(self, X: pd.DataFrame, fit: bool = False
189189
190190 return X
191191
192+ def _encode_categories (self , X : pd .DataFrame ) -> None :
193+ preprocessors = get_tabular_preprocessors ()
194+ self .column_transformer = _create_column_transformer (
195+ preprocessors = preprocessors ,
196+ categorical_columns = self .enc_columns ,
197+ )
198+
199+ assert self .column_transformer is not None # Mypy redefinition
200+ self .column_transformer .fit (X )
201+
202+ # The column transformer moves categoricals to the left side
203+ self .feat_type = sorted (self .feat_type , key = functools .cmp_to_key (self ._comparator ))
204+
205+ encoded_categories = self .column_transformer .\
206+ named_transformers_ ['categorical_pipeline' ].\
207+ named_steps ['ordinalencoder' ].categories_
208+
209+ # An ordinal encoder for each categorical columns
210+ self .categories = [
211+ list (range (len (cat )))
212+ for cat in encoded_categories
213+ ]
214+
192215 def _fit (self , X : SupportedFeatTypes ) -> BaseEstimator :
193216 """
194217 In case input data is a pandas DataFrame, this utility encodes the user provided
@@ -216,44 +239,15 @@ def _fit(self, X: SupportedFeatTypes) -> BaseEstimator:
216239 self .enc_columns , self .feat_type = self ._get_columns_to_encode (X )
217240
218241 assert self .feat_type is not None
219-
220242 if len (self .enc_columns ) > 0 :
221-
222- preprocessors = get_tabular_preprocessors ()
223- self .column_transformer = _create_column_transformer (
224- preprocessors = preprocessors ,
225- categorical_columns = self .enc_columns ,
226- )
227-
228- # Mypy redefinition
229- assert self .column_transformer is not None
230- self .column_transformer .fit (X )
231-
232- # The column transformer reorders the feature types
233- # therefore, we need to change the order of columns as well
234- # This means categorical columns are shifted to the left
235- self .feat_type = sorted (
236- self .feat_type ,
237- key = functools .cmp_to_key (self ._comparator )
238- )
239-
240- encoded_categories = self .column_transformer .\
241- named_transformers_ ['categorical_pipeline' ].\
242- named_steps ['ordinalencoder' ].categories_
243- self .categories = [
244- # We fit an ordinal encoder, where all categorical
245- # columns are shifted to the left
246- list (range (len (cat )))
247- for cat in encoded_categories
248- ]
243+ self ._encode_categories (X )
249244
250245 for i , type_ in enumerate (self .feat_type ):
251246 if 'numerical' in type_ :
252247 self .numerical_columns .append (i )
253248 else :
254249 self .categorical_columns .append (i )
255250
256- # Lastly, store the number of features
257251 self .num_features = np .shape (X )[1 ]
258252 return self
259253
@@ -270,6 +264,41 @@ def transform(self, X: SupportedFeatTypes) -> Union[np.ndarray, spmatrix, pd.Dat
270264 Return:
271265 np.ndarray:
272266 The transformed array
267+
268+ Note:
269+ The default transform performs the following:
270+ * simple imputation for both
271+ * scaling for numerical
272+ * one-hot encoding for categorical
273+ For example, here is a simple case
274+ of which all the columns are categorical.
275+ data = [
276+ {'A': 1, 'B': np.nan, 'C': np.nan},
277+ {'A': np.nan, 'B': 3, 'C': np.nan},
278+ {'A': 2, 'B': np.nan, 'C': np.nan}
279+ ]
280+ and suppose all the columns are categorical,
281+ then
282+ * `A` in {np.nan, 1, 2}
283+ * `B` in {np.nan, 3}
284+ * `C` in {np.nan} <=== it will be dropped.
285+
286+ So in the column A,
287+ * np.nan ==> [1, 0, 0] (always the index 0)
288+ * 1 ==> [0, 1, 0]
289+ * 2 ==> [0, 0, 1]
290+ in the column B,
291+ * np.nan ==> [1, 0]
292+ * 3 ==> [0, 1]
293+ Therefore, by concatenating,
294+ * {'A': 1, 'B': np.nan, 'C': np.nan} ==> [0, 1, 0, 1, 0]
295+ * {'A': np.nan, 'B': 3, 'C': np.nan} ==> [1, 0, 0, 0, 1]
296+ * {'A': 2, 'B': np.nan, 'C': np.nan} ==> [0, 0, 1, 1, 0]
297+ ==> [
298+ [0, 1, 0, 1, 0],
299+ [1, 0, 0, 0, 1],
300+ [0, 0, 1, 1, 0]
301+ ]
273302 """
274303 if not self ._is_fitted :
275304 raise NotFittedError ("Cannot call transform on a validator that is not fitted" )
@@ -288,14 +317,6 @@ def transform(self, X: SupportedFeatTypes) -> Union[np.ndarray, spmatrix, pd.Dat
288317
289318 # Pandas related transformations
290319 if ispandas (X ) and self .column_transformer is not None :
291- if np .any (pd .isnull (X )):
292- # After above check it means that if there is a NaN
293- # the whole column must be NaN
294- # Make sure it is numerical and let the pipeline handle it
295- for column in X .columns :
296- if X [column ].isna ().all ():
297- X [column ] = pd .to_numeric (X [column ])
298-
299320 X = self .column_transformer .transform (X )
300321
301322 # Sparse related transformations
@@ -304,17 +325,15 @@ def transform(self, X: SupportedFeatTypes) -> Union[np.ndarray, spmatrix, pd.Dat
304325 X .sort_indices ()
305326
306327 try :
307- X = sklearn .utils .check_array (
308- X ,
309- force_all_finite = False ,
310- accept_sparse = 'csr'
311- )
328+ X = sklearn .utils .check_array (X , force_all_finite = False , accept_sparse = 'csr' )
312329 except Exception as e :
313- self .logger .exception (f"Conversion failed for input { X .dtypes } { X } "
314- "This means AutoPyTorch was not able to properly "
315- "Extract the dtypes of the provided input features. "
316- "Please try to manually cast it to a supported "
317- "numerical or categorical values." )
330+ self .logger .exception (
331+ f"Conversion failed for input { X .dtypes } { X } "
332+ "This means AutoPyTorch was not able to properly "
333+ "Extract the dtypes of the provided input features. "
334+ "Please try to manually cast it to a supported "
335+ "numerical or categorical values."
336+ )
318337 raise e
319338
320339 X = self ._compress_dataset (X )
@@ -328,7 +347,6 @@ def _compress_dataset(self, X: DatasetCompressionInputType) -> DatasetCompressio
328347 the testing data is converted to the same dtype as
329348 the training data.
330349
331-
332350 Args:
333351 X (DatasetCompressionInputType):
334352 Dataset
@@ -510,27 +528,31 @@ def infer_objects(self, X: pd.DataFrame) -> pd.DataFrame:
510528 pd.DataFrame
511529 """
512530 if hasattr (self , 'object_dtype_mapping' ):
513- # Mypy does not process the has attr. This dict is defined below
514- for key , dtype in self .object_dtype_mapping .items (): # type: ignore[has-type]
515- if 'int' in dtype .name :
516- # In the case train data was interpreted as int
517- # and test data was interpreted as float, because of 0.0
518- # for example, honor training data
519- X [key ] = X [key ].applymap (np .int64 )
520- else :
521- try :
522- X [key ] = X [key ].astype (dtype .name )
523- except Exception as e :
524- # Try inference if possible
525- self .logger .warning (f"Tried to cast column { key } to { dtype } caused { e } " )
526- pass
531+ # honor the training data types
532+ try :
533+ # Mypy does not process the has attr.
534+ X = X .astype (self .object_dtype_mapping ) # type: ignore[has-type]
535+ except Exception as e :
536+ # Try inference if possible
537+ self .logger .warning (f'Casting the columns to training dtypes ' # type: ignore[has-type]
538+ f'{ self .object_dtype_mapping } caused the exception { e } ' )
539+ pass
527540 else :
528- X = X .infer_objects ()
529- for column in X .columns :
530- if not is_numeric_dtype (X [column ]):
531- X [column ] = X [column ].astype ('category' )
532- self .object_dtype_mapping = {column : X [column ].dtype for column in X .columns }
541+ if len (self .dtypes ) != 0 :
542+ # when train data has no object dtype, but test does
543+ # we prioritise the datatype given in training data
544+ dtype_dict = {col : dtype for col , dtype in zip (X .columns , self .dtypes )}
545+ X = X .astype (dtype_dict )
546+ else :
547+ # Calling for the first time to infer the categories
548+ X = X .infer_objects ()
549+ dtype_dict = {col : 'category' for col , dtype in zip (X .columns , X .dtypes ) if not is_numeric_dtype (dtype )}
550+ X = X .astype (dtype_dict )
551+ # only numerical attributes and categories
552+ self .object_dtype_mapping = {column : data_type for column , data_type in zip (X .columns , X .dtypes )}
553+
533554 self .logger .debug (f"Infer Objects: { self .object_dtype_mapping } " )
555+
534556 return X
535557
536558
0 commit comments