@@ -188,6 +188,29 @@ def _convert_all_nan_columns_to_numeric(self, X: pd.DataFrame, fit: bool = False
188188
189189 return X
190190
191+ def _encode_categories (self , X : pd .DataFrame ) -> None :
192+ preprocessors = get_tabular_preprocessors ()
193+ self .column_transformer = _create_column_transformer (
194+ preprocessors = preprocessors ,
195+ categorical_columns = self .enc_columns ,
196+ )
197+
198+ assert self .column_transformer is not None # Mypy redefinition
199+ self .column_transformer .fit (X )
200+
201+ # The column transformer moves categoricals to the left side
202+ self .feat_type = sorted (self .feat_type , key = functools .cmp_to_key (self ._comparator ))
203+
204+ encoded_categories = self .column_transformer .\
205+ named_transformers_ ['categorical_pipeline' ].\
206+ named_steps ['ordinalencoder' ].categories_
207+
208+ # An ordinal encoder for each categorical column
209+ self .categories = [
210+ list (range (len (cat )))
211+ for cat in encoded_categories
212+ ]
213+
191214 def _fit (self , X : SupportedFeatTypes ) -> BaseEstimator :
192215 """
193216 In case input data is a pandas DataFrame, this utility encodes the user provided
@@ -215,44 +238,15 @@ def _fit(self, X: SupportedFeatTypes) -> BaseEstimator:
215238 self .enc_columns , self .feat_type = self ._get_columns_to_encode (X )
216239
217240 assert self .feat_type is not None
218-
219241 if len (self .enc_columns ) > 0 :
220-
221- preprocessors = get_tabular_preprocessors ()
222- self .column_transformer = _create_column_transformer (
223- preprocessors = preprocessors ,
224- categorical_columns = self .enc_columns ,
225- )
226-
227- # Mypy redefinition
228- assert self .column_transformer is not None
229- self .column_transformer .fit (X )
230-
231- # The column transformer reorders the feature types
232- # therefore, we need to change the order of columns as well
233- # This means categorical columns are shifted to the left
234- self .feat_type = sorted (
235- self .feat_type ,
236- key = functools .cmp_to_key (self ._comparator )
237- )
238-
239- encoded_categories = self .column_transformer .\
240- named_transformers_ ['categorical_pipeline' ].\
241- named_steps ['ordinalencoder' ].categories_
242- self .categories = [
243- # We fit an ordinal encoder, where all categorical
244- # columns are shifted to the left
245- list (range (len (cat )))
246- for cat in encoded_categories
247- ]
242+ self ._encode_categories (X )
248243
249244 for i , type_ in enumerate (self .feat_type ):
250245 if 'numerical' in type_ :
251246 self .numerical_columns .append (i )
252247 else :
253248 self .categorical_columns .append (i )
254249
255- # Lastly, store the number of features
256250 self .num_features = np .shape (X )[1 ]
257251 return self
258252
@@ -269,6 +263,41 @@ def transform(self, X: SupportedFeatTypes) -> Union[np.ndarray, spmatrix, pd.Dat
269263 Return:
270264 np.ndarray:
271265 The transformed array
266+
267+ Note:
268+ The default transform performs the following:
269+ * simple imputation for both
270+ * scaling for numerical
271+ * one-hot encoding for categorical
272+ For example, here is a simple case
273+ of which all the columns are categorical.
274+ data = [
275+ {'A': 1, 'B': np.nan, 'C': np.nan},
276+ {'A': np.nan, 'B': 3, 'C': np.nan},
277+ {'A': 2, 'B': np.nan, 'C': np.nan}
278+ ]
279+ and suppose all the columns are categorical,
280+ then
281+ * `A` in {np.nan, 1, 2}
282+ * `B` in {np.nan, 3}
283+ * `C` in {np.nan} <=== it will be dropped.
284+
285+ So in the column A,
286+ * np.nan ==> [1, 0, 0] (always the index 0)
287+ * 1 ==> [0, 1, 0]
288+ * 2 ==> [0, 0, 1]
289+ in the column B,
290+ * np.nan ==> [1, 0]
291+ * 3 ==> [0, 1]
292+ Therefore, by concatenating,
293+ * {'A': 1, 'B': np.nan, 'C': np.nan} ==> [0, 1, 0, 1, 0]
294+ * {'A': np.nan, 'B': 3, 'C': np.nan} ==> [1, 0, 0, 0, 1]
295+ * {'A': 2, 'B': np.nan, 'C': np.nan} ==> [0, 0, 1, 1, 0]
296+ ==> [
297+ [0, 1, 0, 1, 0],
298+ [1, 0, 0, 0, 1],
299+ [0, 0, 1, 1, 0]
300+ ]
272301 """
273302 if not self ._is_fitted :
274303 raise NotFittedError ("Cannot call transform on a validator that is not fitted" )
@@ -287,14 +316,6 @@ def transform(self, X: SupportedFeatTypes) -> Union[np.ndarray, spmatrix, pd.Dat
287316
288317 # Pandas related transformations
289318 if hasattr (X , "iloc" ) and self .column_transformer is not None :
290- if np .any (pd .isnull (X )):
291- # After above check it means that if there is a NaN
292- # the whole column must be NaN
293- # Make sure it is numerical and let the pipeline handle it
294- for column in X .columns :
295- if X [column ].isna ().all ():
296- X [column ] = pd .to_numeric (X [column ])
297-
298319 X = self .column_transformer .transform (X )
299320
300321 # Sparse related transformations
@@ -303,17 +324,15 @@ def transform(self, X: SupportedFeatTypes) -> Union[np.ndarray, spmatrix, pd.Dat
303324 X .sort_indices ()
304325
305326 try :
306- X = sklearn .utils .check_array (
307- X ,
308- force_all_finite = False ,
309- accept_sparse = 'csr'
310- )
327+ X = sklearn .utils .check_array (X , force_all_finite = False , accept_sparse = 'csr' )
311328 except Exception as e :
312- self .logger .exception (f"Conversion failed for input { X .dtypes } { X } "
313- "This means AutoPyTorch was not able to properly "
314- "Extract the dtypes of the provided input features. "
315- "Please try to manually cast it to a supported "
316- "numerical or categorical values." )
329+ self .logger .exception (
330+ f"Conversion failed for input { X .dtypes } { X } "
331+ "This means AutoPyTorch was not able to properly "
332+ "Extract the dtypes of the provided input features. "
333+ "Please try to manually cast it to a supported "
334+ "numerical or categorical values."
335+ )
317336 raise e
318337
319338 X = self ._compress_dataset (X )
@@ -327,7 +346,6 @@ def _compress_dataset(self, X: DatasetCompressionInputType) -> DatasetCompressio
327346 the testing data is converted to the same dtype as
328347 the training data.
329348
330-
331349 Args:
332350 X (DatasetCompressionInputType):
333351 Dataset
@@ -509,27 +527,31 @@ def infer_objects(self, X: pd.DataFrame) -> pd.DataFrame:
509527 pd.DataFrame
510528 """
511529 if hasattr (self , 'object_dtype_mapping' ):
512- # Mypy does not process the has attr. This dict is defined below
513- for key , dtype in self .object_dtype_mapping .items (): # type: ignore[has-type]
514- if 'int' in dtype .name :
515- # In the case train data was interpreted as int
516- # and test data was interpreted as float, because of 0.0
517- # for example, honor training data
518- X [key ] = X [key ].applymap (np .int64 )
519- else :
520- try :
521- X [key ] = X [key ].astype (dtype .name )
522- except Exception as e :
523- # Try inference if possible
524- self .logger .warning (f"Tried to cast column { key } to { dtype } caused { e } " )
525- pass
530+ # honor the training data types
531+ try :
532+ # Mypy does not process the has attr.
533+ X = X .astype (self .object_dtype_mapping ) # type: ignore[has-type]
534+ except Exception as e :
535+ # Try inference if possible
536+ self .logger .warning (f'Casting the columns to training dtypes ' # type: ignore[has-type]
537+ f'{ self .object_dtype_mapping } caused the exception { e } ' )
538+ pass
526539 else :
527- X = X .infer_objects ()
528- for column in X .columns :
529- if not is_numeric_dtype (X [column ]):
530- X [column ] = X [column ].astype ('category' )
531- self .object_dtype_mapping = {column : X [column ].dtype for column in X .columns }
540+ if len (self .dtypes ) != 0 :
541+ # when train data has no object dtype, but test does
542+ # we prioritise the datatype given in training data
543+ dtype_dict = {col : dtype for col , dtype in zip (X .columns , self .dtypes )}
544+ X = X .astype (dtype_dict )
545+ else :
546+ # Calling for the first time to infer the categories
547+ X = X .infer_objects ()
548+ dtype_dict = {col : 'category' for col , dtype in zip (X .columns , X .dtypes ) if not is_numeric_dtype (dtype )}
549+ X = X .astype (dtype_dict )
550+ # only numerical attributes and categories
551+ self .object_dtype_mapping = {column : data_type for column , data_type in zip (X .columns , X .dtypes )}
552+
532553 self .logger .debug (f"Infer Objects: { self .object_dtype_mapping } " )
554+
533555 return X
534556
535557
0 commit comments