[refactor] Fix SparseMatrixType --> spmatrix and add ispandas (#397)

nabenabe0928 · web-flow · commit 1b8e76abf4c1 · 2022-03-02T22:31:06.000+01:00
diff --git a/autoPyTorch/data/base_feature_validator.py b/autoPyTorch/data/base_feature_validator.py
@@ -5,13 +5,14 @@
 
 import pandas as pd
 
+from scipy.sparse import spmatrix
+
 from sklearn.base import BaseEstimator
 
-from autoPyTorch.utils.common import SparseMatrixType
 from autoPyTorch.utils.logging_ import PicklableClientLogger
 
 
-SupportedFeatTypes = Union[List, pd.DataFrame, np.ndarray, SparseMatrixType]
+SupportedFeatTypes = Union[List, pd.DataFrame, np.ndarray, spmatrix]
 
 
 class BaseFeatureValidator(BaseEstimator):
diff --git a/autoPyTorch/data/base_target_validator.py b/autoPyTorch/data/base_target_validator.py
@@ -5,13 +5,14 @@
 
 import pandas as pd
 
+from scipy.sparse import spmatrix
+
 from sklearn.base import BaseEstimator
 
-from autoPyTorch.utils.common import SparseMatrixType
 from autoPyTorch.utils.logging_ import PicklableClientLogger
 
 
-SupportedTargetTypes = Union[List, pd.Series, pd.DataFrame, np.ndarray, SparseMatrixType]
+SupportedTargetTypes = Union[List, pd.Series, pd.DataFrame, np.ndarray, spmatrix]
 
 
 class BaseTargetValidator(BaseEstimator):
diff --git a/autoPyTorch/data/tabular_feature_validator.py b/autoPyTorch/data/tabular_feature_validator.py
@@ -23,6 +23,7 @@
     DatasetDTypeContainerType,
     reduce_dataset_size_if_too_large
 )
+from autoPyTorch.utils.common import ispandas
 from autoPyTorch.utils.logging_ import PicklableClientLogger
 
 
@@ -155,7 +156,7 @@ def _fit(
         if isinstance(X, np.ndarray):
             X = self.numpy_array_to_pandas(X)
 
-        if hasattr(X, "iloc") and not issparse(X):
+        if ispandas(X) and not issparse(X):
             X = cast(pd.DataFrame, X)
             # Treat a column with all instances a NaN as numerical
             # This will prevent doing encoding to a categorical column made completely
@@ -245,7 +246,7 @@ def transform(
         if isinstance(X, np.ndarray):
             X = self.numpy_array_to_pandas(X)
 
-        if hasattr(X, "iloc") and not issparse(X):
+        if ispandas(X) and not issparse(X):
             if np.any(pd.isnull(X)):
                 for column in X.columns:
                     if X[column].isna().all():
@@ -259,7 +260,7 @@ def transform(
         self._check_data(X)
 
         # Pandas related transformations
-        if hasattr(X, "iloc") and self.column_transformer is not None:
+        if ispandas(X) and self.column_transformer is not None:
             if np.any(pd.isnull(X)):
                 # After above check it means that if there is a NaN
                 # the whole column must be NaN
@@ -309,7 +310,7 @@ def _compress_dataset(self, X: DatasetCompressionInputType) -> DatasetCompressio
             DatasetCompressionInputType:
                 Compressed dataset.
         """
-        is_dataframe = hasattr(X, 'iloc')
+        is_dataframe = ispandas(X)
         is_reducible_type = isinstance(X, np.ndarray) or issparse(X) or is_dataframe
         if not is_reducible_type or self._dataset_compression is None:
             return X
@@ -363,7 +364,7 @@ def _check_data(
                 )
 
         # Then for Pandas, we do not support Nan in categorical columns
-        if hasattr(X, "iloc"):
+        if ispandas(X):
             # If entered here, we have a pandas dataframe
             X = cast(pd.DataFrame, X)
 
diff --git a/autoPyTorch/data/tabular_target_validator.py b/autoPyTorch/data/tabular_target_validator.py
@@ -5,7 +5,7 @@
 import pandas as pd
 from pandas.api.types import is_numeric_dtype
 
-import scipy.sparse
+from scipy.sparse import issparse, spmatrix
 
 import sklearn.utils
 from sklearn import preprocessing
@@ -14,10 +14,10 @@
 from sklearn.utils.multiclass import type_of_target
 
 from autoPyTorch.data.base_target_validator import BaseTargetValidator, SupportedTargetTypes
-from autoPyTorch.utils.common import SparseMatrixType
+from autoPyTorch.utils.common import ispandas
 
 
-ArrayType = Union[np.ndarray, SparseMatrixType]
+ArrayType = Union[np.ndarray, spmatrix]
 
 
 def _check_and_to_array(y: SupportedTargetTypes) -> ArrayType:
@@ -71,7 +71,7 @@ def _fit(
             return self
 
         if y_test is not None:
-            if hasattr(y_train, "iloc"):
+            if ispandas(y_train):
                 y_train = pd.concat([y_train, y_test], ignore_index=True, sort=False)
             elif isinstance(y_train, list):
                 y_train = y_train + y_test
@@ -100,7 +100,7 @@ def _fit(
         if ndim > 1:
             self.encoder.fit(y_train)
         else:
-            if hasattr(y_train, 'iloc'):
+            if ispandas(y_train):
                 y_train = cast(pd.DataFrame, y_train)
                 self.encoder.fit(y_train.to_numpy().reshape(-1, 1))
             else:
@@ -131,7 +131,7 @@ def _transform_by_encoder(self, y: SupportedTargetTypes) -> np.ndarray:
         shape = np.shape(y)
         if len(shape) > 1:
             y = self.encoder.transform(y)
-        elif hasattr(y, 'iloc'):
+        elif ispandas(y):
             # The Ordinal encoder expects a 2 dimensional input.
             # The targets are 1 dimensional, so reshape to match the expected shape
             y = cast(pd.DataFrame, y)
@@ -192,7 +192,7 @@ def inverse_transform(self, y: SupportedTargetTypes) -> np.ndarray:
             y = self.encoder.inverse_transform(y)
         else:
             # The targets should be a flattened array, hence reshape with -1
-            if hasattr(y, 'iloc'):
+            if ispandas(y):
                 y = cast(pd.DataFrame, y)
                 y = self.encoder.inverse_transform(y.to_numpy().reshape(-1, 1)).reshape(-1)
             else:
@@ -216,7 +216,7 @@ def _check_data(self, y: SupportedTargetTypes) -> None:
 
         if not isinstance(y, (np.ndarray, pd.DataFrame,
                               List, pd.Series)) \
-                and not scipy.sparse.issparse(y):  # type: ignore[misc]
+                and not issparse(y):  # type: ignore[misc]
             raise ValueError("AutoPyTorch only supports Numpy arrays, Pandas DataFrames,"
                              " pd.Series, sparse data and Python Lists as targets, yet, "
                              "the provided input is of type {}".format(
@@ -225,8 +225,8 @@ def _check_data(self, y: SupportedTargetTypes) -> None:
 
         # Sparse data muss be numerical
         # Type ignore on attribute because sparse targets have a dtype
-        if scipy.sparse.issparse(y) and not np.issubdtype(y.dtype.type,  # type: ignore[union-attr]
-                                                          np.number):
+        if issparse(y) and not np.issubdtype(y.dtype.type,  # type: ignore[union-attr]
+                                             np.number):
             raise ValueError("When providing a sparse matrix as targets, the only supported "
                              "values are numerical. Please consider using a dense"
                              " instead."
@@ -245,10 +245,10 @@ def _check_data(self, y: SupportedTargetTypes) -> None:
 
         # No Nan is supported
         has_nan_values = False
-        if hasattr(y, 'iloc'):
+        if ispandas(y):
             has_nan_values = cast(pd.DataFrame, y).isnull().values.any()
-        if scipy.sparse.issparse(y):
-            y = cast(scipy.sparse.spmatrix, y)
+        if issparse(y):
+            y = cast(spmatrix, y)
             has_nan_values = not np.array_equal(y.data, y.data)
         else:
             # List and array like values are considered here
diff --git a/autoPyTorch/data/utils.py b/autoPyTorch/data/utils.py
@@ -21,6 +21,8 @@
 
 from scipy.sparse import issparse, spmatrix
 
+from autoPyTorch.utils.common import ispandas
+
 
 # TODO: TypedDict with python 3.8
 #
@@ -246,7 +248,7 @@ def reduce_precision(
         reduced_dtypes = reduction_mapping[X.dtype]
         X = X.astype(reduced_dtypes)
 
-    elif hasattr(X, 'iloc'):
+    elif ispandas(X):
         dtypes = dict(X.dtypes)
 
         col_names = X.dtypes.index
@@ -270,7 +272,7 @@ def megabytes(arr: DatasetCompressionInputType) -> float:
         memory_in_bytes = arr.nbytes
     elif issparse(arr):
         memory_in_bytes = arr.data.nbytes
-    elif hasattr(arr, 'iloc'):
+    elif ispandas(arr):
         memory_in_bytes = arr.memory_usage(index=True, deep=True).sum()
     else:
         raise ValueError(f"Unrecognised data type of X, expected data type to "
diff --git a/autoPyTorch/datasets/base_dataset.py b/autoPyTorch/datasets/base_dataset.py
@@ -27,7 +27,7 @@
     NoResamplingStrategyTypes,
     ResamplingStrategies
 )
-from autoPyTorch.utils.common import FitRequirement
+from autoPyTorch.utils.common import FitRequirement, ispandas
 
 BaseDatasetInputType = Union[Tuple[np.ndarray, np.ndarray], Dataset]
 BaseDatasetPropertiesType = Union[int, float, str, List, bool]
@@ -220,7 +220,7 @@ def __getitem__(self, index: int, train: bool = True) -> Tuple[np.ndarray, ...]:
             A transformed single point prediction
         """
 
-        X = self.train_tensors[0].iloc[[index]] if hasattr(self.train_tensors[0], 'loc') \
+        X = self.train_tensors[0].iloc[[index]] if ispandas(self.train_tensors[0]) \
             else self.train_tensors[0][index]
 
         if self.train_transform is not None and train:
diff --git a/autoPyTorch/pipeline/components/preprocessing/base_preprocessing.py b/autoPyTorch/pipeline/components/preprocessing/base_preprocessing.py
@@ -6,7 +6,7 @@
 
 import pandas as pd
 
-from scipy.sparse import csr_matrix
+from scipy.sparse import spmatrix
 
 import torch
 
@@ -24,7 +24,7 @@ def __init__(self) -> None:
         super().__init__()
         self.add_fit_requirements([
             FitRequirement('X_train',
-                           (np.ndarray, pd.DataFrame, csr_matrix),
+                           (np.ndarray, pd.DataFrame, spmatrix),
                            user_defined=True, dataset_property=False),
             FitRequirement('backend',
                            (Backend, ),
diff --git a/autoPyTorch/pipeline/components/setup/early_preprocessor/EarlyPreprocessing.py b/autoPyTorch/pipeline/components/setup/early_preprocessor/EarlyPreprocessing.py
@@ -6,7 +6,7 @@
 
 import pandas as pd
 
-from scipy.sparse import csr_matrix
+from scipy.sparse import spmatrix
 
 from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType
 from autoPyTorch.pipeline.components.setup.base_setup import autoPyTorchSetupComponent
@@ -21,7 +21,7 @@ def __init__(self, random_state: Optional[np.random.RandomState] = None) -> None
         self.random_state = random_state
         self.add_fit_requirements([
             FitRequirement('is_small_preprocess', (bool,), user_defined=True, dataset_property=True),
-            FitRequirement('X_train', (np.ndarray, pd.DataFrame, csr_matrix), user_defined=True,
+            FitRequirement('X_train', (np.ndarray, pd.DataFrame, spmatrix), user_defined=True,
                            dataset_property=False)])
 
     def fit(self, X: Dict[str, Any], y: Any = None) -> "EarlyPreprocessing":
diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/base_network_backbone.py b/autoPyTorch/pipeline/components/setup/network_backbone/base_network_backbone.py
@@ -5,7 +5,7 @@
 
 import pandas as pd
 
-from scipy.sparse import csr_matrix
+from scipy.sparse import spmatrix
 
 import torch
 from torch import nn
@@ -29,7 +29,7 @@ def __init__(self,
         super().__init__()
         self.add_fit_requirements([
             FitRequirement('is_small_preprocess', (bool,), user_defined=True, dataset_property=True),
-            FitRequirement('X_train', (np.ndarray, pd.DataFrame, csr_matrix), user_defined=True,
+            FitRequirement('X_train', (np.ndarray, pd.DataFrame, spmatrix), user_defined=True,
                            dataset_property=False),
             FitRequirement('input_shape', (Iterable,), user_defined=True, dataset_property=True),
             FitRequirement('tabular_transformer', (BaseEstimator,), user_defined=False, dataset_property=False),
diff --git a/autoPyTorch/utils/common.py b/autoPyTorch/utils/common.py
@@ -14,21 +14,17 @@
 
 import pandas as pd
 
-import scipy.sparse
+from scipy.sparse import spmatrix
 
 import torch
 from torch.utils.data.dataloader import default_collate
 
 HyperparameterValueType = Union[int, str, float]
-SparseMatrixType = Union[
-    scipy.sparse.bsr_matrix,
-    scipy.sparse.coo_matrix,
-    scipy.sparse.csc_matrix,
-    scipy.sparse.csr_matrix,
-    scipy.sparse.dia_matrix,
-    scipy.sparse.dok_matrix,
-    scipy.sparse.lil_matrix,
-]
+
+
+def ispandas(X: Any) -> bool:
+    """ Whether X behaves like a pandas.DataFrame or pandas.Series (duck-typed: checks for `.iloc`) """
+    return hasattr(X, "iloc")
 
 
 class FitRequirement(NamedTuple):
@@ -177,10 +173,10 @@ def get_device_from_fit_dictionary(X: Dict[str, Any]) -> torch.device:
     return torch.device(X.get("device", "cpu"))
 
 
-def subsampler(data: Union[np.ndarray, pd.DataFrame, scipy.sparse.csr_matrix],
+def subsampler(data: Union[np.ndarray, pd.DataFrame, spmatrix],
                x: Union[np.ndarray, List[int]]
-               ) -> Union[np.ndarray, pd.DataFrame, scipy.sparse.csr_matrix]:
-    return data[x] if isinstance(data, (np.ndarray, scipy.sparse.csr_matrix)) else data.iloc[x]
+               ) -> Union[np.ndarray, pd.DataFrame, spmatrix]:
+    return data[x] if isinstance(data, (np.ndarray, spmatrix)) else data.iloc[x]
 
 
 def get_hyperparameter(hyperparameter: HyperparameterSearchSpace,