diff --git a/autoPyTorch/api/base_task.py b/autoPyTorch/api/base_task.py index 2e14befe1..12bd96fd6 100644 --- a/autoPyTorch/api/base_task.py +++ b/autoPyTorch/api/base_task.py @@ -9,6 +9,7 @@ import time import typing import unittest.mock +import uuid import warnings from abc import abstractmethod from typing import Any, Callable, Dict, List, Optional, Union, cast @@ -122,21 +123,24 @@ class BaseTask: """ def __init__( - self, - seed: int = 1, - n_jobs: int = 1, - logging_config: Optional[Dict] = None, - ensemble_size: int = 50, - ensemble_nbest: int = 50, - max_models_on_disc: int = 50, - temporary_directory: Optional[str] = None, - output_directory: Optional[str] = None, - delete_tmp_folder_after_terminate: bool = True, - delete_output_folder_after_terminate: bool = True, - include_components: Optional[Dict] = None, - exclude_components: Optional[Dict] = None, - backend: Optional[Backend] = None, - search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None + self, + seed: int = 1, + n_jobs: int = 1, + logging_config: Optional[Dict] = None, + ensemble_size: int = 50, + ensemble_nbest: int = 50, + max_models_on_disc: int = 50, + temporary_directory: Optional[str] = None, + output_directory: Optional[str] = None, + delete_tmp_folder_after_terminate: bool = True, + delete_output_folder_after_terminate: bool = True, + include_components: Optional[Dict] = None, + exclude_components: Optional[Dict] = None, + backend: Optional[Backend] = None, + resampling_strategy: Union[CrossValTypes, HoldoutValTypes] = HoldoutValTypes.holdout_validation, + resampling_strategy_args: Optional[Dict[str, Any]] = None, + search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None, + task_type: Optional[str] = None ) -> None: self.seed = seed self.n_jobs = n_jobs @@ -157,6 +161,7 @@ def __init__( delete_tmp_folder_after_terminate=delete_tmp_folder_after_terminate, delete_output_folder_after_terminate=delete_output_folder_after_terminate, ) + self.task_type = task_type self._stopwatch = StopWatch() self.pipeline_options = replace_string_bool_to_bool(json.load(open( @@ -164,7 +169,6 @@ def __init__( self.search_space: Optional[ConfigurationSpace] = None self._dataset_requirements: Optional[List[FitRequirement]] = None - self.task_type: Optional[str] = None self._metric: Optional[autoPyTorchMetric] = None self._logger: Optional[PicklableClientLogger] = None self.run_history: Optional[RunHistory] = None @@ -176,7 +180,8 @@ def __init__( self._logger_port = logging.handlers.DEFAULT_TCP_LOGGING_PORT # Store the resampling strategy from the dataset, to load models as needed - self.resampling_strategy = None # type: Optional[Union[CrossValTypes, HoldoutValTypes]] + self.resampling_strategy = resampling_strategy + self.resampling_strategy_args = resampling_strategy_args self.stop_logging_server = None # type: Optional[multiprocessing.synchronize.Event] @@ -287,7 +292,7 @@ def _get_logger(self, name: str) -> PicklableClientLogger: output_dir=self._backend.temporary_directory, ) - # As Auto-sklearn works with distributed process, + # As AutoPyTorch works with distributed process, # we implement a logger server that can receive tcp # pickled messages. 
They are unpickled and processed locally # under the above logging configuration setting @@ -398,20 +403,16 @@ def _close_dask_client(self) -> None: self._is_dask_client_internally_created = False del self._is_dask_client_internally_created - def _load_models(self, resampling_strategy: Optional[Union[CrossValTypes, HoldoutValTypes]] - ) -> bool: + def _load_models(self) -> bool: """ Loads the models saved in the temporary directory during the SMAC run, as well as the final ensemble that was created - Args: - resampling_strategy (Union[CrossValTypes, HoldoutValTypes]): resampling strategy used to split the data - and to validate the performance of a candidate pipeline Returns: bool """ - if resampling_strategy is None: + if self.resampling_strategy is None: raise ValueError("Resampling strategy is needed to determine what models to load") self.ensemble_ = self._backend.load_ensemble(self.seed) @@ -422,10 +423,10 @@ def _load_models(self, resampling_strategy: Optional[Union[CrossValTypes, Holdou if self.ensemble_: identifiers = self.ensemble_.get_selected_model_identifiers() self.models_ = self._backend.load_models_by_identifiers(identifiers) - if isinstance(resampling_strategy, CrossValTypes): + if isinstance(self.resampling_strategy, CrossValTypes): self.cv_models_ = self._backend.load_cv_models_by_identifiers(identifiers) - if isinstance(resampling_strategy, CrossValTypes): + if isinstance(self.resampling_strategy, CrossValTypes): if len(self.cv_models_) == 0: raise ValueError('No models fitted!') @@ -610,10 +611,10 @@ def _do_traditional_prediction(self, num_run: int, time_for_traditional: int) -> ) return num_run - def search( + def _search( self, - dataset: BaseDataset, optimize_metric: str, + dataset: BaseDataset, budget_type: Optional[str] = None, budget: Optional[float] = None, total_walltime_limit: int = 100, @@ -638,6 +639,7 @@ def search( The argument that will provide the dataset splits. It is a subclass of the base dataset object which can generate the splits based on different restrictions. + Providing X_train, y_train and dataset together is not supported. optimize_metric (str): name of the metric that is used to evaluate a pipeline.
budget_type (Optional[str]): @@ -692,6 +694,7 @@ def search( self """ + if self.task_type != dataset.task_type: raise ValueError("Incompatible dataset entered for current task," "expected dataset to have task type :{} got " @@ -705,8 +708,8 @@ def search( dataset_properties = dataset.get_dataset_properties(dataset_requirements) self._stopwatch.start_task(experiment_task_name) self.dataset_name = dataset.dataset_name - self.resampling_strategy = dataset.resampling_strategy - self._logger = self._get_logger(self.dataset_name) + if self._logger is None: + self._logger = self._get_logger(self.dataset_name) self._all_supported_metrics = all_supported_metrics self._disable_file_output = disable_file_output self._memory_limit = memory_limit @@ -869,7 +872,7 @@ def search( if load_models: self._logger.info("Loading models...") - self._load_models(dataset.resampling_strategy) + self._load_models() self._logger.info("Finished loading models...") # Clean up the logger @@ -906,8 +909,11 @@ def refit( Returns: self """ + if self.dataset_name is None: + self.dataset_name = str(uuid.uuid1(clock_seq=os.getpid())) - self._logger = self._get_logger(dataset.dataset_name) + if self._logger is None: + self._logger = self._get_logger(self.dataset_name) dataset_requirements = get_dataset_requirements( info=self._get_required_dataset_properties(dataset)) @@ -927,7 +933,7 @@ def refit( }) X.update({**self.pipeline_options, **budget_config}) if self.models_ is None or len(self.models_) == 0 or self.ensemble_ is None: - self._load_models(dataset.resampling_strategy) + self._load_models() # Refit is not applicable when ensemble_size is set to zero. if self.ensemble_ is None: @@ -973,7 +979,11 @@ def fit(self, Returns: (BasePipeline): fitted pipeline """ - self._logger = self._get_logger(dataset.dataset_name) + if self.dataset_name is None: + self.dataset_name = str(uuid.uuid1(clock_seq=os.getpid())) + + if self._logger is None: + self._logger = self._get_logger(self.dataset_name) # get dataset properties dataset_requirements = get_dataset_requirements( @@ -1025,7 +1035,7 @@ def predict( if self._logger is None: self._logger = self._get_logger("Predict-Logger") - if self.ensemble_ is None and not self._load_models(self.resampling_strategy): + if self.ensemble_ is None and not self._load_models(): raise ValueError("No ensemble found. Either fit has not yet " "been called or no ensemble was fitted") @@ -1084,9 +1094,6 @@ def score( Returns: Dict[str, float]: Value of the evaluation metric calculated on the test set. """ - if isinstance(y_test, pd.Series): - y_test = y_test.to_numpy(dtype=np.float) - if self._metric is None: raise ValueError("No metric found. 
Either fit/search has not been called yet " "or AutoPyTorch failed to infer a metric from the dataset ") diff --git a/autoPyTorch/api/tabular_classification.py b/autoPyTorch/api/tabular_classification.py index 165ca98e7..3bd481995 100644 --- a/autoPyTorch/api/tabular_classification.py +++ b/autoPyTorch/api/tabular_classification.py @@ -1,11 +1,22 @@ -from typing import Any, Dict, Optional +import os +import uuid +from typing import Any, Callable, Dict, List, Optional, Union + +import numpy as np + +import pandas as pd from autoPyTorch.api.base_task import BaseTask from autoPyTorch.constants import ( TABULAR_CLASSIFICATION, TASK_TYPES_TO_STRING, ) +from autoPyTorch.data.tabular_validator import TabularInputValidator from autoPyTorch.datasets.base_dataset import BaseDataset +from autoPyTorch.datasets.resampling_strategy import ( + CrossValTypes, + HoldoutValTypes, +) from autoPyTorch.datasets.tabular_dataset import TabularDataset from autoPyTorch.pipeline.tabular_classification import TabularClassificationPipeline from autoPyTorch.utils.backend import Backend @@ -52,6 +63,8 @@ def __init__( delete_output_folder_after_terminate: bool = True, include_components: Optional[Dict] = None, exclude_components: Optional[Dict] = None, + resampling_strategy: Union[CrossValTypes, HoldoutValTypes] = HoldoutValTypes.holdout_validation, + resampling_strategy_args: Optional[Dict[str, Any]] = None, backend: Optional[Backend] = None, search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None ): @@ -69,9 +82,11 @@ def __init__( include_components=include_components, exclude_components=exclude_components, backend=backend, - search_space_updates=search_space_updates + resampling_strategy=resampling_strategy, + resampling_strategy_args=resampling_strategy_args, + search_space_updates=search_space_updates, + task_type=TASK_TYPES_TO_STRING[TABULAR_CLASSIFICATION], ) - self.task_type = TASK_TYPES_TO_STRING[TABULAR_CLASSIFICATION] def _get_required_dataset_properties(self, dataset: BaseDataset) -> Dict[str, Any]: if not isinstance(dataset, TabularDataset): @@ -86,3 +101,163 @@ def build_pipeline(self, dataset_properties: Dict[str, Any]) -> TabularClassificationPipeline: return TabularClassificationPipeline(dataset_properties=dataset_properties)
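# A hedged usage sketch of the API wired up in this patch: the resampling
# strategy is now fixed when the estimator is constructed, and search() below
# accepts raw X/y data directly. CrossValTypes.k_fold_cross_validation and the
# 'num_splits' key are assumptions taken from this code base; the toy data is
# sklearn's breast cancer set, used purely for illustration.
import sklearn.datasets
import sklearn.model_selection

from autoPyTorch.api.tabular_classification import TabularClassificationTask
from autoPyTorch.datasets.resampling_strategy import CrossValTypes

X, y = sklearn.datasets.load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, random_state=1)

api = TabularClassificationTask(
    resampling_strategy=CrossValTypes.k_fold_cross_validation,
    resampling_strategy_args={'num_splits': 3},
)
api.search(X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test,
           optimize_metric='accuracy', total_walltime_limit=300)
y_pred = api.predict(X_test)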
+ + def search( + self, + optimize_metric: str, + X_train: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, + y_train: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, + X_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, + y_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, + dataset_name: Optional[str] = None, + budget_type: Optional[str] = None, + budget: Optional[float] = None, + total_walltime_limit: int = 100, + func_eval_time_limit: int = 60, + traditional_per_total_budget: float = 0.1, + memory_limit: Optional[int] = 4096, + smac_scenario_args: Optional[Dict[str, Any]] = None, + get_smac_object_callback: Optional[Callable] = None, + all_supported_metrics: bool = True, + precision: int = 32, + disable_file_output: List = [], + load_models: bool = True, + ) -> 'BaseTask': + """ + Search for the best pipeline configuration for the given dataset, using the optimizer. + + Search both optimizes the machine learning models and builds an ensemble out of them. + To disable ensembling, set ensemble_size to 0. + + Args: + X_train, y_train, X_test, y_test: Union[np.ndarray, List, pd.DataFrame] + A pair of features (X_train) and targets (y_train) used to fit a + pipeline. Additionally, a holdout of these pairs (X_test, y_test) can + be provided to track the generalization performance of each stage. + optimize_metric (str): name of the metric that is used to + evaluate a pipeline. + budget_type (Optional[str]): + Type of budget to be used when fitting the pipeline. + Either 'epochs' or 'runtime'. If not provided, uses + the default in the pipeline config ('epochs') + budget (Optional[float]): + Budget to fit a single run of the pipeline. If not + provided, uses the default in the pipeline config + total_walltime_limit (int), (default=100): Time limit + in seconds for the search of appropriate models. + By increasing this value, autopytorch has a higher + chance of finding better models. + func_eval_time_limit (int), (default=60): Time limit + for a single call to the machine learning model. + Model fitting will be terminated if the machine + learning algorithm runs over the time limit. Set + this value high enough so that typical machine + learning algorithms can be fit on the training + data. + traditional_per_total_budget (float), (default=0.1): + Percent of total walltime to be allocated for + running traditional classifiers. + memory_limit (Optional[int]), (default=4096): Memory + limit in MB for the machine learning algorithm. autopytorch + will stop fitting the machine learning algorithm if it tries + to allocate more than memory_limit MB. If None is provided, + no memory limit is set. In case of multi-processing, memory_limit + will be per job. This memory limit also applies to the ensemble + creation process. + smac_scenario_args (Optional[Dict]): Additional arguments inserted + into the scenario of SMAC. See the + [SMAC documentation](https://automl.github.io/SMAC3/master/options.html?highlight=scenario#scenario) + get_smac_object_callback (Optional[Callable]): Callback function + to create an object of class + [smac.optimizer.smbo.SMBO](https://automl.github.io/SMAC3/master/apidoc/smac.optimizer.smbo.html). + The function must accept the arguments scenario_dict, + instances, num_params, runhistory, seed and ta. This is + an advanced feature. Use only if you are familiar with + [SMAC](https://automl.github.io/SMAC3/master/index.html). + all_supported_metrics (bool), (default=True): if True, all + metrics supported by the current task will be calculated + for each pipeline and results will be available via cv_results + precision (int), (default=32): Numeric precision used when loading + ensemble data. Can be either 16, 32 or 64. + disable_file_output (List), (default=[]): Which file outputs + should not be written to disk. + load_models (bool), (default=True): Whether to load the + models after fitting AutoPyTorch.
+ + Returns: + self + + """ + if dataset_name is None: + dataset_name = str(uuid.uuid1(clock_seq=os.getpid())) + + # we have to create a logger at this point for the validator + self._logger = self._get_logger(dataset_name) + + # Create a validator object to make sure that the data provided by + # the user matches the autopytorch requirements + self.InputValidator = TabularInputValidator( + is_classification=True, + logger_port=self._logger_port, + ) + + # Fit an input validator to check the provided data + # Also, an encoder is fit to both train and test data, + # to prevent unseen categories during inference + self.InputValidator.fit(X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test) + + self.dataset = TabularDataset( + X=X_train, Y=y_train, + X_test=X_test, Y_test=y_test, + validator=self.InputValidator, + resampling_strategy=self.resampling_strategy, + resampling_strategy_args=self.resampling_strategy_args, + ) + + return self._search( + dataset=self.dataset, + optimize_metric=optimize_metric, + budget_type=budget_type, + budget=budget, + total_walltime_limit=total_walltime_limit, + func_eval_time_limit=func_eval_time_limit, + traditional_per_total_budget=traditional_per_total_budget, + memory_limit=memory_limit, + smac_scenario_args=smac_scenario_args, + get_smac_object_callback=get_smac_object_callback, + all_supported_metrics=all_supported_metrics, + precision=precision, + disable_file_output=disable_file_output, + load_models=load_models, + ) + + def predict( + self, + X_test: np.ndarray, + batch_size: Optional[int] = None, + n_jobs: int = 1 + ) -> np.ndarray: + if self.InputValidator is None or not self.InputValidator._is_fitted: + raise ValueError("predict() is only supported after calling search. " + "Kindly call search() first.") + + X_test = self.InputValidator.feature_validator.transform(X_test) + predicted_probabilities = super().predict(X_test, batch_size=batch_size, + n_jobs=n_jobs) + + if self.InputValidator.target_validator.is_single_column_target(): + predicted_indexes = np.argmax(predicted_probabilities, axis=1) + else: + predicted_indexes = (predicted_probabilities > 0.5).astype(int) + + # Allow predicting in the original domain -- that is, the user is not interested + # in our encoded values + return self.InputValidator.target_validator.inverse_transform(predicted_indexes)
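# A minimal, self-contained sketch of the decoding step implemented above:
# class probabilities are argmaxed and the resulting indexes are mapped back
# to the original labels, emulated here with scikit-learn's OrdinalEncoder
# on hypothetical toy labels.
import numpy as np
from sklearn.preprocessing import OrdinalEncoder

encoder = OrdinalEncoder().fit(np.array([['cat'], ['dog'], ['cat']]))
predicted_probabilities = np.array([[0.9, 0.1], [0.2, 0.8]])  # stand-in for super().predict()
predicted_indexes = np.argmax(predicted_probabilities, axis=1)  # single-column target case
print(encoder.inverse_transform(predicted_indexes.reshape(-1, 1)).ravel())  # ['cat' 'dog']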
+ + def predict_proba(self, + X_test: Union[np.ndarray, pd.DataFrame, List], + batch_size: Optional[int] = None, n_jobs: int = 1) -> np.ndarray: + if self.InputValidator is None or not self.InputValidator._is_fitted: + raise ValueError("predict_proba() is only supported after calling search. " + "Kindly call search() first.") + X_test = self.InputValidator.feature_validator.transform(X_test) + return super().predict(X_test, batch_size=batch_size, n_jobs=n_jobs) diff --git a/autoPyTorch/data/__init__.py b/autoPyTorch/data/__init__.py new file mode 100644 index 000000000..dae354a67 --- /dev/null +++ b/autoPyTorch/data/__init__.py @@ -0,0 +1 @@ +# -*- encoding: utf-8 -*- diff --git a/autoPyTorch/data/base_feature_validator.py b/autoPyTorch/data/base_feature_validator.py new file mode 100644 index 000000000..2ef02ceba --- /dev/null +++ b/autoPyTorch/data/base_feature_validator.py @@ -0,0 +1,139 @@ +import logging +import typing + +import numpy as np + +import pandas as pd + +import scipy.sparse + +from sklearn.base import BaseEstimator + +from autoPyTorch.utils.logging_ import PicklableClientLogger + + +SUPPORTED_FEAT_TYPES = typing.Union[ + typing.List, + pd.DataFrame, + np.ndarray, + scipy.sparse.bsr_matrix, + scipy.sparse.coo_matrix, + scipy.sparse.csc_matrix, + scipy.sparse.csr_matrix, + scipy.sparse.dia_matrix, + scipy.sparse.dok_matrix, + scipy.sparse.lil_matrix, +] + + +class BaseFeatureValidator(BaseEstimator): + """ + A class to pre-process features. In this regard, the format of the data is checked, + and if applicable, features are encoded + Attributes: + feat_type (List[str]): + List of the column types found by this estimator during fit. + data_type (str): + Class name of the data type provided during fit. + encoder (typing.Optional[BaseEstimator]) + Hosts an encoder object if the data requires transformation (for example, + if provided a categorical column in a pandas DataFrame) + enc_columns (typing.List[str]) + List of columns that were encoded. + """ + def __init__(self, + logger: typing.Optional[typing.Union[PicklableClientLogger, logging.Logger + ]] = None, + ) -> None: + # Register types to detect unsupported data format changes + self.feat_type = None # type: typing.Optional[typing.List[str]] + self.data_type = None # type: typing.Optional[type] + self.dtypes = [] # type: typing.List[str] + self.column_order = [] # type: typing.List[str] + + self.encoder = None # type: typing.Optional[BaseEstimator] + self.enc_columns = [] # type: typing.List[str] + + self.logger: typing.Union[ + PicklableClientLogger, logging.Logger + ] = logger if logger is not None else logging.getLogger(__name__) + + # Required for dataset properties + self.num_features = None # type: typing.Optional[int] + self.categories = [] # type: typing.List[typing.List[int]] + self.categorical_columns: typing.List[int] = [] + self.numerical_columns: typing.List[int] = [] + + self._is_fitted = False + + def fit( + self, + X_train: SUPPORTED_FEAT_TYPES, + X_test: typing.Optional[SUPPORTED_FEAT_TYPES] = None, + ) -> BaseEstimator: + """ + Validates and fits a categorical encoder (if needed) to the features. + The supported data types are List, numpy arrays and pandas DataFrames.
+ CSR sparse data types are also supported + + Arguments: + X_train (SUPPORTED_FEAT_TYPES): + A set of features that are going to be validated (type and dimensionality + checks) and a encoder fitted in the case the data needs encoding + X_test (typing.Optional[SUPPORTED_FEAT_TYPES]): + A hold out set of data used for checking + """ + + # If a list was provided, it will be converted to pandas + if isinstance(X_train, list): + X_train, X_test = self.list_to_dataframe(X_train, X_test) + + self._check_data(X_train) + + if X_test is not None: + self._check_data(X_test) + + if np.shape(X_train)[1] != np.shape(X_test)[1]: + raise ValueError("The feature dimensionality of the train and test " + "data does not match train({}) != test({})".format( + np.shape(X_train)[1], + np.shape(X_test)[1] + )) + + # Fit on the training data + self._fit(X_train) + + self._is_fitted = True + + return self + + def _fit( + self, + X: SUPPORTED_FEAT_TYPES, + ) -> BaseEstimator: + """ + Arguments: + X (SUPPORTED_FEAT_TYPES): + A set of features that are going to be validated (type and dimensionality + checks) and a encoder fitted in the case the data needs encoding + Returns: + self: + The fitted base estimator + """ + raise NotImplementedError() + + def transform( + self, + X: SUPPORTED_FEAT_TYPES, + ) -> np.ndarray: + """ + Arguments: + X_train (SUPPORTED_FEAT_TYPES): + A set of features, whose categorical features are going to be + transformed + + Return: + np.ndarray: + The transformed array + """ + raise NotImplementedError() diff --git a/autoPyTorch/data/base_target_validator.py b/autoPyTorch/data/base_target_validator.py new file mode 100644 index 000000000..dba9c19e3 --- /dev/null +++ b/autoPyTorch/data/base_target_validator.py @@ -0,0 +1,193 @@ +import logging +import typing + +import numpy as np + +import pandas as pd + +import scipy.sparse + +from sklearn.base import BaseEstimator + +from autoPyTorch.utils.logging_ import PicklableClientLogger + + +SUPPORTED_TARGET_TYPES = typing.Union[ + typing.List, + pd.Series, + pd.DataFrame, + np.ndarray, + scipy.sparse.bsr_matrix, + scipy.sparse.coo_matrix, + scipy.sparse.csc_matrix, + scipy.sparse.csr_matrix, + scipy.sparse.dia_matrix, + scipy.sparse.dok_matrix, + scipy.sparse.lil_matrix, +] + + +class BaseTargetValidator(BaseEstimator): + """ + A class to pre-process targets. It validates the data provided during fit (to make sure + it matches AutoPyTorch expectation) as well as encoding the targets in case of classification + Attributes: + is_classification (bool): + A bool that indicates if the validator should operate in classification mode. + During classification, the targets are encoded. 
+ encoder (typing.Optional[BaseEstimator]): + Host a encoder object if the data requires transformation (for example, + if provided a categorical column in a pandas DataFrame) + enc_columns (typing.List[str]) + List of columns that where encoded + """ + def __init__(self, + is_classification: bool = False, + logger: typing.Optional[typing.Union[PicklableClientLogger, logging.Logger + ]] = None, + ) -> None: + self.is_classification = is_classification + + self.data_type = None # type: typing.Optional[type] + + self.encoder = None # type: typing.Optional[BaseEstimator] + + self.out_dimensionality = None # type: typing.Optional[int] + self.type_of_target = None # type: typing.Optional[str] + + self.logger: typing.Union[ + PicklableClientLogger, logging.Logger + ] = logger if logger is not None else logging.getLogger(__name__) + + # Store the dtype for remapping to correct type + self.dtype = None # type: typing.Optional[type] + + self._is_fitted = False + + def fit( + self, + y_train: SUPPORTED_TARGET_TYPES, + y_test: typing.Optional[SUPPORTED_TARGET_TYPES] = None, + ) -> BaseEstimator: + """ + Validates and fit a categorical encoder (if needed) to the targets + The supported data types are List, numpy arrays and pandas DataFrames. + + Arguments: + y_train (SUPPORTED_TARGET_TYPES) + A set of targets set aside for training + y_test (typing.Union[SUPPORTED_TARGET_TYPES]) + A hold out set of data used of the targets. It is also used to fit the + categories of the encoder. + """ + # Check that the data is valid + self._check_data(y_train) + + shape = np.shape(y_train) + if y_test is not None: + self._check_data(y_test) + + if len(shape) != len(np.shape(y_test)) or ( + len(shape) > 1 and (shape[1] != np.shape(y_test)[1])): + raise ValueError("The dimensionality of the train and test targets " + "does not match train({}) != test({})".format( + np.shape(y_train), + np.shape(y_test) + )) + if isinstance(y_train, pd.DataFrame): + y_train = typing.cast(pd.DataFrame, y_train) + y_test = typing.cast(pd.DataFrame, y_test) + if y_train.columns.tolist() != y_test.columns.tolist(): + raise ValueError( + "Train and test targets must both have the same columns, yet " + "y={} and y_test={} ".format( + y_train.columns, + y_test.columns + ) + ) + + if list(y_train.dtypes) != list(y_test.dtypes): + raise ValueError("Train and test targets must both have the same dtypes") + + if self.out_dimensionality is None: + self.out_dimensionality = 1 if len(shape) == 1 else shape[1] + else: + _n_outputs = 1 if len(shape) == 1 else shape[1] + if self.out_dimensionality != _n_outputs: + raise ValueError('Number of outputs changed from %d to %d!' % + (self.out_dimensionality, _n_outputs)) + + # Fit on the training data + self._fit(y_train, y_test) + + self._is_fitted = True + + return self + + def _fit( + self, + y_train: SUPPORTED_TARGET_TYPES, + y_test: typing.Optional[SUPPORTED_TARGET_TYPES] = None, + ) -> BaseEstimator: + """ + Arguments: + y_train (SUPPORTED_TARGET_TYPES) + The labels of the current task. 
They are going to be encoded in case + of classification + y_test (typing.Optional[SUPPORTED_TARGET_TYPES]) + A holdout set of labels + """ + raise NotImplementedError() + + def transform( + self, + y: typing.Union[SUPPORTED_TARGET_TYPES], + ) -> np.ndarray: + """ + Arguments: + y (SUPPORTED_TARGET_TYPES) + A set of targets that are going to be encoded if the current task + is classification + Returns: + np.ndarray: + The transformed array + """ + raise NotImplementedError() + + def inverse_transform( + self, + y: SUPPORTED_TARGET_TYPES, + ) -> np.ndarray: + """ + Revert any encoding transformation done on a target array + + Arguments: + y (typing.Union[np.ndarray, pd.DataFrame, pd.Series]): + Target array to be transformed back to original form before encoding + Returns: + np.ndarray: + The transformed array + """ + raise NotImplementedError() + + @property + def classes_(self) -> np.ndarray: + """ + Complies with scikit learn classes_ attribute, + which consist of a ndarray of shape (n_classes,) + where n_classes are the number of classes seen while fitting + a encoder to the targets. + Returns: + classes_: np.ndarray + The unique classes seen during encoding of a classifier + """ + if self.encoder is None: + return np.array([]) + else: + return self.encoder.categories_[0] + + def is_single_column_target(self) -> bool: + """ + Output is encoded with a single column encoding + """ + return self.out_dimensionality == 1 diff --git a/autoPyTorch/data/base_validator.py b/autoPyTorch/data/base_validator.py new file mode 100644 index 000000000..7528d56ab --- /dev/null +++ b/autoPyTorch/data/base_validator.py @@ -0,0 +1,123 @@ +# -*- encoding: utf-8 -*- +import logging.handlers +import typing + +import numpy as np + +from sklearn.base import BaseEstimator +from sklearn.exceptions import NotFittedError + +from autoPyTorch.data.base_feature_validator import SUPPORTED_FEAT_TYPES +from autoPyTorch.data.base_target_validator import SUPPORTED_TARGET_TYPES + + +class BaseInputValidator(BaseEstimator): + """ + Makes sure the input data complies with Auto-PyTorch requirements. + Categorical inputs are encoded via an Encoder, if the input + is a dataframe. This allow us to nicely predict string targets + + This class also perform checks for data integrity and flags the user + via informative errors. + + Attributes: + is_classification (bool): + For classification task, this flag indicates that the target data + should be encoded + feature_validator (FeatureValidator): + A FeatureValidator instance used to validate and encode feature columns to match + sklearn expectations on the data + target_validator (TargetValidator): + A TargetValidator instance used to validate and encode (in case of classification) + the target values + """ + def __init__( + self, + is_classification: bool = False, + logger_port: typing.Optional[int] = logging.handlers.DEFAULT_TCP_LOGGING_PORT, + ) -> None: + raise NotImplementedError() + + def fit( + self, + X_train: SUPPORTED_FEAT_TYPES, + y_train: SUPPORTED_TARGET_TYPES, + X_test: typing.Optional[SUPPORTED_FEAT_TYPES] = None, + y_test: typing.Optional[SUPPORTED_TARGET_TYPES] = None, + ) -> BaseEstimator: + """ + Validates and fit a categorical encoder (if needed) to the features, and + a encoder for targets in the case of classification. 
Specifically: + + For features: + + Valid data types are enforced (List, np.ndarray, pd.DataFrame, pd.Series, scipy + sparse) as well as dimensionality checks + + If the provided data is a pandas DataFrame with categorical/boolean/int columns, + such columns will be encoded using an Ordinal Encoder + For targets: + + Checks for dimensionality as well as missing values are performed. + + If performing a classification task, the data is going to be encoded + + Arguments: + X_train (SUPPORTED_FEAT_TYPES): + A set of features that are going to be validated (type and dimensionality + checks). If this data contains categorical columns, an encoder is going to + be instantiated and trained with this data. + y_train (SUPPORTED_TARGET_TYPES): + A set of targets that are going to be encoded if the task is for classification + X_test (typing.Optional[SUPPORTED_FEAT_TYPES]): + A hold out set of features used for checking + y_test (SUPPORTED_TARGET_TYPES): + A hold out set of targets used for checking. Additionally, if the current task + is a classification task, this y_test categories are also going to be used to + fit a pre-processing encoding (to prevent errors on unseen classes). + Returns: + self + """ + # Check that the data is valid + if np.shape(X_train)[0] != np.shape(y_train)[0]: + raise ValueError("Inconsistent number of train datapoints for features and targets," + " {} for features and {} for targets".format( + np.shape(X_train)[0], + np.shape(y_train)[0], + )) + if X_test is not None and np.shape(X_test)[0] != np.shape(y_test)[0]: + raise ValueError("Inconsistent number of test datapoints for features and targets," + " {} for features and {} for targets".format( + np.shape(X_test)[0], + np.shape(y_test)[0], + )) + + self.feature_validator.fit(X_train, X_test) + self.target_validator.fit(y_train, y_test) + self._is_fitted = True + + return self + + def transform( + self, + X: SUPPORTED_FEAT_TYPES, + y: typing.Optional[SUPPORTED_TARGET_TYPES] = None, + ) -> typing.Tuple[np.ndarray, typing.Optional[np.ndarray]]: + """ + Transform the given target or features to a numpy array + + Arguments: + X (SUPPORTED_FEAT_TYPES): + A set of features to transform + y (typing.Optional[SUPPORTED_TARGET_TYPES]): + A set of targets to transform + + Returns: + np.ndarray: + The transformed features array + np.ndarray: + The transformed targets array + """ + if not self._is_fitted: + raise NotFittedError("Cannot call transform on a validator that is not fitted") + X_transformed = self.feature_validator.transform(X) + if y is not None: + return X_transformed, self.target_validator.transform(y) + else: + return X_transformed, y diff --git a/autoPyTorch/data/tabular_feature_validator.py b/autoPyTorch/data/tabular_feature_validator.py new file mode 100644 index 000000000..fb9a72082 --- /dev/null +++ b/autoPyTorch/data/tabular_feature_validator.py @@ -0,0 +1,378 @@ +import functools +import typing + +import numpy as np + +import pandas as pd +from pandas.api.types import is_numeric_dtype + +import scipy.sparse + +import sklearn.utils +from sklearn import preprocessing +from sklearn.base import BaseEstimator +from sklearn.compose import make_column_transformer +from sklearn.exceptions import NotFittedError + +from autoPyTorch.data.base_feature_validator import BaseFeatureValidator, SUPPORTED_FEAT_TYPES + + +class TabularFeatureValidator(BaseFeatureValidator): + def _fit( + self, + X: SUPPORTED_FEAT_TYPES, + ) -> BaseEstimator: + """ + In case input data is a pandas DataFrame, this utility encodes the user provided + 
features (from categorical for example) to a numerical value that further stages + will be able to use + + Arguments: + X (SUPPORTED_FEAT_TYPES): + A set of features that are going to be validated (type and dimensionality + checks) and an encoder fitted in case the data needs encoding + Returns: + self: + The fitted base estimator + """ + + # The final output of a validator is a numpy array. But pandas + # gives us information about the column dtype + if isinstance(X, np.ndarray): + X = self.numpy_array_to_pandas(X) + + if hasattr(X, "iloc") and not scipy.sparse.issparse(X): + X = typing.cast(pd.DataFrame, X) + # Treat a column whose instances are all NaN as numerical + # This prevents encoding a categorical column made completely + # out of nan values -- which would trigger a failure, as encoding is not supported + # with nan values. + # Columns that are completely made of NaN values are provided to the pipeline + # so that later stages decide how to handle them + if np.any(pd.isnull(X)): + for column in X.columns: + if X[column].isna().all(): + X[column] = pd.to_numeric(X[column]) + + self.enc_columns, self.feat_type = self._get_columns_to_encode(X) + + if len(self.enc_columns) > 0: + + self.encoder = make_column_transformer( + (preprocessing.OrdinalEncoder( + handle_unknown='use_encoded_value', + unknown_value=-1, + ), self.enc_columns), + remainder="passthrough" + ) + + # Mypy redefinition + assert self.encoder is not None + self.encoder.fit(X) + + # The column transformer reorders the feature types - we therefore need to update + # our bookkeeping as well: categorical columns are moved to the left, + # which shifts the numerical columns to the right + def comparator(cmp1: str, cmp2: str) -> int: + if ( + cmp1 == 'categorical' and cmp2 == 'categorical' + or cmp1 == 'numerical' and cmp2 == 'numerical' + ): + return 0 + elif cmp1 == 'categorical' and cmp2 == 'numerical': + return -1 + elif cmp1 == 'numerical' and cmp2 == 'categorical': + return 1 + else: + raise ValueError((cmp1, cmp2)) + self.feat_type = sorted( + self.feat_type, + key=functools.cmp_to_key(comparator) + ) + + self.categories = [ + # We fit an ordinal encoder, where all categorical + # columns are shifted to the left + list(range(len(cat))) + for cat in self.encoder.transformers_[0][1].categories_ + ] + + for i, type_ in enumerate(self.feat_type): + if 'numerical' in type_: + self.numerical_columns.append(i) + else: + self.categorical_columns.append(i) + + # Lastly, store the number of features + self.num_features = np.shape(X)[1] + return self + + def transform( + self, + X: SUPPORTED_FEAT_TYPES, + ) -> np.ndarray: + """ + Transforms the features, applying the encoder fitted above if the data needed encoding. + The supported data types are List, numpy arrays and pandas DataFrames.
+ + Arguments: + X_train (SUPPORTED_FEAT_TYPES): + A set of features, whose categorical features are going to be + transformed + + Return: + np.ndarray: + The transformed array + """ + if not self._is_fitted: + raise NotFittedError("Cannot call transform on a validator that is not fitted") + + # If a list was provided, it will be converted to pandas + if isinstance(X, list): + X, _ = self.list_to_dataframe(X) + + if isinstance(X, np.ndarray): + X = self.numpy_array_to_pandas(X) + + if hasattr(X, "iloc") and not scipy.sparse.issparse(X): + X = typing.cast(pd.DataFrame, X) + if np.any(pd.isnull(X)): + for column in X.columns: + if X[column].isna().all(): + X[column] = pd.to_numeric(X[column]) + + # Check the data here so we catch problems on new test data + self._check_data(X) + + # Pandas related transformations + if hasattr(X, "iloc") and self.encoder is not None: + if np.any(pd.isnull(X)): + # After above check it means that if there is a NaN + # the whole column must be NaN + # Make sure it is numerical and let the pipeline handle it + for column in X.columns: + if X[column].isna().all(): + X[column] = pd.to_numeric(X[column]) + X = self.encoder.transform(X) + + # Sparse related transformations + # Not all sparse format support index sorting + if scipy.sparse.issparse(X) and hasattr(X, 'sort_indices'): + X.sort_indices() + + return sklearn.utils.check_array( + X, + force_all_finite=False, + accept_sparse='csr' + ) + + def _check_data( + self, + X: SUPPORTED_FEAT_TYPES, + ) -> None: + """ + Feature dimensionality and data type checks + + Arguments: + X (SUPPORTED_FEAT_TYPES): + A set of features that are going to be validated (type and dimensionality + checks) and a encoder fitted in the case the data needs encoding + """ + + if not isinstance(X, (np.ndarray, pd.DataFrame)) and not scipy.sparse.issparse(X): + raise ValueError("AutoPyTorch only supports Numpy arrays, Pandas DataFrames," + " scipy sparse and Python Lists, yet, the provided input is" + " of type {}".format( + type(X) + )) + + if self.data_type is None: + self.data_type = type(X) + if self.data_type != type(X): + self.logger.warning("AutoPyTorch previously received features of type %s " + "yet the current features have type %s. Changing the dtype " + "of inputs to an estimator might cause problems" % ( + str(self.data_type), + str(type(X)), + ), + ) + + # Do not support category/string numpy data. Only numbers + if hasattr(X, "dtype"): + if not np.issubdtype(X.dtype.type, np.number): # type: ignore[union-attr] + raise ValueError( + "When providing a numpy array to AutoPyTorch, the only valid " + "dtypes are numerical ones. The provided data type {} is not supported." + "".format( + X.dtype.type, # type: ignore[union-attr] + ) + ) + + # Then for Pandas, we do not support Nan in categorical columns + if hasattr(X, "iloc"): + # If entered here, we have a pandas dataframe + X = typing.cast(pd.DataFrame, X) + + # Define the column to be encoded here as the feature validator is fitted once + # per estimator + enc_columns, _ = self._get_columns_to_encode(X) + + if len(enc_columns) > 0: + if np.any(pd.isnull( + X[enc_columns].dropna( # type: ignore[call-overload] + axis='columns', how='all') + )): + # Ignore all NaN columns, and if still a NaN + # Error out + raise ValueError("Categorical features in a dataframe cannot contain " + "missing/NaN values. 
The OrdinalEncoder used by " + "AutoPyTorch cannot handle this yet (due to a " + "limitation on scikit-learn being addressed via: " + "https://github.com/scikit-learn/scikit-learn/issues/17123)" + ) + column_order = [column for column in X.columns] + if len(self.column_order) > 0: + if self.column_order != column_order: + raise ValueError("Changing the column order of the features after fit() is " + "not supported. Fit() method was called with " + "{} whereas the new features have {} as type".format( + self.column_order, + column_order, + )) + else: + self.column_order = column_order + dtypes = [dtype.name for dtype in X.dtypes] + if len(self.dtypes) > 0: + if self.dtypes != dtypes: + raise ValueError("Changing the dtype of the features after fit() is " + "not supported. Fit() method was called with " + "{} whereas the new features have {} as type".format( + self.dtypes, + dtypes, + )) + else: + self.dtypes = dtypes + + def _get_columns_to_encode( + self, + X: pd.DataFrame, + ) -> typing.Tuple[typing.List[str], typing.List[str]]: + """ + Return the columns to be encoded from a pandas dataframe + + Arguments: + X (pd.DataFrame) + A set of features that are going to be validated (type and dimensionality + checks) and a encoder fitted in the case the data needs encoding + Returns: + enc_columns (List[str]): + Columns to encode, if any + feat_type: + Type of each column numerical/categorical + """ + # Register if a column needs encoding + enc_columns = [] + + # Also, register the feature types for the estimator + feat_type = [] + + # Make sure each column is a valid type + for i, column in enumerate(X.columns): + if X[column].dtype.name in ['category', 'bool']: + + enc_columns.append(column) + feat_type.append('categorical') + # Move away from np.issubdtype as it causes + # TypeError: data type not understood in certain pandas types + elif not is_numeric_dtype(X[column]): + if X[column].dtype.name == 'object': + raise ValueError( + "Input Column {} has invalid type object. " + "Cast it to a valid dtype before using it in AutoPyTorch. " + "Valid types are numerical, categorical or boolean. " + "You can cast it to a valid dtype using " + "pandas.Series.astype ." + "If working with string objects, the following " + "tutorial illustrates how to work with text data: " + "https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html".format( # noqa: E501 + column, + ) + ) + elif pd.core.dtypes.common.is_datetime_or_timedelta_dtype( + X[column].dtype + ): + raise ValueError( + "AutoPyTorch does not support time and/or date datatype as given " + "in column {}. Please convert the time information to a numerical value " + "first. One example on how to do this can be found on " + "https://stats.stackexchange.com/questions/311494/".format( + column, + ) + ) + else: + raise ValueError( + "Input Column {} has unsupported dtype {}. " + "Supported column types are categorical/bool/numerical dtypes. " + "Make sure your data is formatted in a correct way, " + "before feeding it to AutoPyTorch.".format( + column, + X[column].dtype.name, + ) + ) + else: + feat_type.append('numerical') + return enc_columns, feat_type + + def list_to_dataframe( + self, + X_train: SUPPORTED_FEAT_TYPES, + X_test: typing.Optional[SUPPORTED_FEAT_TYPES] = None, + ) -> typing.Tuple[pd.DataFrame, typing.Optional[pd.DataFrame]]: + """ + Converts a list to a pandas DataFrame. In this process, column types are inferred. 
+ + If test data is provided, we proactively match it to train data + + Arguments: + X_train (SUPPORTED_FEAT_TYPES): + A set of features that are going to be validated (type and dimensionality + checks) and a encoder fitted in the case the data needs encoding + X_test (typing.Optional[SUPPORTED_FEAT_TYPES]): + A hold out set of data used for checking + Returns: + pd.DataFrame: + transformed train data from list to pandas DataFrame + pd.DataFrame: + transformed test data from list to pandas DataFrame + """ + + # If a list was provided, it will be converted to pandas + X_train = pd.DataFrame(data=X_train).infer_objects() + self.logger.warning("The provided feature types to AutoPyTorch are of type list." + "Features have been interpreted as: {}".format( + [(col, t) for col, t in zip(X_train.columns, X_train.dtypes)] + )) + if X_test is not None: + if not isinstance(X_test, list): + self.logger.warning("Train features are a list while the provided test data" + "is {}. X_test will be casted as DataFrame.".format( + type(X_test) + )) + X_test = pd.DataFrame(data=X_test).infer_objects() + return X_train, X_test + + def numpy_array_to_pandas( + self, + X: np.ndarray, + ) -> pd.DataFrame: + """ + Converts a numpy array to pandas for type inference + + Arguments: + X (np.ndarray): + data to be interpreted. + + Returns: + pd.DataFrame + """ + return pd.DataFrame(X).infer_objects().convert_dtypes() diff --git a/autoPyTorch/data/tabular_target_validator.py b/autoPyTorch/data/tabular_target_validator.py new file mode 100644 index 000000000..239791768 --- /dev/null +++ b/autoPyTorch/data/tabular_target_validator.py @@ -0,0 +1,265 @@ +import typing + +import numpy as np + +import pandas as pd +from pandas.api.types import is_numeric_dtype + +import scipy.sparse + +import sklearn.utils +from sklearn import preprocessing +from sklearn.base import BaseEstimator +from sklearn.exceptions import NotFittedError +from sklearn.utils.multiclass import type_of_target + +from autoPyTorch.data.base_target_validator import BaseTargetValidator, SUPPORTED_TARGET_TYPES + + +class TabularTargetValidator(BaseTargetValidator): + def _fit( + self, + y_train: SUPPORTED_TARGET_TYPES, + y_test: typing.Optional[SUPPORTED_TARGET_TYPES] = None, + ) -> BaseEstimator: + """ + If dealing with classification, this utility encodes the targets. + + It does so by also using the classes from the test data, to prevent encoding + errors + + Arguments: + y_train (SUPPORTED_TARGET_TYPES) + The labels of the current task. 
They are going to be encoded in case + of classification + y_test (typing.Optional[SUPPORTED_TARGET_TYPES]) + A holdout set of labels + """ + if not self.is_classification or self.type_of_target == 'multilabel-indicator': + # Only fit an encoder for classification tasks + # Also, encoding multilabel indicator data makes the data multiclass + # Let the user employ a MultiLabelBinarizer if needed + return self + + if y_test is not None: + if hasattr(y_train, "iloc"): + y_train = pd.concat([y_train, y_test], ignore_index=True, sort=False) + elif isinstance(y_train, list): + y_train = y_train + y_test + elif isinstance(y_train, np.ndarray): + y_train = np.concatenate((y_train, y_test)) + + ndim = len(np.shape(y_train)) + if ndim == 1 or (ndim > 1 and np.shape(y_train)[1] == 1): + # The label encoder makes sure data is, and remains + # 1 dimensional + self.encoder = preprocessing.OrdinalEncoder(handle_unknown='use_encoded_value', + unknown_value=-1) + else: + # We should not reach this if statement as we check for type of targets before + raise ValueError("Multi-dimensional classification is not yet supported. " + "Encoding multidimensional data converts multiple columns " + "to a 1 dimensional encoding. Data involved = {}/{}".format( + np.shape(y_train), + self.type_of_target + )) + + # Mypy redefinition + assert self.encoder is not None + + # remove ravel warning from pandas Series + if ndim > 1: + self.encoder.fit(y_train) + else: + if hasattr(y_train, 'iloc'): + y_train = typing.cast(pd.DataFrame, y_train) + self.encoder.fit(y_train.to_numpy().reshape(-1, 1)) + else: + self.encoder.fit(np.array(y_train).reshape(-1, 1)) + + # we leave objects unchanged, so no need to store dtype in this case + if hasattr(y_train, 'dtype'): + # Series and numpy arrays are checked here + # Cast is as numpy for mypy checks + y_train = typing.cast(np.ndarray, y_train) + if is_numeric_dtype(y_train.dtype): + self.dtype = y_train.dtype + elif hasattr(y_train, 'dtypes') and is_numeric_dtype(typing.cast(pd.DataFrame, + y_train).dtypes[0]): + # This case is for pandas array with a single column + y_train = typing.cast(pd.DataFrame, y_train) + self.dtype = y_train.dtypes[0] + + return self + + def transform( + self, + y: typing.Union[SUPPORTED_TARGET_TYPES], + ) -> np.ndarray: + """ + Validates and fit a categorical encoder (if needed) to the features. + The supported data types are List, numpy arrays and pandas DataFrames. + + Arguments: + y (SUPPORTED_TARGET_TYPES) + A set of targets that are going to be encoded if the current task + is classification + Returns: + np.ndarray: + The transformed array + """ + if not self._is_fitted: + raise NotFittedError("Cannot call transform on a validator that is not fitted") + + # Check the data here so we catch problems on new test data + self._check_data(y) + + if self.encoder is not None: + # remove ravel warning from pandas Series + shape = np.shape(y) + if len(shape) > 1: + y = self.encoder.transform(y) + else: + # The Ordinal encoder expects a 2 dimensional input. 
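# (A hedged illustration of the constraint stated above, assuming scikit-learn >= 0.24:
# OrdinalEncoder().fit(np.array(['a', 'b']).reshape(-1, 1)) accepts the 2D column
# vector, and transform(...).reshape(-1) flattens the resulting [[0.], [1.]] back
# to the 1D target shape.)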
+ # The targets are 1 dimensional, so reshape to match the expected shape + if hasattr(y, 'iloc'): + y = typing.cast(pd.DataFrame, y) + y = self.encoder.transform(y.to_numpy().reshape(-1, 1)).reshape(-1) + else: + y = self.encoder.transform(np.array(y).reshape(-1, 1)).reshape(-1) + + # sklearn check array will make sure we have the + # correct numerical features for the array + # Also, a numpy array will be created + y = sklearn.utils.check_array( y, force_all_finite=True, accept_sparse='csr', ensure_2d=False, ) + + # When translating a dataframe to numpy, make sure we + # honor the ravel requirement + if y.ndim == 2 and y.shape[1] == 1: + y = np.ravel(y) + + return y + + def inverse_transform( + self, + y: SUPPORTED_TARGET_TYPES, + ) -> np.ndarray: + """ + Revert any encoding transformation done on a target array + + Arguments: + y (typing.Union[np.ndarray, pd.DataFrame, pd.Series]): + Target array to be transformed back to original form before encoding + Returns: + np.ndarray: + The transformed array + """ + if not self._is_fitted: + raise NotFittedError("Cannot call inverse_transform on a validator that is not fitted") + + if self.encoder is None: + return y + shape = np.shape(y) + if len(shape) > 1: + y = self.encoder.inverse_transform(y) + else: + # The targets should be a flattened array, hence reshape with -1 + if hasattr(y, 'iloc'): + y = typing.cast(pd.DataFrame, y) + y = self.encoder.inverse_transform(y.to_numpy().reshape(-1, 1)).reshape(-1) + else: + y = self.encoder.inverse_transform(np.array(y).reshape(-1, 1)).reshape(-1) + + # Inverse transform returns a numpy array of type object + # This breaks certain metrics such as accuracy, which makes type_of_target be unknown + # If a dtype was observed during fit, we try to honor that dtype + if self.dtype is not None: + y = y.astype(self.dtype) + return y + + def _check_data( + self, + y: SUPPORTED_TARGET_TYPES, + ) -> None: + """ + Perform dimensionality and data type checks on the targets + + Arguments: + y (typing.Union[np.ndarray, pd.DataFrame, pd.Series]): + A set of targets whose dimensionality and data type are going to be checked + """ + + if not isinstance( y, (np.ndarray, pd.DataFrame, list, pd.Series)) and not scipy.sparse.issparse(y): + raise ValueError("AutoPyTorch only supports Numpy arrays, Pandas DataFrames," + " pd.Series, sparse data and Python Lists as targets, yet, " + "the provided input is of type {}".format( + type(y) + )) + + # Sparse data must be numerical + # Type ignore on attribute because sparse targets have a dtype + if scipy.sparse.issparse(y) and not np.issubdtype(y.dtype.type, # type: ignore[union-attr] + np.number): + raise ValueError("When providing a sparse matrix as targets, the only supported " + "values are numerical. Please consider using a dense " + "representation instead." + ) + + if self.data_type is None: + self.data_type = type(y) + if self.data_type != type(y): + self.logger.warning("AutoPyTorch previously received targets of type %s " + "yet the current targets have type %s.
Changing the dtype " + "of inputs to an estimator might cause problems" % ( + str(self.data_type), + str(type(y)), + ), + ) + + # No Nan is supported + has_nan_values = False + if hasattr(y, 'iloc'): + has_nan_values = typing.cast(pd.DataFrame, y).isnull().values.any() + if scipy.sparse.issparse(y): + y = typing.cast(scipy.sparse.spmatrix, y) + has_nan_values = not np.array_equal(y.data, y.data) + else: + # List and array like values are considered here + # np.isnan cannot work on strings, so we have to check for every element + # but NaN, are not equal to themselves: + has_nan_values = not np.array_equal(y, y) + if has_nan_values: + raise ValueError("Target values cannot contain missing/NaN values. " + "This is not supported by scikit-learn. " + ) + + # Pandas Series is not supported for multi-label indicator + # This format checks are done by type of target + try: + self.type_of_target = type_of_target(y) + except Exception as e: + raise ValueError("The provided data could not be interpreted by AutoPyTorch. " + "While determining the type of the targets via type_of_target " + "run into exception: {}.".format(e)) + + supported_output_types = ('binary', + 'continuous', + 'continuous-multioutput', + 'multiclass', + 'multilabel-indicator', + # Notice unknown/multiclass-multioutput are not supported + # This can only happen during testing only as estimators + # should filter out unsupported types. + ) + if self.type_of_target not in supported_output_types: + raise ValueError("Provided targets are not supported by AutoPyTorch. " + "Provided type is {} whereas supported types are {}.".format( + self.type_of_target, + supported_output_types + )) diff --git a/autoPyTorch/data/tabular_validator.py b/autoPyTorch/data/tabular_validator.py new file mode 100644 index 000000000..449cd3e3b --- /dev/null +++ b/autoPyTorch/data/tabular_validator.py @@ -0,0 +1,51 @@ +# -*- encoding: utf-8 -*- +import logging +import typing + +from autoPyTorch.data.base_validator import BaseInputValidator +from autoPyTorch.data.tabular_feature_validator import TabularFeatureValidator +from autoPyTorch.data.tabular_target_validator import TabularTargetValidator +from autoPyTorch.utils.logging_ import PicklableClientLogger, get_named_client_logger + + +class TabularInputValidator(BaseInputValidator): + """ + Makes sure the input data complies with Auto-PyTorch requirements. + Categorical inputs are encoded via an Encoder, if the input + is a dataframe. This allow us to nicely predict string targets + + This class also perform checks for data integrity and flags the user + via informative errors. 
+ + Attributes: + is_classification (bool): + For classification task, this flag indicates that the target data + should be encoded + feature_validator (FeatureValidator): + A FeatureValidator instance used to validate and encode feature columns to match + sklearn expectations on the data + target_validator (TargetValidator): + A TargetValidator instance used to validate and encode (in case of classification) + the target values + """ + def __init__( + self, + is_classification: bool = False, + logger_port: typing.Optional[int] = None, + ) -> None: + self.is_classification = is_classification + self.logger_port = logger_port + if self.logger_port is not None: + self.logger: typing.Union[logging.Logger, PicklableClientLogger] = get_named_client_logger( + name='Validation', + port=self.logger_port, + ) + else: + self.logger = logging.getLogger('Validation') + + self.feature_validator = TabularFeatureValidator(logger=self.logger) + self.target_validator = TabularTargetValidator( + is_classification=self.is_classification, + logger=self.logger + ) + self._is_fitted = False diff --git a/autoPyTorch/datasets/tabular_dataset.py b/autoPyTorch/datasets/tabular_dataset.py index dbaa3a260..5087f6886 100644 --- a/autoPyTorch/datasets/tabular_dataset.py +++ b/autoPyTorch/datasets/tabular_dataset.py @@ -1,12 +1,9 @@ -from enum import Enum -from typing import Any, Dict, List, Optional, Tuple, Union +from typing import Any, Dict, Optional, Union import numpy as np import pandas as pd -from sklearn.utils import check_array - import torchvision.transforms from autoPyTorch.constants import ( @@ -19,6 +16,7 @@ TABULAR_REGRESSION, TASK_TYPES_TO_STRING, ) +from autoPyTorch.data.base_validator import BaseInputValidator from autoPyTorch.datasets.base_dataset import BaseDataset from autoPyTorch.datasets.resampling_strategy import ( CrossValTypes, @@ -26,13 +24,6 @@ ) -class DataTypes(Enum): - Canonical = 1 - Float = 2 - String = 3 - Categorical = 4 - - class Value2Index(object): def __init__(self, values: list): assert all(not (pd.isna(v)) for v in values) @@ -71,7 +62,8 @@ class TabularDataset(BaseDataset): """ - def __init__(self, X: Union[np.ndarray, pd.DataFrame], + def __init__(self, + X: Union[np.ndarray, pd.DataFrame], Y: Union[np.ndarray, pd.Series], X_test: Optional[Union[np.ndarray, pd.DataFrame]] = None, Y_test: Optional[Union[np.ndarray, pd.DataFrame]] = None, @@ -82,36 +74,23 @@ def __init__(self, X: Union[np.ndarray, pd.DataFrame], train_transforms: Optional[torchvision.transforms.Compose] = None, val_transforms: Optional[torchvision.transforms.Compose] = None, dataset_name: Optional[str] = None, + validator: Optional[BaseInputValidator] = None, ): - X, self.data_types, self.nan_mask, self.itovs, self.vtois = self.interpret_columns(X) + # Take information from the validator, which guarantees clean data for the + # dataset. + # TODO: Consider moving the validator to the pipeline itself when we + # move to using the fit_params on scikit learn 0.24 + if validator is None: + raise ValueError("A feature validator is required to build a tabular pipeline") - if Y is not None: - Y, _, self.target_nan_mask, self.target_itov, self.target_vtoi = self.interpret_columns( - Y, assert_single_column=True) - # For tabular classification, we expect also that it complies with Sklearn - # The below check_array performs input data checks and make sure that a numpy array - # is returned, as both Pytorch/Sklearn deal directly with numpy/list objects. 
- # In this particular case, the interpret() returns a pandas object (needed to extract) - # the data types, yet check_array translate the np.array. When Sklearn support pandas - # the below function will simply return Pandas DataFrame. - Y = check_array(Y, ensure_2d=False) - - self.categorical_columns, self.numerical_columns, self.categories, self.num_features = \ - self.infer_dataset_properties(X) - - # Allow support for X_test, Y_test. They will NOT be used for optimization, but - # rather to have a performance through time on the test data + X, Y = validator.transform(X, Y) if X_test is not None: - X_test, self._test_data_types, _, _, _ = self.interpret_columns(X_test) - # Some quality checks on the data - if self.data_types != self._test_data_types: - raise ValueError(f"The train data inferred types {self.data_types} are " - "different than the test inferred types {self._test_data_types}") - if Y_test is not None: - Y_test, _, _, _, _ = self.interpret_columns( - Y_test, assert_single_column=True) - Y_test = check_array(Y_test, ensure_2d=False) + X_test, Y_test = validator.transform(X_test, Y_test) + self.categorical_columns = validator.feature_validator.categorical_columns + self.numerical_columns = validator.feature_validator.numerical_columns + self.num_features = validator.feature_validator.num_features + self.categories = validator.feature_validator.categories super().__init__(train_tensors=(X, Y), test_tensors=(X_test, Y_test), shuffle=shuffle, resampling_strategy=resampling_strategy, @@ -131,100 +110,6 @@ def __init__(self, X: Union[np.ndarray, pd.DataFrame], if STRING_TO_TASK_TYPES[self.task_type] in CLASSIFICATION_TASKS: self.num_classes: int = len(np.unique(self.train_tensors[1])) - def interpret_columns(self, - data: Union[np.ndarray, pd.DataFrame, pd.Series], - assert_single_column: bool = False - ) -> Tuple[Union[pd.DataFrame, Any], List[DataTypes], - Union[np.ndarray], - List[Optional[list]], - List[Optional[Value2Index]]]: - """ - Interpret information such as data, data_types, nan_mask, itovs, vtois - about the columns from the given data. - - Args: - data (Union[np.ndarray, pd.DataFrame, pd.Series]): data to be - interpreted. - assert_single_column (bool), (default=False): flag for - asserting that the data contains a single column - - Returns: - Tuple[pd.DataFrame, List[DataTypes], - Union[np.ndarray], - List[Optional[list]], - List[Optional[Value2Index]]]: Tuple of information - """ - single_column = False - if isinstance(data, np.ndarray): - if len(data.shape) == 1 and ',' not in str(data.dtype): - single_column = True - data = data[:, None] - data = pd.DataFrame(data).infer_objects().convert_dtypes() - elif isinstance(data, pd.DataFrame): - data = data.infer_objects().convert_dtypes() - elif isinstance(data, pd.Series): - single_column = True - data = data.to_frame() - else: - raise ValueError('Provided data needs to be either an np.ndarray or a pd.DataFrame for TabularDataset.') - if assert_single_column: - assert single_column, \ - "The data is asserted to be only of a single column, but it isn't. \ - Most likely your targets are not a vector or series." 
-
-        data_types = []
-        nan_mask = data.isna().to_numpy()
-        for col_index, dtype in enumerate(data.dtypes):
-            if dtype.kind == 'f':
-                data_types.append(DataTypes.Float)
-            elif dtype.kind in ('i', 'u', 'b'):
-                data_types.append(DataTypes.Canonical)
-            elif isinstance(dtype, pd.StringDtype):
-                data_types.append(DataTypes.String)
-            elif dtype.name == 'category':
-                # OpenML format categorical columns as category
-                # So add support for that
-                data_types.append(DataTypes.Categorical)
-            else:
-                raise ValueError(f"The dtype in column {col_index} is {dtype} which is not supported.")
-        itovs: List[Optional[List[Any]]] = []
-        vtois: List[Optional[Value2Index]] = []
-        for col_index, (_, col) in enumerate(data.iteritems()):
-            if data_types[col_index] != DataTypes.Float:
-                non_na_values = [v for v in set(col) if not pd.isna(v)]
-                non_na_values.sort()
-                itovs.append([np.nan] + non_na_values)
-                vtois.append(Value2Index(non_na_values))
-            else:
-                itovs.append(None)
-                vtois.append(None)
-
-        if single_column:
-            return data.iloc[:, 0], data_types, nan_mask, itovs, vtois
-
-        return data, data_types, nan_mask, itovs, vtois
-
-    def infer_dataset_properties(self, X: Any) -> Tuple[List[int], List[int], List[object], int]:
-        """
-        Infers the properties of the dataset like
-        categorical_columns, numerical_columns, categories, num_features
-        Args:
-            X: input training data
-
-        Returns:
-            (Tuple[List[int], List[int], List[object], int]):
-        """
-        categorical_columns = []
-        numerical_columns = []
-        for i, data_type in enumerate(self.data_types):
-            if data_type == DataTypes.String or data_type == DataTypes.Categorical:
-                categorical_columns.append(i)
-            else:
-                numerical_columns.append(i)
-        categories = [np.unique(X.iloc[:, a]).tolist() for a in categorical_columns]
-        num_features = X.shape[1]
-        return categorical_columns, numerical_columns, categories, num_features
-
     def get_required_dataset_info(self) -> Dict[str, Any]:
         """
         Returns a dictionary containing required dataset properties to instantiate a pipeline,
diff --git a/autoPyTorch/pipeline/base_pipeline.py b/autoPyTorch/pipeline/base_pipeline.py
index 1eef14f7f..83484da1d 100644
--- a/autoPyTorch/pipeline/base_pipeline.py
+++ b/autoPyTorch/pipeline/base_pipeline.py
@@ -106,6 +106,52 @@ def __init__(

         self._additional_run_info = {}  # type: Dict[str, str]

+    def fit(self, X: Dict[str, Any], y: Optional[np.ndarray] = None,
+            **fit_params: Any) -> Pipeline:
+        """Fit the selected algorithm to the training data.
+        Arguments:
+            X (typing.Dict):
+                A fit dictionary that contains information to fit a pipeline
+                TODO: Use fit_params support from 0.24 scikit learn version instead
+            y (None):
+                Used for compatibility, but it has no function in our fit strategy
+                TODO: use actual y when moving to fit_params support
+            fit_params : dict
+                See the documentation of sklearn.pipeline.Pipeline for formatting
+                instructions.
+
+        Returns:
+            self :
+                returns an instance of self.
+
+        Raises:
+            NoModelException
+                NoModelException is raised if fit() is called without specifying
+                a classification algorithm first.
+ """ + X, fit_params = self.fit_transformer(X, y, **fit_params) + self.fit_estimator(X, y, **fit_params) + return self + + def fit_transformer(self, X: Dict[str, Any], y: Optional[np.ndarray] = None, + fit_params: Optional[Dict] = None, + ) -> Tuple[Dict[str, Any], Dict[str, Any]]: + if fit_params is None: + fit_params = {} + fit_params = {key.replace(":", "__"): value for key, value in + fit_params.items()} + fit_params_steps = self._check_fit_params(**fit_params) + Xt = self._fit(X, y, **fit_params_steps) + return Xt, fit_params_steps[self.steps[-1][0]] + + def fit_estimator(self, X: Dict[str, Any], + y: Optional[np.ndarray], **fit_params: Any + ) -> Pipeline: + fit_params = {key.replace(":", "__"): value for key, value in + fit_params.items()} + self._final_estimator.fit(X, y, **fit_params) + return self + def get_max_iter(self) -> int: if self.estimator_supports_iterative_fit(): return self._final_estimator.get_max_iter() diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/encoding/OneHotEncoder.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/encoding/OneHotEncoder.py index 8a781a986..8284c22f2 100644 --- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/encoding/OneHotEncoder.py +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/encoding/OneHotEncoder.py @@ -19,8 +19,12 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEncoder: self.check_requirements(X, y) - self.preprocessor['categorical'] = OHE(categories=X['dataset_properties']['categories'], - sparse=False, handle_unknown='error') + self.preprocessor['categorical'] = OHE( + # It is safer to have the OHE produce a 0 array than to crash a good configuration + categories=X['dataset_properties']['categories'] + if len(X['dataset_properties']['categories']) > 0 else 'auto', + sparse=False, + handle_unknown='ignore') return self @staticmethod diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/encoding/OrdinalEncoder.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/encoding/OrdinalEncoder.py index 7b127f00a..c65726327 100644 --- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/encoding/OrdinalEncoder.py +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/encoding/OrdinalEncoder.py @@ -19,7 +19,9 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEncoder: self.check_requirements(X, y) - self.preprocessor['categorical'] = OE(categories=X['dataset_properties']['categories']) + self.preprocessor['categorical'] = OE(handle_unknown='use_encoded_value', + unknown_value=-1, + ) return self @staticmethod diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/imputation/SimpleImputer.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/imputation/SimpleImputer.py index 95d89726c..6b2a81bc9 100644 --- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/imputation/SimpleImputer.py +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/imputation/SimpleImputer.py @@ -15,6 +15,8 @@ class SimpleImputer(BaseImputer): """ Impute missing values for categorical columns with '!missing!' 
+    (In case of numpy data, the constant value is set to -1, under
+    the assumption that categorical data is fit with an Ordinal Encoder)
     """

     def __init__(self,
@@ -41,7 +43,13 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseImputer:
         if len(X['dataset_properties']['categorical_columns']) != 0:
             if self.categorical_strategy == 'constant_!missing!':
                 self.preprocessor['categorical'] = SklearnSimpleImputer(strategy='constant',
-                                                                        fill_value='!missing!',
+                                                                        # Train data is numpy
+                                                                        # as of this point, and
+                                                                        # Ordinal Encoding is used
+                                                                        # for categoricals; only
+                                                                        # numbers are allowed
+                                                                        # fill_value='!missing!',
+                                                                        fill_value=-1,
                                                                         copy=False)
             else:
                 self.preprocessor['categorical'] = SklearnSimpleImputer(strategy=self.categorical_strategy,
diff --git a/autoPyTorch/pipeline/components/training/data_loader/feature_data_loader.py b/autoPyTorch/pipeline/components/training/data_loader/feature_data_loader.py
index 3cde4c3c9..985117b87 100644
--- a/autoPyTorch/pipeline/components/training/data_loader/feature_data_loader.py
+++ b/autoPyTorch/pipeline/components/training/data_loader/feature_data_loader.py
@@ -11,6 +11,16 @@
 from autoPyTorch.pipeline.components.training.data_loader.base_data_loader import BaseDataLoaderComponent


+class ExpandTransform(object):
+    """Expand dimensionality so tabular transformations see
+    a 2d array
+    """
+    def __call__(self, data: np.ndarray) -> np.ndarray:
+        if len(data.shape) <= 1:
+            data = np.expand_dims(data, axis=0)
+        return data
+
+
 class ContractTransform(object):
     """Reverses the effect of ExpandTransform"""
     def __call__(self, data: np.ndarray) -> np.ndarray:
@@ -63,6 +73,7 @@ def build_transform(self, X: Dict[str, Any], mode: str) -> torchvision.transform
         candidate_transformations = []  # type: List[Callable]

         if 'test' in mode or not X['dataset_properties']['is_small_preprocess']:
+            candidate_transformations.append((ExpandTransform()))
             candidate_transformations.extend(X['preprocess_transforms'])
             candidate_transformations.append((ContractTransform()))

diff --git a/autoPyTorch/pipeline/create_searchspace_util.py b/autoPyTorch/pipeline/create_searchspace_util.py
index ca5710498..f50d70f4d 100644
--- a/autoPyTorch/pipeline/create_searchspace_util.py
+++ b/autoPyTorch/pipeline/create_searchspace_util.py
@@ -185,11 +185,6 @@ def add_forbidden(
                               product[idx - start_idx] + 1) for idx in
                        range(len(matches.shape)))

-            # This prints the affected nodes
-            # print [node_choice_names[i][product[i]]
-            #        for i in range(len(product))], \
-            #     np.sum(matches[slices])
-
             if np.sum(matches[slices]) == 0:
                 constraint = tuple([(node_names[i],
                                      node_choice_names[i][product[i]])
diff --git a/autoPyTorch/pipeline/image_classification.py b/autoPyTorch/pipeline/image_classification.py
index 108594c5e..b31c8dbf2 100644
--- a/autoPyTorch/pipeline/image_classification.py
+++ b/autoPyTorch/pipeline/image_classification.py
@@ -61,34 +61,6 @@ def __init__(
             config, steps, dataset_properties, include, exclude,
             random_state, init_params, search_space_updates)

-    def fit_transformer(
-        self,
-        X: np.ndarray,
-        y: np.ndarray,
-        fit_params: Optional[Dict[str, Any]] = None
-    ) -> Tuple[np.ndarray, Optional[Dict[str, Any]]]:
-        """Fits the pipeline given a training (X,y) pair
-
-        Args:
-            X (np.ndarray): features from which to guess targets
-            y (np.ndarray): classification targets for this task
-            fit_params (Optional[Dict[str, Any]]]): handy communication dictionary,
-                so that inter-stages of the pipeline can share information
-
-        Returns:
-            np.ndarray: the transformed features
-            Optional[Dict[str, Any]]]: A
dictionary to share fit informations - within the pipeline stages - """ - - if fit_params is None: - fit_params = {} - - X, fit_params = super().fit_transformer( - X, y, fit_params=fit_params) - - return X, fit_params - def predict_proba(self, X: np.ndarray, batch_size: Optional[int] = None) -> np.ndarray: """predict_proba. diff --git a/autoPyTorch/pipeline/tabular_classification.py b/autoPyTorch/pipeline/tabular_classification.py index 3540d9660..ec80b4a5c 100644 --- a/autoPyTorch/pipeline/tabular_classification.py +++ b/autoPyTorch/pipeline/tabular_classification.py @@ -75,34 +75,6 @@ def __init__( config, steps, dataset_properties, include, exclude, random_state, init_params, search_space_updates) - def fit_transformer( - self, - X: np.ndarray, - y: np.ndarray, - fit_params: Optional[Dict[str, Any]] = None - ) -> Tuple[np.ndarray, Optional[Dict[str, Any]]]: - """Fits the pipeline given a training (X,y) pair - - Args: - X (np.ndarray): features from which to guess targets - y (np.ndarray): classification targets for this task - fit_params (Optional[Dict[str, Any]]]): handy communication dictionary, - so that inter-stages of the pipeline can share information - - Returns: - np.ndarray: the transformed features - Optional[Dict[str, Any]]]: A dictionary to share fit informations - within the pipeline stages - """ - - if fit_params is None: - fit_params = {} - - X, fit_params = super().fit_transformer( - X, y, fit_params=fit_params) - - return X, fit_params - def _predict_proba(self, X: np.ndarray) -> np.ndarray: # Pre-process X loader = self.named_steps['data_loader'].get_loader(X=X) diff --git a/autoPyTorch/pipeline/tabular_regression.py b/autoPyTorch/pipeline/tabular_regression.py index 02a668592..40645223f 100644 --- a/autoPyTorch/pipeline/tabular_regression.py +++ b/autoPyTorch/pipeline/tabular_regression.py @@ -73,34 +73,6 @@ def __init__( config, steps, dataset_properties, include, exclude, random_state, init_params, search_space_updates) - def fit_transformer( - self, - X: np.ndarray, - y: np.ndarray, - fit_params: Optional[Dict[str, Any]] = None - ) -> Tuple[np.ndarray, Optional[Dict[str, Any]]]: - """Fits the pipeline given a training (X,y) pair - - Args: - X (np.ndarray): features from which to guess targets - y (np.ndarray): classification targets for this task - fit_params (Optional[Dict[str, Any]]]): handy communication dictionary, - so that inter-stages of the pipeline can share information - - Returns: - np.ndarray: the transformed features - Optional[Dict[str, Any]]]: A dictionary to share fit informations - within the pipeline stages - """ - - if fit_params is None: - fit_params = {} - - X, fit_params = super().fit_transformer( - X, y, fit_params=fit_params) - - return X, fit_params - def score(self, X: np.ndarray, y: np.ndarray, batch_size: Optional[int] = None) -> np.ndarray: """score. 
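Note on the removals above: the per-pipeline fit_transformer overrides are now redundant, since BasePipeline.fit() (autoPyTorch/pipeline/base_pipeline.py, earlier in this diff) performs the fit_transformer()/fit_estimator() split for every pipeline. A minimal standalone sketch of the parameter-key normalization that split relies on; the helper name and the step/parameter names are illustrative only, not AutoPyTorch API:

    # AutoPyTorch configurations address step parameters as 'step:param',
    # while sklearn.pipeline.Pipeline routes fit parameters as 'step__param'.
    def normalize_fit_params(fit_params: dict) -> dict:
        return {key.replace(':', '__'): value for key, value in fit_params.items()}

    print(normalize_fit_params({'data_loader:batch_size': 64}))
    # -> {'data_loader__batch_size': 64}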
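The encoder and imputer changes above lean on the scikit-learn >= 0.24 behavior pinned in requirements.txt below: OrdinalEncoder can map categories unseen at fit time to a sentinel instead of raising. A small standalone sketch of that behavior (plain scikit-learn, not AutoPyTorch code):

    import numpy as np
    from sklearn.preprocessing import OrdinalEncoder

    # With handle_unknown='use_encoded_value' (new in scikit-learn 0.24),
    # unknown categories at transform() time become unknown_value.
    enc = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
    enc.fit(np.array([['a'], ['b']]))
    print(enc.transform(np.array([['a'], ['c']])))
    # -> [[ 0.] [-1.]]; 'c' was never seen during fit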
diff --git a/autoPyTorch/pipeline/traditional_tabular_classification.py b/autoPyTorch/pipeline/traditional_tabular_classification.py index 3ac29efc1..319227e7f 100644 --- a/autoPyTorch/pipeline/traditional_tabular_classification.py +++ b/autoPyTorch/pipeline/traditional_tabular_classification.py @@ -38,34 +38,6 @@ def __init__( config, steps, dataset_properties, include, exclude, random_state, init_params) - def fit_transformer( - self, - X: np.ndarray, - y: np.ndarray, - fit_params: Optional[Dict[str, Any]] = None - ) -> Tuple[np.ndarray, Optional[Dict[str, Any]]]: - """Fits the pipeline given a training (X,y) pair - - Args: - X (np.ndarray): features from which to guess targets - y (np.ndarray): classification targets for this task - fit_params (Optional[Dict[str, Any]]]): handy communication dictionary, - so that inter-stages of the pipeline can share information - - Returns: - np.ndarray: the transformed features - Optional[Dict[str, Any]]]: A dictionary to share fit informations - within the pipeline stages - """ - - if fit_params is None: - fit_params = {} - - X, fit_params = super().fit_transformer( - X, y, fit_params=fit_params) - - return X, fit_params - def predict(self, X: np.ndarray, batch_size: Optional[int] = None ) -> np.ndarray: """Predict the output using the selected model. diff --git a/examples/example_tabular_classification.py b/examples/example_tabular_classification.py index e34fc85ea..3713ffb96 100644 --- a/examples/example_tabular_classification.py +++ b/examples/example_tabular_classification.py @@ -8,7 +8,6 @@ """ import os import tempfile as tmp -import typing import warnings os.environ['JOBLIB_TEMP_FOLDER'] = tmp.gettempdir() @@ -23,29 +22,9 @@ import sklearn.model_selection from autoPyTorch.api.tabular_classification import TabularClassificationTask -from autoPyTorch.datasets.tabular_dataset import TabularDataset from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates -# Get the training data for tabular classification -def get_data_to_train() -> typing.Tuple[typing.Any, typing.Any, typing.Any, typing.Any]: - """ - This function returns a fit dictionary that within itself, contains all - the information to fit a pipeline - """ - - # Get the training data for tabular classification - # Move to Australian to showcase numerical vs categorical - X, y = sklearn.datasets.fetch_openml(data_id=40981, return_X_y=True, as_frame=True) - X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split( - X, - y, - random_state=1, - ) - - return X_train, X_test, y_train, y_test - - def get_search_space_updates(): """ Search space updates to the task can be added using HyperparameterSearchSpaceUpdates @@ -72,10 +51,12 @@ def get_search_space_updates(): ############################################################################ # Data Loading # ============ - X_train, X_test, y_train, y_test = get_data_to_train() - datamanager = TabularDataset( - X=X_train, Y=y_train, - X_test=X_test, Y_test=y_test) + X, y = sklearn.datasets.fetch_openml(data_id=40981, return_X_y=True, as_frame=True) + X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split( + X, + y, + random_state=1, + ) ############################################################################ # Build and fit a classifier @@ -85,10 +66,13 @@ def get_search_space_updates(): search_space_updates=get_search_space_updates() ) api.search( - dataset=datamanager, + X_train=X_train, + y_train=y_train, + X_test=X_test.copy(), + y_test=y_test.copy(), 
optimize_metric='accuracy', total_walltime_limit=500, - func_eval_time_limit=150 + func_eval_time_limit=50 ) ############################################################################ diff --git a/requirements.txt b/requirements.txt index cced8bcf4..c771c891f 100755 --- a/requirements.txt +++ b/requirements.txt @@ -2,7 +2,7 @@ pandas torch torchvision tensorboard -scikit-learn>=0.22.0,<0.23 +scikit-learn>=0.24.0,<0.25.0 numpy scipy lockfile diff --git a/test/conftest.py b/test/conftest.py index d16d40546..5b63f33ad 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -12,6 +12,7 @@ from sklearn.datasets import fetch_openml, make_classification +from autoPyTorch.data.tabular_validator import TabularInputValidator from autoPyTorch.datasets.tabular_dataset import TabularDataset from autoPyTorch.utils.backend import create from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates @@ -166,8 +167,10 @@ def fit_dictionary_numerical_only(backend): random_state=0 ) X = X.astype('float64') + validator = TabularInputValidator(is_classification=True).fit(X.copy(), y.copy()) datamanager = TabularDataset( X=X, Y=y, + validator=validator, X_test=X, Y_test=y, ) @@ -175,8 +178,8 @@ def fit_dictionary_numerical_only(backend): dataset_properties = datamanager.get_dataset_properties(get_dataset_requirements(info)) fit_dictionary = { - 'X_train': X, - 'y_train': y, + 'X_train': datamanager.train_tensors[0], + 'y_train': datamanager.train_tensors[1], 'train_indices': datamanager.splits[0][0], 'val_indices': datamanager.splits[0][1], 'dataset_properties': dataset_properties, @@ -204,16 +207,18 @@ def fit_dictionary_categorical_only(backend): X = X[categorical_columns] X = X.iloc[0:200] y = y.iloc[0:200] + validator = TabularInputValidator(is_classification=True).fit(X.copy(), y.copy()) datamanager = TabularDataset( X=X, Y=y, + validator=validator, X_test=X, Y_test=y, ) info = datamanager.get_required_dataset_info() dataset_properties = datamanager.get_dataset_properties(get_dataset_requirements(info)) fit_dictionary = { - 'X_train': X, - 'y_train': y, + 'X_train': datamanager.train_tensors[0], + 'y_train': datamanager.train_tensors[1], 'train_indices': datamanager.splits[0][0], 'val_indices': datamanager.splits[0][1], 'dataset_properties': dataset_properties, @@ -230,10 +235,6 @@ def fit_dictionary_categorical_only(backend): 'split_id': 0, 'backend': backend, } - datamanager = TabularDataset( - X=X, Y=y, - X_test=X, Y_test=y, - ) backend.save_datamanager(datamanager) return fit_dictionary @@ -243,8 +244,10 @@ def fit_dictionary_num_and_categorical(backend): X, y = fetch_openml(data_id=40981, return_X_y=True, as_frame=True) X = X.iloc[0:200] y = y.iloc[0:200] + validator = TabularInputValidator(is_classification=True).fit(X.copy(), y.copy()) datamanager = TabularDataset( X=X, Y=y, + validator=validator, X_test=X, Y_test=y, ) info = datamanager.get_required_dataset_info() @@ -252,8 +255,8 @@ def fit_dictionary_num_and_categorical(backend): dataset_properties = datamanager.get_dataset_properties(get_dataset_requirements(info)) fit_dictionary = { - 'X_train': X, - 'y_train': y, + 'X_train': datamanager.train_tensors[0], + 'y_train': datamanager.train_tensors[1], 'train_indices': datamanager.splits[0][0], 'val_indices': datamanager.splits[0][1], 'dataset_properties': dataset_properties, diff --git a/test/test_api/test_api.py b/test/test_api/test_api.py index ce9a88e2e..cdd22882d 100644 --- a/test/test_api/test_api.py +++ b/test/test_api/test_api.py @@ -1,5 +1,6 @@ import 
os import pickle +import sys import numpy as np @@ -17,7 +18,6 @@ CrossValTypes, HoldoutValTypes, ) -from autoPyTorch.datasets.tabular_dataset import TabularDataset # Fixtures @@ -39,32 +39,35 @@ def test_classification(openml_id, resampling_strategy, backend): ) X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split( X, y, random_state=1) - datamanager = TabularDataset( - X=X_train, Y=y_train, - X_test=X_test, Y_test=y_test, + + # Search for a good configuration + estimator = TabularClassificationTask( + backend=backend, resampling_strategy=resampling_strategy, - dataset_name=str(openml_id), ) - assert datamanager.task_type == 'tabular_classification' - expected_num_splits = 1 if resampling_strategy == HoldoutValTypes.holdout_validation else 3 - assert len(datamanager.splits) == expected_num_splits - # Search for a good configuration - estimator = TabularClassificationTask(backend=backend) estimator.search( - dataset=datamanager, + X_train=X_train, y_train=y_train, + X_test=X_test, y_test=y_test, optimize_metric='accuracy', total_walltime_limit=150, func_eval_time_limit=50, traditional_per_total_budget=0 ) + # Internal dataset has expected settings + assert estimator.dataset.task_type == 'tabular_classification' + expected_num_splits = 1 if resampling_strategy == HoldoutValTypes.holdout_validation else 3 + assert estimator.resampling_strategy == resampling_strategy + assert estimator.dataset.resampling_strategy == resampling_strategy + assert len(estimator.dataset.splits) == expected_num_splits + # TODO: check for budget # Check for the created files tmp_dir = estimator._backend.temporary_directory loaded_datamanager = estimator._backend.load_datamanager() - assert len(loaded_datamanager.train_tensors) == len(datamanager.train_tensors) + assert len(loaded_datamanager.train_tensors) == len(estimator.dataset.train_tensors) expected_files = [ 'smac3-output/run_1/configspace.json', @@ -86,7 +89,7 @@ def test_classification(openml_id, resampling_strategy, backend): # Check that smac was able to find proper models succesful_runs = [run_value.status for run_value in estimator.run_history.data.values( ) if 'SUCCESS' in str(run_value.status)] - assert len(succesful_runs) > 1, estimator.run_history.data.items() + assert len(succesful_runs) > 1, [(k, v) for k, v in estimator.run_history.data.items()] # Search for an existing run key in disc. 
A individual model might have
    # a timeout and hence was not written to disc
@@ -159,11 +162,14 @@ def test_classification(openml_id, resampling_strategy, backend):

     # Check that we can pickle
     # Test pickle
-    dump_file = os.path.join(estimator._backend.temporary_directory, 'dump.pkl')
+    # Pickling is only exercised on Python >= 3.7,
+    # as older Pythons do not control the state of the logger
+    if sys.version_info >= (3, 7):
+        dump_file = os.path.join(estimator._backend.temporary_directory, 'dump.pkl')

-    with open(dump_file, 'wb') as f:
-        pickle.dump(estimator, f)
+        with open(dump_file, 'wb') as f:
+            pickle.dump(estimator, f)

-    with open(dump_file, 'rb') as f:
-        restored_estimator = pickle.load(f)
-    restored_estimator.predict(X_test)
+        with open(dump_file, 'rb') as f:
+            restored_estimator = pickle.load(f)
+        restored_estimator.predict(X_test)
diff --git a/test/test_data/__init__.py b/test/test_data/__init__.py
new file mode 100644
index 000000000..cc3cd7bec
--- /dev/null
+++ b/test/test_data/__init__.py
@@ -0,0 +1,2 @@
+# -*- encoding: utf-8 -*-
+__author__ = 'feurerm'
diff --git a/test/test_data/test_feature_validator.py b/test/test_data/test_feature_validator.py
new file mode 100644
index 000000000..94a0c8ea4
--- /dev/null
+++ b/test/test_data/test_feature_validator.py
@@ -0,0 +1,524 @@
+import copy
+import random
+
+import numpy as np
+
+import pandas as pd
+
+import pytest
+
+from scipy import sparse
+
+import sklearn.datasets
+import sklearn.model_selection
+
+from autoPyTorch.data.tabular_feature_validator import TabularFeatureValidator
+
+
+# Fixtures to be used in this file. By default all elements have 100 datapoints
+@pytest.fixture
+def input_data_featuretest(request):
+    if request.param == 'numpy_categoricalonly_nonan':
+        return np.random.randint(10, size=(100, 10))
+    elif request.param == 'numpy_numericalonly_nonan':
+        return np.random.uniform(10, size=(100, 10))
+    elif request.param == 'numpy_mixed_nonan':
+        return np.column_stack([
+            np.random.uniform(10, size=(100, 3)),
+            np.random.randint(10, size=(100, 3)),
+            np.random.uniform(10, size=(100, 3)),
+            np.random.randint(10, size=(100, 1)),
+        ])
+    elif request.param == 'numpy_string_nonan':
+        return np.array([
+            ['a', 'b', 'c', 'a', 'b', 'c'],
+            ['a', 'b', 'd', 'r', 'b', 'c'],
+        ])
+    elif request.param == 'numpy_categoricalonly_nan':
+        array = np.random.randint(10, size=(100, 10)).astype('float')
+        array[50, 0:5] = np.nan
+        return array
+    elif request.param == 'numpy_numericalonly_nan':
+        array = np.full(fill_value=10.0, shape=(100, 10), dtype=np.float64)
+        array[50, 0:5] = np.nan
+        # Somehow array is changed to dtype object after np.nan
+        return array.astype('float')
+    elif request.param == 'numpy_mixed_nan':
+        array = np.column_stack([
+            np.random.uniform(10, size=(100, 3)),
+            np.random.randint(10, size=(100, 3)),
+            np.random.uniform(10, size=(100, 3)),
+            np.random.randint(10, size=(100, 1)),
+        ])
+        array[50, 0:5] = np.nan
+        return array
+    elif request.param == 'numpy_string_nan':
+        return np.array([
+            ['a', 'b', 'c', 'a', 'b', 'c'],
+            [np.nan, 'b', 'd', 'r', 'b', 'c'],
+        ])
+    elif request.param == 'pandas_categoricalonly_nonan':
+        return pd.DataFrame([
+            {'A': 1, 'B': 2},
+            {'A': 3, 'B': 4},
+        ], dtype='category')
+    elif request.param == 'pandas_numericalonly_nonan':
+        return pd.DataFrame([
+            {'A': 1, 'B': 2},
+            {'A': 3, 'B': 4},
+        ], dtype='float')
+    elif request.param == 'pandas_mixed_nonan':
+        frame = pd.DataFrame([
+            {'A': 1, 'B': 2},
+            {'A': 3, 'B': 4},
+        ], dtype='category')
+        frame['B'] = pd.to_numeric(frame['B'])
+        return frame
+    elif request.param == 'pandas_categoricalonly_nan':
+        return pd.DataFrame([
+            {'A': 1, 'B': 2, 'C': np.nan},
+            {'A': 3, 'C': np.nan},
+        ], dtype='category')
+    elif request.param == 'pandas_numericalonly_nan':
+        return pd.DataFrame([
+            {'A': 1, 'B': 2, 'C': np.nan},
+            {'A': 3, 'C': np.nan},
+        ], dtype='float')
+    elif request.param == 'pandas_mixed_nan':
+        frame = pd.DataFrame([
+            {'A': 1, 'B': 2, 'C': 8},
+            {'A': 3, 'B': 4},
+        ], dtype='category')
+        frame['B'] = pd.to_numeric(frame['B'])
+        return frame
+    elif request.param == 'pandas_string_nonan':
+        return pd.DataFrame([
+            {'A': 1, 'B': 2},
+            {'A': 3, 'B': 4},
+        ], dtype='string')
+    elif request.param == 'list_categoricalonly_nonan':
+        return [
+            ['a', 'b', 'c', 'd'],
+            ['e', 'f', 'c', 'd'],
+        ]
+    elif request.param == 'list_numericalonly_nonan':
+        return [
+            [1, 2, 3, 4],
+            [5, 6, 7, 8]
+        ]
+    elif request.param == 'list_mixed_nonan':
+        return [
+            ['a', 2, 3, 4],
+            ['b', 6, 7, 8]
+        ]
+    elif request.param == 'list_categoricalonly_nan':
+        return [
+            ['a', 'b', 'c', np.nan],
+            ['e', 'f', 'c', 'd'],
+        ]
+    elif request.param == 'list_numericalonly_nan':
+        return [
+            [1, 2, 3, np.nan],
+            [5, 6, 7, 8]
+        ]
+    elif request.param == 'list_mixed_nan':
+        return [
+            ['a', np.nan, 3, 4],
+            ['b', 6, 7, 8]
+        ]
+    elif 'sparse' in request.param:
+        # We expect the names to be of the type sparse_csc_nonan
+        sparse_, type_, nan_ = request.param.split('_')
+        if 'nonan' in nan_:
+            data = np.ones(3)
+        else:
+            data = np.array([1, 2, np.nan])
+
+        # Then the type of sparse
+        row_ind = np.array([0, 1, 2])
+        col_ind = np.array([1, 2, 1])
+        if 'csc' in type_:
+            return sparse.csc_matrix((data, (row_ind, col_ind)))
+        elif 'csr' in type_:
+            return sparse.csr_matrix((data, (row_ind, col_ind)))
+        elif 'coo' in type_:
+            return sparse.coo_matrix((data, (row_ind, col_ind)))
+        elif 'bsr' in type_:
+            return sparse.bsr_matrix((data, (row_ind, col_ind)))
+        elif 'lil' in type_:
+            return sparse.lil_matrix((data))
+        elif 'dok' in type_:
+            return sparse.dok_matrix(np.vstack((data, data, data)))
+        elif 'dia' in type_:
+            return sparse.dia_matrix(np.vstack((data, data, data)))
+        else:
+            raise ValueError("Unsupported indirect fixture {}".format(request.param))
+    elif 'openml' in request.param:
+        _, openml_id = request.param.split('_')
+        X, y = sklearn.datasets.fetch_openml(data_id=int(openml_id),
+                                             return_X_y=True, as_frame=True)
+        return X
+    else:
+        raise ValueError("Unsupported indirect fixture {}".format(request.param))
+
+
+# Actual checks for the features
+@pytest.mark.parametrize(
+    'input_data_featuretest',
+    (
+        'numpy_categoricalonly_nonan',
+        'numpy_numericalonly_nonan',
+        'numpy_mixed_nonan',
+        'numpy_categoricalonly_nan',
+        'numpy_numericalonly_nan',
+        'numpy_mixed_nan',
+        'pandas_categoricalonly_nonan',
+        'pandas_numericalonly_nonan',
+        'pandas_mixed_nonan',
+        'pandas_numericalonly_nan',
+        'list_numericalonly_nonan',
+        'list_numericalonly_nan',
+        'sparse_bsr_nonan',
+        'sparse_bsr_nan',
+        'sparse_coo_nonan',
+        'sparse_coo_nan',
+        'sparse_csc_nonan',
+        'sparse_csc_nan',
+        'sparse_csr_nonan',
+        'sparse_csr_nan',
+        'sparse_dia_nonan',
+        'sparse_dia_nan',
+        'sparse_dok_nonan',
+        'sparse_dok_nan',
+        'sparse_lil_nonan',
+        'sparse_lil_nan',
+        'openml_40981',  # Australian
+    ),
+    indirect=True
+)
+def test_featurevalidator_supported_types(input_data_featuretest):
+    validator = TabularFeatureValidator()
+    validator.fit(input_data_featuretest, input_data_featuretest)
+    transformed_X = validator.transform(input_data_featuretest)
+    if sparse.issparse(input_data_featuretest):
+        assert
sparse.issparse(transformed_X) + else: + assert isinstance(transformed_X, np.ndarray) + assert np.shape(input_data_featuretest) == np.shape(transformed_X) + assert np.issubdtype(transformed_X.dtype, np.number) + assert validator._is_fitted + + +@pytest.mark.parametrize( + 'input_data_featuretest', + ( + 'numpy_string_nonan', + 'numpy_string_nan', + ), + indirect=True +) +def test_featurevalidator_unsupported_numpy(input_data_featuretest): + validator = TabularFeatureValidator() + with pytest.raises(ValueError, match=r".*When providing a numpy array.*not supported."): + validator.fit(input_data_featuretest) + + +@pytest.mark.parametrize( + 'input_data_featuretest', + ( + 'pandas_categoricalonly_nan', + 'pandas_mixed_nan', + 'openml_179', # adult workclass has NaN in columns + ), + indirect=True +) +def test_featurevalidator_unsupported_pandas(input_data_featuretest): + validator = TabularFeatureValidator() + with pytest.raises(ValueError, match=r"Categorical features in a dataframe.*missing/NaN"): + validator.fit(input_data_featuretest) + + +@pytest.mark.parametrize( + 'input_data_featuretest', + ( + 'numpy_categoricalonly_nonan', + 'numpy_mixed_nonan', + 'numpy_categoricalonly_nan', + 'numpy_mixed_nan', + 'pandas_categoricalonly_nonan', + 'pandas_mixed_nonan', + 'list_numericalonly_nonan', + 'list_numericalonly_nan', + 'sparse_bsr_nonan', + 'sparse_bsr_nan', + 'sparse_coo_nonan', + 'sparse_coo_nan', + 'sparse_csc_nonan', + 'sparse_csc_nan', + 'sparse_csr_nonan', + 'sparse_csr_nan', + 'sparse_dia_nonan', + 'sparse_dia_nan', + 'sparse_dok_nonan', + 'sparse_dok_nan', + 'sparse_lil_nonan', + ), + indirect=True +) +def test_featurevalidator_fitontypeA_transformtypeB(input_data_featuretest): + """ + Check if we can fit in a given type (numpy) yet transform + if the user changes the type (pandas then) + + This is problematic only in the case we create an encoder + """ + validator = TabularFeatureValidator() + validator.fit(input_data_featuretest, input_data_featuretest) + if isinstance(input_data_featuretest, pd.DataFrame): + pytest.skip("Column order change in pandas is not supported") + elif isinstance(input_data_featuretest, np.ndarray): + complementary_type = pd.DataFrame(input_data_featuretest) + elif isinstance(input_data_featuretest, list): + complementary_type = pd.DataFrame(input_data_featuretest) + elif sparse.issparse(input_data_featuretest): + complementary_type = sparse.csr_matrix(input_data_featuretest.todense()) + else: + raise ValueError(type(input_data_featuretest)) + transformed_X = validator.transform(complementary_type) + assert np.shape(input_data_featuretest) == np.shape(transformed_X) + assert np.issubdtype(transformed_X.dtype, np.number) + assert validator._is_fitted + + +def test_featurevalidator_get_columns_to_encode(): + """ + Makes sure that encoded columns are returned by _get_columns_to_encode + whereas numerical columns are not returned + """ + validator = TabularFeatureValidator() + + df = pd.DataFrame([ + {'int': 1, 'float': 1.0, 'category': 'one', 'bool': True}, + {'int': 2, 'float': 2.0, 'category': 'two', 'bool': False}, + ]) + + for col in df.columns: + df[col] = df[col].astype(col) + + enc_columns, feature_types = validator._get_columns_to_encode(df) + + assert enc_columns == ['category', 'bool'] + assert feature_types == ['numerical', 'numerical', 'categorical', 'categorical'] + + +def test_features_unsupported_calls_are_raised(): + """ + Makes sure we raise a proper message to the user, + when providing not supported data input or using the validator in a 
way that is not + expected + """ + validator = TabularFeatureValidator() + with pytest.raises(ValueError, match=r"AutoPyTorch does not support time"): + validator.fit( + pd.DataFrame({'datetime': [pd.Timestamp('20180310')]}) + ) + with pytest.raises(ValueError, match="has invalid type object"): + validator.fit( + pd.DataFrame({'string': [TabularFeatureValidator()]}) + ) + with pytest.raises(ValueError, match=r"AutoPyTorch only supports.*yet, the provided input"): + validator.fit({'input1': 1, 'input2': 2}) + with pytest.raises(ValueError, match=r"has unsupported dtype string"): + validator.fit(pd.DataFrame([{'A': 1, 'B': 2}], dtype='string')) + with pytest.raises(ValueError, match=r"The feature dimensionality of the train and test"): + validator.fit(X_train=np.array([[1, 2, 3], [4, 5, 6]]), + X_test=np.array([[1, 2, 3, 4], [4, 5, 6, 7]]), + ) + with pytest.raises(ValueError, match=r"Cannot call transform on a validator that is not fit"): + validator.transform(np.array([[1, 2, 3], [4, 5, 6]])) + + +@pytest.mark.parametrize( + 'input_data_featuretest', + ( + 'numpy_numericalonly_nonan', + 'numpy_numericalonly_nan', + 'pandas_numericalonly_nonan', + 'pandas_numericalonly_nan', + 'list_numericalonly_nonan', + 'list_numericalonly_nan', + # Category in numpy is handled via feat_type + 'numpy_categoricalonly_nonan', + 'numpy_mixed_nonan', + 'numpy_categoricalonly_nan', + 'numpy_mixed_nan', + 'sparse_bsr_nonan', + 'sparse_bsr_nan', + 'sparse_coo_nonan', + 'sparse_coo_nan', + 'sparse_csc_nonan', + 'sparse_csc_nan', + 'sparse_csr_nonan', + 'sparse_csr_nan', + 'sparse_dia_nonan', + 'sparse_dia_nan', + 'sparse_dok_nonan', + 'sparse_dok_nan', + 'sparse_lil_nonan', + 'sparse_lil_nan', + ), + indirect=True +) +def test_no_encoder_created(input_data_featuretest): + """ + Makes sure that for numerical only features, no encoder is created + """ + validator = TabularFeatureValidator() + validator.fit(input_data_featuretest) + validator.transform(input_data_featuretest) + assert validator.encoder is None + + +@pytest.mark.parametrize( + 'input_data_featuretest', + ( + 'pandas_categoricalonly_nonan', + 'pandas_mixed_nonan', + ), + indirect=True +) +def test_encoder_created(input_data_featuretest): + """ + This test ensures an encoder is created if categorical data is provided + """ + validator = TabularFeatureValidator() + validator.fit(input_data_featuretest) + transformed_X = validator.transform(input_data_featuretest) + assert validator.encoder is not None + + # Make sure that the encoded features are actually encoded. Categorical columns are at + # the start after transformation. In our fixtures, this is also honored prior encode + enc_columns, feature_types = validator._get_columns_to_encode(input_data_featuretest) + + # At least one categorical + assert 'categorical' in validator.feat_type + + # Numerical if the original data has numerical only columns + if np.any([pd.api.types.is_numeric_dtype(input_data_featuretest[col] + ) for col in input_data_featuretest.columns]): + assert 'numerical' in validator.feat_type + for i, feat_type in enumerate(feature_types): + if 'numerical' in feat_type: + np.testing.assert_array_equal( + transformed_X[:, i], + input_data_featuretest[input_data_featuretest.columns[i]].to_numpy() + ) + elif 'categorical' in feat_type: + np.testing.assert_array_equal( + transformed_X[:, i], + # Expect always 0, 1... 
because we use an ordinal encoder
+                np.array([0, 1])
+            )
+        else:
+            raise ValueError(feat_type)
+
+
+def test_no_new_category_after_fit():
+    """
+    This test makes sure that we can actually pass new categories to the estimator
+    without throwing an error
+    """
+    # Then make sure we catch categorical extra categories
+    x = pd.DataFrame({'A': [1, 2, 3, 4], 'B': [5, 6, 7, 8]}, dtype='category')
+    validator = TabularFeatureValidator()
+    validator.fit(x)
+    x['A'] = x['A'].apply(lambda x: x * x)
+    validator.transform(x)
+
+
+def test_unknown_encode_value():
+    x = pd.DataFrame([
+        {'a': -41, 'b': -3, 'c': 'a', 'd': -987.2},
+        {'a': -21, 'b': -3, 'c': 'a', 'd': -9.2},
+        {'a': 0, 'b': -4, 'c': 'b', 'd': -97.2},
+        {'a': -51, 'b': -3, 'c': 'a', 'd': 987.2},
+        {'a': 500, 'b': -3, 'c': 'a', 'd': -92},
+    ])
+    x['c'] = x['c'].astype('category')
+    validator = TabularFeatureValidator()
+
+    # Make sure that this value is honored
+    validator.fit(x)
+    x['c'].cat.add_categories(['NA'], inplace=True)
+    x.loc[0, 'c'] = 'NA'  # unknown value
+    x_t = validator.transform(x)
+    # The first row should have a -1 as we added a new categorical there
+    expected_row = [-1, -41, -3, -987.2]
+    assert expected_row == x_t[0].tolist()
+
+    # Notice how there is only one column 'c' to encode
+    assert validator.categories == [list(range(2)) for i in range(1)]
+
+
+# Actual checks for the features
+@pytest.mark.parametrize(
+    'openml_id',
+    (
+        40981,  # Australian
+        3,  # kr-vs-kp
+        1468,  # cnae-9
+        40975,  # car
+        40984,  # Segment
+    ),
+)
+@pytest.mark.parametrize('train_data_type', ('numpy', 'pandas', 'list'))
+@pytest.mark.parametrize('test_data_type', ('numpy', 'pandas', 'list'))
+def test_featurevalidator_new_data_after_fit(openml_id,
+                                             train_data_type, test_data_type):
+
+    # Lists are currently not supported, as infer_objects
+    # casts list objects to type object
+    if train_data_type == 'list' or test_data_type == 'list':
+        pytest.skip()
+
+    validator = TabularFeatureValidator()
+
+    if train_data_type == 'numpy':
+        X, y = sklearn.datasets.fetch_openml(data_id=openml_id,
+                                             return_X_y=True, as_frame=False)
+    elif train_data_type == 'pandas':
+        X, y = sklearn.datasets.fetch_openml(data_id=openml_id,
+                                             return_X_y=True, as_frame=True)
+    else:
+        X, y = sklearn.datasets.fetch_openml(data_id=openml_id,
+                                             return_X_y=True, as_frame=True)
+        X = X.values.tolist()
+        y = y.values.tolist()
+
+    X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
+        X, y, random_state=1)
+
+    validator.fit(X_train)
+
+    transformed_X = validator.transform(X_test)
+
+    # Basic Checking
+    if sparse.issparse(X_test):
+        assert sparse.issparse(transformed_X)
+    else:
+        assert isinstance(transformed_X, np.ndarray)
+    assert np.shape(X_test) == np.shape(transformed_X)
+
+    # And then check proper error messages
+    if train_data_type == 'pandas':
+        old_dtypes = copy.deepcopy(validator.dtypes)
+        validator.dtypes = ['dummy' for dtype in X_train.dtypes]
+        with pytest.raises(ValueError, match=r"hanging the dtype of the features after fit"):
+            transformed_X = validator.transform(X_test)
+        validator.dtypes = old_dtypes
+    if test_data_type == 'pandas':
+        columns = X_test.columns.tolist()
+        random.shuffle(columns)
+        X_test = X_test[columns]
+        with pytest.raises(ValueError, match=r"Changing the column order of the features"):
+            transformed_X = validator.transform(X_test)
diff --git a/test/test_data/test_target_validator.py b/test/test_data/test_target_validator.py
new file mode 100644
index 000000000..aadc73416
--- /dev/null
+++ b/test/test_data/test_target_validator.py
@@ -0,0 +1,510 @@
+import numpy as np
+
+import pandas as pd
+from pandas.api.types import is_numeric_dtype
+
+import pytest
+
+
+from scipy import sparse
+
+import sklearn.datasets
+import sklearn.model_selection
+from sklearn.utils.multiclass import type_of_target
+
+from autoPyTorch.data.tabular_target_validator import TabularTargetValidator
+
+
+# Fixtures to be used in this file. By default all elements have 100 datapoints
+@pytest.fixture
+def input_data_targettest(request):
+    if request.param == 'series_binary':
+        return pd.Series([1, -1, -1, 1])
+    elif request.param == 'series_multiclass':
+        return pd.Series([1, 0, 2])
+    elif request.param == 'series_multilabel':
+        return pd.Series([[1, 0], [0, 1]])
+    elif request.param == 'series_continuous':
+        return pd.Series([0.1, 0.6, 0.7])
+    elif request.param == 'series_continuous-multioutput':
+        return pd.Series([[1.5, 2.0], [3.0, 1.6]])
+    elif request.param == 'pandas_binary':
+        return pd.DataFrame([1, -1, -1, 1])
+    elif request.param == 'pandas_multiclass':
+        return pd.DataFrame([1, 0, 2])
+    elif request.param == 'pandas_multilabel':
+        return pd.DataFrame([[1, 0], [0, 1]])
+    elif request.param == 'pandas_continuous':
+        return pd.DataFrame([0.1, 0.6, 0.7])
+    elif request.param == 'pandas_continuous-multioutput':
+        return pd.DataFrame([[1.5, 2.0], [3.0, 1.6]])
+    elif request.param == 'numpy_binary':
+        return np.array([1, -1, -1, 1])
+    elif request.param == 'numpy_multiclass':
+        return np.array([1, 0, 2])
+    elif request.param == 'numpy_multilabel':
+        return np.array([[1, 0], [0, 1]])
+    elif request.param == 'numpy_continuous':
+        return np.array([0.1, 0.6, 0.7])
+    elif request.param == 'numpy_continuous-multioutput':
+        return np.array([[1.5, 2.0], [3.0, 1.6]])
+    elif request.param == 'list_binary':
+        return [1, -1, -1, 1]
+    elif request.param == 'list_multiclass':
+        return [1, 0, 2]
+    elif request.param == 'list_multilabel':
+        return [[0, 1], [1, 0]]
+    elif request.param == 'list_continuous':
+        return [0.1, 0.6, 0.7]
+    elif request.param == 'list_continuous-multioutput':
+        return [[1.5, 2.0], [3.0, 1.6]]
+    elif 'openml' in request.param:
+        _, openml_id = request.param.split('_')
+        X, y = sklearn.datasets.fetch_openml(data_id=int(openml_id),
+                                             return_X_y=True, as_frame=True)
+        if len(y.shape) > 1 and y.shape[1] > 1 and np.any(y.eq('TRUE').any(1).to_numpy()):
+            # This 'if' is only triggered for multi-label data
+            # Force the downloaded data to be interpreted as multilabel
+            y = y.dropna()
+            y.replace('FALSE', 0, inplace=True)
+            y.replace('TRUE', 1, inplace=True)
+            y = y.astype(np.int)
+        return y
+    elif 'sparse' in request.param:
+        # We expect the names to be of the type sparse_csc_nonan
+        sparse_, type_, nan_ = request.param.split('_')
+        if 'nonan' in nan_:
+            data = np.ones(3)
+        else:
+            data = np.array([1, 2, np.nan])
+
+        # Then the type of sparse
+        if 'csc' in type_:
+            return sparse.csc_matrix(data)
+        elif 'csr' in type_:
+            return sparse.csr_matrix(data)
+        elif 'coo' in type_:
+            return sparse.coo_matrix(data)
+        elif 'bsr' in type_:
+            return sparse.bsr_matrix(data)
+        elif 'lil' in type_:
+            return sparse.lil_matrix(data)
+        elif 'dok' in type_:
+            return sparse.dok_matrix(np.vstack((data, data, data)))
+        elif 'dia' in type_:
+            return sparse.dia_matrix(np.vstack((data, data, data)))
+        else:
+            raise ValueError("Unsupported indirect fixture {}".format(request.param))
+    else:
+        raise ValueError("Unsupported indirect fixture {}".format(request.param))
+
+
+# Actual checks for the targets
+@pytest.mark.parametrize(
+    'input_data_targettest',
+    (
+        'series_binary',
+        'series_multiclass',
+        'series_continuous',
+        'pandas_binary',
+        'pandas_multiclass',
+        'pandas_multilabel',
+        'pandas_continuous',
+        'pandas_continuous-multioutput',
+        'numpy_binary',
+        'numpy_multiclass',
+        'numpy_multilabel',
+        'numpy_continuous',
+        'numpy_continuous-multioutput',
+        'list_binary',
+        'list_multiclass',
+        'list_multilabel',
+        'list_continuous',
+        'list_continuous-multioutput',
+        'sparse_bsr_nonan',
+        'sparse_coo_nonan',
+        'sparse_csc_nonan',
+        'sparse_csr_nonan',
+        'sparse_lil_nonan',
+        'openml_204',
+    ),
+    indirect=True
+)
+def test_targetvalidator_supported_types_noclassification(input_data_targettest):
+    validator = TabularTargetValidator(is_classification=False)
+    validator.fit(input_data_targettest)
+    transformed_y = validator.transform(input_data_targettest)
+    if sparse.issparse(input_data_targettest):
+        assert sparse.issparse(transformed_y)
+    else:
+        assert isinstance(transformed_y, np.ndarray)
+    expected_shape = np.shape(input_data_targettest)
+    if len(expected_shape) > 1 and expected_shape[1] == 1:
+        # The target should have (N,) dimensionality instead of (N, 1)
+        expected_shape = (expected_shape[0], )
+    assert expected_shape == np.shape(transformed_y)
+    assert np.issubdtype(transformed_y.dtype, np.number)
+    assert validator._is_fitted
+
+    # Because there is no classification, we do not expect an encoder
+    assert validator.encoder is None
+
+    if hasattr(input_data_targettest, "iloc"):
+        np.testing.assert_array_equal(
+            np.ravel(input_data_targettest.to_numpy()),
+            np.ravel(transformed_y)
+        )
+    elif sparse.issparse(input_data_targettest):
+        np.testing.assert_array_equal(
+            np.ravel(input_data_targettest.todense()),
+            np.ravel(transformed_y.todense())
+        )
+    else:
+        np.testing.assert_array_equal(
+            np.ravel(np.array(input_data_targettest)),
+            np.ravel(transformed_y)
+        )
+
+
+@pytest.mark.parametrize(
+    'input_data_targettest',
+    (
+        'series_binary',
+        'series_multiclass',
+        'pandas_binary',
+        'pandas_multiclass',
+        'numpy_binary',
+        'numpy_multiclass',
+        'list_binary',
+        'list_multiclass',
+        'sparse_bsr_nonan',
+        'sparse_coo_nonan',
+        'sparse_csc_nonan',
+        'sparse_csr_nonan',
+        'sparse_lil_nonan',
+        'openml_2',
+    ),
+    indirect=True
+)
+def test_targetvalidator_supported_types_classification(input_data_targettest):
+    validator = TabularTargetValidator(is_classification=True)
+    validator.fit(input_data_targettest)
+    transformed_y = validator.transform(input_data_targettest)
+    if sparse.issparse(input_data_targettest):
+        assert sparse.issparse(transformed_y)
+    else:
+        assert isinstance(transformed_y, np.ndarray)
+    expected_shape = np.shape(input_data_targettest)
+    if len(expected_shape) > 1 and expected_shape[1] == 1:
+        # The target should have (N,) dimensionality instead of (N, 1)
+        expected_shape = (expected_shape[0], )
+    assert expected_shape == np.shape(transformed_y)
+    assert np.issubdtype(transformed_y.dtype, np.number)
+    assert validator._is_fitted
+
+    # Because this is classification, we expect an encoder
+    if not sparse.issparse(input_data_targettest):
+        assert validator.encoder is not None
+
+        # The encoding should be per column
+        if len(transformed_y.shape) == 1:
+            assert np.min(transformed_y) == 0
+            assert np.max(transformed_y) == len(np.unique(transformed_y)) - 1
+        else:
+            for col in range(transformed_y.shape[1]):
+                assert np.min(transformed_y[:, col]) == 0
+                assert np.max(transformed_y[:, col]) == len(np.unique(transformed_y[:, col])) - 1
+
+    # Make sure we can perform inverse transform
+    y_inverse =
validator.inverse_transform(transformed_y) + if hasattr(input_data_targettest, 'dtype'): + # In case of numeric, we need to make sure dtype is preserved + if is_numeric_dtype(input_data_targettest.dtype): + assert y_inverse.dtype == input_data_targettest.dtype + # Then make sure every value is properly inverse-transformed + np.testing.assert_array_equal(np.array(y_inverse), np.array(input_data_targettest)) + elif hasattr(input_data_targettest, 'dtypes'): + if is_numeric_dtype(input_data_targettest.dtypes[0]): + assert y_inverse.dtype == input_data_targettest.dtypes[0] + # Then make sure every value is properly inverse-transformed + np.testing.assert_array_equal(np.array(y_inverse), + # pandas is always (N, 1) but targets are ravel() + input_data_targettest.to_numpy().reshape(-1)) + else: + # Sparse is not encoded, mainly because the sparse data is expected + # to be numpy of numerical type -- which currently does not require encoding + np.testing.assert_array_equal( + np.ravel(input_data_targettest.todense()), + np.ravel(transformed_y.todense()) + ) + + +@pytest.mark.parametrize( + 'input_data_targettest', + ( + 'series_binary', + 'pandas_binary', + 'numpy_binary', + 'list_binary', + 'openml_1066', + ), + indirect=True +) +def test_targetvalidator_binary(input_data_targettest): + assert type_of_target(input_data_targettest) == 'binary' + validator = TabularTargetValidator(is_classification=True) + # Test the X_test also! + validator.fit(input_data_targettest, input_data_targettest) + transformed_y = validator.transform(input_data_targettest) + assert type_of_target(transformed_y) == 'binary' + + +@pytest.mark.parametrize( + 'input_data_targettest', + ( + 'series_multiclass', + 'pandas_multiclass', + 'numpy_multiclass', + 'list_multiclass', + 'openml_54', + ), + indirect=True +) +def test_targetvalidator_multiclass(input_data_targettest): + assert type_of_target(input_data_targettest) == 'multiclass' + validator = TabularTargetValidator(is_classification=True) + # Test the X_test also! + validator.fit(input_data_targettest, input_data_targettest) + transformed_y = validator.transform(input_data_targettest) + assert type_of_target(transformed_y) == 'multiclass' + + +@pytest.mark.parametrize( + 'input_data_targettest', + ( + 'pandas_multilabel', + 'numpy_multilabel', + 'list_multilabel', + 'openml_40594', + ), + indirect=True +) +def test_targetvalidator_multilabel(input_data_targettest): + assert type_of_target(input_data_targettest) == 'multilabel-indicator' + validator = TabularTargetValidator(is_classification=True) + # Test the X_test also! + validator.fit(input_data_targettest, input_data_targettest) + transformed_y = validator.transform(input_data_targettest) + assert type_of_target(transformed_y) == 'multilabel-indicator' + + +@pytest.mark.parametrize( + 'input_data_targettest', + ( + 'series_continuous', + 'pandas_continuous', + 'numpy_continuous', + 'list_continuous', + 'openml_531', + ), + indirect=True +) +def test_targetvalidator_continuous(input_data_targettest): + assert type_of_target(input_data_targettest) == 'continuous' + validator = TabularTargetValidator(is_classification=False) + # Test the X_test also! 
+ validator.fit(input_data_targettest, input_data_targettest) + transformed_y = validator.transform(input_data_targettest) + assert type_of_target(transformed_y) == 'continuous' + + +@pytest.mark.parametrize( + 'input_data_targettest', + ( + 'pandas_continuous-multioutput', + 'numpy_continuous-multioutput', + 'list_continuous-multioutput', + 'openml_41483', + ), + indirect=True +) +def test_targetvalidator_continuous_multioutput(input_data_targettest): + assert type_of_target(input_data_targettest) == 'continuous-multioutput' + validator = TabularTargetValidator(is_classification=False) + # Test the X_test also! + validator.fit(input_data_targettest, input_data_targettest) + transformed_y = validator.transform(input_data_targettest) + assert type_of_target(transformed_y) == 'continuous-multioutput' + + +@pytest.mark.parametrize( + 'input_data_targettest', + ( + 'series_binary', + 'pandas_binary', + 'numpy_binary', + 'list_binary', + ), + indirect=True +) +def test_targetvalidator_fitontypeA_transformtypeB(input_data_targettest): + """ + Check if we can fit in a given type (numpy) yet transform + if the user changes the type (pandas then) + + This is problematic only in the case we create an encoder + """ + validator = TabularTargetValidator(is_classification=True) + validator.fit(input_data_targettest) + if isinstance(input_data_targettest, pd.DataFrame): + complementary_type = input_data_targettest.to_numpy() + elif isinstance(input_data_targettest, pd.Series): + complementary_type = pd.DataFrame(input_data_targettest) + elif isinstance(input_data_targettest, np.ndarray): + complementary_type = pd.DataFrame(input_data_targettest) + elif isinstance(input_data_targettest, list): + complementary_type = pd.DataFrame(input_data_targettest) + validator.transform(complementary_type) + + +@pytest.mark.parametrize( + 'input_data_targettest', + ( + 'series_multilabel', + 'series_continuous-multioutput', + ), + indirect=True +) +def test_type_of_target_unsupported(input_data_targettest): + """ + Makes sure we raise a proper message to the user, + when providing not supported data input + """ + validator = TabularTargetValidator() + with pytest.raises(ValueError, match=r"legacy multi-.* data representation."): + validator.fit(input_data_targettest) + + +def test_target_unsupported(): + """ + Makes sure we raise a proper message to the user, + when providing not supported data input + """ + validator = TabularTargetValidator(is_classification=True) + with pytest.raises(ValueError, match=r"The dimensionality of the train and test targets"): + validator.fit( + np.array([[0, 1, 0], [0, 1, 1]]), + np.array([[0, 1, 0, 0], [0, 1, 1, 1]]), + ) + with pytest.raises(ValueError, match=r"Train and test targets must both have the same dtypes"): + validator.fit( + pd.DataFrame({'a': [1, 2, 3]}), + pd.DataFrame({'a': [True, False, False]}), + ) + with pytest.raises(ValueError, match=r"Provided targets are not supported.*"): + validator.fit( + np.array([[0, 1, 2], [0, 3, 4]]), + np.array([[0, 1, 2, 5], [0, 3, 4, 6]]), + ) + with pytest.raises(ValueError, match="Train and test targets must both have the same"): + validator.fit( + pd.DataFrame({'string': ['foo']}), + pd.DataFrame({'int': [1]}), + ) + with pytest.raises(ValueError, match=r"AutoPyTorch only supports Numpy arrays, .*"): + validator.fit({'input1': 1, 'input2': 2}) + with pytest.raises(ValueError, match=r"arget values cannot contain missing/NaN values"): + validator.fit(np.array([np.nan, 1, 2])) + with pytest.raises(ValueError, match=r"arget values cannot 
contain missing/NaN values"): + validator.fit(sparse.csr_matrix(np.array([1, 2, np.nan]))) + with pytest.raises(ValueError, match=r"Cannot call transform on a validator that is not fit"): + validator.transform(np.array([1, 2, 3])) + with pytest.raises(ValueError, match=r"Cannot call inverse_transform on a validator that is"): + validator.inverse_transform(np.array([1, 2, 3])) + with pytest.raises(ValueError, match=r"Multi-dimensional classification is not yet supported"): + validator._fit(np.array([[1, 2, 3], [1, 5, 6]])) + + # Dia/ DOK are not supported as type of target makes calls len on the array + # which causes TypeError: len() of unsized object. Basically, sparse data as + # multi-label is the only thing that makes sense in this format. + with pytest.raises(ValueError, match=r"The provided data could not be interpreted by AutoPyTorch"): + validator.fit(sparse.dia_matrix(np.array([1, 2, 3]))) + + validator.fit(np.array([[0, 1, 0], [0, 1, 1]])) + with pytest.raises(ValueError, match=r"Number of outputs changed from"): + validator.fit(np.array([0, 1, 0])) + + +def test_targetvalidator_inversetransform(): + """ + Test that the encoding/decoding works in 1D + """ + validator = TabularTargetValidator(is_classification=True) + validator.fit( + pd.DataFrame(data=['a', 'a', 'b', 'c', 'a'], dtype='category'), + ) + y = validator.transform( + pd.DataFrame(data=['a', 'a', 'b', 'c', 'a'], dtype='category'), + ) + np.testing.assert_array_almost_equal(np.array([0, 0, 1, 2, 0]), y) + + y_decoded = validator.inverse_transform(y) + assert ['a', 'a', 'b', 'c', 'a'] == y_decoded.tolist() + + assert validator.classes_.tolist() == ['a', 'b', 'c'] + + validator = TabularTargetValidator(is_classification=True) + multi_label = pd.DataFrame( + np.array([[1, 0, 0, 1], [0, 0, 1, 1], [0, 0, 0, 0]]), + dtype=bool + ) + validator.fit(multi_label) + y = validator.transform(multi_label) + + y_decoded = validator.inverse_transform(y) + np.testing.assert_array_almost_equal(y, y_decoded) + + # Multilabel classification is not encoded + # For this reason, classes_ attribute does not contain a class + np.testing.assert_array_almost_equal(validator.classes_, np.array([])) + + +# Actual checks for the targets +@pytest.mark.parametrize( + 'input_data_targettest', + ( + 'series_binary', + 'series_multiclass', + 'pandas_binary', + 'pandas_multiclass', + 'numpy_binary', + 'numpy_multiclass', + 'list_binary', + 'list_multiclass', + ), + indirect=True +) +def test_unknown_categories_in_targets(input_data_targettest): + validator = TabularTargetValidator(is_classification=True) + validator.fit(input_data_targettest) + + # Add an extra category + if isinstance(input_data_targettest, list): + input_data_targettest.append(input_data_targettest[-1] + 5000) + elif isinstance(input_data_targettest, (pd.DataFrame, pd.Series)): + input_data_targettest.iloc[-1] = 5000 + elif isinstance(input_data_targettest, np.ndarray): + input_data_targettest[-1] = 5000 + + x_t = validator.transform(input_data_targettest) + assert x_t[-1].item(0) == -1 + + +def test_is_single_column_target(): + validator = TabularTargetValidator(is_classification=True) + validator.fit(np.array([1, 2, 3, 4])) + assert validator.is_single_column_target() + + validator = TabularTargetValidator(is_classification=True) + validator.fit(np.array([[1, 0, 1, 0], [1, 1, 1, 1]])) + assert not validator.is_single_column_target() diff --git a/test/test_data/test_validation.py b/test/test_data/test_validation.py new file mode 100644 index 000000000..482c99769 --- /dev/null +++ 
@@ -0,0 +1,139 @@
+import numpy as np
+
+import pandas as pd
+
+import pytest
+
+from scipy import sparse
+
+import sklearn.datasets
+import sklearn.model_selection
+
+from autoPyTorch.data.tabular_validator import TabularInputValidator
+
+
+@pytest.mark.parametrize('openmlid', [2, 40975, 40984])
+@pytest.mark.parametrize('as_frame', [True, False])
+def test_data_validation_for_classification(openmlid, as_frame):
+    x, y = sklearn.datasets.fetch_openml(data_id=openmlid, return_X_y=True, as_frame=as_frame)
+    validator = TabularInputValidator(is_classification=True)
+
+    if as_frame:
+        # NaN is not supported in categories, so
+        # drop columns containing them.
+        nan_cols = [i for i in x.columns if x[i].isnull().any()]
+        cat_cols = [i for i in x.columns if x[i].dtype.name in ['category', 'bool']]
+        unsupported_columns = list(set(nan_cols) & set(cat_cols))
+        if len(unsupported_columns) > 0:
+            x.drop(unsupported_columns, axis=1, inplace=True)
+
+    X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
+        x, y, test_size=0.33, random_state=0)
+
+    validator.fit(X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test)
+
+    X_train_t, y_train_t = validator.transform(X_train, y_train)
+    assert np.shape(X_train) == np.shape(X_train_t)
+
+    # Leave columns that are completely NaN alone;
+    # the sklearn pipeline will handle that.
+    if as_frame and np.any(pd.isnull(X_train).values.all(axis=0)):
+        assert np.any(pd.isnull(X_train_t).values.all(axis=0))
+    elif not as_frame and np.any(pd.isnull(X_train).all(axis=0)):
+        assert np.any(pd.isnull(X_train_t).all(axis=0))
+
+    # make sure everything was encoded to number
+    assert np.issubdtype(X_train_t.dtype, np.number)
+    assert np.issubdtype(y_train_t.dtype, np.number)
+
+    # Categorical columns are sorted to the beginning
+    if as_frame:
+        assert validator.feature_validator.feat_type is not None
+        ordered_unique_elements = list(dict.fromkeys(validator.feature_validator.feat_type))
+        if len(ordered_unique_elements) > 1:
+            assert ordered_unique_elements[0] == 'categorical'
+
+
+@pytest.mark.parametrize('openmlid', [505, 546, 531])
+@pytest.mark.parametrize('as_frame', [True, False])
+def test_data_validation_for_regression(openmlid, as_frame):
+    x, y = sklearn.datasets.fetch_openml(data_id=openmlid, return_X_y=True, as_frame=as_frame)
+    validator = TabularInputValidator(is_classification=False)
+
+    if as_frame:
+        # NaN is not supported in categories, so
+        # drop columns containing them.
+        nan_cols = [i for i in x.columns if x[i].isnull().any()]
+        cat_cols = [i for i in x.columns if x[i].dtype.name in ['category', 'bool']]
+        unsupported_columns = list(set(nan_cols) & set(cat_cols))
+        if len(unsupported_columns) > 0:
+            x.drop(unsupported_columns, axis=1, inplace=True)
+
+    X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
+        x, y, test_size=0.33, random_state=0)
+
+    validator.fit(X_train=X_train, y_train=y_train)
+
+    X_train_t, y_train_t = validator.transform(X_train, y_train)
+    assert np.shape(X_train) == np.shape(X_train_t)
+
+    # Leave columns that are completely NaN alone;
+    # the sklearn pipeline will handle that.
+    if as_frame and np.any(pd.isnull(X_train).values.all(axis=0)):
+        assert np.any(pd.isnull(X_train_t).values.all(axis=0))
+    elif not as_frame and np.any(pd.isnull(X_train).all(axis=0)):
+        assert np.any(pd.isnull(X_train_t).all(axis=0))
+
+    # make sure everything was encoded to number
+    assert np.issubdtype(X_train_t.dtype, np.number)
+    assert np.issubdtype(y_train_t.dtype, np.number)
+
+    # Categorical columns are sorted to the beginning
+    if as_frame:
+        assert validator.feature_validator.feat_type is not None
+        ordered_unique_elements = list(dict.fromkeys(validator.feature_validator.feat_type))
+        if len(ordered_unique_elements) > 1:
+            assert ordered_unique_elements[0] == 'categorical'
+
+
+def test_sparse_data_validation_for_regression():
+    X, y = sklearn.datasets.make_regression(n_samples=100, n_features=50, random_state=0)
+    X_sp = sparse.coo_matrix(X)
+    validator = TabularInputValidator(is_classification=False)
+
+    validator.fit(X_train=X_sp, y_train=y)
+
+    X_t, y_t = validator.transform(X, y)
+    assert np.shape(X) == np.shape(X_t)
+
+    # make sure everything was encoded to number
+    assert np.issubdtype(X_t.dtype, np.number)
+    assert np.issubdtype(y_t.dtype, np.number)
+
+    # Make sure we can change the sparse format
+    X_t, y_t = validator.transform(sparse.csr_matrix(X), y)
+
+
+def test_validation_unsupported():
+    """
+    Makes sure we raise a proper message to the user
+    when unsupported data input is provided.
+    """
+    validator = TabularInputValidator()
+    with pytest.raises(ValueError, match=r"Inconsistent number of train datapoints.*"):
+        validator.fit(
+            X_train=np.array([[0, 1, 0], [0, 1, 1]]),
+            y_train=np.array([0, 1, 0, 0, 0, 0]),
+        )
+    with pytest.raises(ValueError, match=r"Inconsistent number of test datapoints.*"):
+        validator.fit(
+            X_train=np.array([[0, 1, 0], [0, 1, 1]]),
+            y_train=np.array([0, 1]),
+            X_test=np.array([[0, 1, 0], [0, 1, 1]]),
+            y_test=np.array([0, 1, 0, 0, 0, 0]),
+        )
+    with pytest.raises(ValueError, match=r"Cannot call transform on a validator .*fitted"):
+        validator.transform(
+            X=np.array([[0, 1, 0], [0, 1, 1]]),
+            y=np.array([0, 1]),
+        )
diff --git a/test/test_datasets/test_tabular_dataset.py b/test/test_datasets/test_tabular_dataset.py
index 6d5dacd8d..b96942902 100644
--- a/test/test_datasets/test_tabular_dataset.py
+++ b/test/test_datasets/test_tabular_dataset.py
@@ -1,113 +1,42 @@
-import typing
-import unittest
+import pytest
-import numpy as np
-
-import pandas as pd
-
-import sklearn.datasets
-import sklearn.model_selection
-
-from autoPyTorch.datasets.tabular_dataset import DataTypes, TabularDataset
-from autoPyTorch.utils.backend import create
 from autoPyTorch.utils.pipeline import get_dataset_requirements


-class DataFrameTest(unittest.TestCase):
-    def runTest(self):
-        df = pd.DataFrame([['a', 0.1, 1], ['b', 0.2, np.nan]])
-        target_df = pd.Series([1, 2])
-        ds = TabularDataset(df, target_df)
-        self.assertEqual(ds.data_types, [DataTypes.String, DataTypes.Float, DataTypes.Canonical])
-        self.assertEqual(set(ds.itovs[2]), {np.nan, 1})
-        self.assertEqual(set(ds.itovs[0]), {np.nan, 'a', 'b'})
-
-        self.assertEqual(ds.vtois[0]['a'], 1)
-        self.assertEqual(ds.vtois[0][np.nan], 0)
-        self.assertEqual(ds.vtois[0][pd._libs.NaT], 0)
-        self.assertEqual(ds.vtois[0][pd._libs.missing.NAType()], 0)
-        self.assertTrue((ds.nan_mask == np.array([[0, 0, 0], [0, 0, 1]], dtype=np.bool)).all())
-
-
-class NumpyArrayTest(unittest.TestCase):
-    def runTest(self):
-        matrix = np.array([(0, 0.1, 1), (1, np.nan, 3)], dtype='f4, f4, i4')
-        target_df = pd.Series([1, 2])
-        ds = TabularDataset(matrix, target_df)
-        self.assertEqual(ds.data_types, [DataTypes.Canonical, DataTypes.Float, DataTypes.Canonical])
-        self.assertEqual(set(ds.itovs[2]), {np.nan, 1, 3})
-
-        self.assertEqual(ds.vtois[0][1], 2)
-        self.assertEqual(ds.vtois[0][np.nan], 0)
-        self.assertEqual(ds.vtois[0][pd._libs.NaT], 0)
-        self.assertEqual(ds.vtois[0][pd._libs.missing.NAType()], 0)
-        self.assertTrue((ds.nan_mask == np.array([[0, 0, 0], [0, 1, 0]], dtype=np.bool)).all())
-
-
-def get_data_to_train() -> typing.Dict[str, typing.Any]:
-    """
-    This function returns a fit dictionary that within itself, contains all
-    the information needed
-    """
-
-    # Get the training data for tabular classification
-    # Move to Australian to showcase numerical vs categorical
-    X, y = sklearn.datasets.fetch_openml(data_id=40981, return_X_y=True, as_frame=True)
-    X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
-        X,
-        y,
-        random_state=1,
-        test_size=0.2,
-    )
-    # Fit the pipeline
-    fit_dictionary = {
-        'X_train': X_train,
-        'y_train': y_train,
-        'X_test': X_test,
-        'y_test': y_test,
-    }
-
-    return fit_dictionary
-
-
-class TabularDatasetTest(unittest.TestCase):
-
-    def test_get_dataset_properties(self):
-        # Get data to train
-        fit_dictionary = get_data_to_train()
-
-        # Build a repository with random fitted models
-        try:
-            backend = create(temporary_directory='/tmp/autoPyTorch_ensemble_test_tmp',
-                             output_directory='/tmp/autoPyTorch_ensemble_test_out',
-                             delete_tmp_folder_after_terminate=False)
-        except Exception:
-            self.assertRaises(FileExistsError)
-            return unittest.skip("File already exists")
-
-        fit_dictionary['backend'] = backend
-
-        # Create the directory structure
-        backend._make_internals_directory()
-
-        # Create a datamanager for this toy problem
-        datamanager = TabularDataset(
-            X=fit_dictionary['X_train'], Y=fit_dictionary['y_train'],
-            X_test=fit_dictionary['X_test'], Y_test=fit_dictionary['y_test'],
-        )
-        backend.save_datamanager(datamanager)
-
-        datamanager = backend.load_datamanager()
-        info = datamanager.get_required_dataset_info()
-        dataset_requirements = get_dataset_requirements(info)
-
-        dataset_properties = datamanager.get_dataset_properties(dataset_requirements)
-
-        self.assertIsInstance(dataset_properties, dict)
-        for dataset_requirement in dataset_requirements:
-            self.assertIn(dataset_requirement.name, dataset_properties.keys())
-            self.assertIsInstance(dataset_properties[dataset_requirement.name], dataset_requirement.supported_types)
-
-
-if __name__ == '__main__':
-    unittest.main()
+@pytest.mark.parametrize("fit_dictionary", ['fit_dictionary_numerical_only',
+                                            'fit_dictionary_categorical_only',
+                                            'fit_dictionary_num_and_categorical'], indirect=True)
+def test_get_dataset_properties(backend, fit_dictionary):
+
+    # The fixture creates a datamanager by itself
+    datamanager = backend.load_datamanager()
+
+    info = {'task_type': datamanager.task_type,
+            'output_type': datamanager.output_type,
+            'issparse': datamanager.issparse,
+            'numerical_columns': datamanager.numerical_columns,
+            'categorical_columns': datamanager.categorical_columns}
+    dataset_requirements = get_dataset_requirements(info)
+
+    dataset_properties = datamanager.get_dataset_properties(dataset_requirements)
+    for expected in [
+        'categorical_columns',
+        'numerical_columns',
+        'issparse',
+        'is_small_preprocess',
+        'task_type',
+        'output_type',
+        'input_shape',
+        'output_shape',
+        'num_classes',
+    ]:
+        assert expected in dataset_properties
+
+    assert isinstance(dataset_properties, dict)
+    for dataset_requirement in dataset_requirements:
+        assert dataset_requirement.name in dataset_properties.keys()
+        assert isinstance(dataset_properties[dataset_requirement.name], dataset_requirement.supported_types)
+
+    assert datamanager.train_tensors[0].shape == fit_dictionary['X_train'].shape
+    assert datamanager.train_tensors[1].shape == fit_dictionary['y_train'].shape
+    assert datamanager.task_type == 'tabular_classification'
diff --git a/test/test_evaluation/evaluation_util.py b/test/test_evaluation/evaluation_util.py
index b61df8643..088726963 100644
--- a/test/test_evaluation/evaluation_util.py
+++ b/test/test_evaluation/evaluation_util.py
@@ -11,6 +11,7 @@ import sklearn.model_selection
 from sklearn import preprocessing

+from autoPyTorch.data.tabular_validator import TabularInputValidator
 from autoPyTorch.datasets.resampling_strategy import HoldoutValTypes
 from autoPyTorch.datasets.tabular_dataset import TabularDataset
 from autoPyTorch.pipeline.components.training.metrics.metrics import (
@@ -139,9 +140,11 @@ def get_multiclass_classification_datamanager(resampling_strategy=HoldoutValType
     X_train = X_train[indices]
     Y_train = Y_train[indices]

+    validator = TabularInputValidator(is_classification=True).fit(X_train, Y_train)
     dataset = TabularDataset(
         X=X_train, Y=Y_train,
         X_test=X_test, Y_test=Y_test,
+        validator=validator,
         resampling_strategy=resampling_strategy
     )
     return dataset
@@ -155,8 +158,10 @@ def get_abalone_datamanager(resampling_strategy=HoldoutValTypes.holdout_validati
         X, y, random_state=1
     )

+    validator = TabularInputValidator(is_classification=True).fit(X_train, y_train)
     dataset = TabularDataset(
         X=X_train, Y=y_train,
+        validator=validator,
         X_test=X_test, Y_test=y_test,
         resampling_strategy=resampling_strategy
     )
@@ -179,9 +184,11 @@ def get_binary_classification_datamanager(resampling_strategy=HoldoutValTypes.ho
     X_test = X_test[eliminate_class_two]
     Y_test = Y_test[eliminate_class_two]

+    validator = TabularInputValidator(is_classification=True).fit(X_train, Y_train)
     dataset = TabularDataset(
         X=X_train, Y=Y_train,
         X_test=X_test, Y_test=Y_test,
+        validator=validator,
         resampling_strategy=resampling_strategy
     )
     return dataset
@@ -195,9 +202,11 @@ def get_regression_datamanager(resampling_strategy=HoldoutValTypes.holdout_valid
     X_train = X_train[indices]
     Y_train = Y_train[indices]

+    validator = TabularInputValidator(is_classification=False).fit(X_train, Y_train)
     dataset = TabularDataset(
         X=X_train, Y=Y_train,
         X_test=X_test, Y_test=Y_test,
+        validator=validator,
         resampling_strategy=resampling_strategy
     )
     return dataset
@@ -221,9 +230,11 @@ def get_500_classes_datamanager(resampling_strategy=HoldoutValTypes.holdout_vali
                                shuffle=True,
                                random_state=1)

+    validator = TabularInputValidator(is_classification=True).fit(X, Y)
     dataset = TabularDataset(
         X=X[:700], Y=Y[:700],
         X_test=X[700:], Y_test=Y[710:],
+        validator=validator,
         resampling_strategy=resampling_strategy
     )
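
Every fixture above now follows the same construction pattern: fit a TabularInputValidator first, then hand it to the TabularDataset, which delegates input checking and target encoding to it. A minimal standalone sketch of that pattern, assuming only the signatures visible in this patch (the toy data is illustrative):

import numpy as np

from autoPyTorch.data.tabular_validator import TabularInputValidator
from autoPyTorch.datasets.resampling_strategy import HoldoutValTypes
from autoPyTorch.datasets.tabular_dataset import TabularDataset

# Toy data: 100 rows, 4 numerical features, a binary target
X = np.random.rand(100, 4)
y = np.random.randint(0, 2, size=100)

# The validator is fitted first; the dataset then relies on it
# for all input validation and encoding
validator = TabularInputValidator(is_classification=True).fit(X, y)
dataset = TabularDataset(
    X=X, Y=y,
    validator=validator,
    resampling_strategy=HoldoutValTypes.holdout_validation,
)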
diff --git a/test/test_pipeline/components/test_feature_data_loader.py b/test/test_pipeline/components/test_feature_data_loader.py
index 958c9ad7b..7d4c9d80d 100644
--- a/test/test_pipeline/components/test_feature_data_loader.py
+++ b/test/test_pipeline/components/test_feature_data_loader.py
@@ -40,4 +40,4 @@ def test_build_transform_small_preprocess_false(self):
         self.assertIsInstance(compose, torchvision.transforms.Compose)

         # We expect the to-tensor, the preprocess transforms and the check_array
-        self.assertEqual(len(compose.transforms), 3)
+        self.assertEqual(len(compose.transforms), 4)
diff --git a/test/test_pipeline/components/test_imputers.py b/test/test_pipeline/components/test_imputers.py
index ac8f0e143..983737dfe 100644
--- a/test/test_pipeline/components/test_imputers.py
+++ b/test/test_pipeline/components/test_imputers.py
@@ -209,7 +209,7 @@ def test_constant_imputation(self):
         )
         column_transformer = column_transformer.fit(X['X_train'])
         transformed = column_transformer.transform(data[test_indices])
-        assert_array_equal(transformed.astype(str), np.array([['!missing!', 8, 9],
+        assert_array_equal(transformed.astype(str), np.array([['-1', 8, 9],
                                                               [7.0, '0', 9],
                                                               [4.0, '0', '0']],
                                                              dtype=str))
diff --git a/test/test_pipeline/test_tabular_classification.py b/test/test_pipeline/test_tabular_classification.py
index e7ae68012..8f87d62ca 100644
--- a/test/test_pipeline/test_tabular_classification.py
+++ b/test/test_pipeline/test_tabular_classification.py
@@ -71,6 +71,7 @@ def test_pipeline_fit(self, fit_dictionary):
     def test_pipeline_predict(self, fit_dictionary):
         """This test makes sure that the pipeline is able to predict given
         random combinations of hyperparameters across the pipeline"""
+        X = fit_dictionary['X_train'].copy()
         pipeline = TabularClassificationPipeline(
             dataset_properties=fit_dictionary['dataset_properties'])
@@ -80,8 +81,7 @@ def test_pipeline_predict(self, fit_dictionary):

         pipeline.fit(fit_dictionary)

-        prediction = pipeline.predict(
-            fit_dictionary['backend'].load_datamanager().test_tensors[0])
+        prediction = pipeline.predict(X)
         assert isinstance(prediction, np.ndarray)
         assert prediction.shape == (200, 2)
@@ -90,6 +90,7 @@ def test_pipeline_predict_proba(self, fit_dictionary):
         given random combinations of hyperparameters across the pipeline
         And then predict using predict probability
         """
+        X = fit_dictionary['X_train'].copy()
         pipeline = TabularClassificationPipeline(
             dataset_properties=fit_dictionary['dataset_properties'])
@@ -99,8 +100,7 @@ def test_pipeline_predict_proba(self, fit_dictionary):

         pipeline.fit(fit_dictionary)

-        prediction = pipeline.predict_proba(
-            fit_dictionary['backend'].load_datamanager().test_tensors[0])
+        prediction = pipeline.predict_proba(X)
         assert isinstance(prediction, np.ndarray)
         assert prediction.shape == (200, 2)
@@ -118,11 +118,8 @@ def test_pipeline_transform(self, fit_dictionary):
         config = cs.sample_configuration()
         pipeline.set_hyperparameters(config)

-        pipeline.fit(fit_dictionary)
-        # We do not want to make the same early preprocessing operation to the fit dictionary
-        if 'X_train' in fit_dictionary:
-            fit_dictionary.pop('X_train')
+        pipeline.fit(fit_dictionary.copy())

         transformed_fit_dictionary = pipeline.transform(fit_dictionary)
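
The last hunk fits on a copy because the pipeline's early-preprocessing steps rewrite entries of the fit dictionary they receive, which the removed pop('X_train') workaround previously compensated for; fitting on a copy keeps the caller's dictionary intact for the subsequent transform call. A hypothetical standalone illustration of that idea (fit_in_place is invented for this sketch, not AutoPyTorch API):

# Stand-in for a fit() that rewrites entries of its input dictionary
# in place, the way early preprocessing rewrites 'X_train'
def fit_in_place(fit_dictionary):
    fit_dictionary['X_train'] = [x * 2 for x in fit_dictionary['X_train']]

d = {'X_train': [1, 2, 3]}
fit_in_place(d.copy())            # only the shallow copy is rewritten
assert d['X_train'] == [1, 2, 3]  # the original stays usable for transform()

Note that dict.copy() is shallow, which suffices as long as fit rebinds keys rather than mutating the stored objects themselves.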