Commit 8962612

[refactor] Getting dataset properties from the dataset object (#164)
* Use get_required_dataset_info of the dataset when needing required info for getting dataset requirements
* Fix flake
* Fix bug in getting dataset requirements
* Added doc string to explain dataset properties
* Update doc string in utils pipeline
1 parent 4493270 commit 8962612

7 files changed: +153 −112 lines changed

autoPyTorch/api/base_task.py

Lines changed: 16 additions & 12 deletions

@@ -196,14 +196,6 @@ def __init__(
             raise ValueError("Expected search space updates to be of instance"
                              " HyperparameterSearchSpaceUpdates got {}".format(type(self.search_space_updates)))
 
-    @abstractmethod
-    def _get_required_dataset_properties(self, dataset: BaseDataset) -> Dict[str, Any]:
-        """
-        given a pipeline type, this function returns the
-        dataset properties required by the dataset object
-        """
-        raise NotImplementedError
-
     @abstractmethod
     def build_pipeline(self, dataset_properties: Dict[str, Any]) -> BasePipeline:
         """
@@ -267,7 +259,10 @@ def get_search_space(self, dataset: BaseDataset = None) -> ConfigurationSpace:
             return self.search_space
         elif dataset is not None:
             dataset_requirements = get_dataset_requirements(
-                info=self._get_required_dataset_properties(dataset))
+                info=dataset.get_required_dataset_info(),
+                include=self.include_components,
+                exclude=self.exclude_components,
+                search_space_updates=self.search_space_updates)
             return get_configuration_space(info=dataset.get_dataset_properties(dataset_requirements),
                                            include=self.include_components,
                                            exclude=self.exclude_components,
@@ -785,7 +780,10 @@ def _search(
         # Initialise information needed for the experiment
         experiment_task_name: str = 'runSearch'
         dataset_requirements = get_dataset_requirements(
-            info=self._get_required_dataset_properties(dataset))
+            info=dataset.get_required_dataset_info(),
+            include=self.include_components,
+            exclude=self.exclude_components,
+            search_space_updates=self.search_space_updates)
         self._dataset_requirements = dataset_requirements
         dataset_properties = dataset.get_dataset_properties(dataset_requirements)
         self._stopwatch.start_task(experiment_task_name)
@@ -1049,7 +1047,10 @@ def refit(
         self._logger = self._get_logger(str(self.dataset_name))
 
         dataset_requirements = get_dataset_requirements(
-            info=self._get_required_dataset_properties(dataset))
+            info=dataset.get_required_dataset_info(),
+            include=self.include_components,
+            exclude=self.exclude_components,
+            search_space_updates=self.search_space_updates)
         dataset_properties = dataset.get_dataset_properties(dataset_requirements)
         self._backend.save_datamanager(dataset)
 
@@ -1119,7 +1120,10 @@ def fit(self,
 
         # get dataset properties
         dataset_requirements = get_dataset_requirements(
-            info=self._get_required_dataset_properties(dataset))
+            info=dataset.get_required_dataset_info(),
+            include=self.include_components,
+            exclude=self.exclude_components,
+            search_space_updates=self.search_space_updates)
         dataset_properties = dataset.get_dataset_properties(dataset_requirements)
         self._backend.save_datamanager(dataset)

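All four hunks in this file make the same substitution, so it is worth reading once in isolation. Below is a minimal sketch of the new pattern, assuming only the names visible in this diff; the helper function itself is hypothetical:

    from autoPyTorch.utils.pipeline import get_dataset_requirements

    def collect_dataset_properties(task, dataset):
        # Hypothetical helper: `task` stands in for a BaseTask instance,
        # `dataset` for a BaseDataset. The dataset now describes itself via
        # get_required_dataset_info(), and the include/exclude component
        # filters plus any search-space updates are forwarded so the computed
        # requirements match the components actually active in the pipeline.
        dataset_requirements = get_dataset_requirements(
            info=dataset.get_required_dataset_info(),
            include=task.include_components,
            exclude=task.exclude_components,
            search_space_updates=task.search_space_updates)
        return dataset.get_dataset_properties(dataset_requirements)
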
autoPyTorch/api/tabular_classification.py

Lines changed: 0 additions & 12 deletions

@@ -13,7 +13,6 @@
     TASK_TYPES_TO_STRING,
 )
 from autoPyTorch.data.tabular_validator import TabularInputValidator
-from autoPyTorch.datasets.base_dataset import BaseDataset
 from autoPyTorch.datasets.resampling_strategy import (
     CrossValTypes,
     HoldoutValTypes,
@@ -97,17 +96,6 @@ def __init__(
             task_type=TASK_TYPES_TO_STRING[TABULAR_CLASSIFICATION],
         )
 
-    def _get_required_dataset_properties(self, dataset: BaseDataset) -> Dict[str, Any]:
-        if not isinstance(dataset, TabularDataset):
-            raise ValueError("Dataset is incompatible for the given task,: {}".format(
-                type(dataset)
-            ))
-        return {'task_type': dataset.task_type,
-                'output_type': dataset.output_type,
-                'issparse': dataset.issparse,
-                'numerical_columns': dataset.numerical_columns,
-                'categorical_columns': dataset.categorical_columns}
-
     def build_pipeline(self, dataset_properties: Dict[str, Any]) -> TabularClassificationPipeline:
         return TabularClassificationPipeline(dataset_properties=dataset_properties)

autoPyTorch/api/tabular_regression.py

Lines changed: 0 additions & 12 deletions

@@ -13,7 +13,6 @@
     TASK_TYPES_TO_STRING
 )
 from autoPyTorch.data.tabular_validator import TabularInputValidator
-from autoPyTorch.datasets.base_dataset import BaseDataset
 from autoPyTorch.datasets.resampling_strategy import (
     CrossValTypes,
     HoldoutValTypes,
@@ -89,17 +88,6 @@ def __init__(
             task_type=TASK_TYPES_TO_STRING[TABULAR_REGRESSION],
         )
 
-    def _get_required_dataset_properties(self, dataset: BaseDataset) -> Dict[str, Any]:
-        if not isinstance(dataset, TabularDataset):
-            raise ValueError("Dataset is incompatible for the given task,: {}".format(
-                type(dataset)
-            ))
-        return {'task_type': dataset.task_type,
-                'output_type': dataset.output_type,
-                'issparse': dataset.issparse,
-                'numerical_columns': dataset.numerical_columns,
-                'categorical_columns': dataset.categorical_columns}
-
    def build_pipeline(self, dataset_properties: Dict[str, Any]) -> TabularRegressionPipeline:
        return TabularRegressionPipeline(dataset_properties=dataset_properties)

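Both API classes lose identical boilerplate: the five-key dict they assembled by hand is exactly what the dataset now reports about itself (see the next two files). A hedged one-liner of what replaces the deleted methods, where `dataset` is assumed to be the TabularDataset the API class constructs:

    # The per-task override is gone; the dataset reports its own info.
    info = dataset.get_required_dataset_info()
    # -> {'task_type': ..., 'output_type': ..., 'issparse': ...,
    #     'numerical_columns': [...], 'categorical_columns': [...]}
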
autoPyTorch/datasets/base_dataset.py

Lines changed: 13 additions & 11 deletions

@@ -348,11 +348,17 @@ def replace_data(self, X_train: BaseDatasetInputType,
 
     def get_dataset_properties(self, dataset_requirements: List[FitRequirement]) -> Dict[str, Any]:
         """
-        Gets the dataset properties required in the fit dictionary
+        Gets the dataset properties required in the fit dictionary.
+        This depends on the components that are active in the
+        pipeline and returns the properties they need about the dataset.
+        Information on the required properties of each component
+        can be found in its documentation.
         Args:
             dataset_requirements (List[FitRequirement]): List of
                 fit requirements that the dataset properties must
-                contain.
+                contain. This is created using the `get_dataset_requirements
+                function
+                <https://github.com/automl/Auto-PyTorch/blob/refactor_development/autoPyTorch/utils/pipeline.py#L25>`_.
 
         Returns:
             dataset_properties (Dict[str, Any]):
@@ -362,19 +368,15 @@ def get_dataset_properties(self, dataset_requirements: List[FitRequirement]) ->
         for dataset_requirement in dataset_requirements:
             dataset_properties[dataset_requirement.name] = getattr(self, dataset_requirement.name)
 
-        # Add task type, output type and issparse to dataset properties as
-        # they are not a dataset requirement in the pipeline
-        dataset_properties.update({'task_type': self.task_type,
-                                   'output_type': self.output_type,
-                                   'issparse': self.issparse,
-                                   'input_shape': self.input_shape,
-                                   'output_shape': self.output_shape
-                                   })
+        # Add the required dataset info to dataset properties as
+        # it might not be a dataset requirement in the pipeline
+        dataset_properties.update(self.get_required_dataset_info())
         return dataset_properties
 
     def get_required_dataset_info(self) -> Dict[str, Any]:
         """
-        Returns a dictionary containing required dataset properties to instantiate a pipeline,
+        Returns a dictionary containing required dataset
+        properties to instantiate a pipeline.
         """
         info = {'output_type': self.output_type,
                 'issparse': self.issparse}

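A compact restatement of the merge get_dataset_properties now performs, as a hedged sketch (the standalone function is hypothetical; the `.name` attribute on FitRequirement is taken from the loop in the hunk above):

    def sketch_get_dataset_properties(dataset, dataset_requirements):
        # First copy every dataset attribute that a fit requirement of the
        # active pipeline components names ...
        dataset_properties = {}
        for requirement in dataset_requirements:
            dataset_properties[requirement.name] = getattr(dataset, requirement.name)
        # ... then overlay the dataset's self-reported required info, so keys
        # such as 'output_type' and 'issparse' are present even when no
        # component explicitly asked for them.
        dataset_properties.update(dataset.get_required_dataset_info())
        return dataset_properties
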
autoPyTorch/datasets/tabular_dataset.py

Lines changed: 18 additions & 1 deletion

@@ -112,7 +112,24 @@ def __init__(self,
 
     def get_required_dataset_info(self) -> Dict[str, Any]:
         """
-        Returns a dictionary containing required dataset properties to instantiate a pipeline,
+        Returns a dictionary containing required dataset
+        properties to instantiate a pipeline.
+        For a Tabular Dataset this includes:
+        1. 'output_type': Enum indicating the type of the output for this problem.
+           We currently use `sklearn type_of_target
+           <https://scikit-learn.org/stable/modules/generated/sklearn.utils.multiclass.type_of_target.html>`_
+           to infer the output type from the data and encode it as an
+           Enum, documented in `autoPyTorch/constants.py
+           <https://github.com/automl/Auto-PyTorch/blob/refactor_development/autoPyTorch/constants.py>`_.
+        2. 'issparse': a flag indicating whether the input is a sparse matrix.
+        3. 'numerical_columns': a list of the column numbers of the
+           numerical columns in the input dataset.
+        4. 'categorical_columns': a list of the column numbers of the
+           categorical columns in the input dataset.
+        5. 'task_type': Enum indicating the type of task. For tabular datasets
+           we currently support 'tabular_classification' and 'tabular_regression',
+           again encoded as an Enum documented in `autoPyTorch/constants.py
+           <https://github.com/automl/Auto-PyTorch/blob/refactor_development/autoPyTorch/constants.py>`_.
         """
         info = super().get_required_dataset_info()
         info.update({

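To make the docstring concrete, here is a plausible return value for a dense binary-classification dataset with columns [age, income, city], where only 'city' is categorical. Whether the values arrive as raw strings or as the Enum codes from autoPyTorch/constants.py is an assumption on our part, since the diff truncates before the info.update() body:

    # Hypothetical output of TabularDataset.get_required_dataset_info():
    info = {
        'output_type': 'binary',        # inferred via sklearn's type_of_target
        'issparse': False,              # dense input matrix
        'numerical_columns': [0, 1],    # 'age', 'income'
        'categorical_columns': [2],     # 'city'
        'task_type': 'tabular_classification',
    }
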
autoPyTorch/evaluation/abstract_evaluator.py

Lines changed: 17 additions & 17 deletions

@@ -31,7 +31,6 @@
     TABULAR_TASKS,
 )
 from autoPyTorch.datasets.base_dataset import BaseDataset
-from autoPyTorch.datasets.tabular_dataset import TabularDataset
 from autoPyTorch.evaluation.utils import (
     VotingRegressorWrapper,
     convert_multioutput_multiclass_to_multilabel
@@ -71,6 +70,7 @@ class MyTraditionalTabularClassificationPipeline(BaseEstimator):
         An optional dictionary that is passed to the pipeline's steps. It complies
         a similar function as the kwargs
     """
+
     def __init__(self, config: str,
                  dataset_properties: Dict[str, Any],
                  random_state: Optional[Union[int, np.random.RandomState]] = None,
@@ -141,6 +141,7 @@ class DummyClassificationPipeline(DummyClassifier):
         An optional dictionary that is passed to the pipeline's steps. It complies
         a similar function as the kwargs
     """
+
     def __init__(self, config: Configuration,
                  random_state: Optional[Union[int, np.random.RandomState]] = None,
                  init_params: Optional[Dict] = None
@@ -208,6 +209,7 @@ class DummyRegressionPipeline(DummyRegressor):
         An optional dictionary that is passed to the pipeline's steps. It complies
         a similar function as the kwargs
     """
+
     def __init__(self, config: Configuration,
                  random_state: Optional[Union[int, np.random.RandomState]] = None,
                  init_params: Optional[Dict] = None) -> None:
@@ -394,12 +396,9 @@ def __init__(self, backend: Backend,
             raise ValueError('disable_file_output should be either a bool or a list')
 
         self.pipeline_class: Optional[Union[BaseEstimator, BasePipeline]] = None
-        info: Dict[str, Any] = {'task_type': self.datamanager.task_type,
-                                'output_type': self.datamanager.output_type,
-                                'issparse': self.issparse}
         if self.task_type in REGRESSION_TASKS:
             if isinstance(self.configuration, int):
-                self.pipeline_class = DummyClassificationPipeline
+                self.pipeline_class = DummyRegressionPipeline
             elif isinstance(self.configuration, str):
                 raise ValueError("Only tabular classifications tasks "
                                  "are currently supported with traditional methods")
@@ -425,11 +424,12 @@ def __init__(self, backend: Backend,
         else:
             raise ValueError('task {} not available'.format(self.task_type))
         self.predict_function = self._predict_proba
-        if self.task_type in TABULAR_TASKS:
-            assert isinstance(self.datamanager, TabularDataset)
-            info.update({'numerical_columns': self.datamanager.numerical_columns,
-                         'categorical_columns': self.datamanager.categorical_columns})
-        self.dataset_properties = self.datamanager.get_dataset_properties(get_dataset_requirements(info))
+        self.dataset_properties = self.datamanager.get_dataset_properties(
+            get_dataset_requirements(info=self.datamanager.get_required_dataset_info(),
+                                     include=self.include,
+                                     exclude=self.exclude,
+                                     search_space_updates=self.search_space_updates
+                                     ))
 
         self.additional_metrics: Optional[List[autoPyTorchMetric]] = None
         if all_supported_metrics:
@@ -630,9 +630,9 @@ def finish_up(self, loss: Dict[str, float], train_loss: Dict[str, float],
         return None
 
     def calculate_auxiliary_losses(
-        self,
-        Y_valid_pred: np.ndarray,
-        Y_test_pred: np.ndarray,
+            self,
+            Y_valid_pred: np.ndarray,
+            Y_test_pred: np.ndarray,
     ) -> Tuple[Optional[float], Optional[float]]:
         """
         A helper function to calculate the performance estimate of the
@@ -670,10 +670,10 @@ def calculate_auxiliary_losses(
         return validation_loss, test_loss
 
     def file_output(
-        self,
-        Y_optimization_pred: np.ndarray,
-        Y_valid_pred: np.ndarray,
-        Y_test_pred: np.ndarray
+            self,
+            Y_optimization_pred: np.ndarray,
+            Y_valid_pred: np.ndarray,
+            Y_test_pred: np.ndarray
     ) -> Tuple[Optional[float], Dict]:
         """
         This method decides what file outputs are written to disk.

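Two things happen in this file: the evaluator now derives its info dict from the dataset (the same pattern as in base_task.py), and a real bug is fixed, since regression tasks configured with an int previously received the classification dummy. A hedged sketch of the corrected dispatch, reusing only names from the hunk (the helper itself is hypothetical):

    from autoPyTorch.constants import REGRESSION_TASKS
    from autoPyTorch.evaluation.abstract_evaluator import (
        DummyClassificationPipeline,
        DummyRegressionPipeline,
    )

    def pick_dummy_pipeline(task_type, configuration):
        # An int configuration requests a dummy baseline. Before this commit
        # the regression branch wrongly returned DummyClassificationPipeline.
        if task_type in REGRESSION_TASKS:
            if isinstance(configuration, int):
                return DummyRegressionPipeline
            if isinstance(configuration, str):
                raise ValueError("Only tabular classifications tasks "
                                 "are currently supported with traditional methods")
        if isinstance(configuration, int):
            return DummyClassificationPipeline
        raise NotImplementedError("other branches omitted from this sketch")
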