
Commit f0c2aa0

add test for comparator and other improvements based on PR comments
1 parent dc01cd3 commit f0c2aa0

6 files changed: +79 −32 lines

autoPyTorch/api/base_task.py

Lines changed: 4 additions & 4 deletions
@@ -9,7 +9,6 @@
 import tempfile
 import time
 import typing
-from typing_extensions import runtime
 import unittest.mock
 import warnings
 from abc import abstractmethod
@@ -751,13 +750,14 @@ def run_traditional_ml(
             self,
             current_task_name: str,
             runtime_limit: int,
-            func_eval_time_limit_secs: int) -> None:
+            func_eval_time_limit_secs: int
+    ) -> None:
         """
         This function can be used to run the suite of traditional machine
-        learning models during the current task (for e.g, ensemble fit, search)
+        learning models during the current task (for e.g, ensemble fit, search)

         Args:
-            current_task_name (str): name of the current task,
+            current_task_name (str): name of the current task,
             runtime_limit (int): time limit for fitting traditional models,
             func_eval_time_limit_secs (int): Time limit
                 for a single call to the machine learning model.

autoPyTorch/data/base_target_validator.py

Lines changed: 4 additions & 2 deletions
@@ -43,8 +43,10 @@ class BaseTargetValidator(BaseEstimator):
     """
     def __init__(self,
                  is_classification: bool = False,
-                 logger: Optional[Union[PicklableClientLogger, logging.Logger
-                                        ]] = None,
+                 logger: Optional[Union[PicklableClientLogger,
+                                        logging.Logger
+                                        ]
+                                  ] = None,
                  ) -> None:
         self.is_classification = is_classification


autoPyTorch/data/tabular_feature_validator.py

Lines changed: 24 additions & 10 deletions
@@ -78,6 +78,29 @@ def get_tabular_preprocessors() -> Dict[str, List[BaseEstimator]]:

 class TabularFeatureValidator(BaseFeatureValidator):

+    @staticmethod
+    def _comparator(cmp1: str, cmp2: str) -> int:
+        """Order so that categorical columns come right and numerical columns come left
+
+        Args:
+            cmp1 (str): First variable to compare
+            cmp2 (str): Second variable to compare
+
+        Raises:
+            ValueError: if the values of the variables to compare
+                are not in 'categorical' or 'numerical'
+
+        Returns:
+            int: either [0, -1, 1]
+        """
+        choices = ['categorical', 'numerical']
+        if cmp1 not in choices or cmp2 not in choices:
+            raise ValueError('The comparator for the column order only accepts {}, '
+                             'but got {} and {}'.format(choices, cmp1, cmp2))
+
+        idx1, idx2 = choices.index(cmp1), choices.index(cmp2)
+        return idx1 - idx2
+
     def _fit(
         self,
         X: SUPPORTED_FEAT_TYPES,
@@ -130,19 +153,10 @@ def _fit(
         # The column transformer reorders the feature types
         # therefore, we need to change the order of columns as well
         # This means categorical columns are shifted to the right
-        def comparator(cmp1: str, cmp2: str) -> int:
-            """ Order so that categorical columns come right and numerical columns come left """
-            choices = ['categorical', 'numerical']
-            if cmp1 not in choices or cmp2 not in choices:
-                raise ValueError('The comparator for the column order only accepts {}, '
-                                 'but got {} and {}'.format(choices, cmp1, cmp2))
-
-            idx1, idx2 = choices.index(cmp1), choices.index(cmp2)
-            return idx1 - idx2

         self.feat_type = sorted(
             feat_type,
-            key=functools.cmp_to_key(comparator)
+            key=functools.cmp_to_key(self._comparator)
         )

         # differently to categorical_columns and numerical_columns,
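
Note: the new staticmethod is the same comparator that previously lived inside _fit, now reusable and unit-testable. As a minimal standalone sketch of how functools.cmp_to_key uses it (the feat_type list below is illustrative, not part of the commit):

import functools

from autoPyTorch.data.tabular_feature_validator import TabularFeatureValidator

# The comparator returns a negative value when its first argument should sort
# earlier; 'categorical' has index 0 in choices, so categorical entries end up
# ahead of numerical ones after sorting.
assert TabularFeatureValidator._comparator('categorical', 'numerical') == -1
assert TabularFeatureValidator._comparator('numerical', 'categorical') == 1
assert TabularFeatureValidator._comparator('numerical', 'numerical') == 0

feat_type = ['numerical', 'categorical', 'numerical']  # illustrative input
ordered = sorted(feat_type, key=functools.cmp_to_key(TabularFeatureValidator._comparator))
print(ordered)  # ['categorical', 'numerical', 'numerical']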

autoPyTorch/pipeline/base_pipeline.py

Lines changed: 16 additions & 15 deletions
@@ -400,6 +400,7 @@ def _check_search_space_updates(self, include: Optional[Dict[str, Any]],
                 raise ValueError("Unknown node name. Expected update node name to be in {} "
                                  "got {}".format(self.named_steps.keys(), update.node_name))
             node = self.named_steps[update.node_name]
+            node_name = node.__class__.__name__
             # if node is a choice module
             if hasattr(node, 'get_components'):
                 split_hyperparameter = update.hyperparameter.split(':')
@@ -429,16 +430,16 @@ def _check_search_space_updates(self, include: Optional[Dict[str, Any]],
                 if choice not in components.keys():
                     raise ValueError("Unknown component choice for node {}. "
                                      "Expected update hyperparameter "
-                                     "to be in {}, but got {}".format(node.__class__.__name__,
-                                                                      components.keys(), choice))
+                                     "to be in {}, but got {}".format(node_name,
+                                                                      components.keys(), choice))
                 # check if the component whose hyperparameter
                 # needs to be updated is in components of the
                 # choice module
                 elif split_hyperparameter[0] not in components.keys():
                     raise ValueError("Unknown component choice for node {}. "
                                      "Expected update component "
-                                     "to be in {}, but got {}".format(node.__class__.__name__,
-                                                                      components.keys(), split_hyperparameter[0]))
+                                     "to be in {}, but got {}".format(node_name,
+                                                                      components.keys(), split_hyperparameter[0]))
                 else:
                     # check if hyperparameter is in the search space of the component
                     component = components[split_hyperparameter[0]]
@@ -451,15 +452,15 @@ def _check_search_space_updates(self, include: Optional[Dict[str, Any]],
                             component.get_hyperparameter_search_space(
                                 dataset_properties=self.dataset_properties).get_hyperparameter_names()]):
                         continue
+                    component_hyperparameters = component.get_hyperparameter_search_space(
+                        dataset_properties=self.dataset_properties).get_hyperparameter_names()
                     raise ValueError("Unknown hyperparameter for component {} of node {}."
                                      " Expected update hyperparameter "
                                      "to be in {}, but got {}.".format(component.__name__,
-                                                                       node.__class__.__name__,
-                                                                       component.get_hyperparameter_search_space(
-                                                                           dataset_properties=self.dataset_properties
-                                                                       ).get_hyperparameter_names(),
-                                                                       split_hyperparameter[1]
-                                                                       )
+                                                                       node_name,
+                                                                       component_hyperparameters,
+                                                                       split_hyperparameter[1]
+                                                                       )
                                      )
             else:
                 if update.hyperparameter not in node.get_hyperparameter_search_space(
@@ -468,13 +469,13 @@ def _check_search_space_updates(self, include: Optional[Dict[str, Any]],
                         node.get_hyperparameter_search_space(
                             dataset_properties=self.dataset_properties).get_hyperparameter_names()]):
                     continue
+                node_hyperparameters = node.get_hyperparameter_search_space(
+                    dataset_properties=self.dataset_properties).get_hyperparameter_names()
                 raise ValueError("Unknown hyperparameter for node {}. "
                                  "Expected update hyperparameter "
-                                 "to be in {}, but got {}".format(node.__class__.__name__,
-                                                                  node.
-                                                                  get_hyperparameter_search_space(
-                                                                      dataset_properties=self.dataset_properties).
-                                                                  get_hyperparameter_names(), update.hyperparameter))
+                                 "to be in {}, but got {}".format(node_name,
+                                                                  node_hyperparameters,
+                                                                  update.hyperparameter))

     def _get_pipeline_steps(self, dataset_properties: Optional[Dict[str, Any]]
                             ) -> List[Tuple[str, autoPyTorchChoice]]:
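
Note: the refactoring above hoists the repeated node.__class__.__name__ and get_hyperparameter_names() expressions into the locals node_name, component_hyperparameters and node_hyperparameters, so each ValueError stays on a single readable format call. A rough sketch of the pattern with hypothetical names (ExampleNode, valid_names and requested are made up for illustration and are not part of autoPyTorch):

# Hoist repeated lookups into locals, then build the error message in one format call.
class ExampleNode:  # hypothetical stand-in for a pipeline node
    pass

node = ExampleNode()
valid_names = ['learning_rate', 'batch_size']  # stands in for get_hyperparameter_names()
requested = 'momentum'  # an update that does not exist in the search space

node_name = node.__class__.__name__
if requested not in valid_names:
    raise ValueError("Unknown hyperparameter for node {}. "
                     "Expected update hyperparameter "
                     "to be in {}, but got {}".format(node_name, valid_names, requested))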

autoPyTorch/pipeline/components/training/trainer/RowCutMixTrainer.py

Lines changed: 1 addition & 1 deletion
@@ -29,7 +29,7 @@ def data_preparation(self, X: np.ndarray, y: np.ndarray,
         beta = 1.0
         lam = self.random_state.beta(beta, beta)
         batch_size, n_columns = np.shape(X)
-        # shuffled_indices: Shuffled version of torch.arange(batch_size)
+        # shuffled_indices: Shuffled version of torch.arange(batch_size)
         shuffled_indices = torch.randperm(batch_size).cuda() if X.is_cuda else torch.randperm(batch_size)

         r = self.random_state.rand(1)
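
Note: this hunk appears to only adjust whitespace in the comment; the surrounding line is the interesting part, since it builds the row-shuffling indices on the same device as the batch. A minimal standalone sketch of that pattern (the 8x4 tensor below is illustrative):

import torch

X = torch.rand(8, 4)  # illustrative batch of 8 rows
batch_size = X.shape[0]
# Shuffled version of torch.arange(batch_size), placed on the GPU only if X lives there
shuffled_indices = torch.randperm(batch_size).cuda() if X.is_cuda else torch.randperm(batch_size)
X_shuffled = X[shuffled_indices]  # rows permuted, as used for CutMix-style row mixing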

test/test_data/test_feature_validator.py

Lines changed: 30 additions & 0 deletions
@@ -1,4 +1,5 @@
 import copy
+import functools

 import numpy as np

@@ -331,6 +332,11 @@ def feature_validator_remove_nan_catcolumns(df_train: pd.DataFrame, df_test: pd.
 def test_feature_validator_remove_nan_catcolumns():
     """
     Make sure categorical columns that have only nan values are removed.
+    The ans arrays contain the final output after calling transform on
+    datasets, this includes fitting and transforming a column transformer
+    containing simple imputation for both categorical and numerical
+    columns, scaling for numerical columns and one hot encoding for
+    categorical columns.
     """
     # First case, there exist null columns (B and C) in the train set
     # and a same column (C) are not all null for the test set.
@@ -396,6 +402,7 @@ def test_feature_validator_remove_nan_catcolumns():
     ans_test = np.array([[0, 0, 0, 0], [0, 0, 0, 0]], dtype=np.float64)
     feature_validator_remove_nan_catcolumns(df_train, df_test, ans_train, ans_test)

+
 def test_features_unsupported_calls_are_raised():
     """
     Makes sure we raise a proper message to the user,
@@ -664,3 +671,26 @@ def test_feature_validator_imbalanced_data():
     transformed_X_test = validator.transform(X_test)
     transformed_X_test = pd.DataFrame(transformed_X_test)
     assert not len(validator.all_nan_columns)
+
+
+def test_comparator():
+    numerical = 'numerical'
+    categorical = 'categorical'
+
+    validator = TabularFeatureValidator
+
+    feat_type = [numerical, categorical] * 10
+    ans = [categorical] * 10 + [numerical] * 10
+    feat_type = sorted(
+        feat_type,
+        key=functools.cmp_to_key(validator._comparator)
+    )
+    assert ans == feat_type
+
+    feat_type = [numerical] * 10 + [categorical] * 10
+    ans = [categorical] * 10 + [numerical] * 10
+    feat_type = sorted(
+        feat_type,
+        key=functools.cmp_to_key(validator._comparator)
+    )
+    assert ans == feat_type
