Fix flake and mypy errors

ravinkohli · ravinkohli · commit 9cdfb64ec3fe · 2021-09-30T19:55:28.000+02:00
diff --git a/autoPyTorch/api/base_task.py b/autoPyTorch/api/base_task.py
@@ -1348,7 +1348,7 @@ def fit_ensemble(
             ensemble_size: int = 50,
             load_models: bool = True,
             time_for_task: int = 100,
-            func_eval_time_limit_secs: Optional[int] = None,
+            func_eval_time_limit_secs: int = 50,
             enable_traditional_pipeline: bool = True,
     ) -> 'BaseTask':
         """
diff --git a/autoPyTorch/api/tabular_classification.py b/autoPyTorch/api/tabular_classification.py
@@ -275,6 +275,7 @@ def search(
                          y_test=y_test,
                          dataset_name=dataset_name)
 
+        assert self.dataset is not None, "Something went wrong, expected dataset to be initialised"
         return self._search(
             dataset=self.dataset,
             optimize_metric=optimize_metric,
diff --git a/autoPyTorch/api/tabular_regression.py b/autoPyTorch/api/tabular_regression.py
@@ -261,6 +261,7 @@ def search(
                          y_test=y_test,
                          dataset_name=dataset_name)
 
+        assert self.dataset is not None, "Something went wrong, expected dataset to be initialised"
         return self._search(
             dataset=self.dataset,
             optimize_metric=optimize_metric,
diff --git a/autoPyTorch/data/tabular_feature_validator.py b/autoPyTorch/data/tabular_feature_validator.py
@@ -9,35 +9,32 @@
 import scipy.sparse
 
 import sklearn.utils
-
 from sklearn.base import BaseEstimator
 from sklearn.compose import ColumnTransformer
 from sklearn.exceptions import NotFittedError
-from sklearn.pipeline import make_pipeline
-from sklearn.preprocessing import OneHotEncoder
 from sklearn.impute import SimpleImputer
-from sklearn.preprocessing import StandardScaler
+from sklearn.pipeline import make_pipeline
+from sklearn.preprocessing import OneHotEncoder, StandardScaler
 
 from autoPyTorch.data.base_feature_validator import BaseFeatureValidator, SUPPORTED_FEAT_TYPES
 
 
 def _create_column_transformer(
-        preprocessors: Dict[str, List[BaseEstimator]],
-        numerical_columns: List[str],
-        categorical_columns: List[str]
+    preprocessors: Dict[str, List[BaseEstimator]],
+    numerical_columns: List[str],
+    categorical_columns: List[str]
 ) -> ColumnTransformer:
     """
-    Given a dictionary of preprocessors, this function 
-    creates a sklearn column transformer with appropriate 
-    columns associated with their preprocessors. 
+    Given a dictionary of preprocessors, this function
+    creates a sklearn column transformer with appropriate
+    columns associated with their preprocessors.
     Args:
-        preprocessors (Dict[str, List]): 
+        preprocessors (Dict[str, List]):
             Dictionary containing list of numerical and categorical preprocessors.
         numerical_columns (List[int]):
             List of names of numerical columns
         categorical_columns (List[int]):
             List of names of categorical columns
-            
     Returns:
         ColumnTransformer
     """
@@ -57,12 +54,12 @@ def _create_column_transformer(
 
 def get_tabular_preprocessors() -> Dict[str, List[BaseEstimator]]:
     """
-    This function creates a Dictionary containing list 
+    This function creates a Dictionary containing list
     of numerical and categorical preprocessors
     Returns:
-
+        Dict[str, List[BaseEstimator]]
     """
-    preprocessors = dict()
+    preprocessors: Dict[str, List[BaseEstimator]] = dict()
     preprocessors['numerical'] = list()
     preprocessors['categorical'] = list()
 
@@ -144,12 +141,12 @@ def comparator(cmp1: str, cmp2: str) -> int:
             )
 
             if len(categorical_columns) > 0:
-                print(self.column_transformer.named_transformers_['categorical_pipeline'].named_steps)
                 self.categories = [
                     # We fit an ordinal encoder, where all categorical
                     # columns are shifted to the left
                     list(range(len(cat)))
-                    for cat in self.column_transformer.named_transformers_['categorical_pipeline'].named_steps['onehotencoder'].categories_
+                    for cat in self.column_transformer.named_transformers_[
+                        'categorical_pipeline'].named_steps['onehotencoder'].categories_
                 ]
 
             for i, type_ in enumerate(self.feat_type):
@@ -284,7 +281,7 @@ def _check_data(
                     raise ValueError("Changing the column order of the features after fit() is "
                                      "not supported. Fit() method was called with "
                                      "{} whereas the new features have {} as type".format(self.column_order,
-                                                                                          column_order,)
+                                                                                          column_order, )
                                      )
             else:
                 self.column_order = column_order
@@ -411,7 +408,7 @@ def list_to_dataframe(
 
     @staticmethod
     def numpy_array_to_pandas(
-            X: np.ndarray,
+        X: np.ndarray,
     ) -> pd.DataFrame:
         """
         Converts a numpy array to pandas for type inference
@@ -457,7 +454,9 @@ def infer_objects(self, X: pd.DataFrame) -> pd.DataFrame:
         self.logger.debug(f"Infer Objects: {self.object_dtype_mapping}")
         return X
 
-    def impute_nan_in_categories(self, X: pd.DataFrame, categorical_columns=None) -> pd.DataFrame:
+    def impute_nan_in_categories(self,
+                                 X: pd.DataFrame
+                                 ) -> pd.DataFrame:
         """
         impute missing values before encoding,
         remove once sklearn natively supports
@@ -489,8 +488,7 @@ def impute_nan_in_categories(self, X: pd.DataFrame, categorical_columns=None) ->
                     if can_cast_as_number:
                         # In this case, we expect to have a number as category
                         # it might be string, but its value represent a number
-                        missing_value: Union[str, int] = '-1' if isinstance(X[column].dropna().values[0],
-                                                                                   str) else -1
+                        missing_value: Union[str, int] = '-1' if isinstance(X[column].dropna().values[0], str) else -1
                     else:
                         missing_value = 'Missing!'
 
@@ -509,4 +507,4 @@ def impute_nan_in_categories(self, X: pd.DataFrame, categorical_columns=None) ->
                 X[column].cat.add_categories([self.dict_missing_value_per_col[column]],
                                              inplace=True)
                 X.fillna({column: self.dict_missing_value_per_col[column]}, inplace=True)
-        return X
+        return X
diff --git a/autoPyTorch/datasets/base_dataset.py b/autoPyTorch/datasets/base_dataset.py
@@ -330,13 +330,19 @@ def get_dataset_for_training(self, split_id: int, train: bool, subset: int = 0)
         to provide training data to fit a pipeline
 
         Args:
-            split (int): The desired subset of the dataset to split and use
+            split_id (int): which split id to get from the splits
+            train (bool): whether the train or valid transforms are to be applied
+            subset (int, default=0): 0 is for train_indices, 1 is for valid_indices
 
         Returns:
+
             Dataset: the reduced dataset to be used for testing
         """
         # Subset creates a dataset. Splits is a (train_indices, test_indices) tuple
-        return TransformSubset(self, self.splits[split_id][subset], train=train)
+        assert split_id <= len(self.splits), "Expected split id to be less than length of splits"
+        indices = self.splits[split_id][subset]
+        assert indices is not None, "Trying to get subset when it does not exist"
+        return TransformSubset(self, indices, train=train)
 
     def replace_data(self, X_train: BaseDatasetInputType,
                      X_test: Optional[BaseDatasetInputType]) -> 'BaseDataset':
diff --git a/autoPyTorch/pipeline/base_pipeline.py b/autoPyTorch/pipeline/base_pipeline.py
@@ -451,13 +451,14 @@ def _check_search_space_updates(self, include: Optional[Dict[str, Any]],
                             continue
                         raise ValueError("Unknown hyperparameter for component {}. "
                                          "Expected update hyperparameter "
-                                         "to be in {} got {}. choice is {}".format(node.__class__.__name__,
-                                                                     component.
-                                                                     get_hyperparameter_search_space(
-                                                                         dataset_properties=self.dataset_properties).
-                                                                     get_hyperparameter_names(),
-                                                                     split_hyperparameter[1],
-                                                                                   component.__name__))
+                                         "to be in {} got {}."
+                                         " component is {}".format(node.__class__.__name__,
+                                                                   component.get_hyperparameter_search_space(
+                                                                       dataset_properties=self.dataset_properties
+                                                                   ).get_hyperparameter_names(),
+                                                                   split_hyperparameter[1],
+                                                                   component.__name__)
+                                         )
             else:
                 if update.hyperparameter not in node.get_hyperparameter_search_space(
                         dataset_properties=self.dataset_properties):
diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/TabularColumnTransformer.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/TabularColumnTransformer.py
@@ -3,14 +3,14 @@
 import numpy as np
 
 from sklearn.compose import ColumnTransformer
-from sklearn.pipeline import make_pipeline
-import time
+# from sklearn.pipeline import make_pipeline
+
 import torch
 
 from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.base_tabular_preprocessing import (
     autoPyTorchTabularPreprocessingComponent
 )
-from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.utils import get_tabular_preprocessers
+# from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.utils import get_tabular_preprocessers
 from autoPyTorch.utils.common import FitRequirement, subsampler
 
 
diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/utils.py b/autoPyTorch/pipeline/components/setup/network_backbone/utils.py
@@ -117,7 +117,7 @@ def shake_get_alpha_beta(
         beta = torch.FloatTensor([0.5])
     elif method == 'M3':
         beta = torch.FloatTensor(
-            [torch.rand(1)*(0.5 - alpha)*alpha if alpha < 0.5 else torch.rand(1)*(alpha - 0.5)*alpha]
+            [torch.rand(1) * (0.5 - alpha) * alpha if alpha < 0.5 else torch.rand(1) * (alpha - 0.5) * alpha]
         )
     else:
         raise ValueError("Unknown method for ShakeShakeRegularisation in NetworkBackbone")
diff --git a/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py b/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py
@@ -1,5 +1,5 @@
-import copy
-from typing import Any, Dict, Optional, Tuple
+# import copy
+from typing import Any, Dict, Optional  # , Tuple
 
 import numpy as np
 
@@ -30,23 +30,23 @@ def transform(self, X: Dict[str, Any]) -> Dict[str, Any]:
 
     def build_embedding(self, num_input_features: np.ndarray, num_numerical_features: int) -> nn.Module:
         raise NotImplementedError
-
-    def _get_args(self, X: Dict[str, Any]) -> Tuple[int, np.ndarray]:
-        # Feature preprocessors can alter numerical columns
-        # if len(X['dataset_properties']['numerical_columns']) == 0:
-        #     num_numerical_columns = 0
-        # else:
-        #     X_train = copy.deepcopy(X['backend'].load_datamanager().train_tensors[0][:2])
-        #
-        #     numerical_column_transformer = X['tabular_transformer'].preprocessor. \
-        #         named_transformers_['numerical_pipeline']
-        #     num_numerical_columns = numerical_column_transformer.transform(
-        #         X_train[:, X['dataset_properties']['numerical_columns']]).shape[1]
-        # num_input_features = np.zeros((num_numerical_columns + len(X['dataset_properties']['categorical_columns'])),
-        #                               dtype=int)
-        # categories = X['dataset_properties']['categories']
-        #
-        # for i, category in enumerate(categories):
-        #     num_input_features[num_numerical_columns + i, ] = len(category)
-        # return num_numerical_columns, num_input_features
-        return None, None
+    #
+    # def _get_args(self, X: Dict[str, Any]) -> Tuple[int, np.ndarray]:
+    #     # Feature preprocessors can alter numerical columns
+    #     # if len(X['dataset_properties']['numerical_columns']) == 0:
+    #     #     num_numerical_columns = 0
+    #     # else:
+    #     #     X_train = copy.deepcopy(X['backend'].load_datamanager().train_tensors[0][:2])
+    #     #
+    #     #     numerical_column_transformer = X['tabular_transformer'].preprocessor. \
+    #     #         named_transformers_['numerical_pipeline']
+    #     #     num_numerical_columns = numerical_column_transformer.transform(
+    #     #         X_train[:, X['dataset_properties']['numerical_columns']]).shape[1]
+    #     # num_input_features = np.zeros((num_numerical_columns + len(X['dataset_properties']['categorical_columns'])),
+    #     #                               dtype=int)
+    #     # categories = X['dataset_properties']['categories']
+    #     #
+    #     # for i, category in enumerate(categories):
+    #     #     num_input_features[num_numerical_columns + i, ] = len(category)
+    #     # return num_numerical_columns, num_input_features
+    #     return None, None
diff --git a/autoPyTorch/pipeline/components/training/trainer/RowCutOutTrainer.py b/autoPyTorch/pipeline/components/training/trainer/RowCutOutTrainer.py
@@ -2,7 +2,7 @@
 
 import numpy as np
 
-import torch
+# import torch
 
 from autoPyTorch.pipeline.components.training.trainer.base_trainer import BaseTrainerComponent
 from autoPyTorch.pipeline.components.training.trainer.cutout_utils import CutOut
@@ -40,14 +40,15 @@ def data_preparation(self, X: np.ndarray, y: np.ndarray,
         indices = self.random_state.choice(range(size), max(1, np.int32(size * self.patch_ratio)),
                                            replace=False)
 
-        """if not isinstance(self.numerical_columns, typing.Iterable):
+        """
+        if not isinstance(self.numerical_columns, typing.Iterable):
             raise ValueError("{} requires numerical columns information of {}"
                              "to prepare data got {}.".format(self.__class__.__name__,
                                                               typing.Iterable,
                                                               self.numerical_columns))
         numerical_indices = torch.tensor(self.numerical_columns)
         categorical_indices = torch.tensor([index for index in indices if index not in self.numerical_columns])
-    
+
         # We use an ordinal encoder on the categorical columns of tabular data
         # -1 is the conceptual equivalent to 0 in a image, that does not
         # have color as a feature and hence the network has to learn to deal
diff --git a/autoPyTorch/pipeline/components/training/trainer/base_trainer.py b/autoPyTorch/pipeline/components/training/trainer/base_trainer.py
@@ -221,8 +221,6 @@ def __init__(self, weighted_loss: int = 0,
         self.add_fit_requirements([
             FitRequirement("is_cyclic_scheduler", (bool,), user_defined=False, dataset_property=False),
         ])
-        self.batch_fit_times = []
-        self.data_loading_times = []
 
     def prepare(
         self,
diff --git a/autoPyTorch/pipeline/tabular_classification.py b/autoPyTorch/pipeline/tabular_classification.py
@@ -7,7 +7,6 @@
 
 import numpy as np
 
-import sklearn.preprocessing
 from sklearn.base import ClassifierMixin
 
 import torch
@@ -91,7 +90,7 @@ def _predict_proba(self, X: np.ndarray) -> np.ndarray:
         loader = self.named_steps['data_loader'].get_loader(X=X)
         pred = self.named_steps['network'].predict(loader)
         if isinstance(self.dataset_properties['output_shape'], int):
-          return pred
+            return pred
 
         else:
             all_proba = []
@@ -140,11 +139,6 @@ def predict_proba(self, X: np.ndarray, batch_size: Optional[int] = None) -> np.n
                     pred_prob = self.predict_proba(X[batch_from:batch_to], batch_size=None)
                     y[batch_from:batch_to] = pred_prob.astype(np.float32)
 
-        # Neural networks might not be fit to produce a [0-1] output
-        # For instance, after small number of epochs.
-        # y = np.clip(y, 0, 1)
-        # y = sklearn.preprocessing.normalize(y, axis=1, norm='l1')
-
         return y
 
     def _get_hyperparameter_search_space(self,
diff --git a/autoPyTorch/utils/backend.py b/autoPyTorch/utils/backend.py
@@ -328,7 +328,7 @@ def load_datamanager(self) -> BaseDataset:
             with open(filepath, 'rb') as fh:
                 return pickle.load(fh)
 
-    def replace_datamanager(self, datamanager: BaseDataset):
+    def replace_datamanager(self, datamanager: BaseDataset) -> None:
         warnings.warn("Original dataset will be overwritten with the provided dataset")
         os.remove(self._get_datamanager_pickle_filename())
         self.save_datamanager(datamanager=datamanager)

Original file line number	Diff line number	Diff line change
`@@ -117,7 +117,7 @@ def shake_get_alpha_beta(`
`117`	`117`	`beta = torch.FloatTensor([0.5])`
`118`	`118`	`elif method == 'M3':`
`119`	`119`	`beta = torch.FloatTensor(`
`120`		`- [torch.rand(1)*(0.5 - alpha)*alpha if alpha < 0.5 else torch.rand(1)*(alpha - 0.5)*alpha]`
	`120`	`+ [torch.rand(1) * (0.5 - alpha) * alpha if alpha < 0.5 else torch.rand(1) * (alpha - 0.5) * alpha]`
`121`	`121`	`)`
`122`	`122`	`else:`
`123`	`123`	`raise ValueError("Unknown method for ShakeShakeRegularisation in NetworkBackbone")`