Skip to content
Merged
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -1,9 +1,7 @@
from typing import Any, Dict, List, Optional, Union

from ConfigSpace.configuration_space import ConfigurationSpace
from ConfigSpace.hyperparameters import (
CategoricalHyperparameter
)
from ConfigSpace.hyperparameters import CategoricalHyperparameter

import numpy as np

Expand All @@ -15,92 +13,154 @@


class SimpleImputer(BaseImputer):
"""An imputer for categorical and numerical columns
Impute missing values for categorical columns with 'constant_!missing!'
Note:
In case of numpy data, the constant value is set to -1, under the assumption
that categorical data is fit with an Ordinal Scaler.
"""
Impute missing values for categorical columns with '!missing!'
(In case of numpy data, the constant value is set to -1, under
the assumption that categorical data is fit with an Ordinal Scaler)
"""

def __init__(self,
random_state: Optional[Union[np.random.RandomState, int]] = None,
numerical_strategy: str = 'mean',
categorical_strategy: str = 'most_frequent'):
def __init__(
self,
random_state: Optional[Union[np.random.RandomState, int]] = None,
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We removed integer type from random_state, so it must be Optional[np.random.RandomState]

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Okay, will do. As an fyi, this can cause different output if you run the same function twice on the same object. The random state produces a sequence of numbers.

For example, if you create a single RandomState object and pass it to every object that requires a random_state, you will get different output depending on the order in which objects use that random_state. On the flip-side, if you use an int, they are independent of each other and so it doesn't matter which order objects use it.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, that is also true. My explanation was not sufficient, but we, in reality, decided to use seed for int and random_state for np.random.RandomState.
So it is a very good decision if we switch to seed instead of random_state in the future.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We kept random_state as the arg name and allow for both, internally we just pass the seed argument given at construction of an AutoSklearnClassifier so that internally it's an int passed throughout.

We follow sklearn in principle so we copy their expected behaviour.

numerical_strategy: str = 'mean',
categorical_strategy: str = 'most_frequent'
):
"""
Parameters
----------
random_state: Optional[Union[np.random.RandomState, int]] = None
The random state to use for the imputer
numerical_strategy: str = 'mean',
The strategy to use for imputing numerical columns.
Can be one of ['mean', 'median', 'most_frequent', 'constant', 'constant_zero']
Note:
Using 'constant' defaults to a fill_value of 0, whereas the categorical
'constant_!missing!' strategy uses a fill_value of -1. This behaviour should probably be fixed.
categorical_strategy: str = 'most_frequent'
The strategy to use for imputing categorical columns.
Can be one of ['mean', 'median', 'most_frequent', 'constant_!missing!']
"""
super().__init__()
self.random_state = random_state
self.numerical_strategy = numerical_strategy
self.categorical_strategy = categorical_strategy

def fit(self, X: Dict[str, Any], y: Optional[Any] = None) -> BaseImputer:
    """Choose the imputers for the categorical and numerical columns.

    No imputer is fitted on data here. The method only instantiates one
    `SklearnSimpleImputer` per column group that is present and stores it
    in `self.preprocessor` under the keys 'categorical' and 'numerical'.

    Parameters
    ----------
    X: Dict[str, Any]
        The fit dictionary. `X['dataset_properties']` must provide the
        entries 'categorical_columns' and 'numerical_columns'.
    y: Optional[Any] = None
        Only forwarded to `check_requirements`; otherwise unused.

    Returns
    -------
    SimpleImputer
        returns self
    """
    self.check_requirements(X, y)

    dataset_properties = X['dataset_properties']

    # Choose an imputer for any categorical columns
    categorical_columns = dataset_properties['categorical_columns']

    if isinstance(categorical_columns, List) and len(categorical_columns) != 0:
        if self.categorical_strategy == 'constant_!missing!':
            # Train data is numpy as of this point, where an Ordinal Encoding is used
            # for categoricals. Only numbers are allowed for `fill_value`
            imputer = SklearnSimpleImputer(strategy='constant', fill_value=-1, copy=False)
        else:
            imputer = SklearnSimpleImputer(strategy=self.categorical_strategy, copy=False)
        self.preprocessor['categorical'] = imputer

    # Choose an imputer for any numerical columns
    numerical_columns = dataset_properties['numerical_columns']

    if isinstance(numerical_columns, List) and len(numerical_columns) != 0:
        if self.numerical_strategy == 'constant_zero':
            # 'constant_zero' is not a strategy name the underlying imputer is given;
            # it is translated to strategy='constant' with an explicit fill value of 0
            imputer = SklearnSimpleImputer(strategy='constant', fill_value=0, copy=False)
        else:
            imputer = SklearnSimpleImputer(strategy=self.numerical_strategy, copy=False)
        self.preprocessor['numerical'] = imputer

    return self

@staticmethod
def get_hyperparameter_search_space(
    dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None,
    numerical_strategy: HyperparameterSearchSpace = HyperparameterSearchSpace(
        hyperparameter='numerical_strategy',
        value_range=("mean", "median", "most_frequent", "constant_zero"),
        default_value="mean",
    ),
    categorical_strategy: HyperparameterSearchSpace = HyperparameterSearchSpace(
        hyperparameter='categorical_strategy',
        value_range=("most_frequent", "constant_!missing!"),
        default_value="most_frequent"
    )
) -> ConfigurationSpace:
    """Get the hyperparameter search space for the SimpleImputer

    A strategy hyperparameter is only added for a column group (numerical
    or categorical) when `dataset_properties` lists at least one column of
    that group.

    Parameters
    ----------
    dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None
        Properties that describe the dataset. Must not be None and must
        contain the keys 'numerical_columns' and 'categorical_columns'.
    numerical_strategy: HyperparameterSearchSpace = HyperparameterSearchSpace(...)
        The strategy to use for numerical imputation
    categorical_strategy: HyperparameterSearchSpace = HyperparameterSearchSpace(...)
        The strategy to use for categorical imputation

    Returns
    -------
    ConfigurationSpace
        The space of possible configurations for a SimpleImputer with the given
        `dataset_properties`

    Raises
    ------
    ValueError
        If `dataset_properties` is None.
    """
    cs = ConfigurationSpace()

    if dataset_properties is None:
        raise ValueError("SimpleImputer requires `dataset_properties` for generating"
                         " a search space.")

    numerical_columns = dataset_properties['numerical_columns']
    if isinstance(numerical_columns, List) and len(numerical_columns) != 0:
        add_hyperparameter(cs, numerical_strategy, CategoricalHyperparameter)

    categorical_columns = dataset_properties['categorical_columns']
    if isinstance(categorical_columns, List) and len(categorical_columns) != 0:
        add_hyperparameter(cs, categorical_strategy, CategoricalHyperparameter)

    return cs

@staticmethod
def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None
) -> Dict[str, Union[str, bool]]:
def get_properties(
dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None
) -> Dict[str, Union[str, bool]]:
"""Get the properties of the SimpleImputer class and what it can handle
Returns
-------
Dict[str, Union[str, bool]]
A dict from property names to values
"""
return {
'shortname': 'SimpleImputer',
'name': 'Simple Imputer',
Expand Down