diff --git a/sklearn_pandas/categorical_imputer.py b/sklearn_pandas/categorical_imputer.py
index 5c79477..3f65b51 100644
--- a/sklearn_pandas/categorical_imputer.py
+++ b/sklearn_pandas/categorical_imputer.py
@@ -1,35 +1,55 @@
-"""
-
-Impute missing values from a categorical/string np.ndarray or pd.Series with the most frequent value on the training data.
-
-"""
-
 import pandas as pd
 import numpy as np
 
-from sklearn.base import TransformerMixin
+from collections import Counter
+from sklearn.base import BaseEstimator, TransformerMixin
+from sklearn.utils.validation import check_is_fitted
 
 
-class CategoricalImputer(TransformerMixin):
 
+def _get_mask(X, value):
     """
+    Compute the boolean mask X == missing_values.
+
+    "NaN", None and np.nan all select the null entries of X (as reported
+    by ``pd.isnull``); any other value is compared with ``X == value``.
+    """
+    if value == "NaN" or \
+       value is None or \
+       (isinstance(value, float) and np.isnan(value)):
+        return pd.isnull(X)
+    else:
+        return X == value
 
-    Attributes
+
+class CategoricalImputer(BaseEstimator, TransformerMixin):
+    """
+    Impute missing values from a categorical/string np.ndarray or pd.Series
+    with the most frequent value on the training data.
+
+    Parameters
     ----------
+    missing_values : string or "NaN", optional (default="NaN")
+        The placeholder for the missing values. All occurrences of
+        `missing_values` will be imputed. None and np.nan are treated
+        as being the same, use the string value "NaN" for them.
+
+    copy : boolean, optional (default=True)
+        If True, a copy of X will be created.
 
-    fill : str
+    Attributes
+    ----------
+    fill_ : str
         Most frequent value of the training data.
 
     """
 
-    def __init__(self):
-
-        self.fill = None
-
-    def fit(self, X):
+    def __init__(self, missing_values='NaN', copy=True):
+        self.missing_values = missing_values
+        self.copy = copy
 
+    def fit(self, X, y=None):
         """
-
         Get the most frequent value.
 
         Parameters
@@ -37,22 +57,34 @@ def fit(self, X):
         X : np.ndarray or pd.Series
             Training data.
 
+        y : Passthrough for ``Pipeline`` compatibility.
+
         Returns
         -------
-        CategoricalImputer
-            Itself.
-
+        self: CategoricalImputer
+
+        Raises
+        ------
+        ValueError
+            If X is empty or contains only missing values.
         """
 
-        self.fill = pd.Series(X).mode().values[0]
+        mask = _get_mask(X, self.missing_values)
+        X = X[~mask]
+
+        counts = Counter(X)
+        if not counts:
+            # An empty Counter has no most_common entry to pick from.
+            raise ValueError('X is empty or contains only missing values; '
+                             'no fill value can be determined.')
+        self.fill_ = counts.most_common(1)[0][0]
 
         return self
 
     def transform(self, X):
-
         """
-
-        Replaces null values in the input data with the most frequent value of the training data.
+        Replaces missing values in the input data with the most frequent value
+        of the training data.
 
         Parameters
         ----------
@@ -63,11 +95,14 @@ def transform(self, X):
         -------
         np.ndarray
             Data with imputed values.
-
         """
 
-        X = X.copy()
+        check_is_fitted(self, 'fill_')
+
+        if self.copy:
+            X = X.copy()
 
-        X[pd.isnull(X)] = self.fill
+        mask = _get_mask(X, self.missing_values)
+        X[mask] = self.fill_
 
         return np.asarray(X)
diff --git a/tests/test_categorical_imputer.py b/tests/test_categorical_imputer.py
index 18f1063..811a393 100644
--- a/tests/test_categorical_imputer.py
+++ b/tests/test_categorical_imputer.py
@@ -6,6 +6,13 @@
 from sklearn_pandas import CategoricalImputer
 from sklearn_pandas import DataFrameMapper
 
+# In sklearn18 NotFittedError was moved from utils.validation
+# to exceptions module.
+try:
+    from sklearn.exceptions import NotFittedError
+except ImportError:
+    from sklearn.utils.validation import NotFittedError
+
 
 @pytest.mark.parametrize('none_value', [None, np.nan])
 @pytest.mark.parametrize('input_type', ['np', 'pd'])
@@ -16,7 +23,7 @@ def test_unit(input_type, none_value):
     if input_type == 'pd':
         X = pd.Series(data)
     else:
-        X = np.asarray(data)
+        X = np.asarray(data, dtype=object)
 
     Xc = X.copy()
 
@@ -24,8 +31,66 @@ def test_unit(input_type, none_value):
 
     assert (np.asarray(X) == np.asarray(Xc)).all()
     assert type(Xt) == np.ndarray
-    assert len(X) == len(Xt)
-    assert len(Xt[pd.isnull(Xt)]) == 0
+    assert (Xt == ['a', 'b', 'b', 'b']).all()
+
+
+@pytest.mark.parametrize('input_type', ['np', 'pd'])
+def test_missing_values_param(input_type):
+    """
+    A custom ``missing_values`` placeholder is replaced by the mode.
+    """
+
+    data = ['x', 'y', 'a_missing', 'y']
+
+    if input_type == 'pd':
+        X = pd.Series(data)
+    else:
+        X = np.asarray(data, dtype=object)
+
+    imp = CategoricalImputer(missing_values='a_missing')
+    Xt = imp.fit_transform(X)
+
+    assert (Xt == np.array(['x', 'y', 'y', 'y'])).all()
+
+
+@pytest.mark.parametrize('input_type', ['np', 'pd'])
+def test_copy_param(input_type):
+    """
+    With ``copy=False`` the imputation also modifies the input itself.
+    """
+
+    data = ['a', np.nan, 'b', 'a']
+
+    if input_type == 'pd':
+        X = pd.Series(data)
+    else:
+        X = np.asarray(data, dtype=object)
+
+    imp = CategoricalImputer(copy=False)
+    Xt = imp.fit_transform(X)
+
+    Xe = np.array(['a', 'a', 'b', 'a'])
+    assert (Xt == Xe).all()
+    assert (X == Xe).all()
+
+
+@pytest.mark.parametrize('input_type', ['np', 'pd'])
+def test_data_type(input_type):
+    """
+    Mixed-type object data is imputed; other entries stay unchanged.
+    """
+
+    data = ['a', np.nan, 'b', 3, 'a', 3, 'a', 4.5]
+
+    if input_type == 'pd':
+        X = pd.Series(data)
+    else:
+        X = np.asarray(data, dtype=object)
+
+    Xt = CategoricalImputer().fit_transform(X)
+
+    Xe = np.array(['a', 'a', 'b', 3, 'a', 3, 'a', 4.5], dtype=object)
+    assert (Xt == Xe).all()
 
 
 @pytest.mark.parametrize('none_value', [None, np.nan])
@@ -50,3 +115,12 @@ def test_integration(none_value):
     assert (df['cat'][val_idx] == df_t['cat'][val_idx]).all()
     assert (df_t['cat'][nan_idx] ==
             df['cat'].mode().values[0]).all()
+
+
+def test_not_fitted():
+    """
+    If imputer is not fitted, NotFittedError is raised.
+    """
+    imp = CategoricalImputer()
+    with pytest.raises(NotFittedError):
+        imp.transform(np.array(['a', 'b', 'b', None]))