diff --git a/.gitignore b/.gitignore
index b8803fc..941a393 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,4 +2,5 @@
 *.pyc
 .tox/
 build/
-dist/
\ No newline at end of file
+dist/
+.cache/
\ No newline at end of file
diff --git a/sklearn_pandas/__init__.py b/sklearn_pandas/__init__.py
index 11ee470..1a0fb07 100644
--- a/sklearn_pandas/__init__.py
+++ b/sklearn_pandas/__init__.py
@@ -2,3 +2,4 @@
 
 from .dataframe_mapper import DataFrameMapper  # NOQA
 from .cross_validation import cross_val_score, GridSearchCV, RandomizedSearchCV  # NOQA
+from .categorical_imputer import CategoricalImputer  # NOQA
diff --git a/sklearn_pandas/categorical_imputer.py b/sklearn_pandas/categorical_imputer.py
new file mode 100644
index 0000000..5c79477
--- /dev/null
+++ b/sklearn_pandas/categorical_imputer.py
@@ -0,0 +1,78 @@
+"""Impute missing values in categorical/string data.
+
+Missing values in an ``np.ndarray`` or ``pd.Series`` are replaced with the
+most frequent value observed in the training data.
+"""
+
+import pandas as pd
+import numpy as np
+
+from sklearn.base import BaseEstimator, TransformerMixin
+
+
+class CategoricalImputer(TransformerMixin, BaseEstimator):
+    """Fill null entries with the most frequent training value.
+
+    Attributes
+    ----------
+    fill : object
+        Most frequent non-null value of the training data; ``None`` until
+        ``fit`` has been called.
+    """
+
+    def __init__(self):
+        self.fill = None
+
+    def fit(self, X, y=None):
+        """Learn the most frequent non-null value of ``X``.
+
+        Parameters
+        ----------
+        X : np.ndarray or pd.Series
+            Training data.
+        y : object, optional
+            Ignored; accepted so the imputer can be fitted through the
+            usual scikit-learn ``fit(X, y)`` call.
+
+        Returns
+        -------
+        CategoricalImputer
+            Itself.
+
+        Raises
+        ------
+        ValueError
+            If no most frequent value can be determined (for example,
+            ``X`` is empty or holds only null values).
+        """
+        modes = pd.Series(X).mode()
+
+        # An empty result means there was nothing to count; fail with a
+        # clear message instead of an IndexError from ``values[0]``.
+        if modes.empty:
+            raise ValueError('Cannot determine the most frequent value: '
+                             'X is empty or contains only null values')
+
+        # When several values tie, the first one pandas returns is kept.
+        self.fill = modes.values[0]
+
+        return self
+
+    def transform(self, X):
+        """Replace null values in ``X`` with the fitted fill value.
+
+        Parameters
+        ----------
+        X : np.ndarray or pd.Series
+            Data with values to be imputed; it is copied, not modified.
+
+        Returns
+        -------
+        np.ndarray
+            Data with imputed values.
+        """
+        X = X.copy()
+
+        X[pd.isnull(X)] = self.fill
+
+        return np.asarray(X)
diff --git a/tests/test_string_imputer.py b/tests/test_string_imputer.py
new file mode 100644
index 0000000..605bc42
--- /dev/null
+++ b/tests/test_string_imputer.py
@@ -0,0 +1,70 @@
+import pytest
+
+import numpy as np
+import pandas as pd
+
+from sklearn_pandas import CategoricalImputer
+from sklearn_pandas import DataFrameMapper
+
+
+@pytest.mark.parametrize('none_value', [None, np.nan])
+@pytest.mark.parametrize('input_type', ['np', 'pd'])
+def test_unit(input_type, none_value):
+    """Nulls are replaced by the mode and the input is left untouched."""
+    data = ['a', 'b', 'b', none_value]
+
+    if input_type == 'pd':
+        X = pd.Series(data)
+    else:
+        # dtype=object keeps np.nan as a real null; without it numpy
+        # builds a string array holding the text 'nan'.
+        X = np.asarray(data, dtype=object)
+
+    Xc = X.copy()
+
+    Xt = CategoricalImputer().fit_transform(X)
+
+    # Null-aware comparison: NaN != NaN under a plain elementwise ``==``.
+    assert pd.Series(X).equals(pd.Series(Xc))
+    assert isinstance(Xt, np.ndarray)
+    assert len(X) == len(Xt)
+    assert not pd.isnull(Xt).any()
+    assert list(Xt) == ['a', 'b', 'b', 'b']
+
+
+def test_fit_accepts_y():
+    """fit() accepts the y argument that scikit-learn callers pass."""
+    X = pd.Series(['a', 'b', 'b', None])
+
+    assert CategoricalImputer().fit(X, [0, 1, 0, 1]).fill == 'b'
+
+
+@pytest.mark.parametrize('data', [[], [None, np.nan]])
+def test_no_mode_raises(data):
+    """Fitting on data without any non-null value is an explicit error."""
+    with pytest.raises(ValueError):
+        CategoricalImputer().fit(np.asarray(data, dtype=object))
+
+
+@pytest.mark.parametrize('none_value', [None, np.nan])
+def test_integration(none_value):
+    """The imputer works as a column transformer in a DataFrameMapper."""
+    df = pd.DataFrame({'cat': ['a', 'a', 'a', none_value, 'b'],
+                       'num': [1, 2, 3, 4, 5]})
+
+    mapper = DataFrameMapper([
+        ('cat', CategoricalImputer()),
+        ('num', None)
+    ], df_out=True).fit(df)
+
+    df_t = mapper.transform(df)
+
+    assert pd.notnull(df_t).all().all()
+
+    val_idx = pd.notnull(df['cat'])
+    nan_idx = ~val_idx
+
+    assert (df['num'] == df_t['num']).all()
+
+    assert (df['cat'][val_idx] == df_t['cat'][val_idx]).all()
+    assert (df_t['cat'][nan_idx] == df['cat'].mode().values[0]).all()