diff --git a/category_encoders/leave_one_out.py b/category_encoders/leave_one_out.py
index efdaee3f..af902db4 100644
--- a/category_encoders/leave_one_out.py
+++ b/category_encoders/leave_one_out.py
@@ -11,6 +11,10 @@ class LeaveOneOutEncoder(BaseEstimator, TransformerMixin):
     """Leave one out coding for categorical features.
 
+    This is very similar to target encoding, but excludes the current row's
+    target when calculating the mean target for a level to reduce the effect
+    of outliers.
+
     Parameters
     ----------
 
@@ -28,10 +32,9 @@ class LeaveOneOutEncoder(BaseEstimator, TransformerMixin):
         options are 'error', 'ignore' and 'impute', defaults to 'impute', which will impute the category -1. Warning: if
         impute is used, an extra column will be added in if the transform matrix has unknown categories.  This can causes
         unexpected changes in dimension in some cases.
-    randomized: bool
-        adds normal (Gaussian) distribution noise into training data in order to decrease overfitting (testing data are untouched).
     sigma: float
-        standard deviation (spread or "width") of the normal distribution.
+        adds normal (Gaussian) distribution noise into training data in order to decrease overfitting (testing data are untouched).
+        sigma gives the standard deviation (spread or "width") of the normal distribution.
 
     Example
     -------
@@ -69,13 +72,10 @@ class LeaveOneOutEncoder(BaseEstimator, TransformerMixin):
     .. [1] Strategies to encode categorical variables with many categories. from
     https://www.kaggle.com/c/caterpillar-tube-pricing/discussion/15748#143154.
 
-
-
-
     """
 
     def __init__(self, verbose=0, cols=None, drop_invariant=False, return_df=True, impute_missing=True,
-                 handle_unknown='impute', random_state=None, randomized=False, sigma=0.05):
+                 handle_unknown='impute', random_state=None, sigma=None):
         self.return_df = return_df
         self.drop_invariant = drop_invariant
         self.drop_cols = []
@@ -88,7 +88,6 @@ def __init__(self, verbose=0, cols=None, drop_invariant=False, return_df=True, i
         self.handle_unknown = handle_unknown
         self._mean = None
         self.random_state = random_state
-        self.randomized = randomized
         self.sigma = sigma
 
     def fit(self, X, y, **kwargs):
@@ -114,9 +113,9 @@ def fit(self, X, y, **kwargs):
         # first check the type
         X = util.convert_input(X)
         if isinstance(y, pd.DataFrame):
-            y = y.iloc[:,0]
+            y = y.iloc[:, 0].astype(float)
         else:
-            y = pd.Series(y, name='target')
+            y = pd.Series(y, name='target', dtype=float)
 
         if X.shape[0] != y.shape[0]:
             raise ValueError("The length of X is " + str(X.shape[0]) + " but length of y is " + str(y.shape[0]) + ".")
@@ -175,9 +174,9 @@ def transform(self, X, y=None):
         # if we are encoding the training data, we have to check the target
         if y is not None:
             if isinstance(y, pd.DataFrame):
-                y = y.iloc[:, 0]
+                y = y.iloc[:, 0].astype(float)
             else:
-                y = pd.Series(y, name='target')
+                y = pd.Series(y, name='target', dtype=float)
             if X.shape[0] != y.shape[0]:
                 raise ValueError("The length of X is " + str(X.shape[0]) + " but length of y is " + str(y.shape[0]) + ".")
 
@@ -215,16 +214,7 @@ def fit_leave_one_out(self, X_in, y, cols=None):
             cols = X.columns.values
 
         self._mean = y.mean()
-        mapping_out = []
-
-        for col in cols:
-            tmp = y.groupby(X[col]).agg(['sum', 'count'])
-            tmp['mean'] = tmp['sum'] / tmp['count']
-            tmp = tmp.to_dict(orient='index')
-
-            mapping_out.append({'col': col, 'mapping': tmp}, )
-
-        return mapping_out
+        return {col: y.groupby(X[col]).agg(['sum', 'count']) for col in cols}
 
     def transform_leave_one_out(self, X_in, y, mapping=None, impute_missing=True, handle_unknown='impute'):
         """
@@ -232,34 +222,28 @@ def transform_leave_one_out(self, X_in, y, mapping=None, impute_missing=True, ha
         """
         X = X_in.copy(deep=True)
-
         random_state_ = check_random_state(self.random_state)
-        for switch in mapping:
-            column = switch.get('col')
-            transformed_column = pd.Series([np.nan] * X.shape[0], name=column)
-
-            for val in switch.get('mapping'):
-                if y is None:
-                    transformed_column.loc[X[column] == val] = switch.get('mapping')[val]['mean']
-                elif switch.get('mapping')[val]['count'] == 1:
-                    transformed_column.loc[X[column] == val] = self._mean
-                else:
-                    transformed_column.loc[X[column] == val] = (
-                        (switch.get('mapping')[val]['sum'] - y[(X[column] == val).values]) / (
-                            switch.get('mapping')[val]['count'] - 1)
-                    )
+
+        for col, colmap in mapping.items():
+            level_notunique = colmap['count'] > 1
+            if y is None:    # Replace level with its mean target; if level occurs only once, use global mean
+                level_means = (colmap['sum'] / colmap['count']).where(level_notunique, self._mean)
+                X[col] = X[col].map(level_means)
+            else:            # Replace level with its mean target, calculated excluding this row's target
+                # The y (target) mean for this level is normally just the sum/count;
+                # excluding this row's y, it's (sum - y) / (count - 1)
+                level_means = (X[col].map(colmap['sum']) - y) / (X[col].map(colmap['count']) - 1)
+                # The 'where' fills in singleton levels (count = 1 -> div by 0) with the global mean
+                X[col] = level_means.where(X[col].map(colmap['count'][level_notunique]).notnull(), self._mean)
 
             if impute_missing:
                 if handle_unknown == 'impute':
-                    transformed_column.fillna(self._mean, inplace=True)
+                    X[col].fillna(self._mean, inplace=True)
                 elif handle_unknown == 'error':
-                    missing = transformed_column.isnull()
-                    if any(missing):
-                        raise ValueError('Unexpected categories found in column %s' % column)
-
-            if self.randomized and y is not None:
-                transformed_column = (transformed_column * random_state_.normal(1., self.sigma, transformed_column.shape[0]))
+                    if X[col].isnull().any():
+                        raise ValueError('Unexpected categories found in column %s' % col)
 
-            X[column] = transformed_column.astype(float)
+            if self.sigma is not None and y is not None:
+                X[col] = X[col] * random_state_.normal(1., self.sigma, X[col].shape[0])
 
         return X
 
diff --git a/category_encoders/tests/test_leave_one_out.py b/category_encoders/tests/test_leave_one_out.py
index 6e0553bc..12e2f5af 100644
--- a/category_encoders/tests/test_leave_one_out.py
+++ b/category_encoders/tests/test_leave_one_out.py
@@ -19,7 +19,7 @@ class TestLeaveOneOutEncoder(TestCase):
 
     def test_leave_one_out(self):
-        enc = encoders.LeaveOneOutEncoder(verbose=1, randomized=True, sigma=0.1)
+        enc = encoders.LeaveOneOutEncoder(verbose=1, sigma=0.1)
         enc.fit(X, y)
         tu.verify_numeric(enc.transform(X_t))
         tu.verify_numeric(enc.transform(X_t, y_t))
 
@@ -32,7 +32,7 @@ def test_leave_one_out_values(self):
         X = df.drop('outcome', axis=1)
         y = df.drop('color', axis=1)
 
-        ce_leave = encoders.LeaveOneOutEncoder(cols=['color'], randomized=False)
+        ce_leave = encoders.LeaveOneOutEncoder(cols=['color'])
         obtained = ce_leave.fit_transform(X, y['outcome'])
 
         self.assertEqual([0.0, 0.5, 0.5, 0.5, 1.0, 0.5], list(obtained['color']))
@@ -46,7 +46,17 @@ def test_leave_one_out_fit_callTwiceOnDifferentData_ExpectRefit(self):
         encoder.fit(x_b, y_dummy)
         mapping = encoder.mapping
         self.assertEqual(1, len(mapping))
-        col_b_mapping = mapping[0]
-        self.assertEqual('col_b', col_b_mapping['col'])  # the model must get updated
-        self.assertEqual({'sum': 2.0, 'count': 3, 'mean': 2.0/3.0}, col_b_mapping['mapping']['1'])
-        self.assertEqual({'sum': 1.0, 'count': 3, 'mean': 1.0/3.0}, col_b_mapping['mapping']['2'])
+        self.assertIn('col_b', mapping)  # the model should have the updated mapping
+        expected = pd.DataFrame({'sum': [2.0, 1.0], 'count': [3, 3]}, index=['1', '2'])
+        pd.testing.assert_frame_equal(expected, mapping['col_b'], check_like=True)
+
+    def test_leave_one_out_unique(self):
+        X = pd.DataFrame(data=['1', '2', '2', '2', '3'], columns=['col'])
+        y = np.array([1, 0, 1, 0, 1])
+
+        encoder = encoders.LeaveOneOutEncoder(impute_missing=False)
+        result = encoder.fit(X, y).transform(X, y)
+
+        self.assertFalse(result.isnull().any().any(), 'There should not be any missing value')
+        expected = pd.DataFrame(data=[y.mean(), 0.5, 0, 0.5, y.mean()], columns=['col'])
+        pd.testing.assert_frame_equal(expected, result)
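
For reference, the arithmetic behind the vectorized transform above can be traced with a standalone sketch; the toy frame, target, and variable names below are illustrative only, not part of the patch:

    import pandas as pd

    X = pd.DataFrame({'col': ['a', 'a', 'a', 'b']})
    y = pd.Series([1.0, 0.0, 1.0, 1.0])

    # fit: per-level 'sum' and 'count', the same shape fit_leave_one_out now returns
    colmap = y.groupby(X['col']).agg(['sum', 'count'])

    # transform with y given: exclude the current row's target from its level mean,
    # i.e. (sum - y) / (count - 1)
    loo = (X['col'].map(colmap['sum']) - y) / (X['col'].map(colmap['count']) - 1)
    # level 'a' (sum=2, count=3): (2 - y_i) / 2 -> 0.5, 1.0, 0.5
    # the singleton level 'b' divides 0 by 0 -> NaN, so fall back to the global mean,
    # mirroring the .where(..., self._mean) fallback in the patch
    loo = loo.where(X['col'].map(colmap['count']) > 1, y.mean())
    print(loo.tolist())  # [0.5, 1.0, 0.5, 0.75]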