scikit-learn-contrib · janmotl · Oct 26, 2018 · Oct 23, 2018 · Oct 23, 2018
diff --git a/category_encoders/leave_one_out.py b/category_encoders/leave_one_out.py
@@ -11,6 +11,10 @@
 class LeaveOneOutEncoder(BaseEstimator, TransformerMixin):
     """Leave one out coding for categorical features.
 
+    This is very similar to target encoding, but excludes the current row's
+    target when calculating the mean target for a level to reduce the effect
+    of outliers.
+
     Parameters
     ----------
 
@@ -28,10 +32,9 @@ class LeaveOneOutEncoder(BaseEstimator, TransformerMixin):
         options are 'error', 'ignore' and 'impute', defaults to 'impute', which will impute the category -1. Warning: if
         impute is used, an extra column will be added in if the transform matrix has unknown categories.  This can causes
         unexpected changes in dimension in some cases.
-    randomized: bool
-        adds normal (Gaussian) distribution noise into training data in order to decrease overfitting (testing data are untouched).
     sigma: float
-        standard deviation (spread or "width") of the normal distribution.
+        standard deviation (spread or "width") of the multiplicative normal (Gaussian) noise, centred on 1, that is applied
+        to the encoded training data to decrease overfitting (testing data are untouched). None (the default) adds no noise.
 
     Example
     -------
@@ -69,13 +72,10 @@ class LeaveOneOutEncoder(BaseEstimator, TransformerMixin):
 
     .. [1] Strategies to encode categorical variables with many categories. from
     https://www.kaggle.com/c/caterpillar-tube-pricing/discussion/15748#143154.
-
-
-
     """
 
     def __init__(self, verbose=0, cols=None, drop_invariant=False, return_df=True, impute_missing=True,
-                 handle_unknown='impute', random_state=None, randomized=False, sigma=0.05):
+                 handle_unknown='impute', random_state=None, sigma=None):
         self.return_df = return_df
         self.drop_invariant = drop_invariant
         self.drop_cols = []
@@ -88,7 +88,6 @@ def __init__(self, verbose=0, cols=None, drop_invariant=False, return_df=True, i
         self.handle_unknown = handle_unknown
         self._mean = None
         self.random_state = random_state
-        self.randomized = randomized
         self.sigma = sigma
 
     def fit(self, X, y, **kwargs):
@@ -114,9 +113,9 @@ def fit(self, X, y, **kwargs):
         # first check the type
         X = util.convert_input(X)
         if isinstance(y, pd.DataFrame):
-            y = y.iloc[:,0]
+            y = y.iloc[:, 0].astype(float)
         else:
-            y = pd.Series(y, name='target')
+            y = pd.Series(y, name='target', dtype=float)
         if X.shape[0] != y.shape[0]:
             raise ValueError("The length of X is " + str(X.shape[0]) + " but length of y is " + str(y.shape[0]) + ".")
 
@@ -175,9 +174,9 @@ def transform(self, X, y=None):
         # if we are encoding the training data, we have to check the target
         if y is not None:
             if isinstance(y, pd.DataFrame):
-                y = y.iloc[:, 0]
+                y = y.iloc[:, 0].astype(float)
             else:
-                y = pd.Series(y, name='target')
+                y = pd.Series(y, name='target', dtype=float)
             if X.shape[0] != y.shape[0]:
                 raise ValueError("The length of X is " + str(X.shape[0]) + " but length of y is " + str(y.shape[0]) + ".")
 
@@ -215,51 +214,36 @@ def fit_leave_one_out(self, X_in, y, cols=None):
             cols = X.columns.values
 
         self._mean = y.mean()
-        mapping_out = []
-
-        for col in cols:
-            tmp = y.groupby(X[col]).agg(['sum', 'count'])
-            tmp['mean'] = tmp['sum'] / tmp['count']
-            tmp = tmp.to_dict(orient='index')
-
-            mapping_out.append({'col': col, 'mapping': tmp}, )
-
-        return mapping_out
+        return {col: y.groupby(X[col]).agg(['sum', 'count']) for col in cols}
 
     def transform_leave_one_out(self, X_in, y, mapping=None, impute_missing=True, handle_unknown='impute'):
         """
         Leave one out encoding uses a single column of floats to represent the means of the target variables.
         """
 
         X = X_in.copy(deep=True)
-
         random_state_ = check_random_state(self.random_state)
-        for switch in mapping:
-            column = switch.get('col')
-            transformed_column = pd.Series([np.nan] * X.shape[0], name=column)
-
-            for val in switch.get('mapping'):
-                if y is None:
-                    transformed_column.loc[X[column] == val] = switch.get('mapping')[val]['mean']
-                elif switch.get('mapping')[val]['count'] == 1:
-                    transformed_column.loc[X[column] == val] = self._mean
-                else:
-                    transformed_column.loc[X[column] == val] = (
-                        (switch.get('mapping')[val]['sum'] - y[(X[column] == val).values]) / (
-                            switch.get('mapping')[val]['count'] - 1)
-                    )
+
+        for col, colmap in mapping.items():
+            level_notunique = colmap['count'] > 1
+            if y is None:    # Replace level with its mean target; if level occurs only once, use global mean
+                level_means = (colmap['sum'] / colmap['count']).where(level_notunique, self._mean)
+                X[col] = X[col].map(level_means)
+            else:            # Replace level with its mean target, calculated excluding this row's target
+                # The y (target) mean for this level is normally just the sum/count;
+                # excluding this row's y, it's (sum - y) / (count - 1)
+                level_means = (X[col].map(colmap['sum']) - y) / (X[col].map(colmap['count']) - 1)
+                # The 'where' fills in singleton levels (count = 1 -> div by 0) with the global mean
+                X[col] = level_means.where(X[col].map(colmap['count'][level_notunique]).notnull(), self._mean)
 
             if impute_missing:
                 if handle_unknown == 'impute':
-                    transformed_column.fillna(self._mean, inplace=True)
+                    X[col] = X[col].fillna(self._mean)
                 elif handle_unknown == 'error':
-                    missing = transformed_column.isnull()
-                    if any(missing):
-                        raise ValueError('Unexpected categories found in column %s' % column)
-
-            if self.randomized and y is not None:
-                transformed_column = (transformed_column * random_state_.normal(1., self.sigma, transformed_column.shape[0]))
+                    if X[col].isnull().any():
+                        raise ValueError('Unexpected categories found in column %s' % col)
 
-            X[column] = transformed_column.astype(float)
+            if self.sigma is not None and y is not None:
+                X[col] = X[col] * random_state_.normal(1., self.sigma, X[col].shape[0])
 
         return X
diff --git a/category_encoders/tests/test_leave_one_out.py b/category_encoders/tests/test_leave_one_out.py
@@ -19,7 +19,7 @@
 class TestLeaveOneOutEncoder(TestCase):
 
     def test_leave_one_out(self):
-        enc = encoders.LeaveOneOutEncoder(verbose=1, randomized=True, sigma=0.1)
+        enc = encoders.LeaveOneOutEncoder(verbose=1, sigma=0.1)
         enc.fit(X, y)
         tu.verify_numeric(enc.transform(X_t))
         tu.verify_numeric(enc.transform(X_t, y_t))
@@ -32,7 +32,7 @@ def test_leave_one_out_values(self):
         X = df.drop('outcome', axis=1)
         y = df.drop('color', axis=1)
 
-        ce_leave = encoders.LeaveOneOutEncoder(cols=['color'], randomized=False)
+        ce_leave = encoders.LeaveOneOutEncoder(cols=['color'])
         obtained = ce_leave.fit_transform(X, y['outcome'])
 
         self.assertEqual([0.0, 0.5, 0.5, 0.5, 1.0, 0.5], list(obtained['color']))
@@ -46,7 +46,17 @@ def test_leave_one_out_fit_callTwiceOnDifferentData_ExpectRefit(self):
         encoder.fit(x_b, y_dummy)
         mapping = encoder.mapping
         self.assertEqual(1, len(mapping))
-        col_b_mapping = mapping[0]
-        self.assertEqual('col_b', col_b_mapping['col']) # the model must get updated
-        self.assertEqual({'sum': 2.0, 'count': 3, 'mean': 2.0/3.0}, col_b_mapping['mapping']['1'])
-        self.assertEqual({'sum': 1.0, 'count': 3, 'mean': 01.0/3.0}, col_b_mapping['mapping']['2'])
+        self.assertIn('col_b', mapping)     # the model should have the updated mapping
+        expected = pd.DataFrame({'sum': [2.0, 1.0], 'count': [3, 3]}, index=['1', '2'])
+        pd.testing.assert_frame_equal(expected, mapping['col_b'], check_like=True)
+
+    def test_leave_one_out_unique(self):
+        X = pd.DataFrame(data=['1', '2', '2', '2', '3'], columns=['col'])
+        y = np.array([1, 0, 1, 0, 1])
+
+        encoder = encoders.LeaveOneOutEncoder(impute_missing=False)
+        result = encoder.fit(X, y).transform(X, y)
+
+        self.assertFalse(result.isnull().any().any(), 'There should not be any missing value')
+        expected = pd.DataFrame(data=[y.mean(), 0.5, 0, 0.5, y.mean()], columns=['col'])
+        pd.testing.assert_frame_equal(expected, result)