Skip to content

Speed up LeaveOneOutEncoder with vectorization. #146

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Oct 26, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
74 changes: 29 additions & 45 deletions category_encoders/leave_one_out.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,10 @@
class LeaveOneOutEncoder(BaseEstimator, TransformerMixin):
"""Leave one out coding for categorical features.

This is very similar to target encoding, but excludes the current row's
target when calculating the mean target for a level to reduce the effect
of outliers.

Parameters
----------

Expand All @@ -28,10 +32,9 @@ class LeaveOneOutEncoder(BaseEstimator, TransformerMixin):
options are 'error', 'ignore' and 'impute', defaults to 'impute', which will impute the category -1. Warning: if
impute is used, an extra column will be added if the transform matrix has unknown categories. This can cause
unexpected changes in dimension in some cases.
randomized: bool
adds normal (Gaussian) distribution noise into training data in order to decrease overfitting (testing data are untouched).
sigma: float
standard deviation (spread or "width") of the normal distribution.
adds normal (Gaussian) distribution noise into training data in order to decrease overfitting (testing data are untouched).
sigma gives the standard deviation (spread or "width") of the normal distribution.

Example
-------
Expand Down Expand Up @@ -69,13 +72,10 @@ class LeaveOneOutEncoder(BaseEstimator, TransformerMixin):

.. [1] Strategies to encode categorical variables with many categories. from
https://www.kaggle.com/c/caterpillar-tube-pricing/discussion/15748#143154.



"""

def __init__(self, verbose=0, cols=None, drop_invariant=False, return_df=True, impute_missing=True,
handle_unknown='impute', random_state=None, randomized=False, sigma=0.05):
handle_unknown='impute', random_state=None, sigma=None):
self.return_df = return_df
self.drop_invariant = drop_invariant
self.drop_cols = []
Expand All @@ -88,7 +88,6 @@ def __init__(self, verbose=0, cols=None, drop_invariant=False, return_df=True, i
self.handle_unknown = handle_unknown
self._mean = None
self.random_state = random_state
self.randomized = randomized
self.sigma = sigma

def fit(self, X, y, **kwargs):
Expand All @@ -114,9 +113,9 @@ def fit(self, X, y, **kwargs):
# first check the type
X = util.convert_input(X)
if isinstance(y, pd.DataFrame):
y = y.iloc[:,0]
y = y.iloc[:, 0].astype(float)
else:
y = pd.Series(y, name='target')
y = pd.Series(y, name='target', dtype=float)
if X.shape[0] != y.shape[0]:
raise ValueError("The length of X is " + str(X.shape[0]) + " but length of y is " + str(y.shape[0]) + ".")

Expand Down Expand Up @@ -175,9 +174,9 @@ def transform(self, X, y=None):
# if we are encoding the training data, we have to check the target
if y is not None:
if isinstance(y, pd.DataFrame):
y = y.iloc[:, 0]
y = y.iloc[:, 0].astype(float)
else:
y = pd.Series(y, name='target')
y = pd.Series(y, name='target', dtype=float)
if X.shape[0] != y.shape[0]:
raise ValueError("The length of X is " + str(X.shape[0]) + " but length of y is " + str(y.shape[0]) + ".")

Expand Down Expand Up @@ -215,51 +214,36 @@ def fit_leave_one_out(self, X_in, y, cols=None):
cols = X.columns.values

self._mean = y.mean()
mapping_out = []

for col in cols:
tmp = y.groupby(X[col]).agg(['sum', 'count'])
tmp['mean'] = tmp['sum'] / tmp['count']
tmp = tmp.to_dict(orient='index')

mapping_out.append({'col': col, 'mapping': tmp}, )

return mapping_out
return {col: y.groupby(X[col]).agg(['sum', 'count']) for col in cols}

def transform_leave_one_out(self, X_in, y, mapping=None, impute_missing=True, handle_unknown='impute'):
"""
Leave one out encoding uses a single column of floats to represent the means of the target variables.
"""

X = X_in.copy(deep=True)

random_state_ = check_random_state(self.random_state)
for switch in mapping:
column = switch.get('col')
transformed_column = pd.Series([np.nan] * X.shape[0], name=column)

for val in switch.get('mapping'):
if y is None:
transformed_column.loc[X[column] == val] = switch.get('mapping')[val]['mean']
elif switch.get('mapping')[val]['count'] == 1:
transformed_column.loc[X[column] == val] = self._mean
else:
transformed_column.loc[X[column] == val] = (
(switch.get('mapping')[val]['sum'] - y[(X[column] == val).values]) / (
switch.get('mapping')[val]['count'] - 1)
)

for col, colmap in mapping.items():
level_notunique = colmap['count'] > 1
if y is None: # Replace level with its mean target; if level occurs only once, use global mean
level_means = (colmap['sum'] / colmap['count']).where(level_notunique, self._mean)
X[col] = X[col].map(level_means)
else: # Replace level with its mean target, calculated excluding this row's target
# The y (target) mean for this level is normally just the sum/count;
# excluding this row's y, it's (sum - y) / (count - 1)
level_means = (X[col].map(colmap['sum']) - y) / (X[col].map(colmap['count']) - 1)
# The 'where' fills in singleton levels (count = 1 -> div by 0) with the global mean
X[col] = level_means.where(X[col].map(colmap['count'][level_notunique]).notnull(), self._mean)

if impute_missing:
if handle_unknown == 'impute':
transformed_column.fillna(self._mean, inplace=True)
X[col].fillna(self._mean, inplace=True)
elif handle_unknown == 'error':
missing = transformed_column.isnull()
if any(missing):
raise ValueError('Unexpected categories found in column %s' % column)

if self.randomized and y is not None:
transformed_column = (transformed_column * random_state_.normal(1., self.sigma, transformed_column.shape[0]))
if X[col].isnull().any():
raise ValueError('Unexpected categories found in column %s' % col)

X[column] = transformed_column.astype(float)
if self.sigma is not None and y is not None:
X[col] = X[col] * random_state_.normal(1., self.sigma, X[col].shape[0])

return X
22 changes: 16 additions & 6 deletions category_encoders/tests/test_leave_one_out.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
class TestLeaveOneOutEncoder(TestCase):

def test_leave_one_out(self):
enc = encoders.LeaveOneOutEncoder(verbose=1, randomized=True, sigma=0.1)
enc = encoders.LeaveOneOutEncoder(verbose=1, sigma=0.1)
enc.fit(X, y)
tu.verify_numeric(enc.transform(X_t))
tu.verify_numeric(enc.transform(X_t, y_t))
Expand All @@ -32,7 +32,7 @@ def test_leave_one_out_values(self):
X = df.drop('outcome', axis=1)
y = df.drop('color', axis=1)

ce_leave = encoders.LeaveOneOutEncoder(cols=['color'], randomized=False)
ce_leave = encoders.LeaveOneOutEncoder(cols=['color'])
obtained = ce_leave.fit_transform(X, y['outcome'])

self.assertEqual([0.0, 0.5, 0.5, 0.5, 1.0, 0.5], list(obtained['color']))
Expand All @@ -46,7 +46,17 @@ def test_leave_one_out_fit_callTwiceOnDifferentData_ExpectRefit(self):
encoder.fit(x_b, y_dummy)
mapping = encoder.mapping
self.assertEqual(1, len(mapping))
col_b_mapping = mapping[0]
self.assertEqual('col_b', col_b_mapping['col']) # the model must get updated
self.assertEqual({'sum': 2.0, 'count': 3, 'mean': 2.0/3.0}, col_b_mapping['mapping']['1'])
self.assertEqual({'sum': 1.0, 'count': 3, 'mean': 01.0/3.0}, col_b_mapping['mapping']['2'])
self.assertIn('col_b', mapping) # the model should have the updated mapping
expected = pd.DataFrame({'sum': [2.0, 1.0], 'count': [3, 3]}, index=['1', '2'])
pd.testing.assert_frame_equal(expected, mapping['col_b'], check_like=True)

def test_leave_one_out_unique(self):
X = pd.DataFrame(data=['1', '2', '2', '2', '3'], columns=['col'])
y = np.array([1, 0, 1, 0, 1])

encoder = encoders.LeaveOneOutEncoder(impute_missing=False)
result = encoder.fit(X, y).transform(X, y)

self.assertFalse(result.isnull().any().any(), 'There should not be any missing value')
expected = pd.DataFrame(data=[y.mean(), 0.5, 0, 0.5, y.mean()], columns=['col'])
pd.testing.assert_frame_equal(expected, result)