diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 2a825edd0e98a..8a137d1c90a8f 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -309,6 +309,7 @@ Bug Fixes - Bug in ``pd.read_csv()`` in which the ``dialect`` parameter was not being verified before processing (:issue:`14898`) - Bug in ``pd.read_fwf`` where the skiprows parameter was not being respected during column width inference (:issue:`11256`) - Bug in ``pd.read_csv()`` in which missing data was being improperly handled with ``usecols`` (:issue:`6710`) +- Bug in ``pd.tools.hashing.hash_pandas_object()`` in which hashing of categoricals depended on the ordering of categories, instead of just their values. (:issue:`15143`) - Bug in ``DataFrame.loc`` with indexing a ``MultiIndex`` with a ``Series`` indexer (:issue:`14730`) @@ -369,4 +370,4 @@ Bug Fixes - Bug in ``Series`` constructor when both ``copy=True`` and ``dtype`` arguments are provided (:issue:`15125`) - Bug in ``pd.read_csv()`` for the C engine where ``usecols`` were being indexed incorrectly with ``parse_dates`` (:issue:`14792`) -- Bug in ``Series.dt.round`` inconsistent behaviour on NAT's with different arguments (:issue:`14940`) \ No newline at end of file +- Bug in ``Series.dt.round`` inconsistent behaviour on NAT's with different arguments (:issue:`14940`) diff --git a/pandas/tools/hashing.py b/pandas/tools/hashing.py index aa18b8bc70c37..6d8b264feba83 100644 --- a/pandas/tools/hashing.py +++ b/pandas/tools/hashing.py @@ -4,7 +4,7 @@ import numpy as np from pandas import _hash, Series, factorize, Categorical, Index -from pandas.lib import infer_dtype +from pandas.lib import is_bool_array from pandas.types.generic import ABCIndexClass, ABCSeries, ABCDataFrame from pandas.types.common import is_categorical_dtype @@ -12,7 +12,8 @@ _default_hash_key = '0123456789123456' -def hash_pandas_object(obj, index=True, encoding='utf8', hash_key=None): +def hash_pandas_object(obj, index=True, 
encoding='utf8', hash_key=None, + categorize=True): """ Return a data hash of the Index/Series/DataFrame @@ -25,6 +26,11 @@ def hash_pandas_object(obj, index=True, encoding='utf8', hash_key=None): encoding : string, default 'utf8' encoding for data & key when strings hash_key : string key to encode, default to _default_hash_key + categorize : bool, default True + Whether to first categorize object arrays before hashing. This is more + efficient when the array contains duplicate values. + + .. versionadded:: 0.20.0 Returns ------- @@ -39,28 +45,33 @@ def adder(h, hashed_to_add): return np.add(h, hashed_to_add, h) if isinstance(obj, ABCIndexClass): - h = hash_array(obj.values, encoding, hash_key).astype('uint64') + h = hash_array(obj.values, encoding, hash_key, + categorize).astype('uint64') h = Series(h, index=obj, dtype='uint64') elif isinstance(obj, ABCSeries): - h = hash_array(obj.values, encoding, hash_key).astype('uint64') + h = hash_array(obj.values, encoding, hash_key, + categorize).astype('uint64') if index: h = adder(h, hash_pandas_object(obj.index, index=False, encoding=encoding, - hash_key=hash_key).values) + hash_key=hash_key, + categorize=categorize).values) h = Series(h, index=obj.index, dtype='uint64') elif isinstance(obj, ABCDataFrame): cols = obj.iteritems() first_series = next(cols)[1] h = hash_array(first_series.values, encoding, - hash_key).astype('uint64') + hash_key, categorize).astype('uint64') for _, col in cols: - h = adder(h, hash_array(col.values, encoding, hash_key)) + h = adder(h, hash_array(col.values, encoding, hash_key, + categorize)) if index: h = adder(h, hash_pandas_object(obj.index, index=False, encoding=encoding, - hash_key=hash_key).values) + hash_key=hash_key, + categorize=categorize).values) h = Series(h, index=obj.index, dtype='uint64') else: @@ -68,7 +79,15 @@ def adder(h, hashed_to_add): return h -def hash_array(vals, encoding='utf8', hash_key=None): +def _hash_categorical(c, encoding, hash_key): + """Hash a Categorical by 
hashing its categories, and then mapping the codes + to the hashes""" + cat_hashed = hash_array(c.categories.values, encoding, hash_key, + categorize=False).astype(np.uint64, copy=False) + return c.rename_categories(cat_hashed).astype(np.uint64) + + +def hash_array(vals, encoding='utf8', hash_key=None, categorize=True): """ Given a 1d array, return an array of deterministic integers. @@ -80,6 +99,11 @@ def hash_array(vals, encoding='utf8', hash_key=None): encoding : string, default 'utf8' encoding for data & key when strings hash_key : string key to encode, default to _default_hash_key + categorize : bool, default True + Whether to first categorize object arrays before hashing. This is more + efficient when the array contains duplicate values. + + .. versionadded:: 0.20.0 Returns ------- @@ -87,46 +111,39 @@ def hash_array(vals, encoding='utf8', hash_key=None): """ - # work with cagegoricals as ints. (This check is above the complex - # check so that we don't ask numpy if categorical is a subdtype of - # complex, as it will choke. if hash_key is None: hash_key = _default_hash_key + # For categoricals, we hash the categories, then remap the codes to the + # hash values. (This check is above the complex check so that we don't ask + # numpy if categorical is a subdtype of complex, as it will choke.) if is_categorical_dtype(vals.dtype): - vals = vals.codes + return _hash_categorical(vals, encoding, hash_key) # we'll be working with everything as 64-bit values, so handle this # 128-bit value early if np.issubdtype(vals.dtype, np.complex128): return hash_array(vals.real) + 23 * hash_array(vals.imag) - # MAIN LOGIC: - inferred = infer_dtype(vals) - # First, turn whatever array this is into unsigned 64-bit ints, if we can # manage it. 
- if inferred == 'boolean': + if is_bool_array(vals): vals = vals.astype('u8') - - if (np.issubdtype(vals.dtype, np.datetime64) or - np.issubdtype(vals.dtype, np.timedelta64) or - np.issubdtype(vals.dtype, np.number)) and vals.dtype.itemsize <= 8: - + elif (np.issubdtype(vals.dtype, np.datetime64) or + np.issubdtype(vals.dtype, np.timedelta64) or + np.issubdtype(vals.dtype, np.number)) and vals.dtype.itemsize <= 8: vals = vals.view('u{}'.format(vals.dtype.itemsize)).astype('u8') else: - - # its MUCH faster to categorize object dtypes, then hash and rename - codes, categories = factorize(vals, sort=False) - categories = Index(categories) - c = Series(Categorical(codes, categories, - ordered=False, fastpath=True)) - vals = _hash.hash_object_array(categories.values, - hash_key, - encoding) - - # rename & extract - vals = c.cat.rename_categories(Index(vals)).astype(np.uint64).values + # With repeated values, it's MUCH faster to categorize object dtypes, + # then hash and rename categories. We allow skipping the categorization + # when the values are known/likely to be unique. 
+ if categorize: + codes, categories = factorize(vals, sort=False) + cat = Categorical(codes, Index(categories), + ordered=False, fastpath=True) + return _hash_categorical(cat, encoding, hash_key) + else: + vals = _hash.hash_object_array(vals, hash_key, encoding) # Then, redistribute these 64-bit ints within the space of 64-bit ints vals ^= vals >> 30 diff --git a/pandas/tools/tests/test_hashing.py b/pandas/tools/tests/test_hashing.py index 6e5f30fb7a52d..00137f080c5b7 100644 --- a/pandas/tools/tests/test_hashing.py +++ b/pandas/tools/tests/test_hashing.py @@ -90,6 +90,22 @@ def test_hash_pandas_empty_object(self): # these are by-definition the same with # or w/o the index as the data is empty + def test_categorical_consistency(self): + # Check that categoricals hash consistent with their values, not codes + # This should work for categoricals of any dtype + for s1 in [Series(['a', 'b', 'c', 'd']), + Series([1000, 2000, 3000, 4000]), + Series(pd.date_range(0, periods=4))]: + s2 = s1.astype('category').cat.set_categories(s1) + s3 = s2.cat.set_categories(list(reversed(s1))) + for categorize in [True, False]: + # These should all hash identically + h1 = hash_pandas_object(s1, categorize=categorize) + h2 = hash_pandas_object(s2, categorize=categorize) + h3 = hash_pandas_object(s3, categorize=categorize) + tm.assert_series_equal(h1, h2) + tm.assert_series_equal(h1, h3) + def test_errors(self): for obj in [pd.Timestamp('20130101'), tm.makePanel()]: