From 7878c5508375ce3b0a54e6d5cfd707f36f66f9f1 Mon Sep 17 00:00:00 2001 From: Jim Crist Date: Mon, 16 Jan 2017 16:32:27 -0600 Subject: [PATCH 1/2] Categoricals hash consistently Previously categorical values were hashed using just their codes. This meant that the hash value depended on the ordering of the categories, rather than on the values the series represented. This caused problems in dask, where different partitions might have different categorical mappings. This PR makes the hashing dependent on the values the categorical represents, rather than on the codes. The categories are first hashed, and then the codes are remapped to the hashed values. This is slightly slower than before (we now have to hash the categories, which was not needed previously), but it makes the hash independent of how the categories are ordered. --- doc/source/whatsnew/v0.20.0.txt | 3 +- pandas/tools/hashing.py | 57 ++++++++++++++++-------------- pandas/tools/tests/test_hashing.py | 14 ++++++++ 3 files changed, 46 insertions(+), 28 deletions(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 2a825edd0e98a..9cfda249011bf 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -309,6 +309,7 @@ Bug Fixes - Bug in ``pd.read_csv()`` in which the ``dialect`` parameter was not being verified before processing (:issue:`14898`) - Bug in ``pd.read_fwf`` where the skiprows parameter was not being respected during column width inference (:issue:`11256`) - Bug in ``pd.read_csv()`` in which missing data was being improperly handled with ``usecols`` (:issue:`6710`) +- Bug in ``pandas.tools.hashing.hash_pandas_object`` in which hashing of categoricals depended on the ordering of categories, instead of just their values. 
- Bug in ``DataFrame.loc`` with indexing a ``MultiIndex`` with a ``Series`` indexer (:issue:`14730`) @@ -369,4 +370,4 @@ Bug Fixes - Bug in ``Series`` constructor when both ``copy=True`` and ``dtype`` arguments are provided (:issue:`15125`) - Bug in ``pd.read_csv()`` for the C engine where ``usecols`` were being indexed incorrectly with ``parse_dates`` (:issue:`14792`) -- Bug in ``Series.dt.round`` inconsistent behaviour on NAT's with different arguments (:issue:`14940`) \ No newline at end of file +- Bug in ``Series.dt.round`` inconsistent behaviour on NAT's with different arguments (:issue:`14940`) diff --git a/pandas/tools/hashing.py b/pandas/tools/hashing.py index aa18b8bc70c37..4c85a46d0917b 100644 --- a/pandas/tools/hashing.py +++ b/pandas/tools/hashing.py @@ -4,7 +4,7 @@ import numpy as np from pandas import _hash, Series, factorize, Categorical, Index -from pandas.lib import infer_dtype +from pandas.lib import is_bool_array from pandas.types.generic import ABCIndexClass, ABCSeries, ABCDataFrame from pandas.types.common import is_categorical_dtype @@ -68,7 +68,7 @@ def adder(h, hashed_to_add): return h -def hash_array(vals, encoding='utf8', hash_key=None): +def hash_array(vals, encoding='utf8', hash_key=None, categorize=True): """ Given a 1d array, return an array of deterministic integers. @@ -80,6 +80,9 @@ def hash_array(vals, encoding='utf8', hash_key=None): encoding : string, default 'utf8' encoding for data & key when strings hash_key : string key to encode, default to _default_hash_key + categorize : bool, default True + Whether to first categorize object arrays before hashing. This is more + efficient when the array contains duplicate values. Returns ------- @@ -87,46 +90,46 @@ def hash_array(vals, encoding='utf8', hash_key=None): """ - # work with cagegoricals as ints. (This check is above the complex - # check so that we don't ask numpy if categorical is a subdtype of - # complex, as it will choke. 
if hash_key is None: hash_key = _default_hash_key + # For categoricals, we hash the categories, then remap the codes to the + # hash values. (This check is above the complex check so that we don't ask + # numpy if categorical is a subdtype of complex, as it will choke. if is_categorical_dtype(vals.dtype): - vals = vals.codes + cat_hashed = hash_array(vals.categories.values, encoding, hash_key, + categorize=False).astype(np.uint64, copy=False) + # Since `cat_hashed` is already distributed in the space of uint64s, + # we can just return after remapping the codes here + c = Series(vals) + return c.cat.rename_categories(cat_hashed).values.astype(np.uint64) # we'll be working with everything as 64-bit values, so handle this # 128-bit value early if np.issubdtype(vals.dtype, np.complex128): return hash_array(vals.real) + 23 * hash_array(vals.imag) - # MAIN LOGIC: - inferred = infer_dtype(vals) - # First, turn whatever array this is into unsigned 64-bit ints, if we can # manage it. - if inferred == 'boolean': + if is_bool_array(vals): vals = vals.astype('u8') - - if (np.issubdtype(vals.dtype, np.datetime64) or - np.issubdtype(vals.dtype, np.timedelta64) or - np.issubdtype(vals.dtype, np.number)) and vals.dtype.itemsize <= 8: - + elif (np.issubdtype(vals.dtype, np.datetime64) or + np.issubdtype(vals.dtype, np.timedelta64) or + np.issubdtype(vals.dtype, np.number)) and vals.dtype.itemsize <= 8: vals = vals.view('u{}'.format(vals.dtype.itemsize)).astype('u8') else: - - # its MUCH faster to categorize object dtypes, then hash and rename - codes, categories = factorize(vals, sort=False) - categories = Index(categories) - c = Series(Categorical(codes, categories, - ordered=False, fastpath=True)) - vals = _hash.hash_object_array(categories.values, - hash_key, - encoding) - - # rename & extract - vals = c.cat.rename_categories(Index(vals)).astype(np.uint64).values + # With repeated values, its MUCH faster to categorize object dtypes, + # then hash and rename categories. 
We allow skipping the categorization + # when the values are known/likely to be unique. + if categorize: + codes, categories = factorize(vals, sort=False) + c = Series(Categorical(codes, Index(categories), + ordered=False, fastpath=True)) + vals = _hash.hash_object_array(categories, hash_key, encoding) + # rename & extract + vals = c.cat.rename_categories(vals).values.astype(np.uint64) + else: + vals = _hash.hash_object_array(vals, hash_key, encoding) # Then, redistribute these 64-bit ints within the space of 64-bit ints vals ^= vals >> 30 diff --git a/pandas/tools/tests/test_hashing.py b/pandas/tools/tests/test_hashing.py index 6e5f30fb7a52d..806a41ce5e58c 100644 --- a/pandas/tools/tests/test_hashing.py +++ b/pandas/tools/tests/test_hashing.py @@ -90,6 +90,20 @@ def test_hash_pandas_empty_object(self): # these are by-definition the same with # or w/o the index as the data is empty + def test_categorical_consistency(self): + # Check that categoricals hash consistent with their values, not codes + # This should work for categoricals of any dtype + for data in [['a', 'b', 'c', 'd'], [1000, 2000, 3000, 4000]]: + s1 = Series(data) + s2 = s1.astype('category').cat.set_categories(data) + s3 = s2.cat.set_categories(list(reversed(data))) + # These should all hash identically + h1 = hash_pandas_object(s1) + h2 = hash_pandas_object(s2) + h3 = hash_pandas_object(s3) + tm.assert_series_equal(h1, h2) + tm.assert_series_equal(h1, h3) + def test_errors(self): for obj in [pd.Timestamp('20130101'), tm.makePanel()]: From f1aea137e3e7c2b662ca11118b4cf7a8d1a27801 Mon Sep 17 00:00:00 2001 From: Jim Crist Date: Tue, 17 Jan 2017 13:29:37 -0600 Subject: [PATCH 2/2] Address comments - Add `categorize` parameter to `hash_pandas_object` - Update test - Update whatsnew --- doc/source/whatsnew/v0.20.0.txt | 2 +- pandas/tools/hashing.py | 50 +++++++++++++++++++----------- pandas/tools/tests/test_hashing.py | 22 +++++++------ 3 files changed, 45 insertions(+), 29 deletions(-) diff --git 
a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 9cfda249011bf..8a137d1c90a8f 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -309,7 +309,7 @@ Bug Fixes - Bug in ``pd.read_csv()`` in which the ``dialect`` parameter was not being verified before processing (:issue:`14898`) - Bug in ``pd.read_fwf`` where the skiprows parameter was not being respected during column width inference (:issue:`11256`) - Bug in ``pd.read_csv()`` in which missing data was being improperly handled with ``usecols`` (:issue:`6710`) -- Bug in ``pandas.tools.hashing.hash_pandas_object`` in which hashing of categoricals depended on the ordering of categories, instead of just their values. +- Bug in ``pd.tools.hashing.hash_pandas_object()`` in which hashing of categoricals depended on the ordering of categories, instead of just their values. (:issue:`15143`) - Bug in ``DataFrame.loc`` with indexing a ``MultiIndex`` with a ``Series`` indexer (:issue:`14730`) diff --git a/pandas/tools/hashing.py b/pandas/tools/hashing.py index 4c85a46d0917b..6d8b264feba83 100644 --- a/pandas/tools/hashing.py +++ b/pandas/tools/hashing.py @@ -12,7 +12,8 @@ _default_hash_key = '0123456789123456' -def hash_pandas_object(obj, index=True, encoding='utf8', hash_key=None): +def hash_pandas_object(obj, index=True, encoding='utf8', hash_key=None, + categorize=True): """ Return a data hash of the Index/Series/DataFrame @@ -25,6 +26,11 @@ def hash_pandas_object(obj, index=True, encoding='utf8', hash_key=None): encoding : string, default 'utf8' encoding for data & key when strings hash_key : string key to encode, default to _default_hash_key + categorize : bool, default True + Whether to first categorize object arrays before hashing. This is more + efficient when the array contains duplicate values. + + .. 
versionadded:: 0.20.0 Returns ------- @@ -39,28 +45,33 @@ def adder(h, hashed_to_add): return np.add(h, hashed_to_add, h) if isinstance(obj, ABCIndexClass): - h = hash_array(obj.values, encoding, hash_key).astype('uint64') + h = hash_array(obj.values, encoding, hash_key, + categorize).astype('uint64') h = Series(h, index=obj, dtype='uint64') elif isinstance(obj, ABCSeries): - h = hash_array(obj.values, encoding, hash_key).astype('uint64') + h = hash_array(obj.values, encoding, hash_key, + categorize).astype('uint64') if index: h = adder(h, hash_pandas_object(obj.index, index=False, encoding=encoding, - hash_key=hash_key).values) + hash_key=hash_key, + categorize=categorize).values) h = Series(h, index=obj.index, dtype='uint64') elif isinstance(obj, ABCDataFrame): cols = obj.iteritems() first_series = next(cols)[1] h = hash_array(first_series.values, encoding, - hash_key).astype('uint64') + hash_key, categorize).astype('uint64') for _, col in cols: - h = adder(h, hash_array(col.values, encoding, hash_key)) + h = adder(h, hash_array(col.values, encoding, hash_key, + categorize)) if index: h = adder(h, hash_pandas_object(obj.index, index=False, encoding=encoding, - hash_key=hash_key).values) + hash_key=hash_key, + categorize=categorize).values) h = Series(h, index=obj.index, dtype='uint64') else: @@ -68,6 +79,14 @@ def adder(h, hashed_to_add): return h +def _hash_categorical(c, encoding, hash_key): + """Hash a Categorical by hashing its categories, and then mapping the codes + to the hashes""" + cat_hashed = hash_array(c.categories.values, encoding, hash_key, + categorize=False).astype(np.uint64, copy=False) + return c.rename_categories(cat_hashed).astype(np.uint64) + + def hash_array(vals, encoding='utf8', hash_key=None, categorize=True): """ Given a 1d array, return an array of deterministic integers. @@ -84,6 +103,8 @@ def hash_array(vals, encoding='utf8', hash_key=None, categorize=True): Whether to first categorize object arrays before hashing. 
This is more efficient when the array contains duplicate values. + .. versionadded:: 0.20.0 + Returns ------- 1d uint64 numpy array of hash values, same length as the vals @@ -97,12 +118,7 @@ def hash_array(vals, encoding='utf8', hash_key=None, categorize=True): # hash values. (This check is above the complex check so that we don't ask # numpy if categorical is a subdtype of complex, as it will choke. if is_categorical_dtype(vals.dtype): - cat_hashed = hash_array(vals.categories.values, encoding, hash_key, - categorize=False).astype(np.uint64, copy=False) - # Since `cat_hashed` is already distributed in the space of uint64s, - # we can just return after remapping the codes here - c = Series(vals) - return c.cat.rename_categories(cat_hashed).values.astype(np.uint64) + return _hash_categorical(vals, encoding, hash_key) # we'll be working with everything as 64-bit values, so handle this # 128-bit value early @@ -123,11 +139,9 @@ def hash_array(vals, encoding='utf8', hash_key=None, categorize=True): # when the values are known/likely to be unique. 
if categorize: codes, categories = factorize(vals, sort=False) - c = Series(Categorical(codes, Index(categories), - ordered=False, fastpath=True)) - vals = _hash.hash_object_array(categories, hash_key, encoding) - # rename & extract - vals = c.cat.rename_categories(vals).values.astype(np.uint64) + cat = Categorical(codes, Index(categories), + ordered=False, fastpath=True) + return _hash_categorical(cat, encoding, hash_key) else: vals = _hash.hash_object_array(vals, hash_key, encoding) diff --git a/pandas/tools/tests/test_hashing.py b/pandas/tools/tests/test_hashing.py index 806a41ce5e58c..00137f080c5b7 100644 --- a/pandas/tools/tests/test_hashing.py +++ b/pandas/tools/tests/test_hashing.py @@ -93,16 +93,18 @@ def test_hash_pandas_empty_object(self): def test_categorical_consistency(self): # Check that categoricals hash consistent with their values, not codes # This should work for categoricals of any dtype - for data in [['a', 'b', 'c', 'd'], [1000, 2000, 3000, 4000]]: - s1 = Series(data) - s2 = s1.astype('category').cat.set_categories(data) - s3 = s2.cat.set_categories(list(reversed(data))) - # These should all hash identically - h1 = hash_pandas_object(s1) - h2 = hash_pandas_object(s2) - h3 = hash_pandas_object(s3) - tm.assert_series_equal(h1, h2) - tm.assert_series_equal(h1, h3) + for s1 in [Series(['a', 'b', 'c', 'd']), + Series([1000, 2000, 3000, 4000]), + Series(pd.date_range(0, periods=4))]: + s2 = s1.astype('category').cat.set_categories(s1) + s3 = s2.cat.set_categories(list(reversed(s1))) + for categorize in [True, False]: + # These should all hash identically + h1 = hash_pandas_object(s1, categorize=categorize) + h2 = hash_pandas_object(s2, categorize=categorize) + h3 = hash_pandas_object(s3, categorize=categorize) + tm.assert_series_equal(h1, h2) + tm.assert_series_equal(h1, h3) def test_errors(self):