From 5b99a119791b5397bfd396d97ccbec4f6f6f651e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=BCnter=20Urak?= Date: Mon, 21 May 2018 17:43:15 +0200 Subject: [PATCH 1/5] add fix for bug 19029 As of version 0.23.0 MultiIndex throws an exception in case it contains duplicated level names. This can happen as a result of various groupby operations (21075). This commit changes the behavior of groupby slightly: In case there are duplicated names contained in the index these names get suffixed by their corresponding position (i.e. [name,name] => [name0,name1]) --- pandas/core/groupby/groupby.py | 13 ++++++++++++- pandas/tests/groupby/test_categorical.py | 10 ++++++++-- 2 files changed, 20 insertions(+), 3 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index df7a5dc9dc173..c77545ce25417 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -2298,7 +2298,18 @@ def levels(self): @property def names(self): - return [ping.name for ping in self.groupings] + # GH 19029 + # add suffix to level name in case they contain duplicates (GH 19029): + orig_names = [ping.name for ping in self.groupings] + # if no names were assigned return the original names + if all(x is None for x in orig_names): + return orig_names + # in case duplicates are contained rename all of them + if len(set(orig_names)) < len(orig_names): + orig_names = [''.join([str(x),str(i)]) + for i,x in enumerate(orig_names)] + + return orig_names def size(self): """ diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index e0793b8e1bd64..8a418e6d4086d 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -558,9 +558,15 @@ def test_as_index(): result = df.groupby(['cat', s], as_index=False, observed=True).sum() tm.assert_frame_equal(result, expected) - # GH18872: conflicting names in desired index - with pytest.raises(ValueError): + # GH 19029: conflicitng 
names should not raise a value error anymore + raised=False + try: df.groupby(['cat', s.rename('cat')], observed=True).sum() + except ValueError as e: + raised = True + assert raised == False + + # is original index dropped? group_columns = ['cat', 'A'] From 117872f156f2ad709cc121e1d2943439877bdeaf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=BCnter=20Urak?= Date: Mon, 21 May 2018 19:25:24 +0200 Subject: [PATCH 2/5] update old testcase to satisfy new behavior --- pandas/tests/reshape/test_pivot.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index d2cf3fc11e165..5a2ad7f89670b 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -1705,9 +1705,20 @@ def test_crosstab_with_numpy_size(self): tm.assert_frame_equal(result, expected) def test_crosstab_dup_index_names(self): - # GH 13279, GH 18872 + # duplicated index name should get renamed (GH 19029) s = pd.Series(range(3), name='foo') - pytest.raises(ValueError, pd.crosstab, s, s) + failed = False + try: + result=pd.crosstab(s,s) + except ValueError as e: + failed = True + + assert failed == False + + s0 = pd.Series(range(3),name='foo0') + s1 = pd.Series(range(3),name='foo1') + expected = pd.DataFrame(data=np.diag(np.ones(3,dtype='int64')), index=s0, columns=s1) + tm.assert_frame_equal(result,expected) @pytest.mark.parametrize("names", [['a', ('b', 'c')], [('a', 'b'), 'c']]) From 32e44c34f73a5224a624740a051d61b969a274a2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=BCnter=20Urak?= Date: Mon, 21 May 2018 19:47:56 +0200 Subject: [PATCH 3/5] add additional groupby testcases (19029) --- pandas/tests/groupby/test_groupby.py | 40 ++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index e05f9de5ea7f4..1f47f39646f4c 100644 --- a/pandas/tests/groupby/test_groupby.py +++ 
b/pandas/tests/groupby/test_groupby.py @@ -1674,3 +1674,43 @@ def test_tuple_correct_keyerror(): [3, 4]])) with tm.assert_raises_regex(KeyError, "(7, 8)"): df.groupby((7, 8)).mean() + + +def test_dup_index_names(): + # duplicated index names in groupby operations should be renamed (GH 19029): + df = pd.DataFrame(data={'date': list(pd.date_range('5.1.2018', '5.3.2018')), + 'vals': list(range(3))}) + + mi = pd.MultiIndex.from_product([[5], [1, 2, 3]], names=['date0', 'date1']) + expected = pd.Series(data=list(range(3)), index=mi, name='vals') + + failed = False + try: + result = df.groupby([df.date.dt.month, df.date.dt.day])['vals'].sum() + except ValueError as e: + failed = True + + assert failed == False + + tm.assert_series_equal(result,expected) + + +def test_empty_index_names(): + # don't rename frames in case no names were assigned (GH 19029) + df = pd.DataFrame(data={'date': list(pd.date_range('5.1.2018', '5.3.2018')), + 'vals': list(range(3))}) + + mi = pd.MultiIndex.from_product([[5], [1, 2, 3]]) + expected = pd.Series(data=list(range(3)), index=mi, name='vals') + + failed = False + try: + result = df.groupby([df.date.dt.month.rename(None), + df.date.dt.day.rename(None)])['vals'].sum() + except ValueError as e: + failed = True + + assert failed == False + + tm.assert_series_equal(result,expected) + From c2a3fa5eebb63f3d9b4a084e452731726e4058b9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=BCnter=20Urak?= Date: Tue, 22 May 2018 14:33:06 +0200 Subject: [PATCH 4/5] resolve flake8 conflicts --- pandas/core/groupby/groupby.py | 8 ++++---- pandas/tests/groupby/test_categorical.py | 8 +++----- pandas/tests/groupby/test_groupby.py | 25 ++++++++++++------------ pandas/tests/reshape/test_pivot.py | 15 +++++++------- 4 files changed, 27 insertions(+), 29 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index c77545ce25417..1c5c279d27afd 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -2300,14 
+2300,14 @@ def levels(self): def names(self): # GH 19029 # add suffix to level name in case they contain duplicates (GH 19029): - orig_names = [ping.name for ping in self.groupings] - # if no names were assigned return the original names + orig_names = [ping.name for ping in self.groupings] + # if no names were assigned return the original names if all(x is None for x in orig_names): return orig_names # in case duplicates are contained rename all of them if len(set(orig_names)) < len(orig_names): - orig_names = [''.join([str(x),str(i)]) - for i,x in enumerate(orig_names)] + orig_names = [''.join([str(x), str(i)]) + for i, x in enumerate(orig_names)] return orig_names diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 8a418e6d4086d..b615d1efa6f10 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -559,14 +559,12 @@ def test_as_index(): tm.assert_frame_equal(result, expected) # GH 19029: conflicitng names should not raise a value error anymore - raised=False + raised = False try: df.groupby(['cat', s.rename('cat')], observed=True).sum() - except ValueError as e: + except ValueError: raised = True - assert raised == False - - + assert raised is False # is original index dropped? group_columns = ['cat', 'A'] diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 1f47f39646f4c..52399427eddb5 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -1677,9 +1677,9 @@ def test_tuple_correct_keyerror(): def test_dup_index_names(): - # duplicated index names in groupby operations should be renamed (GH 19029): - df = pd.DataFrame(data={'date': list(pd.date_range('5.1.2018', '5.3.2018')), - 'vals': list(range(3))}) + # dup. 
index names in groupby operations should be renamed (GH 19029): + df = pd.DataFrame({'date': list(pd.date_range('5.1.2018', '5.3.2018')), + 'vals': list(range(3))}) mi = pd.MultiIndex.from_product([[5], [1, 2, 3]], names=['date0', 'date1']) expected = pd.Series(data=list(range(3)), index=mi, name='vals') @@ -1687,18 +1687,18 @@ def test_dup_index_names(): failed = False try: result = df.groupby([df.date.dt.month, df.date.dt.day])['vals'].sum() - except ValueError as e: + except ValueError: failed = True - assert failed == False + assert failed is False - tm.assert_series_equal(result,expected) + tm.assert_series_equal(result, expected) def test_empty_index_names(): # don't rename frames in case no names were assigned (GH 19029) - df = pd.DataFrame(data={'date': list(pd.date_range('5.1.2018', '5.3.2018')), - 'vals': list(range(3))}) + df = pd.DataFrame({'date': list(pd.date_range('5.1.2018', '5.3.2018')), + 'vals': list(range(3))}) mi = pd.MultiIndex.from_product([[5], [1, 2, 3]]) expected = pd.Series(data=list(range(3)), index=mi, name='vals') @@ -1706,11 +1706,10 @@ def test_empty_index_names(): failed = False try: result = df.groupby([df.date.dt.month.rename(None), - df.date.dt.day.rename(None)])['vals'].sum() - except ValueError as e: + df.date.dt.day.rename(None)])['vals'].sum() + except ValueError: failed = True - assert failed == False - - tm.assert_series_equal(result,expected) + assert failed is False + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index 5a2ad7f89670b..3e416e6fed161 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -1709,16 +1709,17 @@ def test_crosstab_dup_index_names(self): s = pd.Series(range(3), name='foo') failed = False try: - result=pd.crosstab(s,s) - except ValueError as e: + result = pd.crosstab(s, s) + except ValueError: failed = True - assert failed == False + assert failed is False - s0 = 
pd.Series(range(3),name='foo0') - s1 = pd.Series(range(3),name='foo1') - expected = pd.DataFrame(data=np.diag(np.ones(3,dtype='int64')), index=s0, columns=s1) - tm.assert_frame_equal(result,expected) + s0 = pd.Series(range(3), name='foo0') + s1 = pd.Series(range(3), name='foo1') + expected = pd.DataFrame(np.diag(np.ones(3, dtype='int64')), + index=s0, columns=s1) + tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("names", [['a', ('b', 'c')], [('a', 'b'), 'c']]) From 7cd448ac0e2907fc80ea9badce6fcfcdc88e5536 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=BCnter=20Urak?= Date: Wed, 23 May 2018 11:34:36 +0200 Subject: [PATCH 5/5] change groupby-behaviour (duplicates) & tests Only duplicates get suffixed by their corresponding enumeration value: ['name', None, 'name'] gets transformed into ['name_0', None, 'name_1'] Superfluous test cases have been deleted and some additional test statements have been added. --- pandas/core/groupby/groupby.py | 22 +++++++--- pandas/tests/groupby/test_categorical.py | 8 ---- pandas/tests/groupby/test_groupby.py | 52 ++++++++++++------------ 3 files changed, 43 insertions(+), 39 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 1c5c279d27afd..6cd2a91e9c17d 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -2298,18 +2298,28 @@ def levels(self): @property def names(self): - # GH 19029 # add suffix to level name in case they contain duplicates (GH 19029): orig_names = [ping.name for ping in self.groupings] # if no names were assigned return the original names if all(x is None for x in orig_names): return orig_names - # in case duplicates are contained rename all of them - if len(set(orig_names)) < len(orig_names): - orig_names = [''.join([str(x), str(i)]) - for i, x in enumerate(orig_names)] - return orig_names + suffixes = collections.defaultdict(int) + dups = {n: count for n, count in + collections.Counter(orig_names).items() if count > 1} + 
new_names = [] + for name in orig_names: + if name not in dups: + new_names.append(name) + else: + if name is not None: + new_name = '{0}_{1}'.format(name, suffixes[name]) + else: + new_name = '{0}'.format(suffixes[name]) + suffixes[name] += 1 + new_names.append(new_name) + + return new_names def size(self): """ diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index b615d1efa6f10..fc3f2b1b7c4b7 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -558,14 +558,6 @@ def test_as_index(): result = df.groupby(['cat', s], as_index=False, observed=True).sum() tm.assert_frame_equal(result, expected) - # GH 19029: conflicitng names should not raise a value error anymore - raised = False - try: - df.groupby(['cat', s.rename('cat')], observed=True).sum() - except ValueError: - raised = True - assert raised is False - # is original index dropped? group_columns = ['cat', 'A'] expected = DataFrame( diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 52399427eddb5..a583c1230bfa4 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -1678,38 +1678,40 @@ def test_tuple_correct_keyerror(): def test_dup_index_names(): # dup. 
index names in groupby operations should be renamed (GH 19029): - df = pd.DataFrame({'date': list(pd.date_range('5.1.2018', '5.3.2018')), + df = pd.DataFrame({'date': pd.date_range('5.1.2018', '5.3.2018'), 'vals': list(range(3))}) - mi = pd.MultiIndex.from_product([[5], [1, 2, 3]], names=['date0', 'date1']) + # duplicates get suffixed by integer position + mi = pd.MultiIndex.from_product([[5], [1, 2, 3]], + names=['date_0', 'date_1']) expected = pd.Series(data=list(range(3)), index=mi, name='vals') - - failed = False - try: - result = df.groupby([df.date.dt.month, df.date.dt.day])['vals'].sum() - except ValueError: - failed = True - - assert failed is False + result = df.groupby([df.date.dt.month, df.date.dt.day])['vals'].sum() tm.assert_series_equal(result, expected) - -def test_empty_index_names(): - # don't rename frames in case no names were assigned (GH 19029) - df = pd.DataFrame({'date': list(pd.date_range('5.1.2018', '5.3.2018')), - 'vals': list(range(3))}) - - mi = pd.MultiIndex.from_product([[5], [1, 2, 3]]) + # 2 out of 3 are duplicates and None + mi = pd.MultiIndex.from_product([[2018], [5], [1, 2, 3]], + names=['0', '1', 'date']) expected = pd.Series(data=list(range(3)), index=mi, name='vals') + result = df.groupby([df.date.dt.year.rename(None), + df.date.dt.month.rename(None), + df.date.dt.day])['vals'].sum() + tm.assert_series_equal(result, expected) - failed = False - try: - result = df.groupby([df.date.dt.month.rename(None), - df.date.dt.day.rename(None)])['vals'].sum() - except ValueError: - failed = True - - assert failed is False + # 2 out of 3 names (not None) are duplicates, the remaining is None + mi = pd.MultiIndex.from_product([[2018], [5], [1, 2, 3]], + names=['date_0', None, 'date_1']) + expected = pd.Series(data=list(range(3)), index=mi, name='vals') + result = df.groupby([df.date.dt.year, + df.date.dt.month.rename(None), + df.date.dt.day])['vals'].sum() + tm.assert_series_equal(result, expected) + # all are None + mi = 
pd.MultiIndex.from_product([[2018], [5], [1, 2, 3]], + names=[None, None, None]) + expected = pd.Series(data=list(range(3)), index=mi, name='vals') + result = df.groupby([df.date.dt.year.rename(None), + df.date.dt.month.rename(None), + df.date.dt.day.rename(None)])['vals'].sum() tm.assert_series_equal(result, expected)