From 5b99a119791b5397bfd396d97ccbec4f6f6f651e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?G=C3=BCnter=20Urak?= <guenteru@users.noreply.github.com>
Date: Mon, 21 May 2018 17:43:15 +0200
Subject: [PATCH 1/5] add fix for bug 19029

As of version 0.23.0 MultiIndex throws an exception in case it contains
duplicated level names. This can happen as a result of various groupby
operations (21075). This commit changes the behavior of groupby slightly: In
case there are duplicated names contained in the index, these names get suffixed by their
corresponding position (i.e. [name,name] => [name0,name1])
---
 pandas/core/groupby/groupby.py           | 13 ++++++++++++-
 pandas/tests/groupby/test_categorical.py | 10 ++++++++--
 2 files changed, 20 insertions(+), 3 deletions(-)

diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
index df7a5dc9dc173..c77545ce25417 100644
--- a/pandas/core/groupby/groupby.py
+++ b/pandas/core/groupby/groupby.py
@@ -2298,7 +2298,18 @@ def levels(self):
 
     @property
     def names(self):
-        return [ping.name for ping in self.groupings]
+        # GH 19029
+        # add suffix to level name in case they contain duplicates (GH 19029):
+        orig_names =  [ping.name for ping in self.groupings]
+        # if no names were assigned return the original names 
+        if all(x is None for x in orig_names):
+            return orig_names
+        # in case duplicates are contained rename all of them
+        if len(set(orig_names)) < len(orig_names):
+            orig_names = [''.join([str(x),str(i)])
+                                for i,x in enumerate(orig_names)]
+
+        return orig_names
 
     def size(self):
         """
diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py
index e0793b8e1bd64..8a418e6d4086d 100644
--- a/pandas/tests/groupby/test_categorical.py
+++ b/pandas/tests/groupby/test_categorical.py
@@ -558,9 +558,15 @@ def test_as_index():
     result = df.groupby(['cat', s], as_index=False, observed=True).sum()
     tm.assert_frame_equal(result, expected)
 
-    # GH18872: conflicting names in desired index
-    with pytest.raises(ValueError):
+    # GH 19029: conflicitng names should not raise a value error anymore
+    raised=False
+    try:
         df.groupby(['cat', s.rename('cat')], observed=True).sum()
+    except ValueError as e:
+        raised = True
+    assert raised == False
+        
+         
 
     # is original index dropped?
     group_columns = ['cat', 'A']

From 117872f156f2ad709cc121e1d2943439877bdeaf Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?G=C3=BCnter=20Urak?= <guenteru@users.noreply.github.com>
Date: Mon, 21 May 2018 19:25:24 +0200
Subject: [PATCH 2/5] update old testcase to satisfy new behavior

---
 pandas/tests/reshape/test_pivot.py | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py
index d2cf3fc11e165..5a2ad7f89670b 100644
--- a/pandas/tests/reshape/test_pivot.py
+++ b/pandas/tests/reshape/test_pivot.py
@@ -1705,9 +1705,20 @@ def test_crosstab_with_numpy_size(self):
         tm.assert_frame_equal(result, expected)
 
     def test_crosstab_dup_index_names(self):
-        # GH 13279, GH 18872
+        # duplicated index name should get renamed (GH 19029)
         s = pd.Series(range(3), name='foo')
-        pytest.raises(ValueError, pd.crosstab, s, s)
+        failed = False
+        try:
+           result=pd.crosstab(s,s)
+        except ValueError as e:
+            failed = True
+
+        assert failed == False
+
+        s0 = pd.Series(range(3),name='foo0')
+        s1 = pd.Series(range(3),name='foo1')
+        expected = pd.DataFrame(data=np.diag(np.ones(3,dtype='int64')), index=s0, columns=s1)
+        tm.assert_frame_equal(result,expected)
 
     @pytest.mark.parametrize("names", [['a', ('b', 'c')],
                                        [('a', 'b'), 'c']])

From 32e44c34f73a5224a624740a051d61b969a274a2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?G=C3=BCnter=20Urak?= <guenteru@users.noreply.github.com>
Date: Mon, 21 May 2018 19:47:56 +0200
Subject: [PATCH 3/5] add additional groupby testcases (19029)

---
 pandas/tests/groupby/test_groupby.py | 40 ++++++++++++++++++++++++++++
 1 file changed, 40 insertions(+)

diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py
index e05f9de5ea7f4..1f47f39646f4c 100644
--- a/pandas/tests/groupby/test_groupby.py
+++ b/pandas/tests/groupby/test_groupby.py
@@ -1674,3 +1674,43 @@ def test_tuple_correct_keyerror():
                                                           [3, 4]]))
     with tm.assert_raises_regex(KeyError, "(7, 8)"):
         df.groupby((7, 8)).mean()
+
+
+def test_dup_index_names():
+    # duplicated index names in groupby operations should be renamed (GH 19029):
+    df = pd.DataFrame(data={'date': list(pd.date_range('5.1.2018', '5.3.2018')),
+                            'vals': list(range(3))})
+
+    mi = pd.MultiIndex.from_product([[5], [1, 2, 3]], names=['date0', 'date1'])
+    expected = pd.Series(data=list(range(3)), index=mi, name='vals')
+
+    failed = False
+    try:
+        result = df.groupby([df.date.dt.month, df.date.dt.day])['vals'].sum()
+    except ValueError as e:
+        failed = True
+
+    assert failed == False
+
+    tm.assert_series_equal(result,expected)
+
+
+def test_empty_index_names():
+    # don't rename frames in case no names were assigned (GH 19029)
+    df = pd.DataFrame(data={'date': list(pd.date_range('5.1.2018', '5.3.2018')),
+                            'vals': list(range(3))})
+
+    mi = pd.MultiIndex.from_product([[5], [1, 2, 3]])
+    expected = pd.Series(data=list(range(3)), index=mi, name='vals')
+
+    failed = False
+    try:
+        result = df.groupby([df.date.dt.month.rename(None),
+                        df.date.dt.day.rename(None)])['vals'].sum()
+    except ValueError as e:
+        failed = True
+
+    assert failed == False
+
+    tm.assert_series_equal(result,expected)
+

From c2a3fa5eebb63f3d9b4a084e452731726e4058b9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?G=C3=BCnter=20Urak?= <guenteru@users.noreply.github.com>
Date: Tue, 22 May 2018 14:33:06 +0200
Subject: [PATCH 4/5] resolve flake8 conflicts

---
 pandas/core/groupby/groupby.py           |  8 ++++----
 pandas/tests/groupby/test_categorical.py |  8 +++-----
 pandas/tests/groupby/test_groupby.py     | 25 ++++++++++++------------
 pandas/tests/reshape/test_pivot.py       | 15 +++++++-------
 4 files changed, 27 insertions(+), 29 deletions(-)

diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
index c77545ce25417..1c5c279d27afd 100644
--- a/pandas/core/groupby/groupby.py
+++ b/pandas/core/groupby/groupby.py
@@ -2300,14 +2300,14 @@ def levels(self):
     def names(self):
         # GH 19029
         # add suffix to level name in case they contain duplicates (GH 19029):
-        orig_names =  [ping.name for ping in self.groupings]
-        # if no names were assigned return the original names 
+        orig_names = [ping.name for ping in self.groupings]
+        # if no names were assigned return the original names
         if all(x is None for x in orig_names):
             return orig_names
         # in case duplicates are contained rename all of them
         if len(set(orig_names)) < len(orig_names):
-            orig_names = [''.join([str(x),str(i)])
-                                for i,x in enumerate(orig_names)]
+            orig_names = [''.join([str(x), str(i)])
+                          for i, x in enumerate(orig_names)]
 
         return orig_names
 
diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py
index 8a418e6d4086d..b615d1efa6f10 100644
--- a/pandas/tests/groupby/test_categorical.py
+++ b/pandas/tests/groupby/test_categorical.py
@@ -559,14 +559,12 @@ def test_as_index():
     tm.assert_frame_equal(result, expected)
 
     # GH 19029: conflicitng names should not raise a value error anymore
-    raised=False
+    raised = False
     try:
         df.groupby(['cat', s.rename('cat')], observed=True).sum()
-    except ValueError as e:
+    except ValueError:
         raised = True
-    assert raised == False
-        
-         
+    assert raised is False
 
     # is original index dropped?
     group_columns = ['cat', 'A']
diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py
index 1f47f39646f4c..52399427eddb5 100644
--- a/pandas/tests/groupby/test_groupby.py
+++ b/pandas/tests/groupby/test_groupby.py
@@ -1677,9 +1677,9 @@ def test_tuple_correct_keyerror():
 
 
 def test_dup_index_names():
-    # duplicated index names in groupby operations should be renamed (GH 19029):
-    df = pd.DataFrame(data={'date': list(pd.date_range('5.1.2018', '5.3.2018')),
-                            'vals': list(range(3))})
+    # dup. index names in groupby operations should be renamed (GH 19029):
+    df = pd.DataFrame({'date': list(pd.date_range('5.1.2018', '5.3.2018')),
+                       'vals': list(range(3))})
 
     mi = pd.MultiIndex.from_product([[5], [1, 2, 3]], names=['date0', 'date1'])
     expected = pd.Series(data=list(range(3)), index=mi, name='vals')
@@ -1687,18 +1687,18 @@ def test_dup_index_names():
     failed = False
     try:
         result = df.groupby([df.date.dt.month, df.date.dt.day])['vals'].sum()
-    except ValueError as e:
+    except ValueError:
         failed = True
 
-    assert failed == False
+    assert failed is False
 
-    tm.assert_series_equal(result,expected)
+    tm.assert_series_equal(result, expected)
 
 
 def test_empty_index_names():
     # don't rename frames in case no names were assigned (GH 19029)
-    df = pd.DataFrame(data={'date': list(pd.date_range('5.1.2018', '5.3.2018')),
-                            'vals': list(range(3))})
+    df = pd.DataFrame({'date': list(pd.date_range('5.1.2018', '5.3.2018')),
+                       'vals': list(range(3))})
 
     mi = pd.MultiIndex.from_product([[5], [1, 2, 3]])
     expected = pd.Series(data=list(range(3)), index=mi, name='vals')
@@ -1706,11 +1706,10 @@ def test_empty_index_names():
     failed = False
     try:
         result = df.groupby([df.date.dt.month.rename(None),
-                        df.date.dt.day.rename(None)])['vals'].sum()
-    except ValueError as e:
+                             df.date.dt.day.rename(None)])['vals'].sum()
+    except ValueError:
         failed = True
 
-    assert failed == False
-
-    tm.assert_series_equal(result,expected)
+    assert failed is False
 
+    tm.assert_series_equal(result, expected)
diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py
index 5a2ad7f89670b..3e416e6fed161 100644
--- a/pandas/tests/reshape/test_pivot.py
+++ b/pandas/tests/reshape/test_pivot.py
@@ -1709,16 +1709,17 @@ def test_crosstab_dup_index_names(self):
         s = pd.Series(range(3), name='foo')
         failed = False
         try:
-           result=pd.crosstab(s,s)
-        except ValueError as e:
+            result = pd.crosstab(s, s)
+        except ValueError:
             failed = True
 
-        assert failed == False
+        assert failed is False
 
-        s0 = pd.Series(range(3),name='foo0')
-        s1 = pd.Series(range(3),name='foo1')
-        expected = pd.DataFrame(data=np.diag(np.ones(3,dtype='int64')), index=s0, columns=s1)
-        tm.assert_frame_equal(result,expected)
+        s0 = pd.Series(range(3), name='foo0')
+        s1 = pd.Series(range(3), name='foo1')
+        expected = pd.DataFrame(np.diag(np.ones(3, dtype='int64')),
+                                index=s0, columns=s1)
+        tm.assert_frame_equal(result, expected)
 
     @pytest.mark.parametrize("names", [['a', ('b', 'c')],
                                        [('a', 'b'), 'c']])

From 7cd448ac0e2907fc80ea9badce6fcfcdc88e5536 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?G=C3=BCnter=20Urak?= <guenteru@users.noreply.github.com>
Date: Wed, 23 May 2018 11:34:36 +0200
Subject: [PATCH 5/5] change groupby-behaviour (duplicates) & tests

Only duplicates get suffixed by their corresponding enumeration value:
['name', None, 'name'] gets transformed into ['name_0', None, 'name_1']

Superfluous test cases have been deleted and some additional test
statements have been added.
---
 pandas/core/groupby/groupby.py           | 22 +++++++---
 pandas/tests/groupby/test_categorical.py |  8 ----
 pandas/tests/groupby/test_groupby.py     | 52 ++++++++++++------------
 3 files changed, 43 insertions(+), 39 deletions(-)

diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
index 1c5c279d27afd..6cd2a91e9c17d 100644
--- a/pandas/core/groupby/groupby.py
+++ b/pandas/core/groupby/groupby.py
@@ -2298,18 +2298,28 @@ def levels(self):
 
     @property
     def names(self):
-        # GH 19029
         # add suffix to level name in case they contain duplicates (GH 19029):
         orig_names = [ping.name for ping in self.groupings]
         # if no names were assigned return the original names
         if all(x is None for x in orig_names):
             return orig_names
-        # in case duplicates are contained rename all of them
-        if len(set(orig_names)) < len(orig_names):
-            orig_names = [''.join([str(x), str(i)])
-                          for i, x in enumerate(orig_names)]
 
-        return orig_names
+        suffixes = collections.defaultdict(int)
+        dups = {n: count for n, count in
+                collections.Counter(orig_names).items() if count > 1}
+        new_names = []
+        for name in orig_names:
+            if name not in dups:
+                new_names.append(name)
+            else:
+                if name is not None:
+                    new_name = '{0}_{1}'.format(name, suffixes[name])
+                else:
+                    new_name = '{0}'.format(suffixes[name])
+                suffixes[name] += 1
+                new_names.append(new_name)
+
+        return new_names
 
     def size(self):
         """
diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py
index b615d1efa6f10..fc3f2b1b7c4b7 100644
--- a/pandas/tests/groupby/test_categorical.py
+++ b/pandas/tests/groupby/test_categorical.py
@@ -558,14 +558,6 @@ def test_as_index():
     result = df.groupby(['cat', s], as_index=False, observed=True).sum()
     tm.assert_frame_equal(result, expected)
 
-    # GH 19029: conflicitng names should not raise a value error anymore
-    raised = False
-    try:
-        df.groupby(['cat', s.rename('cat')], observed=True).sum()
-    except ValueError:
-        raised = True
-    assert raised is False
-
     # is original index dropped?
     group_columns = ['cat', 'A']
     expected = DataFrame(
diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py
index 52399427eddb5..a583c1230bfa4 100644
--- a/pandas/tests/groupby/test_groupby.py
+++ b/pandas/tests/groupby/test_groupby.py
@@ -1678,38 +1678,40 @@ def test_tuple_correct_keyerror():
 
 def test_dup_index_names():
     # dup. index names in groupby operations should be renamed (GH 19029):
-    df = pd.DataFrame({'date': list(pd.date_range('5.1.2018', '5.3.2018')),
+    df = pd.DataFrame({'date': pd.date_range('5.1.2018', '5.3.2018'),
                        'vals': list(range(3))})
 
-    mi = pd.MultiIndex.from_product([[5], [1, 2, 3]], names=['date0', 'date1'])
+    # duplicates get suffixed by integer position
+    mi = pd.MultiIndex.from_product([[5], [1, 2, 3]],
+                                    names=['date_0', 'date_1'])
     expected = pd.Series(data=list(range(3)), index=mi, name='vals')
-
-    failed = False
-    try:
-        result = df.groupby([df.date.dt.month, df.date.dt.day])['vals'].sum()
-    except ValueError:
-        failed = True
-
-    assert failed is False
+    result = df.groupby([df.date.dt.month, df.date.dt.day])['vals'].sum()
 
     tm.assert_series_equal(result, expected)
 
-
-def test_empty_index_names():
-    # don't rename frames in case no names were assigned (GH 19029)
-    df = pd.DataFrame({'date': list(pd.date_range('5.1.2018', '5.3.2018')),
-                       'vals': list(range(3))})
-
-    mi = pd.MultiIndex.from_product([[5], [1, 2, 3]])
+    # 2 out of 3 are duplicates and None
+    mi = pd.MultiIndex.from_product([[2018], [5], [1, 2, 3]],
+                                    names=['0', '1', 'date'])
     expected = pd.Series(data=list(range(3)), index=mi, name='vals')
+    result = df.groupby([df.date.dt.year.rename(None),
+                         df.date.dt.month.rename(None),
+                         df.date.dt.day])['vals'].sum()
+    tm.assert_series_equal(result, expected)
 
-    failed = False
-    try:
-        result = df.groupby([df.date.dt.month.rename(None),
-                             df.date.dt.day.rename(None)])['vals'].sum()
-    except ValueError:
-        failed = True
-
-    assert failed is False
+    # 2 out of 3 names (not None) are duplicates, the remaining is None
+    mi = pd.MultiIndex.from_product([[2018], [5], [1, 2, 3]],
+                                    names=['date_0', None, 'date_1'])
+    expected = pd.Series(data=list(range(3)), index=mi, name='vals')
+    result = df.groupby([df.date.dt.year,
+                         df.date.dt.month.rename(None),
+                         df.date.dt.day])['vals'].sum()
+    tm.assert_series_equal(result, expected)
 
+    # all are None
+    mi = pd.MultiIndex.from_product([[2018], [5], [1, 2, 3]],
+                                    names=[None, None, None])
+    expected = pd.Series(data=list(range(3)), index=mi, name='vals')
+    result = df.groupby([df.date.dt.year.rename(None),
+                         df.date.dt.month.rename(None),
+                         df.date.dt.day.rename(None)])['vals'].sum()
     tm.assert_series_equal(result, expected)