BUG: fix groupby.aggregate resulting dtype coercion, xref pandas-dev#11444

jreback · jreback · commit 6a9081956b6d · 2017-02-27T15:22:21.000-05:00
make sure .size includes the name of the grouped
diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
@@ -628,7 +628,7 @@ Bug Fixes
 
 
 - Bug in ``.read_csv()`` with ``parse_dates`` when multiline headers are specified (:issue:`15376`)
-- Bug in ``groupby.transform()`` that would coerce the resultant dtypes back to the original (:issue:`10972`)
+- Bug in ``groupby.transform()`` that would coerce the resultant dtypes back to the original (:issue:`10972`, :issue:`11444`)
 
 
 - Bug in ``DataFrame.boxplot`` where ``fontsize`` was not applied to the tick labels on both axes (:issue:`15108`)
diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
@@ -767,19 +767,23 @@ def _index_with_as_index(self, b):
         new.names = gp.names + original.names
         return new
 
-    def _try_cast(self, result, obj):
+    def _try_cast(self, result, obj, numeric_only=False):
         """
         try to cast the result to our obj original type,
         we may have roundtripped thru object in the mean-time
 
+        if numeric_only is True, then only try to cast numerics
+        and not datetimelikes
+
         """
         if obj.ndim > 1:
             dtype = obj.values.dtype
         else:
             dtype = obj.dtype
 
         if not is_scalar(result):
-            result = _possibly_downcast_to_dtype(result, dtype)
+            if numeric_only and is_numeric_dtype(dtype) or not numeric_only:
+                result = _possibly_downcast_to_dtype(result, dtype)
 
         return result
 
@@ -830,7 +834,7 @@ def _python_agg_general(self, func, *args, **kwargs):
         for name, obj in self._iterate_slices():
             try:
                 result, counts = self.grouper.agg_series(obj, f)
-                output[name] = self._try_cast(result, obj)
+                output[name] = self._try_cast(result, obj, numeric_only=True)
             except TypeError:
                 continue
 
@@ -1117,7 +1121,9 @@ def sem(self, ddof=1):
     @Appender(_doc_template)
     def size(self):
         """Compute group sizes"""
-        return self.grouper.size()
+        result = self.grouper.size()
+        result.name = getattr(self, 'name', None)
+        return result
 
     sum = _groupby_function('sum', 'add', np.sum)
     prod = _groupby_function('prod', 'prod', np.prod)
@@ -1689,7 +1695,9 @@ def size(self):
         ids, _, ngroup = self.group_info
         ids = _ensure_platform_int(ids)
         out = np.bincount(ids[ids != -1], minlength=ngroup or None)
-        return Series(out, index=self.result_index, dtype='int64')
+        return Series(out,
+                      index=self.result_index,
+                      dtype='int64')
 
     @cache_readonly
     def _max_groupsize(self):
@@ -2908,7 +2916,8 @@ def transform(self, func, *args, **kwargs):
         result = concat(results).sort_index()
 
         # we will only try to coerce the result type if
-        # we have a numeric dtype
+        # we have a numeric dtype, as these are *always* udfs
+        # the cython take a different path (and casting)
         dtype = self._selected_obj.dtype
         if is_numeric_dtype(dtype):
             result = _possibly_downcast_to_dtype(result, dtype)
diff --git a/pandas/tests/groupby/test_aggregate.py b/pandas/tests/groupby/test_aggregate.py
@@ -154,6 +154,29 @@ def test_agg_dict_parameter_cast_result_dtypes(self):
         assert_series_equal(grouped.time.last(), exp['time'])
         assert_series_equal(grouped.time.agg('last'), exp['time'])
 
+        # count
+        exp = pd.Series([2, 2, 2, 2],
+                        index=Index(list('ABCD'), name='class'),
+                        name='time')
+        assert_series_equal(grouped.time.agg(len), exp)
+        assert_series_equal(grouped.time.size(), exp)
+
+        exp = pd.Series([0, 1, 1, 2],
+                        index=Index(list('ABCD'), name='class'),
+                        name='time')
+        assert_series_equal(grouped.time.count(), exp)
+
+    def test_agg_cast_results_dtypes(self):
+        # similar to GH12821
+        # xref #11444
+        u = [datetime(2015, x + 1, 1) for x in range(12)]
+        v = list('aaabbbbbbccd')
+        df = pd.DataFrame({'X': v, 'Y': u})
+
+        result = df.groupby('X')['Y'].agg(len)
+        expected = df.groupby('X')['Y'].count()
+        assert_series_equal(result, expected)
+
     def test_agg_must_agg(self):
         grouped = self.df.groupby('A')['C']
         self.assertRaises(Exception, grouped.agg, lambda x: x.describe())