
Commit 3be09bc

Fix apply to only call func once on the first column/row
1 parent 911e19b commit 3be09bc

3 files changed: +51 −23 lines
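Context for the change: the fast Cython reduction path evaluates the function on the first row/column to decide whether the result reduces; before this commit, the pure-Python fallback then evaluated it again on that same element, so side effects ran twice there. A minimal sketch of the intended behaviour, assuming a pandas build that includes this fix (the `record` helper and its counter are illustrative only, not part of the commit):

import pandas as pd

calls = []

def record(row):
    # side effect: remember every row the function is called on
    calls.append(row.name)
    return [1, 2, 3]  # non-reducing result, so apply falls back to the Python path

df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
df.apply(record, axis=1)

# With this fix each row should be visited exactly once; previously the first
# row could appear twice because the fallback re-ran the function on it.
print(calls)  # expected: [0, 1]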

pandas/_libs/reduction.pyx

Lines changed: 17 additions & 4 deletions
@@ -107,6 +107,7 @@ cdef class Reducer:
 
         result = np.empty(self.nresults, dtype='O')
         it = <flatiter>PyArray_IterNew(result)
+        partial_result = None
 
         try:
             for i in range(self.nresults):
@@ -134,21 +135,33 @@
                 res = self.f(chunk)
 
                 # TODO: reason for not squeezing here?
-                res = _extract_result(res, squeeze=False)
+                extracted_res = _extract_result(res, squeeze=False)
                 if i == 0:
                     # On the first pass, we check the output shape to see
                     # if this looks like a reduction.
-                    _check_result_array(res, len(self.dummy))
+                    # if it does not, return the computed value to be used by the pure python implementation,
+                    # so the function won't be called twice on the same object (and side effects would occur twice)
+                    try:
+                        _check_result_array(extracted_res, len(self.dummy))
+                    except ValueError as err:
+                        if "Function does not reduce" not in str(err):
+                            # catch only the specific exception
+                            raise
 
-                PyArray_SETITEM(result, PyArray_ITER_DATA(it), res)
+                        partial_result = copy(res)
+                        break
+
+
+                PyArray_SETITEM(result, PyArray_ITER_DATA(it), extracted_res)
                 chunk.data = chunk.data + self.increment
                 PyArray_ITER_NEXT(it)
+
         finally:
             # so we don't free the wrong memory
             chunk.data = dummy_buf
 
         result = maybe_convert_objects(result)
-        return result
+        return result, partial_result
 
 
 cdef class _BaseGrouper:
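With this change, Reducer.get_result (and therefore libreduction.compute_reduction) returns a (result, partial_result) pair instead of raising when the first evaluation turns out not to be a reduction. The following is a rough pure-Python sketch of that control flow, not the Cython code itself; reduce_chunks, func and check_result_array are illustrative stand-ins for the method, self.f and _check_result_array:

from copy import copy

def reduce_chunks(chunks, func, check_result_array):
    # Sketch only: reduce each chunk, but if the first result shows this is
    # not a reduction, keep that value instead of discarding it so the
    # caller's fallback does not have to call func on chunk 0 again.
    results = []
    partial_result = None
    for i, chunk in enumerate(chunks):
        res = func(chunk)
        if i == 0:
            try:
                check_result_array(res)
            except ValueError as err:
                if "Function does not reduce" not in str(err):
                    raise
                partial_result = copy(res)
                break
        results.append(res)
    return results, partial_result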

pandas/core/apply.py

Lines changed: 31 additions & 18 deletions
@@ -18,6 +18,9 @@
 
 from pandas.core.construction import create_series_with_explicit_dtype
 
+from pandas.core.series import Series
+from pandas import DataFrame
+
 if TYPE_CHECKING:
     from pandas import DataFrame, Series, Index
 
@@ -220,14 +223,13 @@ def apply_empty_result(self):
 
     def apply_raw(self):
         """ apply to the values as a numpy array """
-        try:
-            result = libreduction.compute_reduction(self.values, self.f, axis=self.axis)
-        except ValueError as err:
-            if "Function does not reduce" not in str(err):
-                # catch only ValueError raised intentionally in libreduction
-                raise
-            # We expect np.apply_along_axis to give a two-dimensional result, or
-            # also raise.
+        result, partial_result = libreduction.compute_reduction(
+            self.values, self.f, axis=self.axis
+        )
+
+        # A non None partial_result means that the reduction was unsuccessful
+        # We expect np.apply_along_axis to give a two-dimensional result, or raise.
+        if partial_result is not None:
             result = np.apply_along_axis(self.f, self.axis, self.values)
 
         # TODO: mixed type case
@@ -265,6 +267,7 @@ def apply_broadcast(self, target: "DataFrame") -> "DataFrame":
 
     def apply_standard(self):
 
+        partial_result = None
         # try to reduce first (by default)
         # this only matters if the reduction in values is of different dtype
         # e.g. if we want to apply to a SparseFrame, then can't directly reduce
@@ -292,13 +295,9 @@
             )
 
             try:
-                result = libreduction.compute_reduction(
+                result, partial_result = libreduction.compute_reduction(
                     values, self.f, axis=self.axis, dummy=dummy, labels=labels
                 )
-            except ValueError as err:
-                if "Function does not reduce" not in str(err):
-                    # catch only ValueError raised intentionally in libreduction
-                    raise
             except TypeError:
                 # e.g. test_apply_ignore_failures we just ignore
                 if not self.ignore_failures:
@@ -307,23 +306,36 @@
                 # reached via numexpr; fall back to python implementation
                 pass
             else:
-                return self.obj._constructor_sliced(result, index=labels)
+                # this means that the reduction was successful
+                if partial_result is None:
+                    return self.obj._constructor_sliced(result, index=labels)
+                else:
+                    if isinstance(partial_result, Series):
+                        partial_result = DataFrame.infer_objects(partial_result)
 
         # compute the result using the series generator
-        results, res_index = self.apply_series_generator()
+        results, res_index = self.apply_series_generator(partial_result)
 
         # wrap results
        return self.wrap_results(results, res_index)
 
-    def apply_series_generator(self) -> Tuple[ResType, "Index"]:
+    def apply_series_generator(self, partial_result=None) -> Tuple[ResType, "Index"]:
         series_gen = self.series_generator
         res_index = self.result_index
 
         keys = []
         results = {}
+
+        # If a partial result was already computed, use it instead of running on the first element again
+        series_gen_enumeration = enumerate(series_gen)
+        if partial_result is not None:
+            i, v = next(series_gen_enumeration)
+            results[i] = partial_result
+            keys.append(v.name)
+
         if self.ignore_failures:
             successes = []
-            for i, v in enumerate(series_gen):
+            for i, v in series_gen_enumeration:
                 try:
                     results[i] = self.f(v)
                 except Exception:
@@ -337,7 +349,8 @@ def apply_series_generator(self) -> Tuple[ResType, "Index"]:
             res_index = res_index.take(successes)
 
         else:
-            for i, v in enumerate(series_gen):
+            for i, v in series_gen_enumeration:
+
                 results[i] = self.f(v)
                 keys.append(v.name)
 
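A minimal sketch (names are illustrative, not pandas internals) of how the seeded enumeration in apply_series_generator avoids re-running the function on the first element: the precomputed partial_result fills slot 0, and the same enumerate object is reused so the loop continues from element 1.

def run_series_generator(series_gen, func, partial_result=None):
    # Mirrors the non-ignore_failures branch above in plain Python.
    results = {}
    keys = []
    gen = enumerate(series_gen)
    if partial_result is not None:
        # consume element 0 without calling func on it again
        i, v = next(gen)
        results[i] = partial_result
        keys.append(v.name)
    for i, v in gen:
        results[i] = func(v)
        keys.append(v.name)
    return results, keys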

pandas/tests/frame/test_apply.py

Lines changed: 3 additions & 1 deletion
@@ -718,7 +718,9 @@ def apply_list(row):
 
     def test_apply_noreduction_tzaware_object(self):
         # https://github.com/pandas-dev/pandas/issues/31505
-        df = pd.DataFrame({"foo": [pd.Timestamp("2020", tz="UTC")]}, dtype="object")
+        df = pd.DataFrame(
+            {"foo": [pd.Timestamp("2020", tz="UTC")]}, dtype="datetime64[ns, UTC]"
+        )
         result = df.apply(lambda x: x)
         tm.assert_frame_equal(result, df)
         result = df.apply(lambda x: x.copy())
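For reference, the updated regression test exercises the non-reducing, tz-aware case end to end; run interactively it amounts to roughly the following (assuming the pandas._testing helpers used by the test suite):

import pandas as pd
import pandas._testing as tm

df = pd.DataFrame(
    {"foo": [pd.Timestamp("2020", tz="UTC")]}, dtype="datetime64[ns, UTC]"
)
result = df.apply(lambda x: x)     # identity apply; no reduction takes place
tm.assert_frame_equal(result, df)  # the frame should round-trip unchanged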
