diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 03ad8ed162c95..7b271539262d7 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -960,6 +960,7 @@ MultiIndex - :func:`MultiIndex.get_level_values` accessing a :class:`DatetimeIndex` does not carry the frequency attribute along (:issue:`58327`, :issue:`57949`) - Bug in :class:`DataFrame` arithmetic operations in case of unaligned MultiIndex columns (:issue:`60498`) - Bug in :class:`DataFrame` arithmetic operations with :class:`Series` in case of unaligned MultiIndex (:issue:`61009`) +- Bug in :class:`DataFrame` flex arithmetic methods (e.g. :meth:`DataFrame.add`) raising ``NotImplementedError`` when passed a :class:`Series` together with ``fill_value`` (:issue:`61581`) - Bug in :meth:`MultiIndex.from_tuples` causing wrong output with input of type tuples having NaN values (:issue:`60695`, :issue:`60988`) - Bug in :meth:`DataFrame.reindex` and :meth:`Series.reindex` where reindexing :class:`Index` to a :class:`MultiIndex` would incorrectly set all values to ``NaN``.(:issue:`60923`) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 7bd5e6fe255eb..bac63e4f21013 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -873,6 +873,13 @@ def _evaluate_op_method(self, other, op, arrow_funcs) -> Self: or pa.types.is_binary(pa_type) ): if op in [operator.add, roperator.radd]: + # pyarrow gets upset if you try to join a NullArray + if ( + pa.types.is_integer(other.type) + or pa.types.is_floating(other.type) + or pa.types.is_null(other.type) + ): + other = other.cast(pa_type) sep = pa.scalar("", type=pa_type) try: if op is operator.add: @@ -903,7 +910,7 @@ def _evaluate_op_method(self, other, op, arrow_funcs) -> Self: raise TypeError("Can only string multiply by an integer.") pa_integral = pc.if_else(pc.less(integral, 0), 0, integral) result = pc.binary_repeat(binary, pa_integral) - return self._from_pyarrow_array(result) + return type(self)(result) if
( isinstance(other, pa.Scalar) and pc.is_null(other).as_py() diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 0dcb15738c276..3a54893875b1f 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -8451,12 +8451,17 @@ def _maybe_align_series_as_frame(self, series: Series, axis: AxisInt): """ rvalues = series._values if not isinstance(rvalues, np.ndarray): - # TODO(EA2D): no need to special-case with 2D EAs - if rvalues.dtype in ("datetime64[ns]", "timedelta64[ns]"): - # We can losslessly+cheaply cast to ndarray - rvalues = np.asarray(rvalues) + if axis == 0: + df = DataFrame(dict.fromkeys(range(self.shape[1]), rvalues)) else: - return series + nrows = self.shape[0] + df = DataFrame( + {i: rvalues[[i]].repeat(nrows) for i in range(self.shape[1])}, + dtype=rvalues.dtype, + ) + df.index = self.index + df.columns = self.columns + return df if axis == 0: rvalues = rvalues.reshape(-1, 1) @@ -8480,11 +8485,6 @@ def _flex_arith_method( if self._should_reindex_frame_op(other, op, axis, fill_value, level): return self._arith_method_with_reindex(other, op) - if isinstance(other, Series) and fill_value is not None: - # TODO: We could allow this in cases where we end up going - # through the DataFrame path - raise NotImplementedError(f"fill_value {fill_value} not supported.") - other = ops.maybe_prepare_scalar_for_op(other, self.shape) self, other = self._align_for_op(other, axis, flex=True, level=level) diff --git a/pandas/tests/arithmetic/test_period.py b/pandas/tests/arithmetic/test_period.py index 67762e0b89c73..573d791daaecb 100644 --- a/pandas/tests/arithmetic/test_period.py +++ b/pandas/tests/arithmetic/test_period.py @@ -1361,12 +1361,7 @@ def test_period_add_timestamp_raises(self, box_with_array): arr + ts with pytest.raises(TypeError, match=msg): ts + arr - if box_with_array is pd.DataFrame: - # TODO: before implementing resolution-inference we got the same - # message with DataFrame and non-DataFrame. Why did that change? 
- msg = "cannot add PeriodArray and Timestamp" - else: - msg = "cannot add PeriodArray and DatetimeArray" + msg = "cannot add PeriodArray and DatetimeArray" with pytest.raises(TypeError, match=msg): arr + Series([ts]) with pytest.raises(TypeError, match=msg): @@ -1376,16 +1371,11 @@ def test_period_add_timestamp_raises(self, box_with_array): with pytest.raises(TypeError, match=msg): pd.Index([ts]) + arr - if box_with_array is pd.DataFrame: - msg = "cannot add PeriodArray and DatetimeArray" - else: - msg = r"unsupported operand type\(s\) for \+: 'Period' and 'DatetimeArray" + msg = "cannot add PeriodArray and DatetimeArray" + with pytest.raises(TypeError, match=msg): arr + pd.DataFrame([ts]) - if box_with_array is pd.DataFrame: - msg = "cannot add PeriodArray and DatetimeArray" - else: - msg = r"unsupported operand type\(s\) for \+: 'DatetimeArray' and 'Period'" + with pytest.raises(TypeError, match=msg): pd.DataFrame([ts]) + arr diff --git a/pandas/tests/arrays/boolean/test_arithmetic.py b/pandas/tests/arrays/boolean/test_arithmetic.py index 9ff690cdc914d..4e2f359075f05 100644 --- a/pandas/tests/arrays/boolean/test_arithmetic.py +++ b/pandas/tests/arrays/boolean/test_arithmetic.py @@ -118,7 +118,7 @@ def test_error_invalid_values(data, all_arithmetic_operators): ops(pd.Timestamp("20180101")) # invalid array-likes - if op not in ("__mul__", "__rmul__"): + if op not in ("__mul__", "__rmul__", "__add__", "__radd__"): # TODO(extension) numpy's mul with object array sees booleans as numbers msg = "|".join( [ diff --git a/pandas/tests/arrays/floating/test_arithmetic.py b/pandas/tests/arrays/floating/test_arithmetic.py index 777099e76fc73..7eec2db003909 100644 --- a/pandas/tests/arrays/floating/test_arithmetic.py +++ b/pandas/tests/arrays/floating/test_arithmetic.py @@ -152,8 +152,38 @@ def test_error_invalid_values(data, all_arithmetic_operators): ops(pd.Timestamp("20180101")) # invalid array-likes - with pytest.raises(TypeError, match=msg): - ops(pd.Series("foo", 
index=s.index)) + str_ser = pd.Series("foo", index=s.index) + if all_arithmetic_operators in [ + "__add__", + "__radd__", + ]: + res = ops(str_ser) + if all_arithmetic_operators == "__radd__": + data_expected = [] + for i in data: + if pd.isna(i): + data_expected.append(i) + elif i.is_integer(): + data_expected.append("foo" + str(int(i))) + else: + data_expected.append("foo" + str(i)) + + expected = pd.Series(data_expected, index=s.index) + else: + data_expected = [] + for i in data: + if pd.isna(i): + data_expected.append(i) + elif i.is_integer(): + data_expected.append(str(int(i)) + "foo") + else: + data_expected.append(str(i) + "foo") + + expected = pd.Series(data_expected, index=s.index) + tm.assert_series_equal(res, expected) + else: + with pytest.raises(TypeError, match=msg): + ops(str_ser) msg = "|".join( [ diff --git a/pandas/tests/arrays/integer/test_arithmetic.py b/pandas/tests/arrays/integer/test_arithmetic.py index aeceb9b8a3cb1..623b70092f424 100644 --- a/pandas/tests/arrays/integer/test_arithmetic.py +++ b/pandas/tests/arrays/integer/test_arithmetic.py @@ -197,6 +197,22 @@ def test_error_invalid_values(data, all_arithmetic_operators): # assert_almost_equal stricter, but the expected with pd.NA seems # more-correct than np.nan here. 
tm.assert_series_equal(res, expected) + elif all_arithmetic_operators in [ + "__add__", + "__radd__", + ]: + res = ops(str_ser) + if all_arithmetic_operators == "__radd__": + expected = pd.Series( + [np.nan if pd.isna(x) == 1 else "foo" + str(x) for x in data], + index=s.index, + ) + else: + expected = pd.Series( + [np.nan if pd.isna(x) == 1 else str(x) + "foo" for x in data], + index=s.index, + ) + tm.assert_series_equal(res, expected) else: with tm.external_error_raised(TypeError): ops(str_ser) diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 9dae3ae384255..5359b6d1614e6 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -253,7 +253,6 @@ def test_mul(dtype): tm.assert_extension_array_equal(result, expected) -@pytest.mark.xfail(reason="GH-28527") def test_add_strings(dtype): arr = pd.array(["a", "b", "c", "d"], dtype=dtype) df = pd.DataFrame([["t", "y", "v", "w"]], dtype=object) @@ -268,20 +267,22 @@ def test_add_strings(dtype): tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(reason="GH-28527") def test_add_frame(dtype): arr = pd.array(["a", "b", np.nan, np.nan], dtype=dtype) df = pd.DataFrame([["x", np.nan, "y", np.nan]]) assert arr.__add__(df) is NotImplemented + # TODO: pyarrow returns a different dtype even though the values are + # the same, so the dtype is not checked below; + # could be addressed in this PR if needed result = arr + df expected = pd.DataFrame([["ax", np.nan, np.nan, np.nan]]).astype(dtype) - tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected, check_dtype=False) result = df + arr expected = pd.DataFrame([["xa", np.nan, np.nan, np.nan]]).astype(dtype) - tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected, check_dtype=False) def test_comparison_methods_scalar(comparison_op, dtype): diff --git a/pandas/tests/frame/test_arithmetic.py b/pandas/tests/frame/test_arithmetic.py index
a9a98a5005bb3..228d62878fc38 100644 --- a/pandas/tests/frame/test_arithmetic.py +++ b/pandas/tests/frame/test_arithmetic.py @@ -626,12 +626,6 @@ def test_arith_flex_frame_corner(self, float_frame): expected = float_frame.sort_index() * np.nan tm.assert_frame_equal(result, expected) - with pytest.raises(NotImplementedError, match="fill_value"): - float_frame.add(float_frame.iloc[0], fill_value=3) - - with pytest.raises(NotImplementedError, match="fill_value"): - float_frame.add(float_frame.iloc[0], axis="index", fill_value=3) - @pytest.mark.parametrize("op", ["add", "sub", "mul", "mod"]) def test_arith_flex_series_ops(self, simple_frame, op): # after arithmetic refactor, add truediv here @@ -665,19 +659,6 @@ def test_arith_flex_series_broadcasting(self, any_real_numpy_dtype): result = df.div(df[0], axis="index") tm.assert_frame_equal(result, expected) - def test_arith_flex_zero_len_raises(self): - # GH 19522 passing fill_value to frame flex arith methods should - # raise even in the zero-length special cases - ser_len0 = Series([], dtype=object) - df_len0 = DataFrame(columns=["A", "B"]) - df = DataFrame([[1, 2], [3, 4]], columns=["A", "B"]) - - with pytest.raises(NotImplementedError, match="fill_value"): - df.add(ser_len0, fill_value="E") - - with pytest.raises(NotImplementedError, match="fill_value"): - df_len0.sub(df["A"], axis=None, fill_value=3) - def test_flex_add_scalar_fill_value(self): # GH#12723 dat = np.array([0, 1, np.nan, 3, 4, 5], dtype="float") @@ -2192,3 +2173,61 @@ def test_mixed_col_index_dtype(string_dtype_no_object): expected.columns = expected.columns.astype(string_dtype_no_object) tm.assert_frame_equal(result, expected) + + +dt_params = [ + (tm.ALL_INT_NUMPY_DTYPES[0], 5), + (tm.ALL_INT_EA_DTYPES[0], 5), + (tm.FLOAT_NUMPY_DTYPES[0], 4.9), + (tm.FLOAT_EA_DTYPES[0], 4.9), +] + +axes = [0, 1] + + +@pytest.mark.parametrize( + "data_type,fill_val, axis", + [(dt, val, axis) for axis in axes for dt, val in dt_params], +) +def 
test_df_fill_value_dtype(data_type, fill_val, axis): + # GH 61581 + base_data = np.arange(25).reshape(5, 5) + mult_list = [1, np.nan, 5, np.nan, 3] + np_int_flag = 0 + + try: + mult_data = pd.array(mult_list, dtype=data_type) + except ValueError as e: + # Numpy int type cannot represent NaN, it will end up here + if "cannot convert float NaN to integer" in str(e): + mult_data = np.asarray(mult_list) + np_int_flag = 1 + + columns = list("ABCDE") + df = DataFrame(base_data, columns=columns) + + for i in range(df.shape[0]): + try: + df.iat[i, i] = np.nan + df.iat[i + 1, i] = pd.NA + df.iat[i + 3, i] = pd.NA + except IndexError: + pass + + mult_mat = np.broadcast_to(mult_data, df.shape) + if axis == 0: + mask = np.isnan(mult_mat).T + else: + mask = np.isnan(mult_mat) + mask = df.isna().values & mask + + df_result = df.mul(mult_data, axis=axis, fill_value=fill_val) + if np_int_flag == 1: + mult_np = np.nan_to_num(mult_data, nan=fill_val) + df_expected = (df.fillna(fill_val).mul(mult_np, axis=axis)).mask(mask, np.nan) + else: + df_expected = ( + df.fillna(fill_val).mul(mult_data.fillna(fill_val), axis=axis) + ).mask(mask, np.nan) + + tm.assert_frame_equal(df_result, df_expected)