Skip to content

BUG: mean/median with strings #52281

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 12 commits into from
Apr 24, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions doc/source/whatsnew/v2.1.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -311,8 +311,11 @@ Timezones

Numeric
^^^^^^^
- Bug in :meth:`Series.corr` and :meth:`Series.cov` raising ``AttributeError`` for masked dtypes (:issue:`51422`)
- Bug in :meth:`Series.mean` and :meth:`DataFrame.mean` with object-dtype values containing strings that can be converted to numbers (e.g. "2") returning incorrect numeric results; these now raise ``TypeError`` (:issue:`36703`, :issue:`44008`)
- Bug in :meth:`DataFrame.corrwith` raising ``NotImplementedError`` for pyarrow-backed dtypes (:issue:`52314`)
- Bug in :meth:`Series.corr` and :meth:`Series.cov` raising ``AttributeError`` for masked dtypes (:issue:`51422`)
- Bug in :meth:`Series.median` and :meth:`DataFrame.median` with object-dtype values containing strings that can be converted to numbers (e.g. "2") returning incorrect numeric results; these now raise ``TypeError`` (:issue:`34671`)
-

Conversion
Expand Down
15 changes: 14 additions & 1 deletion pandas/core/nanops.py
Original file line number Diff line number Diff line change
Expand Up @@ -716,7 +716,8 @@ def nanmean(
dtype_count = dtype

count = _get_counts(values.shape, mask, axis, dtype=dtype_count)
the_sum = _ensure_numeric(values.sum(axis, dtype=dtype_sum))
the_sum = values.sum(axis, dtype=dtype_sum)
the_sum = _ensure_numeric(the_sum)

if axis is not None and getattr(the_sum, "ndim", False):
count = cast(np.ndarray, count)
Expand Down Expand Up @@ -775,6 +776,11 @@ def get_median(x, _mask=None):
dtype = values.dtype
values, mask = _get_values(values, skipna, mask=mask, fill_value=0)
if values.dtype.kind != "f":
if values.dtype == object:
# GH#34671 avoid casting strings to numeric
inferred = lib.infer_dtype(values)
if inferred in ["string", "mixed"]:
raise TypeError(f"Cannot convert {values} to numeric")
try:
values = values.astype("f8")
except ValueError as err:
Expand Down Expand Up @@ -1659,6 +1665,10 @@ def _ensure_numeric(x):
if x.dtype.kind in "biu":
x = x.astype(np.float64)
elif x.dtype == object:
inferred = lib.infer_dtype(x)
if inferred in ["string", "mixed"]:
# GH#44008, GH#36703 avoid casting e.g. strings to numeric
raise TypeError(f"Could not convert {x} to numeric")
try:
x = x.astype(np.complex128)
except (TypeError, ValueError):
Expand All @@ -1671,6 +1681,9 @@ def _ensure_numeric(x):
if not np.any(np.imag(x)):
x = x.real
elif not (is_float(x) or is_integer(x) or is_complex(x)):
if isinstance(x, str):
# GH#44008, GH#36703 avoid casting e.g. strings to numeric
raise TypeError(f"Could not convert string '{x}' to numeric")
try:
x = float(x)
except (TypeError, ValueError):
Expand Down
3 changes: 3 additions & 0 deletions pandas/tests/apply/test_invalid_arg.py
Original file line number Diff line number Diff line change
Expand Up @@ -244,6 +244,9 @@ def test_agg_cython_table_raises_frame(df, func, expected, axis):
def test_agg_cython_table_raises_series(series, func, expected):
# GH21224
msg = r"[Cc]ould not convert|can't multiply sequence by non-int of type"
if func == "median" or func is np.nanmedian or func is np.median:
msg = r"Cannot convert \['a' 'b' 'c'\] to numeric"

with pytest.raises(expected, match=msg):
# e.g. Series('a b'.split()).cumprod() will raise
series.agg(func)
Expand Down
44 changes: 35 additions & 9 deletions pandas/tests/frame/test_reductions.py
Original file line number Diff line number Diff line change
Expand Up @@ -169,15 +169,30 @@ def test_stat_op_api_float_string_frame(self, float_string_frame, axis, opname):
):
getattr(float_string_frame, opname)(axis=axis)
else:
msg = "|".join(
[
"Could not convert",
"could not convert",
"can't multiply sequence by non-int",
"unsupported operand type",
"not supported between instances of",
]
)
if opname in ["var", "std", "sem", "skew", "kurt"]:
msg = "could not convert string to float: 'bar'"
elif opname == "product":
if axis == 1:
msg = "can't multiply sequence by non-int of type 'float'"
else:
msg = "can't multiply sequence by non-int of type 'str'"
elif opname == "sum":
msg = r"unsupported operand type\(s\) for \+: 'float' and 'str'"
elif opname == "mean":
if axis == 0:
# different message on different builds
msg = "|".join(
[
r"Could not convert \['.*'\] to numeric",
"Could not convert string '(bar){30}' to numeric",
]
)
else:
msg = r"unsupported operand type\(s\) for \+: 'float' and 'str'"
elif opname in ["min", "max"]:
msg = "'[><]=' not supported between instances of 'float' and 'str'"
elif opname == "median":
msg = re.compile(r"Cannot convert \[.*\] to numeric", flags=re.S)
with pytest.raises(TypeError, match=msg):
getattr(float_string_frame, opname)(axis=axis)
if opname != "nunique":
Expand Down Expand Up @@ -1759,5 +1774,16 @@ def test_fails_on_non_numeric(kernel):
"argument must be a string or a real number",
]
)
if kernel == "median":
# slightly different message on different builds
msg1 = (
r"Cannot convert \[\[<class 'object'> <class 'object'> "
r"<class 'object'>\]\] to numeric"
)
msg2 = (
r"Cannot convert \[<class 'object'> <class 'object'> "
r"<class 'object'>\] to numeric"
)
msg = "|".join([msg1, msg2])
with pytest.raises(TypeError, match=msg):
getattr(df, kernel)(*args)
6 changes: 6 additions & 0 deletions pandas/tests/groupby/test_function.py
Original file line number Diff line number Diff line change
Expand Up @@ -262,6 +262,8 @@ def _check(self, df, method, expected_columns, expected_columns_numeric):
"can't multiply sequence by non-int of type 'str'",
]
)
if method == "median":
msg = r"Cannot convert \['a' 'b'\] to numeric"
with pytest.raises(exception, match=msg):
getattr(gb, method)()
else:
Expand All @@ -279,6 +281,8 @@ def _check(self, df, method, expected_columns, expected_columns_numeric):
f"Cannot perform {method} with non-ordered Categorical",
]
)
if method == "median":
msg = r"Cannot convert \['a' 'b'\] to numeric"
with pytest.raises(exception, match=msg):
getattr(gb, method)(numeric_only=False)
else:
Expand Down Expand Up @@ -1467,6 +1471,8 @@ def test_numeric_only(kernel, has_arg, numeric_only, keys):
"function is not implemented for this dtype",
]
)
if kernel == "median":
msg = r"Cannot convert \[<class 'object'> <class 'object'>\] to numeric"
with pytest.raises(exception, match=msg):
method(*args, **kwargs)
elif not has_arg and numeric_only is not lib.no_default:
Expand Down
5 changes: 4 additions & 1 deletion pandas/tests/groupby/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -654,7 +654,8 @@ def test_frame_multi_key_function_list_partial_failure():

grouped = data.groupby(["A", "B"])
funcs = [np.mean, np.std]
with pytest.raises(TypeError, match="Could not convert dullshinyshiny to numeric"):
msg = "Could not convert string 'dullshinyshiny' to numeric"
with pytest.raises(TypeError, match=msg):
grouped.agg(funcs)


Expand Down Expand Up @@ -973,6 +974,8 @@ def test_omit_nuisance_agg(df, agg_function, numeric_only):
# columns when numeric_only is False
klass = ValueError if agg_function in ("std", "sem") else TypeError
msg = "|".join(["[C|c]ould not convert", "can't multiply sequence"])
if agg_function == "median":
msg = r"Cannot convert \['one' 'three' 'two'\] to numeric"
with pytest.raises(klass, match=msg):
getattr(grouped, agg_function)(numeric_only=numeric_only)
else:
Expand Down
22 changes: 19 additions & 3 deletions pandas/tests/groupby/test_raises.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,8 +147,21 @@ def test_groupby_raises_string(
"idxmin": (TypeError, "'argmin' not allowed for this dtype"),
"last": (None, ""),
"max": (None, ""),
"mean": (TypeError, "Could not convert xy?z?w?t?y?u?i?o? to numeric"),
"median": (TypeError, "could not convert string to float"),
"mean": (
TypeError,
"Could not convert string '(xy|xyzwt|xyz|xztuo)' to numeric",
),
"median": (
TypeError,
"|".join(
[
r"Cannot convert \['x' 'y' 'z'\] to numeric",
r"Cannot convert \['x' 'y'\] to numeric",
r"Cannot convert \['x' 'y' 'z' 'w' 't'\] to numeric",
r"Cannot convert \['x' 'z' 't' 'u' 'o'\] to numeric",
]
),
),
"min": (None, ""),
"ngroup": (None, ""),
"nunique": (None, ""),
Expand Down Expand Up @@ -197,7 +210,10 @@ def test_groupby_raises_string_np(

klass, msg = {
np.sum: (None, ""),
np.mean: (TypeError, "Could not convert xy?z?w?t?y?u?i?o? to numeric"),
np.mean: (
TypeError,
"Could not convert string '(xyzwt|xy|xyz|xztuo)' to numeric",
),
}[groupby_func_np]

_call_and_check(klass, msg, how, gb, groupby_func_np, tuple())
Expand Down
4 changes: 2 additions & 2 deletions pandas/tests/resample/test_resample_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -857,8 +857,8 @@ def test_end_and_end_day_origin(
("mean", False, "Could not convert"),
("mean", lib.no_default, "Could not convert"),
("median", True, {"num": [12.5]}),
("median", False, "could not convert"),
("median", lib.no_default, "could not convert"),
("median", False, r"Cannot convert \['cat_1' 'cat_2'\] to numeric"),
("median", lib.no_default, r"Cannot convert \['cat_1' 'cat_2'\] to numeric"),
("std", True, {"num": [10.606601717798213]}),
("std", False, "could not convert string to float"),
("std", lib.no_default, "could not convert string to float"),
Expand Down
49 changes: 49 additions & 0 deletions pandas/tests/series/test_reductions.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,3 +129,52 @@ def test_validate_stat_keepdims():
)
with pytest.raises(ValueError, match=msg):
np.sum(ser, keepdims=True)


def test_mean_with_convertible_string_raises(using_array_manager):
    """Mean over numeric-looking strings must raise TypeError (GH#44008)."""
    values = Series(["1", "2"])
    # The sum of the two strings is the concatenation "12", which is the
    # value quoted in the expected error messages below.
    assert values.sum() == "12"

    series_msg = "Could not convert string '12' to numeric"
    with pytest.raises(TypeError, match=series_msg):
        values.mean()

    frame = values.to_frame()
    # The frame-level message differs unless the using_array_manager fixture
    # is truthy, in which case the Series message is expected again.
    if using_array_manager:
        frame_msg = series_msg
    else:
        frame_msg = r"Could not convert \['12'\] to numeric"
    with pytest.raises(TypeError, match=frame_msg):
        frame.mean()


def test_mean_dont_convert_j_to_complex(using_array_manager):
    """Mean over a column holding the string "J" must raise (GH#36703)."""
    frame = pd.DataFrame([{"db": "J", "numeric": 123}])

    string_msg = "Could not convert string 'J' to numeric"
    array_msg = r"Could not convert \['J'\] to numeric"
    # Frame-level reductions are matched against a message chosen by the
    # using_array_manager fixture.
    frame_msg = string_msg if using_array_manager else array_msg

    with pytest.raises(TypeError, match=frame_msg):
        frame.mean()

    with pytest.raises(TypeError, match=frame_msg):
        frame.agg("mean")

    # Column-level reductions always expect the scalar-string message.
    with pytest.raises(TypeError, match=string_msg):
        frame["db"].mean()
    with pytest.raises(TypeError, match=string_msg):
        np.mean(frame["db"].astype("string").array)


def test_median_with_convertible_string_raises(using_array_manager):
    """Median over numeric-looking strings must raise TypeError (GH#34671).

    This _could_ return a string "2", but definitely not float 2.0.
    """
    values = Series(["1", "2", "3"])
    series_msg = r"Cannot convert \['1' '2' '3'\] to numeric"
    with pytest.raises(TypeError, match=series_msg):
        values.median()

    # The frame-level message shows an extra level of brackets unless the
    # using_array_manager fixture is truthy.
    if using_array_manager:
        frame_msg = series_msg
    else:
        frame_msg = r"Cannot convert \[\['1' '2' '3'\]\] to numeric"
    frame = values.to_frame()
    with pytest.raises(TypeError, match=frame_msg):
        frame.median()
19 changes: 14 additions & 5 deletions pandas/tests/test_nanops.py
Original file line number Diff line number Diff line change
Expand Up @@ -850,7 +850,9 @@ def test_ndarray(self):

# Test convertible string ndarray
s_values = np.array(["1", "2", "3"], dtype=object)
assert np.allclose(nanops._ensure_numeric(s_values), values)
msg = r"Could not convert \['1' '2' '3'\] to numeric"
with pytest.raises(TypeError, match=msg):
nanops._ensure_numeric(s_values)

# Test non-convertible string ndarray
s_values = np.array(["foo", "bar", "baz"], dtype=object)
Expand All @@ -859,12 +861,19 @@ def test_ndarray(self):
nanops._ensure_numeric(s_values)

def test_convertable_values(self):
assert np.allclose(nanops._ensure_numeric("1"), 1.0)
assert np.allclose(nanops._ensure_numeric("1.1"), 1.1)
assert np.allclose(nanops._ensure_numeric("1+1j"), 1 + 1j)
with pytest.raises(TypeError, match="Could not convert string '1' to numeric"):
nanops._ensure_numeric("1")
with pytest.raises(
TypeError, match="Could not convert string '1.1' to numeric"
):
nanops._ensure_numeric("1.1")
with pytest.raises(
TypeError, match=r"Could not convert string '1\+1j' to numeric"
):
nanops._ensure_numeric("1+1j")

def test_non_convertable_values(self):
msg = "Could not convert foo to numeric"
msg = "Could not convert string 'foo' to numeric"
with pytest.raises(TypeError, match=msg):
nanops._ensure_numeric("foo")

Expand Down