Skip to content

BUG: Coerce to numeric despite uint64 conflict #17823

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Oct 9, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.21.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -828,6 +828,7 @@ Conversion
- Bug in ``Timestamp.replace`` when replacing ``tzinfo`` around DST changes (:issue:`15683`)
- Bug in ``Timedelta`` construction and arithmetic that would not propagate the ``Overflow`` exception (:issue:`17367`)
- Bug in :meth:`~DataFrame.astype` converting to object dtype when passed extension type classes (`DatetimeTZDtype``, ``CategoricalDtype``) rather than instances. Now a ``TypeError`` is raised when a class is passed (:issue:`17780`).
- Bug in :meth:`to_numeric` in which elements were not always being coerced to numeric when ``errors='coerce'`` (:issue:`17007`, :issue:`17125`)

Indexing
^^^^^^^^
Expand Down
27 changes: 12 additions & 15 deletions pandas/_libs/src/inference.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -165,20 +165,8 @@ cdef class Seen(object):
two conflict cases was also detected. However, we are
trying to force conversion to a numeric dtype.
"""
if self.uint_ and (self.null_ or self.sint_):
if not self.coerce_numeric:
return True

if self.null_:
msg = ("uint64 array detected, and such an "
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

interesting, so these cases were not actually tested then? (IOW you didn't have to fix any tests)?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nvm. I see you blew away the entire section and re-wrote.

"array cannot contain NaN.")
else: # self.sint_ = 1
msg = ("uint64 and negative values detected. "
"Cannot safely return a numeric array "
"without truncating data.")

raise ValueError(msg)
return False
return (self.uint_ and (self.null_ or self.sint_)
and not self.coerce_numeric)

cdef inline saw_null(self):
"""
Expand Down Expand Up @@ -1103,10 +1091,17 @@ def maybe_convert_numeric(ndarray[object] values, set na_values,
seen.saw_int(val)

if val >= 0:
uints[i] = val
if val <= oUINT64_MAX:
uints[i] = val
else:
seen.float_ = True

if val <= oINT64_MAX:
ints[i] = val

if seen.sint_ and seen.uint_:
seen.float_ = True

elif util.is_bool_object(val):
floats[i] = uints[i] = ints[i] = bools[i] = val
seen.bool_ = True
Expand Down Expand Up @@ -1154,6 +1149,8 @@ def maybe_convert_numeric(ndarray[object] values, set na_values,
uints[i] = as_int
if as_int <= oINT64_MAX:
ints[i] = as_int

seen.float_ = seen.float_ or (seen.uint_ and seen.sint_)
else:
seen.float_ = True
except (TypeError, ValueError) as e:
Expand Down
75 changes: 37 additions & 38 deletions pandas/tests/dtypes/test_inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,11 @@
from pandas.util import testing as tm


@pytest.fixture(params=[True, False], ids=lambda val: str(val))
def coerce(request):
return request.param
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we do this as a parameterize instead of fixture? (I don't know whether are some 'rules' where it is better to use one or the other, but in this case it really seems to be a parameterizing of the test, so I would find it easier to read the code if it uses paramatrize)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I use fixture whenever I find myself using the same parametrization multiple times. As you can see below, I use it three times.



def test_is_sequence():
is_seq = inference.is_sequence
assert (is_seq((1, 2)))
Expand Down Expand Up @@ -340,44 +345,38 @@ def test_convert_numeric_uint64(self):
exp = np.array([2**63], dtype=np.uint64)
tm.assert_numpy_array_equal(lib.maybe_convert_numeric(arr, set()), exp)

def test_convert_numeric_uint64_nan(self):
msg = 'uint64 array detected'
cases = [(np.array([2**63, np.nan], dtype=object), set()),
(np.array([str(2**63), np.nan], dtype=object), set()),
(np.array([np.nan, 2**63], dtype=object), set()),
(np.array([np.nan, str(2**63)], dtype=object), set()),
(np.array([2**63, 2**63 + 1], dtype=object), set([2**63])),
(np.array([str(2**63), str(2**63 + 1)],
dtype=object), set([2**63]))]

for coerce in (True, False):
for arr, na_values in cases:
if coerce:
with tm.assert_raises_regex(ValueError, msg):
lib.maybe_convert_numeric(arr, na_values,
coerce_numeric=coerce)
else:
tm.assert_numpy_array_equal(lib.maybe_convert_numeric(
arr, na_values), arr)

def test_convert_numeric_int64_uint64(self):
msg = 'uint64 and negative values detected'
cases = [np.array([2**63, -1], dtype=object),
np.array([str(2**63), -1], dtype=object),
np.array([str(2**63), str(-1)], dtype=object),
np.array([-1, 2**63], dtype=object),
np.array([-1, str(2**63)], dtype=object),
np.array([str(-1), str(2**63)], dtype=object)]

for coerce in (True, False):
for case in cases:
if coerce:
with tm.assert_raises_regex(ValueError, msg):
lib.maybe_convert_numeric(case, set(),
coerce_numeric=coerce)
else:
tm.assert_numpy_array_equal(lib.maybe_convert_numeric(
case, set()), case)
@pytest.mark.parametrize("arr", [
np.array([2**63, np.nan], dtype=object),
np.array([str(2**63), np.nan], dtype=object),
np.array([np.nan, 2**63], dtype=object),
np.array([np.nan, str(2**63)], dtype=object)])
def test_convert_numeric_uint64_nan(self, coerce, arr):
expected = arr.astype(float) if coerce else arr.copy()
result = lib.maybe_convert_numeric(arr, set(),
coerce_numeric=coerce)
tm.assert_almost_equal(result, expected)

def test_convert_numeric_uint64_nan_values(self, coerce):
arr = np.array([2**63, 2**63 + 1], dtype=object)
na_values = set([2**63])

expected = (np.array([np.nan, 2**63 + 1], dtype=float)
if coerce else arr.copy())
result = lib.maybe_convert_numeric(arr, na_values,
coerce_numeric=coerce)
tm.assert_almost_equal(result, expected)

@pytest.mark.parametrize("case", [
np.array([2**63, -1], dtype=object),
np.array([str(2**63), -1], dtype=object),
np.array([str(2**63), str(-1)], dtype=object),
np.array([-1, 2**63], dtype=object),
np.array([-1, str(2**63)], dtype=object),
np.array([str(-1), str(2**63)], dtype=object)])
def test_convert_numeric_int64_uint64(self, case, coerce):
expected = case.astype(float) if coerce else case.copy()
result = lib.maybe_convert_numeric(case, set(), coerce_numeric=coerce)
tm.assert_almost_equal(result, expected)

def test_maybe_convert_objects_uint64(self):
# see gh-4471
Expand Down
25 changes: 25 additions & 0 deletions pandas/tests/tools/test_numeric.py
Original file line number Diff line number Diff line change
Expand Up @@ -381,3 +381,28 @@ def test_downcast_limits(self):
for dtype, downcast, min_max in dtype_downcast_min_max:
series = pd.to_numeric(pd.Series(min_max), downcast=downcast)
assert series.dtype == dtype

def test_coerce_uint64_conflict(self):
# see gh-17007 and gh-17125
#
# Still returns float despite the uint64-nan conflict,
# which would normally force the casting to object.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you also test with errors='raise' and ignore here

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done.

df = pd.DataFrame({"a": [200, 300, "", "NaN", 30000000000000000000]})
expected = pd.Series([200, 300, np.nan, np.nan,
30000000000000000000], dtype=float, name="a")
result = to_numeric(df["a"], errors="coerce")
tm.assert_series_equal(result, expected)

s = pd.Series(["12345678901234567890", "1234567890", "ITEM"])
expected = pd.Series([12345678901234567890,
1234567890, np.nan], dtype=float)
result = to_numeric(s, errors="coerce")
tm.assert_series_equal(result, expected)

# For completeness, check against "ignore" and "raise"
result = to_numeric(s, errors="ignore")
tm.assert_series_equal(result, s)

msg = "Unable to parse string"
with tm.assert_raises_regex(ValueError, msg):
to_numeric(s, errors="raise")