Skip to content

Commit 39fe877

Browse files
committed
ENH: add 'downcast' to pd.to_numeric
Closes gh-13352 (pandas-dev).
1 parent bf4786a commit 39fe877

File tree

4 files changed

+110
-13
lines changed

4 files changed

+110
-13
lines changed

asv_bench/benchmarks/inference.py

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -135,4 +135,25 @@ def setup(self):
135135
self.df_timedelta64 = DataFrame(dict(A=(self.df_datetime64['A'] - self.df_datetime64['B']), B=self.df_datetime64['B']))
136136

137137
def time_dtype_infer_uint32(self):
138-
(self.df_uint32['A'] + self.df_uint32['B'])
138+
(self.df_uint32['A'] + self.df_uint32['B'])
139+
140+
141+
class to_numeric(object):
    """Benchmark ``pd.to_numeric`` called with the ``downcast`` argument.

    ``params`` holds two axes, named by ``param_names``:

    * ``data``  -- the input handed to ``pd.to_numeric``: string/integer
      mixes, plain integers, a ``datetime64[D]`` array and an ``int32``
      array.
    * ``dtype`` -- the value passed as ``downcast``.
    """

    N = 500000

    param_names = ['data', 'dtype']
    params = [
        # Each list-based data set is two halves of ``N // 2`` elements.
        # The repetition count is parenthesised because ``['1'] * N / 2``
        # parses as ``(['1'] * N) / 2`` and raises TypeError.
        [['1'] * (N // 2) + [2] * (N // 2),
         ['-1'] * (N // 2) + [2] * (N // 2),
         # The two dates must be passed as a single sequence; passing them
         # as separate positional arguments supplies ``dtype`` twice.
         np.repeat(np.array(['1970-01-01', '1970-01-02'],
                            dtype='datetime64[D]'), N),
         ['1.1'] * (N // 2) + [2] * (N // 2),
         [1] * (N // 2) + [2] * (N // 2),
         np.repeat(np.int32(1), N)],
        ['int64', 'uint64', 'int32', 'uint32',
         'int16', 'uint16', 'int8', 'uint8',
         'float64', 'float32', float, int],
    ]

    def time_to_numeric(self, data, dtype):
        """Time one ``pd.to_numeric`` call for a (data, dtype) pair."""
        pd.to_numeric(data, downcast=dtype)

doc/source/whatsnew/v0.18.2.txt

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -186,6 +186,12 @@ Other enhancements
186186
^^^^^^^^^^^^^^^^^^
187187

188188
- The ``.tz_localize()`` method of ``DatetimeIndex`` and ``Timestamp`` has gained the ``errors`` keyword, so you can potentially coerce nonexistent timestamps to ``NaT``. The default behaviour is still to raise a ``NonExistentTimeError`` (:issue:`13057`)
189+
- ``pd.to_numeric()`` now accepts a ``downcast`` parameter, which downcasts the resulting data, if possible, to the specified numerical dtype (:issue:`13352`)
190+
191+
.. ipython:: python
192+
193+
s = ['1', 2, 3]
194+
pd.to_numeric(s, downcast=np.int8)
189195

190196
- ``Index`` now supports ``.str.extractall()`` which returns a ``DataFrame``, see :ref:`documentation here <text.extractall>` (:issue:`10008`, :issue:`13156`)
191197
- ``.to_hdf/read_hdf()`` now accept path objects (e.g. ``pathlib.Path``, ``py.path.local``) for the file path (:issue:`11773`)

pandas/tools/tests/test_util.py

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -291,6 +291,59 @@ def test_non_hashable(self):
291291
with self.assertRaisesRegexp(TypeError, "Invalid object type"):
292292
pd.to_numeric(s)
293293

294+
def test_downcast(self):
# Exercises the ``downcast`` argument of ``pd.to_numeric`` on three inputs:
# a mixed string/integer list, an integer list and a ``datetime64[D]`` array.
# NOTE(review): indentation is not preserved in this view; the checks from
# the ``for data in ...`` line through the ``larger_dtypes`` loop presumably
# run once per input, and the later checks once only -- confirm in the file.
295+
mixed_data = ['1', 2, 3]
296+
int_data = [1, 2, 3]
297+
# The test expects these dates to convert to [1, 2, 3] (see ``expected``
# below); presumably day offsets from 1970-01-01 -- confirm.
date_data = np.array(['1970-01-02', '1970-01-03',
298+
'1970-01-04'], dtype='datetime64[D]')
299+
300+
# Dtypes for which the result is expected to come back in that exact dtype.
smaller_dtypes = [np.int8, np.uint8, np.int16, np.uint16,
301+
np.int32, np.uint32, np.float32]
302+
303+
# Dtypes for which the result is expected to stay int64.
larger_dtypes = [np.int64, np.uint64, np.float64]
304+
305+
# Error message expected when ``downcast`` is not a numerical dtype.
msg = "'downcast' must be a numerical dtype"
306+
bad_dtype = 'datetime64[D]'
307+
308+
for data in [mixed_data, int_data, date_data]:
309+
# A non-numerical ``downcast`` must raise ValueError with ``msg``.
tm.assertRaisesRegexp(ValueError, msg, pd.to_numeric,
310+
data, downcast=bad_dtype)
311+
312+
# Each smaller dtype: result must equal [1, 2, 3] in that dtype.
for smaller_dtype in smaller_dtypes:
313+
res = pd.to_numeric(data, downcast=smaller_dtype)
314+
expected = np.array([1, 2, 3], dtype=smaller_dtype)
315+
tm.assert_numpy_array_equal(res, expected)
316+
317+
# Baseline result shared by the default, explicit-None and larger-dtype
# checks that follow.
expected = np.array([1, 2, 3], dtype=np.int64)
318+
319+
# default
320+
res = pd.to_numeric(data)
321+
tm.assert_numpy_array_equal(res, expected)
322+
323+
# explicit
324+
res = pd.to_numeric(data, downcast=None)
325+
tm.assert_numpy_array_equal(res, expected)
326+
327+
# Each larger dtype: result is expected to be unchanged from the int64
# baseline.
for larger_dtype in larger_dtypes:
328+
res = pd.to_numeric(data, downcast=larger_dtype)
329+
tm.assert_numpy_array_equal(res, expected)
330+
331+
# cannot downcast because it is the wrong type
332+
# (a negative value with an unsigned target: the result is expected to
# stay int64)
data = ['-1', 2, 3]
333+
res = pd.to_numeric(data, downcast=np.uint8)
334+
expected = np.array([-1, 2, 3], dtype=np.int64)
335+
tm.assert_numpy_array_equal(res, expected)
336+
337+
# ensure behaviour is respected when values are of
338+
# different integer dtypes (i.e. not 'np.int')
339+
data = np.array([1, 2, 3], dtype=np.int16)
340+
341+
# Same dtype as the input: the result must equal the input array.
res = pd.to_numeric(data, downcast=np.int16)
342+
tm.assert_numpy_array_equal(res, data)
343+
344+
# Smaller dtype than the input: the result must come back as uint8.
expected = np.array([1, 2, 3], dtype=np.uint8)
345+
res = pd.to_numeric(data, downcast=np.uint8)
346+
tm.assert_numpy_array_equal(res, expected)
294347

295348
if __name__ == '__main__':
296349
nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],

pandas/tools/util.py

Lines changed: 29 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ def compose(*funcs):
5050
return reduce(_compose2, funcs)
5151

5252

53-
def to_numeric(arg, errors='raise'):
53+
def to_numeric(arg, errors='raise', downcast=None):
5454
"""
5555
Convert argument to a numeric type.
5656
@@ -61,6 +61,13 @@ def to_numeric(arg, errors='raise'):
6161
- If 'raise', then invalid parsing will raise an exception
6262
- If 'coerce', then invalid parsing will be set as NaN
6363
- If 'ignore', then invalid parsing will return the input
64+
downcast : numpy.dtype or Python dtype, default None
65+
If possible, downcast the resulting numerical data to the
66+
specified **numerical** dtype in 'downcast'. If the size of
67+
the data's dtype is smaller or equal to that of 'downcast',
68+
then this parameter will be ignored.
69+
70+
.. versionadded:: 0.18.2
6471
6572
Returns
6673
-------
@@ -74,6 +81,7 @@ def to_numeric(arg, errors='raise'):
7481
>>> import pandas as pd
7582
>>> s = pd.Series(['1.0', '2', -3])
7683
>>> pd.to_numeric(s)
84+
>>> pd.to_numeric(s, downcast=np.int8)
7785
>>> s = pd.Series(['apple', '1.0', '2', -3])
7886
>>> pd.to_numeric(s, errors='ignore')
7987
>>> pd.to_numeric(s, errors='coerce')
@@ -102,20 +110,29 @@ def to_numeric(arg, errors='raise'):
102110
else:
103111
values = arg
104112

105-
if com.is_numeric_dtype(values):
106-
pass
107-
elif com.is_datetime_or_timedelta_dtype(values):
108-
values = values.astype(np.int64)
109-
else:
110-
values = com._ensure_object(values)
111-
coerce_numeric = False if errors in ('ignore', 'raise') else True
113+
try:
114+
if com.is_numeric_dtype(values):
115+
pass
116+
elif com.is_datetime_or_timedelta_dtype(values):
117+
values = values.astype(np.int64)
118+
else:
119+
values = com._ensure_object(values)
120+
coerce_numeric = False if errors in ('ignore', 'raise') else True
112121

113-
try:
114122
values = lib.maybe_convert_numeric(values, set(),
115123
coerce_numeric=coerce_numeric)
116-
except:
117-
if errors == 'raise':
118-
raise
124+
125+
if downcast is not None:
126+
dtype = np.dtype(downcast)
127+
if not com.is_numeric_dtype(dtype):
128+
raise ValueError("'downcast' must be a numerical dtype")
129+
130+
if values.dtype.itemsize > dtype.itemsize:
131+
values = com._possibly_downcast_to_dtype(values, dtype)
132+
133+
except Exception:
134+
if errors == 'raise':
135+
raise
119136

120137
if is_series:
121138
return pd.Series(values, index=arg.index, name=arg.name)

0 commit comments

Comments
 (0)