Skip to content

Commit f4324d7

Browse files
committed
BUG: Strings with exponent but no decimal point parsed as integers in python csv engine (GH 9565)
1 parent f6c7d89 commit f4324d7

File tree

4 files changed

+50
-36
lines changed

4 files changed

+50
-36
lines changed

doc/source/whatsnew/v0.17.0.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,7 @@ Bug Fixes
6161
~~~~~~~~~
6262

6363
- Bug in ``Categorical`` repr with ``display.width`` of ``None`` in Python 3 (:issue:`10087`)
64+
- Bug causing strings containing an exponent but no decimal point to be parsed as ints instead of floats in the Python CSV parser. (:issue:`9565`)
6465

6566

6667
- Bug in ``Timestamp``'s ``microsecond``, ``quarter``, ``dayofyear``, ``week`` and ``daysinmonth`` properties returning ``np.int`` type, not built-in ``int``. (:issue:`10050`)

pandas/io/tests/test_parsers.py

Lines changed: 26 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@
3535
from numpy.testing.decorators import slow
3636
from numpy.testing import assert_array_equal
3737

38-
from pandas.parser import OverflowError, CParserError
38+
import pandas.parser
3939

4040

4141
class ParserTests(object):
@@ -1648,7 +1648,7 @@ def test_read_table_buglet_4x_multiindex(self):
16481648
# Temporarily copied to TestPythonParser.
16491649
# Here test that CParserError is raised:
16501650

1651-
with tm.assertRaises(CParserError):
1651+
with tm.assertRaises(pandas.parser.CParserError):
16521652
text = """ A B C D E
16531653
one two three four
16541654
a b 10.0032 5 -0.5109 -2.3358 -0.4645 0.05076 0.3640
@@ -2293,6 +2293,30 @@ def test_chunk_begins_with_newline_whitespace(self):
22932293
result = self.read_csv(StringIO(data), header=None)
22942294
self.assertEqual(len(result), 2)
22952295

2296+
def test_float_parser(self):
2297+
# GH 9565
2298+
data = '45e-1,4.5,45.,inf,-inf'
2299+
result = self.read_csv(StringIO(data), header=None)
2300+
expected = pd.DataFrame([[float(s) for s in data.split(',')]])
2301+
tm.assert_frame_equal(result, expected)
2302+
2303+
def test_int64_overflow(self):
2304+
data = """ID
2305+
00013007854817840016671868
2306+
00013007854817840016749251
2307+
00013007854817840016754630
2308+
00013007854817840016781876
2309+
00013007854817840017028824
2310+
00013007854817840017963235
2311+
00013007854817840018860166"""
2312+
2313+
result = self.read_csv(StringIO(data))
2314+
self.assertTrue(result['ID'].dtype == object)
2315+
2316+
self.assertRaises((OverflowError, pandas.parser.OverflowError),
2317+
self.read_csv, StringIO(data),
2318+
converters={'ID' : np.int64})
2319+
22962320

22972321
class TestPythonParser(ParserTests, tm.TestCase):
22982322
def test_negative_skipfooter_raises(self):
@@ -3567,22 +3591,6 @@ def test_disable_bool_parsing(self):
35673591
result = read_csv(StringIO(data), dtype=object, na_filter=False)
35683592
self.assertEqual(result['B'][2], '')
35693593

3570-
def test_int64_overflow(self):
3571-
data = """ID
3572-
00013007854817840016671868
3573-
00013007854817840016749251
3574-
00013007854817840016754630
3575-
00013007854817840016781876
3576-
00013007854817840017028824
3577-
00013007854817840017963235
3578-
00013007854817840018860166"""
3579-
3580-
result = read_csv(StringIO(data))
3581-
self.assertTrue(result['ID'].dtype == object)
3582-
3583-
self.assertRaises(OverflowError, read_csv, StringIO(data),
3584-
dtype='i8')
3585-
35863594
def test_euro_decimal_format(self):
35873595
data = """Id;Number1;Number2;Text1;Text2;Number3
35883596
1;1521,1541;187101,9543;ABC;poi;4,738797819

pandas/src/inference.pyx

Lines changed: 11 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -514,7 +514,7 @@ def is_period_array(ndarray[object] values):
514514

515515

516516
cdef extern from "parse_helper.h":
517-
inline int floatify(object, double *result) except -1
517+
inline int floatify(object, double *result, int *maybe_int) except -1
518518

519519
cdef double fINT64_MAX = <double> INT64_MAX
520520
cdef double fINT64_MIN = <double> INT64_MIN
@@ -527,7 +527,7 @@ def maybe_convert_numeric(object[:] values, set na_values,
527527
convert to proper dtype array
528528
'''
529529
cdef:
530-
int status
530+
int status, maybe_int
531531
Py_ssize_t i, n = values.size
532532
ndarray[float64_t] floats = np.empty(n, dtype='f8')
533533
ndarray[complex128_t] complexes = np.empty(n, dtype='c16')
@@ -569,18 +569,17 @@ def maybe_convert_numeric(object[:] values, set na_values,
569569
seen_complex = True
570570
else:
571571
try:
572-
status = floatify(val, &fval)
572+
status = floatify(val, &fval, &maybe_int)
573573
floats[i] = fval
574574
if not seen_float:
575-
if '.' in val or fval == INF or fval == NEGINF:
576-
seen_float = True
577-
elif 'inf' in val: # special case to handle +/-inf
578-
seen_float = True
579-
elif fval < fINT64_MAX and fval > fINT64_MIN:
580-
try:
581-
ints[i] = int(val)
582-
except ValueError:
583-
ints[i] = <int64_t> fval
575+
if maybe_int:
576+
if fval < fINT64_MAX and fval > fINT64_MIN:
577+
try:
578+
ints[i] = int(val)
579+
except ValueError:
580+
ints[i] = <int64_t> fval
581+
else:
582+
raise ValueError('integer out of range')
584583
else:
585584
seen_float = True
586585
except:

pandas/src/parse_helper.h

Lines changed: 12 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2,13 +2,13 @@
22
#include <float.h>
33

44
static double xstrtod(const char *p, char **q, char decimal, char sci,
5-
int skip_trailing);
5+
int skip_trailing, int *maybe_int);
66

7-
int to_double(char *item, double *p_value, char sci, char decimal)
7+
int to_double(char *item, double *p_value, char sci, char decimal, int *maybe_int)
88
{
99
char *p_end;
1010

11-
*p_value = xstrtod(item, &p_end, decimal, sci, 1);
11+
*p_value = xstrtod(item, &p_end, decimal, sci, 1, maybe_int);
1212

1313
return (errno == 0) && (!*p_end);
1414
}
@@ -18,7 +18,7 @@ int to_double(char *item, double *p_value, char sci, char decimal)
1818
#define PyBytes_AS_STRING PyString_AS_STRING
1919
#endif
2020

21-
int floatify(PyObject* str, double *result) {
21+
int floatify(PyObject* str, double *result, int *maybe_int) {
2222
int status;
2323
char *data;
2424
PyObject* tmp = NULL;
@@ -35,14 +35,16 @@ int floatify(PyObject* str, double *result) {
3535
return -1;
3636
}
3737

38-
status = to_double(data, result, sci, dec);
38+
status = to_double(data, result, sci, dec, maybe_int);
3939

4040
if (!status) {
4141
/* handle inf/-inf */
4242
if (0 == strcmp(data, "-inf")) {
4343
*result = -HUGE_VAL;
44+
*maybe_int = 0;
4445
} else if (0 == strcmp(data, "inf")) {
4546
*result = HUGE_VAL;
47+
*maybe_int = 0;
4648
} else {
4749
PyErr_SetString(PyExc_ValueError, "Unable to parse string");
4850
Py_XDECREF(tmp);
@@ -117,7 +119,7 @@ PANDAS_INLINE void uppercase(char *p) {
117119

118120

119121
static double xstrtod(const char *str, char **endptr, char decimal,
120-
char sci, int skip_trailing)
122+
char sci, int skip_trailing, int *maybe_int)
121123
{
122124
double number;
123125
int exponent;
@@ -129,6 +131,7 @@ static double xstrtod(const char *str, char **endptr, char decimal,
129131
int num_decimals;
130132

131133
errno = 0;
134+
*maybe_int = 1;
132135

133136
// Skip leading whitespace
134137
while (isspace(*p)) p++;
@@ -157,6 +160,7 @@ static double xstrtod(const char *str, char **endptr, char decimal,
157160
// Process decimal part
158161
if (*p == decimal)
159162
{
163+
*maybe_int = 0;
160164
p++;
161165

162166
while (isdigit(*p))
@@ -182,6 +186,8 @@ static double xstrtod(const char *str, char **endptr, char decimal,
182186
// Process an exponent string
183187
if (toupper(*p) == toupper(sci))
184188
{
189+
*maybe_int = 0;
190+
185191
// Handle optional sign
186192
negative = 0;
187193
switch (*++p)

0 commit comments

Comments
 (0)