Skip to content

Commit 564e7ae

Browse files
committed
BUG: Strings with exponent but no decimal point parsed as integers in python csv engine (GH 9565)
1 parent 7516ec7 commit 564e7ae

File tree

6 files changed

+68
-57
lines changed

6 files changed

+68
-57
lines changed

doc/source/whatsnew/v0.16.2.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,8 @@ Bug Fixes
7777

7878
- Bug in `plot` not defaulting to matplotlib `axes.grid` setting (:issue:`9792`)
7979

80+
- Bug causing strings containing an exponent but no decimal point to be parsed as ints instead of floats in the Python csv parser (:issue:`9565`)
81+
8082
- Bug in ``Series.align`` resets ``name`` when ``fill_value`` is specified (:issue:`10067`)
8183
- Bug in ``SparseSeries.abs`` resets ``name`` (:issue:`10241`)
8284

pandas/io/tests/test_parsers.py

Lines changed: 42 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@
3535
from numpy.testing.decorators import slow
3636
from numpy.testing import assert_array_equal
3737

38-
from pandas.parser import OverflowError, CParserError
38+
import pandas.parser
3939

4040

4141
class ParserTests(object):
@@ -1648,7 +1648,7 @@ def test_read_table_buglet_4x_multiindex(self):
16481648
# Temporarily copied to TestPythonParser.
16491649
# Here test that CParserError is raised:
16501650

1651-
with tm.assertRaises(CParserError):
1651+
with tm.assertRaises(pandas.parser.CParserError):
16521652
text = """ A B C D E
16531653
one two three four
16541654
a b 10.0032 5 -0.5109 -2.3358 -0.4645 0.05076 0.3640
@@ -2293,6 +2293,46 @@ def test_chunk_begins_with_newline_whitespace(self):
22932293
result = self.read_csv(StringIO(data), header=None)
22942294
self.assertEqual(len(result), 2)
22952295

2296+
def test_float_parser(self):
2297+
# GH 9565
2298+
data = '45e-1,4.5,45.,inf,-inf'
2299+
result = self.read_csv(StringIO(data), header=None)
2300+
expected = pd.DataFrame([[float(s) for s in data.split(',')]])
2301+
tm.assert_frame_equal(result, expected)
2302+
2303+
def test_int64_overflow(self):
2304+
data = """ID
2305+
00013007854817840016671868
2306+
00013007854817840016749251
2307+
00013007854817840016754630
2308+
00013007854817840016781876
2309+
00013007854817840017028824
2310+
00013007854817840017963235
2311+
00013007854817840018860166"""
2312+
2313+
result = self.read_csv(StringIO(data))
2314+
self.assertTrue(result['ID'].dtype == object)
2315+
2316+
self.assertRaises((OverflowError, pandas.parser.OverflowError),
2317+
self.read_csv, StringIO(data),
2318+
converters={'ID' : np.int64})
2319+
2320+
# Just inside int64 range: parse as integer
2321+
i_max = np.iinfo(np.int64).max
2322+
i_min = np.iinfo(np.int64).min
2323+
for x in [i_max, i_min]:
2324+
result = pd.read_csv(StringIO(str(x)), header=None)
2325+
expected = pd.DataFrame([x])
2326+
tm.assert_frame_equal(result, expected)
2327+
2328+
# Just outside int64 range: parse as string
2329+
too_big = i_max + 1
2330+
too_small = i_min - 1
2331+
for x in [too_big, too_small]:
2332+
result = pd.read_csv(StringIO(str(x)), header=None)
2333+
expected = pd.DataFrame([str(x)])
2334+
tm.assert_frame_equal(result, expected)
2335+
22962336

22972337
class TestPythonParser(ParserTests, tm.TestCase):
22982338
def test_negative_skipfooter_raises(self):
@@ -3567,22 +3607,6 @@ def test_disable_bool_parsing(self):
35673607
result = read_csv(StringIO(data), dtype=object, na_filter=False)
35683608
self.assertEqual(result['B'][2], '')
35693609

3570-
def test_int64_overflow(self):
3571-
data = """ID
3572-
00013007854817840016671868
3573-
00013007854817840016749251
3574-
00013007854817840016754630
3575-
00013007854817840016781876
3576-
00013007854817840017028824
3577-
00013007854817840017963235
3578-
00013007854817840018860166"""
3579-
3580-
result = read_csv(StringIO(data))
3581-
self.assertTrue(result['ID'].dtype == object)
3582-
3583-
self.assertRaises(OverflowError, read_csv, StringIO(data),
3584-
dtype='i8')
3585-
35863610
def test_euro_decimal_format(self):
35873611
data = """Id;Number1;Number2;Text1;Text2;Number3
35883612
1;1521,1541;187101,9543;ABC;poi;4,738797819

pandas/lib.pyx

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -64,11 +64,7 @@ ctypedef unsigned char UChar
6464
cimport util
6565
from util cimport is_array, _checknull, _checknan
6666

67-
cdef extern from "headers/stdint.h":
68-
enum: UINT8_MAX
69-
enum: INT64_MAX
70-
enum: INT64_MIN
71-
67+
from libc.stdint cimport UINT8_MAX, INT64_MAX, INT64_MIN
7268

7369
cdef extern from "math.h":
7470
double sqrt(double x)

pandas/parser.pyx

Lines changed: 1 addition & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -45,19 +45,7 @@ cdef bint PY3 = (sys.version_info[0] >= 3)
4545
cdef double INF = <double> np.inf
4646
cdef double NEGINF = -INF
4747

48-
cdef extern from "headers/stdint.h":
49-
enum: UINT8_MAX
50-
enum: UINT16_MAX
51-
enum: UINT32_MAX
52-
enum: UINT64_MAX
53-
enum: INT8_MIN
54-
enum: INT8_MAX
55-
enum: INT16_MIN
56-
enum: INT16_MAX
57-
enum: INT32_MAX
58-
enum: INT32_MIN
59-
enum: INT64_MAX
60-
enum: INT64_MIN
48+
from libc.stdint cimport *
6149

6250
cdef extern from "headers/portable.h":
6351
pass

pandas/src/inference.pyx

Lines changed: 10 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -514,10 +514,7 @@ def is_period_array(ndarray[object] values):
514514

515515

516516
cdef extern from "parse_helper.h":
517-
inline int floatify(object, double *result) except -1
518-
519-
cdef double fINT64_MAX = <double> INT64_MAX
520-
cdef double fINT64_MIN = <double> INT64_MIN
517+
inline int floatify(object, double *result, int *maybe_int) except -1
521518

522519

523520
def maybe_convert_numeric(object[:] values, set na_values,
@@ -527,7 +524,7 @@ def maybe_convert_numeric(object[:] values, set na_values,
527524
convert to proper dtype array
528525
'''
529526
cdef:
530-
int status
527+
int status, maybe_int
531528
Py_ssize_t i, n = values.size
532529
ndarray[float64_t] floats = np.empty(n, dtype='f8')
533530
ndarray[complex128_t] complexes = np.empty(n, dtype='c16')
@@ -569,18 +566,16 @@ def maybe_convert_numeric(object[:] values, set na_values,
569566
seen_complex = True
570567
else:
571568
try:
572-
status = floatify(val, &fval)
569+
status = floatify(val, &fval, &maybe_int)
573570
floats[i] = fval
574571
if not seen_float:
575-
if '.' in val or fval == INF or fval == NEGINF:
576-
seen_float = True
577-
elif 'inf' in val: # special case to handle +/-inf
578-
seen_float = True
579-
elif fval < fINT64_MAX and fval > fINT64_MIN:
580-
try:
581-
ints[i] = int(val)
582-
except ValueError:
583-
ints[i] = <int64_t> fval
572+
if maybe_int:
573+
as_int = int(val)
574+
575+
if as_int <= INT64_MAX and as_int >= INT64_MIN:
576+
ints[i] = as_int
577+
else:
578+
raise ValueError('integer out of range')
584579
else:
585580
seen_float = True
586581
except:

pandas/src/parse_helper.h

Lines changed: 12 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2,13 +2,13 @@
22
#include <float.h>
33

44
static double xstrtod(const char *p, char **q, char decimal, char sci,
5-
int skip_trailing);
5+
int skip_trailing, int *maybe_int);
66

7-
int to_double(char *item, double *p_value, char sci, char decimal)
7+
int to_double(char *item, double *p_value, char sci, char decimal, int *maybe_int)
88
{
99
char *p_end;
1010

11-
*p_value = xstrtod(item, &p_end, decimal, sci, 1);
11+
*p_value = xstrtod(item, &p_end, decimal, sci, 1, maybe_int);
1212

1313
return (errno == 0) && (!*p_end);
1414
}
@@ -18,7 +18,7 @@ int to_double(char *item, double *p_value, char sci, char decimal)
1818
#define PyBytes_AS_STRING PyString_AS_STRING
1919
#endif
2020

21-
int floatify(PyObject* str, double *result) {
21+
int floatify(PyObject* str, double *result, int *maybe_int) {
2222
int status;
2323
char *data;
2424
PyObject* tmp = NULL;
@@ -35,14 +35,16 @@ int floatify(PyObject* str, double *result) {
3535
return -1;
3636
}
3737

38-
status = to_double(data, result, sci, dec);
38+
status = to_double(data, result, sci, dec, maybe_int);
3939

4040
if (!status) {
4141
/* handle inf/-inf */
4242
if (0 == strcmp(data, "-inf")) {
4343
*result = -HUGE_VAL;
44+
*maybe_int = 0;
4445
} else if (0 == strcmp(data, "inf")) {
4546
*result = HUGE_VAL;
47+
*maybe_int = 0;
4648
} else {
4749
PyErr_SetString(PyExc_ValueError, "Unable to parse string");
4850
Py_XDECREF(tmp);
@@ -117,7 +119,7 @@ PANDAS_INLINE void uppercase(char *p) {
117119

118120

119121
static double xstrtod(const char *str, char **endptr, char decimal,
120-
char sci, int skip_trailing)
122+
char sci, int skip_trailing, int *maybe_int)
121123
{
122124
double number;
123125
int exponent;
@@ -129,6 +131,7 @@ static double xstrtod(const char *str, char **endptr, char decimal,
129131
int num_decimals;
130132

131133
errno = 0;
134+
*maybe_int = 1;
132135

133136
// Skip leading whitespace
134137
while (isspace(*p)) p++;
@@ -157,6 +160,7 @@ static double xstrtod(const char *str, char **endptr, char decimal,
157160
// Process decimal part
158161
if (*p == decimal)
159162
{
163+
*maybe_int = 0;
160164
p++;
161165

162166
while (isdigit(*p))
@@ -182,6 +186,8 @@ static double xstrtod(const char *str, char **endptr, char decimal,
182186
// Process an exponent string
183187
if (toupper(*p) == toupper(sci))
184188
{
189+
*maybe_int = 0;
190+
185191
// Handle optional sign
186192
negative = 0;
187193
switch (*++p)

0 commit comments

Comments
 (0)