Skip to content

BUG: Strings with exponent but no decimal point parsed as integers in python csv engine #10133

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v0.16.2.txt
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,8 @@ Bug Fixes

- Bug in `plot` not defaulting to matplotlib `axes.grid` setting (:issue:`9792`)

- Bug causing strings containing an exponent but no decimal point to be parsed as ``int`` instead of ``float`` in the Python CSV parser (:issue:`9565`)

- Bug in ``Series.align`` resets ``name`` when ``fill_value`` is specified (:issue:`10067`)
- Bug in ``SparseSeries.abs`` resets ``name`` (:issue:`10241`)

Expand Down
60 changes: 42 additions & 18 deletions pandas/io/tests/test_parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@
from numpy.testing.decorators import slow
from numpy.testing import assert_array_equal

from pandas.parser import OverflowError, CParserError
import pandas.parser


class ParserTests(object):
Expand Down Expand Up @@ -1648,7 +1648,7 @@ def test_read_table_buglet_4x_multiindex(self):
# Temporarily copied to TestPythonParser.
# Here test that CParserError is raised:

with tm.assertRaises(CParserError):
with tm.assertRaises(pandas.parser.CParserError):
text = """ A B C D E
one two three four
a b 10.0032 5 -0.5109 -2.3358 -0.4645 0.05076 0.3640
Expand Down Expand Up @@ -2293,6 +2293,46 @@ def test_chunk_begins_with_newline_whitespace(self):
result = self.read_csv(StringIO(data), header=None)
self.assertEqual(len(result), 2)

def test_float_parser(self):
    # GH 9565
    # Every token below (exponent without a decimal point, plain decimal,
    # trailing decimal point, inf and -inf) is expected to come back as a
    # float, matching what the builtin float() produces for it.
    raw = '45e-1,4.5,45.,inf,-inf'
    as_floats = [float(token) for token in raw.split(',')]
    expected = pd.DataFrame([as_floats])

    result = self.read_csv(StringIO(raw), header=None)
    tm.assert_frame_equal(result, expected)

def test_int64_overflow(self):
    # Integer-looking text that does not fit in int64 must not be coerced
    # to a 64-bit integer. All reads go through self.read_csv so the checks
    # run against the reader supplied by the concrete test class
    # (presumably one per parser engine -- confirm against the subclasses).
    data = """ID
00013007854817840016671868
00013007854817840016749251
00013007854817840016754630
00013007854817840016781876
00013007854817840017028824
00013007854817840017963235
00013007854817840018860166"""

    # Without an explicit conversion the oversized IDs stay as objects.
    result = self.read_csv(StringIO(data))
    self.assertTrue(result['ID'].dtype == object)

    # Forcing an int64 converter must raise; either the builtin
    # OverflowError or the one exported by pandas.parser is accepted.
    self.assertRaises((OverflowError, pandas.parser.OverflowError),
                      self.read_csv, StringIO(data),
                      converters={'ID': np.int64})

    # Just inside int64 range: parse as integer
    i_max = np.iinfo(np.int64).max
    i_min = np.iinfo(np.int64).min
    for x in [i_max, i_min]:
        result = self.read_csv(StringIO(str(x)), header=None)
        expected = pd.DataFrame([x])
        tm.assert_frame_equal(result, expected)

    # Just outside int64 range: parse as string
    too_big = i_max + 1
    too_small = i_min - 1
    for x in [too_big, too_small]:
        result = self.read_csv(StringIO(str(x)), header=None)
        expected = pd.DataFrame([str(x)])
        tm.assert_frame_equal(result, expected)


class TestPythonParser(ParserTests, tm.TestCase):
def test_negative_skipfooter_raises(self):
Expand Down Expand Up @@ -3567,22 +3607,6 @@ def test_disable_bool_parsing(self):
result = read_csv(StringIO(data), dtype=object, na_filter=False)
self.assertEqual(result['B'][2], '')

def test_int64_overflow(self):
    # IDs too wide for a 64-bit integer: the default read is expected to
    # leave the column as object dtype.
    data = """ID
00013007854817840016671868
00013007854817840016749251
00013007854817840016754630
00013007854817840016781876
00013007854817840017028824
00013007854817840017963235
00013007854817840018860166"""

    frame = read_csv(StringIO(data))
    id_dtype = frame['ID'].dtype
    self.assertTrue(id_dtype == object)

    # Requesting an 8-byte integer dtype is expected to raise OverflowError.
    self.assertRaises(OverflowError, read_csv, StringIO(data), dtype='i8')

def test_euro_decimal_format(self):
data = """Id;Number1;Number2;Text1;Text2;Number3
1;1521,1541;187101,9543;ABC;poi;4,738797819
Expand Down
27 changes: 12 additions & 15 deletions pandas/src/inference.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -514,11 +514,10 @@ def is_period_array(ndarray[object] values):


cdef extern from "parse_helper.h":
inline int floatify(object, double *result) except -1

cdef double fINT64_MAX = <double> INT64_MAX
cdef double fINT64_MIN = <double> INT64_MIN
inline int floatify(object, double *result, int *maybe_int) except -1

cdef int64_t iINT64_MAX = <int64_t> INT64_MAX
cdef int64_t iINT64_MIN = <int64_t> INT64_MIN
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

same here (with the definitions)


def maybe_convert_numeric(object[:] values, set na_values,
bint convert_empty=True, bint coerce_numeric=False):
Expand All @@ -527,7 +526,7 @@ def maybe_convert_numeric(object[:] values, set na_values,
convert to proper dtype array
'''
cdef:
int status
int status, maybe_int
Py_ssize_t i, n = values.size
ndarray[float64_t] floats = np.empty(n, dtype='f8')
ndarray[complex128_t] complexes = np.empty(n, dtype='c16')
Expand Down Expand Up @@ -569,18 +568,16 @@ def maybe_convert_numeric(object[:] values, set na_values,
seen_complex = True
else:
try:
status = floatify(val, &fval)
status = floatify(val, &fval, &maybe_int)
floats[i] = fval
if not seen_float:
if '.' in val or fval == INF or fval == NEGINF:
seen_float = True
elif 'inf' in val: # special case to handle +/-inf
seen_float = True
elif fval < fINT64_MAX and fval > fINT64_MIN:
try:
ints[i] = int(val)
except ValueError:
ints[i] = <int64_t> fval
if maybe_int:
as_int = int(val)

if as_int <= iINT64_MAX and as_int >= iINT64_MIN:
ints[i] = as_int
else:
raise ValueError('integer out of range')
else:
seen_float = True
except:
Expand Down
18 changes: 12 additions & 6 deletions pandas/src/parse_helper.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,13 @@
#include <float.h>

static double xstrtod(const char *p, char **q, char decimal, char sci,
int skip_trailing);
int skip_trailing, int *maybe_int);

int to_double(char *item, double *p_value, char sci, char decimal)
int to_double(char *item, double *p_value, char sci, char decimal, int *maybe_int)
{
char *p_end;

*p_value = xstrtod(item, &p_end, decimal, sci, 1);
*p_value = xstrtod(item, &p_end, decimal, sci, 1, maybe_int);

return (errno == 0) && (!*p_end);
}
Expand All @@ -18,7 +18,7 @@ int to_double(char *item, double *p_value, char sci, char decimal)
#define PyBytes_AS_STRING PyString_AS_STRING
#endif

int floatify(PyObject* str, double *result) {
int floatify(PyObject* str, double *result, int *maybe_int) {
int status;
char *data;
PyObject* tmp = NULL;
Expand All @@ -35,14 +35,16 @@ int floatify(PyObject* str, double *result) {
return -1;
}

status = to_double(data, result, sci, dec);
status = to_double(data, result, sci, dec, maybe_int);

if (!status) {
/* handle inf/-inf */
if (0 == strcmp(data, "-inf")) {
*result = -HUGE_VAL;
*maybe_int = 0;
} else if (0 == strcmp(data, "inf")) {
*result = HUGE_VAL;
*maybe_int = 0;
} else {
PyErr_SetString(PyExc_ValueError, "Unable to parse string");
Py_XDECREF(tmp);
Expand Down Expand Up @@ -117,7 +119,7 @@ PANDAS_INLINE void uppercase(char *p) {


static double xstrtod(const char *str, char **endptr, char decimal,
char sci, int skip_trailing)
char sci, int skip_trailing, int *maybe_int)
{
double number;
int exponent;
Expand All @@ -129,6 +131,7 @@ static double xstrtod(const char *str, char **endptr, char decimal,
int num_decimals;

errno = 0;
*maybe_int = 1;

// Skip leading whitespace
while (isspace(*p)) p++;
Expand Down Expand Up @@ -157,6 +160,7 @@ static double xstrtod(const char *str, char **endptr, char decimal,
// Process decimal part
if (*p == decimal)
{
*maybe_int = 0;
p++;

while (isdigit(*p))
Expand All @@ -182,6 +186,8 @@ static double xstrtod(const char *str, char **endptr, char decimal,
// Process an exponent string
if (toupper(*p) == toupper(sci))
{
*maybe_int = 0;

// Handle optional sign
negative = 0;
switch (*++p)
Expand Down