BUG: Strings with exponent but no decimal point parsed as integers in python csv engine (GH 9565)

evanpw · evanpw · commit 564e7aed5b57 · 2015-06-04T08:09:16.000-04:00
diff --git a/doc/source/whatsnew/v0.16.2.txt b/doc/source/whatsnew/v0.16.2.txt
@@ -77,6 +77,8 @@ Bug Fixes
 
 - Bug in `plot` not defaulting to matplotlib `axes.grid` setting (:issue:`9792`)
 
+- Bug causing strings containing an exponent but no decimal point to be parsed as ints instead of floats in the Python CSV parser (:issue:`9565`)
+
 - Bug in ``Series.align`` resets ``name`` when ``fill_value`` is specified (:issue:`10067`)
 - Bug in ``SparseSeries.abs`` resets ``name`` (:issue:`10241`)
 
diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py
@@ -35,7 +35,7 @@
 from numpy.testing.decorators import slow
 from numpy.testing import assert_array_equal
 
-from pandas.parser import OverflowError, CParserError
+import pandas.parser
 
 
 class ParserTests(object):
@@ -1648,7 +1648,7 @@ def test_read_table_buglet_4x_multiindex(self):
         # Temporarily copied to TestPythonParser.
         # Here test that CParserError is raised:
 
-        with tm.assertRaises(CParserError):
+        with tm.assertRaises(pandas.parser.CParserError):
             text = """                      A       B       C       D        E
 one two three   four
 a   b   10.0032 5    -0.5109 -2.3358 -0.4645  0.05076  0.3640
@@ -2293,6 +2293,46 @@ def test_chunk_begins_with_newline_whitespace(self):
         result = self.read_csv(StringIO(data), header=None)
         self.assertEqual(len(result), 2)
 
+    def test_float_parser(self):
+        # GH 9565: an exponent without a decimal point must parse as float
+        data = '45e-1,4.5,45.,inf,-inf'
+        result = self.read_csv(StringIO(data), header=None)
+        expected = pd.DataFrame([[float(s) for s in data.split(',')]])
+        tm.assert_frame_equal(result, expected)
+
+    def test_int64_overflow(self):
+        data = """ID
+00013007854817840016671868
+00013007854817840016749251
+00013007854817840016754630
+00013007854817840016781876
+00013007854817840017028824
+00013007854817840017963235
+00013007854817840018860166"""
+        # Values too wide for int64 must be kept as object, not coerced
+        result = self.read_csv(StringIO(data))
+        self.assertTrue(result['ID'].dtype == object)
+        # An explicit int64 converter is expected to raise on such values
+        self.assertRaises((OverflowError, pandas.parser.OverflowError),
+                          self.read_csv, StringIO(data),
+                          converters={'ID': np.int64})
+        # Boundary checks go through self.read_csv so each engine is tested
+        # Just inside int64 range: parse as integer
+        i_max = np.iinfo(np.int64).max
+        i_min = np.iinfo(np.int64).min
+        for x in [i_max, i_min]:
+            result = self.read_csv(StringIO(str(x)), header=None)
+            expected = pd.DataFrame([x])
+            tm.assert_frame_equal(result, expected)
+
+        # Just outside int64 range: parse as string
+        too_big = i_max + 1
+        too_small = i_min - 1
+        for x in [too_big, too_small]:
+            result = self.read_csv(StringIO(str(x)), header=None)
+            expected = pd.DataFrame([str(x)])
+            tm.assert_frame_equal(result, expected)
+
 
 class TestPythonParser(ParserTests, tm.TestCase):
     def test_negative_skipfooter_raises(self):
@@ -3567,22 +3607,6 @@ def test_disable_bool_parsing(self):
         result = read_csv(StringIO(data), dtype=object, na_filter=False)
         self.assertEqual(result['B'][2], '')
 
-    def test_int64_overflow(self):
-        data = """ID
-00013007854817840016671868
-00013007854817840016749251
-00013007854817840016754630
-00013007854817840016781876
-00013007854817840017028824
-00013007854817840017963235
-00013007854817840018860166"""
-
-        result = read_csv(StringIO(data))
-        self.assertTrue(result['ID'].dtype == object)
-
-        self.assertRaises(OverflowError, read_csv, StringIO(data),
-                          dtype='i8')
-
     def test_euro_decimal_format(self):
         data = """Id;Number1;Number2;Text1;Text2;Number3
 1;1521,1541;187101,9543;ABC;poi;4,738797819
diff --git a/pandas/lib.pyx b/pandas/lib.pyx
@@ -64,11 +64,7 @@ ctypedef unsigned char UChar
 cimport util
 from util cimport is_array, _checknull, _checknan
 
-cdef extern from "headers/stdint.h":
-    enum: UINT8_MAX
-    enum: INT64_MAX
-    enum: INT64_MIN
-
+from libc.stdint cimport UINT8_MAX, INT64_MAX, INT64_MIN
 
 cdef extern from "math.h":
     double sqrt(double x)
diff --git a/pandas/parser.pyx b/pandas/parser.pyx
@@ -45,19 +45,7 @@ cdef bint PY3 = (sys.version_info[0] >= 3)
 cdef double INF = <double> np.inf
 cdef double NEGINF = -INF
 
-cdef extern from "headers/stdint.h":
-    enum: UINT8_MAX
-    enum: UINT16_MAX
-    enum: UINT32_MAX
-    enum: UINT64_MAX
-    enum: INT8_MIN
-    enum: INT8_MAX
-    enum: INT16_MIN
-    enum: INT16_MAX
-    enum: INT32_MAX
-    enum: INT32_MIN
-    enum: INT64_MAX
-    enum: INT64_MIN
+from libc.stdint cimport *
 
 cdef extern from "headers/portable.h":
     pass
diff --git a/pandas/src/inference.pyx b/pandas/src/inference.pyx
@@ -514,10 +514,7 @@ def is_period_array(ndarray[object] values):
 
 
 cdef extern from "parse_helper.h":
-    inline int floatify(object, double *result) except -1
-
-cdef double fINT64_MAX = <double> INT64_MAX
-cdef double fINT64_MIN = <double> INT64_MIN
+    inline int floatify(object, double *result, int *maybe_int) except -1
 
 
 def maybe_convert_numeric(object[:] values, set na_values,
@@ -527,7 +524,7 @@ def maybe_convert_numeric(object[:] values, set na_values,
     convert to proper dtype array
     '''
     cdef:
-        int status
+        int status, maybe_int
         Py_ssize_t i, n = values.size
         ndarray[float64_t] floats = np.empty(n, dtype='f8')
         ndarray[complex128_t] complexes = np.empty(n, dtype='c16')
@@ -569,18 +566,16 @@ def maybe_convert_numeric(object[:] values, set na_values,
             seen_complex = True
         else:
             try:
-                status = floatify(val, &fval)
+                status = floatify(val, &fval, &maybe_int)
                 floats[i] = fval
                 if not seen_float:
-                    if '.' in val or fval == INF or fval == NEGINF:
-                        seen_float = True
-                    elif 'inf' in val:  # special case to handle +/-inf
-                        seen_float = True
-                    elif fval < fINT64_MAX and fval > fINT64_MIN:
-                        try:
-                            ints[i] = int(val)
-                        except ValueError:
-                            ints[i] = <int64_t> fval
+                    if maybe_int:
+                        as_int = int(val)
+
+                        if as_int <= INT64_MAX and as_int >= INT64_MIN:
+                            ints[i] = as_int
+                        else:
+                            raise ValueError('integer out of range')
                     else:
                         seen_float = True
             except:
diff --git a/pandas/src/parse_helper.h b/pandas/src/parse_helper.h
@@ -2,13 +2,13 @@
 #include <float.h>
 
 static double xstrtod(const char *p, char **q, char decimal, char sci,
-                      int skip_trailing);
+                      int skip_trailing, int *maybe_int);
 
-int to_double(char *item, double *p_value, char sci, char decimal)
+int to_double(char *item, double *p_value, char sci, char decimal, int *maybe_int)
 {
     char *p_end;
 
-    *p_value = xstrtod(item, &p_end, decimal, sci, 1);
+    *p_value = xstrtod(item, &p_end, decimal, sci, 1, maybe_int);  /* xstrtod sets *maybe_int */
 
     return (errno == 0) && (!*p_end);
 }
@@ -18,7 +18,7 @@ int to_double(char *item, double *p_value, char sci, char decimal)
   #define PyBytes_AS_STRING            PyString_AS_STRING
 #endif
 
-int floatify(PyObject* str, double *result) {
+int floatify(PyObject* str, double *result, int *maybe_int) {
     int status;
     char *data;
     PyObject* tmp = NULL;
@@ -35,14 +35,16 @@ int floatify(PyObject* str, double *result) {
         return -1;
     }
 
-    status = to_double(data, result, sci, dec);
+    status = to_double(data, result, sci, dec, maybe_int);
 
     if (!status) {
         /* handle inf/-inf */
         if (0 == strcmp(data, "-inf")) {
             *result = -HUGE_VAL;
+            *maybe_int = 0;
         } else if (0 == strcmp(data, "inf")) {
             *result = HUGE_VAL;
+            *maybe_int = 0;
         } else {
             PyErr_SetString(PyExc_ValueError, "Unable to parse string");
             Py_XDECREF(tmp);
@@ -117,7 +119,7 @@ PANDAS_INLINE void uppercase(char *p) {
 
 
 static double xstrtod(const char *str, char **endptr, char decimal,
-                      char sci, int skip_trailing)
+                      char sci, int skip_trailing, int *maybe_int)
 {
   double number;
   int exponent;
@@ -129,6 +131,7 @@ static double xstrtod(const char *str, char **endptr, char decimal,
   int num_decimals;
 
   errno = 0;
+  *maybe_int = 1;
 
   // Skip leading whitespace
   while (isspace(*p)) p++;
@@ -157,6 +160,7 @@ static double xstrtod(const char *str, char **endptr, char decimal,
   // Process decimal part
   if (*p == decimal)
   {
+    *maybe_int = 0;
     p++;
 
     while (isdigit(*p))
@@ -182,6 +186,8 @@ static double xstrtod(const char *str, char **endptr, char decimal,
   // Process an exponent string
   if (toupper(*p) == toupper(sci))
   {
+    *maybe_int = 0;
+
     // Handle optional sign
     negative = 0;
     switch (*++p)