From 3989060f67238c6024e612dfe0782c52e8ed1c10 Mon Sep 17 00:00:00 2001 From: jreback Date: Wed, 27 Nov 2013 14:16:57 -0500 Subject: [PATCH] TST/API: test the list of NA values in the csv parser. add N/A, #NA as independent default na values (GH5521) --- doc/source/io.rst | 2 +- doc/source/release.rst | 4 +++- pandas/io/parsers.py | 2 +- pandas/io/tests/test_parsers.py | 25 +++++++++++++++++++++++++ 4 files changed, 30 insertions(+), 3 deletions(-) diff --git a/doc/source/io.rst b/doc/source/io.rst index 0f34e94084878..a6f022d85272e 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -564,7 +564,7 @@ the corresponding equivalent values will also imply a missing value (in this cas ``[5.0,5]`` are recognized as ``NaN``. To completely override the default values that are recognized as missing, specify ``keep_default_na=False``. -The default ``NaN`` recognized values are ``['-1.#IND', '1.#QNAN', '1.#IND', '-1.#QNAN', '#N/A N/A', 'NA', +The default ``NaN`` recognized values are ``['-1.#IND', '1.#QNAN', '1.#IND', '-1.#QNAN', '#N/A','N/A', 'NA', '#NA', 'NULL', 'NaN', 'nan']``. .. code-block:: python diff --git a/doc/source/release.rst b/doc/source/release.rst index cb17e7b3d4b23..35e00d9ed9850 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -126,7 +126,7 @@ Improvements to existing features (:issue:`4039`) with improved validation for all (:issue:`4039`, :issue:`4794`) - A Series of dtype ``timedelta64[ns]`` can now be divided/multiplied - by an integer series (:issue`4521`) + by an integer series (:issue:`4521`) - A Series of dtype ``timedelta64[ns]`` can now be divided by another ``timedelta64[ns]`` object to yield a ``float64`` dtyped Series. This is frequency conversion; astyping is also supported. @@ -409,6 +409,8 @@ API Changes - raise/warn ``SettingWithCopyError/Warning`` exception/warning when setting of a copy thru chained assignment is detected, settable via option ``mode.chained_assignment`` + - test the list of ``NA`` values in the csv parser. add ``N/A``, ``#NA`` as independent default + na values (:issue:`5521`) Internal Refactoring ~~~~~~~~~~~~~~~~~~~~ diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index c10cb84de34fd..e62ecd5a541df 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -438,7 +438,7 @@ def read_fwf(filepath_or_buffer, colspecs='infer', widths=None, **kwds): # no longer excluding inf representations # '1.#INF','-1.#INF', '1.#INF000000', _NA_VALUES = set(['-1.#IND', '1.#QNAN', '1.#IND', '-1.#QNAN', - '#N/A N/A', 'NA', '#NA', 'NULL', 'NaN', + '#N/A','N/A', 'NA', '#NA', 'NULL', 'NaN', 'nan', '']) diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py index 84736f16e7cba..37d3c6c55ba65 100644 --- a/pandas/io/tests/test_parsers.py +++ b/pandas/io/tests/test_parsers.py @@ -683,6 +683,31 @@ def test_non_string_na_values(self): tm.assert_frame_equal(result6,good_compare) tm.assert_frame_equal(result7,good_compare) + def test_default_na_values(self): + _NA_VALUES = set(['-1.#IND', '1.#QNAN', '1.#IND', '-1.#QNAN', + '#N/A','N/A', 'NA', '#NA', 'NULL', 'NaN', + 'nan', '']) + + nv = len(_NA_VALUES) + def f(i, v): + if i == 0: + buf = '' + elif i > 0: + buf = ''.join([','] * i) + + buf = "{0}{1}".format(buf,v) + + if i < nv-1: + buf = "{0}{1}".format(buf,''.join([','] * (nv-i-1))) + + return buf + + data = StringIO('\n'.join([ f(i, v) for i, v in enumerate(_NA_VALUES) ])) + + expected = DataFrame(np.nan,columns=range(nv),index=range(nv)) + df = self.read_csv(data, header=None) + tm.assert_frame_equal(df, expected) + def test_custom_na_values(self): data = """A,B,C ignore,this,row