Skip to content

Commit d9abf68

Browse files
committed
BUG: more consistent na_values #1657
1 parent 27c4c96 commit d9abf68

File tree

3 files changed

+44
-10
lines changed

3 files changed

+44
-10
lines changed

doc/source/io.rst

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -84,9 +84,9 @@ data into a DataFrame object. They can take a number of arguments:
8484
- ``names``: List of column names to use. If passed, header will be
8585
implicitly set to None.
8686
- ``na_values``: optional list of strings to recognize as NaN (missing
87-
values), in addition to a default set. If you pass an empty list or an
88-
empty list for a particular column, no values (including empty strings)
89-
will be considered NA
87+
values), either in addition to or in lieu of the default set.
88+
- ``keep_default_na``: whether to include the default set of missing values
89+
in addition to the ones specified in ``na_values`` (True by default)
9090
- ``parse_dates``: if True then index will be parsed as dates
9191
(False by default). You can specify more complicated options to parse
9292
a subset of columns or a combination of columns into a single date column

pandas/io/parsers.py

Lines changed: 25 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,9 @@ class DateConversionError(Exception):
5454
na_values : list-like or dict, default None
5555
Additional strings to recognize as NA/NaN. If dict passed, specific
5656
per-column NA values
57+
keep_default_na : bool, default True
58+
If na_values are specified and keep_default_na is False the default NaN
59+
values are overridden; otherwise na_values are appended to them
5760
parse_dates : boolean, list of ints or names, list of lists, or dict
5861
True -> try parsing all columns
5962
[1, 2, 3] -> try parsing columns 1, 2, 3 each as a separate date column
@@ -199,6 +202,7 @@ def read_csv(filepath_or_buffer,
199202
names=None,
200203
skiprows=None,
201204
na_values=None,
205+
keep_default_na=True,
202206
thousands=None,
203207
comment=None,
204208
parse_dates=False,
@@ -218,7 +222,8 @@ def read_csv(filepath_or_buffer,
218222
sep=sep, dialect=dialect,
219223
header=header, index_col=index_col,
220224
names=names, skiprows=skiprows,
221-
na_values=na_values, thousands=thousands,
225+
na_values=na_values, keep_default_na=keep_default_na,
226+
thousands=thousands,
222227
comment=comment, parse_dates=parse_dates,
223228
keep_date_col=keep_date_col,
224229
dayfirst=dayfirst, date_parser=date_parser,
@@ -244,6 +249,7 @@ def read_table(filepath_or_buffer,
244249
names=None,
245250
skiprows=None,
246251
na_values=None,
252+
keep_default_na=True,
247253
thousands=None,
248254
comment=None,
249255
parse_dates=False,
@@ -263,7 +269,8 @@ def read_table(filepath_or_buffer,
263269
sep=sep, dialect=dialect,
264270
header=header, index_col=index_col,
265271
names=names, skiprows=skiprows,
266-
na_values=na_values, thousands=thousands,
272+
na_values=na_values, keep_default_na=keep_default_na,
273+
thousands=thousands,
267274
comment=comment, parse_dates=parse_dates,
268275
keep_date_col=keep_date_col,
269276
dayfirst=dayfirst, date_parser=date_parser,
@@ -292,6 +299,7 @@ def read_fwf(filepath_or_buffer,
292299
names=None,
293300
skiprows=None,
294301
na_values=None,
302+
keep_default_na=True,
295303
thousands=None,
296304
comment=None,
297305
parse_dates=False,
@@ -311,7 +319,8 @@ def read_fwf(filepath_or_buffer,
311319
colspecs=colspecs, widths=widths,
312320
header=header, index_col=index_col,
313321
names=names, skiprows=skiprows,
314-
na_values=na_values, thousands=thousands,
322+
na_values=na_values, keep_default_na=keep_default_na,
323+
thousands=thousands,
315324
comment=comment, parse_dates=parse_dates,
316325
keep_date_col=keep_date_col,
317326
dayfirst=dayfirst, date_parser=date_parser,
@@ -407,6 +416,7 @@ class TextParser(object):
407416
Column or columns to use as the (possibly hierarchical) index
408417
na_values : iterable, default None
409418
Custom NA values
419+
keep_default_na : bool, default True
410420
thousands : str, default None
411421
Thousands separator
412422
comment : str, default None
@@ -425,7 +435,8 @@ class TextParser(object):
425435
"""
426436

427437
def __init__(self, f, delimiter=None, dialect=None, names=None, header=0,
428-
index_col=None, na_values=None, thousands=None,
438+
index_col=None, na_values=None, keep_default_na=True,
439+
thousands=None,
429440
comment=None, parse_dates=False, keep_date_col=False,
430441
date_parser=None, dayfirst=False,
431442
chunksize=None, skiprows=None, skip_footer=0, converters=None,
@@ -467,12 +478,20 @@ def __init__(self, f, delimiter=None, dialect=None, names=None, header=0,
467478

468479
assert(self.skip_footer >= 0)
469480

470-
if na_values is None:
481+
self.keep_default_na = keep_default_na
482+
if na_values is None and keep_default_na:
471483
self.na_values = _NA_VALUES
472484
elif isinstance(na_values, dict):
485+
if keep_default_na:
486+
for k, v in na_values.iteritems():
487+
v = set(list(v)) | _NA_VALUES
488+
na_values[k] = v
473489
self.na_values = na_values
474490
else:
475-
self.na_values = set(list(na_values)) | _NA_VALUES
491+
na_values = set(list(na_values))
492+
if keep_default_na:
493+
na_values = na_values | _NA_VALUES
494+
self.na_values = na_values
476495

477496
self.thousands = thousands
478497
self.comment = comment

pandas/io/tests/test_parsers.py

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -83,13 +83,28 @@ def test_empty_string(self):
8383
np.nan, 'seven']})
8484
assert_frame_equal(xp.reindex(columns=df.columns), df)
8585

86-
df = read_csv(StringIO(data), na_values={'One': [], 'Three': []})
86+
df = read_csv(StringIO(data), na_values={'One': [], 'Three': []},
87+
keep_default_na=False)
8788
xp = DataFrame({'One' : ['a', 'b', '', 'd', 'e', 'nan', 'g'],
8889
'Two' : [1,2,3,4,5,6,7],
8990
'Three' : ['one', 'two', 'three', 'nan', 'five',
9091
'', 'seven']})
9192
assert_frame_equal(xp.reindex(columns=df.columns), df)
9293

94+
df = read_csv(StringIO(data), na_values=['a'], keep_default_na=False)
95+
xp = DataFrame({'One' : [np.nan, 'b', '', 'd', 'e', 'nan', 'g'],
96+
'Two' : [1, 2, 3, 4, 5, 6, 7],
97+
'Three' : ['one', 'two', 'three', 'nan', 'five', '',
98+
'seven']})
99+
assert_frame_equal(xp.reindex(columns=df.columns), df)
100+
101+
df = read_csv(StringIO(data), na_values={'One': [], 'Three': []})
102+
xp = DataFrame({'One' : ['a', 'b', np.nan, 'd', 'e', np.nan, 'g'],
103+
'Two' : [1,2,3,4,5,6,7],
104+
'Three' : ['one', 'two', 'three', np.nan, 'five',
105+
np.nan, 'seven']})
106+
assert_frame_equal(xp.reindex(columns=df.columns), df)
107+
93108

94109
def test_read_csv(self):
95110
pass

0 commit comments

Comments
 (0)