Skip to content

Commit 3ed22d7

Browse files
committed
ENH: add verbose option to read_csv/read_table to print number of NA values filled in non-numeric columns per comment on #614
1 parent d20cb18 commit 3ed22d7

File tree

3 files changed

+39
-17
lines changed

3 files changed

+39
-17
lines changed

RELEASE.rst

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,8 @@ pandas 0.7.0
6666
- Add Panel item access via attributes and IPython completion (GH #554)
6767
- Implement ``DataFrame.lookup``, fancy-indexing analogue for retrieving
6868
values given a sequence of row and column labels (GH #338)
69+
- Add ``verbose`` option to ``read_csv`` and ``read_table`` to show number of
70+
NA values inserted in non-numeric columns (GH #614)
6971

7072
**API Changes**
7173

@@ -132,6 +134,8 @@ pandas 0.7.0
132134
operations that it implemented (GH #91)
133135
- Can pass a list of functions to aggregate with groupby on a DataFrame,
134136
yielding an aggregated result with hierarchical columns (GH #166)
137+
- Monkey-patch context to traceback in ``DataFrame.apply`` to indicate which
138+
row/column the function application failed on (GH #614)
135139

136140
**Bug fixes**
137141

@@ -199,6 +203,7 @@ Thanks
199203
- Mario Gamboa-Cavazos
200204
- Arthur Gerigk
201205
- Yaroslav Halchenko
206+
- Jeff Hammerbacher
202207
- Matt Harrison
203208
- Andreas Hilboll
204209
- Luc Kesters

pandas/io/parsers.py

Lines changed: 31 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,8 @@
4646
converters : dict, optional
4747
Dict of functions for converting values in certain columns. Keys can either
4848
be integers or column labels
49+
verbose : boolean, default False
50+
Indicate number of NA values placed in non-numeric columns
4951
5052
Returns
5153
-------
@@ -93,7 +95,7 @@
9395
def read_csv(filepath_or_buffer, sep=None, header=0, index_col=None, names=None,
9496
skiprows=None, na_values=None, parse_dates=False,
9597
date_parser=None, nrows=None, iterator=False, chunksize=None,
96-
skip_footer=0, converters=None):
98+
skip_footer=0, converters=None, verbose=False):
9799
if hasattr(filepath_or_buffer, 'read'):
98100
f = filepath_or_buffer
99101
else:
@@ -114,7 +116,8 @@ def read_csv(filepath_or_buffer, sep=None, header=0, index_col=None, names=None,
114116
delimiter=sep,
115117
chunksize=chunksize,
116118
skip_footer=skip_footer,
117-
converters=converters)
119+
converters=converters,
120+
verbose=verbose)
118121

119122
if nrows is not None:
120123
return parser.get_chunk(nrows)
@@ -127,13 +130,14 @@ def read_csv(filepath_or_buffer, sep=None, header=0, index_col=None, names=None,
127130
def read_table(filepath_or_buffer, sep='\t', header=0, index_col=None,
128131
names=None, skiprows=None, na_values=None, parse_dates=False,
129132
date_parser=None, nrows=None, iterator=False, chunksize=None,
130-
skip_footer=0, converters=None):
133+
skip_footer=0, converters=None, verbose=False):
131134
return read_csv(filepath_or_buffer, sep=sep, header=header,
132135
skiprows=skiprows, index_col=index_col,
133136
na_values=na_values, date_parser=date_parser,
134137
names=names, parse_dates=parse_dates,
135138
nrows=nrows, iterator=iterator, chunksize=chunksize,
136-
skip_footer=skip_footer, converters=converters)
139+
skip_footer=skip_footer, converters=converters,
140+
verbose=verbose)
137141

138142
def read_clipboard(**kwargs): # pragma: no cover
139143
"""
@@ -196,7 +200,7 @@ class TextParser(object):
196200
def __init__(self, f, delimiter=None, names=None, header=0,
197201
index_col=None, na_values=None, parse_dates=False,
198202
date_parser=None, chunksize=None, skiprows=None,
199-
skip_footer=0, converters=None):
203+
skip_footer=0, converters=None, verbose=False):
200204
"""
201205
Workhorse function for processing nested list into DataFrame
202206
@@ -215,6 +219,7 @@ def __init__(self, f, delimiter=None, names=None, header=0,
215219
self.skiprows = set() if skiprows is None else set(skiprows)
216220
self.skip_footer = skip_footer
217221
self.delimiter = delimiter
222+
self.verbose = verbose
218223

219224
if converters is not None:
220225
assert(isinstance(converters, dict))
@@ -412,14 +417,17 @@ def get_chunk(self, rows=None):
412417
if np.isscalar(self.index_col):
413418
if self.parse_dates:
414419
index = lib.try_parse_dates(index, parser=self.date_parser)
415-
index = Index(_convert_types(index, self.na_values),
416-
name=self.index_name)
420+
index, na_count = _convert_types(index, self.na_values)
421+
index = Index(index, name=self.index_name)
422+
if self.verbose and na_count:
423+
print 'Found %d NA values in the index' % na_count
417424
else:
418425
arrays = []
419426
for arr in index:
420427
if self.parse_dates:
421428
arr = lib.try_parse_dates(arr, parser=self.date_parser)
422-
arrays.append(_convert_types(arr, self.na_values))
429+
arr, _ = _convert_types(arr, self.na_values)
430+
arrays.append(arr)
423431
index = MultiIndex.from_arrays(arrays, names=self.index_name)
424432
else:
425433
index = Index(np.arange(len(content)))
@@ -442,7 +450,7 @@ def get_chunk(self, rows=None):
442450
result = result.astype('O')
443451
data[col] = result
444452

445-
data = _convert_to_ndarrays(data, self.na_values)
453+
data = _convert_to_ndarrays(data, self.na_values, self.verbose)
446454

447455
return DataFrame(data=data, columns=self.columns, index=index)
448456

@@ -483,24 +491,30 @@ def _get_lines(self, rows=None):
483491

484492
return lines
485493

486-
def _convert_to_ndarrays(dct, na_values):
494+
def _convert_to_ndarrays(dct, na_values, verbose=False):
487495
result = {}
488496
for c, values in dct.iteritems():
489-
result[c] = _convert_types(values, na_values)
497+
cvals, na_count = _convert_types(values, na_values)
498+
result[c] = cvals
499+
if verbose and na_count:
500+
print 'Filled %d NA values in column %s' % (na_count, str(c))
490501
return result
491502

492503
def _convert_types(values, na_values):
504+
na_count = 0
493505
if issubclass(values.dtype.type, (np.number, np.bool_)):
494-
return values
506+
return values, na_count
495507

496508
try:
497-
values = lib.maybe_convert_numeric(values, na_values)
509+
result = lib.maybe_convert_numeric(values, na_values)
498510
except Exception:
499-
lib.sanitize_objects(values, na_values)
511+
na_count = lib.sanitize_objects(values, na_values)
512+
result = values
500513

501-
if values.dtype == np.object_:
502-
return lib.maybe_convert_bool(values)
503-
return values
514+
if result.dtype == np.object_:
515+
result = lib.maybe_convert_bool(values)
516+
517+
return result, na_count
504518

505519
#-------------------------------------------------------------------------------
506520
# ExcelFile class

pandas/src/inference.pyx

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -331,6 +331,7 @@ def sanitize_objects(ndarray[object] values, set na_values):
331331
cdef:
332332
Py_ssize_t i, n
333333
object val, onan
334+
Py_ssize_t na_count = 0
334335

335336
n = len(values)
336337
onan = np.nan
@@ -339,6 +340,8 @@ def sanitize_objects(ndarray[object] values, set na_values):
339340
val = values[i]
340341
if val == '' or val in na_values:
341342
values[i] = onan
343+
na_count += 1
344+
return na_count
342345

343346
def maybe_convert_bool(ndarray[object] arr):
344347
cdef:

0 commit comments

Comments (0)