Skip to content

Commit d89790c

Browse files
committed
API: add dtype= option to python parser
1 parent 99b5876 commit d89790c

File tree

4 files changed

+316
-225
lines changed

4 files changed

+316
-225
lines changed

pandas/io/parsers.py

Lines changed: 94 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -17,10 +17,14 @@
1717
zip, string_types, map, u)
1818
from pandas.types.common import (is_integer, _ensure_object,
1919
is_list_like, is_integer_dtype,
20-
is_float,
21-
is_scalar)
20+
is_float, is_dtype_equal,
21+
is_object_dtype,
22+
is_scalar, is_categorical_dtype)
23+
from pandas.types.missing import isnull
24+
from pandas.types.cast import _astype_nansafe
2225
from pandas.core.index import Index, MultiIndex, RangeIndex
2326
from pandas.core.frame import DataFrame
27+
from pandas.core.categorical import Categorical
2428
from pandas.core.common import AbstractMethodError
2529
from pandas.core.config import get_option
2630
from pandas.io.date_converters import generic_parser
@@ -110,8 +114,9 @@
110114
are duplicate names in the columns.
111115
dtype : Type name or dict of column -> type, default None
112116
Data type for data or columns. E.g. {'a': np.float64, 'b': np.int32}
113-
(Unsupported with engine='python'). Use `str` or `object` to preserve and
114-
not interpret dtype.
117+
Use `str` or `object` to preserve and not interpret dtype.
118+
If converters are specified, they will be applied AFTER
119+
dtype conversion.
115120
%s
116121
converters : dict, default None
117122
Dict of functions for converting values in certain columns. Keys can either
@@ -420,6 +425,7 @@ def _read(filepath_or_buffer, kwds):
420425
'true_values': None,
421426
'false_values': None,
422427
'converters': None,
428+
'dtype': None,
423429
'skipfooter': 0,
424430

425431
'keep_default_na': True,
@@ -460,7 +466,6 @@ def _read(filepath_or_buffer, kwds):
460466
'buffer_lines': None,
461467
'error_bad_lines': True,
462468
'warn_bad_lines': True,
463-
'dtype': None,
464469
'float_precision': None
465470
}
466471

@@ -475,7 +480,6 @@ def _read(filepath_or_buffer, kwds):
475480
'buffer_lines',
476481
'error_bad_lines',
477482
'warn_bad_lines',
478-
'dtype',
479483
'float_precision',
480484
])
481485
_deprecated_args = set([
@@ -833,9 +837,6 @@ def _clean_options(self, options, engine):
833837
" ignored as it is not supported by the 'python'"
834838
" engine.").format(reason=fallback_reason,
835839
option=arg)
836-
if arg == 'dtype':
837-
msg += " (Note the 'converters' option provides"\
838-
" similar functionality.)"
839840
raise ValueError(msg)
840841
del result[arg]
841842

@@ -1284,48 +1285,78 @@ def _agg_index(self, index, try_parse_dates=True):
12841285
col_na_values, col_na_fvalues = _get_na_values(
12851286
col_name, self.na_values, self.na_fvalues)
12861287

1287-
arr, _ = self._convert_types(arr, col_na_values | col_na_fvalues)
1288+
arr, _ = self._infer_types(arr, col_na_values | col_na_fvalues)
12881289
arrays.append(arr)
12891290

12901291
index = MultiIndex.from_arrays(arrays, names=self.index_names)
12911292

12921293
return index
12931294

1295+
def _apply_converter(self, values, conv_f, na_values, col_na_values,
1296+
col_na_fvalues):
1297+
""" apply converter function to values, respecting NAs """
1298+
try:
1299+
values = lib.map_infer(values, conv_f)
1300+
except ValueError:
1301+
mask = lib.ismember(values, na_values).view(np.uint8)
1302+
values = lib.map_infer_mask(values, conv_f, mask)
1303+
1304+
cvals, na_count = self._infer_types(
1305+
values, set(col_na_values) | col_na_fvalues,
1306+
try_numeric=False)
1307+
return cvals, na_count
1308+
12941309
def _convert_to_ndarrays(self, dct, na_values, na_fvalues, verbose=False,
1295-
converters=None):
1310+
converters=None, dtypes=None):
12961311
result = {}
12971312
for c, values in compat.iteritems(dct):
12981313
conv_f = None if converters is None else converters.get(c, None)
1314+
if isinstance(dtypes, dict):
1315+
cast_type = dtypes.get(c, None)
1316+
else:
1317+
# single dtype or None
1318+
cast_type = dtypes
12991319

13001320
if self.na_filter:
13011321
col_na_values, col_na_fvalues = _get_na_values(
13021322
c, na_values, na_fvalues)
13031323
else:
13041324
col_na_values, col_na_fvalues = set(), set()
13051325

1306-
coerce_type = True
1307-
if conv_f is not None:
1308-
try:
1309-
values = lib.map_infer(values, conv_f)
1310-
except ValueError:
1311-
mask = lib.ismember(values, na_values).view(np.uint8)
1312-
values = lib.map_infer_mask(values, conv_f, mask)
1313-
coerce_type = False
1314-
1315-
cvals, na_count = self._convert_types(
1316-
values, set(col_na_values) | col_na_fvalues, coerce_type)
1326+
if conv_f is not None and cast_type is None:
1327+
# if type is not specified, apply the conversion first, without
1328+
# inference
1329+
cvals, na_count = self._apply_converter(
1330+
values, conv_f, na_values,
1331+
col_na_values, col_na_fvalues)
1332+
else:
1333+
# general type inference and conversion
1334+
cvals, na_count = self._infer_types(
1335+
values, set(col_na_values) | col_na_fvalues,
1336+
try_numeric=True)
13171337

13181338
if issubclass(cvals.dtype.type, np.integer) and self.compact_ints:
13191339
cvals = lib.downcast_int64(
13201340
cvals, _parser.na_values,
13211341
self.use_unsigned)
13221342

1343+
if cast_type and not is_dtype_equal(cvals, cast_type):
1344+
# type specified in dtype param
1345+
1346+
cvals = self._cast_types(cvals, cast_type, c)
1347+
# for consistency with c-parser, if a converter and dtype are
1348+
# specified, apply the converter last
1349+
if conv_f is not None:
1350+
values, na_count = self._apply_converter(
1351+
values, conv_f, na_values,
1352+
col_na_values, col_na_fvalues)
1353+
13231354
result[c] = cvals
13241355
if verbose and na_count:
13251356
print('Filled %d NA values in column %s' % (na_count, str(c)))
13261357
return result
13271358

1328-
def _convert_types(self, values, na_values, try_num_bool=True):
1359+
def _infer_types(self, values, na_values, try_numeric=True):
13291360
na_count = 0
13301361
if issubclass(values.dtype.type, (np.number, np.bool_)):
13311362
mask = lib.ismember(values, na_values)
@@ -1336,9 +1367,10 @@ def _convert_types(self, values, na_values, try_num_bool=True):
13361367
np.putmask(values, mask, np.nan)
13371368
return values, na_count
13381369

1339-
if try_num_bool:
1370+
if try_numeric:
13401371
try:
13411372
result = lib.maybe_convert_numeric(values, na_values, False)
1373+
na_count = isnull(result).sum()
13421374
except Exception:
13431375
result = values
13441376
if values.dtype == np.object_:
@@ -1348,13 +1380,30 @@ def _convert_types(self, values, na_values, try_num_bool=True):
13481380
if values.dtype == np.object_:
13491381
na_count = lib.sanitize_objects(values, na_values, False)
13501382

1351-
if result.dtype == np.object_ and try_num_bool:
1383+
if result.dtype == np.object_ and try_numeric:
13521384
result = lib.maybe_convert_bool(values,
13531385
true_values=self.true_values,
13541386
false_values=self.false_values)
13551387

13561388
return result, na_count
13571389

1390+
def _cast_types(self, values, cast_type, column):
1391+
""" cast column to type specified in dtypes= param """
1392+
if is_categorical_dtype(cast_type):
1393+
# XXX this is for consistency with
1394+
# c-parser which parses all categories
1395+
# as strings
1396+
if not is_object_dtype(values):
1397+
values = _astype_nansafe(values, str)
1398+
values = Categorical(values)
1399+
else:
1400+
try:
1401+
values = _astype_nansafe(values, cast_type, copy=True)
1402+
except ValueError:
1403+
raise ValueError("Unable to convert column %s to "
1404+
"type %s" % (column, cast_type))
1405+
return values
1406+
13581407
def _do_date_conversions(self, names, data):
13591408
# returns data, columns
13601409
if self.parse_dates is not None:
@@ -1777,6 +1826,7 @@ def __init__(self, f, **kwds):
17771826

17781827
self.verbose = kwds['verbose']
17791828
self.converters = kwds['converters']
1829+
self.dtype = kwds['dtype']
17801830

17811831
self.compact_ints = kwds['compact_ints']
17821832
self.use_unsigned = kwds['use_unsigned']
@@ -1975,7 +2025,8 @@ def read(self, rows=None):
19752025
# DataFrame with the right metadata, even though it's length 0
19762026
names = self._maybe_dedup_names(self.orig_names)
19772027
return _get_empty_meta(names, self.index_col,
1978-
self.index_names)
2028+
self.index_names,
2029+
self.dtype)
19792030

19802031
# handle new style for names in index
19812032
count_empty_content_vals = count_empty_vals(content[0])
@@ -2023,15 +2074,25 @@ def get_chunk(self, size=None):
20232074

20242075
def _convert_data(self, data):
20252076
# apply converters
2026-
clean_conv = {}
2027-
2028-
for col, f in compat.iteritems(self.converters):
2029-
if isinstance(col, int) and col not in self.orig_names:
2030-
col = self.orig_names[col]
2031-
clean_conv[col] = f
2077+
def _clean_mapping(mapping):
2078+
"converts col numbers to names"
2079+
clean = {}
2080+
for col, v in compat.iteritems(mapping):
2081+
if isinstance(col, int) and col not in self.orig_names:
2082+
col = self.orig_names[col]
2083+
clean[col] = v
2084+
return clean
2085+
2086+
clean_conv = _clean_mapping(self.converters)
2087+
if not isinstance(self.dtype, dict):
2088+
# handles single dtype applied to all columns
2089+
clean_dtypes = self.dtype
2090+
else:
2091+
clean_dtypes = _clean_mapping(self.dtype)
20322092

20332093
return self._convert_to_ndarrays(data, self.na_values, self.na_fvalues,
2034-
self.verbose, clean_conv)
2094+
self.verbose, clean_conv,
2095+
clean_dtypes)
20352096

20362097
def _to_recarray(self, data, columns):
20372098
dtypes = []

0 commit comments

Comments
 (0)