17
17
zip , string_types , map , u )
18
18
from pandas .types .common import (is_integer , _ensure_object ,
19
19
is_list_like , is_integer_dtype ,
20
- is_float ,
21
- is_scalar )
20
+ is_float , is_dtype_equal ,
21
+ is_object_dtype ,
22
+ is_scalar , is_categorical_dtype )
23
+ from pandas .types .missing import isnull
24
+ from pandas .types .cast import _astype_nansafe
22
25
from pandas .core .index import Index , MultiIndex , RangeIndex
23
26
from pandas .core .frame import DataFrame
27
+ from pandas .core .categorical import Categorical
24
28
from pandas .core .common import AbstractMethodError
25
29
from pandas .core .config import get_option
26
30
from pandas .io .date_converters import generic_parser
110
114
are duplicate names in the columns.
111
115
dtype : Type name or dict of column -> type, default None
112
116
Data type for data or columns. E.g. {'a': np.float64, 'b': np.int32}
113
- (Unsupported with engine='python'). Use `str` or `object` to preserve and
114
- not interpret dtype.
117
+ Use `str` or `object` to preserve and not interpret dtype.
118
+ If converters are specified, they will be applied AFTER
119
+ dtype conversion.
115
120
%s
116
121
converters : dict, default None
117
122
Dict of functions for converting values in certain columns. Keys can either
@@ -420,6 +425,7 @@ def _read(filepath_or_buffer, kwds):
420
425
'true_values' : None ,
421
426
'false_values' : None ,
422
427
'converters' : None ,
428
+ 'dtype' : None ,
423
429
'skipfooter' : 0 ,
424
430
425
431
'keep_default_na' : True ,
@@ -460,7 +466,6 @@ def _read(filepath_or_buffer, kwds):
460
466
'buffer_lines' : None ,
461
467
'error_bad_lines' : True ,
462
468
'warn_bad_lines' : True ,
463
- 'dtype' : None ,
464
469
'float_precision' : None
465
470
}
466
471
@@ -475,7 +480,6 @@ def _read(filepath_or_buffer, kwds):
475
480
'buffer_lines' ,
476
481
'error_bad_lines' ,
477
482
'warn_bad_lines' ,
478
- 'dtype' ,
479
483
'float_precision' ,
480
484
])
481
485
_deprecated_args = set ([
@@ -833,9 +837,6 @@ def _clean_options(self, options, engine):
833
837
" ignored as it is not supported by the 'python'"
834
838
" engine." ).format (reason = fallback_reason ,
835
839
option = arg )
836
- if arg == 'dtype' :
837
- msg += " (Note the 'converters' option provides" \
838
- " similar functionality.)"
839
840
raise ValueError (msg )
840
841
del result [arg ]
841
842
@@ -1284,48 +1285,78 @@ def _agg_index(self, index, try_parse_dates=True):
1284
1285
col_na_values , col_na_fvalues = _get_na_values (
1285
1286
col_name , self .na_values , self .na_fvalues )
1286
1287
1287
- arr , _ = self ._convert_types (arr , col_na_values | col_na_fvalues )
1288
+ arr , _ = self ._infer_types (arr , col_na_values | col_na_fvalues )
1288
1289
arrays .append (arr )
1289
1290
1290
1291
index = MultiIndex .from_arrays (arrays , names = self .index_names )
1291
1292
1292
1293
return index
1293
1294
1295
def _apply_converter(self, values, conv_f, na_values, col_na_values,
                     col_na_fvalues):
    """
    Run a user-supplied converter over a column, respecting NA markers.

    ``conv_f`` is first mapped over every element of ``values``.  If that
    raises ``ValueError``, the mapping is retried with the entries found
    in ``na_values`` masked out.

    The converted values are then passed through ``_infer_types`` with
    numeric inference disabled, using the per-column NA sets.

    Returns
    -------
    tuple
        ``(converted_values, na_count)`` as produced by ``_infer_types``.
    """
    try:
        converted = lib.map_infer(values, conv_f)
    except ValueError:
        # retry, skipping the entries that are NA markers
        na_mask = lib.ismember(values, na_values).view(np.uint8)
        converted = lib.map_infer_mask(values, conv_f, na_mask)

    column_na = set(col_na_values) | col_na_fvalues
    return self._infer_types(converted, column_na, try_numeric=False)
1308
+
1294
1309
def _convert_to_ndarrays(self, dct, na_values, na_fvalues, verbose=False,
                         converters=None, dtypes=None):
    """
    Convert every raw column in ``dct`` to its final typed values.

    Parameters
    ----------
    dct : dict
        Mapping of column label -> array of raw values.
    na_values, na_fvalues
        NA markers; resolved per column through ``_get_na_values`` when
        ``self.na_filter`` is set, otherwise no value is treated as NA.
    verbose : bool, default False
        Print the number of NA values filled for each column.
    converters : dict, optional
        Mapping of column label -> converter function.
    dtypes : dict or single dtype, optional
        Mapping of column label -> dtype, or one dtype used for every
        column.

    Returns
    -------
    dict
        Mapping of column label -> converted values.

    Notes
    -----
    When only a converter is given it is applied without numeric
    inference.  When a dtype is given the column is inferred, cast to
    that dtype, and any converter for the column is applied last, on
    the cast values.
    """
    result = {}
    for c, values in compat.iteritems(dct):
        conv_f = None if converters is None else converters.get(c, None)
        if isinstance(dtypes, dict):
            cast_type = dtypes.get(c, None)
        else:
            # single dtype or None
            cast_type = dtypes

        if self.na_filter:
            col_na_values, col_na_fvalues = _get_na_values(
                c, na_values, na_fvalues)
        else:
            col_na_values, col_na_fvalues = set(), set()

        if conv_f is not None and cast_type is None:
            # if type is not specified, apply the conversion first, without
            # inference
            cvals, na_count = self._apply_converter(
                values, conv_f, na_values,
                col_na_values, col_na_fvalues)
        else:
            # general type inference and conversion
            cvals, na_count = self._infer_types(
                values, set(col_na_values) | col_na_fvalues,
                try_numeric=True)

        if issubclass(cvals.dtype.type, np.integer) and self.compact_ints:
            cvals = lib.downcast_int64(
                cvals, _parser.na_values,
                self.use_unsigned)

        # Compare against None rather than relying on truthiness: a dtype
        # object is not guaranteed to be truthy.
        if cast_type is not None:
            if not is_dtype_equal(cvals, cast_type):
                # type specified in dtype param
                cvals = self._cast_types(cvals, cast_type, c)

            # for consistency with c-parser, if a converter and dtype are
            # specified, apply the converter last.  The converter must see
            # the cast values and its output must be what is stored.
            if conv_f is not None:
                # the cast may have produced a Categorical; hand the
                # converter a plain ndarray
                cvals, na_count = self._apply_converter(
                    np.asarray(cvals), conv_f, na_values,
                    col_na_values, col_na_fvalues)

        result[c] = cvals
        if verbose and na_count:
            print('Filled %d NA values in column %s' % (na_count, str(c)))
    return result
1327
1358
1328
- def _convert_types (self , values , na_values , try_num_bool = True ):
1359
+ def _infer_types (self , values , na_values , try_numeric = True ):
1329
1360
na_count = 0
1330
1361
if issubclass (values .dtype .type , (np .number , np .bool_ )):
1331
1362
mask = lib .ismember (values , na_values )
@@ -1336,9 +1367,10 @@ def _convert_types(self, values, na_values, try_num_bool=True):
1336
1367
np .putmask (values , mask , np .nan )
1337
1368
return values , na_count
1338
1369
1339
- if try_num_bool :
1370
+ if try_numeric :
1340
1371
try :
1341
1372
result = lib .maybe_convert_numeric (values , na_values , False )
1373
+ na_count = isnull (result ).sum ()
1342
1374
except Exception :
1343
1375
result = values
1344
1376
if values .dtype == np .object_ :
@@ -1348,13 +1380,30 @@ def _convert_types(self, values, na_values, try_num_bool=True):
1348
1380
if values .dtype == np .object_ :
1349
1381
na_count = lib .sanitize_objects (values , na_values , False )
1350
1382
1351
- if result .dtype == np .object_ and try_num_bool :
1383
+ if result .dtype == np .object_ and try_numeric :
1352
1384
result = lib .maybe_convert_bool (values ,
1353
1385
true_values = self .true_values ,
1354
1386
false_values = self .false_values )
1355
1387
1356
1388
return result , na_count
1357
1389
1390
def _cast_types(self, values, cast_type, column):
    """
    Cast one parsed column to the type requested through ``dtype=``.

    Parameters
    ----------
    values : array
        Column values after type inference.
    cast_type : dtype
        Target dtype for the column.
    column : label
        Column name, used only in the error message.

    Returns
    -------
    The cast values; a ``Categorical`` when ``cast_type`` is categorical.

    Raises
    ------
    ValueError
        If a non-categorical cast fails with ``ValueError``.
    """
    if not is_categorical_dtype(cast_type):
        try:
            return _astype_nansafe(values, cast_type, copy=True)
        except ValueError:
            raise ValueError("Unable to convert column %s to "
                             "type %s" % (column, cast_type))

    # XXX for consistency with the c-parser, which parses all categories
    # as strings, non-object values are stringified before categorizing
    if is_object_dtype(values):
        as_text = values
    else:
        as_text = _astype_nansafe(values, str)
    return Categorical(as_text)
1406
+
1358
1407
def _do_date_conversions (self , names , data ):
1359
1408
# returns data, columns
1360
1409
if self .parse_dates is not None :
@@ -1777,6 +1826,7 @@ def __init__(self, f, **kwds):
1777
1826
1778
1827
self .verbose = kwds ['verbose' ]
1779
1828
self .converters = kwds ['converters' ]
1829
+ self .dtype = kwds ['dtype' ]
1780
1830
1781
1831
self .compact_ints = kwds ['compact_ints' ]
1782
1832
self .use_unsigned = kwds ['use_unsigned' ]
@@ -1975,7 +2025,8 @@ def read(self, rows=None):
1975
2025
# DataFrame with the right metadata, even though it's length 0
1976
2026
names = self ._maybe_dedup_names (self .orig_names )
1977
2027
return _get_empty_meta (names , self .index_col ,
1978
- self .index_names )
2028
+ self .index_names ,
2029
+ self .dtype )
1979
2030
1980
2031
# handle new style for names in index
1981
2032
count_empty_content_vals = count_empty_vals (content [0 ])
@@ -2023,15 +2074,25 @@ def get_chunk(self, size=None):
2023
2074
2024
2075
def _convert_data(self, data):
    """
    Apply the configured converters and dtypes to the parsed columns.

    Integer keys in ``self.converters`` (and in ``self.dtype`` when it is
    a dict) that are not themselves column names are treated as positions
    into ``self.orig_names`` and replaced by the matching name.  The
    result is then handed to ``_convert_to_ndarrays``.
    """
    def _by_name(spec):
        "converts col numbers to names"
        named = {}
        for key, val in compat.iteritems(spec):
            positional = isinstance(key, int) and key not in self.orig_names
            if positional:
                key = self.orig_names[key]
            named[key] = val
        return named

    # apply converters
    converters_by_name = _by_name(self.converters)

    if isinstance(self.dtype, dict):
        dtypes_by_name = _by_name(self.dtype)
    else:
        # handles single dtype applied to all columns
        dtypes_by_name = self.dtype

    return self._convert_to_ndarrays(data, self.na_values, self.na_fvalues,
                                     self.verbose, converters_by_name,
                                     dtypes_by_name)
2035
2096
2036
2097
def _to_recarray (self , data , columns ):
2037
2098
dtypes = []
0 commit comments