pandas-dev
diff --git a/‎pandas/io/parsers.py
Lines changed: 94 additions & 33 deletions b/‎pandas/io/parsers.py
Lines changed: 94 additions & 33 deletions
@@ -17,10 +17,14 @@
                            zip, string_types, map, u)
 from pandas.types.common import (is_integer, _ensure_object,
                                  is_list_like, is_integer_dtype,
-                                 is_float,
-                                 is_scalar)
+                                 is_float, is_dtype_equal,
+                                 is_object_dtype,
+                                 is_scalar, is_categorical_dtype)
+from pandas.types.missing import isnull
+from pandas.types.cast import _astype_nansafe
 from pandas.core.index import Index, MultiIndex, RangeIndex
 from pandas.core.frame import DataFrame
+from pandas.core.categorical import Categorical
 from pandas.core.common import AbstractMethodError
 from pandas.core.config import get_option
 from pandas.io.date_converters import generic_parser
@@ -110,8 +114,9 @@
     are duplicate names in the columns.
 dtype : Type name or dict of column -> type, default None
     Data type for data or columns. E.g. {'a': np.float64, 'b': np.int32}
-    (Unsupported with engine='python'). Use `str` or `object` to preserve and
-    not interpret dtype.
+    Use `str` or `object` to preserve and not interpret dtype.
+    If converters are specified, they will be applied AFTER
+    dtype conversion.
 %s
 converters : dict, default None
     Dict of functions for converting values in certain columns. Keys can either
@@ -420,6 +425,7 @@ def _read(filepath_or_buffer, kwds):
     'true_values': None,
     'false_values': None,
     'converters': None,
+    'dtype': None,
     'skipfooter': 0,
 
     'keep_default_na': True,
@@ -460,7 +466,6 @@ def _read(filepath_or_buffer, kwds):
     'buffer_lines': None,
     'error_bad_lines': True,
     'warn_bad_lines': True,
-    'dtype': None,
     'float_precision': None
 }
 
@@ -475,7 +480,6 @@ def _read(filepath_or_buffer, kwds):
     'buffer_lines',
     'error_bad_lines',
     'warn_bad_lines',
-    'dtype',
     'float_precision',
 ])
 _deprecated_args = set([
@@ -833,9 +837,6 @@ def _clean_options(self, options, engine):
                            " ignored as it is not supported by the 'python'"
                            " engine.").format(reason=fallback_reason,
                                               option=arg)
-                    if arg == 'dtype':
-                        msg += " (Note the 'converters' option provides"\
-                               " similar functionality.)"
                     raise ValueError(msg)
                 del result[arg]
 
@@ -1284,48 +1285,78 @@ def _agg_index(self, index, try_parse_dates=True):
                     col_na_values, col_na_fvalues = _get_na_values(
                         col_name, self.na_values, self.na_fvalues)
 
-            arr, _ = self._convert_types(arr, col_na_values | col_na_fvalues)
+            arr, _ = self._infer_types(arr, col_na_values | col_na_fvalues)
             arrays.append(arr)
 
         index = MultiIndex.from_arrays(arrays, names=self.index_names)
 
         return index
 
+    def _apply_converter(self, values, conv_f, na_values, col_na_values,
+                         col_na_fvalues):
+        """
+        Map ``conv_f`` over ``values`` and pass the result through
+        ``self._infer_types`` with ``try_numeric=False``.
+
+        If the plain mapping raises ``ValueError``, the mapping is retried
+        through ``lib.map_infer_mask`` using a mask built from membership
+        of ``values`` in ``na_values``.
+
+        Returns the ``(converted values, na count)`` pair produced by
+        ``self._infer_types``.
+        """
+        try:
+            converted = lib.map_infer(values, conv_f)
+        except ValueError:
+            na_mask = lib.ismember(values, na_values).view(np.uint8)
+            converted = lib.map_infer_mask(values, conv_f, na_mask)
+
+        # NA markers for this column: union of the two per-column sets
+        col_nas = set(col_na_values) | col_na_fvalues
+        cvals, na_count = self._infer_types(converted, col_nas,
+                                            try_numeric=False)
+        return cvals, na_count
+
     def _convert_to_ndarrays(self, dct, na_values, na_fvalues, verbose=False,
-                             converters=None):
+                             converters=None, dtypes=None):
+        """
+        Convert every column of ``dct`` and return the results as a dict.
+
+        Parameters
+        ----------
+        dct : dict of column -> raw values
+        na_values, na_fvalues : NA markers, narrowed per column through
+            ``_get_na_values`` when ``self.na_filter`` is set
+        verbose : bool, print the NA count of each column that has any
+        converters : dict of column -> callable, or None
+        dtypes : dict of column -> dtype, a single dtype applied to every
+            column, or None
+
+        Returns
+        -------
+        dict of column -> converted values
+
+        Notes
+        -----
+        A column with only a converter is converted without numeric
+        inference.  A column with a dtype is inferred, cast to that dtype
+        if needed, and then, if it also has a converter, replaced by the
+        converter's output.
+        """
         result = {}
         for c, values in compat.iteritems(dct):
             conv_f = None if converters is None else converters.get(c, None)
+            if isinstance(dtypes, dict):
+                cast_type = dtypes.get(c, None)
+            else:
+                # single dtype or None
+                cast_type = dtypes
 
             if self.na_filter:
                 col_na_values, col_na_fvalues = _get_na_values(
                     c, na_values, na_fvalues)
             else:
                 col_na_values, col_na_fvalues = set(), set()
 
-            coerce_type = True
-            if conv_f is not None:
-                try:
-                    values = lib.map_infer(values, conv_f)
-                except ValueError:
-                    mask = lib.ismember(values, na_values).view(np.uint8)
-                    values = lib.map_infer_mask(values, conv_f, mask)
-                coerce_type = False
-
-            cvals, na_count = self._convert_types(
-                values, set(col_na_values) | col_na_fvalues, coerce_type)
+            if conv_f is not None and cast_type is None:
+                # if type is not specified, apply the conversion first, without
+                # inference
+                cvals, na_count = self._apply_converter(
+                    values, conv_f, na_values,
+                    col_na_values, col_na_fvalues)
+            else:
+                # general type inference and conversion
+                cvals, na_count = self._infer_types(
+                    values, set(col_na_values) | col_na_fvalues,
+                    try_numeric=True)
 
             if issubclass(cvals.dtype.type, np.integer) and self.compact_ints:
                 cvals = lib.downcast_int64(
                     cvals, _parser.na_values,
                     self.use_unsigned)
 
+            # explicit None check (same test as above) rather than relying
+            # on the truthiness of a dtype object
+            if cast_type is not None:
+                # type specified in dtype param
+                if not is_dtype_equal(cvals, cast_type):
+                    cvals = self._cast_types(cvals, cast_type, c)
+
+                # for consistency with c-parser, if a converter and dtype are
+                # specified, apply the converter last; its output must be
+                # stored in cvals, otherwise it never reaches the result
+                if conv_f is not None:
+                    cvals, na_count = self._apply_converter(
+                        values, conv_f, na_values,
+                        col_na_values, col_na_fvalues)
+
             result[c] = cvals
             if verbose and na_count:
                 print('Filled %d NA values in column %s' % (na_count, str(c)))
         return result
 
-    def _convert_types(self, values, na_values, try_num_bool=True):
+    def _infer_types(self, values, na_values, try_numeric=True):
         na_count = 0
         if issubclass(values.dtype.type, (np.number, np.bool_)):
             mask = lib.ismember(values, na_values)
@@ -1336,9 +1367,10 @@ def _convert_types(self, values, na_values, try_num_bool=True):
                 np.putmask(values, mask, np.nan)
             return values, na_count
 
-        if try_num_bool:
+        if try_numeric:
             try:
                 result = lib.maybe_convert_numeric(values, na_values, False)
+                na_count = isnull(result).sum()
             except Exception:
                 result = values
                 if values.dtype == np.object_:
@@ -1348,13 +1380,30 @@ def _convert_types(self, values, na_values, try_num_bool=True):
             if values.dtype == np.object_:
                 na_count = lib.sanitize_objects(values, na_values, False)
 
-        if result.dtype == np.object_ and try_num_bool:
+        if result.dtype == np.object_ and try_numeric:
             result = lib.maybe_convert_bool(values,
                                             true_values=self.true_values,
                                             false_values=self.false_values)
 
         return result, na_count
 
+    def _cast_types(self, values, cast_type, column):
+        """
+        Cast ``values`` to ``cast_type``, the type requested for ``column``
+        through the dtype= parameter.
+
+        Raises ValueError naming the column and the target type when a
+        non-categorical cast fails with ValueError.
+        """
+        if not is_categorical_dtype(cast_type):
+            try:
+                return _astype_nansafe(values, cast_type, copy=True)
+            except ValueError:
+                raise ValueError("Unable to convert column %s to "
+                                 "type %s" % (column, cast_type))
+
+        # XXX the c-parser parses all categories as strings; stay
+        # consistent with it by stringifying anything that is not
+        # already object dtype before building the Categorical
+        if not is_object_dtype(values):
+            values = _astype_nansafe(values, str)
+        return Categorical(values)
+
     def _do_date_conversions(self, names, data):
         # returns data, columns
         if self.parse_dates is not None:
@@ -1777,6 +1826,7 @@ def __init__(self, f, **kwds):
 
         self.verbose = kwds['verbose']
         self.converters = kwds['converters']
+        self.dtype = kwds['dtype']
 
         self.compact_ints = kwds['compact_ints']
         self.use_unsigned = kwds['use_unsigned']
@@ -1975,7 +2025,8 @@ def read(self, rows=None):
             # DataFrame with the right metadata, even though it's length 0
             names = self._maybe_dedup_names(self.orig_names)
             return _get_empty_meta(names, self.index_col,
-                                   self.index_names)
+                                   self.index_names,
+                                   self.dtype)
 
         # handle new style for names in index
         count_empty_content_vals = count_empty_vals(content[0])
@@ -2023,15 +2074,25 @@ def get_chunk(self, size=None):
 
     def _convert_data(self, data):
         # apply converters
-        clean_conv = {}
-
-        for col, f in compat.iteritems(self.converters):
-            if isinstance(col, int) and col not in self.orig_names:
-                col = self.orig_names[col]
-            clean_conv[col] = f
+        def _clean_mapping(mapping):
+            """Return a copy of ``mapping`` keyed by column name.
+
+            An integer key that is not itself one of ``self.orig_names``
+            is treated as a position into ``self.orig_names``.
+            """
+            def _resolve(key):
+                if isinstance(key, int) and key not in self.orig_names:
+                    return self.orig_names[key]
+                return key
+
+            return dict((_resolve(key), val)
+                        for key, val in compat.iteritems(mapping))
+
+        clean_conv = _clean_mapping(self.converters)
+        if isinstance(self.dtype, dict):
+            clean_dtypes = _clean_mapping(self.dtype)
+        else:
+            # a single dtype (or None) is passed through unchanged and
+            # applies to all columns
+            clean_dtypes = self.dtype
 
         return self._convert_to_ndarrays(data, self.na_values, self.na_fvalues,
-                                         self.verbose, clean_conv)
+                                         self.verbose, clean_conv,
+                                         clean_dtypes)
 
     def _to_recarray(self, data, columns):
         dtypes = []