diff --git a/doc/source/io.rst b/doc/source/io.rst index f524d37d0de60..f22374553e9c3 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -2538,6 +2538,20 @@ missing data to recover integer dtype: cfun = lambda x: int(x) if x else -1 read_excel('path_to_file.xls', 'Sheet1', converters={'MyInts': cfun}) +dtype Specifications +++++++++++++++++++++ + +.. versionadded:: 0.20.0 + +As an alternative to converters, the type for an entire column can +be specified using the ``dtype`` keyword, which takes a dictionary +mapping column names to types. To interpret data with +no type inference, use the type ``str`` or ``object``. + +.. code-block:: python + + read_excel('path_to_file.xls', dtype={'MyInts': 'int64', 'MyText': str}) + .. _io.excel_writer: Writing Excel Files diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 6fe0ad8092a03..06517c1489861 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -22,8 +22,8 @@ New features ~~~~~~~~~~~~ -``read_csv`` supports ``dtype`` keyword for python engine -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +``dtype`` keyword for data IO +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ The ``dtype`` keyword argument in the :func:`read_csv` function for specifying the types of parsed columns is now supported with the ``'python'`` engine (:issue:`14295`). See the :ref:`io docs <io.dtypes>` for more information. @@ -35,7 +35,7 @@ The ``dtype`` keyword argument in the :func:`read_csv` function for specifying t pd.read_csv(StringIO(data), engine='python', dtype={'a':'float64', 'b':'object'}).dtypes The ``dtype`` keyword argument is also now supported in the :func:`read_fwf` function for parsing -fixed-width text files. +fixed-width text files, and :func:`read_excel` for parsing Excel files. .. 
ipython:: python diff --git a/pandas/io/excel.py b/pandas/io/excel.py index d3171ceedfc03..6b7c597ecfcdc 100644 --- a/pandas/io/excel.py +++ b/pandas/io/excel.py @@ -87,6 +87,14 @@ either be integers or column labels, values are functions that take one input argument, the Excel cell content, and return the transformed content. +dtype : Type name or dict of column -> type, default None + Data type for data or columns. E.g. {'a': np.float64, 'b': np.int32} + Use `str` or `object` to preserve and not interpret dtype. + If converters are specified, they will be applied INSTEAD + of dtype conversion. + + .. versionadded:: 0.20.0 + true_values : list, default None Values to consider as True @@ -184,8 +192,8 @@ def read_excel(io, sheetname=0, header=0, skiprows=None, skip_footer=0, index_col=None, names=None, parse_cols=None, parse_dates=False, date_parser=None, na_values=None, thousands=None, convert_float=True, has_index_names=None, converters=None, - true_values=None, false_values=None, engine=None, squeeze=False, - **kwds): + dtype=None, true_values=None, false_values=None, engine=None, + squeeze=False, **kwds): if not isinstance(io, ExcelFile): io = ExcelFile(io, engine=engine) @@ -195,7 +203,7 @@ def read_excel(io, sheetname=0, header=0, skiprows=None, skip_footer=0, index_col=index_col, parse_cols=parse_cols, parse_dates=parse_dates, date_parser=date_parser, na_values=na_values, thousands=thousands, convert_float=convert_float, has_index_names=has_index_names, - skip_footer=skip_footer, converters=converters, + skip_footer=skip_footer, converters=converters, dtype=dtype, true_values=true_values, false_values=false_values, squeeze=squeeze, **kwds) @@ -318,7 +326,7 @@ def _parse_excel(self, sheetname=0, header=0, skiprows=None, names=None, parse_cols=None, parse_dates=False, date_parser=None, na_values=None, thousands=None, convert_float=True, true_values=None, false_values=None, verbose=False, - squeeze=False, **kwds): + dtype=None, squeeze=False, **kwds): 
skipfooter = kwds.pop('skipfooter', None) if skipfooter is not None: @@ -501,6 +509,7 @@ def _parse_cell(cell_contents, cell_typ): skiprows=skiprows, skipfooter=skip_footer, squeeze=squeeze, + dtype=dtype, **kwds) output[asheetname] = parser.read() diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 580a3398bb66a..ef839297c80d3 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -18,7 +18,7 @@ from pandas.types.common import (is_integer, _ensure_object, is_list_like, is_integer_dtype, is_float, is_dtype_equal, - is_object_dtype, + is_object_dtype, is_string_dtype, is_scalar, is_categorical_dtype) from pandas.types.missing import isnull from pandas.types.cast import _astype_nansafe @@ -1329,7 +1329,7 @@ def _convert_to_ndarrays(self, dct, na_values, na_fvalues, verbose=False, try_num_bool=False) else: # skip inference if specified dtype is object - try_num_bool = not (cast_type and is_object_dtype(cast_type)) + try_num_bool = not (cast_type and is_string_dtype(cast_type)) # general type inference and conversion cvals, na_count = self._infer_types( diff --git a/pandas/io/tests/data/testdtype.xls b/pandas/io/tests/data/testdtype.xls new file mode 100644 index 0000000000000..f63357524324f Binary files /dev/null and b/pandas/io/tests/data/testdtype.xls differ diff --git a/pandas/io/tests/data/testdtype.xlsm b/pandas/io/tests/data/testdtype.xlsm new file mode 100644 index 0000000000000..20e658288d5ac Binary files /dev/null and b/pandas/io/tests/data/testdtype.xlsm differ diff --git a/pandas/io/tests/data/testdtype.xlsx b/pandas/io/tests/data/testdtype.xlsx new file mode 100644 index 0000000000000..7c65263c373a3 Binary files /dev/null and b/pandas/io/tests/data/testdtype.xlsx differ diff --git a/pandas/io/tests/test_excel.py b/pandas/io/tests/test_excel.py index 49a508dd22023..9c909398d2d88 100644 --- a/pandas/io/tests/test_excel.py +++ b/pandas/io/tests/test_excel.py @@ -373,6 +373,33 @@ def test_reader_converters(self): actual = 
self.get_exceldf(basename, 'Sheet1', converters=converters) tm.assert_frame_equal(actual, expected) + def test_reader_dtype(self): + # GH 8212 + basename = 'testdtype' + actual = self.get_exceldf(basename) + + expected = DataFrame({ + 'a': [1, 2, 3, 4], + 'b': [2.5, 3.5, 4.5, 5.5], + 'c': [1, 2, 3, 4], + 'd': [1.0, 2.0, np.nan, 4.0]}).reindex( + columns=['a', 'b', 'c', 'd']) + + tm.assert_frame_equal(actual, expected) + + actual = self.get_exceldf(basename, + dtype={'a': 'float64', + 'b': 'float32', + 'c': str}) + + expected['a'] = expected['a'].astype('float64') + expected['b'] = expected['b'].astype('float32') + expected['c'] = ['001', '002', '003', '004'] + tm.assert_frame_equal(actual, expected) + + with tm.assertRaises(ValueError): + actual = self.get_exceldf(basename, dtype={'d': 'int64'}) + def test_reading_all_sheets(self): # Test reading all sheetnames by setting sheetname to None, # Ensure a dict is returned.