Merge remote-tracking branch 'upstream/master' into timestamp_tz_constructor_depr

Matt Roeschke · Matt Roeschke · commit 4a5b7199dd03 · 2018-11-13T09:54:57.000-08:00
diff --git a/doc/source/api.rst b/doc/source/api.rst
@@ -1724,6 +1724,7 @@ MultiIndex Components
    MultiIndex.set_levels
    MultiIndex.set_labels
    MultiIndex.to_hierarchical
+   MultiIndex.to_flat_index
    MultiIndex.to_frame
    MultiIndex.is_lexsorted
    MultiIndex.sortlevel
diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt
@@ -238,6 +238,7 @@ Other Enhancements
 - :func:`~DataFrame.to_parquet` now supports writing a ``DataFrame`` as a directory of parquet files partitioned by a subset of the columns when ``engine = 'pyarrow'`` (:issue:`23283`)
 - :meth:`Timestamp.tz_localize`, :meth:`DatetimeIndex.tz_localize`, and :meth:`Series.tz_localize` have gained the ``nonexistent`` argument for alternative handling of nonexistent times. See :ref:`timeseries.timezone_nonexistent` (:issue:`8917`)
 - :meth:`read_excel()` now accepts ``usecols`` as a list of column names or callable (:issue:`18273`)
+- :meth:`MultiIndex.to_flat_index` has been added to flatten multiple levels into a single-level :class:`Index` object.
 
 .. _whatsnew_0240.api_breaking:
 
@@ -1264,9 +1265,6 @@ MultiIndex
 I/O
 ^^^
 
-- Bug in :meth:`to_sql` when writing timezone aware data (``datetime64[ns, tz]`` dtype) would raise a ``TypeError`` (:issue:`9086`)
-- Bug in :meth:`to_sql` where a naive DatetimeIndex would be written as ``TIMESTAMP WITH TIMEZONE`` type in supported databases, e.g. PostgreSQL (:issue:`23510`)
-
 .. _whatsnew_0240.bug_fixes.nan_with_str_dtype:
 
 Proper handling of `np.NaN` in a string data-typed column with the Python engine
@@ -1302,6 +1300,9 @@ Current Behavior:
 
 Notice how we now instead output ``np.nan`` itself instead of a stringified form of it.
 
+- Bug in :meth:`to_sql` when writing timezone aware data (``datetime64[ns, tz]`` dtype) would raise a ``TypeError`` (:issue:`9086`)
+- Bug in :meth:`to_sql` where a naive DatetimeIndex would be written as ``TIMESTAMP WITH TIMEZONE`` type in supported databases, e.g. PostgreSQL (:issue:`23510`)
+- Bug in :meth:`read_excel()` when ``parse_cols`` is specified with an empty dataset (:issue:`9208`)
 - :func:`read_html()` no longer ignores all-whitespace ``<tr>`` within ``<thead>`` when considering the ``skiprows`` and ``header`` arguments. Previously, users had to decrease their ``header`` and ``skiprows`` values on such tables to work around the issue. (:issue:`21641`)
 - :func:`read_excel()` will correctly show the deprecation warning for previously deprecated ``sheetname`` (:issue:`17994`)
 - :func:`read_csv()` and func:`read_table()` will throw ``UnicodeError`` and not coredump on badly encoded strings (:issue:`22748`)
diff --git a/pandas/_libs/intervaltree.pxi.in b/pandas/_libs/intervaltree.pxi.in
@@ -116,7 +116,7 @@ cdef class IntervalTree(IntervalMixin):
         enclosing = self.get_loc(0.5 * (key_left + key_right))
         combined = np.concatenate([left_overlap, right_overlap, enclosing])
         uniques = pd.unique(combined)
-        return uniques
+        return uniques.astype('intp')
 
     def get_indexer(self, scalar_t[:] target):
         """Return the positions corresponding to unique intervals that overlap
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -1829,24 +1829,6 @@ def to_panel(self):
 
         return self._constructor_expanddim(new_mgr)
 
-    @Appender(_shared_docs['to_excel'] % _shared_doc_kwargs)
-    def to_excel(self, excel_writer, sheet_name='Sheet1', na_rep='',
-                 float_format=None, columns=None, header=True, index=True,
-                 index_label=None, startrow=0, startcol=0, engine=None,
-                 merge_cells=True, encoding=None, inf_rep='inf', verbose=True,
-                 freeze_panes=None):
-
-        from pandas.io.formats.excel import ExcelFormatter
-        formatter = ExcelFormatter(self, na_rep=na_rep, cols=columns,
-                                   header=header,
-                                   float_format=float_format, index=index,
-                                   index_label=index_label,
-                                   merge_cells=merge_cells,
-                                   inf_rep=inf_rep)
-        formatter.write(excel_writer, sheet_name=sheet_name, startrow=startrow,
-                        startcol=startcol, freeze_panes=freeze_panes,
-                        engine=engine)
-
     @deprecate_kwarg(old_arg_name='encoding', new_arg_name=None)
     def to_stata(self, fname, convert_dates=None, write_index=True,
                  encoding="latin-1", byteorder=None, time_stamp=None,
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
@@ -1977,16 +1977,17 @@ def _repr_latex_(self):
     # I/O Methods
 
     _shared_docs['to_excel'] = """
-    Write %(klass)s to an excel sheet.
+    Write %(klass)s to an Excel sheet.
 
-    To write a single %(klass)s to an excel .xlsx file it is only necessary to
+    To write a single %(klass)s to an Excel .xlsx file it is only necessary to
     specify a target file name. To write to multiple sheets it is necessary to
     create an `ExcelWriter` object with a target file name, and specify a sheet
-    in the file to write to. Multiple sheets may be written to by
-    specifying unique `sheet_name`. With all data written to the file it is
-    necessary to save the changes. Note that creating an ExcelWriter object
-    with a file name that already exists will result in the contents of the
-    existing file being erased.
+    in the file to write to.
+
+    Multiple sheets may be written to by specifying unique `sheet_name`.
+    With all data written to the file it is necessary to save the changes.
+    Note that creating an `ExcelWriter` object with a file name that already
+    exists will result in the contents of the existing file being erased.
 
     Parameters
     ----------
@@ -9951,6 +9952,25 @@ def to_csv(self, path_or_buf=None, sep=",", na_rep='', float_format=None,
         if path_or_buf is None:
             return formatter.path_or_buf.getvalue()
 
+    @Appender(_shared_docs["to_excel"] % dict(klass="object"))
+    def to_excel(self, excel_writer, sheet_name="Sheet1", na_rep="",
+                 float_format=None, columns=None, header=True, index=True,
+                 index_label=None, startrow=0, startcol=0, engine=None,
+                 merge_cells=True, encoding=None, inf_rep="inf", verbose=True,
+                 freeze_panes=None):
+        df = self if isinstance(self, ABCDataFrame) else self.to_frame()
+
+        from pandas.io.formats.excel import ExcelFormatter
+        formatter = ExcelFormatter(df, na_rep=na_rep, cols=columns,
+                                   header=header,
+                                   float_format=float_format, index=index,
+                                   index_label=index_label,
+                                   merge_cells=merge_cells,
+                                   inf_rep=inf_rep)
+        formatter.write(excel_writer, sheet_name=sheet_name, startrow=startrow,
+                        startcol=startcol, freeze_panes=freeze_panes,
+                        engine=engine)
+
 
 def _doc_parms(cls):
     """Return a tuple of the doc parms."""
diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
@@ -1113,6 +1113,26 @@ def _format_attrs(self):
         """
         return format_object_attrs(self)
 
+    def to_flat_index(self):
+        """
+        Identity method.
+
+        .. versionadded:: 0.24.0
+
+        This is implemented for compatability with subclass implementations
+        when chaining.
+
+        Returns
+        -------
+        pd.Index
+            Caller.
+
+        See Also
+        --------
+        MultiIndex.to_flat_index : Subclass implementation.
+        """
+        return self
+
     def to_series(self, index=None, name=None):
         """
         Create a Series with both index and values equal to the index keys
diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py
@@ -193,6 +193,7 @@ class MultiIndex(Index):
     set_levels
     set_labels
     to_frame
+    to_flat_index
     is_lexsorted
     sortlevel
     droplevel
@@ -1246,6 +1247,34 @@ def to_hierarchical(self, n_repeat, n_shuffle=1):
                       FutureWarning, stacklevel=2)
         return MultiIndex(levels=levels, labels=labels, names=names)
 
+    def to_flat_index(self):
+        """
+        Convert a MultiIndex to an Index of Tuples containing the level values.
+
+        .. versionadded:: 0.24.0
+
+        Returns
+        -------
+        pd.Index
+            Index with the MultiIndex data represented in Tuples.
+
+        Notes
+        -----
+        This method will simply return the caller if called by anything other
+        than a MultiIndex.
+
+        Examples
+        --------
+        >>> index = pd.MultiIndex.from_product(
+        ...     [['foo', 'bar'], ['baz', 'qux']],
+        ...     names=['a', 'b'])
+        >>> index.to_flat_index()
+        Index([('foo', 'baz'), ('foo', 'qux'),
+               ('bar', 'baz'), ('bar', 'qux')],
+              dtype='object')
+        """
+        return Index(self.values, tupleize_cols=False)
+
     @property
     def is_all_dates(self):
         return False
diff --git a/pandas/core/series.py b/pandas/core/series.py
@@ -3970,19 +3970,6 @@ def to_csv(self, *args, **kwargs):
             kwargs["header"] = False  # Backwards compatibility.
         return self.to_frame().to_csv(**kwargs)
 
-    @Appender(generic._shared_docs['to_excel'] % _shared_doc_kwargs)
-    def to_excel(self, excel_writer, sheet_name='Sheet1', na_rep='',
-                 float_format=None, columns=None, header=True, index=True,
-                 index_label=None, startrow=0, startcol=0, engine=None,
-                 merge_cells=True, encoding=None, inf_rep='inf', verbose=True):
-        df = self.to_frame()
-        df.to_excel(excel_writer=excel_writer, sheet_name=sheet_name,
-                    na_rep=na_rep, float_format=float_format, columns=columns,
-                    header=header, index=index, index_label=index_label,
-                    startrow=startrow, startcol=startcol, engine=engine,
-                    merge_cells=merge_cells, encoding=encoding,
-                    inf_rep=inf_rep, verbose=verbose)
-
     @Appender(generic._shared_docs['isna'] % _shared_doc_kwargs)
     def isna(self):
         return super(Series, self).isna()
diff --git a/pandas/io/excel.py b/pandas/io/excel.py
@@ -634,14 +634,17 @@ def _parse_cell(cell_contents, cell_typ):
                 else:
                     offset = 1 + max(header)
 
-                for col in index_col:
-                    last = data[offset][col]
-
-                    for row in range(offset + 1, len(data)):
-                        if data[row][col] == '' or data[row][col] is None:
-                            data[row][col] = last
-                        else:
-                            last = data[row][col]
+                # Check if we have an empty dataset
+                # before trying to collect data.
+                if offset < len(data):
+                    for col in index_col:
+                        last = data[offset][col]
+
+                        for row in range(offset + 1, len(data)):
+                            if data[row][col] == '' or data[row][col] is None:
+                                data[row][col] = last
+                            else:
+                                last = data[row][col]
 
             has_index_names = is_list_like(header) and len(header) > 1
 
diff --git a/pandas/tests/indexes/multi/test_conversion.py b/pandas/tests/indexes/multi/test_conversion.py
@@ -170,3 +170,11 @@ def test_to_series_with_arguments(idx):
     assert s.values is not idx.values
     assert s.index is not idx
     assert s.name != idx.name
+
+
+def test_to_flat_index(idx):
+    expected = pd.Index((('foo', 'one'), ('foo', 'two'), ('bar', 'one'),
+                         ('baz', 'two'), ('qux', 'one'), ('qux', 'two')),
+                        tupleize_cols=False)
+    result = idx.to_flat_index()
+    tm.assert_index_equal(result, expected)
diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py
@@ -2266,6 +2266,14 @@ def test_tab_complete_warning(self, ip):
             with provisionalcompleter('ignore'):
                 list(ip.Completer.completions('idx.', 4))
 
+    def test_to_flat_index(self, indices):
+        # 22866
+        if isinstance(indices, MultiIndex):
+            pytest.skip("Separate expectation for MultiIndex")
+
+        result = indices.to_flat_index()
+        tm.assert_index_equal(result, indices)
+
 
 class TestMixedIntIndex(Base):
     # Mostly the tests from common.py for which the results differ
diff --git a/pandas/tests/io/data/test1.xls b/pandas/tests/io/data/test1.xls
diff --git a/pandas/tests/io/data/test1.xlsm b/pandas/tests/io/data/test1.xlsm
diff --git a/pandas/tests/io/data/test1.xlsx b/pandas/tests/io/data/test1.xlsx
diff --git a/pandas/tests/io/test_excel.py b/pandas/tests/io/test_excel.py
@@ -235,6 +235,16 @@ def test_index_col_label_error(self, ext):
             self.get_exceldf("test1", ext, "Sheet1", index_col=["A"],
                              usecols=["A", "C"])
 
+    def test_index_col_empty(self, ext):
+        # see gh-9208
+        result = self.get_exceldf("test1", ext, "Sheet3",
+                                  index_col=["A", "B", "C"])
+        expected = DataFrame(columns=["D", "E", "F"],
+                             index=MultiIndex(levels=[[]] * 3,
+                                              labels=[[]] * 3,
+                                              names=["A", "B", "C"]))
+        tm.assert_frame_equal(result, expected)
+
     def test_usecols_pass_non_existent_column(self, ext):
         msg = ("Usecols do not match columns, "
                "columns expected but not found: " + r"\['E'\]")