diff --git a/ci/deps/travis-36-cov.yaml b/ci/deps/travis-36-cov.yaml index fead806fc8e1c..6f85c32b9a915 100644 --- a/ci/deps/travis-36-cov.yaml +++ b/ci/deps/travis-36-cov.yaml @@ -16,6 +16,7 @@ dependencies: - nomkl - numexpr - numpy=1.15.* + - odfpy - openpyxl - pandas-gbq # https://github.com/pydata/pandas-gbq/issues/271 diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 9af6c36cc4e4d..bf7ec561b4a7e 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -32,6 +32,7 @@ The pandas I/O API is a set of top level ``reader`` functions accessed like text;`HTML `__;:ref:`read_html`;:ref:`to_html` text; Local clipboard;:ref:`read_clipboard`;:ref:`to_clipboard` binary;`MS Excel `__;:ref:`read_excel`;:ref:`to_excel` + binary;`OpenDocument `__;:ref:`read_excel`; binary;`HDF5 Format `__;:ref:`read_hdf`;:ref:`to_hdf` binary;`Feather Format `__;:ref:`read_feather`;:ref:`to_feather` binary;`Parquet Format `__;:ref:`read_parquet`;:ref:`to_parquet` @@ -2779,9 +2780,10 @@ parse HTML tables in the top-level pandas io function ``read_html``. Excel files ----------- -The :func:`~pandas.read_excel` method can read Excel 2003 (``.xls``) and -Excel 2007+ (``.xlsx``) files using the ``xlrd`` Python -module. The :meth:`~DataFrame.to_excel` instance method is used for +The :func:`~pandas.read_excel` method can read Excel 2003 (``.xls``) +files using the ``xlrd`` Python module. Excel 2007+ (``.xlsx``) files +can be read using either ``xlrd`` or ``openpyxl``. +The :meth:`~DataFrame.to_excel` instance method is used for saving a ``DataFrame`` to Excel. Generally the semantics are similar to working with :ref:`csv` data. See the :ref:`cookbook` for some advanced strategies. @@ -3217,7 +3219,27 @@ The look and feel of Excel worksheets created from pandas can be modified using * ``float_format`` : Format string for floating point numbers (default ``None``). * ``freeze_panes`` : A tuple of two integers representing the bottommost row and rightmost column to freeze. Each of these parameters is one-based, so (1, 1) will freeze the first row and first column (default ``None``). +.. _io.ods: +OpenDocument Spreadsheets +------------------------- + +.. versionadded:: 0.25 + +The :func:`~pandas.read_excel` method can also read OpenDocument spreadsheets +using the ``odfpy`` module. The semantics and features for reading +OpenDocument spreadsheets match what can be done for `Excel files`_ using +``engine='odf'``. + +.. code-block:: python + + # Returns a DataFrame + pd.read_excel('path_to_file.ods', engine='odf') + +.. note:: + + Currently pandas only supports *reading* OpenDocument spreadsheets. Writing + is not implemented. .. _io.clipboard: diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 2030bb4d974c3..35e9fe5706b31 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -164,6 +164,7 @@ Other enhancements - Added new option ``plotting.backend`` to be able to select a plotting backend different than the existing ``matplotlib`` one. Use ``pandas.set_option('plotting.backend', '')`` where ``` for more details (:issue:`9070`) .. _whatsnew_0250.api_breaking: diff --git a/pandas/compat/_optional.py b/pandas/compat/_optional.py index 31746dc3d6c16..620884d66821c 100644 --- a/pandas/compat/_optional.py +++ b/pandas/compat/_optional.py @@ -13,6 +13,7 @@ "lxml.etree": "3.8.0", "matplotlib": "2.2.2", "numexpr": "2.6.2", + "odfpy": "1.3.0", "openpyxl": "2.4.8", "pandas_gbq": "0.8.0", "pyarrow": "0.9.0", diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index 84ca154d045fe..7fe9f8438ac74 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -422,6 +422,7 @@ def use_inf_as_na_cb(key): _xls_options = ['xlrd'] _xlsm_options = ['xlrd', 'openpyxl'] _xlsx_options = ['xlrd', 'openpyxl'] +_ods_options = ['odf'] with cf.config_prefix("io.excel.xls"): @@ -447,6 +448,14 @@ def use_inf_as_na_cb(key): validator=str) +with cf.config_prefix("io.excel.ods"): + cf.register_option("reader", "auto", + reader_engine_doc.format( + ext='ods', + others=', '.join(_ods_options)), + validator=str) + + # Set up the io.excel specific writer configuration. writer_engine_doc = """ : string diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 8055b6609b1c4..d10a40541bb6c 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -768,12 +768,14 @@ class ExcelFile: Acceptable values are None or ``xlrd``. """ - from pandas.io.excel._xlrd import _XlrdReader + from pandas.io.excel._odfreader import _ODFReader from pandas.io.excel._openpyxl import _OpenpyxlReader + from pandas.io.excel._xlrd import _XlrdReader _engines = { 'xlrd': _XlrdReader, 'openpyxl': _OpenpyxlReader, + 'odf': _ODFReader, } def __init__(self, io, engine=None): diff --git a/pandas/io/excel/_odfreader.py b/pandas/io/excel/_odfreader.py new file mode 100644 index 0000000000000..c820c1497c3c9 --- /dev/null +++ b/pandas/io/excel/_odfreader.py @@ -0,0 +1,176 @@ +from typing import List + +from pandas.compat._optional import import_optional_dependency + +import pandas as pd +from pandas._typing import FilePathOrBuffer, Scalar + +from pandas.io.excel._base import _BaseExcelReader + + +class _ODFReader(_BaseExcelReader): + """Read tables out of OpenDocument formatted files + + Parameters + ---------- + filepath_or_buffer: string, path to be parsed or + an open readable stream. + """ + def __init__(self, filepath_or_buffer: FilePathOrBuffer): + import_optional_dependency("odf") + super().__init__(filepath_or_buffer) + + @property + def _workbook_class(self): + from odf.opendocument import OpenDocument + return OpenDocument + + def load_workbook(self, filepath_or_buffer: FilePathOrBuffer): + from odf.opendocument import load + return load(filepath_or_buffer) + + @property + def empty_value(self) -> str: + """Property for compat with other readers.""" + return '' + + @property + def sheet_names(self) -> List[str]: + """Return a list of sheet names present in the document""" + from odf.table import Table + + tables = self.book.getElementsByType(Table) + return [t.getAttribute("name") for t in tables] + + def get_sheet_by_index(self, index: int): + from odf.table import Table + tables = self.book.getElementsByType(Table) + return tables[index] + + def get_sheet_by_name(self, name: str): + from odf.table import Table + + tables = self.book.getElementsByType(Table) + + for table in tables: + if table.getAttribute("name") == name: + return table + + raise ValueError("sheet {name} not found".format(name)) + + def get_sheet_data(self, sheet, convert_float: bool) -> List[List[Scalar]]: + """Parse an ODF Table into a list of lists + """ + from odf.table import CoveredTableCell, TableCell, TableRow + + covered_cell_name = CoveredTableCell().qname + table_cell_name = TableCell().qname + cell_names = {covered_cell_name, table_cell_name} + + sheet_rows = sheet.getElementsByType(TableRow) + empty_rows = 0 + max_row_len = 0 + + table = [] # type: List[List[Scalar]] + + for i, sheet_row in enumerate(sheet_rows): + sheet_cells = [x for x in sheet_row.childNodes + if x.qname in cell_names] + empty_cells = 0 + table_row = [] # type: List[Scalar] + + for j, sheet_cell in enumerate(sheet_cells): + if sheet_cell.qname == table_cell_name: + value = self._get_cell_value(sheet_cell, convert_float) + else: + value = self.empty_value + + column_repeat = self._get_column_repeat(sheet_cell) + + # Queue up empty values, writing only if content succeeds them + if value == self.empty_value: + empty_cells += column_repeat + else: + table_row.extend([self.empty_value] * empty_cells) + empty_cells = 0 + table_row.extend([value] * column_repeat) + + if max_row_len < len(table_row): + max_row_len = len(table_row) + + row_repeat = self._get_row_repeat(sheet_row) + if self._is_empty_row(sheet_row): + empty_rows += row_repeat + else: + # add blank rows to our table + table.extend([[self.empty_value]] * empty_rows) + empty_rows = 0 + for _ in range(row_repeat): + table.append(table_row) + + # Make our table square + for row in table: + if len(row) < max_row_len: + row.extend([self.empty_value] * (max_row_len - len(row))) + + return table + + def _get_row_repeat(self, row) -> int: + """Return number of times this row was repeated + Repeating an empty row appeared to be a common way + of representing sparse rows in the table. + """ + from odf.namespaces import TABLENS + + return int(row.attributes.get((TABLENS, 'number-rows-repeated'), 1)) + + def _get_column_repeat(self, cell) -> int: + from odf.namespaces import TABLENS + return int(cell.attributes.get( + (TABLENS, 'number-columns-repeated'), 1)) + + def _is_empty_row(self, row) -> bool: + """Helper function to find empty rows + """ + for column in row.childNodes: + if len(column.childNodes) > 0: + return False + + return True + + def _get_cell_value(self, cell, convert_float: bool) -> Scalar: + from odf.namespaces import OFFICENS + cell_type = cell.attributes.get((OFFICENS, 'value-type')) + if cell_type == 'boolean': + if str(cell) == "TRUE": + return True + return False + if cell_type is None: + return self.empty_value + elif cell_type == 'float': + # GH5394 + cell_value = float(cell.attributes.get((OFFICENS, 'value'))) + + if cell_value == 0. and str(cell) != cell_value: # NA handling + return str(cell) + + if convert_float: + val = int(cell_value) + if val == cell_value: + return val + return cell_value + elif cell_type == 'percentage': + cell_value = cell.attributes.get((OFFICENS, 'value')) + return float(cell_value) + elif cell_type == 'string': + return str(cell) + elif cell_type == 'currency': + cell_value = cell.attributes.get((OFFICENS, 'value')) + return float(cell_value) + elif cell_type == 'date': + cell_value = cell.attributes.get((OFFICENS, 'date-value')) + return pd.to_datetime(cell_value) + elif cell_type == 'time': + return pd.to_datetime(str(cell)).time() + else: + raise ValueError('Unrecognized type {}'.format(cell_type)) diff --git a/pandas/tests/io/data/blank.ods b/pandas/tests/io/data/blank.ods new file mode 100644 index 0000000000000..7ded3c3c1d688 Binary files /dev/null and b/pandas/tests/io/data/blank.ods differ diff --git a/pandas/tests/io/data/blank_with_header.ods b/pandas/tests/io/data/blank_with_header.ods new file mode 100644 index 0000000000000..0a2e696267fda Binary files /dev/null and b/pandas/tests/io/data/blank_with_header.ods differ diff --git a/pandas/tests/io/data/invalid_value_type.ods b/pandas/tests/io/data/invalid_value_type.ods new file mode 100644 index 0000000000000..75a7a40b25d79 Binary files /dev/null and b/pandas/tests/io/data/invalid_value_type.ods differ diff --git a/pandas/tests/io/data/test1.ods b/pandas/tests/io/data/test1.ods new file mode 100644 index 0000000000000..5dc0e83456264 Binary files /dev/null and b/pandas/tests/io/data/test1.ods differ diff --git a/pandas/tests/io/data/test2.ods b/pandas/tests/io/data/test2.ods new file mode 100644 index 0000000000000..2a90db839026b Binary files /dev/null and b/pandas/tests/io/data/test2.ods differ diff --git a/pandas/tests/io/data/test3.ods b/pandas/tests/io/data/test3.ods new file mode 100644 index 0000000000000..dc78781caa6e9 Binary files /dev/null and b/pandas/tests/io/data/test3.ods differ diff --git a/pandas/tests/io/data/test4.ods b/pandas/tests/io/data/test4.ods new file mode 100644 index 0000000000000..c73a20d8b0562 Binary files /dev/null and b/pandas/tests/io/data/test4.ods differ diff --git a/pandas/tests/io/data/test5.ods b/pandas/tests/io/data/test5.ods new file mode 100644 index 0000000000000..5872e2624d033 Binary files /dev/null and b/pandas/tests/io/data/test5.ods differ diff --git a/pandas/tests/io/data/test_converters.ods b/pandas/tests/io/data/test_converters.ods new file mode 100644 index 0000000000000..0216fb16311d8 Binary files /dev/null and b/pandas/tests/io/data/test_converters.ods differ diff --git a/pandas/tests/io/data/test_index_name_pre17.ods b/pandas/tests/io/data/test_index_name_pre17.ods new file mode 100644 index 0000000000000..56638c983d944 Binary files /dev/null and b/pandas/tests/io/data/test_index_name_pre17.ods differ diff --git a/pandas/tests/io/data/test_multisheet.ods b/pandas/tests/io/data/test_multisheet.ods new file mode 100644 index 0000000000000..39058e67b4d5b Binary files /dev/null and b/pandas/tests/io/data/test_multisheet.ods differ diff --git a/pandas/tests/io/data/test_squeeze.ods b/pandas/tests/io/data/test_squeeze.ods new file mode 100644 index 0000000000000..10ccf0da2693e Binary files /dev/null and b/pandas/tests/io/data/test_squeeze.ods differ diff --git a/pandas/tests/io/data/test_types.ods b/pandas/tests/io/data/test_types.ods new file mode 100644 index 0000000000000..c9a82bfff810b Binary files /dev/null and b/pandas/tests/io/data/test_types.ods differ diff --git a/pandas/tests/io/data/testdateoverflow.ods b/pandas/tests/io/data/testdateoverflow.ods new file mode 100644 index 0000000000000..bb05267865303 Binary files /dev/null and b/pandas/tests/io/data/testdateoverflow.ods differ diff --git a/pandas/tests/io/data/testdtype.ods b/pandas/tests/io/data/testdtype.ods new file mode 100644 index 0000000000000..91145f807c9d9 Binary files /dev/null and b/pandas/tests/io/data/testdtype.ods differ diff --git a/pandas/tests/io/data/testmultiindex.ods b/pandas/tests/io/data/testmultiindex.ods new file mode 100644 index 0000000000000..b7f03900e6617 Binary files /dev/null and b/pandas/tests/io/data/testmultiindex.ods differ diff --git a/pandas/tests/io/data/testskiprows.ods b/pandas/tests/io/data/testskiprows.ods new file mode 100644 index 0000000000000..443602a2c3f98 Binary files /dev/null and b/pandas/tests/io/data/testskiprows.ods differ diff --git a/pandas/tests/io/data/times_1900.ods b/pandas/tests/io/data/times_1900.ods new file mode 100644 index 0000000000000..79e031c721ea3 Binary files /dev/null and b/pandas/tests/io/data/times_1900.ods differ diff --git a/pandas/tests/io/data/times_1904.ods b/pandas/tests/io/data/times_1904.ods new file mode 100644 index 0000000000000..b47a949d3b715 Binary files /dev/null and b/pandas/tests/io/data/times_1904.ods differ diff --git a/pandas/tests/io/data/writertable.odt b/pandas/tests/io/data/writertable.odt new file mode 100644 index 0000000000000..113bd651e8cd0 Binary files /dev/null and b/pandas/tests/io/data/writertable.odt differ diff --git a/pandas/tests/io/excel/conftest.py b/pandas/tests/io/excel/conftest.py index 935db254bd2e5..dd96fb2366152 100644 --- a/pandas/tests/io/excel/conftest.py +++ b/pandas/tests/io/excel/conftest.py @@ -30,7 +30,7 @@ def df_ref(): return df_ref -@pytest.fixture(params=['.xls', '.xlsx', '.xlsm']) +@pytest.fixture(params=['.xls', '.xlsx', '.xlsm', '.ods']) def read_ext(request): """ Valid extensions for reading Excel files. diff --git a/pandas/tests/io/excel/test_odf.py b/pandas/tests/io/excel/test_odf.py new file mode 100644 index 0000000000000..76b3fe19a0771 --- /dev/null +++ b/pandas/tests/io/excel/test_odf.py @@ -0,0 +1,39 @@ +import functools + +import numpy as np +import pytest + +import pandas as pd +import pandas.util.testing as tm + +pytest.importorskip("odf") + + +@pytest.fixture(autouse=True) +def cd_and_set_engine(monkeypatch, datapath): + func = functools.partial(pd.read_excel, engine="odf") + monkeypatch.setattr(pd, 'read_excel', func) + monkeypatch.chdir(datapath("io", "data")) + + +def test_read_invalid_types_raises(): + # the invalid_value_type.ods required manually editing + # of the included content.xml file + with pytest.raises(ValueError, + match="Unrecognized type awesome_new_type"): + pd.read_excel("invalid_value_type.ods") + + +def test_read_writer_table(): + # Also test reading tables from an text OpenDocument file + # (.odt) + index = pd.Index(["Row 1", "Row 2", "Row 3"], name="Header") + expected = pd.DataFrame([ + [1, np.nan, 7], + [2, np.nan, 8], + [3, np.nan, 9], + ], index=index, columns=["Column 1", "Unnamed: 2", "Column 3"]) + + result = pd.read_excel("writertable.odt", 'Table1', index_col=0) + + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index be5951fe12b46..ae69c2302e60a 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -33,9 +33,21 @@ def ignore_xlrd_time_clock_warning(): @pytest.fixture(params=[ # Add any engines to test here - pytest.param('xlrd', marks=td.skip_if_no('xlrd')), - pytest.param('openpyxl', marks=td.skip_if_no('openpyxl')), - pytest.param(None, marks=td.skip_if_no('xlrd')), + # When defusedxml is installed it triggers deprecation warnings for + # xlrd and openpyxl, so catch those here + pytest.param('xlrd', marks=[ + td.skip_if_no('xlrd'), + pytest.mark.filterwarnings("ignore:.*(tree\\.iter|html argument)"), + ]), + pytest.param('openpyxl', marks=[ + td.skip_if_no('openpyxl'), + pytest.mark.filterwarnings("ignore:.*html argument"), + ]), + pytest.param(None, marks=[ + td.skip_if_no('xlrd'), + pytest.mark.filterwarnings("ignore:.*(tree\\.iter|html argument)"), + ]), + pytest.param("odf", marks=td.skip_if_no("odf")), ]) def engine(request): """ @@ -53,6 +65,11 @@ def cd_and_set_engine(self, engine, datapath, monkeypatch, read_ext): """ if engine == 'openpyxl' and read_ext == '.xls': pytest.skip() + if engine == 'odf' and read_ext != '.ods': + pytest.skip() + if read_ext == ".ods" and engine != "odf": + pytest.skip() + func = partial(pd.read_excel, engine=engine) monkeypatch.chdir(datapath("io", "data")) monkeypatch.setattr(pd, 'read_excel', func) @@ -62,14 +79,16 @@ def test_usecols_int(self, read_ext, df_ref): # usecols as int with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): + check_stacklevel=False, + raise_on_extra_warnings=False): with ignore_xlrd_time_clock_warning(): df1 = pd.read_excel("test1" + read_ext, "Sheet1", index_col=0, usecols=3) # usecols as int with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): + check_stacklevel=False, + raise_on_extra_warnings=False): with ignore_xlrd_time_clock_warning(): df2 = pd.read_excel("test1" + read_ext, "Sheet2", skiprows=[1], index_col=0, usecols=3) @@ -439,6 +458,9 @@ def test_bad_engine_raises(self, read_ext): @tm.network def test_read_from_http_url(self, read_ext): + if read_ext == '.ods': # TODO: remove once on master + pytest.skip() + url = ('https://raw.github.com/pandas-dev/pandas/master/' 'pandas/tests/io/data/test1' + read_ext) url_table = pd.read_excel(url) @@ -736,6 +758,10 @@ def cd_and_set_engine(self, engine, datapath, monkeypatch, read_ext): """ Change directory and set engine for ExcelFile objects. """ + if engine == 'odf' and read_ext != '.ods': + pytest.skip() + if read_ext == ".ods" and engine != "odf": + pytest.skip() if engine == 'openpyxl' and read_ext == '.xls': pytest.skip() @@ -802,7 +828,8 @@ def test_excel_table_sheet_by_index(self, read_ext, df_ref): df3 = pd.read_excel(excel, 0, index_col=0, skipfooter=1) tm.assert_frame_equal(df3, df1.iloc[:-1]) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False, + raise_on_extra_warnings=False): with pd.ExcelFile('test1' + read_ext) as excel: df4 = pd.read_excel(excel, 0, index_col=0, skip_footer=1) diff --git a/pandas/tests/io/excel/test_xlrd.py b/pandas/tests/io/excel/test_xlrd.py index 94e1435d4dfab..d749f0ec3e252 100644 --- a/pandas/tests/io/excel/test_xlrd.py +++ b/pandas/tests/io/excel/test_xlrd.py @@ -10,6 +10,12 @@ xlwt = pytest.importorskip("xlwt") +@pytest.fixture(autouse=True) +def skip_ods_files(read_ext): + if read_ext == ".ods": + pytest.skip("Not valid for xlrd") + + def test_read_xlrd_book(read_ext, frame): df = frame