-
-
Notifications
You must be signed in to change notification settings - Fork 18.7k
DEPR: Adjust read excel behavior for xlrd >= 2.0 #38571
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 1 commit
06d79b9
e2c29ff
f56970b
e992049
13cd483
792b53a
266b18b
56cf956
e1878a0
0146258
00bfba3
05c0725
4a8801d
59e0cb9
8f8b74e
f286497
8c95acd
4619363
05a6f09
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -4,8 +4,9 @@ | |
from io import BufferedIOBase, BytesIO, RawIOBase | ||
import os | ||
from textwrap import fill | ||
from typing import Any, Dict, Mapping, Union, cast | ||
from typing import Any, Dict, Mapping, Optional, Union, cast | ||
import warnings | ||
import zipfile | ||
|
||
from pandas._config import config | ||
|
||
|
@@ -111,22 +112,19 @@ | |
- "pyxlsb" supports Binary Excel files. | ||
|
||
.. versionchanged:: 1.2.0 | ||
The engine `xlrd <https://xlrd.readthedocs.io/en/latest/>`_ | ||
is no longer maintained, and is not supported with | ||
python >= 3.9. When ``engine=None``, the following logic will be | ||
used to determine the engine. | ||
|
||
- If ``path_or_buffer`` is an OpenDocument format (.odf, .ods, .odt), | ||
then `odf <https://pypi.org/project/odfpy/>`_ will be used. | ||
- Otherwise if ``path_or_buffer`` is a bytes stream, the file has the | ||
extension ``.xls``, or is an ``xlrd`` Book instance, then ``xlrd`` will | ||
be used. | ||
- Otherwise if `openpyxl <https://pypi.org/project/openpyxl/>`_ is installed, | ||
then ``openpyxl`` will be used. | ||
- Otherwise ``xlrd`` will be used and a ``FutureWarning`` will be raised. | ||
|
||
Specifying ``engine="xlrd"`` will continue to be allowed for the | ||
indefinite future. | ||
|
||
The engine `xlrd <https://xlrd.readthedocs.io/en/latest/>`_ only reads | ||
xls files in version 2.0 and above. When ``engine=None``, | ||
the following logic will be used to determine the engine. | ||
|
||
- If ``path_or_buffer`` is an OpenDocument format (.odf, .ods, .odt), | ||
then `odf <https://pypi.org/project/odfpy/>`_ will be used. | ||
- Otherwise if ``path_or_buffer`` is an xls-style format, | ||
``xlrd`` will be used. | ||
- Otherwise if `openpyxl <https://pypi.org/project/openpyxl/>`_ is installed, | ||
then ``openpyxl`` will be used. | ||
- Otherwise if ``xlrd >= 2.0`` is installed, a ``ValueError`` will be raised. | ||
- Otherwise ``xlrd`` will be used and a ``FutureWarning`` will be raised. | ||
|
||
converters : dict, default None | ||
Dict of functions for converting values in certain columns. Keys can | ||
|
@@ -888,39 +886,93 @@ def close(self): | |
return content | ||
|
||
|
||
def _is_ods_stream(stream: Union[BufferedIOBase, RawIOBase]) -> bool: | ||
XLS_SIGNATURE = b"\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1" | ||
ZIP_SIGNATURE = b"PK\x03\x04" | ||
PEEK_SIZE = max(len(XLS_SIGNATURE), len(ZIP_SIGNATURE)) | ||
|
||
|
||
def inspect_excel_format( | ||
path: Optional[str] = None, | ||
content: Union[None, BufferedIOBase, RawIOBase, bytes] = None, | ||
) -> Optional[str]: | ||
""" | ||
Check if the stream is an OpenDocument Spreadsheet (.ods) file | ||
Inspect the path or content of an excel file. | ||
|
||
It uses magic values inside the stream | ||
Adopted from xlrd: https://github.com/python-excel/xlrd. | ||
|
||
Parameters | ||
---------- | ||
stream : Union[BufferedIOBase, RawIOBase] | ||
IO stream with data which might be an ODS file | ||
path : str, optional | ||
Path to file to inspect. May be a URL. | ||
content : file-like object | ||
Content of file to inspect. | ||
|
||
Returns | ||
------- | ||
is_ods : bool | ||
Boolean indication that this is indeed an ODS file or not | ||
str | ||
Format of file. Returns the extension if path is a URL. | ||
""" | ||
stream.seek(0) | ||
is_ods = False | ||
if stream.read(4) == b"PK\003\004": | ||
stream.seek(30) | ||
is_ods = ( | ||
stream.read(54) == b"mimetype" | ||
b"application/vnd.oasis.opendocument.spreadsheet" | ||
) | ||
stream.seek(0) | ||
return is_ods | ||
if content is not None: | ||
if isinstance(content, bytes): | ||
peek = content[:PEEK_SIZE] | ||
else: | ||
buf = content.read(PEEK_SIZE) | ||
if buf is None: | ||
raise ValueError("File is empty") | ||
else: | ||
peek = buf | ||
elif path is not None: | ||
try: | ||
with open(path, "rb") as f: | ||
buf = f.read(PEEK_SIZE) | ||
if buf is None: | ||
raise ValueError("File is empty") | ||
else: | ||
peek = buf | ||
except FileNotFoundError: | ||
# File may be a url, return the extension | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why provide degraded inference just because the source is a URL? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Previous ODS code read at most 84 bytes; current code needs the entire file. I'll do a partial revert here and utilize the previous ODS code, but I have reservations about downloading the entire file here. Would like to hear others' thoughts. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ah; I think I have falsely assumed we need to get the entire contents of the file. I think it should be possible to get BufferedIOBase/RawIOBase into the proper form for ZipFile without reading. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I did put quite a lot of effort into #38424; the code in that PR is the way it is from, as carefully as I could, following through the various code paths and ensuring the behaviour was as simple and robust as it could be, in spite of less automated testing than I was expecting. It's tough to see these kind of comments which sort of imply that I hadn't thought any of this through... There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. In no way shape or form did I have any intention of implying such a thing. As my comment above says, I was mistaken. |
||
return os.path.splitext(path)[-1][1:] | ||
else: | ||
raise ValueError("content or path must not be ") | ||
|
||
if peek.startswith(XLS_SIGNATURE): | ||
return "xls" | ||
|
||
if peek.startswith(ZIP_SIGNATURE): | ||
if content: | ||
# ZipFile requires IO[bytes] | ||
if isinstance(content, bytes): | ||
as_bytesio = BytesIO(content) | ||
else: | ||
buf = content.read() | ||
if buf is None: | ||
raise ValueError("File is empty") | ||
else: | ||
as_bytesio = BytesIO(buf) | ||
zf = zipfile.ZipFile(as_bytesio) | ||
else: | ||
assert path is not None | ||
zf = zipfile.ZipFile(path) | ||
|
||
# Workaround for some third party files that use forward slashes and | ||
# lower case names. | ||
component_names = [name.replace("\\", "/").lower() for name in zf.namelist()] | ||
|
||
if "xl/workbook.xml" in component_names: | ||
return "xlsx" | ||
if "xl/workbook.bin" in component_names: | ||
return "xlsb" | ||
if "content.xml" in component_names: | ||
return "ods" | ||
return "zip" | ||
return None | ||
|
||
|
||
class ExcelFile: | ||
""" | ||
Class for parsing tabular excel sheets into DataFrame objects. | ||
|
||
Uses xlrd engine by default. See read_excel for more documentation | ||
See read_excel for more documentation. | ||
|
||
Parameters | ||
---------- | ||
|
@@ -940,22 +992,18 @@ class ExcelFile: | |
|
||
.. versionchanged:: 1.2.0 | ||
|
||
The engine `xlrd <https://xlrd.readthedocs.io/en/latest/>`_ | ||
is no longer maintained, and is not supported with | ||
python >= 3.9. When ``engine=None``, the following logic will be | ||
used to determine the engine. | ||
The engine `xlrd <https://xlrd.readthedocs.io/en/latest/>`_ only reads | ||
xls files in version 2.0 and above. When ``engine=None``, | ||
the following logic will be used to determine the engine. | ||
|
||
- If ``path_or_buffer`` is an OpenDocument format (.odf, .ods, .odt), | ||
then `odf <https://pypi.org/project/odfpy/>`_ will be used. | ||
- Otherwise if ``path_or_buffer`` is a bytes stream, the file has the | ||
extension ``.xls``, or is an ``xlrd`` Book instance, then ``xlrd`` | ||
will be used. | ||
- Otherwise if ``path_or_buffer`` is an xls-style format, | ||
``xlrd`` will be used. | ||
- Otherwise if `openpyxl <https://pypi.org/project/openpyxl/>`_ is installed, | ||
then ``openpyxl`` will be used. | ||
- Otherwise if ``xlrd >= 2.0`` is installed, a ``ValueError`` will be raised. | ||
- Otherwise ``xlrd`` will be used and a ``FutureWarning`` will be raised. | ||
|
||
Specifying ``engine="xlrd"`` will continue to be allowed for the | ||
indefinite future. | ||
""" | ||
|
||
from pandas.io.excel._odfreader import ODFReader | ||
|
@@ -973,40 +1021,46 @@ class ExcelFile: | |
def __init__( | ||
self, path_or_buffer, engine=None, storage_options: StorageOptions = None | ||
): | ||
if engine is None: | ||
# Determine ext and use odf for ods stream/file | ||
if isinstance(path_or_buffer, (BufferedIOBase, RawIOBase)): | ||
ext = None | ||
if _is_ods_stream(path_or_buffer): | ||
engine = "odf" | ||
else: | ||
ext = os.path.splitext(str(path_or_buffer))[-1] | ||
if ext == ".ods": | ||
engine = "odf" | ||
if engine is not None and engine not in self._engines: | ||
raise ValueError(f"Unknown engine: {engine}") | ||
|
||
if ( | ||
import_optional_dependency( | ||
"xlrd", raise_on_missing=False, on_version="ignore" | ||
) | ||
is not None | ||
): | ||
from xlrd import Book | ||
# Determine xlrd version if installed | ||
if ( | ||
import_optional_dependency( | ||
"xlrd", raise_on_missing=False, on_version="ignore" | ||
) | ||
is None | ||
): | ||
xlrd_version = None | ||
else: | ||
import xlrd | ||
|
||
if isinstance(path_or_buffer, Book): | ||
engine = "xlrd" | ||
xlrd_version = xlrd.__version__ | ||
|
||
# GH 35029 - Prefer openpyxl except for xls files | ||
if engine is None: | ||
if ext is None or isinstance(path_or_buffer, bytes) or ext == ".xls": | ||
engine = "xlrd" | ||
elif ( | ||
if isinstance(path_or_buffer, (BufferedIOBase, RawIOBase, bytes)): | ||
ext = inspect_excel_format(content=path_or_buffer) | ||
elif xlrd_version is not None and isinstance(path_or_buffer, xlrd.Book): | ||
ext = "xls" | ||
else: | ||
# path_or_buffer is path-like, use stringified path | ||
ext = inspect_excel_format(path=str(path_or_buffer)) | ||
|
||
if engine is None: | ||
if ext == "ods": | ||
engine = "odf" | ||
elif ext == "xls": | ||
engine = "xlrd" | ||
else: | ||
# GH 35029 - Prefer openpyxl except for xls files | ||
if ( | ||
import_optional_dependency( | ||
"openpyxl", raise_on_missing=False, on_version="ignore" | ||
) | ||
is not None | ||
): | ||
engine = "openpyxl" | ||
else: | ||
elif xlrd_version is not None and xlrd_version < "2": | ||
# If xlrd_version >= "2", we error below | ||
caller = inspect.stack()[1] | ||
if ( | ||
caller.filename.endswith("pandas/io/excel/_base.py") | ||
|
@@ -1016,19 +1070,26 @@ def __init__( | |
else: | ||
stacklevel = 2 | ||
warnings.warn( | ||
"The xlrd engine is no longer maintained and is not " | ||
"supported when using pandas with python >= 3.9. However, " | ||
"the engine xlrd will continue to be allowed for the " | ||
"indefinite future. Beginning with pandas 1.2.0, the " | ||
"openpyxl engine will be used if it is installed and the " | ||
"engine argument is not specified. Either install openpyxl " | ||
"or specify engine='xlrd' to silence this warning.", | ||
f"Your version of xlrd is {xlrd_version}. In xlrd >= 2.0, " | ||
f"only the xls format is supported. As a result, the " | ||
f"openpyxl engine will be used if it is installed " | ||
f"and the engine argument is not specified. Either install " | ||
f"openpyxl or specify engine='xlrd' to silence this warning.", | ||
jorisvandenbossche marked this conversation as resolved.
Show resolved
Hide resolved
|
||
FutureWarning, | ||
stacklevel=stacklevel, | ||
) | ||
engine = "xlrd" | ||
jorisvandenbossche marked this conversation as resolved.
Show resolved
Hide resolved
|
||
if engine not in self._engines: | ||
raise ValueError(f"Unknown engine: {engine}") | ||
jorisvandenbossche marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
if ( | ||
engine == "xlrd" | ||
and xlrd_version is not None | ||
and xlrd_version >= "2" | ||
jreback marked this conversation as resolved.
Show resolved
Hide resolved
|
||
and not ext == "xls" | ||
): | ||
raise ValueError( | ||
f"Your version of xlrd is {xlrd_version}. In xlrd >= 2.0, " | ||
f"only the xls format is supported. " | ||
) | ||
|
||
self.engine = engine | ||
self.storage_options = storage_options | ||
|
Uh oh!
There was an error while loading. Please reload this page.