From 7190290a4246bd1042664aff1aa5b18847d69987 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Torsten=20W=C3=B6rtwein?= Date: Mon, 2 Nov 2020 21:10:32 -0500 Subject: [PATCH 1/2] REF: move get_filepath_buffer into get_handle --- doc/source/user_guide/io.rst | 9 +- doc/source/whatsnew/v1.2.0.rst | 6 +- pandas/_typing.py | 5 - pandas/core/frame.py | 10 +- pandas/core/generic.py | 2 +- pandas/io/common.py | 198 ++++++++++---------- pandas/io/excel/_base.py | 79 ++++---- pandas/io/excel/_odfreader.py | 4 +- pandas/io/excel/_odswriter.py | 2 +- pandas/io/excel/_openpyxl.py | 11 +- pandas/io/excel/_pyxlsb.py | 2 +- pandas/io/excel/_xlrd.py | 2 +- pandas/io/excel/_xlsxwriter.py | 2 +- pandas/io/excel/_xlwt.py | 2 +- pandas/io/feather_format.py | 14 +- pandas/io/formats/csvs.py | 26 ++- pandas/io/json/_json.py | 60 ++---- pandas/io/orc.py | 8 +- pandas/io/parquet.py | 94 ++++++---- pandas/io/parsers.py | 207 ++++++++++----------- pandas/io/pickle.py | 25 +-- pandas/io/sas/sas7bdat.py | 11 +- pandas/io/sas/sas_xport.py | 15 +- pandas/io/sas/sasreader.py | 14 +- pandas/io/stata.py | 86 ++------- pandas/tests/io/excel/test_writers.py | 2 +- pandas/tests/io/formats/test_to_csv.py | 15 +- pandas/tests/io/parser/test_compression.py | 2 +- pandas/tests/io/test_common.py | 21 ++- pandas/tests/io/test_fsspec.py | 43 +++++ pandas/tests/io/test_gcs.py | 57 +++--- pandas/tests/io/test_parquet.py | 25 ++- 32 files changed, 503 insertions(+), 556 deletions(-) diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 1c271e74aafba..1bd35131622ab 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -1024,9 +1024,10 @@ Writing CSVs to binary file objects .. versionadded:: 1.2.0 -``df.to_csv(..., mode="w+b")`` allows writing a CSV to a file object -opened binary mode. For this to work, it is necessary that ``mode`` -contains a "b": +``df.to_csv(..., mode="wb")`` allows writing a CSV to a file object +opened binary mode. In most cases, it is not necessary to specify +``mode`` as Pandas will auto-detect whether the file object is +opened in text or binary mode. .. ipython:: python @@ -1034,7 +1035,7 @@ contains a "b": data = pd.DataFrame([0, 1, 2]) buffer = io.BytesIO() - data.to_csv(buffer, mode="w+b", encoding="utf-8", compression="gzip") + data.to_csv(buffer, encoding="utf-8", compression="gzip") .. _io.float_precision: diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 22a0fb7a45318..00349c2597f54 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -84,7 +84,8 @@ Support for binary file handles in ``to_csv`` :meth:`to_csv` supports file handles in binary mode (:issue:`19827` and :issue:`35058`) with ``encoding`` (:issue:`13068` and :issue:`23854`) and ``compression`` (:issue:`22555`). -``mode`` has to contain a ``b`` for binary handles to be supported. +If Pandas does not automatically detect whether the file handle is opened in binary or text mode, +it is necessary to provide ``mode="wb"``. 
For example: @@ -94,7 +95,7 @@ For example: data = pd.DataFrame([0, 1, 2]) buffer = io.BytesIO() - data.to_csv(buffer, mode="w+b", encoding="utf-8", compression="gzip") + data.to_csv(buffer, encoding="utf-8", compression="gzip") Support for short caption and table position in ``to_latex`` ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -514,6 +515,7 @@ I/O - :func:`read_csv` was closing user-provided binary file handles when ``engine="c"`` and an ``encoding`` was requested (:issue:`36980`) - Bug in :meth:`DataFrame.to_hdf` was not dropping missing rows with ``dropna=True`` (:issue:`35719`) - Bug in :func:`read_html` was raising a ``TypeError`` when supplying a ``pathlib.Path`` argument to the ``io`` parameter (:issue:`37705`) +- :meth:`to_excel` and :meth:`to_markdown` support writing to fsspec URLs such as S3 and Google Cloud Storage (:issue:`33987`) Plotting ^^^^^^^^ diff --git a/pandas/_typing.py b/pandas/_typing.py index 55a1c17b0aa53..7f01bcaa1c50e 100644 --- a/pandas/_typing.py +++ b/pandas/_typing.py @@ -146,10 +146,5 @@ CompressionOptions = Optional[Union[str, CompressionDict]] -# let's bind types -ModeVar = TypeVar("ModeVar", str, None, Optional[str]) -EncodingVar = TypeVar("EncodingVar", str, None, Optional[str]) - - # type of float formatter in DataFrameFormatter FloatFormatType = Union[str, Callable, "EngFormatter"] diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 9ce5ef2fc3cfe..27650c2889090 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -157,7 +157,7 @@ from pandas.core.series import Series from pandas.core.sorting import get_group_index, lexsort_indexer, nargsort -from pandas.io.common import get_filepath_or_buffer +from pandas.io.common import get_handle from pandas.io.formats import console, format as fmt from pandas.io.formats.info import DataFrameInfo import pandas.plotting @@ -2301,10 +2301,10 @@ def to_markdown( result = tabulate.tabulate(self, **kwargs) if buf is None: return result - ioargs = get_filepath_or_buffer(buf, mode=mode, storage_options=storage_options) - assert not isinstance(ioargs.filepath_or_buffer, (str, mmap.mmap)) - ioargs.filepath_or_buffer.writelines(result) - ioargs.close() + handles = get_handle(buf, mode, storage_options=storage_options) + assert not isinstance(handles.handle, (str, mmap.mmap)) + handles.handle.writelines(result) + handles.close() return None @deprecate_kwarg(old_arg_name="fname", new_arg_name="path") diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 24c1ae971686e..170950e069828 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3221,7 +3221,7 @@ def to_csv( File path or object, if None is provided the result is returned as a string. If a non-binary file object is passed, it should be opened with `newline=''`, disabling universal newlines. If a binary - file object is passed, `mode` needs to contain a `'b'`. + file object is passed, `mode` might need to contain a `'b'`. .. 
versionchanged:: 0.24.0 diff --git a/pandas/io/common.py b/pandas/io/common.py index 910eb23d9a2d0..739ee8076e29f 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -8,21 +8,7 @@ import mmap import os import pathlib -from typing import ( - IO, - TYPE_CHECKING, - Any, - AnyStr, - Dict, - Generic, - List, - Mapping, - Optional, - Tuple, - Type, - Union, - cast, -) +from typing import IO, Any, AnyStr, Dict, List, Mapping, Optional, Tuple, cast from urllib.parse import ( urljoin, urlparse as parse_url, @@ -37,10 +23,8 @@ Buffer, CompressionDict, CompressionOptions, - EncodingVar, FileOrBuffer, FilePathOrBuffer, - ModeVar, StorageOptions, ) from pandas.compat import get_lzma_file, import_lzma @@ -55,16 +39,10 @@ _VALID_URLS.discard("") -if TYPE_CHECKING: - from io import IOBase - - @dataclasses.dataclass -class IOArgs(Generic[ModeVar, EncodingVar]): +class IOArgs: """ - Return value of io/common.py:get_filepath_or_buffer. - - This is used to easily close created fsspec objects. + Return value of io/common.py:_get_filepath_or_buffer. Note (copy&past from io/parsers): filepath_or_buffer can be Union[FilePathOrBuffer, s3fs.S3File, gcsfs.GCSFile] @@ -73,23 +51,11 @@ class IOArgs(Generic[ModeVar, EncodingVar]): """ filepath_or_buffer: FileOrBuffer - encoding: EncodingVar - mode: Union[ModeVar, str] + encoding: str + mode: str compression: CompressionDict should_close: bool = False - def close(self) -> None: - """ - Close the buffer if it was created by get_filepath_or_buffer. - """ - if self.should_close: - assert not isinstance(self.filepath_or_buffer, str) - try: - self.filepath_or_buffer.close() - except (OSError, ValueError): - pass - self.should_close = False - @dataclasses.dataclass class IOHandles: @@ -105,6 +71,7 @@ class IOHandles: """ handle: Buffer + compression: CompressionDict created_handles: List[Buffer] = dataclasses.field(default_factory=list) is_wrapped: bool = False is_mmap: bool = False @@ -239,18 +206,13 @@ def is_fsspec_url(url: FilePathOrBuffer) -> bool: ) -# https://github.com/python/mypy/issues/8708 -# error: Incompatible default for argument "encoding" (default has type "None", -# argument has type "str") -# error: Incompatible default for argument "mode" (default has type "None", -# argument has type "str") -def get_filepath_or_buffer( +def _get_filepath_or_buffer( filepath_or_buffer: FilePathOrBuffer, - encoding: EncodingVar = None, # type: ignore[assignment] + encoding: str = "utf-8", compression: CompressionOptions = None, - mode: ModeVar = None, # type: ignore[assignment] + mode: str = "r", storage_options: StorageOptions = None, -) -> IOArgs[ModeVar, EncodingVar]: +) -> IOArgs: """ If the filepath_or_buffer is a url, translate and return the buffer. Otherwise passthrough. @@ -284,12 +246,7 @@ def get_filepath_or_buffer( compression_method = infer_compression(filepath_or_buffer, compression_method) # GH21227 internal compression is not used for non-binary handles. 
- if ( - compression_method - and hasattr(filepath_or_buffer, "write") - and mode - and "b" not in mode - ): + if compression_method and hasattr(filepath_or_buffer, "write") and "b" not in mode: warnings.warn( "compression has no effect when passing a non-binary object as input.", RuntimeWarning, @@ -306,8 +263,7 @@ def get_filepath_or_buffer( # bz2 and xz do not write the byte order mark for utf-16 and utf-32 # print a warning when writing such files if ( - mode - and "w" in mode + "w" in mode and compression_method in ["bz2", "xz"] and encoding in ["utf-16", "utf-32"] ): @@ -319,7 +275,7 @@ def get_filepath_or_buffer( # Use binary mode when converting path-like objects to file-like objects (fsspec) # except when text mode is explicitly requested. The original mode is returned if # fsspec is not used. - fsspec_mode = mode or "rb" + fsspec_mode = mode if "t" not in fsspec_mode and "b" not in fsspec_mode: fsspec_mode += "b" @@ -504,12 +460,8 @@ def infer_compression( ------ ValueError on invalid compression specified. """ - # No compression has been explicitly specified - if compression is None: - return None - # Infer compression - if compression == "infer": + if compression in ("infer", None): # Convert all path types (e.g. pathlib.Path) to strings filepath_or_buffer = stringify_path(filepath_or_buffer) if not isinstance(filepath_or_buffer, str): @@ -540,6 +492,7 @@ def get_handle( memory_map: bool = False, is_text: bool = True, errors: Optional[str] = None, + storage_options: StorageOptions = None, ) -> IOHandles: """ Get file handle for given path/buffer and mode. @@ -583,66 +536,73 @@ def get_handle( Specifies how encoding and decoding errors are to be handled. See the errors argument for :func:`open` for a full list of options. + storage_options: StorageOptions = None + Passed to _get_filepath_or_buffer .. versionchanged:: 1.2.0 Returns the dataclass IOHandles """ - need_text_wrapping: Tuple[Type["IOBase"], ...] - try: - from s3fs import S3File - - need_text_wrapping = (BufferedIOBase, RawIOBase, S3File) - except ImportError: - need_text_wrapping = (BufferedIOBase, RawIOBase) - # fsspec is an optional dependency. If it is available, add its file-object - # class to the list of classes that need text wrapping. If fsspec is too old and is - # needed, get_filepath_or_buffer would already have thrown an exception. - try: - from fsspec.spec import AbstractFileSystem - - need_text_wrapping = (*need_text_wrapping, AbstractFileSystem) - except ImportError: - pass - # Windows does not default to utf-8. 
Set to utf-8 for a consistent behavior if encoding is None: encoding = "utf-8" - # Convert pathlib.Path/py.path.local or string - handle = stringify_path(path_or_buf) + # read_csv does not know whether the buffer is opened in binary/text mode + if _is_binary_mode(path_or_buf, mode) and "b" not in mode: + mode += "b" + + # open URLs + ioargs = _get_filepath_or_buffer( + path_or_buf, + encoding=encoding, + compression=compression, + mode=mode, + storage_options=storage_options, + ) - compression, compression_args = get_compression_method(compression) - compression = infer_compression(handle, compression) + handle = ioargs.filepath_or_buffer + handles: List[Buffer] # memory mapping needs to be the first step handle, memory_map, handles = _maybe_memory_map( - handle, memory_map, encoding, mode, errors + handle, memory_map, ioargs.encoding, ioargs.mode, errors ) is_path = isinstance(handle, str) + compression_args = dict(ioargs.compression) + compression = compression_args.pop("method") + if compression: + # compression libraries do not like an explicit text-mode + ioargs.mode = ioargs.mode.replace("t", "") + # GZ Compression if compression == "gzip": if is_path: assert isinstance(handle, str) - handle = gzip.GzipFile(filename=handle, mode=mode, **compression_args) + handle = gzip.GzipFile( + filename=handle, + mode=ioargs.mode, + **compression_args, + ) else: handle = gzip.GzipFile( fileobj=handle, # type: ignore[arg-type] - mode=mode, + mode=ioargs.mode, **compression_args, ) # BZ Compression elif compression == "bz2": handle = bz2.BZ2File( - handle, mode=mode, **compression_args # type: ignore[arg-type] + handle, # type: ignore[arg-type] + mode=ioargs.mode, + **compression_args, ) # ZIP Compression elif compression == "zip": - handle = _BytesZipFile(handle, mode, **compression_args) + handle = _BytesZipFile(handle, ioargs.mode, **compression_args) if handle.mode == "r": handles.append(handle) zip_names = handle.namelist() @@ -658,7 +618,7 @@ def get_handle( # XZ Compression elif compression == "xz": - handle = get_lzma_file(lzma)(handle, mode) + handle = get_lzma_file(lzma)(handle, ioargs.mode) # Unrecognized Compression else: @@ -668,42 +628,50 @@ def get_handle( assert not isinstance(handle, str) handles.append(handle) - elif is_path: + elif isinstance(handle, str): # Check whether the filename is to be opened in binary mode. # Binary mode does not support 'encoding' and 'newline'. 
- assert isinstance(handle, str) - if encoding and "b" not in mode: + if ioargs.encoding and "b" not in ioargs.mode: # Encoding - handle = open(handle, mode, encoding=encoding, errors=errors, newline="") + handle = open( + handle, + ioargs.mode, + encoding=ioargs.encoding, + errors=errors, + newline="", + ) else: # Binary mode - handle = open(handle, mode) + handle = open(handle, ioargs.mode) handles.append(handle) # Convert BytesIO or file objects passed with an encoding is_wrapped = False - if is_text and ( - compression - or isinstance(handle, need_text_wrapping) - or "b" in getattr(handle, "mode", "") - ): + if is_text and (compression or _is_binary_mode(handle, ioargs.mode)): handle = TextIOWrapper( handle, # type: ignore[arg-type] - encoding=encoding, + encoding=ioargs.encoding, errors=errors, newline="", ) handles.append(handle) - # do not mark as wrapped when the user provided a string - is_wrapped = not is_path + # only marked as wrapped when the caller provided a handle + is_wrapped = not ( + isinstance(ioargs.filepath_or_buffer, str) or ioargs.should_close + ) handles.reverse() # close the most recently added buffer first + if ioargs.should_close: + assert not isinstance(ioargs.filepath_or_buffer, str) + handles.append(ioargs.filepath_or_buffer) + assert not isinstance(handle, str) return IOHandles( handle=handle, created_handles=handles, is_wrapped=is_wrapped, is_mmap=memory_map, + compression=ioargs.compression, ) @@ -804,7 +772,7 @@ def _maybe_memory_map( mode: str, errors: Optional[str], ) -> Tuple[FileOrBuffer, bool, List[Buffer]]: - """Try to use memory map file/buffer.""" + """Try to memory map file/buffer.""" handles: List[Buffer] = [] memory_map &= hasattr(handle, "fileno") or isinstance(handle, str) if not memory_map: @@ -834,3 +802,27 @@ def _maybe_memory_map( memory_map = False return handle, memory_map, handles + + +def file_exists(filepath_or_buffer: FilePathOrBuffer) -> bool: + """Test whether file exists.""" + exists = False + filepath_or_buffer = stringify_path(filepath_or_buffer) + if not isinstance(filepath_or_buffer, str): + return exists + try: + exists = os.path.exists(filepath_or_buffer) + # gh-5874: if the filepath is too long will raise here + except (TypeError, ValueError): + pass + return exists + + +def _is_binary_mode(handle: FilePathOrBuffer, mode: str) -> bool: + """Whether the handle is opened in binary mode""" + # classes that expect bytes + binary_classes = [BufferedIOBase, RawIOBase] + + return isinstance(handle, tuple(binary_classes)) or "b" in getattr( + handle, "mode", mode + ) diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index dd30bf37793d0..c2e9828e3ea42 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -16,14 +16,7 @@ from pandas.core.frame import DataFrame -from pandas.io.common import ( - IOArgs, - get_filepath_or_buffer, - is_url, - stringify_path, - urlopen, - validate_header_arg, -) +from pandas.io.common import IOHandles, get_handle, stringify_path, validate_header_arg from pandas.io.excel._util import ( fill_mi_header, get_default_writer, @@ -313,7 +306,9 @@ def read_excel( storage_options: StorageOptions = None, ): + should_close = False if not isinstance(io, ExcelFile): + should_close = True io = ExcelFile(io, storage_options=storage_options, engine=engine) elif engine and engine != io.engine: raise ValueError( @@ -321,7 +316,7 @@ def read_excel( "an ExcelFile - ExcelFile already has the engine set" ) - return io.parse( + data = io.parse( sheet_name=sheet_name, header=header, names=names, 
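As an illustrative aside (not part of the patch itself): after this refactor, callers collapse the former two-step pattern, get_filepath_or_buffer followed by get_handle, into a single get_handle call. A minimal sketch, assuming pandas' internal pandas.io.common API as changed above and a hypothetical local file name:

    from pandas.io.common import get_handle

    # get_handle now resolves paths/URLs/buffers, infers and applies compression,
    # wraps bytes in a TextIOWrapper when text is requested, and records every
    # object it created so that IOHandles.close() can undo all of it.
    handles = get_handle(
        "example.csv.gz",     # hypothetical path; fsspec URLs work via storage_options
        "r",                  # "b" is appended internally when the target is binary
        compression="infer",
        storage_options=None,
    )
    try:
        text = handles.handle.read()
    finally:
        handles.close()       # closes wrappers and any fsspec object it opened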
@@ -346,41 +341,29 @@ def read_excel( convert_float=convert_float, mangle_dupe_cols=mangle_dupe_cols, ) + if should_close: + io.close() + return data class BaseExcelReader(metaclass=abc.ABCMeta): def __init__(self, filepath_or_buffer, storage_options: StorageOptions = None): - self.ioargs = IOArgs( - filepath_or_buffer=filepath_or_buffer, - encoding=None, - mode=None, - compression={"method": None}, + self.handles = IOHandles( + handle=filepath_or_buffer, compression={"method": None} ) - # If filepath_or_buffer is a url, load the data into a BytesIO - if is_url(filepath_or_buffer): - self.ioargs = IOArgs( - filepath_or_buffer=BytesIO(urlopen(filepath_or_buffer).read()), - should_close=True, - encoding=None, - mode=None, - compression={"method": None}, - ) - elif not isinstance(filepath_or_buffer, (ExcelFile, self._workbook_class)): - self.ioargs = get_filepath_or_buffer( - filepath_or_buffer, storage_options=storage_options + if not isinstance(filepath_or_buffer, (ExcelFile, self._workbook_class)): + self.handles = get_handle( + filepath_or_buffer, "rb", storage_options=storage_options, is_text=False ) - if isinstance(self.ioargs.filepath_or_buffer, self._workbook_class): - self.book = self.ioargs.filepath_or_buffer - elif hasattr(self.ioargs.filepath_or_buffer, "read"): + if isinstance(self.handles.handle, self._workbook_class): + self.book = self.handles.handle + elif hasattr(self.handles.handle, "read"): # N.B. xlrd.Book has a read attribute too - assert not isinstance(self.ioargs.filepath_or_buffer, str) - self.ioargs.filepath_or_buffer.seek(0) - self.book = self.load_workbook(self.ioargs.filepath_or_buffer) - elif isinstance(self.ioargs.filepath_or_buffer, str): - self.book = self.load_workbook(self.ioargs.filepath_or_buffer) - elif isinstance(self.ioargs.filepath_or_buffer, bytes): - self.book = self.load_workbook(BytesIO(self.ioargs.filepath_or_buffer)) + self.handles.handle.seek(0) + self.book = self.load_workbook(self.handles.handle) + elif isinstance(self.handles.handle, bytes): + self.book = self.load_workbook(BytesIO(self.handles.handle)) else: raise ValueError( "Must explicitly set engine if not passing in buffer or path for io." @@ -396,7 +379,7 @@ def load_workbook(self, filepath_or_buffer): pass def close(self): - self.ioargs.close() + self.handles.close() @property @abc.abstractmethod @@ -581,7 +564,7 @@ class ExcelWriter(metaclass=abc.ABCMeta): Format string for datetime objects written into Excel files. (e.g. 'YYYY-MM-DD HH:MM:SS'). mode : {'w', 'a'}, default 'w' - File mode to use (write or append). + File mode to use (write or append). Append does not work with fsspec URLs. .. 
versionadded:: 0.24.0 @@ -739,7 +722,16 @@ def __init__( ext = os.path.splitext(path)[-1] self.check_extension(ext) - self.path = path + # use mode to open the file + if "b" not in mode: + mode += "b" + # use "a" for the user to append data to excel but internally use "r+" to let + # the excel backend first read the existing file and then write any data to it + mode = mode.replace("a", "r+") + + self.handles = IOHandles(path, compression={"copression": None}) + if not isinstance(path, ExcelWriter): + self.handles = get_handle(path, mode, is_text=False) self.sheets = {} self.cur_sheet = None @@ -755,10 +747,7 @@ def __init__( self.mode = mode def __fspath__(self): - # pandas\io\excel\_base.py:744: error: Argument 1 to "stringify_path" - # has incompatible type "Optional[Any]"; expected "Union[str, Path, - # IO[Any], IOBase]" [arg-type] - return stringify_path(self.path) # type: ignore[arg-type] + return getattr(self.handles.handle, "name", "") def _get_sheet_name(self, sheet_name): if sheet_name is None: @@ -828,7 +817,9 @@ def __exit__(self, exc_type, exc_value, traceback): def close(self): """synonym for save, to make it more file-like""" - return self.save() + content = self.save() + self.handles.close() + return content def _is_ods_stream(stream: Union[BufferedIOBase, RawIOBase]) -> bool: diff --git a/pandas/io/excel/_odfreader.py b/pandas/io/excel/_odfreader.py index 4f9f8a29c0010..c5c3927216850 100644 --- a/pandas/io/excel/_odfreader.py +++ b/pandas/io/excel/_odfreader.py @@ -19,7 +19,7 @@ class ODFReader(BaseExcelReader): filepath_or_buffer : string, path to be parsed or an open readable stream. storage_options : dict, optional - passed to fsspec for appropriate URLs (see ``get_filepath_or_buffer``) + passed to fsspec for appropriate URLs (see ``_get_filepath_or_buffer``) """ def __init__( @@ -69,6 +69,7 @@ def get_sheet_by_name(self, name: str): if table.getAttribute("name") == name: return table + self.close() raise ValueError(f"sheet {name} not found") def get_sheet_data(self, sheet, convert_float: bool) -> List[List[Scalar]]: @@ -190,6 +191,7 @@ def _get_cell_value(self, cell, convert_float: bool) -> Scalar: result = cast(pd.Timestamp, result) return result.time() else: + self.close() raise ValueError(f"Unrecognized type {cell_type}") def _get_cell_string_value(self, cell) -> str: diff --git a/pandas/io/excel/_odswriter.py b/pandas/io/excel/_odswriter.py index cbac60dfabaa7..c19d51540d2dd 100644 --- a/pandas/io/excel/_odswriter.py +++ b/pandas/io/excel/_odswriter.py @@ -34,7 +34,7 @@ def save(self) -> None: """ for sheet in self.sheets.values(): self.book.spreadsheet.addElement(sheet) - self.book.save(self.path) + self.book.save(self.handles.handle) def write_cells( self, diff --git a/pandas/io/excel/_openpyxl.py b/pandas/io/excel/_openpyxl.py index a5cadf4d93389..f643037dc216a 100644 --- a/pandas/io/excel/_openpyxl.py +++ b/pandas/io/excel/_openpyxl.py @@ -22,10 +22,12 @@ def __init__(self, path, engine=None, mode="w", **engine_kwargs): super().__init__(path, mode=mode, **engine_kwargs) - if self.mode == "a": # Load from existing workbook + # ExcelWriter replaced "a" by "r+" to allow us to first read the excel file from + # the file and later write to it + if "r+" in self.mode: # Load from existing workbook from openpyxl import load_workbook - self.book = load_workbook(self.path) + self.book = load_workbook(self.handles.handle) else: # Create workbook object with default optimized_write=True. self.book = Workbook() @@ -37,7 +39,7 @@ def save(self): """ Save workbook to disk. 
""" - self.book.save(self.path) + self.book.save(self.handles.handle) @classmethod def _convert_to_style_kwargs(cls, style_dict: dict) -> Dict[str, "Serialisable"]: @@ -452,7 +454,7 @@ def __init__( filepath_or_buffer : string, path object or Workbook Object to be parsed. storage_options : dict, optional - passed to fsspec for appropriate URLs (see ``get_filepath_or_buffer``) + passed to fsspec for appropriate URLs (see ``_get_filepath_or_buffer``) """ import_optional_dependency("openpyxl") super().__init__(filepath_or_buffer, storage_options=storage_options) @@ -474,6 +476,7 @@ def close(self): # https://stackoverflow.com/questions/31416842/ # openpyxl-does-not-close-excel-workbook-in-read-only-mode self.book.close() + super().close() @property def sheet_names(self) -> List[str]: diff --git a/pandas/io/excel/_pyxlsb.py b/pandas/io/excel/_pyxlsb.py index ac94f4dd3df74..de4f7bba1a179 100644 --- a/pandas/io/excel/_pyxlsb.py +++ b/pandas/io/excel/_pyxlsb.py @@ -20,7 +20,7 @@ def __init__( filepath_or_buffer : str, path object, or Workbook Object to be parsed. storage_options : dict, optional - passed to fsspec for appropriate URLs (see ``get_filepath_or_buffer``) + passed to fsspec for appropriate URLs (see ``_get_filepath_or_buffer``) """ import_optional_dependency("pyxlsb") # This will call load_workbook on the filepath or buffer diff --git a/pandas/io/excel/_xlrd.py b/pandas/io/excel/_xlrd.py index dfd5dde0329ae..c655db4bc772b 100644 --- a/pandas/io/excel/_xlrd.py +++ b/pandas/io/excel/_xlrd.py @@ -18,7 +18,7 @@ def __init__(self, filepath_or_buffer, storage_options: StorageOptions = None): filepath_or_buffer : string, path object or Workbook Object to be parsed. storage_options : dict, optional - passed to fsspec for appropriate URLs (see ``get_filepath_or_buffer``) + passed to fsspec for appropriate URLs (see ``_get_filepath_or_buffer``) """ err_msg = "Install xlrd >= 1.0.0 for Excel support" import_optional_dependency("xlrd", extra=err_msg) diff --git a/pandas/io/excel/_xlsxwriter.py b/pandas/io/excel/_xlsxwriter.py index 16c4d377d7610..77b631a41371e 100644 --- a/pandas/io/excel/_xlsxwriter.py +++ b/pandas/io/excel/_xlsxwriter.py @@ -186,7 +186,7 @@ def __init__( **engine_kwargs, ) - self.book = Workbook(path, **engine_kwargs) + self.book = Workbook(self.handles.handle, **engine_kwargs) def save(self): """ diff --git a/pandas/io/excel/_xlwt.py b/pandas/io/excel/_xlwt.py index 3592c2684f5a5..7f0ce3844c099 100644 --- a/pandas/io/excel/_xlwt.py +++ b/pandas/io/excel/_xlwt.py @@ -34,7 +34,7 @@ def save(self): """ Save workbook to disk. 
""" - self.book.save(self.path) + self.book.save(self.handles.handle) def write_cells( self, cells, sheet_name=None, startrow=0, startcol=0, freeze_panes=None diff --git a/pandas/io/feather_format.py b/pandas/io/feather_format.py index 198acd5862d45..0a72f750237a5 100644 --- a/pandas/io/feather_format.py +++ b/pandas/io/feather_format.py @@ -7,7 +7,7 @@ from pandas import DataFrame, Int64Index, RangeIndex -from pandas.io.common import get_filepath_or_buffer +from pandas.io.common import get_handle def to_feather( @@ -41,7 +41,7 @@ def to_feather( import_optional_dependency("pyarrow") from pyarrow import feather - ioargs = get_filepath_or_buffer(path, mode="wb", storage_options=storage_options) + handles = get_handle(path, "wb", storage_options=storage_options, is_text=False) if not isinstance(df, DataFrame): raise ValueError("feather only support IO with DataFrames") @@ -79,9 +79,9 @@ def to_feather( if df.columns.inferred_type not in valid_types: raise ValueError("feather must have string column names") - feather.write_feather(df, ioargs.filepath_or_buffer, **kwargs) + feather.write_feather(df, handles.handle, **kwargs) - ioargs.close() + handles.close() def read_feather( @@ -129,12 +129,12 @@ def read_feather( import_optional_dependency("pyarrow") from pyarrow import feather - ioargs = get_filepath_or_buffer(path, storage_options=storage_options) + handles = get_handle(path, "rb", storage_options=storage_options, is_text=False) df = feather.read_feather( - ioargs.filepath_or_buffer, columns=columns, use_threads=bool(use_threads) + handles.handle, columns=columns, use_threads=bool(use_threads) ) - ioargs.close() + handles.close() return df diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index 20226dbb3c9d4..db428ce61ee39 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -28,7 +28,7 @@ from pandas.core.indexes.api import Index -from pandas.io.common import get_filepath_or_buffer, get_handle +from pandas.io.common import get_handle if TYPE_CHECKING: from pandas.io.formats.format import DataFrameFormatter @@ -59,13 +59,11 @@ def __init__( self.obj = self.fmt.frame - self.ioargs = get_filepath_or_buffer( - path_or_buf, - encoding=encoding, - compression=compression, - mode=mode, - storage_options=storage_options, - ) + self.filepath_or_buffer = path_or_buf + self.encoding = encoding + self.compression = compression + self.mode = mode + self.storage_options = storage_options self.sep = sep self.index_label = self._initialize_index_label(index_label) @@ -228,11 +226,12 @@ def save(self) -> None: """ # apply compression and byte/text conversion handles = get_handle( - self.ioargs.filepath_or_buffer, - self.ioargs.mode, - encoding=self.ioargs.encoding, + self.filepath_or_buffer, + self.mode, + encoding=self.encoding, errors=self.errors, - compression=self.ioargs.compression, + compression=self.compression, + storage_options=self.storage_options, ) try: @@ -250,10 +249,7 @@ def save(self) -> None: self._save() finally: - # close compression and byte/text wrapper handles.close() - # close any fsspec-like objects - self.ioargs.close() def _save(self) -> None: if self._need_to_save_header: diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index 040279b9f3e67..b9b1535129db5 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -3,7 +3,6 @@ import functools from io import StringIO from itertools import islice -import os from typing import Any, Callable, Mapping, Optional, Tuple, Type, Union import numpy as np @@ -28,9 +27,11 @@ 
from pandas.io.common import ( IOHandles, - get_compression_method, - get_filepath_or_buffer, + file_exists, get_handle, + is_fsspec_url, + is_url, + stringify_path, ) from pandas.io.json._normalize import convert_to_line_delimits from pandas.io.json._table_schema import build_table_schema, parse_table_schema @@ -96,24 +97,14 @@ def to_json( s = convert_to_line_delimits(s) if path_or_buf is not None: - # open fsspec URLs - ioargs = get_filepath_or_buffer( - path_or_buf, - compression=compression, - mode="wt", - storage_options=storage_options, - ) # apply compression and byte/text conversion handles = get_handle( - ioargs.filepath_or_buffer, "w", compression=ioargs.compression + path_or_buf, "wt", compression=compression, storage_options=storage_options ) try: handles.handle.write(s) finally: - # close compression and byte/text wrapper handles.close() - # close any fsspec-like objects - ioargs.close() else: return s @@ -549,15 +540,8 @@ def read_json( if convert_axes is None and orient != "table": convert_axes = True - ioargs = get_filepath_or_buffer( - path_or_buf, - encoding=encoding or "utf-8", - compression=compression, - storage_options=storage_options, - ) - json_reader = JsonReader( - ioargs.filepath_or_buffer, + path_or_buf, orient=orient, typ=typ, dtype=dtype, @@ -567,20 +551,18 @@ def read_json( numpy=numpy, precise_float=precise_float, date_unit=date_unit, - encoding=ioargs.encoding, + encoding=encoding, lines=lines, chunksize=chunksize, - compression=ioargs.compression, + compression=compression, nrows=nrows, + storage_options=storage_options, ) if chunksize: return json_reader - result = json_reader.read() - ioargs.close() - - return result + return json_reader.read() class JsonReader(abc.Iterator): @@ -609,11 +591,9 @@ def __init__( chunksize: Optional[int], compression: CompressionOptions, nrows: Optional[int], + storage_options: StorageOptions = None, ): - compression_method, compression = get_compression_method(compression) - compression = dict(compression, method=compression_method) - self.orient = orient self.typ = typ self.dtype = dtype @@ -625,6 +605,7 @@ def __init__( self.date_unit = date_unit self.encoding = encoding self.compression = compression + self.storage_options = storage_options self.lines = lines self.chunksize = chunksize self.nrows_seen = 0 @@ -669,20 +650,19 @@ def _get_data_from_filepath(self, filepath_or_buffer): It returns input types (2) and (3) unchanged. 
""" # if it is a string but the file does not exist, it might be a JSON string - exists = False - if isinstance(filepath_or_buffer, str): - try: - exists = os.path.exists(filepath_or_buffer) - # gh-5874: if the filepath is too long will raise here - except (TypeError, ValueError): - pass - - if exists or not isinstance(filepath_or_buffer, str): + filepath_or_buffer = stringify_path(filepath_or_buffer) + if ( + not isinstance(filepath_or_buffer, str) + or is_url(filepath_or_buffer) + or is_fsspec_url(filepath_or_buffer) + or file_exists(filepath_or_buffer) + ): self.handles = get_handle( filepath_or_buffer, "r", encoding=self.encoding, compression=self.compression, + storage_options=self.storage_options, ) filepath_or_buffer = self.handles.handle diff --git a/pandas/io/orc.py b/pandas/io/orc.py index 5a734f0878a0c..750728b36bae5 100644 --- a/pandas/io/orc.py +++ b/pandas/io/orc.py @@ -5,7 +5,7 @@ from pandas._typing import FilePathOrBuffer -from pandas.io.common import get_filepath_or_buffer +from pandas.io.common import get_handle if TYPE_CHECKING: from pandas import DataFrame @@ -50,8 +50,8 @@ def read_orc( import pyarrow.orc - ioargs = get_filepath_or_buffer(path) - orc_file = pyarrow.orc.ORCFile(ioargs.filepath_or_buffer) + handles = get_handle(path, "rb", is_text=False) + orc_file = pyarrow.orc.ORCFile(handles.handle) result = orc_file.read(columns=columns, **kwargs).to_pandas() - ioargs.close() + handles.close() return result diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index 88f57e18593f2..c76e18ae353a0 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -1,6 +1,7 @@ """ parquet compat """ import io +import os from typing import Any, AnyStr, Dict, List, Optional from warnings import catch_warnings @@ -10,7 +11,7 @@ from pandas import DataFrame, get_option -from pandas.io.common import get_filepath_or_buffer, is_fsspec_url, stringify_path +from pandas.io.common import get_handle, is_fsspec_url, stringify_path def get_engine(engine: str) -> "BaseImpl": @@ -102,19 +103,21 @@ def write( table = self.api.Table.from_pandas(df, **from_pandas_kwargs) + path = stringify_path(path) + # get_handle could be used here (for write_table, not for write_to_dataset) + # but it would complicate the code. 
if is_fsspec_url(path) and "filesystem" not in kwargs: # make fsspec instance, which pyarrow will use to open paths - import_optional_dependency("fsspec") - import fsspec.core + fsspec = import_optional_dependency("fsspec") fs, path = fsspec.core.url_to_fs(path, **(storage_options or {})) kwargs["filesystem"] = fs - else: - if storage_options: - raise ValueError( - "storage_options passed with file object or non-fsspec file path" - ) - path = stringify_path(path) + + elif storage_options: + raise ValueError( + "storage_options passed with file object or non-fsspec file path" + ) + if partition_cols is not None: # writes to multiple files under the given path self.api.parquet.write_to_dataset( @@ -131,32 +134,31 @@ def write( def read( self, path, columns=None, storage_options: StorageOptions = None, **kwargs ): - if is_fsspec_url(path) and "filesystem" not in kwargs: - import_optional_dependency("fsspec") - import fsspec.core + path = stringify_path(path) + handles = None + fs = kwargs.pop("filesystem", None) + if is_fsspec_url(path) and fs is None: + fsspec = import_optional_dependency("fsspec") fs, path = fsspec.core.url_to_fs(path, **(storage_options or {})) - should_close = False - else: - if storage_options: - raise ValueError( - "storage_options passed with buffer or non-fsspec filepath" - ) - fs = kwargs.pop("filesystem", None) - should_close = False - path = stringify_path(path) - - if not fs: - ioargs = get_filepath_or_buffer(path) - path = ioargs.filepath_or_buffer - should_close = ioargs.should_close + elif storage_options: + raise ValueError( + "storage_options passed with buffer or non-fsspec filepath" + ) + if not fs and isinstance(path, str) and not os.path.isdir(path): + # use get_handle only when we are very certain that it is not a directory + # fsspec resources can also point to directories + # this branch is used for example when reading from non-fsspec URLs + handles = get_handle(path, "rb", is_text=False) + path = handles.handle kwargs["use_pandas_metadata"] = True result = self.api.parquet.read_table( path, columns=columns, filesystem=fs, **kwargs ).to_pandas() - if should_close: - path.close() + + if handles is not None: + handles.close() return result @@ -196,6 +198,8 @@ def write( if partition_cols is not None: kwargs["file_scheme"] = "hive" + # cannot use get_handle as write() does not accept file buffers + path = stringify_path(path) if is_fsspec_url(path): fsspec = import_optional_dependency("fsspec") @@ -203,12 +207,10 @@ def write( kwargs["open_with"] = lambda path, _: fsspec.open( path, "wb", **(storage_options or {}) ).open() - else: - if storage_options: - raise ValueError( - "storage_options passed with file object or non-fsspec file path" - ) - path = get_filepath_or_buffer(path).filepath_or_buffer + elif storage_options: + raise ValueError( + "storage_options passed with file object or non-fsspec file path" + ) with catch_warnings(record=True): self.api.write( @@ -223,18 +225,28 @@ def write( def read( self, path, columns=None, storage_options: StorageOptions = None, **kwargs ): + path = stringify_path(path) + parquet_kwargs = {} + handles = None if is_fsspec_url(path): fsspec = import_optional_dependency("fsspec") - open_with = lambda path, _: fsspec.open( + parquet_kwargs["open_with"] = lambda path, _: fsspec.open( path, "rb", **(storage_options or {}) ).open() - parquet_file = self.api.ParquetFile(path, open_with=open_with) - else: - path = get_filepath_or_buffer(path).filepath_or_buffer - parquet_file = self.api.ParquetFile(path) - - return 
parquet_file.to_pandas(columns=columns, **kwargs) + elif isinstance(path, str) and not os.path.isdir(path): + # use get_handle only when we are very certain that it is not a directory + # fsspec resources can also point to directories + # this branch is used for example when reading from non-fsspec URLs + handles = get_handle(path, "rb", is_text=False) + path = handles.handle + parquet_file = self.api.ParquetFile(path, **parquet_kwargs) + + result = parquet_file.to_pandas(columns=columns, **kwargs) + + if handles is not None: + handles.close() + return result def to_parquet( diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 5725e2304e1d2..d7930f35a1421 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -10,7 +10,18 @@ import re import sys from textwrap import fill -from typing import Any, Dict, Iterable, List, Optional, Sequence, Set +from typing import ( + Any, + Dict, + Iterable, + Iterator, + List, + Optional, + Sequence, + Set, + Type, + cast, +) import warnings import numpy as np @@ -63,7 +74,7 @@ from pandas.core.series import Series from pandas.core.tools import datetimes as tools -from pandas.io.common import get_filepath_or_buffer, get_handle, validate_header_arg +from pandas.io.common import IOHandles, get_handle, stringify_path, validate_header_arg from pandas.io.date_converters import generic_parser # BOM character (byte order mark) @@ -428,17 +439,6 @@ def _validate_names(names): def _read(filepath_or_buffer: FilePathOrBuffer, kwds): """Generic reader of line files.""" - storage_options = kwds.get("storage_options", None) - - ioargs = get_filepath_or_buffer( - filepath_or_buffer, - kwds.get("encoding", None), - kwds.get("compression", "infer"), - storage_options=storage_options, - ) - kwds["compression"] = ioargs.compression - kwds["encoding"] = ioargs.encoding - if kwds.get("date_parser", None) is not None: if isinstance(kwds["parse_dates"], bool): kwds["parse_dates"] = True @@ -452,7 +452,7 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds): _validate_names(kwds.get("names", None)) # Create the parser. - parser = TextFileReader(ioargs.filepath_or_buffer, **kwds) + parser = TextFileReader(filepath_or_buffer, **kwds) if chunksize or iterator: return parser @@ -460,10 +460,7 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds): try: data = parser.read(nrows) finally: - # close compression and byte/text wrapper parser.close() - # close any fsspec-like objects - ioargs.close() return data @@ -777,7 +774,7 @@ class TextFileReader(abc.Iterator): def __init__(self, f, engine=None, **kwds): - self.f = f + self.f = stringify_path(f) if engine is not None: engine_specified = True @@ -802,6 +799,7 @@ def __init__(self, f, engine=None, **kwds): self._currow = 0 options = self._get_options_with_defaults(engine) + options["storage_options"] = kwds.get("storage_options", None) self.chunksize = options.pop("chunksize", None) self.nrows = options.pop("nrows", None) @@ -862,14 +860,11 @@ def _get_options_with_defaults(self, engine): def _check_file_or_buffer(self, f, engine): # see gh-16530 if is_file_like(f): - next_attr = "__next__" - - # The C engine doesn't need the file-like to have the "next" or - # "__next__" attribute. However, the Python engine explicitly calls - # "next(...)" when iterating through such an object, meaning it - # needs to have that attribute ("next" for Python 2.x, "__next__" - # for Python 3.x) - if engine != "c" and not hasattr(f, next_attr): + # The C engine doesn't need the file-like to have the "__next__" + # attribute. 
However, the Python engine explicitly calls + # "__next__(...)" when iterating through such an object, meaning it + # needs to have that attribute + if engine != "c" and not hasattr(f, "__next__"): msg = "The 'python' engine cannot iterate through this file buffer." raise ValueError(msg) @@ -1037,28 +1032,17 @@ def __next__(self): raise def _make_engine(self, engine="c"): - mapping = { - # pandas\io\parsers.py:1099: error: Dict entry 0 has incompatible - # type "str": "Type[CParserWrapper]"; expected "str": - # "Type[ParserBase]" [dict-item] - "c": CParserWrapper, # type: ignore[dict-item] - # pandas\io\parsers.py:1100: error: Dict entry 1 has incompatible - # type "str": "Type[PythonParser]"; expected "str": - # "Type[ParserBase]" [dict-item] - "python": PythonParser, # type: ignore[dict-item] - # pandas\io\parsers.py:1101: error: Dict entry 2 has incompatible - # type "str": "Type[FixedWidthFieldParser]"; expected "str": - # "Type[ParserBase]" [dict-item] - "python-fwf": FixedWidthFieldParser, # type: ignore[dict-item] + mapping: Dict[str, Type[ParserBase]] = { + "c": CParserWrapper, + "python": PythonParser, + "python-fwf": FixedWidthFieldParser, } - try: - klass = mapping[engine] - except KeyError: + if engine not in mapping: raise ValueError( f"Unknown engine: {engine} (valid options are {mapping.keys()})" ) - else: - return klass(self.f, **self.options) + # error: Too many arguments for "ParserBase" + return mapping[engine](self.f, **self.options) # type: ignore[call-arg] def _failover_to_python(self): raise AbstractMethodError(self) @@ -1275,13 +1259,14 @@ def _validate_parse_dates_arg(parse_dates): class ParserBase: def __init__(self, kwds): + self.names = kwds.get("names") - self.orig_names = None + self.orig_names: Optional[List] = None self.prefix = kwds.pop("prefix", None) self.index_col = kwds.get("index_col", None) - self.unnamed_cols = set() - self.index_names = None + self.unnamed_cols: Set = set() + self.index_names: Optional[List] = None self.col_names = None self.parse_dates = _validate_parse_dates_arg(kwds.pop("parse_dates", False)) @@ -1357,6 +1342,21 @@ def __init__(self, kwds): self._first_chunk = True + self.handles: Optional[IOHandles] = None + + def _open_handles(self, src: FilePathOrBuffer, kwds: Dict[str, Any]) -> None: + """ + Let the readers open IOHanldes after they are done with their potential raises. + """ + self.handles = get_handle( + src, + "r", + encoding=kwds.get("encoding", None), + compression=kwds.get("compression", None), + memory_map=kwds.get("memory_map", False), + storage_options=kwds.get("storage_options", None), + ) + def _validate_parse_dates_presence(self, columns: List[str]) -> None: """ Check if parse_dates are in columns. 
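As an illustrative aside (not part of the patch): because the parsers now open their own IOHandles, compression and storage_options flow from read_csv straight into get_handle. A minimal public-API sketch, with a hypothetical bucket name:

    import pandas as pd

    # read_csv hands the fsspec URL, compression and storage_options to
    # get_handle; no separate get_filepath_or_buffer step remains.
    df = pd.read_csv(
        "s3://my-bucket/data.csv.gz",    # hypothetical fsspec URL
        compression="infer",
        storage_options={"anon": True},  # forwarded to s3fs through fsspec
    )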
@@ -1406,9 +1406,8 @@ def _validate_parse_dates_presence(self, columns: List[str]) -> None: ) def close(self): - # pandas\io\parsers.py:1409: error: "ParserBase" has no attribute - # "handles" [attr-defined] - self.handles.close() # type: ignore[attr-defined] + if self.handles is not None: + self.handles.close() @property def _has_complex_date_col(self): @@ -1842,23 +1841,24 @@ def _do_date_conversions(self, names, data): class CParserWrapper(ParserBase): - def __init__(self, src, **kwds): + def __init__(self, src: FilePathOrBuffer, **kwds): self.kwds = kwds kwds = kwds.copy() ParserBase.__init__(self, kwds) - self.handles = get_handle( - src, - mode="r", - encoding=kwds.get("encoding", None), - compression=kwds.get("compression", None), - memory_map=kwds.get("memory_map", False), - is_text=True, - ) - kwds.pop("encoding", None) - kwds.pop("memory_map", None) - kwds.pop("compression", None) + # #2442 + kwds["allow_leading_cols"] = self.index_col is not False + + # GH20529, validate usecol arg before TextReader + self.usecols, self.usecols_dtype = _validate_usecols_arg(kwds["usecols"]) + kwds["usecols"] = self.usecols + + # open handles + self._open_handles(src, kwds) + assert self.handles is not None + for key in ("storage_options", "encoding", "memory_map", "compression"): + kwds.pop(key, None) if self.handles.is_mmap and hasattr(self.handles.handle, "mmap"): # pandas\io\parsers.py:1861: error: Item "IO[Any]" of # "Union[IO[Any], RawIOBase, BufferedIOBase, TextIOBase, @@ -1885,13 +1885,6 @@ def __init__(self, src, **kwds): # no attribute "mmap" [union-attr] self.handles.handle = self.handles.handle.mmap # type: ignore[union-attr] - # #2442 - kwds["allow_leading_cols"] = self.index_col is not False - - # GH20529, validate usecol arg before TextReader - self.usecols, self.usecols_dtype = _validate_usecols_arg(kwds["usecols"]) - kwds["usecols"] = self.usecols - self._reader = parsers.TextReader(self.handles.handle, **kwds) self.unnamed_cols = self._reader.unnamed_cols @@ -1935,6 +1928,8 @@ def __init__(self, src, **kwds): usecols = _evaluate_usecols(self.usecols, self.orig_names) # GH 14671 + # assert for mypy, orig_names is List or None, None would error in issubset + assert self.orig_names is not None if self.usecols_dtype == "string" and not set(usecols).issubset( self.orig_names ): @@ -2015,9 +2010,10 @@ def _set(x): x = usecols[x] if not is_integer(x): - # pandas\io\parsers.py:2037: error: Item "None" of - # "Optional[Any]" has no attribute "index" [union-attr] - x = names.index(x) # type: ignore[union-attr] + # assert for mypy, names is List or None, None would error when calling + # .index() + assert names is not None + x = names.index(x) self._reader.set_noconvert(x) @@ -2112,10 +2108,9 @@ def read(self, nrows=None): # ugh, mutation - # pandas\io\parsers.py:2131: error: Argument 1 to "list" has - # incompatible type "Optional[Any]"; expected "Iterable[Any]" - # [arg-type] - names = list(self.orig_names) # type: ignore[arg-type] + # assert for mypy, orig_names is List or None, None would error in list(...) 
+ assert self.orig_names is not None + names = list(self.orig_names) names = self._maybe_dedup_names(names) if self.usecols is not None: @@ -2225,20 +2220,17 @@ def count_empty_vals(vals) -> int: class PythonParser(ParserBase): - def __init__(self, f, **kwds): + def __init__(self, f: Union[FilePathOrBuffer, List], **kwds): """ Workhorse function for processing nested list into DataFrame """ ParserBase.__init__(self, kwds) - self.data = None - self.buf = [] + self.data: Optional[Iterator[str]] = None + self.buf: List = [] self.pos = 0 self.line_pos = 0 - self.encoding = kwds["encoding"] - self.compression = kwds["compression"] - self.memory_map = kwds["memory_map"] self.skiprows = kwds["skiprows"] if callable(self.skiprows): @@ -2278,21 +2270,16 @@ def __init__(self, f, **kwds): self.decimal = kwds["decimal"] self.comment = kwds["comment"] - self._comment_lines = [] - - self.handles = get_handle( - f, - "r", - encoding=self.encoding, - compression=self.compression, - memory_map=self.memory_map, - ) # Set self.data to something that can read lines. - if hasattr(self.handles.handle, "readline"): - self._make_reader(self.handles.handle) + if isinstance(f, list): + # read_excel: f is a list + self.data = cast(Iterator[str], f) else: - self.data = self.handles.handle + self._open_handles(f, kwds) + assert self.handles is not None + assert hasattr(self.handles.handle, "readline") + self._make_reader(self.handles.handle) # Get columns in two steps: infer from data, then # infer column indices from self.usecols if it is specified. @@ -2429,11 +2416,11 @@ class MyDialect(csv.Dialect): sniffed = csv.Sniffer().sniff(line) dia.delimiter = sniffed.delimiter - # Note: self.encoding is irrelevant here + # Note: encoding is irrelevant here line_rdr = csv.reader(StringIO(line), dialect=dia) self.buf.extend(list(line_rdr)) - # Note: self.encoding is irrelevant here + # Note: encoding is irrelevant here reader = csv.reader(f, dialect=dia, strict=True) else: @@ -2894,10 +2881,9 @@ def _next_line(self): else: while self.skipfunc(self.pos): self.pos += 1 - # pandas\io\parsers.py:2865: error: Argument 1 to "next" has - # incompatible type "Optional[Any]"; expected "Iterator[Any]" - # [arg-type] - next(self.data) # type: ignore[arg-type] + # assert for mypy, data is Iterator[str] or None, would error in next + assert self.data is not None + next(self.data) while True: orig_line = self._next_iter_line(row_num=self.pos + 1) @@ -2958,10 +2944,9 @@ def _next_iter_line(self, row_num): row_num : The row number of the line being parsed. 
""" try: - # pandas\io\parsers.py:2926: error: Argument 1 to "next" has - # incompatible type "Optional[Any]"; expected "Iterator[Any]" - # [arg-type] - return next(self.data) # type: ignore[arg-type] + # assert for mypy, data is Iterator[str] or None, would error in next + assert self.data is not None + return next(self.data) except csv.Error as e: if self.warn_bad_lines or self.error_bad_lines: msg = str(e) @@ -3251,10 +3236,10 @@ def _get_lines(self, rows=None): try: if rows is not None: for _ in range(rows): - # pandas\io\parsers.py:3209: error: Argument 1 to - # "next" has incompatible type "Optional[Any]"; - # expected "Iterator[Any]" [arg-type] - new_rows.append(next(self.data)) # type: ignore[arg-type] + # assert for mypy, data is Iterator[str] or None, would + # error in next + assert self.data is not None + new_rows.append(next(self.data)) lines.extend(new_rows) else: rows = 0 @@ -3756,11 +3741,7 @@ def __init__(self, f, **kwds): PythonParser.__init__(self, f, **kwds) def _make_reader(self, f): - # pandas\io\parsers.py:3730: error: Incompatible types in assignment - # (expression has type "FixedWidthReader", variable has type - # "Union[IO[Any], RawIOBase, BufferedIOBase, TextIOBase, TextIOWrapper, - # mmap, None]") [assignment] - self.data = FixedWidthReader( # type: ignore[assignment] + self.data = FixedWidthReader( f, self.colspecs, self.delimiter, diff --git a/pandas/io/pickle.py b/pandas/io/pickle.py index 6fa044b4651a5..840ac0360658b 100644 --- a/pandas/io/pickle.py +++ b/pandas/io/pickle.py @@ -6,7 +6,7 @@ from pandas._typing import CompressionOptions, FilePathOrBuffer, StorageOptions from pandas.compat import pickle_compat as pc -from pandas.io.common import get_filepath_or_buffer, get_handle +from pandas.io.common import get_handle def to_pickle( @@ -86,24 +86,19 @@ def to_pickle( >>> import os >>> os.remove("./dummy.pkl") """ - ioargs = get_filepath_or_buffer( + handles = get_handle( filepath_or_buffer, + "wb", compression=compression, - mode="wb", + is_text=False, storage_options=storage_options, ) - handles = get_handle( - ioargs.filepath_or_buffer, "wb", compression=ioargs.compression, is_text=False - ) if protocol < 0: protocol = pickle.HIGHEST_PROTOCOL try: pickle.dump(obj, handles.handle, protocol=protocol) # type: ignore[arg-type] finally: - # close compression and byte/text wrapper handles.close() - # close any fsspec-like objects - ioargs.close() def read_pickle( @@ -183,11 +178,12 @@ def read_pickle( >>> import os >>> os.remove("./dummy.pkl") """ - ioargs = get_filepath_or_buffer( - filepath_or_buffer, compression=compression, storage_options=storage_options - ) handles = get_handle( - ioargs.filepath_or_buffer, "rb", compression=ioargs.compression, is_text=False + filepath_or_buffer, + "rb", + compression=compression, + is_text=False, + storage_options=storage_options, ) # 1) try standard library Pickle @@ -211,7 +207,4 @@ def read_pickle( # e.g. 
can occur for files written in py27; see GH#28645 and GH#31988 return pc.load(handles.handle, encoding="latin-1") finally: - # close compression and byte/text wrapper handles.close() - # close any fsspec-like objects - ioargs.close() diff --git a/pandas/io/sas/sas7bdat.py b/pandas/io/sas/sas7bdat.py index e9b74199cbc42..e9c1bf26f6675 100644 --- a/pandas/io/sas/sas7bdat.py +++ b/pandas/io/sas/sas7bdat.py @@ -24,7 +24,7 @@ import pandas as pd -from pandas.io.common import get_filepath_or_buffer +from pandas.io.common import get_handle from pandas.io.sas._sas import Parser import pandas.io.sas.sas_constants as const from pandas.io.sas.sasreader import ReaderBase @@ -168,12 +168,9 @@ def __init__( self._current_row_on_page_index = 0 self._current_row_in_file_index = 0 - self.ioargs = get_filepath_or_buffer(path_or_buf) - if isinstance(self.ioargs.filepath_or_buffer, str): - self.ioargs.filepath_or_buffer = open(path_or_buf, "rb") - self.ioargs.should_close = True + self.handles = get_handle(path_or_buf, "rb", is_text=False) - self._path_or_buf = cast(IO[Any], self.ioargs.filepath_or_buffer) + self._path_or_buf = cast(IO[Any], self.handles.handle) try: self._get_properties() @@ -198,7 +195,7 @@ def column_types(self): return np.asarray(self._column_types, dtype=np.dtype("S1")) def close(self): - self.ioargs.close() + self.handles.close() def _get_properties(self): diff --git a/pandas/io/sas/sas_xport.py b/pandas/io/sas/sas_xport.py index 4303cef2df60d..2f5de16a7ad6c 100644 --- a/pandas/io/sas/sas_xport.py +++ b/pandas/io/sas/sas_xport.py @@ -19,7 +19,7 @@ import pandas as pd -from pandas.io.common import get_filepath_or_buffer +from pandas.io.common import get_handle from pandas.io.sas.sasreader import ReaderBase _correct_line1 = ( @@ -253,13 +253,10 @@ def __init__( self._index = index self._chunksize = chunksize - self.ioargs = get_filepath_or_buffer(filepath_or_buffer, encoding=encoding) - - if isinstance(self.ioargs.filepath_or_buffer, str): - self.ioargs.filepath_or_buffer = open(self.ioargs.filepath_or_buffer, "rb") - self.ioargs.should_close = True - - self.filepath_or_buffer = cast(IO[bytes], self.ioargs.filepath_or_buffer) + self.handles = get_handle( + filepath_or_buffer, "rb", encoding=encoding, is_text=False + ) + self.filepath_or_buffer = cast(IO[bytes], self.handles.handle) try: self._read_header() @@ -268,7 +265,7 @@ def __init__( raise def close(self): - self.ioargs.close() + self.handles.close() def _get_row(self): return self.filepath_or_buffer.read(80).decode() diff --git a/pandas/io/sas/sasreader.py b/pandas/io/sas/sasreader.py index 446e2daaa1f9c..87832c10786f1 100644 --- a/pandas/io/sas/sasreader.py +++ b/pandas/io/sas/sasreader.py @@ -6,7 +6,7 @@ from pandas._typing import FilePathOrBuffer, Label -from pandas.io.common import get_filepath_or_buffer, stringify_path +from pandas.io.common import get_handle, stringify_path if TYPE_CHECKING: from pandas import DataFrame @@ -109,25 +109,25 @@ def read_sas( else: raise ValueError("unable to infer format of SAS file") - ioargs = get_filepath_or_buffer(filepath_or_buffer, encoding) + handles = get_handle(filepath_or_buffer, "rb", encoding=encoding, is_text=False) reader: ReaderBase if format.lower() == "xport": from pandas.io.sas.sas_xport import XportReader reader = XportReader( - ioargs.filepath_or_buffer, + handles.handle, index=index, - encoding=ioargs.encoding, + encoding=encoding, chunksize=chunksize, ) elif format.lower() == "sas7bdat": from pandas.io.sas.sas7bdat import SAS7BDATReader reader = SAS7BDATReader( - 
ioargs.filepath_or_buffer, + handles.handle, index=index, - encoding=ioargs.encoding, + encoding=encoding, chunksize=chunksize, ) else: @@ -139,4 +139,4 @@ def read_sas( try: return reader.read() finally: - ioargs.close() + handles.close() diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 7c7997f128086..839f28c4034df 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -53,12 +53,7 @@ from pandas.core.indexes.base import Index from pandas.core.series import Series -from pandas.io.common import ( - IOHandles, - get_filepath_or_buffer, - get_handle, - stringify_path, -) +from pandas.io.common import get_handle _version_error = ( "Version of given Stata file is {version}. pandas supports importing " @@ -1062,20 +1057,16 @@ def __init__( self._lines_read = 0 self._native_byteorder = _set_endianness(sys.byteorder) - self.ioargs = get_filepath_or_buffer( - path_or_buf, storage_options=storage_options + handles = get_handle( + path_or_buf, + "rb", + storage_options=storage_options, + is_text=False, ) - - if isinstance(self.ioargs.filepath_or_buffer, (str, bytes)): - self.ioargs.filepath_or_buffer = open(self.ioargs.filepath_or_buffer, "rb") - self.ioargs.should_close = True - elif hasattr(path_or_buf, "read"): - # Copy to BytesIO, and ensure no encoding - contents = self.ioargs.filepath_or_buffer.read() - self.ioargs.close() - self.ioargs.filepath_or_buffer = BytesIO(contents) # type: ignore[arg-type] - self.ioargs.should_close = True - self.path_or_buf = cast(BytesIO, self.ioargs.filepath_or_buffer) + # Copy to BytesIO, and ensure no encoding + contents = handles.handle.read() + handles.handle.close() + self.path_or_buf = BytesIO(contents) # type: ignore[arg-type] self._read_header() self._setup_dtype() @@ -1090,7 +1081,7 @@ def __exit__(self, exc_type, exc_value, traceback) -> None: def close(self) -> None: """ close the handle if its open """ - self.ioargs.close() + self.path_or_buf.close() def _set_encoding(self) -> None: """ @@ -1932,48 +1923,6 @@ def read_stata( return data -def _open_file_binary_write( - fname: FilePathOrBuffer, - compression: CompressionOptions, - storage_options: StorageOptions = None, -) -> Tuple[IOHandles, CompressionOptions]: - """ - Open a binary file or no-op if file-like. - - Parameters - ---------- - fname : string path, path object or buffer - The file name or buffer. - compression : {str, dict, None} - The compression method to use. - - storage_options : dict, optional - Extra options that make sense for a particular storage connection, e.g. - host, port, username, password, etc., if using a URL that will - be parsed by ``fsspec``, e.g., starting "s3://", "gcs://". An error - will be raised if providing this argument with a local path or - a file-like buffer. See the fsspec and backend storage implementation - docs for the set of allowed keys and values - - .. 
versionadded:: 1.2.0 - """ - ioargs = get_filepath_or_buffer( - fname, mode="wb", compression=compression, storage_options=storage_options - ) - handles = get_handle( - ioargs.filepath_or_buffer, - "wb", - compression=ioargs.compression, - is_text=False, - ) - if ioargs.filepath_or_buffer != fname and not isinstance( - ioargs.filepath_or_buffer, str - ): - # add handle created by get_filepath_or_buffer - handles.created_handles.append(ioargs.filepath_or_buffer) - return handles, ioargs.compression - - def _set_endianness(endianness: str) -> str: if endianness.lower() in ["<", "little"]: return "<" @@ -2231,7 +2180,7 @@ def __init__( if byteorder is None: byteorder = sys.byteorder self._byteorder = _set_endianness(byteorder) - self._fname = stringify_path(fname) + self._fname = fname self.type_converters = {253: np.int32, 252: np.int16, 251: np.int8} self._converted_names: Dict[Label, str] = {} @@ -2511,10 +2460,15 @@ def _encode_strings(self) -> None: self.data[col] = encoded def write_file(self) -> None: - self.handles, compression = _open_file_binary_write( - self._fname, self._compression, storage_options=self.storage_options + self.handles = get_handle( + self._fname, + "wb", + compression=self._compression, + is_text=False, + storage_options=self.storage_options, ) - if compression is not None: + + if self.handles.compression["method"] is not None: # ZipFile creates a file (with the same name) for each write call. # Write it first into a buffer and then write the buffer to the ZipFile. self._output_file = self.handles.handle diff --git a/pandas/tests/io/excel/test_writers.py b/pandas/tests/io/excel/test_writers.py index 0a297286aa208..00e41a19a7980 100644 --- a/pandas/tests/io/excel/test_writers.py +++ b/pandas/tests/io/excel/test_writers.py @@ -525,7 +525,7 @@ def test_sheets(self, frame, tsframe, path): writer = ExcelWriter(path) frame.to_excel(writer, "test1") tsframe.to_excel(writer, "test2") - writer.save() + writer.close() reader = ExcelFile(path) recons = pd.read_excel(reader, sheet_name="test1", index_col=0) tm.assert_frame_equal(frame, recons) diff --git a/pandas/tests/io/formats/test_to_csv.py b/pandas/tests/io/formats/test_to_csv.py index 3584ec047d4d2..a9673ded7c377 100644 --- a/pandas/tests/io/formats/test_to_csv.py +++ b/pandas/tests/io/formats/test_to_csv.py @@ -602,19 +602,22 @@ def test_to_csv_errors(self, errors): # No use in reading back the data as it is not the same anymore # due to the error handling - def test_to_csv_binary_handle(self): + @pytest.mark.parametrize("mode", ["wb", "w"]) + def test_to_csv_binary_handle(self, mode): """ - Binary file objects should work if 'mode' contains a 'b'. + Binary file objects should work (if 'mode' contains a 'b') or even without + it in most cases. GH 35058 and GH 19827 """ df = tm.makeDataFrame() with tm.ensure_clean() as path: with open(path, mode="w+b") as handle: - df.to_csv(handle, mode="w+b") + df.to_csv(handle, mode=mode) tm.assert_frame_equal(df, pd.read_csv(path, index_col=0)) - def test_to_csv_encoding_binary_handle(self): + @pytest.mark.parametrize("mode", ["wb", "w"]) + def test_to_csv_encoding_binary_handle(self, mode): """ Binary file objects should honor a specified encoding. 
@@ -626,14 +629,14 @@ def test_to_csv_encoding_binary_handle(self): df = pd.read_csv(buffer, encoding="utf-8-sig") buffer = io.BytesIO() - df.to_csv(buffer, mode="w+b", encoding="utf-8-sig", index=False) + df.to_csv(buffer, mode=mode, encoding="utf-8-sig", index=False) buffer.seek(0) # tests whether file handle wasn't closed assert buffer.getvalue().startswith(content) # example from GH 13068 with tm.ensure_clean() as path: with open(path, "w+b") as handle: - DataFrame().to_csv(handle, mode="w+b", encoding="utf-8-sig") + DataFrame().to_csv(handle, mode=mode, encoding="utf-8-sig") handle.seek(0) assert handle.read().startswith(b'\xef\xbb\xbf""') diff --git a/pandas/tests/io/parser/test_compression.py b/pandas/tests/io/parser/test_compression.py index b773664adda72..5680669f75aa3 100644 --- a/pandas/tests/io/parser/test_compression.py +++ b/pandas/tests/io/parser/test_compression.py @@ -75,7 +75,7 @@ def test_zip_error_invalid_zip(parser_and_data): parser, _, _ = parser_and_data with tm.ensure_clean() as path: - with open(path, "wb") as f: + with open(path, "rb") as f: with pytest.raises(zipfile.BadZipfile, match="File is not a zip file"): parser.read_csv(f, compression="zip") diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index 2a6f3d1ad9380..2fb923fc0e853 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -106,20 +106,21 @@ def test_infer_compression_from_path(self, extension, expected, path_type): assert compression == expected @pytest.mark.parametrize("path_type", [str, CustomFSPath, Path]) - def test_get_filepath_or_buffer_with_path(self, path_type): + def test_get_handle_with_path(self, path_type): # ignore LocalPath: it creates strange paths: /absolute/~/sometest filename = path_type("~/sometest") - ioargs = icom.get_filepath_or_buffer(filename) - assert ioargs.filepath_or_buffer != filename - assert os.path.isabs(ioargs.filepath_or_buffer) - assert os.path.expanduser(filename) == ioargs.filepath_or_buffer - assert not ioargs.should_close + handles = icom.get_handle(filename, "w") + assert os.path.isabs(handles.handle.name) + assert os.path.expanduser(filename) == handles.handle.name + handles.close() - def test_get_filepath_or_buffer_with_buffer(self): + def test_get_handle_with_buffer(self): input_buffer = StringIO() - ioargs = icom.get_filepath_or_buffer(input_buffer) - assert ioargs.filepath_or_buffer == input_buffer - assert not ioargs.should_close + handles = icom.get_handle(input_buffer, "r") + assert handles.handle == input_buffer + handles.close() + assert not handles.handle.closed + input_buffer.close() def test_iterator(self): reader = pd.read_csv(StringIO(self.data1), chunksize=1) diff --git a/pandas/tests/io/test_fsspec.py b/pandas/tests/io/test_fsspec.py index f8081a6a69e83..312ea5abdfe39 100644 --- a/pandas/tests/io/test_fsspec.py +++ b/pandas/tests/io/test_fsspec.py @@ -7,6 +7,7 @@ DataFrame, date_range, read_csv, + read_excel, read_feather, read_json, read_parquet, @@ -66,11 +67,53 @@ def test_reasonable_error(monkeypatch, cleared_fs): def test_to_csv(cleared_fs): df1.to_csv("memory://test/test.csv", index=True) + df2 = read_csv("memory://test/test.csv", parse_dates=["dt"], index_col=0) tm.assert_frame_equal(df1, df2) +@pytest.mark.parametrize("ext", ["xls", "xlsx"]) +def test_to_excel(cleared_fs, ext): + if ext == "xls": + pytest.importorskip("xlwt") + else: + pytest.importorskip("openpyxl") + + path = f"memory://test/test.{ext}" + df1.to_excel(path, index=True) + + df2 = read_excel(path, 
parse_dates=["dt"], index_col=0) + + tm.assert_frame_equal(df1, df2) + + +@pytest.mark.parametrize("binary_mode", [False, True]) +def test_to_csv_fsspec_object(cleared_fs, binary_mode): + fsspec = pytest.importorskip("fsspec") + + path = "memory://test/test.csv" + mode = "wb" if binary_mode else "w" + fsspec_object = fsspec.open(path, mode=mode).open() + + df1.to_csv(fsspec_object, index=True) + assert not fsspec_object.closed + fsspec_object.close() + + mode = mode.replace("w", "r") + fsspec_object = fsspec.open(path, mode=mode).open() + + df2 = read_csv( + fsspec_object, + parse_dates=["dt"], + index_col=0, + ) + assert not fsspec_object.closed + fsspec_object.close() + + tm.assert_frame_equal(df1, df2) + + def test_csv_options(fsspectest): df = DataFrame({"a": [0]}) df.to_csv( diff --git a/pandas/tests/io/test_gcs.py b/pandas/tests/io/test_gcs.py index 65e174cd32e22..10b3f7ce2cd0b 100644 --- a/pandas/tests/io/test_gcs.py +++ b/pandas/tests/io/test_gcs.py @@ -4,7 +4,7 @@ import numpy as np import pytest -from pandas import DataFrame, date_range, read_csv +from pandas import DataFrame, date_range, read_csv, read_excel, read_json, read_parquet import pandas._testing as tm from pandas.util import _test_decorators as td @@ -24,35 +24,23 @@ def open(*args, **kwargs): gcs_buffer.seek(0) return gcs_buffer + def ls(self, path, **kwargs): + # needed for pyarrow + return [{"name": path, "type": "file"}] + monkeypatch.setattr("gcsfs.GCSFileSystem", MockGCSFileSystem) return gcs_buffer @td.skip_if_no("gcsfs") -def test_read_csv_gcs(gcs_buffer): - from fsspec import registry - - registry.target.clear() # remove state - - df1 = DataFrame( - { - "int": [1, 3], - "float": [2.0, np.nan], - "str": ["t", "s"], - "dt": date_range("2018-06-18", periods=2), - } - ) - - gcs_buffer.write(df1.to_csv(index=False).encode()) - - df2 = read_csv("gs://test/test.csv", parse_dates=["dt"]) - - tm.assert_frame_equal(df1, df2) - +@pytest.mark.parametrize("format", ["csv", "json", "parquet", "excel", "markdown"]) +def test_to_read_gcs(gcs_buffer, format): + """ + Test that many to/read functions support GCS. 
-@td.skip_if_no("gcsfs") -def test_to_csv_gcs(gcs_buffer): + GH 33987 + """ from fsspec import registry registry.target.clear() # remove state @@ -66,9 +54,26 @@ def test_to_csv_gcs(gcs_buffer): } ) - df1.to_csv("gs://test/test.csv", index=True) - - df2 = read_csv("gs://test/test.csv", parse_dates=["dt"], index_col=0) + path = f"gs://test/test.{format}" + + if format == "csv": + df1.to_csv(path, index=True) + df2 = read_csv(path, parse_dates=["dt"], index_col=0) + elif format == "excel": + path = "gs://test/test.xls" + df1.to_excel(path) + df2 = read_excel(path, parse_dates=["dt"], index_col=0) + elif format == "json": + df1.to_json(path) + df2 = read_json(path, convert_dates=["dt"]) + elif format == "parquet": + pytest.importorskip("pyarrow") + df1.to_parquet(path) + df2 = read_parquet(path) + elif format == "markdown": + pytest.importorskip("tabulate") + df1.to_markdown(path) + df2 = df1 tm.assert_frame_equal(df1, df2) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 285601b37b80f..123e115cd2f2a 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -339,6 +339,17 @@ def check_error_on_write(self, df, engine, exc): with pytest.raises(exc): to_parquet(df, path, engine, compression=None) + @tm.network + def test_parquet_read_from_url(self, df_compat, engine): + if engine != "auto": + pytest.importorskip(engine) + url = ( + "https://raw.githubusercontent.com/pandas-dev/pandas/" + "master/pandas/tests/io/data/parquet/simple.parquet" + ) + df = pd.read_parquet(url) + tm.assert_frame_equal(df, df_compat) + class TestBasic(Base): def test_error(self, engine): @@ -653,16 +664,6 @@ def test_s3_roundtrip_for_dir( repeat=1, ) - @tm.network - @td.skip_if_no("pyarrow") - def test_parquet_read_from_url(self, df_compat): - url = ( - "https://raw.githubusercontent.com/pandas-dev/pandas/" - "master/pandas/tests/io/data/parquet/simple.parquet" - ) - df = pd.read_parquet(url) - tm.assert_frame_equal(df, df_compat) - @td.skip_if_no("pyarrow") def test_read_file_like_obj_support(self, df_compat): buffer = BytesIO() @@ -704,9 +705,7 @@ def test_partition_cols_string(self, pa, df_full): assert len(dataset.partitions.partition_names) == 1 assert dataset.partitions.partition_names == set(partition_cols_list) - @pytest.mark.parametrize( - "path_type", [lambda path: path, lambda path: pathlib.Path(path)] - ) + @pytest.mark.parametrize("path_type", [str, pathlib.Path]) def test_partition_cols_pathlib(self, pa, df_compat, path_type): # GH 35902 From 4f1fad83f29ac6dd0055221678a9b727050efa81 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Torsten=20W=C3=B6rtwein?= Date: Thu, 12 Nov 2020 06:30:12 -0500 Subject: [PATCH 2/2] make IOHandles a context manager --- pandas/core/frame.py | 8 +- pandas/io/common.py | 8 ++ pandas/io/feather_format.py | 23 +++--- pandas/io/formats/csvs.py | 8 +- pandas/io/json/_json.py | 7 +- pandas/io/orc.py | 10 +-- pandas/io/pickle.py | 54 ++++++------- pandas/io/sas/sasreader.py | 13 +--- pandas/io/stata.py | 90 +++++++++++----------- pandas/tests/frame/methods/test_to_csv.py | 10 +-- pandas/tests/io/test_common.py | 14 ++-- pandas/tests/io/test_compression.py | 23 +++--- pandas/tests/series/methods/test_to_csv.py | 8 +- 13 files changed, 127 insertions(+), 149 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 27650c2889090..bae06339a1e60 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2301,10 +2301,10 @@ def to_markdown( result = tabulate.tabulate(self, **kwargs) if buf is None: return 
result - handles = get_handle(buf, mode, storage_options=storage_options) - assert not isinstance(handles.handle, (str, mmap.mmap)) - handles.handle.writelines(result) - handles.close() + + with get_handle(buf, mode, storage_options=storage_options) as handles: + assert not isinstance(handles.handle, (str, mmap.mmap)) + handles.handle.writelines(result) return None @deprecate_kwarg(old_arg_name="fname", new_arg_name="path") diff --git a/pandas/io/common.py b/pandas/io/common.py index 739ee8076e29f..695c1671abd61 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -62,6 +62,8 @@ class IOHandles: """ Return value of io/common.py:get_handle + Can be used as a context manager. + This is used to easily close created buffers and to handle corner cases when TextIOWrapper is inserted. @@ -96,6 +98,12 @@ def close(self) -> None: self.created_handles = [] self.is_wrapped = False + def __enter__(self) -> "IOHandles": + return self + + def __exit__(self, *args: Any) -> None: + self.close() + def is_url(url) -> bool: """ diff --git a/pandas/io/feather_format.py b/pandas/io/feather_format.py index 0a72f750237a5..9e63976bf8cf9 100644 --- a/pandas/io/feather_format.py +++ b/pandas/io/feather_format.py @@ -41,8 +41,6 @@ def to_feather( import_optional_dependency("pyarrow") from pyarrow import feather - handles = get_handle(path, "wb", storage_options=storage_options, is_text=False) - if not isinstance(df, DataFrame): raise ValueError("feather only support IO with DataFrames") @@ -79,9 +77,10 @@ def to_feather( if df.columns.inferred_type not in valid_types: raise ValueError("feather must have string column names") - feather.write_feather(df, handles.handle, **kwargs) - - handles.close() + with get_handle( + path, "wb", storage_options=storage_options, is_text=False + ) as handles: + feather.write_feather(df, handles.handle, **kwargs) def read_feather( @@ -129,12 +128,10 @@ def read_feather( import_optional_dependency("pyarrow") from pyarrow import feather - handles = get_handle(path, "rb", storage_options=storage_options, is_text=False) - - df = feather.read_feather( - handles.handle, columns=columns, use_threads=bool(use_threads) - ) + with get_handle( + path, "rb", storage_options=storage_options, is_text=False + ) as handles: - handles.close() - - return df + return feather.read_feather( + handles.handle, columns=columns, use_threads=bool(use_threads) + ) diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index db428ce61ee39..cbe2ed1ed838d 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -225,16 +225,15 @@ def save(self) -> None: Create the writer & save. 
""" # apply compression and byte/text conversion - handles = get_handle( + with get_handle( self.filepath_or_buffer, self.mode, encoding=self.encoding, errors=self.errors, compression=self.compression, storage_options=self.storage_options, - ) + ) as handles: - try: # Note: self.encoding is irrelevant here self.writer = csvlib.writer( handles.handle, # type: ignore[arg-type] @@ -248,9 +247,6 @@ def save(self) -> None: self._save() - finally: - handles.close() - def _save(self) -> None: if self._need_to_save_header: self._save_header() diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index b9b1535129db5..f30007f6ed907 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -98,13 +98,10 @@ def to_json( if path_or_buf is not None: # apply compression and byte/text conversion - handles = get_handle( + with get_handle( path_or_buf, "wt", compression=compression, storage_options=storage_options - ) - try: + ) as handles: handles.handle.write(s) - finally: - handles.close() else: return s diff --git a/pandas/io/orc.py b/pandas/io/orc.py index 750728b36bae5..d9e9f3e1770be 100644 --- a/pandas/io/orc.py +++ b/pandas/io/orc.py @@ -48,10 +48,6 @@ def read_orc( if distutils.version.LooseVersion(pyarrow.__version__) < "0.13.0": raise ImportError("pyarrow must be >= 0.13.0 for read_orc") - import pyarrow.orc - - handles = get_handle(path, "rb", is_text=False) - orc_file = pyarrow.orc.ORCFile(handles.handle) - result = orc_file.read(columns=columns, **kwargs).to_pandas() - handles.close() - return result + with get_handle(path, "rb", is_text=False) as handles: + orc_file = pyarrow.orc.ORCFile(handles.handle) + return orc_file.read(columns=columns, **kwargs).to_pandas() diff --git a/pandas/io/pickle.py b/pandas/io/pickle.py index 840ac0360658b..7d09029aded1b 100644 --- a/pandas/io/pickle.py +++ b/pandas/io/pickle.py @@ -86,19 +86,17 @@ def to_pickle( >>> import os >>> os.remove("./dummy.pkl") """ - handles = get_handle( + if protocol < 0: + protocol = pickle.HIGHEST_PROTOCOL + + with get_handle( filepath_or_buffer, "wb", compression=compression, is_text=False, storage_options=storage_options, - ) - if protocol < 0: - protocol = pickle.HIGHEST_PROTOCOL - try: + ) as handles: pickle.dump(obj, handles.handle, protocol=protocol) # type: ignore[arg-type] - finally: - handles.close() def read_pickle( @@ -178,33 +176,31 @@ def read_pickle( >>> import os >>> os.remove("./dummy.pkl") """ - handles = get_handle( + excs_to_catch = (AttributeError, ImportError, ModuleNotFoundError, TypeError) + with get_handle( filepath_or_buffer, "rb", compression=compression, is_text=False, storage_options=storage_options, - ) + ) as handles: - # 1) try standard library Pickle - # 2) try pickle_compat (older pandas version) to handle subclass changes - # 3) try pickle_compat with latin-1 encoding upon a UnicodeDecodeError + # 1) try standard library Pickle + # 2) try pickle_compat (older pandas version) to handle subclass changes + # 3) try pickle_compat with latin-1 encoding upon a UnicodeDecodeError - try: - excs_to_catch = (AttributeError, ImportError, ModuleNotFoundError, TypeError) - # TypeError for Cython complaints about object.__new__ vs Tick.__new__ try: - with warnings.catch_warnings(record=True): - # We want to silence any warnings about, e.g. moved modules. - warnings.simplefilter("ignore", Warning) - return pickle.load(handles.handle) # type: ignore[arg-type] - except excs_to_catch: - # e.g. 
- # "No module named 'pandas.core.sparse.series'" - # "Can't get attribute '__nat_unpickle' on None: self.data[col] = encoded def write_file(self) -> None: - self.handles = get_handle( + with get_handle( self._fname, "wb", compression=self._compression, is_text=False, storage_options=self.storage_options, - ) + ) as self.handles: - if self.handles.compression["method"] is not None: - # ZipFile creates a file (with the same name) for each write call. - # Write it first into a buffer and then write the buffer to the ZipFile. - self._output_file = self.handles.handle - self.handles.handle = BytesIO() - try: - self._write_header(data_label=self._data_label, time_stamp=self._time_stamp) - self._write_map() - self._write_variable_types() - self._write_varnames() - self._write_sortlist() - self._write_formats() - self._write_value_label_names() - self._write_variable_labels() - self._write_expansion_fields() - self._write_characteristics() - records = self._prepare_data() - self._write_data(records) - self._write_strls() - self._write_value_labels() - self._write_file_close_tag() - self._write_map() - except Exception as exc: - self._close() - if isinstance(self._fname, (str, Path)): - try: - os.unlink(self._fname) - except OSError: - warnings.warn( - f"This save was not successful but {self._fname} could not " - "be deleted. This file is not valid.", - ResourceWarning, - ) - raise exc - else: - self._close() + if self.handles.compression["method"] is not None: + # ZipFile creates a file (with the same name) for each write call. + # Write it first into a buffer and then write the buffer to the ZipFile. + self._output_file = self.handles.handle + self.handles.handle = BytesIO() + + try: + self._write_header( + data_label=self._data_label, time_stamp=self._time_stamp + ) + self._write_map() + self._write_variable_types() + self._write_varnames() + self._write_sortlist() + self._write_formats() + self._write_value_label_names() + self._write_variable_labels() + self._write_expansion_fields() + self._write_characteristics() + records = self._prepare_data() + self._write_data(records) + self._write_strls() + self._write_value_labels() + self._write_file_close_tag() + self._write_map() + except Exception as exc: + self._close() + if isinstance(self._fname, (str, Path)): + try: + os.unlink(self._fname) + except OSError: + warnings.warn( + f"This save was not successful but {self._fname} could not " + "be deleted. 
This file is not valid.", + ResourceWarning, + ) + raise exc + else: + self._close() def _close(self) -> None: """ @@ -2520,8 +2522,6 @@ def _close(self) -> None: self.handles.handle = self._output_file self.handles.handle.write(bio.read()) # type: ignore[arg-type] bio.close() - # close any created handles - self.handles.close() def _write_map(self) -> None: """No-op, future compatibility""" diff --git a/pandas/tests/frame/methods/test_to_csv.py b/pandas/tests/frame/methods/test_to_csv.py index 3103f6e1ba0b1..7babc6853aef3 100644 --- a/pandas/tests/frame/methods/test_to_csv.py +++ b/pandas/tests/frame/methods/test_to_csv.py @@ -1034,12 +1034,12 @@ def test_to_csv_compression(self, df, encoding, compression): tm.assert_frame_equal(df, result) # test the round trip using file handle - to_csv -> read_csv - handles = get_handle( + with get_handle( filename, "w", compression=compression, encoding=encoding - ) - df.to_csv(handles.handle, encoding=encoding) - assert not handles.handle.closed - handles.close() + ) as handles: + df.to_csv(handles.handle, encoding=encoding) + assert not handles.handle.closed + result = pd.read_csv( filename, compression=compression, diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index 2fb923fc0e853..c7a7101b5fe17 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -109,17 +109,15 @@ def test_infer_compression_from_path(self, extension, expected, path_type): def test_get_handle_with_path(self, path_type): # ignore LocalPath: it creates strange paths: /absolute/~/sometest filename = path_type("~/sometest") - handles = icom.get_handle(filename, "w") - assert os.path.isabs(handles.handle.name) - assert os.path.expanduser(filename) == handles.handle.name - handles.close() + with icom.get_handle(filename, "w") as handles: + assert os.path.isabs(handles.handle.name) + assert os.path.expanduser(filename) == handles.handle.name def test_get_handle_with_buffer(self): input_buffer = StringIO() - handles = icom.get_handle(input_buffer, "r") - assert handles.handle == input_buffer - handles.close() - assert not handles.handle.closed + with icom.get_handle(input_buffer, "r") as handles: + assert handles.handle == input_buffer + assert not input_buffer.closed input_buffer.close() def test_iterator(self): diff --git a/pandas/tests/io/test_compression.py b/pandas/tests/io/test_compression.py index 43a31ff1e4b58..158504082e657 100644 --- a/pandas/tests/io/test_compression.py +++ b/pandas/tests/io/test_compression.py @@ -47,18 +47,14 @@ def test_compression_size(obj, method, compression_only): @pytest.mark.parametrize("method", ["to_csv", "to_json"]) def test_compression_size_fh(obj, method, compression_only): with tm.ensure_clean() as path: - handles = icom.get_handle(path, "w", compression=compression_only) - getattr(obj, method)(handles.handle) - assert not handles.handle.closed - handles.close() - assert handles.handle.closed + with icom.get_handle(path, "w", compression=compression_only) as handles: + getattr(obj, method)(handles.handle) + assert not handles.handle.closed compressed_size = os.path.getsize(path) with tm.ensure_clean() as path: - handles = icom.get_handle(path, "w", compression=None) - getattr(obj, method)(handles.handle) - assert not handles.handle.closed - handles.close() - assert handles.handle.closed + with icom.get_handle(path, "w", compression=None) as handles: + getattr(obj, method)(handles.handle) + assert not handles.handle.closed uncompressed_size = os.path.getsize(path) assert 
uncompressed_size > compressed_size @@ -111,10 +107,9 @@ def test_compression_warning(compression_only): columns=["X", "Y", "Z"], ) with tm.ensure_clean() as path: - handles = icom.get_handle(path, "w", compression=compression_only) - with tm.assert_produces_warning(RuntimeWarning, check_stacklevel=False): - df.to_csv(handles.handle, compression=compression_only) - handles.close() + with icom.get_handle(path, "w", compression=compression_only) as handles: + with tm.assert_produces_warning(RuntimeWarning, check_stacklevel=False): + df.to_csv(handles.handle, compression=compression_only) def test_compression_binary(compression_only): diff --git a/pandas/tests/series/methods/test_to_csv.py b/pandas/tests/series/methods/test_to_csv.py index 714173158f4d6..72db87362584d 100644 --- a/pandas/tests/series/methods/test_to_csv.py +++ b/pandas/tests/series/methods/test_to_csv.py @@ -143,11 +143,11 @@ def test_to_csv_compression(self, s, encoding, compression): tm.assert_series_equal(s, result) # test the round trip using file handle - to_csv -> read_csv - handles = get_handle( + with get_handle( filename, "w", compression=compression, encoding=encoding - ) - s.to_csv(handles.handle, encoding=encoding, header=True) - handles.close() + ) as handles: + s.to_csv(handles.handle, encoding=encoding, header=True) + result = pd.read_csv( filename, compression=compression,