-
-
Notifications
You must be signed in to change notification settings - Fork 18.5k
ENH: Add BytesIOWrapper #42669
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
ENH: Add BytesIOWrapper #42669
Changes from all commits
2a16522
43252cc
4e998a6
392c48b
0a58afc
8e68d5e
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -6,11 +6,13 @@ | |
from collections import abc | ||
import dataclasses | ||
import gzip | ||
import io | ||
from io import ( | ||
BufferedIOBase, | ||
BytesIO, | ||
RawIOBase, | ||
StringIO, | ||
TextIOBase, | ||
TextIOWrapper, | ||
) | ||
import mmap | ||
|
@@ -50,7 +52,6 @@ | |
|
||
lzma = import_lzma() | ||
|
||
|
||
_VALID_URLS = set(uses_relative + uses_netloc + uses_params) | ||
_VALID_URLS.discard("") | ||
|
||
|
@@ -102,7 +103,7 @@ def close(self) -> None: | |
avoid closing the potentially user-created buffer. | ||
""" | ||
if self.is_wrapped: | ||
assert isinstance(self.handle, TextIOWrapper) | ||
assert isinstance(self.handle, (TextIOWrapper, BytesIOWrapper)) | ||
self.handle.flush() | ||
self.handle.detach() | ||
self.created_handles.remove(self.handle) | ||
|
@@ -712,7 +713,16 @@ def get_handle( | |
|
||
# Convert BytesIO or file objects passed with an encoding | ||
is_wrapped = False | ||
if is_text and (compression or _is_binary_mode(handle, ioargs.mode)): | ||
if not is_text and ioargs.mode == "rb" and isinstance(handle, TextIOBase): | ||
handle = BytesIOWrapper( | ||
handle, | ||
encoding=ioargs.encoding, | ||
) | ||
handles.append(handle) | ||
# the (text) handle is always provided by the caller | ||
# since get_handle would have opened it in binary mode | ||
is_wrapped = True | ||
elif is_text and (compression or _is_binary_mode(handle, ioargs.mode)): | ||
handle = TextIOWrapper( | ||
# error: Argument 1 to "TextIOWrapper" has incompatible type | ||
# "Union[IO[bytes], IO[Any], RawIOBase, BufferedIOBase, TextIOBase, mmap]"; | ||
|
@@ -878,6 +888,46 @@ def __next__(self) -> str: | |
return newline.lstrip("\n") | ||
|
||
|
||
# Wrapper that wraps a StringIO buffer and reads bytes from it | ||
# Created for compat with pyarrow read_csv | ||
class BytesIOWrapper(io.BytesIO):
    """
    Read-only adapter that exposes a text buffer as a stream of encoded bytes.

    Created for compatibility with pyarrow's ``read_csv``, which needs a
    bytes-producing handle.

    Parameters
    ----------
    buffer : StringIO or TextIOBase
        Text buffer to read characters from.
    encoding : str, default "utf-8"
        Codec used to turn the characters into bytes.

    Notes
    -----
    Only ``read`` and ``detach`` are adapted. Attributes that ``io.BytesIO``
    does not define are forwarded to the wrapped buffer through
    ``__getattr__``; attributes that ``io.BytesIO`` does define (for example
    ``readline`` or ``getvalue``) resolve to the empty base-class buffer, not
    to the wrapped text buffer.
    """

    # Review note: the name deliberately has no leading underscore. This
    # module is de-facto private and TextIOWrapper is already exposed from it
    # in the same way.

    buffer: StringIO | TextIOBase | None

    def __init__(
        self, buffer: StringIO | TextIOBase, encoding: str = "utf-8"
    ) -> None:
        # Local import keeps this block self-contained with respect to the
        # module's import section.
        import codecs

        super().__init__()
        self.buffer = buffer
        self.encoding = encoding
        # An incremental encoder keeps codec state between chunks, so
        # encodings that emit a BOM (e.g. "utf-16") write it once for the
        # whole stream instead of once per read() call.
        self._encoder = codecs.getincrementalencoder(encoding)()
        # Because a character can be represented by more than 1 byte,
        # it is possible that reading will produce more bytes than n.
        # We store the extra bytes in this overflow variable, and prepend the
        # overflow to the bytestring the next time reading is performed.
        self.overflow = b""

    def __getattr__(self, attr: str):
        # __getattr__ only runs when normal lookup fails. If ``buffer`` itself
        # is missing (instance created without __init__), delegating would
        # re-enter this method forever, so fail with a plain AttributeError.
        if attr == "buffer":
            raise AttributeError(attr)
        return getattr(self.buffer, attr)

    def read(self, n: int | None = -1) -> bytes:
        """
        Read up to ``n`` encoded bytes from the wrapped text buffer.

        Parameters
        ----------
        n : int or None, default -1
            Maximum number of bytes to return. ``None`` or a negative value
            reads everything that is left.

        Returns
        -------
        bytes
            Exactly ``n`` bytes unless the underlying buffer is exhausted, in
            which case the remaining bytes (possibly ``b""``) are returned.

        Raises
        ------
        ValueError
            If the underlying buffer has been detached.
        """
        if self.buffer is None:
            raise ValueError("underlying buffer has been detached")

        if n is None or n < 0:
            # Read entire file/rest of file.
            everything = self.overflow + self._encoder.encode(self.buffer.read(n))
            self.overflow = b""
            return everything

        pending = self.overflow
        if len(pending) < n:
            # Every character encodes to at least one byte, so asking for the
            # missing count in characters yields at least the missing bytes
            # (unless the buffer runs out). Reading only the shortfall keeps
            # ``overflow`` bounded instead of growing with every call.
            pending += self._encoder.encode(self.buffer.read(n - len(pending)))
        self.overflow = pending[n:]
        return pending[:n]

    def detach(self):
        """
        Separate the wrapped text buffer from this wrapper and return it.

        Slightly modified from Python's TextIOWrapper detach method.

        Raises
        ------
        ValueError
            If the buffer has already been detached.
        """
        if self.buffer is None:
            raise ValueError("buffer is already detached")
        self.flush()
        buffer = self.buffer
        self.buffer = None
        return buffer
|
||
|
||
def _maybe_memory_map( | ||
handle: FileOrBuffer, | ||
memory_map: bool, | ||
|
Uh oh!
There was an error while loading. Please reload this page.