-
-
Notifications
You must be signed in to change notification settings - Fork 18.5k
ENH: Incorporate ArrowDtype into ArrowExtensionArray #47034
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
9053263
088f72e
aee3dc8
aa13af8
d521264
ce05407
bf0365b
01e4a4b
cc1c687
97967a5
f2d872d
a77ea6b
26e8998
a157e51
baeae04
c33c345
901e9b0
b3f6d93
80059d5
5c873d5
1160bff
68bb030
9fd9161
939e751
1a5d3ff
01ca1c7
f2dda8c
26b2f1c
95bd38f
a455b50
8d6ebb5
b6972a5
0024d9e
a18fd6f
f6b779d
9edb6a4
d074188
f8983ad
1b6fe93
eedffc2
c69d70e
245fbe6
91aaaab
1a44a6d
86e178c
4129e37
c5d029f
4743781
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -9,6 +9,8 @@ | |
import numpy as np | ||
|
||
from pandas._typing import ( | ||
Dtype, | ||
PositionalIndexer, | ||
TakeIndexer, | ||
npt, | ||
) | ||
|
@@ -24,13 +26,15 @@ | |
is_array_like, | ||
is_bool_dtype, | ||
is_integer, | ||
is_integer_dtype, | ||
is_scalar, | ||
) | ||
from pandas.core.dtypes.missing import isna | ||
|
||
from pandas.core.arrays.base import ExtensionArray | ||
from pandas.core.indexers import ( | ||
check_array_indexer, | ||
unpack_tuple_and_ellipses, | ||
validate_indices, | ||
) | ||
|
||
|
@@ -39,6 +43,7 @@ | |
import pyarrow.compute as pc | ||
|
||
from pandas.core.arrays.arrow._arrow_utils import fallback_performancewarning | ||
from pandas.core.arrays.arrow.dtype import ArrowDtype | ||
|
||
if TYPE_CHECKING: | ||
from pandas import Series | ||
|
@@ -48,16 +53,130 @@ | |
|
||
class ArrowExtensionArray(ExtensionArray): | ||
""" | ||
Base class for ExtensionArray backed by Arrow array. | ||
Base class for ExtensionArray backed by Arrow ChunkedArray. | ||
""" | ||
|
||
_data: pa.ChunkedArray | ||
|
||
def __init__(self, values: pa.Array | pa.ChunkedArray) -> None:
    """
    Store a pyarrow array as the backing data of this ExtensionArray.

    Parameters
    ----------
    values : pa.Array or pa.ChunkedArray
        A ``pa.Array`` is wrapped in a single-chunk ``pa.ChunkedArray``;
        a ``pa.ChunkedArray`` is stored as passed.

    Raises
    ------
    ImportError
        If ``pa_version_under1p01`` is true.
    ValueError
        If ``values`` is neither a ``pa.Array`` nor a ``pa.ChunkedArray``.
    """
    if pa_version_under1p01:
        msg = "pyarrow>=1.0.0 is required for PyArrow backed ArrowExtensionArray."
        raise ImportError(msg)
    if isinstance(values, pa.Array):
        # Normalise to a ChunkedArray so ``_data`` always has one container type.
        self._data = pa.chunked_array([values])
    elif isinstance(values, pa.ChunkedArray):
        self._data = values
    else:
        raise ValueError(
            f"Unsupported type '{type(values)}' for ArrowExtensionArray"
        )
    # The dtype is derived from the pyarrow type of the stored data.
    self._dtype = ArrowDtype(self._data.type)
|
||
@classmethod
def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy=False):
    """
    Construct a new ExtensionArray from a sequence of scalars.

    Parameters
    ----------
    scalars : sequence, ArrowExtensionArray, pa.Array or pa.ChunkedArray
        Values to store. Instances of ``cls`` and pyarrow arrays are reused
        (and cast when ``dtype`` is given) instead of being re-parsed.
    dtype : ArrowDtype, numpy-compatible dtype, or None
        Target type. An ``ArrowDtype`` supplies its ``pyarrow_dtype``; any
        other non-None value is converted with ``pa.from_numpy_dtype``.
        ``None`` leaves type selection to pyarrow.
    copy : bool, default False
        Accepted for interface compatibility; not used here.

    Returns
    -------
    ArrowExtensionArray
    """
    if isinstance(dtype, ArrowDtype):
        pa_dtype = dtype.pyarrow_dtype
    elif dtype is not None:
        # Compare against None rather than relying on truthiness: an
        # unstructured numpy dtype has len() == 0 and would otherwise be
        # treated as "no dtype given".
        pa_dtype = pa.from_numpy_dtype(dtype)
    else:
        pa_dtype = None

    if isinstance(scalars, cls):
        scalars = scalars._data

    # NOTE(review): a reviewer on this change reported an ArrowInvalid
    # exception when a pyarrow array was passed in (related: #48238); the
    # exact failing call is not shown. pyarrow containers are therefore
    # handled here without going through ``pa.array`` -- confirm against
    # that issue.
    if isinstance(scalars, pa.Array):
        scalars = pa.chunked_array([scalars])
    if isinstance(scalars, pa.ChunkedArray):
        if pa_dtype is not None:
            scalars = scalars.cast(pa_dtype)
        return cls(scalars)

    converted = pa.array(scalars, type=pa_dtype, from_pandas=True)
    return cls(pa.chunked_array([converted]))
|
||
@classmethod
def _from_sequence_of_strings(
    cls, strings, *, dtype: Dtype | None = None, copy=False
):
    """
    Construct a new ExtensionArray from a sequence of strings.

    This is a thin wrapper: ``strings``, ``dtype`` and ``copy`` are passed
    unchanged to ``cls._from_sequence`` and its result is returned.
    """
    return cls._from_sequence(strings, dtype=dtype, copy=copy)
|
||
def __getitem__(self, item: PositionalIndexer):
    """
    Select a subset of self.

    Parameters
    ----------
    item : int, slice, or ndarray
        * int: the position in ``self`` to get.
        * slice: ``start``, ``stop`` and ``step`` are integers or None.
        * ndarray: integer positions, or a boolean mask.

    Returns
    -------
    scalar or ExtensionArray
        A Python scalar (``self._dtype.na_value`` for a null) when the
        pyarrow lookup yields a single value; otherwise a new instance of
        ``type(self)``.

    Raises
    ------
    IndexError
        For an ndarray that is neither integer nor boolean typed, or for
        a scalar that is not an integer.

    Notes
    -----
    Integer arrays are routed through ``self.take`` and boolean arrays
    through ``ChunkedArray.filter``. Slices and integers are passed to the
    underlying pyarrow data directly.
    """
    item = check_array_indexer(self, item)
    result_type = type(self)

    if isinstance(item, np.ndarray):
        if len(item) == 0:
            # Removable once we migrate StringDtype[pyarrow] to ArrowDtype[string]
            is_pyarrow_string = (
                self._dtype.name == "string" and self._dtype.storage == "pyarrow"
            )
            pa_dtype = (
                pa.string() if is_pyarrow_string else self._dtype.pyarrow_dtype
            )
            return result_type(pa.chunked_array([], type=pa_dtype))
        if is_integer_dtype(item.dtype):
            return self.take(item)
        if is_bool_dtype(item.dtype):
            return result_type(self._data.filter(item))
        raise IndexError(
            "Only integers, slices and integer or "
            "boolean arrays are valid indices."
        )

    if isinstance(item, tuple):
        item = unpack_tuple_and_ellipses(item)

    # error: Non-overlapping identity check (left operand type:
    # "Union[Union[int, integer[Any]], Union[slice, List[int],
    # ndarray[Any, Any]]]", right operand type: "ellipsis")
    if item is Ellipsis:  # type: ignore[comparison-overlap]
        # TODO: should be handled by pyarrow?
        item = slice(None)

    if is_scalar(item) and not is_integer(item):
        # e.g. "foo" or 2.5
        # exception message copied from numpy
        raise IndexError(
            r"only integers, slices (`:`), ellipsis (`...`), numpy.newaxis "
            r"(`None`) and integer or boolean arrays are valid indices"
        )

    # Not an array indexer at this point (e.g. a slice or an integer), so
    # the lookup is dispatched to pyarrow.
    value = self._data[item]
    if isinstance(value, pa.ChunkedArray):
        return result_type(value)

    scalar = value.as_py()
    return self._dtype.na_value if scalar is None else scalar
|
||
def __arrow_array__(self, type=None): | ||
"""Convert myself to a pyarrow Array or ChunkedArray.""" | ||
"""Convert myself to a pyarrow ChunkedArray.""" | ||
return self._data | ||
|
||
def equals(self, other) -> bool: | ||
|
@@ -67,6 +186,13 @@ def equals(self, other) -> bool: | |
# TODO: is this documented somewhere? | ||
return self._data == other._data | ||
|
||
@property
def dtype(self) -> ArrowDtype:
    """
    An instance of 'ExtensionDtype'.

    Read-only accessor for the ``_dtype`` attribute of this array.
    """
    return self._dtype
|
||
@property | ||
def nbytes(self) -> int: | ||
""" | ||
|
@@ -377,7 +503,8 @@ def _indexing_key_to_indices( | |
|
||
def _maybe_convert_setitem_value(self, value): | ||
"""Maybe convert value to be pyarrow compatible.""" | ||
raise NotImplementedError() | ||
# TODO: Make more robust like ArrowStringArray._maybe_convert_setitem_value | ||
return value | ||
|
||
def _set_via_chunk_iteration( | ||
self, indices: npt.NDArray[np.intp], value: npt.NDArray[Any] | ||
|
Uh oh!
There was an error while loading. Please reload this page.