From 923ee579c22ba22567dae8bd59bdf8c3325132a3 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 4 May 2021 14:48:18 +0200 Subject: [PATCH 1/4] [ArrowStringDtype] Make it already a StringDtype subclass --- pandas/core/arrays/string_arrow.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 72a2ab8a1b80a..524066c3c6e79 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -25,7 +25,6 @@ from pandas.util._decorators import doc from pandas.util._validators import validate_fillna_kwargs -from pandas.core.dtypes.base import ExtensionDtype from pandas.core.dtypes.common import ( is_array_like, is_bool_dtype, @@ -42,6 +41,7 @@ from pandas.core.arraylike import OpsMixin from pandas.core.arrays.base import ExtensionArray from pandas.core.arrays.boolean import BooleanDtype +from pandas.core.arrays.string_ import StringDtype from pandas.core.indexers import ( check_array_indexer, validate_indices, @@ -74,7 +74,7 @@ @register_extension_dtype -class ArrowStringDtype(ExtensionDtype): +class ArrowStringDtype(StringDtype): """ Extension dtype for string data in a ``pyarrow.ChunkedArray``. From be2913e59d6e30e0a6adbaae5a0ff31a3b88668e Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 4 May 2021 19:41:24 +0200 Subject: [PATCH 2/4] clean-up isinstance checks --- pandas/core/arrays/base.py | 6 ++---- pandas/core/arrays/interval.py | 3 +-- pandas/core/strings/accessor.py | 6 ++---- pandas/tests/extension/json/array.py | 3 +-- 4 files changed, 6 insertions(+), 12 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index bd01191719143..2cb30c53b6832 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -530,7 +530,6 @@ def astype(self, dtype, copy=True): NumPy ndarray with 'dtype' for its dtype. """ from pandas.core.arrays.string_ import StringDtype - from pandas.core.arrays.string_arrow import ArrowStringDtype dtype = pandas_dtype(dtype) if is_dtype_equal(dtype, self.dtype): @@ -540,9 +539,8 @@ def astype(self, dtype, copy=True): return self.copy() # FIXME: Really hard-code here? - if isinstance( - dtype, (ArrowStringDtype, StringDtype) - ): # allow conversion to StringArrays + if isinstance(dtype, StringDtype): + # allow conversion to StringArrays return dtype.construct_array_type()._from_sequence(self, copy=False) return np.array(self, dtype=dtype, copy=copy) diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index 95c95d98bc968..a99bf245a6073 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -829,7 +829,6 @@ def astype(self, dtype, copy: bool = True): """ from pandas import Index from pandas.core.arrays.string_ import StringDtype - from pandas.core.arrays.string_arrow import ArrowStringDtype if dtype is not None: dtype = pandas_dtype(dtype) @@ -852,7 +851,7 @@ def astype(self, dtype, copy: bool = True): return self._shallow_copy(new_left, new_right) elif is_categorical_dtype(dtype): return Categorical(np.asarray(self), dtype=dtype) - elif isinstance(dtype, (StringDtype, ArrowStringDtype)): + elif isinstance(dtype, StringDtype): return dtype.construct_array_type()._from_sequence(self, copy=False) # TODO: This try/except will be repeated. diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index f7fa32076ec86..f8df05a7022d1 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -155,11 +155,10 @@ class StringMethods(NoNewAttributesMixin): def __init__(self, data): from pandas.core.arrays.string_ import StringDtype - from pandas.core.arrays.string_arrow import ArrowStringDtype self._inferred_dtype = self._validate(data) self._is_categorical = is_categorical_dtype(data.dtype) - self._is_string = isinstance(data.dtype, (StringDtype, ArrowStringDtype)) + self._is_string = isinstance(data.dtype, StringDtype) self._data = data self._index = self._name = None @@ -3028,9 +3027,8 @@ def _result_dtype(arr): # ideally we just pass `dtype=arr.dtype` unconditionally, but this fails # when the list of values is empty. from pandas.core.arrays.string_ import StringDtype - from pandas.core.arrays.string_arrow import ArrowStringDtype - if isinstance(arr.dtype, (StringDtype, ArrowStringDtype)): + if isinstance(arr.dtype, StringDtype): return arr.dtype.name else: return object diff --git a/pandas/tests/extension/json/array.py b/pandas/tests/extension/json/array.py index ffe2769730f34..2eef828288e59 100644 --- a/pandas/tests/extension/json/array.py +++ b/pandas/tests/extension/json/array.py @@ -40,7 +40,6 @@ ExtensionDtype, ) from pandas.api.types import is_bool_dtype -from pandas.core.arrays.string_arrow import ArrowStringDtype class JSONDtype(ExtensionDtype): @@ -196,7 +195,7 @@ def astype(self, dtype, copy=True): if copy: return self.copy() return self - elif isinstance(dtype, (StringDtype, ArrowStringDtype)): + elif isinstance(dtype, StringDtype): value = self.astype(str) # numpy doesn'y like nested dicts return dtype.construct_array_type()._from_sequence(value, copy=False) From ad80a00175b4f796cf5436b3b8420cf814e24966 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 4 May 2021 20:26:29 +0200 Subject: [PATCH 3/4] add overrides --- pandas/core/arrays/string_arrow.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 8e1440e9c61e3..4465591f8570e 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -110,7 +110,7 @@ def type(self) -> type[str]: return str @classmethod - def construct_array_type(cls) -> type_t[ArrowStringArray]: + def construct_array_type(cls) -> type_t[ArrowStringArray]: # type: ignore[override] """ Return the array type associated with this dtype. @@ -126,7 +126,9 @@ def __hash__(self) -> int: def __repr__(self) -> str: return "ArrowStringDtype" - def __from_arrow__(self, array: pa.Array | pa.ChunkedArray) -> ArrowStringArray: + def __from_arrow__( + self, array: pa.Array | pa.ChunkedArray + ) -> ArrowStringArray: # type: ignore[override] """ Construct StringArray from pyarrow Array/ChunkedArray. """ From 29dd9c99e46a88a1e08087b939759f938fa5c651 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 4 May 2021 20:46:57 +0200 Subject: [PATCH 4/4] move comment --- pandas/core/arrays/string_arrow.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 4465591f8570e..6f23457c04dd4 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -126,9 +126,9 @@ def __hash__(self) -> int: def __repr__(self) -> str: return "ArrowStringDtype" - def __from_arrow__( + def __from_arrow__( # type: ignore[override] self, array: pa.Array | pa.ChunkedArray - ) -> ArrowStringArray: # type: ignore[override] + ) -> ArrowStringArray: """ Construct StringArray from pyarrow Array/ChunkedArray. """