diff --git a/doc/source/whatsnew/index.rst b/doc/source/whatsnew/index.rst
index 09d76d71c6e1b..ae96d0f8296f2 100644
--- a/doc/source/whatsnew/index.rst
+++ b/doc/source/whatsnew/index.rst
@@ -10,6 +10,14 @@ This is the list of changes to pandas between each release. For full details,
see the `commit logs <https://github.com/pandas-dev/pandas/commits/>`_. For install and
upgrade instructions, see :ref:`install`.
+Version 2.3
+-----------
+
+.. toctree::
+ :maxdepth: 2
+
+ v2.3.0
+
Version 2.2
-----------
diff --git a/doc/source/whatsnew/v2.3.0.rst b/doc/source/whatsnew/v2.3.0.rst
new file mode 100644
index 0000000000000..01c2ed3821d7a
--- /dev/null
+++ b/doc/source/whatsnew/v2.3.0.rst
@@ -0,0 +1,180 @@
+.. _whatsnew_230:
+
+What's new in 2.3.0 (Month XX, 2024)
+------------------------------------
+
+These are the changes in pandas 2.3.0. See :ref:`release` for a full changelog
+including other versions of pandas.
+
+{{ header }}
+
+.. ---------------------------------------------------------------------------
+
+.. _whatsnew_230.upcoming_changes:
+
+Upcoming changes in pandas 3.0
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+
+.. _whatsnew_230.enhancements:
+
+Enhancements
+~~~~~~~~~~~~
+
+.. _whatsnew_230.enhancements.enhancement1:
+
+enhancement1
+^^^^^^^^^^^^
+
+
+.. _whatsnew_230.enhancements.other:
+
+Other enhancements
+^^^^^^^^^^^^^^^^^^
+
+-
+-
+
+.. ---------------------------------------------------------------------------
+.. _whatsnew_230.notable_bug_fixes:
+
+Notable bug fixes
+~~~~~~~~~~~~~~~~~
+
+These are bug fixes that might have notable behavior changes.
+
+.. _whatsnew_230.notable_bug_fixes.notable_bug_fix1:
+
+notable_bug_fix1
+^^^^^^^^^^^^^^^^
+
+.. ---------------------------------------------------------------------------
+.. _whatsnew_230.deprecations:
+
+Deprecations
+~~~~~~~~~~~~
+- Deprecated allowing non-``bool`` values for ``na`` in :meth:`.str.contains`, :meth:`.str.startswith`, and :meth:`.str.endswith` for dtypes that do not already disallow these (:issue:`59615`)
+-
+
+.. ---------------------------------------------------------------------------
+.. _whatsnew_230.performance:
+
+Performance improvements
+~~~~~~~~~~~~~~~~~~~~~~~~
+-
+-
+
+.. ---------------------------------------------------------------------------
+.. _whatsnew_230.bug_fixes:
+
+Bug fixes
+~~~~~~~~~
+
+Categorical
+^^^^^^^^^^^
+-
+-
+
+Datetimelike
+^^^^^^^^^^^^
+-
+-
+
+Timedelta
+^^^^^^^^^
+-
+-
+
+Timezones
+^^^^^^^^^
+-
+-
+
+Numeric
+^^^^^^^
+-
+-
+
+Conversion
+^^^^^^^^^^
+-
+-
+
+Strings
+^^^^^^^
+- Bug in :meth:`Series.rank` for :class:`StringDtype` with ``storage="pyarrow"`` incorrectly returning integer results in case of ``method="average"`` and raising an error if it would truncate results (:issue:`59768`)
+- Bug in :meth:`Series.str.replace` when ``n < 0`` for :class:`StringDtype` with ``storage="pyarrow"`` (:issue:`59628`)
+- Bug in ``ser.str.slice`` with negative ``step`` with :class:`ArrowDtype` and :class:`StringDtype` with ``storage="pyarrow"`` giving incorrect results (:issue:`59710`)
+- Bug in the ``center`` method on :class:`Series` and :class:`Index` object ``str`` accessors with pyarrow-backed dtype not matching the Python behavior in corner cases with an odd number of fill characters (:issue:`54792`)
+-
+
+Interval
+^^^^^^^^
+-
+-
+
+Indexing
+^^^^^^^^
+-
+-
+
+Missing
+^^^^^^^
+-
+-
+
+MultiIndex
+^^^^^^^^^^
+-
+-
+
+I/O
+^^^
+-
+-
+
+Period
+^^^^^^
+-
+-
+
+Plotting
+^^^^^^^^
+-
+-
+
+Groupby/resample/rolling
+^^^^^^^^^^^^^^^^^^^^^^^^
+-
+-
+
+Reshaping
+^^^^^^^^^
+-
+-
+
+Sparse
+^^^^^^
+-
+-
+
+ExtensionArray
+^^^^^^^^^^^^^^
+-
+-
+
+Styler
+^^^^^^
+-
+-
+
+Other
+^^^^^
+-
+-
+
+.. ---------------------------------------------------------------------------
+.. _whatsnew_230.contributors:
+
+Contributors
+~~~~~~~~~~~~
diff --git a/pandas/_libs/arrays.pyx b/pandas/_libs/arrays.pyx
index 9889436a542c1..2932f3ff56396 100644
--- a/pandas/_libs/arrays.pyx
+++ b/pandas/_libs/arrays.pyx
@@ -67,6 +67,10 @@ cdef class NDArrayBacked:
"""
Construct a new ExtensionArray `new_array` with `arr` as its _ndarray.
+ The returned array has the same dtype as self.
+
+ Caller is responsible for ensuring `values.dtype == self._ndarray.dtype`.
+
This should round-trip:
self == self._from_backing_data(self._ndarray)
"""
diff --git a/pandas/_libs/hashtable.pyx b/pandas/_libs/hashtable.pyx
index ccac3d0b50d45..127b0b845d219 100644
--- a/pandas/_libs/hashtable.pyx
+++ b/pandas/_libs/hashtable.pyx
@@ -33,7 +33,10 @@ from pandas._libs.khash cimport (
kh_python_hash_func,
khiter_t,
)
-from pandas._libs.missing cimport checknull
+from pandas._libs.missing cimport (
+ checknull,
+ is_matching_na,
+)
def get_hashtable_trace_domain():
diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in
index c0723392496c1..c42bccb7f38f7 100644
--- a/pandas/_libs/hashtable_class_helper.pxi.in
+++ b/pandas/_libs/hashtable_class_helper.pxi.in
@@ -1121,11 +1121,13 @@ cdef class StringHashTable(HashTable):
const char **vecs
khiter_t k
bint use_na_value
+ bint non_null_na_value
if return_inverse:
labels = np.zeros(n, dtype=np.intp)
uindexer = np.empty(n, dtype=np.int64)
use_na_value = na_value is not None
+ non_null_na_value = not checknull(na_value)
# assign pointers and pre-filter out missing (if ignore_na)
vecs = malloc(n * sizeof(char *))
@@ -1134,7 +1136,12 @@ cdef class StringHashTable(HashTable):
if (ignore_na
and (not isinstance(val, str)
- or (use_na_value and val == na_value))):
+ or (use_na_value and (
+ (non_null_na_value and val == na_value) or
+ (not non_null_na_value and is_matching_na(val, na_value)))
+ )
+ )
+ ):
# if missing values do not count as unique values (i.e. if
# ignore_na is True), we can skip the actual value, and
# replace the label with na_sentinel directly
@@ -1400,10 +1407,11 @@ cdef class PyObjectHashTable(HashTable):
object val
khiter_t k
bint use_na_value
-
+ bint non_null_na_value
if return_inverse:
labels = np.empty(n, dtype=np.intp)
use_na_value = na_value is not None
+ non_null_na_value = not checknull(na_value)
for i in range(n):
val = values[i]
@@ -1411,7 +1419,11 @@ cdef class PyObjectHashTable(HashTable):
if ignore_na and (
checknull(val)
- or (use_na_value and val == na_value)
+ or (use_na_value and (
+ (non_null_na_value and val == na_value) or
+ (not non_null_na_value and is_matching_na(val, na_value))
+ )
+ )
):
# if missing values do not count as unique values (i.e. if
# ignore_na is True), skip the hashtable entry for them, and
diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
index 5d8a04664b0e4..c23f907aecfab 100644
--- a/pandas/_libs/lib.pyx
+++ b/pandas/_libs/lib.pyx
@@ -736,7 +736,9 @@ cpdef ndarray[object] ensure_string_array(
convert_na_value : bool, default True
If False, existing na values will be used unchanged in the new array.
copy : bool, default True
- Whether to ensure that a new array is returned.
+ Whether to ensure that a new array is returned. When True, a new array
+ is always returned. When False, a new array is only returned when needed
+ to avoid mutating the input array.
skipna : bool, default True
Whether or not to coerce nulls to their stringified form
(e.g. if False, NaN becomes 'nan').
@@ -753,7 +755,14 @@ cpdef ndarray[object] ensure_string_array(
if hasattr(arr, "to_numpy"):
- if hasattr(arr, "dtype") and arr.dtype.kind in "mM":
+ if (
+ hasattr(arr, "dtype")
+ and arr.dtype.kind in "mM"
+ # TODO: we should add a custom ArrowExtensionArray.astype implementation
+ # that handles astype(str) specifically, avoiding ending up here and
+ # then we can remove the below check for `_pa_array` (for ArrowEA)
+ and not hasattr(arr, "_pa_array")
+ ):
# dtype check to exclude DataFrame
# GH#41409 TODO: not a great place for this
out = arr.astype(str).astype(object)
@@ -765,10 +774,17 @@ cpdef ndarray[object] ensure_string_array(
result = np.asarray(arr, dtype="object")
- if copy and (result is arr or np.shares_memory(arr, result)):
- # GH#54654
- result = result.copy()
- elif not copy and result is arr:
+ if result is arr or np.may_share_memory(arr, result):
+ # if np.asarray(..) did not make a copy of the input arr, we still need
+ # to do that to avoid mutating the input array
+ # GH#54654: share_memory check is needed for rare cases where np.asarray
+ # returns a new object without making a copy of the actual data
+ if copy:
+ result = result.copy()
+ else:
+ already_copied = False
+ elif not copy and not result.flags.writeable:
+ # Weird edge case where result is a view
already_copied = False
if issubclass(arr.dtype.type, np.str_):
diff --git a/pandas/_testing/__init__.py b/pandas/_testing/__init__.py
index 10c1c490551fb..3aa7c64831efe 100644
--- a/pandas/_testing/__init__.py
+++ b/pandas/_testing/__init__.py
@@ -112,7 +112,7 @@
COMPLEX_DTYPES: list[Dtype] = [complex, "complex64", "complex128"]
if using_string_dtype():
- STRING_DTYPES: list[Dtype] = [str, "U"]
+ STRING_DTYPES: list[Dtype] = ["U"]
else:
STRING_DTYPES: list[Dtype] = [str, "str", "U"] # type: ignore[no-redef]
COMPLEX_FLOAT_DTYPES: list[Dtype] = [*COMPLEX_DTYPES, *FLOAT_NUMPY_DTYPES]
diff --git a/pandas/conftest.py b/pandas/conftest.py
index 433ea7275223d..c6237d0309630 100644
--- a/pandas/conftest.py
+++ b/pandas/conftest.py
@@ -1228,6 +1228,34 @@ def string_dtype(request):
return request.param
+@pytest.fixture(
+ params=[
+ ("python", pd.NA),
+ pytest.param(("pyarrow", pd.NA), marks=td.skip_if_no("pyarrow")),
+ pytest.param(("pyarrow", np.nan), marks=td.skip_if_no("pyarrow")),
+ ("python", np.nan),
+ ],
+ ids=[
+ "string=string[python]",
+ "string=string[pyarrow]",
+ "string=str[pyarrow]",
+ "string=str[python]",
+ ],
+)
+def string_dtype_no_object(request):
+ """
+ Parametrized fixture for string dtypes.
+ * 'string[python]' (NA variant)
+ * 'string[pyarrow]' (NA variant)
+ * 'str' (NaN variant, with pyarrow)
+ * 'str' (NaN variant, without pyarrow)
+ """
+ # need to instantiate the StringDtype here instead of in the params
+ # to avoid importing pyarrow during test collection
+ storage, na_value = request.param
+ return pd.StringDtype(storage, na_value)
+
+
@pytest.fixture(
params=[
"string[python]",
@@ -1266,7 +1294,13 @@ def string_storage(request):
pytest.param(("pyarrow", pd.NA), marks=td.skip_if_no("pyarrow")),
pytest.param(("pyarrow", np.nan), marks=td.skip_if_no("pyarrow")),
("python", np.nan),
- ]
+ ],
+ ids=[
+ "string=string[python]",
+ "string=string[pyarrow]",
+ "string=str[pyarrow]",
+ "string=str[python]",
+ ],
)
def string_dtype_arguments(request):
"""
@@ -1297,6 +1331,7 @@ def dtype_backend(request):
# Alias so we can test with cartesian product of string_storage
string_storage2 = string_storage
+string_dtype_arguments2 = string_dtype_arguments
@pytest.fixture(params=tm.BYTES_DTYPES)
diff --git a/pandas/core/_numba/extensions.py b/pandas/core/_numba/extensions.py
index ee09c9380fb0f..b05f12295a729 100644
--- a/pandas/core/_numba/extensions.py
+++ b/pandas/core/_numba/extensions.py
@@ -49,7 +49,8 @@
@contextmanager
def set_numba_data(index: Index):
numba_data = index._data
- if numba_data.dtype == object:
+ if numba_data.dtype in (object, "string"):
+ numba_data = np.asarray(numba_data)
if not lib.is_string_array(numba_data):
raise ValueError(
"The numba engine only supports using string or numeric column names"
diff --git a/pandas/core/apply.py b/pandas/core/apply.py
index 25a71ce5b5f4f..fafc9ee1b6928 100644
--- a/pandas/core/apply.py
+++ b/pandas/core/apply.py
@@ -1174,12 +1174,7 @@ def apply_with_numba(self) -> dict[int, Any]:
from pandas.core._numba.extensions import set_numba_data
index = self.obj.index
- if index.dtype == "string":
- index = index.astype(object)
-
columns = self.obj.columns
- if columns.dtype == "string":
- columns = columns.astype(object)
# Convert from numba dict to regular dict
# Our isinstance checks in the df constructor don't pass for numbas typed dict
diff --git a/pandas/core/arrays/_arrow_string_mixins.py b/pandas/core/arrays/_arrow_string_mixins.py
index cc41985843574..a39668faf779e 100644
--- a/pandas/core/arrays/_arrow_string_mixins.py
+++ b/pandas/core/arrays/_arrow_string_mixins.py
@@ -1,22 +1,85 @@
from __future__ import annotations
-from typing import Literal
+from functools import partial
+import re
+from typing import (
+ TYPE_CHECKING,
+ Any,
+ Literal,
+)
import numpy as np
-from pandas.compat import pa_version_under10p1
+from pandas.compat import (
+ pa_version_under10p1,
+ pa_version_under11p0,
+ pa_version_under13p0,
+ pa_version_under17p0,
+)
+
+from pandas.core.dtypes.missing import isna
if not pa_version_under10p1:
import pyarrow as pa
import pyarrow.compute as pc
+if TYPE_CHECKING:
+ from collections.abc import Callable
+
+ from pandas._typing import (
+ Scalar,
+ Self,
+ )
+
class ArrowStringArrayMixin:
- _pa_array = None
+ _pa_array: pa.ChunkedArray
def __init__(self, *args, **kwargs) -> None:
raise NotImplementedError
+ def _convert_bool_result(self, result):
+ # Convert a bool-dtype result to the appropriate result type
+ raise NotImplementedError
+
+ def _convert_int_result(self, result):
+ # Convert an integer-dtype result to the appropriate result type
+ raise NotImplementedError
+
+ def _apply_elementwise(self, func: Callable) -> list[list[Any]]:
+ raise NotImplementedError
+
+ def _str_len(self):
+ result = pc.utf8_length(self._pa_array)
+ return self._convert_int_result(result)
+
+ def _str_lower(self) -> Self:
+ return type(self)(pc.utf8_lower(self._pa_array))
+
+ def _str_upper(self) -> Self:
+ return type(self)(pc.utf8_upper(self._pa_array))
+
+ def _str_strip(self, to_strip=None) -> Self:
+ if to_strip is None:
+ result = pc.utf8_trim_whitespace(self._pa_array)
+ else:
+ result = pc.utf8_trim(self._pa_array, characters=to_strip)
+ return type(self)(result)
+
+ def _str_lstrip(self, to_strip=None) -> Self:
+ if to_strip is None:
+ result = pc.utf8_ltrim_whitespace(self._pa_array)
+ else:
+ result = pc.utf8_ltrim(self._pa_array, characters=to_strip)
+ return type(self)(result)
+
+ def _str_rstrip(self, to_strip=None) -> Self:
+ if to_strip is None:
+ result = pc.utf8_rtrim_whitespace(self._pa_array)
+ else:
+ result = pc.utf8_rtrim(self._pa_array, characters=to_strip)
+ return type(self)(result)
+
def _str_pad(
self,
width: int,
@@ -28,7 +91,19 @@ def _str_pad(
elif side == "right":
pa_pad = pc.utf8_rpad
elif side == "both":
- pa_pad = pc.utf8_center
+ if pa_version_under17p0:
+ # GH#59624 fall back to object dtype
+ from pandas import array as pd_array
+
+ obj_arr = self.astype(object, copy=False) # type: ignore[attr-defined]
+ obj = pd_array(obj_arr, dtype=object)
+ result = obj._str_pad(width, side, fillchar) # type: ignore[attr-defined]
+ return type(self)._from_sequence(result, dtype=self.dtype) # type: ignore[attr-defined]
+ else:
+ # GH#54792
+ # https://github.com/apache/arrow/issues/15053#issuecomment-2317032347
+ lean_left = (width % 2) == 0
+ pa_pad = partial(pc.utf8_center, lean_left_on_odd_padding=lean_left)
else:
raise ValueError(
f"Invalid side: {side}. Side must be one of 'left', 'right', 'both'"
@@ -51,12 +126,29 @@ def _str_get(self, i: int):
selected = pc.utf8_slice_codeunits(
self._pa_array, start=start, stop=stop, step=step
)
- null_value = pa.scalar(
- None, type=self._pa_array.type # type: ignore[attr-defined]
- )
+ null_value = pa.scalar(None, type=self._pa_array.type)
result = pc.if_else(not_out_of_bounds, selected, null_value)
return type(self)(result)
+ def _str_slice(
+ self, start: int | None = None, stop: int | None = None, step: int | None = None
+ ):
+ if pa_version_under11p0:
+ # GH#59724
+ result = self._apply_elementwise(lambda val: val[start:stop:step])
+ return type(self)(pa.chunked_array(result, type=self._pa_array.type))
+ if start is None:
+ if step is not None and step < 0:
+ # GH#59710
+ start = -1
+ else:
+ start = 0
+ if step is None:
+ step = 1
+ return type(self)(
+ pc.utf8_slice_codeunits(self._pa_array, start=start, stop=stop, step=step)
+ )
+
def _str_slice_replace(
self, start: int | None = None, stop: int | None = None, repl: str | None = None
):
@@ -68,7 +160,34 @@ def _str_slice_replace(
stop = np.iinfo(np.int64).max
return type(self)(pc.utf8_replace_slice(self._pa_array, start, stop, repl))
- def _str_capitalize(self):
+ def _str_replace(
+ self,
+ pat: str | re.Pattern,
+ repl: str | Callable,
+ n: int = -1,
+ case: bool = True,
+ flags: int = 0,
+ regex: bool = True,
+ ) -> Self:
+ if isinstance(pat, re.Pattern) or callable(repl) or not case or flags:
+ raise NotImplementedError(
+ "replace is not supported with a re.Pattern, callable repl, "
+ "case=False, or flags!=0"
+ )
+
+ func = pc.replace_substring_regex if regex else pc.replace_substring
+ # https://github.com/apache/arrow/issues/39149
+ # GH 56404, unexpected behavior with negative max_replacements with pyarrow.
+ pa_max_replacements = None if n < 0 else n
+ result = func(
+ self._pa_array,
+ pattern=pat,
+ replacement=repl,
+ max_replacements=pa_max_replacements,
+ )
+ return type(self)(result)
+
+ def _str_capitalize(self) -> Self:
return type(self)(pc.utf8_capitalize(self._pa_array))
def _str_title(self):
@@ -77,8 +196,151 @@ def _str_title(self):
def _str_swapcase(self):
return type(self)(pc.utf8_swapcase(self._pa_array))
+ def _str_removeprefix(self, prefix: str):
+ if not pa_version_under13p0:
+ starts_with = pc.starts_with(self._pa_array, pattern=prefix)
+ removed = pc.utf8_slice_codeunits(self._pa_array, len(prefix))
+ result = pc.if_else(starts_with, removed, self._pa_array)
+ return type(self)(result)
+ predicate = lambda val: val.removeprefix(prefix)
+ result = self._apply_elementwise(predicate)
+ return type(self)(pa.chunked_array(result))
+
def _str_removesuffix(self, suffix: str):
ends_with = pc.ends_with(self._pa_array, pattern=suffix)
removed = pc.utf8_slice_codeunits(self._pa_array, 0, stop=-len(suffix))
result = pc.if_else(ends_with, removed, self._pa_array)
return type(self)(result)
+
+ def _str_startswith(self, pat: str | tuple[str, ...], na: Scalar | None = None):
+ if isinstance(pat, str):
+ result = pc.starts_with(self._pa_array, pattern=pat)
+ else:
+ if len(pat) == 0:
+ # For empty tuple we return null for missing values and False
+ # for valid values.
+ result = pc.if_else(pc.is_null(self._pa_array), None, False)
+ else:
+ result = pc.starts_with(self._pa_array, pattern=pat[0])
+
+ for p in pat[1:]:
+ result = pc.or_(result, pc.starts_with(self._pa_array, pattern=p))
+ if not isna(na): # pyright: ignore [reportGeneralTypeIssues]
+ result = result.fill_null(na)
+ return self._convert_bool_result(result)
+
+ def _str_endswith(self, pat: str | tuple[str, ...], na: Scalar | None = None):
+ if isinstance(pat, str):
+ result = pc.ends_with(self._pa_array, pattern=pat)
+ else:
+ if len(pat) == 0:
+ # For empty tuple we return null for missing values and False
+ # for valid values.
+ result = pc.if_else(pc.is_null(self._pa_array), None, False)
+ else:
+ result = pc.ends_with(self._pa_array, pattern=pat[0])
+
+ for p in pat[1:]:
+ result = pc.or_(result, pc.ends_with(self._pa_array, pattern=p))
+ if not isna(na): # pyright: ignore [reportGeneralTypeIssues]
+ result = result.fill_null(na)
+ return self._convert_bool_result(result)
+
+ def _str_isalnum(self):
+ result = pc.utf8_is_alnum(self._pa_array)
+ return self._convert_bool_result(result)
+
+ def _str_isalpha(self):
+ result = pc.utf8_is_alpha(self._pa_array)
+ return self._convert_bool_result(result)
+
+ def _str_isdecimal(self):
+ result = pc.utf8_is_decimal(self._pa_array)
+ return self._convert_bool_result(result)
+
+ def _str_isdigit(self):
+ result = pc.utf8_is_digit(self._pa_array)
+ return self._convert_bool_result(result)
+
+ def _str_islower(self):
+ result = pc.utf8_is_lower(self._pa_array)
+ return self._convert_bool_result(result)
+
+ def _str_isnumeric(self):
+ result = pc.utf8_is_numeric(self._pa_array)
+ return self._convert_bool_result(result)
+
+ def _str_isspace(self):
+ result = pc.utf8_is_space(self._pa_array)
+ return self._convert_bool_result(result)
+
+ def _str_istitle(self):
+ result = pc.utf8_is_title(self._pa_array)
+ return self._convert_bool_result(result)
+
+ def _str_isupper(self):
+ result = pc.utf8_is_upper(self._pa_array)
+ return self._convert_bool_result(result)
+
+ def _str_contains(
+ self, pat, case: bool = True, flags: int = 0, na=None, regex: bool = True
+ ):
+ if flags:
+ raise NotImplementedError(f"contains not implemented with {flags=}")
+
+ if regex:
+ pa_contains = pc.match_substring_regex
+ else:
+ pa_contains = pc.match_substring
+ result = pa_contains(self._pa_array, pat, ignore_case=not case)
+ if not isna(na): # pyright: ignore [reportGeneralTypeIssues]
+ result = result.fill_null(na)
+ return self._convert_bool_result(result)
+
+ def _str_match(
+ self, pat: str, case: bool = True, flags: int = 0, na: Scalar | None = None
+ ):
+ if not pat.startswith("^"):
+ pat = f"^{pat}"
+ return self._str_contains(pat, case, flags, na, regex=True)
+
+ def _str_fullmatch(
+ self, pat, case: bool = True, flags: int = 0, na: Scalar | None = None
+ ):
+ if not pat.endswith("$") or pat.endswith("\\$"):
+ pat = f"{pat}$"
+ return self._str_match(pat, case, flags, na)
+
+ def _str_find(self, sub: str, start: int = 0, end: int | None = None):
+ if (
+ pa_version_under13p0
+ and not (start != 0 and end is not None)
+ and not (start == 0 and end is None)
+ ):
+ # GH#59562
+ res_list = self._apply_elementwise(lambda val: val.find(sub, start, end))
+ return self._convert_int_result(pa.chunked_array(res_list))
+
+ if (start == 0 or start is None) and end is None:
+ result = pc.find_substring(self._pa_array, sub)
+ else:
+ if sub == "":
+ # GH#56792
+ res_list = self._apply_elementwise(
+ lambda val: val.find(sub, start, end)
+ )
+ return self._convert_int_result(pa.chunked_array(res_list))
+ if start is None:
+ start_offset = 0
+ start = 0
+ elif start < 0:
+ start_offset = pc.add(start, pc.utf8_length(self._pa_array))
+ start_offset = pc.if_else(pc.less(start_offset, 0), 0, start_offset)
+ else:
+ start_offset = start
+ slices = pc.utf8_slice_codeunits(self._pa_array, start, stop=end)
+ result = pc.find_substring(slices, sub)
+ found = pc.not_equal(result, pa.scalar(-1, type=result.type))
+ offset_result = pc.add(result, start_offset)
+ result = pc.if_else(found, offset_result, -1)
+ return self._convert_int_result(result)
diff --git a/pandas/core/arrays/_mixins.py b/pandas/core/arrays/_mixins.py
index 0da121c36644a..cb6861a8dd00f 100644
--- a/pandas/core/arrays/_mixins.py
+++ b/pandas/core/arrays/_mixins.py
@@ -515,17 +515,14 @@ def _quantile(
fill_value = self._internal_fill_value
res_values = quantile_with_mask(arr, mask, fill_value, qs, interpolation)
-
- res_values = self._cast_quantile_result(res_values)
- return self._from_backing_data(res_values)
-
- # TODO: see if we can share this with other dispatch-wrapping methods
- def _cast_quantile_result(self, res_values: np.ndarray) -> np.ndarray:
- """
- Cast the result of quantile_with_mask to an appropriate dtype
- to pass to _from_backing_data in _quantile.
- """
- return res_values
+ if res_values.dtype == self._ndarray.dtype:
+ return self._from_backing_data(res_values)
+ else:
+ # e.g. test_quantile_empty we are empty integer dtype and res_values
+ # has floating dtype
+ # TODO: technically __init__ isn't defined here.
+ # Should we raise NotImplementedError and handle this on NumpyEA?
+ return type(self)(res_values) # type: ignore[call-arg]
# ------------------------------------------------------------------------
# numpy-like methods
diff --git a/pandas/core/arrays/arrow/_arrow_utils.py b/pandas/core/arrays/arrow/_arrow_utils.py
index 2a053fac2985c..285c3fd465ffc 100644
--- a/pandas/core/arrays/arrow/_arrow_utils.py
+++ b/pandas/core/arrays/arrow/_arrow_utils.py
@@ -1,24 +1,8 @@
from __future__ import annotations
-import warnings
-
import numpy as np
import pyarrow
-from pandas.errors import PerformanceWarning
-from pandas.util._exceptions import find_stack_level
-
-
-def fallback_performancewarning(version: str | None = None) -> None:
- """
- Raise a PerformanceWarning for falling back to ExtensionArray's
- non-pyarrow method
- """
- msg = "Falling back on a non-pyarrow code path which may decrease performance."
- if version is not None:
- msg += f" Upgrade to pyarrow >={version} to possibly suppress this warning."
- warnings.warn(msg, PerformanceWarning, stacklevel=find_stack_level())
-
def pyarrow_array_to_numpy_and_mask(
arr, dtype: np.dtype
diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py
index 46f2cbb2ebeef..e0ccbd6fdc5fd 100644
--- a/pandas/core/arrays/arrow/array.py
+++ b/pandas/core/arrays/arrow/array.py
@@ -676,7 +676,12 @@ def __invert__(self) -> Self:
return type(self)(pc.invert(self._pa_array))
def __neg__(self) -> Self:
- return type(self)(pc.negate_checked(self._pa_array))
+ try:
+ return type(self)(pc.negate_checked(self._pa_array))
+ except pa.ArrowNotImplementedError as err:
+ raise TypeError(
+ f"unary '-' not supported for dtype '{self.dtype}'"
+ ) from err
def __pos__(self) -> Self:
return type(self)(self._pa_array)
@@ -731,8 +736,19 @@ def _cmp_method(self, other, op):
)
return ArrowExtensionArray(result)
- def _evaluate_op_method(self, other, op, arrow_funcs):
+ def _op_method_error_message(self, other, op) -> str:
+ if hasattr(other, "dtype"):
+ other_type = f"dtype '{other.dtype}'"
+ else:
+ other_type = f"object of type {type(other)}"
+ return (
+ f"operation '{op.__name__}' not supported for "
+ f"dtype '{self.dtype}' with {other_type}"
+ )
+
+ def _evaluate_op_method(self, other, op, arrow_funcs) -> Self:
pa_type = self._pa_array.type
+ other_original = other
other = self._box_pa(other)
if (
@@ -742,10 +758,15 @@ def _evaluate_op_method(self, other, op, arrow_funcs):
):
if op in [operator.add, roperator.radd]:
sep = pa.scalar("", type=pa_type)
- if op is operator.add:
- result = pc.binary_join_element_wise(self._pa_array, other, sep)
- elif op is roperator.radd:
- result = pc.binary_join_element_wise(other, self._pa_array, sep)
+ try:
+ if op is operator.add:
+ result = pc.binary_join_element_wise(self._pa_array, other, sep)
+ elif op is roperator.radd:
+ result = pc.binary_join_element_wise(other, self._pa_array, sep)
+ except pa.ArrowNotImplementedError as err:
+ raise TypeError(
+ self._op_method_error_message(other_original, op)
+ ) from err
return type(self)(result)
elif op in [operator.mul, roperator.rmul]:
binary = self._pa_array
@@ -777,9 +798,14 @@ def _evaluate_op_method(self, other, op, arrow_funcs):
pc_func = arrow_funcs[op.__name__]
if pc_func is NotImplemented:
+ if pa.types.is_string(pa_type) or pa.types.is_large_string(pa_type):
+ raise TypeError(self._op_method_error_message(other_original, op))
raise NotImplementedError(f"{op.__name__} not implemented.")
- result = pc_func(self._pa_array, other)
+ try:
+ result = pc_func(self._pa_array, other)
+ except pa.ArrowNotImplementedError as err:
+ raise TypeError(self._op_method_error_message(other_original, op)) from err
return type(self)(result)
def _logical_method(self, other, op):
@@ -1963,7 +1989,7 @@ def _rank(
"""
See Series.rank.__doc__.
"""
- return type(self)(
+ return self._convert_rank_result(
self._rank_calc(
axis=axis,
method=method,
@@ -2259,86 +2285,19 @@ def _apply_elementwise(self, func: Callable) -> list[list[Any]]:
for chunk in self._pa_array.iterchunks()
]
- def _str_count(self, pat: str, flags: int = 0):
- if flags:
- raise NotImplementedError(f"count not implemented with {flags=}")
- return type(self)(pc.count_substring_regex(self._pa_array, pat))
-
- def _str_contains(
- self, pat, case: bool = True, flags: int = 0, na=None, regex: bool = True
- ):
- if flags:
- raise NotImplementedError(f"contains not implemented with {flags=}")
-
- if regex:
- pa_contains = pc.match_substring_regex
- else:
- pa_contains = pc.match_substring
- result = pa_contains(self._pa_array, pat, ignore_case=not case)
- if not isna(na):
- result = result.fill_null(na)
+ def _convert_bool_result(self, result):
return type(self)(result)
- def _str_startswith(self, pat: str | tuple[str, ...], na=None):
- if isinstance(pat, str):
- result = pc.starts_with(self._pa_array, pattern=pat)
- else:
- if len(pat) == 0:
- # For empty tuple, pd.StringDtype() returns null for missing values
- # and false for valid values.
- result = pc.if_else(pc.is_null(self._pa_array), None, False)
- else:
- result = pc.starts_with(self._pa_array, pattern=pat[0])
-
- for p in pat[1:]:
- result = pc.or_(result, pc.starts_with(self._pa_array, pattern=p))
- if not isna(na):
- result = result.fill_null(na)
+ def _convert_int_result(self, result):
return type(self)(result)
- def _str_endswith(self, pat: str | tuple[str, ...], na=None):
- if isinstance(pat, str):
- result = pc.ends_with(self._pa_array, pattern=pat)
- else:
- if len(pat) == 0:
- # For empty tuple, pd.StringDtype() returns null for missing values
- # and false for valid values.
- result = pc.if_else(pc.is_null(self._pa_array), None, False)
- else:
- result = pc.ends_with(self._pa_array, pattern=pat[0])
-
- for p in pat[1:]:
- result = pc.or_(result, pc.ends_with(self._pa_array, pattern=p))
- if not isna(na):
- result = result.fill_null(na)
+ def _convert_rank_result(self, result):
return type(self)(result)
- def _str_replace(
- self,
- pat: str | re.Pattern,
- repl: str | Callable,
- n: int = -1,
- case: bool = True,
- flags: int = 0,
- regex: bool = True,
- ):
- if isinstance(pat, re.Pattern) or callable(repl) or not case or flags:
- raise NotImplementedError(
- "replace is not supported with a re.Pattern, callable repl, "
- "case=False, or flags!=0"
- )
-
- func = pc.replace_substring_regex if regex else pc.replace_substring
- # https://github.com/apache/arrow/issues/39149
- # GH 56404, unexpected behavior with negative max_replacements with pyarrow.
- pa_max_replacements = None if n < 0 else n
- result = func(
- self._pa_array,
- pattern=pat,
- replacement=repl,
- max_replacements=pa_max_replacements,
- )
- return type(self)(result)
+ def _str_count(self, pat: str, flags: int = 0):
+ if flags:
+ raise NotImplementedError(f"count not implemented with {flags=}")
+ return type(self)(pc.count_substring_regex(self._pa_array, pat))
def _str_repeat(self, repeats: int | Sequence[int]):
if not isinstance(repeats, int):
@@ -2348,37 +2307,6 @@ def _str_repeat(self, repeats: int | Sequence[int]):
else:
return type(self)(pc.binary_repeat(self._pa_array, repeats))
- def _str_match(
- self, pat: str, case: bool = True, flags: int = 0, na: Scalar | None = None
- ):
- if not pat.startswith("^"):
- pat = f"^{pat}"
- return self._str_contains(pat, case, flags, na, regex=True)
-
- def _str_fullmatch(
- self, pat, case: bool = True, flags: int = 0, na: Scalar | None = None
- ):
- if not pat.endswith("$") or pat.endswith("\\$"):
- pat = f"{pat}$"
- return self._str_match(pat, case, flags, na)
-
- def _str_find(self, sub: str, start: int = 0, end: int | None = None):
- if start != 0 and end is not None:
- slices = pc.utf8_slice_codeunits(self._pa_array, start, stop=end)
- result = pc.find_substring(slices, sub)
- not_found = pc.equal(result, -1)
- start_offset = max(0, start)
- offset_result = pc.add(result, start_offset)
- result = pc.if_else(not_found, result, offset_result)
- elif start == 0 and end is None:
- slices = self._pa_array
- result = pc.find_substring(slices, sub)
- else:
- raise NotImplementedError(
- f"find not implemented with {sub=}, {start=}, {end=}"
- )
- return type(self)(result)
-
def _str_join(self, sep: str):
if pa.types.is_string(self._pa_array.type) or pa.types.is_large_string(
self._pa_array.type
@@ -2399,84 +2327,6 @@ def _str_rpartition(self, sep: str, expand: bool):
result = self._apply_elementwise(predicate)
return type(self)(pa.chunked_array(result))
- def _str_slice(
- self, start: int | None = None, stop: int | None = None, step: int | None = None
- ):
- if start is None:
- start = 0
- if step is None:
- step = 1
- return type(self)(
- pc.utf8_slice_codeunits(self._pa_array, start=start, stop=stop, step=step)
- )
-
- def _str_isalnum(self):
- return type(self)(pc.utf8_is_alnum(self._pa_array))
-
- def _str_isalpha(self):
- return type(self)(pc.utf8_is_alpha(self._pa_array))
-
- def _str_isdecimal(self):
- return type(self)(pc.utf8_is_decimal(self._pa_array))
-
- def _str_isdigit(self):
- return type(self)(pc.utf8_is_digit(self._pa_array))
-
- def _str_islower(self):
- return type(self)(pc.utf8_is_lower(self._pa_array))
-
- def _str_isnumeric(self):
- return type(self)(pc.utf8_is_numeric(self._pa_array))
-
- def _str_isspace(self):
- return type(self)(pc.utf8_is_space(self._pa_array))
-
- def _str_istitle(self):
- return type(self)(pc.utf8_is_title(self._pa_array))
-
- def _str_isupper(self):
- return type(self)(pc.utf8_is_upper(self._pa_array))
-
- def _str_len(self):
- return type(self)(pc.utf8_length(self._pa_array))
-
- def _str_lower(self):
- return type(self)(pc.utf8_lower(self._pa_array))
-
- def _str_upper(self):
- return type(self)(pc.utf8_upper(self._pa_array))
-
- def _str_strip(self, to_strip=None):
- if to_strip is None:
- result = pc.utf8_trim_whitespace(self._pa_array)
- else:
- result = pc.utf8_trim(self._pa_array, characters=to_strip)
- return type(self)(result)
-
- def _str_lstrip(self, to_strip=None):
- if to_strip is None:
- result = pc.utf8_ltrim_whitespace(self._pa_array)
- else:
- result = pc.utf8_ltrim(self._pa_array, characters=to_strip)
- return type(self)(result)
-
- def _str_rstrip(self, to_strip=None):
- if to_strip is None:
- result = pc.utf8_rtrim_whitespace(self._pa_array)
- else:
- result = pc.utf8_rtrim(self._pa_array, characters=to_strip)
- return type(self)(result)
-
- def _str_removeprefix(self, prefix: str):
- if not pa_version_under13p0:
- starts_with = pc.starts_with(self._pa_array, pattern=prefix)
- removed = pc.utf8_slice_codeunits(self._pa_array, len(prefix))
- result = pc.if_else(starts_with, removed, self._pa_array)
- return type(self)(result)
- predicate = lambda val: val.removeprefix(prefix)
- result = self._apply_elementwise(predicate)
- return type(self)(pa.chunked_array(result))
-
def _str_casefold(self):
predicate = lambda val: val.casefold()
result = self._apply_elementwise(predicate)
diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
index f191f7277743f..97004474648b2 100644
--- a/pandas/core/arrays/categorical.py
+++ b/pandas/core/arrays/categorical.py
@@ -2475,11 +2475,6 @@ def unique(self) -> Self:
# pylint: disable=useless-parent-delegation
return super().unique()
- def _cast_quantile_result(self, res_values: np.ndarray) -> np.ndarray:
- # make sure we have correct itemsize for resulting codes
- assert res_values.dtype == self._ndarray.dtype
- return res_values
-
def equals(self, other: object) -> bool:
"""
Returns True if categorical arrays are equal.
@@ -2696,7 +2691,9 @@ def _str_get_dummies(self, sep: str = "|"):
# sep may not be in categories. Just bail on this.
from pandas.core.arrays import NumpyExtensionArray
- return NumpyExtensionArray(self.astype(str))._str_get_dummies(sep)
+ return NumpyExtensionArray(self.to_numpy(str, na_value="NaN"))._str_get_dummies(
+ sep
+ )
# ------------------------------------------------------------------------
# GroupBy Methods
diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py
index e85c0222bbec3..81e2f04f2ba2e 100644
--- a/pandas/core/arrays/datetimelike.py
+++ b/pandas/core/arrays/datetimelike.py
@@ -472,10 +472,16 @@ def astype(self, dtype, copy: bool = True):
return self._box_values(self.asi8.ravel()).reshape(self.shape)
+ elif is_string_dtype(dtype):
+ if isinstance(dtype, ExtensionDtype):
+ arr_object = self._format_native_types(na_rep=dtype.na_value) # type: ignore[arg-type]
+ cls = dtype.construct_array_type()
+ return cls._from_sequence(arr_object, dtype=dtype, copy=False)
+ else:
+ return self._format_native_types()
+
elif isinstance(dtype, ExtensionDtype):
return super().astype(dtype, copy=copy)
- elif is_string_dtype(dtype):
- return self._format_native_types()
elif dtype.kind in "iu":
# we deliberately ignore int32 vs. int64 here.
# See https://github.com/pandas-dev/pandas/issues/24381 for more.
diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py
index 03712f75db0c7..aafcd82114b97 100644
--- a/pandas/core/arrays/numpy_.py
+++ b/pandas/core/arrays/numpy_.py
@@ -137,9 +137,6 @@ def _from_sequence(
result = result.copy()
return cls(result)
- def _from_backing_data(self, arr: np.ndarray) -> NumpyExtensionArray:
- return type(self)(arr)
-
# ------------------------------------------------------------------------
# Data
diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
index 1aa6fb70d250c..0b0fffcb928a3 100644
--- a/pandas/core/arrays/string_.py
+++ b/pandas/core/arrays/string_.py
@@ -46,6 +46,7 @@
nanops,
ops,
)
+from pandas.core.algorithms import isin
from pandas.core.array_algos import masked_reductions
from pandas.core.arrays.base import ExtensionArray
from pandas.core.arrays.floating import (
@@ -65,6 +66,7 @@
import pyarrow
from pandas._typing import (
+ ArrayLike,
AxisInt,
Dtype,
DtypeObj,
@@ -167,9 +169,9 @@ def __init__(
# a consistent NaN value (and we can use `dtype.na_value is np.nan`)
na_value = np.nan
elif na_value is not libmissing.NA:
- raise ValueError("'na_value' must be np.nan or pd.NA, got {na_value}")
+ raise ValueError(f"'na_value' must be np.nan or pd.NA, got {na_value}")
- self.storage = storage
+ self.storage = cast(str, storage)
self._na_value = na_value
def __repr__(self) -> str:
@@ -280,6 +282,34 @@ def construct_array_type( # type: ignore[override]
else:
return ArrowStringArrayNumpySemantics
+ def _get_common_dtype(self, dtypes: list[DtypeObj]) -> DtypeObj | None:
+ storages = set()
+ na_values = set()
+
+ for dtype in dtypes:
+ if isinstance(dtype, StringDtype):
+ storages.add(dtype.storage)
+ na_values.add(dtype.na_value)
+ elif isinstance(dtype, np.dtype) and dtype.kind in ("U", "T"):
+ continue
+ else:
+ return None
+
+ if len(storages) == 2:
+ # if both python and pyarrow storage -> priority to pyarrow
+ storage = "pyarrow"
+ else:
+ storage = next(iter(storages)) # type: ignore[assignment]
+
+ na_value: libmissing.NAType | float
+ if len(na_values) == 2:
+ # if both NaN and NA -> priority to NA
+ na_value = libmissing.NA
+ else:
+ na_value = next(iter(na_values))
+
+ return StringDtype(storage=storage, na_value=na_value)
+
def __from_arrow__(
self, array: pyarrow.Array | pyarrow.ChunkedArray
) -> BaseStringArray:
@@ -657,11 +687,10 @@ def __arrow_array__(self, type=None):
values[self.isna()] = None
return pa.array(values, type=type, from_pandas=True)
- def _values_for_factorize(self):
+ def _values_for_factorize(self) -> tuple[np.ndarray, libmissing.NAType | float]: # type: ignore[override]
arr = self._ndarray.copy()
- mask = self.isna()
- arr[mask] = None
- return arr, None
+
+ return arr, self.dtype.na_value
def __setitem__(self, key, value) -> None:
value = extract_array(value, extract_numpy=True)
@@ -686,6 +715,10 @@ def __setitem__(self, key, value) -> None:
else:
if not is_array_like(value):
value = np.asarray(value, dtype=object)
+ else:
+ # cast categories and friends to arrays to see if values are
+ # compatible, compatibility with arrow backed strings
+ value = np.asarray(value)
if len(value) and not lib.is_string_array(value, skipna=True):
raise TypeError("Must provide strings.")
@@ -702,6 +735,24 @@ def _putmask(self, mask: npt.NDArray[np.bool_], value) -> None:
# base class implementation that uses __setitem__
ExtensionArray._putmask(self, mask, value)
+ def isin(self, values: ArrayLike) -> npt.NDArray[np.bool_]:
+ if isinstance(values, BaseStringArray) or (
+ isinstance(values, ExtensionArray) and is_string_dtype(values.dtype)
+ ):
+ values = values.astype(self.dtype, copy=False)
+ else:
+ if not lib.is_string_array(np.asarray(values), skipna=True):
+ values = np.array(
+ [val for val in values if isinstance(val, str) or isna(val)],
+ dtype=object,
+ )
+ if not len(values):
+ return np.zeros(self.shape, dtype=bool)
+
+ values = self._from_sequence(values, dtype=self.dtype)
+
+ return isin(np.asarray(self), np.asarray(values))
+
def astype(self, dtype, copy: bool = True):
dtype = pandas_dtype(dtype)
@@ -824,8 +875,11 @@ def _cmp_method(self, other, op):
f"Lengths of operands do not match: {len(self)} != {len(other)}"
)
- other = np.asarray(other)
+ # for array-likes, first filter out NAs before converting to numpy
+ if not is_array_like(other):
+ other = np.asarray(other)
other = other[valid]
+ other = np.asarray(other)
if op.__name__ in ops.ARITHMETIC_BINOPS:
result = np.empty_like(self._ndarray, dtype="object")
@@ -871,8 +925,3 @@ def _from_sequence(
if dtype is None:
dtype = StringDtype(storage="python", na_value=np.nan)
return super()._from_sequence(scalars, dtype=dtype, copy=copy)
-
- def _from_backing_data(self, arr: np.ndarray) -> StringArrayNumpySemantics:
- # need to override NumpyExtensionArray._from_backing_data to ensure
- # we always preserve the dtype
- return NDArrayBacked._from_backing_data(self, arr)
diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py
index 91c1f20ba93c6..56f7d3aecce20 100644
--- a/pandas/core/arrays/string_arrow.py
+++ b/pandas/core/arrays/string_arrow.py
@@ -30,6 +30,7 @@
from pandas.core.arrays._arrow_string_mixins import ArrowStringArrayMixin
from pandas.core.arrays.arrow import ArrowExtensionArray
from pandas.core.arrays.boolean import BooleanDtype
+from pandas.core.arrays.floating import Float64Dtype
from pandas.core.arrays.integer import Int64Dtype
from pandas.core.arrays.numeric import NumericDtype
from pandas.core.arrays.string_ import (
@@ -42,17 +43,13 @@
import pyarrow as pa
import pyarrow.compute as pc
- from pandas.core.arrays.arrow._arrow_utils import fallback_performancewarning
-
if TYPE_CHECKING:
from collections.abc import Sequence
from pandas._typing import (
ArrayLike,
- AxisInt,
Dtype,
- Scalar,
npt,
)
@@ -214,10 +211,8 @@ def insert(self, loc: int, item) -> ArrowStringArray:
raise TypeError("Scalar must be NA or str")
return super().insert(loc, item)
- def _result_converter(self, values, na=None):
+ def _convert_bool_result(self, values):
if self.dtype.na_value is np.nan:
- if not isna(na):
- values = values.fill_null(bool(na))
return ArrowExtensionArray(values).to_numpy(na_value=np.nan)
return BooleanDtype().__from_arrow__(values)
@@ -233,7 +228,7 @@ def _maybe_convert_setitem_value(self, value):
value[isna(value)] = None
for v in value:
if not (v is None or isinstance(v, str)):
- raise TypeError("Scalar must be NA or str")
+ raise TypeError("Must provide strings")
return super()._maybe_convert_setitem_value(value)
def isin(self, values: ArrayLike) -> npt.NDArray[np.bool_]:
@@ -283,61 +278,54 @@ def _data(self):
# ------------------------------------------------------------------------
# String methods interface
+ _str_isalnum = ArrowStringArrayMixin._str_isalnum
+ _str_isalpha = ArrowStringArrayMixin._str_isalpha
+ _str_isdecimal = ArrowStringArrayMixin._str_isdecimal
+ _str_isdigit = ArrowStringArrayMixin._str_isdigit
+ _str_islower = ArrowStringArrayMixin._str_islower
+ _str_isnumeric = ArrowStringArrayMixin._str_isnumeric
+ _str_isspace = ArrowStringArrayMixin._str_isspace
+ _str_istitle = ArrowStringArrayMixin._str_istitle
+ _str_isupper = ArrowStringArrayMixin._str_isupper
+
_str_map = BaseStringArray._str_map
+ _str_startswith = ArrowStringArrayMixin._str_startswith
+ _str_endswith = ArrowStringArrayMixin._str_endswith
+ _str_pad = ArrowStringArrayMixin._str_pad
+ _str_match = ArrowStringArrayMixin._str_match
+ _str_fullmatch = ArrowStringArrayMixin._str_fullmatch
+ _str_lower = ArrowStringArrayMixin._str_lower
+ _str_upper = ArrowStringArrayMixin._str_upper
+ _str_strip = ArrowStringArrayMixin._str_strip
+ _str_lstrip = ArrowStringArrayMixin._str_lstrip
+ _str_rstrip = ArrowStringArrayMixin._str_rstrip
+ _str_removesuffix = ArrowStringArrayMixin._str_removesuffix
+ _str_get = ArrowStringArrayMixin._str_get
+ _str_capitalize = ArrowStringArrayMixin._str_capitalize
+ _str_title = ArrowStringArrayMixin._str_title
+ _str_swapcase = ArrowStringArrayMixin._str_swapcase
+ _str_slice_replace = ArrowStringArrayMixin._str_slice_replace
+ _str_len = ArrowStringArrayMixin._str_len
+ _str_slice = ArrowStringArrayMixin._str_slice
def _str_contains(
self, pat, case: bool = True, flags: int = 0, na=np.nan, regex: bool = True
):
if flags:
- fallback_performancewarning()
return super()._str_contains(pat, case, flags, na, regex)
- if regex:
- result = pc.match_substring_regex(self._pa_array, pat, ignore_case=not case)
- else:
- result = pc.match_substring(self._pa_array, pat, ignore_case=not case)
- result = self._result_converter(result, na=na)
if not isna(na):
- result[isna(result)] = bool(na)
- return result
-
- def _str_startswith(self, pat: str | tuple[str, ...], na: Scalar | None = None):
- if isinstance(pat, str):
- result = pc.starts_with(self._pa_array, pattern=pat)
- else:
- if len(pat) == 0:
- # mimic existing behaviour of string extension array
- # and python string method
- result = pa.array(
- np.zeros(len(self._pa_array), dtype=bool), mask=isna(self._pa_array)
- )
- else:
- result = pc.starts_with(self._pa_array, pattern=pat[0])
-
- for p in pat[1:]:
- result = pc.or_(result, pc.starts_with(self._pa_array, pattern=p))
- if not isna(na):
- result = result.fill_null(na)
- return self._result_converter(result)
-
- def _str_endswith(self, pat: str | tuple[str, ...], na: Scalar | None = None):
- if isinstance(pat, str):
- result = pc.ends_with(self._pa_array, pattern=pat)
- else:
- if len(pat) == 0:
- # mimic existing behaviour of string extension array
- # and python string method
- result = pa.array(
- np.zeros(len(self._pa_array), dtype=bool), mask=isna(self._pa_array)
+ if not isinstance(na, bool):
+ # GH#59561
+ warnings.warn(
+ "Allowing a non-bool 'na' in obj.str.contains is deprecated "
+ "and will raise in a future version.",
+ FutureWarning,
+ stacklevel=find_stack_level(),
)
- else:
- result = pc.ends_with(self._pa_array, pattern=pat[0])
+ na = bool(na)
- for p in pat[1:]:
- result = pc.or_(result, pc.ends_with(self._pa_array, pattern=p))
- if not isna(na):
- result = result.fill_null(na)
- return self._result_converter(result)
+ return ArrowStringArrayMixin._str_contains(self, pat, case, flags, na, regex)
def _str_replace(
self,
@@ -349,146 +337,38 @@ def _str_replace(
regex: bool = True,
):
if isinstance(pat, re.Pattern) or callable(repl) or not case or flags:
- fallback_performancewarning()
return super()._str_replace(pat, repl, n, case, flags, regex)
- func = pc.replace_substring_regex if regex else pc.replace_substring
- result = func(self._pa_array, pattern=pat, replacement=repl, max_replacements=n)
- return type(self)(result)
+ return ArrowStringArrayMixin._str_replace(
+ self, pat, repl, n, case, flags, regex
+ )
def _str_repeat(self, repeats: int | Sequence[int]):
if not isinstance(repeats, int):
return super()._str_repeat(repeats)
else:
- return type(self)(pc.binary_repeat(self._pa_array, repeats))
-
- def _str_match(
- self, pat: str, case: bool = True, flags: int = 0, na: Scalar | None = None
- ):
- if not pat.startswith("^"):
- pat = f"^{pat}"
- return self._str_contains(pat, case, flags, na, regex=True)
-
- def _str_fullmatch(
- self, pat, case: bool = True, flags: int = 0, na: Scalar | None = None
- ):
- if not pat.endswith("$") or pat.endswith("\\$"):
- pat = f"{pat}$"
- return self._str_match(pat, case, flags, na)
-
- def _str_slice(
- self, start: int | None = None, stop: int | None = None, step: int | None = None
- ):
- if stop is None:
- return super()._str_slice(start, stop, step)
- if start is None:
- start = 0
- if step is None:
- step = 1
- return type(self)(
- pc.utf8_slice_codeunits(self._pa_array, start=start, stop=stop, step=step)
- )
-
- def _str_isalnum(self):
- result = pc.utf8_is_alnum(self._pa_array)
- return self._result_converter(result)
-
- def _str_isalpha(self):
- result = pc.utf8_is_alpha(self._pa_array)
- return self._result_converter(result)
-
- def _str_isdecimal(self):
- result = pc.utf8_is_decimal(self._pa_array)
- return self._result_converter(result)
-
- def _str_isdigit(self):
- result = pc.utf8_is_digit(self._pa_array)
- return self._result_converter(result)
-
- def _str_islower(self):
- result = pc.utf8_is_lower(self._pa_array)
- return self._result_converter(result)
-
- def _str_isnumeric(self):
- result = pc.utf8_is_numeric(self._pa_array)
- return self._result_converter(result)
-
- def _str_isspace(self):
- result = pc.utf8_is_space(self._pa_array)
- return self._result_converter(result)
-
- def _str_istitle(self):
- result = pc.utf8_is_title(self._pa_array)
- return self._result_converter(result)
-
- def _str_isupper(self):
- result = pc.utf8_is_upper(self._pa_array)
- return self._result_converter(result)
-
- def _str_len(self):
- result = pc.utf8_length(self._pa_array)
- return self._convert_int_dtype(result)
-
- def _str_lower(self):
- return type(self)(pc.utf8_lower(self._pa_array))
-
- def _str_upper(self):
- return type(self)(pc.utf8_upper(self._pa_array))
-
- def _str_strip(self, to_strip=None):
- if to_strip is None:
- result = pc.utf8_trim_whitespace(self._pa_array)
- else:
- result = pc.utf8_trim(self._pa_array, characters=to_strip)
- return type(self)(result)
-
- def _str_lstrip(self, to_strip=None):
- if to_strip is None:
- result = pc.utf8_ltrim_whitespace(self._pa_array)
- else:
- result = pc.utf8_ltrim(self._pa_array, characters=to_strip)
- return type(self)(result)
-
- def _str_rstrip(self, to_strip=None):
- if to_strip is None:
- result = pc.utf8_rtrim_whitespace(self._pa_array)
- else:
- result = pc.utf8_rtrim(self._pa_array, characters=to_strip)
- return type(self)(result)
+ return ArrowExtensionArray._str_repeat(self, repeats=repeats)
def _str_removeprefix(self, prefix: str):
if not pa_version_under13p0:
- starts_with = pc.starts_with(self._pa_array, pattern=prefix)
- removed = pc.utf8_slice_codeunits(self._pa_array, len(prefix))
- result = pc.if_else(starts_with, removed, self._pa_array)
- return type(self)(result)
+ return ArrowStringArrayMixin._str_removeprefix(self, prefix)
return super()._str_removeprefix(prefix)
- def _str_removesuffix(self, suffix: str):
- ends_with = pc.ends_with(self._pa_array, pattern=suffix)
- removed = pc.utf8_slice_codeunits(self._pa_array, 0, stop=-len(suffix))
- result = pc.if_else(ends_with, removed, self._pa_array)
- return type(self)(result)
-
def _str_count(self, pat: str, flags: int = 0):
if flags:
return super()._str_count(pat, flags)
result = pc.count_substring_regex(self._pa_array, pat)
- return self._convert_int_dtype(result)
+ return self._convert_int_result(result)
def _str_find(self, sub: str, start: int = 0, end: int | None = None):
- if start != 0 and end is not None:
- slices = pc.utf8_slice_codeunits(self._pa_array, start, stop=end)
- result = pc.find_substring(slices, sub)
- not_found = pc.equal(result, -1)
- offset_result = pc.add(result, end - start)
- result = pc.if_else(not_found, result, offset_result)
- elif start == 0 and end is None:
- slices = self._pa_array
- result = pc.find_substring(slices, sub)
- else:
+ if (
+ pa_version_under13p0
+ and not (start != 0 and end is not None)
+ and not (start == 0 and end is None)
+ ):
+ # GH#59562
return super()._str_find(sub, start, end)
- return self._convert_int_dtype(result)
+ return ArrowStringArrayMixin._str_find(self, sub, start, end)
def _str_get_dummies(self, sep: str = "|"):
dummies_pa, labels = ArrowExtensionArray(self._pa_array)._str_get_dummies(sep)
@@ -497,7 +377,7 @@ def _str_get_dummies(self, sep: str = "|"):
dummies = np.vstack(dummies_pa.to_numpy())
return dummies.astype(np.int64, copy=False), labels
- def _convert_int_dtype(self, result):
+ def _convert_int_result(self, result):
if self.dtype.na_value is np.nan:
if isinstance(result, pa.Array):
result = result.to_numpy(zero_copy_only=False)
@@ -509,6 +389,16 @@ def _convert_int_dtype(self, result):
return Int64Dtype().__from_arrow__(result)
+ def _convert_rank_result(self, result):
+ if self.dtype.na_value is np.nan:
+ if isinstance(result, pa.Array):
+ result = result.to_numpy(zero_copy_only=False)
+ else:
+ result = result.to_numpy()
+ return result.astype("float64", copy=False)
+
+ return Float64Dtype().__from_arrow__(result)
+
def _reduce(
self, name: str, *, skipna: bool = True, keepdims: bool = False, **kwargs
):
@@ -524,34 +414,12 @@ def _reduce(
result = self._reduce_calc(name, skipna=skipna, keepdims=keepdims, **kwargs)
if name in ("argmin", "argmax") and isinstance(result, pa.Array):
- return self._convert_int_dtype(result)
+ return self._convert_int_result(result)
elif isinstance(result, pa.Array):
return type(self)(result)
else:
return result
- def _rank(
- self,
- *,
- axis: AxisInt = 0,
- method: str = "average",
- na_option: str = "keep",
- ascending: bool = True,
- pct: bool = False,
- ):
- """
- See Series.rank.__doc__.
- """
- return self._convert_int_dtype(
- self._rank_calc(
- axis=axis,
- method=method,
- na_option=na_option,
- ascending=ascending,
- pct=pct,
- )
- )
-
def value_counts(self, dropna: bool = True) -> Series:
result = super().value_counts(dropna=dropna)
if self.dtype.na_value is np.nan:
@@ -573,10 +441,3 @@ def _cmp_method(self, other, op):
class ArrowStringArrayNumpySemantics(ArrowStringArray):
_na_value = np.nan
- _str_get = ArrowStringArrayMixin._str_get
- _str_removesuffix = ArrowStringArrayMixin._str_removesuffix
- _str_capitalize = ArrowStringArrayMixin._str_capitalize
- _str_pad = ArrowStringArrayMixin._str_pad
- _str_title = ArrowStringArrayMixin._str_title
- _str_swapcase = ArrowStringArrayMixin._str_swapcase
- _str_slice_replace = ArrowStringArrayMixin._str_slice_replace
diff --git a/pandas/core/computation/eval.py b/pandas/core/computation/eval.py
index f1fe528de06f8..7bb623cba3755 100644
--- a/pandas/core/computation/eval.py
+++ b/pandas/core/computation/eval.py
@@ -10,7 +10,10 @@
from pandas.util._exceptions import find_stack_level
from pandas.util._validators import validate_bool_kwarg
-from pandas.core.dtypes.common import is_extension_array_dtype
+from pandas.core.dtypes.common import (
+ is_extension_array_dtype,
+ is_string_dtype,
+)
from pandas.core.computation.engines import ENGINES
from pandas.core.computation.expr import (
@@ -336,10 +339,13 @@ def eval(
parsed_expr = Expr(expr, engine=engine, parser=parser, env=env)
if engine == "numexpr" and (
- is_extension_array_dtype(parsed_expr.terms.return_type)
+ (
+ is_extension_array_dtype(parsed_expr.terms.return_type)
+ and not is_string_dtype(parsed_expr.terms.return_type)
+ )
or getattr(parsed_expr.terms, "operand_types", None) is not None
and any(
- is_extension_array_dtype(elem)
+ (is_extension_array_dtype(elem) and not is_string_dtype(elem))
for elem in parsed_expr.terms.operand_types
)
):
diff --git a/pandas/core/computation/expr.py b/pandas/core/computation/expr.py
index d642c37cea129..34055d2177626 100644
--- a/pandas/core/computation/expr.py
+++ b/pandas/core/computation/expr.py
@@ -20,6 +20,8 @@
from pandas.errors import UndefinedVariableError
+from pandas.core.dtypes.common import is_string_dtype
+
import pandas.core.common as com
from pandas.core.computation.ops import (
ARITH_OPS_SYMS,
@@ -520,10 +522,12 @@ def _maybe_evaluate_binop(
elif self.engine != "pytables":
if (
getattr(lhs, "return_type", None) == object
+ or is_string_dtype(getattr(lhs, "return_type", None))
or getattr(rhs, "return_type", None) == object
+ or is_string_dtype(getattr(rhs, "return_type", None))
):
# evaluate "==" and "!=" in python if either of our operands
- # has an object return type
+ # has an object or string return type
return self._maybe_eval(res, eval_in_python + maybe_eval_in_python)
return res
diff --git a/pandas/core/construction.py b/pandas/core/construction.py
index 5bccca9cfbd47..584a1d417d198 100644
--- a/pandas/core/construction.py
+++ b/pandas/core/construction.py
@@ -609,7 +609,10 @@ def sanitize_array(
dtype = StringDtype(na_value=np.nan)
subarr = dtype.construct_array_type()._from_sequence(data, dtype=dtype)
- if subarr is data and copy:
+ if (
+ subarr is data
+ or (subarr.dtype == "str" and subarr.dtype.storage == "python") # type: ignore[union-attr]
+ ) and copy:
subarr = subarr.copy()
else:
diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py
index df0251d141984..fe705daaad5fa 100644
--- a/pandas/core/dtypes/common.py
+++ b/pandas/core/dtypes/common.py
@@ -12,6 +12,8 @@
import numpy as np
+from pandas._config import using_string_dtype
+
from pandas._libs import (
Interval,
Period,
@@ -1325,7 +1327,15 @@ def is_extension_array_dtype(arr_or_dtype) -> bool:
elif isinstance(dtype, np.dtype):
return False
else:
- return registry.find(dtype) is not None
+ try:
+ with warnings.catch_warnings():
+ # pandas_dtype(..) can raise UserWarning for class input
+ warnings.simplefilter("ignore", UserWarning)
+ dtype = pandas_dtype(dtype)
+ except (TypeError, ValueError):
+ # np.dtype(..) can raise ValueError
+ return False
+ return isinstance(dtype, ExtensionDtype)
def is_ea_or_datetimelike_dtype(dtype: DtypeObj | None) -> bool:
@@ -1620,6 +1630,12 @@ def pandas_dtype(dtype) -> DtypeObj:
elif isinstance(dtype, (np.dtype, ExtensionDtype)):
return dtype
+ # builtin aliases
+ if dtype is str and using_string_dtype():
+ from pandas.core.arrays.string_ import StringDtype
+
+ return StringDtype(na_value=np.nan)
+
# registered extension types
result = registry.find(dtype)
if result is not None:
diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py
index 1c43ef55c11d7..e7efb8598ec61 100644
--- a/pandas/core/dtypes/dtypes.py
+++ b/pandas/core/dtypes/dtypes.py
@@ -453,7 +453,7 @@ def __eq__(self, other: object) -> bool:
# Because left and right have the same length and are unique,
# `indexer` not having any -1s implies that there is a
# bijection between `left` and `right`.
- return (indexer != -1).all()
+ return bool((indexer != -1).all())
# With object-dtype we need a comparison that identifies
# e.g. int(2) as distinct from float(2)
@@ -1791,7 +1791,7 @@ def _is_na_fill_value(self) -> bool:
@property
def _is_numeric(self) -> bool:
- return not self.subtype == object
+ return self.subtype != object
@property
def _is_boolean(self) -> bool:
diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
index 825316585c03c..8e8eb768130fd 100644
--- a/pandas/core/indexes/base.py
+++ b/pandas/core/indexes/base.py
@@ -506,7 +506,8 @@ def __new__(
elif is_ea_or_datetimelike_dtype(dtype):
# non-EA dtype indexes have special casting logic, so we punt here
- pass
+ if isinstance(data, (set, frozenset)):
+ data = list(data)
elif is_ea_or_datetimelike_dtype(data_dtype):
pass
@@ -6414,7 +6415,11 @@ def _should_compare(self, other: Index) -> bool:
return False
dtype = _unpack_nested_dtype(other)
- return self._is_comparable_dtype(dtype) or is_object_dtype(dtype)
+ return (
+ self._is_comparable_dtype(dtype)
+ or is_object_dtype(dtype)
+ or is_string_dtype(dtype)
+ )
def _is_comparable_dtype(self, dtype: DtypeObj) -> bool:
"""
@@ -6995,6 +7000,9 @@ def insert(self, loc: int, item) -> Index:
# We cannot keep the same dtype, so cast to the (often object)
# minimal shared dtype before doing the insert.
dtype = self._find_common_type_compat(item)
+ if dtype == self.dtype:
+ # EA's might run into recursion errors if loc is invalid
+ raise
return self.astype(dtype).insert(loc, item)
if arr.dtype != object or not isinstance(
diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py
index 4fcdb87974511..635924674d9f4 100644
--- a/pandas/core/indexes/interval.py
+++ b/pandas/core/indexes/interval.py
@@ -50,6 +50,7 @@
is_number,
is_object_dtype,
is_scalar,
+ is_string_dtype,
pandas_dtype,
)
from pandas.core.dtypes.dtypes import (
@@ -699,7 +700,7 @@ def _get_indexer(
# left/right get_indexer, compare elementwise, equality -> match
indexer = self._get_indexer_unique_sides(target)
- elif not is_object_dtype(target.dtype):
+ elif not (is_object_dtype(target.dtype) or is_string_dtype(target.dtype)):
# homogeneous scalar index: use IntervalTree
# we should always have self._should_partial_index(target) here
target = self._maybe_convert_i8(target)
diff --git a/pandas/core/strings/object_array.py b/pandas/core/strings/object_array.py
index 090e27ec58cc3..f376c239a0ce0 100644
--- a/pandas/core/strings/object_array.py
+++ b/pandas/core/strings/object_array.py
@@ -10,12 +10,14 @@
cast,
)
import unicodedata
+import warnings
import numpy as np
from pandas._libs import lib
import pandas._libs.missing as libmissing
import pandas._libs.ops as libops
+from pandas.util._exceptions import find_stack_level
from pandas.core.dtypes.missing import isna
@@ -140,14 +142,38 @@ def _str_contains(
else:
upper_pat = pat.upper()
f = lambda x: upper_pat in x.upper()
+ if not isna(na) and not isinstance(na, bool):
+ # GH#59561
+ warnings.warn(
+ "Allowing a non-bool 'na' in obj.str.contains is deprecated "
+ "and will raise in a future version.",
+ FutureWarning,
+ stacklevel=find_stack_level(),
+ )
return self._str_map(f, na, dtype=np.dtype("bool"))
def _str_startswith(self, pat, na=None):
f = lambda x: x.startswith(pat)
+ if not isna(na) and not isinstance(na, bool):
+ # GH#59561
+ warnings.warn(
+ "Allowing a non-bool 'na' in obj.str.startswith is deprecated "
+ "and will raise in a future version.",
+ FutureWarning,
+ stacklevel=find_stack_level(),
+ )
return self._str_map(f, na_value=na, dtype=np.dtype(bool))
def _str_endswith(self, pat, na=None):
f = lambda x: x.endswith(pat)
+ if not isna(na) and not isinstance(na, bool):
+ # GH#59561
+ warnings.warn(
+ "Allowing a non-bool 'na' in obj.str.endswith is deprecated "
+ "and will raise in a future version.",
+ FutureWarning,
+ stacklevel=find_stack_level(),
+ )
return self._str_map(f, na_value=na, dtype=np.dtype(bool))
def _str_replace(
diff --git a/pandas/io/_util.py b/pandas/io/_util.py
index 68fcfcf65e0c2..50a97f1059b5c 100644
--- a/pandas/io/_util.py
+++ b/pandas/io/_util.py
@@ -24,6 +24,7 @@ def _arrow_dtype_mapping() -> dict:
 pa.string(): pd.StringDtype(),
 pa.float32(): pd.Float32Dtype(),
 pa.float64(): pd.Float64Dtype(),
+ pa.large_string(): pd.StringDtype(),
 }
diff --git a/pandas/tests/apply/test_frame_apply.py b/pandas/tests/apply/test_frame_apply.py
index 5e0f991d5c406..6a328dfb39be5 100644
--- a/pandas/tests/apply/test_frame_apply.py
+++ b/pandas/tests/apply/test_frame_apply.py
@@ -65,7 +65,6 @@ def test_apply(float_frame, engine, request):
assert result.index is float_frame.index
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
@pytest.mark.parametrize("axis", [0, 1])
@pytest.mark.parametrize("raw", [True, False])
def test_apply_args(float_frame, axis, raw, engine, request):
diff --git a/pandas/tests/apply/test_numba.py b/pandas/tests/apply/test_numba.py
index 6bbe5100e8826..20c067a776f4d 100644
--- a/pandas/tests/apply/test_numba.py
+++ b/pandas/tests/apply/test_numba.py
@@ -1,10 +1,9 @@
import numpy as np
import pytest
-from pandas._config import using_string_dtype
-
import pandas.util._test_decorators as td
+import pandas as pd
from pandas import (
DataFrame,
Index,
@@ -19,7 +18,6 @@ def apply_axis(request):
return request.param
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
def test_numba_vs_python_noop(float_frame, apply_axis):
func = lambda x: x
result = float_frame.apply(func, engine="numba", axis=apply_axis)
@@ -29,11 +27,10 @@ def test_numba_vs_python_noop(float_frame, apply_axis):
def test_numba_vs_python_string_index():
# GH#56189
- pytest.importorskip("pyarrow")
df = DataFrame(
1,
- index=Index(["a", "b"], dtype="string[pyarrow_numpy]"),
- columns=Index(["x", "y"], dtype="string[pyarrow_numpy]"),
+ index=Index(["a", "b"], dtype=pd.StringDtype(na_value=np.nan)),
+ columns=Index(["x", "y"], dtype=pd.StringDtype(na_value=np.nan)),
)
func = lambda x: x
result = df.apply(func, engine="numba", axis=0)
@@ -43,7 +40,6 @@ def test_numba_vs_python_string_index():
)
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
def test_numba_vs_python_indexing():
frame = DataFrame(
{"a": [1, 2, 3], "b": [4, 5, 6], "c": [7.0, 8.0, 9.0]},
diff --git a/pandas/tests/arithmetic/test_object.py b/pandas/tests/arithmetic/test_object.py
index 899ea1910d055..44e485d40ba53 100644
--- a/pandas/tests/arithmetic/test_object.py
+++ b/pandas/tests/arithmetic/test_object.py
@@ -8,9 +8,6 @@
import numpy as np
import pytest
-from pandas._config import using_string_dtype
-
-from pandas.compat import HAS_PYARROW
import pandas.util._test_decorators as td
import pandas as pd
@@ -186,6 +183,10 @@ def test_objarr_add_invalid(self, op, box_with_array):
"unsupported operand type",
"must be str",
"has no kernel",
+ "operation 'add' not supported",
+ "operation 'radd' not supported",
+ "operation 'sub' not supported",
+ "operation 'rsub' not supported",
]
)
with pytest.raises(Exception, match=msg):
@@ -318,27 +319,17 @@ def test_add(self):
expected = pd.Index(["1a", "1b", "1c"])
tm.assert_index_equal("1" + index, expected)
- @pytest.mark.xfail(
- using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)"
- )
- def test_sub_fail(self, using_infer_string):
+ def test_sub_fail(self):
index = pd.Index([str(i) for i in range(10)])
- if using_infer_string:
- import pyarrow as pa
-
- err = pa.lib.ArrowNotImplementedError
- msg = "has no kernel"
- else:
- err = TypeError
- msg = "unsupported operand type|Cannot broadcast"
- with pytest.raises(err, match=msg):
+ msg = "unsupported operand type|Cannot broadcast|sub' not supported"
+ with pytest.raises(TypeError, match=msg):
index - "a"
- with pytest.raises(err, match=msg):
+ with pytest.raises(TypeError, match=msg):
index - index
- with pytest.raises(err, match=msg):
+ with pytest.raises(TypeError, match=msg):
index - index.tolist()
- with pytest.raises(err, match=msg):
+ with pytest.raises(TypeError, match=msg):
index.tolist() - index
def test_sub_object(self):
diff --git a/pandas/tests/arrays/boolean/test_arithmetic.py b/pandas/tests/arrays/boolean/test_arithmetic.py
index 4dbd8eb9f5ca7..9ff690cdc914d 100644
--- a/pandas/tests/arrays/boolean/test_arithmetic.py
+++ b/pandas/tests/arrays/boolean/test_arithmetic.py
@@ -3,10 +3,6 @@
import numpy as np
import pytest
-from pandas._config import using_string_dtype
-
-from pandas.compat import HAS_PYARROW
-
import pandas as pd
import pandas._testing as tm
@@ -94,19 +90,8 @@ def test_op_int8(left_array, right_array, opname):
# -----------------------------------------------------------------------------
-@pytest.mark.xfail(
- using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)"
-)
-def test_error_invalid_values(data, all_arithmetic_operators, using_infer_string):
+def test_error_invalid_values(data, all_arithmetic_operators):
# invalid ops
-
- if using_infer_string:
- import pyarrow as pa
-
- err = (TypeError, pa.lib.ArrowNotImplementedError, NotImplementedError)
- else:
- err = TypeError
-
op = all_arithmetic_operators
s = pd.Series(data)
ops = getattr(s, op)
@@ -116,7 +101,8 @@ def test_error_invalid_values(data, all_arithmetic_operators, using_infer_string
"did not contain a loop with signature matching types|"
"BooleanArray cannot perform the operation|"
"not supported for the input types, and the inputs could not be safely coerced "
- "to any supported types according to the casting rule ''safe''"
+ "to any supported types according to the casting rule ''safe''|"
+ "not supported for dtype"
)
with pytest.raises(TypeError, match=msg):
ops("foo")
@@ -125,9 +111,10 @@ def test_error_invalid_values(data, all_arithmetic_operators, using_infer_string
r"unsupported operand type\(s\) for",
"Concatenation operation is not implemented for NumPy arrays",
"has no kernel",
+ "not supported for dtype",
]
)
- with pytest.raises(err, match=msg):
+ with pytest.raises(TypeError, match=msg):
ops(pd.Timestamp("20180101"))
# invalid array-likes
@@ -140,7 +127,8 @@ def test_error_invalid_values(data, all_arithmetic_operators, using_infer_string
"not all arguments converted during string formatting",
"has no kernel",
"not implemented",
+ "not supported for dtype",
]
)
- with pytest.raises(err, match=msg):
+ with pytest.raises(TypeError, match=msg):
ops(pd.Series("foo", index=s.index))
diff --git a/pandas/tests/arrays/categorical/test_api.py b/pandas/tests/arrays/categorical/test_api.py
index 1d948b7495a43..a939ee5f6f53f 100644
--- a/pandas/tests/arrays/categorical/test_api.py
+++ b/pandas/tests/arrays/categorical/test_api.py
@@ -3,8 +3,6 @@
import numpy as np
import pytest
-from pandas._config import using_string_dtype
-
from pandas.compat import PY311
from pandas import (
@@ -158,7 +156,6 @@ def test_reorder_categories_raises(self, new_categories):
with pytest.raises(ValueError, match=msg):
cat.reorder_categories(new_categories)
- @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
def test_add_categories(self):
cat = Categorical(["a", "b", "c", "a"], ordered=True)
old = cat.copy()
diff --git a/pandas/tests/arrays/floating/test_arithmetic.py b/pandas/tests/arrays/floating/test_arithmetic.py
index 768d3c1449fa4..009fac4c2f5ed 100644
--- a/pandas/tests/arrays/floating/test_arithmetic.py
+++ b/pandas/tests/arrays/floating/test_arithmetic.py
@@ -3,8 +3,6 @@
import numpy as np
import pytest
-from pandas._config import using_string_dtype
-
import pandas as pd
import pandas._testing as tm
from pandas.core.arrays import FloatingArray
@@ -124,19 +122,11 @@ def test_arith_zero_dim_ndarray(other):
# -----------------------------------------------------------------------------
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
-def test_error_invalid_values(data, all_arithmetic_operators, using_infer_string):
+def test_error_invalid_values(data, all_arithmetic_operators):
op = all_arithmetic_operators
s = pd.Series(data)
ops = getattr(s, op)
- if using_infer_string:
- import pyarrow as pa
-
- errs = (TypeError, pa.lib.ArrowNotImplementedError, NotImplementedError)
- else:
- errs = TypeError
-
# invalid scalars
msg = "|".join(
[
@@ -152,15 +142,17 @@ def test_error_invalid_values(data, all_arithmetic_operators, using_infer_string
"Concatenation operation is not implemented for NumPy arrays",
"has no kernel",
"not implemented",
+ "not supported for dtype",
+ "Can only string multiply by an integer",
]
)
- with pytest.raises(errs, match=msg):
+ with pytest.raises(TypeError, match=msg):
ops("foo")
- with pytest.raises(errs, match=msg):
+ with pytest.raises(TypeError, match=msg):
ops(pd.Timestamp("20180101"))
# invalid array-likes
- with pytest.raises(errs, match=msg):
+ with pytest.raises(TypeError, match=msg):
ops(pd.Series("foo", index=s.index))
msg = "|".join(
@@ -181,9 +173,10 @@ def test_error_invalid_values(data, all_arithmetic_operators, using_infer_string
"cannot subtract DatetimeArray from ndarray",
"has no kernel",
"not implemented",
+ "not supported for dtype",
]
)
- with pytest.raises(errs, match=msg):
+ with pytest.raises(TypeError, match=msg):
ops(pd.Series(pd.date_range("20180101", periods=len(s))))
diff --git a/pandas/tests/arrays/floating/test_astype.py b/pandas/tests/arrays/floating/test_astype.py
index ccf644b34051d..752ebe194ffcf 100644
--- a/pandas/tests/arrays/floating/test_astype.py
+++ b/pandas/tests/arrays/floating/test_astype.py
@@ -68,11 +68,9 @@ def test_astype_str(using_infer_string):
if using_infer_string:
expected = pd.array(["0.1", "0.2", None], dtype=pd.StringDtype(na_value=np.nan))
- tm.assert_extension_array_equal(a.astype("str"), expected)
- # TODO(infer_string) this should also be a string array like above
- expected = np.array(["0.1", "0.2", ""], dtype="U32")
- tm.assert_numpy_array_equal(a.astype(str), expected)
+ tm.assert_extension_array_equal(a.astype(str), expected)
+ tm.assert_extension_array_equal(a.astype("str"), expected)
else:
expected = np.array(["0.1", "0.2", ""], dtype="U32")
diff --git a/pandas/tests/arrays/integer/test_arithmetic.py b/pandas/tests/arrays/integer/test_arithmetic.py
index 8aa8c2db940b4..dee3deeee0f2f 100644
--- a/pandas/tests/arrays/integer/test_arithmetic.py
+++ b/pandas/tests/arrays/integer/test_arithmetic.py
@@ -3,8 +3,6 @@
import numpy as np
import pytest
-from pandas._config import using_string_dtype
-
import pandas as pd
import pandas._testing as tm
from pandas.core import ops
@@ -174,19 +172,11 @@ def test_numpy_zero_dim_ndarray(other):
# -----------------------------------------------------------------------------
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
-def test_error_invalid_values(data, all_arithmetic_operators, using_infer_string):
+def test_error_invalid_values(data, all_arithmetic_operators):
op = all_arithmetic_operators
s = pd.Series(data)
ops = getattr(s, op)
- if using_infer_string:
- import pyarrow as pa
-
- errs = (TypeError, pa.lib.ArrowNotImplementedError, NotImplementedError)
- else:
- errs = TypeError
-
# invalid scalars
msg = "|".join(
[
@@ -201,24 +191,21 @@ def test_error_invalid_values(data, all_arithmetic_operators, using_infer_string
"has no kernel",
"not implemented",
"The 'out' kwarg is necessary. Use numpy.strings.multiply without it.",
+ "not supported for dtype",
]
)
- with pytest.raises(errs, match=msg):
+ with pytest.raises(TypeError, match=msg):
ops("foo")
- with pytest.raises(errs, match=msg):
+ with pytest.raises(TypeError, match=msg):
ops(pd.Timestamp("20180101"))
# invalid array-likes
str_ser = pd.Series("foo", index=s.index)
# with pytest.raises(TypeError, match=msg):
- if (
- all_arithmetic_operators
- in [
- "__mul__",
- "__rmul__",
- ]
- and not using_infer_string
- ): # (data[~data.isna()] >= 0).all():
+ if all_arithmetic_operators in [
+ "__mul__",
+ "__rmul__",
+ ]: # (data[~data.isna()] >= 0).all():
res = ops(str_ser)
expected = pd.Series(["foo" * x for x in data], index=s.index)
expected = expected.fillna(np.nan)
@@ -227,7 +214,7 @@ def test_error_invalid_values(data, all_arithmetic_operators, using_infer_string
# more-correct than np.nan here.
tm.assert_series_equal(res, expected)
else:
- with pytest.raises(errs, match=msg):
+ with pytest.raises(TypeError, match=msg):
ops(str_ser)
msg = "|".join(
@@ -242,9 +229,10 @@ def test_error_invalid_values(data, all_arithmetic_operators, using_infer_string
"cannot subtract DatetimeArray from ndarray",
"has no kernel",
"not implemented",
+ "not supported for dtype",
]
)
- with pytest.raises(errs, match=msg):
+ with pytest.raises(TypeError, match=msg):
ops(pd.Series(pd.date_range("20180101", periods=len(s))))
diff --git a/pandas/tests/arrays/integer/test_dtypes.py b/pandas/tests/arrays/integer/test_dtypes.py
index 7be00e569b3fe..90879d8bd3063 100644
--- a/pandas/tests/arrays/integer/test_dtypes.py
+++ b/pandas/tests/arrays/integer/test_dtypes.py
@@ -283,11 +283,9 @@ def test_astype_str(using_infer_string):
if using_infer_string:
expected = pd.array(["1", "2", None], dtype=pd.StringDtype(na_value=np.nan))
- tm.assert_extension_array_equal(a.astype("str"), expected)
- # TODO(infer_string) this should also be a string array like above
- expected = np.array(["1", "2", ""], dtype=f"{tm.ENDIAN}U21")
- tm.assert_numpy_array_equal(a.astype(str), expected)
+ tm.assert_extension_array_equal(a.astype(str), expected)
+ tm.assert_extension_array_equal(a.astype("str"), expected)
else:
expected = np.array(["1", "2", ""], dtype=f"{tm.ENDIAN}U21")
diff --git a/pandas/tests/arrays/masked/test_arrow_compat.py b/pandas/tests/arrays/masked/test_arrow_compat.py
index 31765165f5f16..293ee4095d02e 100644
--- a/pandas/tests/arrays/masked/test_arrow_compat.py
+++ b/pandas/tests/arrays/masked/test_arrow_compat.py
@@ -1,17 +1,12 @@
import numpy as np
import pytest
-from pandas._config import using_string_dtype
-
import pandas as pd
import pandas._testing as tm
-pytestmark = [
- pytest.mark.filterwarnings(
- "ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
- ),
- pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False),
-]
+pytestmark = pytest.mark.filterwarnings(
+ "ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
+)
pa = pytest.importorskip("pyarrow")
diff --git a/pandas/tests/arrays/sparse/test_astype.py b/pandas/tests/arrays/sparse/test_astype.py
index 83a507e679d46..e6e4a11a0f5ab 100644
--- a/pandas/tests/arrays/sparse/test_astype.py
+++ b/pandas/tests/arrays/sparse/test_astype.py
@@ -81,8 +81,8 @@ def test_astype_all(self, any_real_numpy_dtype):
),
(
SparseArray([0, 1, 10]),
- str,
- SparseArray(["0", "1", "10"], dtype=SparseDtype(str, "0")),
+ np.str_,
+ SparseArray(["0", "1", "10"], dtype=SparseDtype(np.str_, "0")),
),
(SparseArray(["10", "20"]), float, SparseArray([10.0, 20.0])),
(
diff --git a/pandas/tests/arrays/sparse/test_dtype.py b/pandas/tests/arrays/sparse/test_dtype.py
index 234f4092421e5..149c28341ba3d 100644
--- a/pandas/tests/arrays/sparse/test_dtype.py
+++ b/pandas/tests/arrays/sparse/test_dtype.py
@@ -177,7 +177,7 @@ def test_construct_from_string_fill_value_raises(string):
[
(SparseDtype(int, 0), float, SparseDtype(float, 0.0)),
(SparseDtype(int, 1), float, SparseDtype(float, 1.0)),
- (SparseDtype(int, 1), str, SparseDtype(object, "1")),
+ (SparseDtype(int, 1), np.str_, SparseDtype(object, "1")),
(SparseDtype(float, 1.5), int, SparseDtype(int, 1)),
],
)
diff --git a/pandas/tests/arrays/string_/test_concat.py b/pandas/tests/arrays/string_/test_concat.py
new file mode 100644
index 0000000000000..320d700b2b6c3
--- /dev/null
+++ b/pandas/tests/arrays/string_/test_concat.py
@@ -0,0 +1,73 @@
+import numpy as np
+import pytest
+
+from pandas.compat import HAS_PYARROW
+
+from pandas.core.dtypes.cast import find_common_type
+
+import pandas as pd
+import pandas._testing as tm
+from pandas.util.version import Version
+
+
+@pytest.mark.parametrize(
+ "to_concat_dtypes, result_dtype",
+ [
+ # same types
+ ([("pyarrow", pd.NA), ("pyarrow", pd.NA)], ("pyarrow", pd.NA)),
+ ([("pyarrow", np.nan), ("pyarrow", np.nan)], ("pyarrow", np.nan)),
+ ([("python", pd.NA), ("python", pd.NA)], ("python", pd.NA)),
+ ([("python", np.nan), ("python", np.nan)], ("python", np.nan)),
+ # pyarrow preference
+ ([("pyarrow", pd.NA), ("python", pd.NA)], ("pyarrow", pd.NA)),
+ # NA preference
+ ([("python", pd.NA), ("python", np.nan)], ("python", pd.NA)),
+ ],
+)
+def test_concat_series(request, to_concat_dtypes, result_dtype):
+ if any(storage == "pyarrow" for storage, _ in to_concat_dtypes) and not HAS_PYARROW:
+ pytest.skip("Could not import 'pyarrow'")
+
+ ser_list = [
+ pd.Series(["a", "b", None], dtype=pd.StringDtype(storage, na_value))
+ for storage, na_value in to_concat_dtypes
+ ]
+
+ result = pd.concat(ser_list, ignore_index=True)
+ expected = pd.Series(
+ ["a", "b", None, "a", "b", None], dtype=pd.StringDtype(*result_dtype)
+ )
+ tm.assert_series_equal(result, expected)
+
+ # order doesn't matter for result
+ result = pd.concat(ser_list[::-1], ignore_index=True)
+ tm.assert_series_equal(result, expected)
+
+
+def test_concat_with_object(string_dtype_arguments):
+ # _get_common_dtype cannot inspect values, so object dtype with strings still
+ # results in object dtype
+ result = pd.concat(
+ [
+ pd.Series(["a", "b", None], dtype=pd.StringDtype(*string_dtype_arguments)),
+ pd.Series(["a", "b", None], dtype=object),
+ ]
+ )
+ assert result.dtype == np.dtype("object")
+
+
+def test_concat_with_numpy(string_dtype_arguments):
+ # common type with a numpy string dtype always preserves the pandas string dtype
+ dtype = pd.StringDtype(*string_dtype_arguments)
+ assert find_common_type([dtype, np.dtype("U")]) == dtype
+ assert find_common_type([np.dtype("U"), dtype]) == dtype
+ assert find_common_type([dtype, np.dtype("U10")]) == dtype
+ assert find_common_type([np.dtype("U10"), dtype]) == dtype
+
+ # with any other numpy dtype -> object
+ assert find_common_type([dtype, np.dtype("S")]) == np.dtype("object")
+ assert find_common_type([dtype, np.dtype("int64")]) == np.dtype("object")
+
+ if Version(np.__version__) >= Version("2"):
+ assert find_common_type([dtype, np.dtypes.StringDType()]) == dtype
+ assert find_common_type([np.dtypes.StringDType(), dtype]) == dtype
diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py
index 1296cc3b5a494..265b9fc40629b 100644
--- a/pandas/tests/arrays/string_/test_string.py
+++ b/pandas/tests/arrays/string_/test_string.py
@@ -29,6 +29,12 @@ def dtype(string_dtype_arguments):
return pd.StringDtype(storage=storage, na_value=na_value)
+@pytest.fixture
+def dtype2(string_dtype_arguments2):
+ storage, na_value = string_dtype_arguments2
+ return pd.StringDtype(storage=storage, na_value=na_value)
+
+
@pytest.fixture
def cls(dtype):
"""Fixture giving array type from parametrized 'dtype'"""
@@ -101,10 +107,7 @@ def test_setitem_validates(cls, dtype):
with pytest.raises(TypeError, match=msg):
arr[0] = 10
- if dtype.storage == "python":
- msg = "Must provide strings."
- else:
- msg = "Scalar must be NA or str"
+ msg = "Must provide strings"
with pytest.raises(TypeError, match=msg):
arr[:] = np.array([1, 2])
@@ -524,7 +527,6 @@ def test_arrow_array(dtype):
assert arr.equals(expected)
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
@pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning")
def test_arrow_roundtrip(dtype, string_storage, using_infer_string):
# roundtrip possible from arrow 1.0.0
@@ -543,13 +545,16 @@ def test_arrow_roundtrip(dtype, string_storage, using_infer_string):
assert result["a"].dtype == "object"
else:
assert isinstance(result["a"].dtype, pd.StringDtype)
- expected = df.astype(f"string[{string_storage}]")
+ expected = df.astype(pd.StringDtype(string_storage, na_value=dtype.na_value))
+ if using_infer_string:
+ expected.columns = expected.columns.astype(
+ pd.StringDtype(string_storage, na_value=np.nan)
+ )
tm.assert_frame_equal(result, expected)
# ensure the missing value is represented by NA and not np.nan or None
assert result.loc[2, "a"] is result["a"].dtype.na_value
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
@pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning")
def test_arrow_load_from_zero_chunks(dtype, string_storage, using_infer_string):
# GH-41040
@@ -571,7 +576,11 @@ def test_arrow_load_from_zero_chunks(dtype, string_storage, using_infer_string):
assert result["a"].dtype == "object"
else:
assert isinstance(result["a"].dtype, pd.StringDtype)
- expected = df.astype(f"string[{string_storage}]")
+ expected = df.astype(pd.StringDtype(string_storage, na_value=dtype.na_value))
+ if using_infer_string:
+ expected.columns = expected.columns.astype(
+ pd.StringDtype(string_storage, na_value=np.nan)
+ )
tm.assert_frame_equal(result, expected)
@@ -686,11 +695,7 @@ def test_isin(dtype, fixed_now_ts):
tm.assert_series_equal(result, expected)
result = s.isin(["a", pd.NA])
- if dtype.storage == "python" and dtype.na_value is np.nan:
- # TODO(infer_string) we should make this consistent
- expected = pd.Series([True, False, False])
- else:
- expected = pd.Series([True, False, True])
+ expected = pd.Series([True, False, True])
tm.assert_series_equal(result, expected)
result = s.isin([])
@@ -701,6 +706,35 @@ def test_isin(dtype, fixed_now_ts):
expected = pd.Series([True, False, False])
tm.assert_series_equal(result, expected)
+ result = s.isin([fixed_now_ts])
+ expected = pd.Series([False, False, False])
+ tm.assert_series_equal(result, expected)
+
+
+def test_isin_string_array(dtype, dtype2):
+ s = pd.Series(["a", "b", None], dtype=dtype)
+
+ result = s.isin(pd.array(["a", "c"], dtype=dtype2))
+ expected = pd.Series([True, False, False])
+ tm.assert_series_equal(result, expected)
+
+ result = s.isin(pd.array(["a", None], dtype=dtype2))
+ expected = pd.Series([True, False, True])
+ tm.assert_series_equal(result, expected)
+
+
+def test_isin_arrow_string_array(dtype):
+ pa = pytest.importorskip("pyarrow")
+ s = pd.Series(["a", "b", None], dtype=dtype)
+
+ result = s.isin(pd.array(["a", "c"], dtype=pd.ArrowDtype(pa.string())))
+ expected = pd.Series([True, False, False])
+ tm.assert_series_equal(result, expected)
+
+ result = s.isin(pd.array(["a", None], dtype=pd.ArrowDtype(pa.string())))
+ expected = pd.Series([True, False, True])
+ tm.assert_series_equal(result, expected)
+
def test_setitem_scalar_with_mask_validation(dtype):
# https://github.com/pandas-dev/pandas/issues/47628
diff --git a/pandas/tests/arrays/string_/test_string_arrow.py b/pandas/tests/arrays/string_/test_string_arrow.py
index e6957feecf4b5..2f3840e92b62a 100644
--- a/pandas/tests/arrays/string_/test_string_arrow.py
+++ b/pandas/tests/arrays/string_/test_string_arrow.py
@@ -241,10 +241,11 @@ def test_setitem_invalid_indexer_raises():
arr[[0, 1]] = ["foo", "bar", "baz"]
-@pytest.mark.parametrize("dtype", ["string[pyarrow]", "string[pyarrow_numpy]"])
-def test_pickle_roundtrip(dtype):
+@pytest.mark.parametrize("na_value", [pd.NA, np.nan])
+def test_pickle_roundtrip(na_value):
# GH 42600
pytest.importorskip("pyarrow")
+ dtype = StringDtype("pyarrow", na_value=na_value)
expected = pd.Series(range(10), dtype=dtype)
expected_sliced = expected.head(2)
full_pickled = pickle.dumps(expected)
diff --git a/pandas/tests/base/test_misc.py b/pandas/tests/base/test_misc.py
index 3e0d8b1afedc0..b42e01c76335c 100644
--- a/pandas/tests/base/test_misc.py
+++ b/pandas/tests/base/test_misc.py
@@ -180,9 +180,7 @@ def test_access_by_position(index_flat):
assert index[-1] == index[size - 1]
msg = f"index {size} is out of bounds for axis 0 with size {size}"
- if is_dtype_equal(index.dtype, "string[pyarrow]") or is_dtype_equal(
- index.dtype, "string[pyarrow_numpy]"
- ):
+ if isinstance(index.dtype, pd.StringDtype) and index.dtype.storage == "pyarrow":
msg = "index out of bounds"
with pytest.raises(IndexError, match=msg):
index[size]
diff --git a/pandas/tests/copy_view/test_astype.py b/pandas/tests/copy_view/test_astype.py
index fb82329d5b50d..e0e3f6dc058a4 100644
--- a/pandas/tests/copy_view/test_astype.py
+++ b/pandas/tests/copy_view/test_astype.py
@@ -135,7 +135,8 @@ def test_astype_string_and_object_update_original(
tm.assert_frame_equal(df2, df_orig)
-def test_astype_string_copy_on_pickle_roundrip():
+def test_astype_str_copy_on_pickle_roundrip():
+ # TODO(infer_string) this test can be removed after 3.0 (once str is the default)
# https://github.com/pandas-dev/pandas/issues/54654
# ensure_string_array may alter array inplace
base = Series(np.array([(1, 2), None, 1], dtype="object"))
@@ -144,6 +145,25 @@ def test_astype_string_copy_on_pickle_roundrip():
tm.assert_series_equal(base, base_copy)
+def test_astype_string_copy_on_pickle_roundrip(any_string_dtype):
+ # https://github.com/pandas-dev/pandas/issues/54654
+ # ensure_string_array may alter array inplace
+ base = Series(np.array([(1, 2), None, 1], dtype="object"))
+ base_copy = pickle.loads(pickle.dumps(base))
+ base_copy.astype(any_string_dtype)
+ tm.assert_series_equal(base, base_copy)
+
+
+def test_astype_string_read_only_on_pickle_roundrip(any_string_dtype):
+ # https://github.com/pandas-dev/pandas/issues/54654
+ # ensure_string_array may alter read-only array inplace
+ base = Series(np.array([(1, 2), None, 1], dtype="object"))
+ base_copy = pickle.loads(pickle.dumps(base))
+ base_copy._values.flags.writeable = False
+ base_copy.astype(any_string_dtype)
+ tm.assert_series_equal(base, base_copy)
+
+
def test_astype_dict_dtypes(using_copy_on_write):
df = DataFrame(
{"a": [1, 2, 3], "b": [4, 5, 6], "c": Series([1.5, 1.5, 1.5], dtype="float64")}
diff --git a/pandas/tests/copy_view/test_constructors.py b/pandas/tests/copy_view/test_constructors.py
index 866b1964a334f..66c9b456f18ad 100644
--- a/pandas/tests/copy_view/test_constructors.py
+++ b/pandas/tests/copy_view/test_constructors.py
@@ -1,8 +1,6 @@
import numpy as np
import pytest
-from pandas._config import using_string_dtype
-
import pandas as pd
from pandas import (
DataFrame,
@@ -285,10 +283,9 @@ def test_dataframe_from_dict_of_series_with_reindex(dtype):
assert np.shares_memory(arr_before, arr_after)
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
@pytest.mark.parametrize("cons", [Series, Index])
@pytest.mark.parametrize(
- "data, dtype", [([1, 2], None), ([1, 2], "int64"), (["a", "b"], None)]
+ "data, dtype", [([1, 2], None), ([1, 2], "int64"), (["a", "b"], object)]
)
def test_dataframe_from_series_or_index(
using_copy_on_write, warn_copy_on_write, data, dtype, cons
diff --git a/pandas/tests/copy_view/test_functions.py b/pandas/tests/copy_view/test_functions.py
index a87baaedb9244..23ed7f9edcd22 100644
--- a/pandas/tests/copy_view/test_functions.py
+++ b/pandas/tests/copy_view/test_functions.py
@@ -16,10 +16,9 @@
from pandas.tests.copy_view.util import get_array
-@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)")
def test_concat_frames(using_copy_on_write):
- df = DataFrame({"b": ["a"] * 3})
- df2 = DataFrame({"a": ["a"] * 3})
+ df = DataFrame({"b": ["a"] * 3}, dtype=object)
+ df2 = DataFrame({"a": ["a"] * 3}, dtype=object)
df_orig = df.copy()
result = concat([df, df2], axis=1)
@@ -41,10 +40,9 @@ def test_concat_frames(using_copy_on_write):
tm.assert_frame_equal(df, df_orig)
-@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)")
def test_concat_frames_updating_input(using_copy_on_write):
- df = DataFrame({"b": ["a"] * 3})
- df2 = DataFrame({"a": ["a"] * 3})
+ df = DataFrame({"b": ["a"] * 3}, dtype=object)
+ df2 = DataFrame({"a": ["a"] * 3}, dtype=object)
result = concat([df, df2], axis=1)
if using_copy_on_write:
@@ -203,7 +201,6 @@ def test_concat_copy_keyword(using_copy_on_write, copy):
assert not np.shares_memory(get_array(df2, "b"), get_array(result, "b"))
-@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)")
@pytest.mark.parametrize(
"func",
[
@@ -212,8 +210,8 @@ def test_concat_copy_keyword(using_copy_on_write, copy):
],
)
def test_merge_on_key(using_copy_on_write, func):
- df1 = DataFrame({"key": ["a", "b", "c"], "a": [1, 2, 3]})
- df2 = DataFrame({"key": ["a", "b", "c"], "b": [4, 5, 6]})
+ df1 = DataFrame({"key": Series(["a", "b", "c"], dtype=object), "a": [1, 2, 3]})
+ df2 = DataFrame({"key": Series(["a", "b", "c"], dtype=object), "b": [4, 5, 6]})
df1_orig = df1.copy()
df2_orig = df2.copy()
@@ -267,7 +265,6 @@ def test_merge_on_index(using_copy_on_write):
tm.assert_frame_equal(df2, df2_orig)
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
@pytest.mark.parametrize(
"func, how",
[
@@ -276,8 +273,8 @@ def test_merge_on_index(using_copy_on_write):
],
)
def test_merge_on_key_enlarging_one(using_copy_on_write, func, how):
- df1 = DataFrame({"key": ["a", "b", "c"], "a": [1, 2, 3]})
- df2 = DataFrame({"key": ["a", "b"], "b": [4, 5]})
+ df1 = DataFrame({"key": Series(["a", "b", "c"], dtype=object), "a": [1, 2, 3]})
+ df2 = DataFrame({"key": Series(["a", "b"], dtype=object), "b": [4, 5]})
df1_orig = df1.copy()
df2_orig = df2.copy()
@@ -321,9 +318,13 @@ def test_merge_copy_keyword(using_copy_on_write, copy):
assert not np.shares_memory(get_array(df2, "b"), get_array(result, "b"))
-@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)")
+@pytest.mark.xfail(
+ using_string_dtype() and HAS_PYARROW,
+ reason="TODO(infer_string); result.index infers str dtype while both "
+ "df1 and df2 index are object.",
+)
def test_join_on_key(using_copy_on_write):
- df_index = Index(["a", "b", "c"], name="key")
+ df_index = Index(["a", "b", "c"], name="key", dtype=object)
df1 = DataFrame({"a": [1, 2, 3]}, index=df_index.copy(deep=True))
df2 = DataFrame({"b": [4, 5, 6]}, index=df_index.copy(deep=True))
@@ -355,9 +356,8 @@ def test_join_on_key(using_copy_on_write):
tm.assert_frame_equal(df2, df2_orig)
-@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)")
def test_join_multiple_dataframes_on_key(using_copy_on_write):
- df_index = Index(["a", "b", "c"], name="key")
+ df_index = Index(["a", "b", "c"], name="key", dtype=object)
df1 = DataFrame({"a": [1, 2, 3]}, index=df_index.copy(deep=True))
dfs_list = [
diff --git a/pandas/tests/copy_view/test_internals.py b/pandas/tests/copy_view/test_internals.py
index 6f7198520d22e..8526d38588897 100644
--- a/pandas/tests/copy_view/test_internals.py
+++ b/pandas/tests/copy_view/test_internals.py
@@ -1,12 +1,13 @@
import numpy as np
import pytest
-from pandas._config import using_string_dtype
-
import pandas.util._test_decorators as td
import pandas as pd
-from pandas import DataFrame
+from pandas import (
+ DataFrame,
+ Series,
+)
import pandas._testing as tm
from pandas.tests.copy_view.util import get_array
@@ -78,7 +79,6 @@ def test_switch_options():
@td.skip_array_manager_invalid_test
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
@pytest.mark.parametrize("dtype", [np.intp, np.int8])
@pytest.mark.parametrize(
"locs, arr",
@@ -105,7 +105,7 @@ def test_iset_splits_blocks_inplace(using_copy_on_write, locs, arr, dtype):
"c": [7, 8, 9],
"d": [10, 11, 12],
"e": [13, 14, 15],
- "f": ["a", "b", "c"],
+ "f": Series(["a", "b", "c"], dtype=object),
},
)
arr = arr.astype(dtype)
diff --git a/pandas/tests/dtypes/test_common.py b/pandas/tests/dtypes/test_common.py
index ccd30caba5dee..f7442cf5d6d3c 100644
--- a/pandas/tests/dtypes/test_common.py
+++ b/pandas/tests/dtypes/test_common.py
@@ -810,11 +810,23 @@ def test_pandas_dtype_string_dtypes(string_storage):
"pyarrow" if HAS_PYARROW else "python", na_value=np.nan
)
+ with pd.option_context("future.infer_string", True):
+ # with the default string_storage setting
+ result = pandas_dtype(str)
+ assert result == pd.StringDtype(
+ "pyarrow" if HAS_PYARROW else "python", na_value=np.nan
+ )
+
with pd.option_context("future.infer_string", True):
with pd.option_context("string_storage", string_storage):
result = pandas_dtype("str")
assert result == pd.StringDtype(string_storage, na_value=np.nan)
+ with pd.option_context("future.infer_string", True):
+ with pd.option_context("string_storage", string_storage):
+ result = pandas_dtype(str)
+ assert result == pd.StringDtype(string_storage, na_value=np.nan)
+
with pd.option_context("future.infer_string", False):
with pd.option_context("string_storage", string_storage):
result = pandas_dtype("str")
diff --git a/pandas/tests/dtypes/test_dtypes.py b/pandas/tests/dtypes/test_dtypes.py
index a4916ed1bbd8a..a5666e169fb4c 100644
--- a/pandas/tests/dtypes/test_dtypes.py
+++ b/pandas/tests/dtypes/test_dtypes.py
@@ -5,8 +5,6 @@
import pytest
import pytz
-from pandas._config import using_string_dtype
-
from pandas._libs.tslibs.dtypes import NpyDatetimeUnit
from pandas.core.dtypes.base import _registry as registry
@@ -961,7 +959,6 @@ def test_same_categories_different_order(self):
c2 = CategoricalDtype(["b", "a"], ordered=True)
assert c1 is not c2
- @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
@pytest.mark.parametrize("ordered1", [True, False, None])
@pytest.mark.parametrize("ordered2", [True, False, None])
def test_categorical_equality(self, ordered1, ordered2):
diff --git a/pandas/tests/extension/base/casting.py b/pandas/tests/extension/base/casting.py
index 2bfe801c48a77..56879129c3a28 100644
--- a/pandas/tests/extension/base/casting.py
+++ b/pandas/tests/extension/base/casting.py
@@ -43,8 +43,8 @@ def test_tolist(self, data):
assert result == expected
def test_astype_str(self, data):
- result = pd.Series(data[:5]).astype(str)
- expected = pd.Series([str(x) for x in data[:5]], dtype=str)
+ result = pd.Series(data[:2]).astype(str)
+ expected = pd.Series([str(x) for x in data[:2]], dtype=str)
tm.assert_series_equal(result, expected)
@pytest.mark.parametrize(
diff --git a/pandas/tests/extension/base/ops.py b/pandas/tests/extension/base/ops.py
index ff9f3cbed64a2..547114ecfddd0 100644
--- a/pandas/tests/extension/base/ops.py
+++ b/pandas/tests/extension/base/ops.py
@@ -24,7 +24,7 @@ class BaseOpsUtil:
def _get_expected_exception(
self, op_name: str, obj, other
- ) -> type[Exception] | None:
+ ) -> type[Exception] | tuple[type[Exception], ...] | None:
# Find the Exception, if any we expect to raise calling
# obj.__op_name__(other)
@@ -39,14 +39,6 @@ def _get_expected_exception(
else:
result = self.frame_scalar_exc
- if using_string_dtype() and result is not None:
- import pyarrow as pa
-
- result = ( # type: ignore[assignment]
- result,
- pa.lib.ArrowNotImplementedError,
- NotImplementedError,
- )
return result
def _cast_pointwise_result(self, op_name: str, obj, other, pointwise_result):
diff --git a/pandas/tests/extension/decimal/test_decimal.py b/pandas/tests/extension/decimal/test_decimal.py
index 9907e345ada63..8afb989508e04 100644
--- a/pandas/tests/extension/decimal/test_decimal.py
+++ b/pandas/tests/extension/decimal/test_decimal.py
@@ -68,7 +68,7 @@ def data_for_grouping():
class TestDecimalArray(base.ExtensionTests):
def _get_expected_exception(
self, op_name: str, obj, other
- ) -> type[Exception] | None:
+ ) -> type[Exception] | tuple[type[Exception], ...] | None:
return None
def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool:
diff --git a/pandas/tests/extension/json/array.py b/pandas/tests/extension/json/array.py
index e43b50322bb92..5cbd45a99ae5c 100644
--- a/pandas/tests/extension/json/array.py
+++ b/pandas/tests/extension/json/array.py
@@ -207,9 +207,8 @@ def astype(self, dtype, copy=True):
return self.copy()
return self
elif isinstance(dtype, StringDtype):
- value = self.astype(str) # numpy doesn't like nested dicts
arr_cls = dtype.construct_array_type()
- return arr_cls._from_sequence(value, dtype=dtype, copy=False)
+ return arr_cls._from_sequence(self, dtype=dtype, copy=False)
elif not copy:
return np.asarray([dict(x) for x in self], dtype=dtype)
else:
diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py
index d9a3033b8380e..60e7bd83432c5 100644
--- a/pandas/tests/extension/test_arrow.py
+++ b/pandas/tests/extension/test_arrow.py
@@ -41,7 +41,6 @@
pa_version_under13p0,
pa_version_under14p0,
)
-import pandas.util._test_decorators as td
from pandas.core.dtypes.dtypes import (
ArrowDtype,
@@ -286,7 +285,7 @@ def test_map(self, data_missing, na_action):
expected = data_missing.to_numpy()
tm.assert_numpy_array_equal(result, expected)
- def test_astype_str(self, data, request):
+ def test_astype_str(self, data, request, using_infer_string):
pa_dtype = data.dtype.pyarrow_dtype
if pa.types.is_binary(pa_dtype):
request.applymarker(
@@ -294,9 +293,10 @@ def test_astype_str(self, data, request):
reason=f"For {pa_dtype} .astype(str) decodes.",
)
)
- elif (
- pa.types.is_timestamp(pa_dtype) and pa_dtype.tz is None
- ) or pa.types.is_duration(pa_dtype):
+ elif not using_infer_string and (
+ (pa.types.is_timestamp(pa_dtype) and pa_dtype.tz is None)
+ or pa.types.is_duration(pa_dtype)
+ ):
request.applymarker(
pytest.mark.xfail(
reason="pd.Timestamp/pd.Timedelta repr different from numpy repr",
@@ -304,25 +304,6 @@ def test_astype_str(self, data, request):
)
super().test_astype_str(data)
- @pytest.mark.parametrize(
- "nullable_string_dtype",
- [
- "string[python]",
- pytest.param("string[pyarrow]", marks=td.skip_if_no("pyarrow")),
- ],
- )
- def test_astype_string(self, data, nullable_string_dtype, request):
- pa_dtype = data.dtype.pyarrow_dtype
- if (
- pa.types.is_timestamp(pa_dtype) and pa_dtype.tz is None
- ) or pa.types.is_duration(pa_dtype):
- request.applymarker(
- pytest.mark.xfail(
- reason="pd.Timestamp/pd.Timedelta repr different from numpy repr",
- )
- )
- super().test_astype_string(data, nullable_string_dtype)
-
def test_from_dtype(self, data, request):
pa_dtype = data.dtype.pyarrow_dtype
if pa.types.is_string(pa_dtype) or pa.types.is_decimal(pa_dtype):
@@ -800,8 +781,6 @@ def test_value_counts_returns_pyarrow_int64(self, data):
_combine_le_expected_dtype = "bool[pyarrow]"
- divmod_exc = NotImplementedError
-
def get_op_from_name(self, op_name):
short_opname = op_name.strip("_")
if short_opname == "rtruediv":
@@ -935,10 +914,11 @@ def _is_temporal_supported(self, opname, pa_dtype):
def _get_expected_exception(
self, op_name: str, obj, other
- ) -> type[Exception] | None:
+ ) -> type[Exception] | tuple[type[Exception], ...] | None:
if op_name in ("__divmod__", "__rdivmod__"):
- return self.divmod_exc
+ return (NotImplementedError, TypeError)
+ exc: type[Exception] | tuple[type[Exception], ...] | None
dtype = tm.get_dtype(obj)
# error: Item "dtype[Any]" of "dtype[Any] | ExtensionDtype" has no
# attribute "pyarrow_dtype"
@@ -949,7 +929,7 @@ def _get_expected_exception(
"__mod__",
"__rmod__",
}:
- exc = NotImplementedError
+ exc = (NotImplementedError, TypeError)
elif arrow_temporal_supported:
exc = None
elif op_name in ["__add__", "__radd__"] and (
@@ -961,10 +941,7 @@ def _get_expected_exception(
or pa.types.is_integer(pa_dtype)
or pa.types.is_decimal(pa_dtype)
):
- # TODO: in many of these cases, e.g. non-duration temporal,
- # these will *never* be allowed. Would it make more sense to
- # re-raise as TypeError, more consistent with non-pyarrow cases?
- exc = pa.ArrowNotImplementedError
+ exc = TypeError
else:
exc = None
return exc
@@ -1020,14 +997,6 @@ def test_arith_series_with_scalar(self, data, all_arithmetic_operators, request)
if all_arithmetic_operators == "__rmod__" and pa.types.is_binary(pa_dtype):
pytest.skip("Skip testing Python string formatting")
- elif all_arithmetic_operators in ("__rmul__", "__mul__") and (
- pa.types.is_binary(pa_dtype) or pa.types.is_string(pa_dtype)
- ):
- request.applymarker(
- pytest.mark.xfail(
- raises=TypeError, reason="Can only string multiply by an integer."
- )
- )
mark = self._get_arith_xfail_marker(all_arithmetic_operators, pa_dtype)
if mark is not None:
@@ -1042,14 +1011,6 @@ def test_arith_frame_with_scalar(self, data, all_arithmetic_operators, request):
pa.types.is_string(pa_dtype) or pa.types.is_binary(pa_dtype)
):
pytest.skip("Skip testing Python string formatting")
- elif all_arithmetic_operators in ("__rmul__", "__mul__") and (
- pa.types.is_binary(pa_dtype) or pa.types.is_string(pa_dtype)
- ):
- request.applymarker(
- pytest.mark.xfail(
- raises=TypeError, reason="Can only string multiply by an integer."
- )
- )
mark = self._get_arith_xfail_marker(all_arithmetic_operators, pa_dtype)
if mark is not None:
@@ -1073,14 +1034,6 @@ def test_arith_series_with_array(self, data, all_arithmetic_operators, request):
),
)
)
- elif all_arithmetic_operators in ("__rmul__", "__mul__") and (
- pa.types.is_binary(pa_dtype) or pa.types.is_string(pa_dtype)
- ):
- request.applymarker(
- pytest.mark.xfail(
- raises=TypeError, reason="Can only string multiply by an integer."
- )
- )
mark = self._get_arith_xfail_marker(all_arithmetic_operators, pa_dtype)
if mark is not None:
@@ -1868,6 +1821,17 @@ def test_str_replace_negative_n():
expected = pd.Series(["bc", ""], dtype=ArrowDtype(pa.string()))
tm.assert_series_equal(expected, actual)
+ # Same bug for pyarrow-backed StringArray GH#59628
+ ser2 = ser.astype(pd.StringDtype(storage="pyarrow"))
+ actual2 = ser2.str.replace("a", "", -3, True)
+ expected2 = expected.astype(ser2.dtype)
+ tm.assert_series_equal(expected2, actual2)
+
+ ser3 = ser.astype(pd.StringDtype(storage="pyarrow", na_value=np.nan))
+ actual3 = ser3.str.replace("a", "", -3, True)
+ expected3 = expected.astype(ser3.dtype)
+ tm.assert_series_equal(expected3, actual3)
+
def test_str_repeat_unsupported():
ser = pd.Series(["abc", None], dtype=ArrowDtype(pa.string()))
@@ -1942,10 +1906,56 @@ def test_str_find_negative_start():
tm.assert_series_equal(result, expected)
-def test_str_find_notimplemented():
+def test_str_find_no_end():
ser = pd.Series(["abc", None], dtype=ArrowDtype(pa.string()))
- with pytest.raises(NotImplementedError, match="find not implemented"):
- ser.str.find("ab", start=1)
+ result = ser.str.find("ab", start=1)
+ expected = pd.Series([-1, None], dtype="int64[pyarrow]")
+ tm.assert_series_equal(result, expected)
+
+
+def test_str_find_negative_start_negative_end():
+ # GH 56791
+ ser = pd.Series(["abcdefg", None], dtype=ArrowDtype(pa.string()))
+ result = ser.str.find(sub="d", start=-6, end=-3)
+ expected = pd.Series([3, None], dtype=ArrowDtype(pa.int64()))
+ tm.assert_series_equal(result, expected)
+
+
+def test_str_find_large_start():
+ # GH 56791
+ ser = pd.Series(["abcdefg", None], dtype=ArrowDtype(pa.string()))
+ result = ser.str.find(sub="d", start=16)
+ expected = pd.Series([-1, None], dtype=ArrowDtype(pa.int64()))
+ tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.skipif(
+ pa_version_under13p0, reason="https://github.com/apache/arrow/issues/36311"
+)
+@pytest.mark.parametrize("start", [-15, -3, 0, 1, 15, None])
+@pytest.mark.parametrize("end", [-15, -1, 0, 3, 15, None])
+@pytest.mark.parametrize("sub", ["", "az", "abce", "a", "caa"])
+def test_str_find_e2e(start, end, sub):
+ s = pd.Series(
+ ["abcaadef", "abc", "abcdeddefgj8292", "ab", "a", ""],
+ dtype=ArrowDtype(pa.string()),
+ )
+ object_series = s.astype(pd.StringDtype(storage="python"))
+ result = s.str.find(sub, start, end)
+ expected = object_series.str.find(sub, start, end).astype(result.dtype)
+ tm.assert_series_equal(result, expected)
+
+ arrow_str_series = s.astype(pd.StringDtype(storage="pyarrow"))
+ result2 = arrow_str_series.str.find(sub, start, end).astype(result.dtype)
+ tm.assert_series_equal(result2, expected)
+
+
+def test_str_find_negative_start_negative_end_no_match():
+ # GH 56791
+ ser = pd.Series(["abcdefg", None], dtype=ArrowDtype(pa.string()))
+ result = ser.str.find(sub="d", start=-3, end=-6)
+ expected = pd.Series([-1, None], dtype=ArrowDtype(pa.int64()))
+ tm.assert_series_equal(result, expected)
@pytest.mark.parametrize(
@@ -1989,6 +1999,7 @@ def test_str_join_string_type():
[None, 2, None, ["ab", None]],
[None, 2, 1, ["ab", None]],
[1, 3, 1, ["bc", None]],
+ (None, None, -1, ["dcba", None]),
],
)
def test_str_slice(start, stop, step, exp):
diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py
index f800f734ec9d9..07c3b4224e76f 100644
--- a/pandas/tests/extension/test_string.py
+++ b/pandas/tests/extension/test_string.py
@@ -21,7 +21,7 @@
import numpy as np
import pytest
-from pandas._config import using_string_dtype
+from pandas.compat import HAS_PYARROW
import pandas as pd
import pandas._testing as tm
@@ -30,10 +30,6 @@
from pandas.core.arrays.string_ import StringDtype
from pandas.tests.extension import base
-pytestmark = pytest.mark.xfail(
- using_string_dtype(), reason="TODO(infer_string)", strict=False
-)
-
def maybe_split_array(arr, chunked):
if not chunked:
@@ -168,24 +164,15 @@ def test_fillna_no_op_returns_copy(self, data):
def _get_expected_exception(
self, op_name: str, obj, other
- ) -> type[Exception] | None:
- if op_name in ["__divmod__", "__rdivmod__"]:
- if (
- isinstance(obj, pd.Series)
- and cast(StringDtype, tm.get_dtype(obj)).storage == "pyarrow"
- ):
- # TODO: re-raise as TypeError?
- return NotImplementedError
- elif (
- isinstance(other, pd.Series)
- and cast(StringDtype, tm.get_dtype(other)).storage == "pyarrow"
- ):
- # TODO: re-raise as TypeError?
- return NotImplementedError
- return TypeError
- elif op_name in ["__mod__", "__rmod__", "__pow__", "__rpow__"]:
- if cast(StringDtype, tm.get_dtype(obj)).storage == "pyarrow":
- return NotImplementedError
+ ) -> type[Exception] | tuple[type[Exception], ...] | None:
+ if op_name in [
+ "__mod__",
+ "__rmod__",
+ "__divmod__",
+ "__rdivmod__",
+ "__pow__",
+ "__rpow__",
+ ]:
return TypeError
elif op_name in ["__mul__", "__rmul__"]:
# Can only multiply strings by integers
@@ -198,11 +185,6 @@ def _get_expected_exception(
"__sub__",
"__rsub__",
]:
- if cast(StringDtype, tm.get_dtype(obj)).storage == "pyarrow":
- import pyarrow as pa
-
- # TODO: better to re-raise as TypeError?
- return pa.ArrowNotImplementedError
return TypeError
return None
@@ -230,9 +212,35 @@ def test_compare_scalar(self, data, comparison_op):
ser = pd.Series(data)
self._compare_other(ser, data, comparison_op, "abc")
- @pytest.mark.filterwarnings("ignore:Falling back:pandas.errors.PerformanceWarning")
- def test_groupby_extension_apply(self, data_for_grouping, groupby_apply_op):
- super().test_groupby_extension_apply(data_for_grouping, groupby_apply_op)
+ def test_combine_add(self, data_repeated, using_infer_string, request):
+ dtype = next(data_repeated(1)).dtype
+ if using_infer_string and (
+ (dtype.na_value is pd.NA) and dtype.storage == "python"
+ ):
+ mark = pytest.mark.xfail(
+ reason="The pointwise operation result will be inferred to "
+ "string[nan, pyarrow], which does not match the input dtype"
+ )
+ request.applymarker(mark)
+ super().test_combine_add(data_repeated)
+
+ def test_arith_series_with_array(
+ self, data, all_arithmetic_operators, using_infer_string, request
+ ):
+ dtype = data.dtype
+ if (
+ using_infer_string
+ and all_arithmetic_operators == "__radd__"
+ and (
+ (dtype.na_value is pd.NA) or (dtype.storage == "python" and HAS_PYARROW)
+ )
+ ):
+ mark = pytest.mark.xfail(
+ reason="The pointwise operation result will be inferred to "
+ "string[nan, pyarrow], which does not match the input dtype"
+ )
+ request.applymarker(mark)
+ super().test_arith_series_with_array(data, all_arithmetic_operators)
class Test2DCompat(base.Dim2CompatTests):
diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py
index 7a7586961deca..04dba325f060f 100644
--- a/pandas/tests/frame/indexing/test_indexing.py
+++ b/pandas/tests/frame/indexing/test_indexing.py
@@ -1955,13 +1955,11 @@ def test_adding_new_conditional_column() -> None:
("dtype", "infer_string"),
[
(object, False),
- ("string[pyarrow_numpy]", True),
+ (pd.StringDtype(na_value=np.nan), True),
],
)
def test_adding_new_conditional_column_with_string(dtype, infer_string) -> None:
# https://github.com/pandas-dev/pandas/issues/56204
- pytest.importorskip("pyarrow")
-
df = DataFrame({"a": [1, 2], "b": [3, 4]})
with pd.option_context("future.infer_string", infer_string):
df.loc[df["a"] == 1, "c"] = "1"
@@ -1971,16 +1969,14 @@ def test_adding_new_conditional_column_with_string(dtype, infer_string) -> None:
tm.assert_frame_equal(df, expected)
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
def test_add_new_column_infer_string():
# GH#55366
- pytest.importorskip("pyarrow")
df = DataFrame({"x": [1]})
with pd.option_context("future.infer_string", True):
df.loc[df["x"] == 1, "y"] = "1"
expected = DataFrame(
- {"x": [1], "y": Series(["1"], dtype="string[pyarrow_numpy]")},
- columns=Index(["x", "y"], dtype=object),
+ {"x": [1], "y": Series(["1"], dtype=pd.StringDtype(na_value=np.nan))},
+ columns=Index(["x", "y"], dtype="str"),
)
tm.assert_frame_equal(df, expected)
diff --git a/pandas/tests/frame/methods/test_astype.py b/pandas/tests/frame/methods/test_astype.py
index 9c27e76de91b2..ca3764ac87e95 100644
--- a/pandas/tests/frame/methods/test_astype.py
+++ b/pandas/tests/frame/methods/test_astype.py
@@ -169,21 +169,21 @@ def test_astype_str(self):
"d": list(map(str, d._values)),
"e": list(map(str, e._values)),
},
- dtype="object",
+ dtype="str",
)
tm.assert_frame_equal(result, expected)
- def test_astype_str_float(self):
+ def test_astype_str_float(self, using_infer_string):
# see GH#11302
result = DataFrame([np.nan]).astype(str)
- expected = DataFrame(["nan"], dtype="object")
+ expected = DataFrame([np.nan if using_infer_string else "nan"], dtype="str")
tm.assert_frame_equal(result, expected)
result = DataFrame([1.12345678901234567890]).astype(str)
val = "1.1234567890123457"
- expected = DataFrame([val], dtype="object")
+ expected = DataFrame([val], dtype="str")
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("dtype_class", [dict, Series])
@@ -285,7 +285,7 @@ def test_astype_duplicate_col_series_arg(self):
result = df.astype(dtypes)
expected = DataFrame(
{
- 0: Series(vals[:, 0].astype(str), dtype=object),
+ 0: Series(vals[:, 0].astype(str), dtype="str"),
1: vals[:, 1],
2: pd.array(vals[:, 2], dtype="Float64"),
3: vals[:, 3],
@@ -666,9 +666,10 @@ def test_astype_dt64tz(self, timezone_frame):
# dt64tz->dt64 deprecated
timezone_frame.astype("datetime64[ns]")
- def test_astype_dt64tz_to_str(self, timezone_frame):
+ def test_astype_dt64tz_to_str(self, timezone_frame, using_infer_string):
# str formatting
result = timezone_frame.astype(str)
+ na_value = np.nan if using_infer_string else "NaT"
expected = DataFrame(
[
[
@@ -676,7 +677,7 @@ def test_astype_dt64tz_to_str(self, timezone_frame):
"2013-01-01 00:00:00-05:00",
"2013-01-01 00:00:00+01:00",
],
- ["2013-01-02", "NaT", "NaT"],
+ ["2013-01-02", na_value, na_value],
[
"2013-01-03",
"2013-01-03 00:00:00-05:00",
@@ -684,7 +685,7 @@ def test_astype_dt64tz_to_str(self, timezone_frame):
],
],
columns=timezone_frame.columns,
- dtype="object",
+ dtype="str",
)
tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/frame/methods/test_rank.py b/pandas/tests/frame/methods/test_rank.py
index edba971408d04..37bed2da05743 100644
--- a/pandas/tests/frame/methods/test_rank.py
+++ b/pandas/tests/frame/methods/test_rank.py
@@ -6,13 +6,10 @@
import numpy as np
import pytest
-from pandas._config import using_string_dtype
-
from pandas._libs.algos import (
Infinity,
NegInfinity,
)
-from pandas.compat import HAS_PYARROW
from pandas import (
DataFrame,
@@ -473,23 +470,10 @@ def test_rank_inf_nans_na_option(
("top", False, [2.0, 3.0, 1.0, 4.0]),
],
)
- def test_rank_object_first(
- self,
- request,
- frame_or_series,
- na_option,
- ascending,
- expected,
- using_infer_string,
- ):
+ def test_rank_object_first(self, frame_or_series, na_option, ascending, expected):
obj = frame_or_series(["foo", "foo", None, "foo"])
- if using_string_dtype() and not HAS_PYARROW and isinstance(obj, Series):
- request.applymarker(pytest.mark.xfail(reason="TODO(infer_string)"))
-
result = obj.rank(method="first", na_option=na_option, ascending=ascending)
expected = frame_or_series(expected)
- if using_infer_string and isinstance(obj, Series):
- expected = expected.astype("uint64")
tm.assert_equal(result, expected)
@pytest.mark.parametrize(
@@ -509,14 +493,15 @@ def test_rank_mixed_axis_zero(self, data, expected):
result = df.rank(numeric_only=True)
tm.assert_frame_equal(result, expected)
- @pytest.mark.parametrize(
- "dtype, exp_dtype",
- [("string[pyarrow]", "Int64"), ("string[pyarrow_numpy]", "float64")],
- )
- def test_rank_string_dtype(self, dtype, exp_dtype):
+ def test_rank_string_dtype(self, string_dtype_no_object):
# GH#55362
- pytest.importorskip("pyarrow")
- obj = Series(["foo", "foo", None, "foo"], dtype=dtype)
+ obj = Series(["foo", "foo", None, "foo"], dtype=string_dtype_no_object)
result = obj.rank(method="first")
+ exp_dtype = (
+ "Float64" if string_dtype_no_object == "string[pyarrow]" else "float64"
+ )
+ if string_dtype_no_object.storage == "python":
+ # TODO nullable string[python] should also return nullable Int64
+ exp_dtype = "float64"
expected = Series([1, 2, None, 3], dtype=exp_dtype)
tm.assert_series_equal(result, expected)
diff --git a/pandas/tests/frame/methods/test_select_dtypes.py b/pandas/tests/frame/methods/test_select_dtypes.py
index 875dca321635f..0354e9df3d168 100644
--- a/pandas/tests/frame/methods/test_select_dtypes.py
+++ b/pandas/tests/frame/methods/test_select_dtypes.py
@@ -99,6 +99,9 @@ def test_select_dtypes_include_using_list_like(self, using_infer_string):
ei = df[["a"]]
tm.assert_frame_equal(ri, ei)
+ ri = df.select_dtypes(include=[str])
+ tm.assert_frame_equal(ri, ei)
+
def test_select_dtypes_exclude_using_list_like(self):
df = DataFrame(
{
@@ -358,7 +361,7 @@ def test_select_dtypes_datetime_with_tz(self):
@pytest.mark.parametrize("dtype", [str, "str", np.bytes_, "S1", np.str_, "U1"])
@pytest.mark.parametrize("arg", ["include", "exclude"])
def test_select_dtypes_str_raises(self, dtype, arg, using_infer_string):
- if using_infer_string and dtype == "str":
+ if using_infer_string and (dtype == "str" or dtype is str):
# this is tested below
pytest.skip("Selecting string columns works with future strings")
df = DataFrame(
diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py
index 86d9dc0c7fbdc..fd770b368c9da 100644
--- a/pandas/tests/frame/test_constructors.py
+++ b/pandas/tests/frame/test_constructors.py
@@ -24,7 +24,6 @@
from pandas._config import using_string_dtype
from pandas._libs import lib
-from pandas.compat import HAS_PYARROW
from pandas.compat.numpy import np_version_gt2
from pandas.errors import IntCastingNaNError
import pandas.util._test_decorators as td
@@ -83,7 +82,7 @@ def test_constructor_from_ndarray_with_str_dtype(self):
# with an array of strings each of which is e.g. "[0 1 2]"
arr = np.arange(12).reshape(4, 3)
df = DataFrame(arr, dtype=str)
- expected = DataFrame(arr.astype(str), dtype=object)
+ expected = DataFrame(arr.astype(str), dtype="str")
tm.assert_frame_equal(df, expected)
def test_constructor_from_2d_datetimearray(self, using_array_manager):
@@ -328,19 +327,39 @@ def test_constructor_dtype_nocast_view_2d_array(
assert df2._mgr.arrays[0].flags.c_contiguous
@td.skip_array_manager_invalid_test
- @pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="conversion copies")
- def test_1d_object_array_does_not_copy(self):
+ def test_1d_object_array_does_not_copy(self, using_infer_string):
# https://github.com/pandas-dev/pandas/issues/39272
arr = np.array(["a", "b"], dtype="object")
df = DataFrame(arr, copy=False)
+ if using_infer_string:
+ if df[0].dtype.storage == "pyarrow":
+ # object dtype strings are converted to arrow memory,
+ # no numpy arrays to compare
+ pass
+ else:
+ assert np.shares_memory(df[0].to_numpy(), arr)
+ else:
+ assert np.shares_memory(df.values, arr)
+
+ df = DataFrame(arr, dtype=object, copy=False)
assert np.shares_memory(df.values, arr)
@td.skip_array_manager_invalid_test
- @pytest.mark.xfail(using_string_dtype(), reason="conversion copies")
- def test_2d_object_array_does_not_copy(self):
+ def test_2d_object_array_does_not_copy(self, using_infer_string):
# https://github.com/pandas-dev/pandas/issues/39272
arr = np.array([["a", "b"], ["c", "d"]], dtype="object")
df = DataFrame(arr, copy=False)
+ if using_infer_string:
+ if df[0].dtype.storage == "pyarrow":
+ # object dtype strings are converted to arrow memory,
+ # no numpy arrays to compare
+ pass
+ else:
+ assert np.shares_memory(df[0].to_numpy(), arr)
+ else:
+ assert np.shares_memory(df.values, arr)
+
+ df = DataFrame(arr, dtype=object, copy=False)
assert np.shares_memory(df.values, arr)
def test_constructor_dtype_list_data(self):
@@ -1793,12 +1812,18 @@ def test_constructor_column_duplicates(self):
tm.assert_frame_equal(idf, edf)
- def test_constructor_empty_with_string_dtype(self):
+ def test_constructor_empty_with_string_dtype(self, using_infer_string):
# GH 9428
expected = DataFrame(index=[0, 1], columns=[0, 1], dtype=object)
+ expected_str = DataFrame(
+ index=[0, 1], columns=[0, 1], dtype=pd.StringDtype(na_value=np.nan)
+ )
df = DataFrame(index=[0, 1], columns=[0, 1], dtype=str)
- tm.assert_frame_equal(df, expected)
+ if using_infer_string:
+ tm.assert_frame_equal(df, expected_str)
+ else:
+ tm.assert_frame_equal(df, expected)
df = DataFrame(index=[0, 1], columns=[0, 1], dtype=np.str_)
tm.assert_frame_equal(df, expected)
df = DataFrame(index=[0, 1], columns=[0, 1], dtype="U5")
@@ -2721,8 +2746,7 @@ def test_construct_with_strings_and_none(self):
def test_frame_string_inference(self):
# GH#54430
- pytest.importorskip("pyarrow")
- dtype = "string[pyarrow_numpy]"
+ dtype = pd.StringDtype(na_value=np.nan)
expected = DataFrame(
{"a": ["a", "b"]}, dtype=dtype, columns=Index(["a"], dtype=dtype)
)
@@ -2756,8 +2780,7 @@ def test_frame_string_inference(self):
def test_frame_string_inference_array_string_dtype(self):
# GH#54496
- pytest.importorskip("pyarrow")
- dtype = "string[pyarrow_numpy]"
+ dtype = pd.StringDtype(na_value=np.nan)
expected = DataFrame(
{"a": ["a", "b"]}, dtype=dtype, columns=Index(["a"], dtype=dtype)
)
@@ -2781,7 +2804,6 @@ def test_frame_string_inference_array_string_dtype(self):
def test_frame_string_inference_block_dim(self):
# GH#55363
- pytest.importorskip("pyarrow")
with pd.option_context("future.infer_string", True):
df = DataFrame(np.array([["hello", "goodbye"], ["hello", "Hello"]]))
assert df._mgr.blocks[0].ndim == 2
diff --git a/pandas/tests/frame/test_logical_ops.py b/pandas/tests/frame/test_logical_ops.py
index 2684704f86b82..f1163e994557f 100644
--- a/pandas/tests/frame/test_logical_ops.py
+++ b/pandas/tests/frame/test_logical_ops.py
@@ -4,10 +4,6 @@
import numpy as np
import pytest
-from pandas._config import using_string_dtype
-
-from pandas.compat import HAS_PYARROW
-
from pandas import (
CategoricalIndex,
DataFrame,
@@ -100,9 +96,6 @@ def test_logical_ops_int_frame(self):
res_ser = df1a_int["A"] | df1a_bool["A"]
tm.assert_series_equal(res_ser, df1a_bool["A"])
- @pytest.mark.xfail(
- using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)"
- )
def test_logical_ops_invalid(self, using_infer_string):
# GH#5808
@@ -114,15 +107,12 @@ def test_logical_ops_invalid(self, using_infer_string):
df1 = DataFrame("foo", index=[1], columns=["A"])
df2 = DataFrame(True, index=[1], columns=["A"])
- msg = re.escape("unsupported operand type(s) for |: 'str' and 'bool'")
- if using_infer_string:
- import pyarrow as pa
-
- with pytest.raises(pa.lib.ArrowNotImplementedError, match="|has no kernel"):
- df1 | df2
+ if using_infer_string and df1["A"].dtype.storage == "pyarrow":
+ msg = "operation 'or_' not supported for dtype 'str'"
else:
- with pytest.raises(TypeError, match=msg):
- df1 | df2
+ msg = re.escape("unsupported operand type(s) for |: 'str' and 'bool'")
+ with pytest.raises(TypeError, match=msg):
+ df1 | df2
def test_logical_operators(self):
def _check_bin_op(op):
diff --git a/pandas/tests/frame/test_query_eval.py b/pandas/tests/frame/test_query_eval.py
index 7dde0683aa960..27848e4d18596 100644
--- a/pandas/tests/frame/test_query_eval.py
+++ b/pandas/tests/frame/test_query_eval.py
@@ -3,8 +3,6 @@
import numpy as np
import pytest
-from pandas._config import using_string_dtype
-
from pandas.errors import (
NumExprClobberingError,
UndefinedVariableError,
@@ -747,7 +745,6 @@ def test_inf(self, op, f, engine, parser):
result = df.query(q, engine=engine, parser=parser)
tm.assert_frame_equal(result, expected)
- @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
def test_check_tz_aware_index_query(self, tz_aware_fixture):
# https://github.com/pandas-dev/pandas/issues/29463
tz = tz_aware_fixture
@@ -760,6 +757,7 @@ def test_check_tz_aware_index_query(self, tz_aware_fixture):
tm.assert_frame_equal(result, expected)
expected = DataFrame(df_index)
+ expected.columns = expected.columns.astype(object)
result = df.reset_index().query('"2018-01-03 00:00:00+00" < time')
tm.assert_frame_equal(result, expected)
@@ -1057,7 +1055,7 @@ def test_query_with_string_columns(self, parser, engine):
with pytest.raises(NotImplementedError, match=msg):
df.query("a in b and c < d", parser=parser, engine=engine)
- def test_object_array_eq_ne(self, parser, engine, using_infer_string):
+ def test_object_array_eq_ne(self, parser, engine):
df = DataFrame(
{
"a": list("aaaabbbbcccc"),
@@ -1066,14 +1064,11 @@ def test_object_array_eq_ne(self, parser, engine, using_infer_string):
"d": np.random.default_rng(2).integers(9, size=12),
}
)
- warning = RuntimeWarning if using_infer_string and engine == "numexpr" else None
- with tm.assert_produces_warning(warning):
- res = df.query("a == b", parser=parser, engine=engine)
+ res = df.query("a == b", parser=parser, engine=engine)
exp = df[df.a == df.b]
tm.assert_frame_equal(res, exp)
- with tm.assert_produces_warning(warning):
- res = df.query("a != b", parser=parser, engine=engine)
+ res = df.query("a != b", parser=parser, engine=engine)
exp = df[df.a != df.b]
tm.assert_frame_equal(res, exp)
@@ -1112,16 +1107,12 @@ def test_query_with_nested_special_character(self, parser, engine):
[">=", operator.ge],
],
)
- def test_query_lex_compare_strings(
- self, parser, engine, op, func, using_infer_string
- ):
+ def test_query_lex_compare_strings(self, parser, engine, op, func):
a = Series(np.random.default_rng(2).choice(list("abcde"), 20))
b = Series(np.arange(a.size))
df = DataFrame({"X": a, "Y": b})
- warning = RuntimeWarning if using_infer_string and engine == "numexpr" else None
- with tm.assert_produces_warning(warning):
- res = df.query(f'X {op} "d"', engine=engine, parser=parser)
+ res = df.query(f'X {op} "d"', engine=engine, parser=parser)
expected = df[func(df.X, "d")]
tm.assert_frame_equal(res, expected)
diff --git a/pandas/tests/frame/test_unary.py b/pandas/tests/frame/test_unary.py
index 8e1df679ee1b4..a76d33e922486 100644
--- a/pandas/tests/frame/test_unary.py
+++ b/pandas/tests/frame/test_unary.py
@@ -44,11 +44,6 @@ def test_neg_object(self, df, expected):
tm.assert_frame_equal(-df, expected)
tm.assert_series_equal(-df["a"], expected["a"])
- @pytest.mark.xfail(
- using_string_dtype() and not HAS_PYARROW,
- reason="TODO(infer_string)",
- strict=False,
- )
@pytest.mark.parametrize(
"df",
[
@@ -59,22 +54,13 @@ def test_neg_object(self, df, expected):
def test_neg_raises(self, df, using_infer_string):
msg = (
"bad operand type for unary -: 'str'|"
- r"bad operand type for unary -: 'DatetimeArray'"
+ r"bad operand type for unary -: 'DatetimeArray'|"
+ "unary '-' not supported for dtype"
)
- if using_infer_string and df.dtypes.iloc[0] == "string":
- import pyarrow as pa
-
- msg = "has no kernel"
- with pytest.raises(pa.lib.ArrowNotImplementedError, match=msg):
- (-df)
- with pytest.raises(pa.lib.ArrowNotImplementedError, match=msg):
- (-df["a"])
-
- else:
- with pytest.raises(TypeError, match=msg):
- (-df)
- with pytest.raises(TypeError, match=msg):
- (-df["a"])
+ with pytest.raises(TypeError, match=msg):
+ (-df)
+ with pytest.raises(TypeError, match=msg):
+ (-df["a"])
def test_invert(self, float_frame):
df = float_frame
diff --git a/pandas/tests/groupby/methods/test_describe.py b/pandas/tests/groupby/methods/test_describe.py
index 34b046bff7c91..c0889ab415e74 100644
--- a/pandas/tests/groupby/methods/test_describe.py
+++ b/pandas/tests/groupby/methods/test_describe.py
@@ -1,8 +1,6 @@
import numpy as np
import pytest
-from pandas._config import using_string_dtype
-
import pandas as pd
from pandas import (
DataFrame,
@@ -73,8 +71,7 @@ def test_series_describe_as_index(as_index, keys):
tm.assert_frame_equal(result, expected)
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
-def test_frame_describe_multikey(tsframe):
+def test_frame_describe_multikey(tsframe, using_infer_string):
grouped = tsframe.groupby([lambda x: x.year, lambda x: x.month])
result = grouped.describe()
desc_groups = []
@@ -82,7 +79,7 @@ def test_frame_describe_multikey(tsframe):
group = grouped[col].describe()
# GH 17464 - Remove duplicate MultiIndex levels
group_col = MultiIndex(
- levels=[[col], group.columns],
+ levels=[Index([col], dtype=tsframe.columns.dtype), group.columns],
codes=[[0] * len(group.columns), range(len(group.columns))],
)
group = DataFrame(group.values, columns=group_col, index=group.index)
@@ -90,6 +87,10 @@ def test_frame_describe_multikey(tsframe):
expected = pd.concat(desc_groups, axis=1)
tm.assert_frame_equal(result, expected)
+ # remainder of the tests fails with string dtype but is testing deprecated behaviour
+ if using_infer_string:
+ return
+
msg = "DataFrame.groupby with axis=1 is deprecated"
with tm.assert_produces_warning(FutureWarning, match=msg):
groupedT = tsframe.groupby({"A": 0, "B": 0, "C": 1, "D": 1}, axis=1)
@@ -275,7 +276,6 @@ def test_describe(self, df, gb, gni):
tm.assert_frame_equal(result, expected)
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
@pytest.mark.parametrize("dtype", [int, float, object])
@pytest.mark.parametrize(
"kwargs",
@@ -297,5 +297,5 @@ def test_groupby_empty_dataset(dtype, kwargs):
result = df.iloc[:0].groupby("A").B.describe(**kwargs)
expected = df.groupby("A").B.describe(**kwargs).reset_index(drop=True).iloc[:0]
- expected.index = Index([])
+ expected.index = Index([], dtype=df.columns.dtype)
tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/groupby/methods/test_nth.py b/pandas/tests/groupby/methods/test_nth.py
index 344258257ba80..2722993ee5cdf 100644
--- a/pandas/tests/groupby/methods/test_nth.py
+++ b/pandas/tests/groupby/methods/test_nth.py
@@ -1,8 +1,6 @@
import numpy as np
import pytest
-from pandas._config import using_string_dtype
-
import pandas as pd
from pandas import (
DataFrame,
@@ -706,14 +704,14 @@ def test_first_multi_key_groupby_categorical():
tm.assert_frame_equal(result, expected)
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
@pytest.mark.parametrize("method", ["first", "last", "nth"])
def test_groupby_last_first_nth_with_none(method, nulls_fixture):
# GH29645
- expected = Series(["y"])
+ expected = Series(["y"], dtype=object)
data = Series(
[nulls_fixture, nulls_fixture, nulls_fixture, "y", nulls_fixture],
index=[0, 0, 0, 0, 0],
+ dtype=object,
).groupby(level=0)
if method == "nth":
diff --git a/pandas/tests/groupby/methods/test_size.py b/pandas/tests/groupby/methods/test_size.py
index 5b4c08fc24411..fb834ee2a8799 100644
--- a/pandas/tests/groupby/methods/test_size.py
+++ b/pandas/tests/groupby/methods/test_size.py
@@ -3,8 +3,6 @@
from pandas._config import using_string_dtype
-import pandas.util._test_decorators as td
-
from pandas.core.dtypes.common import is_integer_dtype
from pandas import (
@@ -111,16 +109,9 @@ def test_size_series_masked_type_returns_Int64(dtype):
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
-@pytest.mark.parametrize(
- "dtype",
- [
- object,
- pytest.param("string[pyarrow_numpy]", marks=td.skip_if_no("pyarrow")),
- pytest.param("string[pyarrow]", marks=td.skip_if_no("pyarrow")),
- ],
-)
-def test_size_strings(dtype):
+def test_size_strings(any_string_dtype):
# GH#55627
+ dtype = any_string_dtype
df = DataFrame({"a": ["a", "a", "b"], "b": "a"}, dtype=dtype)
result = df.groupby("a")["b"].size()
exp_dtype = "Int64" if dtype == "string[pyarrow]" else "int64"
diff --git a/pandas/tests/groupby/methods/test_value_counts.py b/pandas/tests/groupby/methods/test_value_counts.py
index 51232fac7d6f6..d8c6c7c3fe50c 100644
--- a/pandas/tests/groupby/methods/test_value_counts.py
+++ b/pandas/tests/groupby/methods/test_value_counts.py
@@ -8,11 +8,6 @@
import numpy as np
import pytest
-from pandas._config import using_string_dtype
-
-from pandas.compat import HAS_PYARROW
-import pandas.util._test_decorators as td
-
from pandas import (
Categorical,
CategoricalIndex,
@@ -288,7 +283,6 @@ def _frame_value_counts(df, keys, normalize, sort, ascending):
return df[keys].value_counts(normalize=normalize, sort=sort, ascending=ascending)
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
@pytest.mark.parametrize("groupby", ["column", "array", "function"])
@pytest.mark.parametrize("normalize, name", [(True, "proportion"), (False, "count")])
@pytest.mark.parametrize(
@@ -302,7 +296,16 @@ def _frame_value_counts(df, keys, normalize, sort, ascending):
@pytest.mark.parametrize("as_index", [True, False])
@pytest.mark.parametrize("frame", [True, False])
def test_against_frame_and_seriesgroupby(
- education_df, groupby, normalize, name, sort, ascending, as_index, frame, request
+ education_df,
+ groupby,
+ normalize,
+ name,
+ sort,
+ ascending,
+ as_index,
+ frame,
+ request,
+ using_infer_string,
):
# test all parameters:
# - Use column, array or function as by= parameter
@@ -366,25 +369,24 @@ def test_against_frame_and_seriesgroupby(
index_frame["gender"] = index_frame["both"].str.split("-").str.get(0)
index_frame["education"] = index_frame["both"].str.split("-").str.get(1)
del index_frame["both"]
- index_frame = index_frame.rename({0: None}, axis=1)
- expected.index = MultiIndex.from_frame(index_frame)
+ index_frame2 = index_frame.rename({0: None}, axis=1)
+ expected.index = MultiIndex.from_frame(index_frame2)
+
+ if index_frame2.columns.isna()[0]:
+        # with using_infer_string, the columns in index_frame are string
+ # dtype, which makes the rename({0: None}) above use np.nan
+ # instead of None, so we need to set None more explicitly.
+ expected.index.names = [None] + expected.index.names[1:]
tm.assert_series_equal(result, expected)
else:
expected.insert(1, "gender", expected["both"].str.split("-").str.get(0))
expected.insert(2, "education", expected["both"].str.split("-").str.get(1))
+ if using_infer_string:
+ expected = expected.astype({"gender": "str", "education": "str"})
del expected["both"]
tm.assert_frame_equal(result, expected)
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
-@pytest.mark.parametrize(
- "dtype",
- [
- object,
- pytest.param("string[pyarrow_numpy]", marks=td.skip_if_no("pyarrow")),
- pytest.param("string[pyarrow]", marks=td.skip_if_no("pyarrow")),
- ],
-)
@pytest.mark.parametrize("normalize", [True, False])
@pytest.mark.parametrize(
"sort, ascending, expected_rows, expected_count, expected_group_size",
@@ -402,8 +404,10 @@ def test_compound(
expected_rows,
expected_count,
expected_group_size,
- dtype,
+ any_string_dtype,
+ using_infer_string,
):
+ dtype = any_string_dtype
education_df = education_df.astype(dtype)
education_df.columns = education_df.columns.astype(dtype)
# Multiple groupby keys and as_index=False
@@ -420,11 +424,17 @@ def test_compound(
expected["proportion"] = expected_count
expected["proportion"] /= expected_group_size
if dtype == "string[pyarrow]":
+ # TODO(nullable) also string[python] should return nullable dtypes
expected["proportion"] = expected["proportion"].convert_dtypes()
else:
expected["count"] = expected_count
if dtype == "string[pyarrow]":
expected["count"] = expected["count"].convert_dtypes()
+ if using_infer_string and dtype == object:
+ expected = expected.astype(
+ {"country": "str", "gender": "str", "education": "str"}
+ )
+
tm.assert_frame_equal(result, expected)
@@ -537,9 +547,6 @@ def names_with_nulls_df(nulls_fixture):
)
-@pytest.mark.xfail(
- using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)", strict=False
-)
@pytest.mark.parametrize(
"dropna, expected_data, expected_index",
[
diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py
index 015a9db32883b..586ef8a126536 100644
--- a/pandas/tests/groupby/test_groupby.py
+++ b/pandas/tests/groupby/test_groupby.py
@@ -2832,20 +2832,13 @@ def test_rolling_wrong_param_min_period():
test_df.groupby("name")["val"].rolling(window=2, min_period=1).sum()
-@pytest.mark.parametrize(
- "dtype",
- [
- object,
- pytest.param("string[pyarrow_numpy]", marks=td.skip_if_no("pyarrow")),
- ],
-)
-def test_by_column_values_with_same_starting_value(dtype):
+def test_by_column_values_with_same_starting_value(any_string_dtype):
# GH29635
df = DataFrame(
{
"Name": ["Thomas", "Thomas", "Thomas John"],
"Credit": [1200, 1300, 900],
- "Mood": Series(["sad", "happy", "happy"], dtype=dtype),
+ "Mood": Series(["sad", "happy", "happy"], dtype=any_string_dtype),
}
)
aggregate_details = {"Mood": Series.mode, "Credit": "sum"}
diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py
index d843a992daee0..9c01e017dd29c 100644
--- a/pandas/tests/groupby/test_groupby_dropna.py
+++ b/pandas/tests/groupby/test_groupby_dropna.py
@@ -3,7 +3,6 @@
from pandas._config import using_string_dtype
-from pandas.compat import HAS_PYARROW
from pandas.compat.pyarrow import pa_version_under10p1
from pandas.core.dtypes.missing import na_value_for_dtype
@@ -13,9 +12,6 @@
from pandas.tests.groupby import get_groupby_method_args
-@pytest.mark.xfail(
- using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)", strict=False
-)
@pytest.mark.parametrize(
"dropna, tuples, outputs",
[
@@ -59,9 +55,6 @@ def test_groupby_dropna_multi_index_dataframe_nan_in_one_group(
tm.assert_frame_equal(grouped, expected)
-@pytest.mark.xfail(
- using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)", strict=False
-)
@pytest.mark.parametrize(
"dropna, tuples, outputs",
[
@@ -138,9 +131,6 @@ def test_groupby_dropna_normal_index_dataframe(dropna, idx, outputs):
tm.assert_frame_equal(grouped, expected)
-@pytest.mark.xfail(
- using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)", strict=False
-)
@pytest.mark.parametrize(
"dropna, idx, expected",
[
@@ -216,9 +206,6 @@ def test_groupby_dataframe_slice_then_transform(dropna, index):
tm.assert_series_equal(result, expected)
-@pytest.mark.xfail(
- using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)", strict=False
-)
@pytest.mark.parametrize(
"dropna, tuples, outputs",
[
@@ -300,9 +287,6 @@ def test_groupby_dropna_datetime_like_data(
tm.assert_frame_equal(grouped, expected)
-@pytest.mark.xfail(
- using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)", strict=False
-)
@pytest.mark.parametrize(
"dropna, data, selected_data, levels",
[
@@ -388,9 +372,6 @@ def test_groupby_dropna_with_multiindex_input(input_index, keys, series):
tm.assert_equal(result, expected)
-@pytest.mark.xfail(
- using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)"
-)
def test_groupby_nan_included():
# GH 35646
data = {"group": ["g1", np.nan, "g1", "g2", np.nan], "B": [0, 1, 2, 3, 4]}
diff --git a/pandas/tests/groupby/test_numeric_only.py b/pandas/tests/groupby/test_numeric_only.py
index b1fa541d42086..3b7614347d181 100644
--- a/pandas/tests/groupby/test_numeric_only.py
+++ b/pandas/tests/groupby/test_numeric_only.py
@@ -273,6 +273,7 @@ def test_axis1_numeric_only(request, groupby_func, numeric_only, using_infer_str
# cumsum, diff, pct_change
"unsupported operand type",
"has no kernel",
+ "operation 'sub' not supported for dtype 'str' with dtype 'float64'",
)
if using_infer_string:
pa = pytest.importorskip("pyarrow")
diff --git a/pandas/tests/groupby/test_reductions.py b/pandas/tests/groupby/test_reductions.py
index f67051de6e8c7..8e1bbcb43e3f3 100644
--- a/pandas/tests/groupby/test_reductions.py
+++ b/pandas/tests/groupby/test_reductions.py
@@ -702,10 +702,9 @@ def test_groupby_min_max_categorical(func):
@pytest.mark.parametrize("func", ["min", "max"])
-def test_min_empty_string_dtype(func):
+def test_min_empty_string_dtype(func, string_dtype_no_object):
# GH#55619
- pytest.importorskip("pyarrow")
- dtype = "string[pyarrow_numpy]"
+ dtype = string_dtype_no_object
df = DataFrame({"a": ["a"], "b": "a", "c": "a"}, dtype=dtype).iloc[:0]
result = getattr(df.groupby("a"), func)()
expected = DataFrame(
diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py
index a5433d5496b0b..395036dd400e5 100644
--- a/pandas/tests/groupby/transform/test_transform.py
+++ b/pandas/tests/groupby/transform/test_transform.py
@@ -5,6 +5,7 @@
from pandas._config import using_string_dtype
from pandas._libs import lib
+from pandas.compat import HAS_PYARROW
from pandas.core.dtypes.common import ensure_platform_int
@@ -499,8 +500,7 @@ def test_transform_select_columns(df):
tm.assert_frame_equal(result, expected)
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
-def test_transform_nuisance_raises(df):
+def test_transform_nuisance_raises(df, using_infer_string):
# case that goes through _transform_item_by_item
df.columns = ["A", "B", "B", "D"]
@@ -510,10 +510,16 @@ def test_transform_nuisance_raises(df):
grouped = df.groupby("A")
gbc = grouped["B"]
- with pytest.raises(TypeError, match="Could not convert"):
+ msg = "Could not convert"
+ if using_infer_string:
+ if df.columns.dtype.storage == "pyarrow":
+ msg = "with dtype str does not support reduction 'mean'"
+ else:
+ msg = "Cannot perform reduction 'mean' with string dtype"
+ with pytest.raises(TypeError, match=msg):
gbc.transform(lambda x: np.mean(x))
- with pytest.raises(TypeError, match="Could not convert"):
+ with pytest.raises(TypeError, match=msg):
df.groupby("A").transform(lambda x: np.mean(x))
@@ -582,8 +588,7 @@ def test_transform_coercion():
tm.assert_frame_equal(result, expected)
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
-def test_groupby_transform_with_int():
+def test_groupby_transform_with_int(using_infer_string):
# GH 3740, make sure that we might upcast on item-by-item transform
# floats
@@ -613,8 +618,14 @@ def test_groupby_transform_with_int():
"D": "foo",
}
)
+ msg = "Could not convert"
+ if using_infer_string:
+ if HAS_PYARROW:
+ msg = "with dtype str does not support reduction 'mean'"
+ else:
+ msg = "Cannot perform reduction 'mean' with string dtype"
with np.errstate(all="ignore"):
- with pytest.raises(TypeError, match="Could not convert"):
+ with pytest.raises(TypeError, match=msg):
df.groupby("A").transform(lambda x: (x - x.mean()) / x.std())
result = df.groupby("A")[["B", "C"]].transform(
lambda x: (x - x.mean()) / x.std()
@@ -626,7 +637,7 @@ def test_groupby_transform_with_int():
s = Series([2, 3, 4, 10, 5, -1])
df = DataFrame({"A": [1, 1, 1, 2, 2, 2], "B": 1, "C": s, "D": "foo"})
with np.errstate(all="ignore"):
- with pytest.raises(TypeError, match="Could not convert"):
+ with pytest.raises(TypeError, match=msg):
df.groupby("A").transform(lambda x: (x - x.mean()) / x.std())
result = df.groupby("A")[["B", "C"]].transform(
lambda x: (x - x.mean()) / x.std()
@@ -850,7 +861,6 @@ def test_cython_transform_frame(request, op, args, targop, df_fix, gb_target):
tm.assert_frame_equal(result, expected)
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
@pytest.mark.slow
@pytest.mark.parametrize(
"op, args, targop",
@@ -901,6 +911,7 @@ def test_cython_transform_frame_column(
"does not support .* operations",
".* is not supported for object dtype",
"is not implemented for this dtype",
+ ".* is not supported for str dtype",
]
)
with pytest.raises(TypeError, match=msg):
diff --git a/pandas/tests/indexes/base_class/test_constructors.py b/pandas/tests/indexes/base_class/test_constructors.py
index 338509dd239e6..dcf0165ead6c0 100644
--- a/pandas/tests/indexes/base_class/test_constructors.py
+++ b/pandas/tests/indexes/base_class/test_constructors.py
@@ -47,9 +47,7 @@ def test_construct_empty_tuples(self, tuple_list):
def test_index_string_inference(self):
# GH#54430
- pytest.importorskip("pyarrow")
- dtype = "string[pyarrow_numpy]"
- expected = Index(["a", "b"], dtype=dtype)
+ expected = Index(["a", "b"], dtype=pd.StringDtype(na_value=np.nan))
with pd.option_context("future.infer_string", True):
ser = Index(["a", "b"])
tm.assert_index_equal(ser, expected)
diff --git a/pandas/tests/indexes/base_class/test_reshape.py b/pandas/tests/indexes/base_class/test_reshape.py
index 6a544e448ebe1..b1a6c30b52f68 100644
--- a/pandas/tests/indexes/base_class/test_reshape.py
+++ b/pandas/tests/indexes/base_class/test_reshape.py
@@ -59,12 +59,11 @@ def test_insert_datetime_into_object(self, loc, val):
tm.assert_index_equal(result, expected)
assert type(expected[2]) is type(val)
- def test_insert_none_into_string_numpy(self):
+ def test_insert_none_into_string_numpy(self, string_dtype_no_object):
# GH#55365
- pytest.importorskip("pyarrow")
- index = Index(["a", "b", "c"], dtype="string[pyarrow_numpy]")
+ index = Index(["a", "b", "c"], dtype=string_dtype_no_object)
result = index.insert(-1, None)
- expected = Index(["a", "b", None, "c"], dtype="string[pyarrow_numpy]")
+ expected = Index(["a", "b", None, "c"], dtype=string_dtype_no_object)
tm.assert_index_equal(result, expected)
@pytest.mark.parametrize(
diff --git a/pandas/tests/indexes/base_class/test_setops.py b/pandas/tests/indexes/base_class/test_setops.py
index 2176aa52b17f4..a897e5aca058a 100644
--- a/pandas/tests/indexes/base_class/test_setops.py
+++ b/pandas/tests/indexes/base_class/test_setops.py
@@ -3,8 +3,6 @@
import numpy as np
import pytest
-from pandas._config import using_string_dtype
-
import pandas as pd
from pandas import (
Index,
@@ -233,7 +231,6 @@ def test_tuple_union_bug(self, method, expected, sort):
expected = Index(expected)
tm.assert_index_equal(result, expected)
- @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
@pytest.mark.parametrize("first_list", [["b", "a"], []])
@pytest.mark.parametrize("second_list", [["a", "b"], []])
@pytest.mark.parametrize(
@@ -243,6 +240,7 @@ def test_tuple_union_bug(self, method, expected, sort):
def test_union_name_preservation(
self, first_list, second_list, first_name, second_name, expected_name, sort
):
+ expected_dtype = object if not first_list or not second_list else "str"
first = Index(first_list, name=first_name)
second = Index(second_list, name=second_name)
union = first.union(second, sort=sort)
@@ -253,7 +251,7 @@ def test_union_name_preservation(
expected = Index(sorted(vals), name=expected_name)
tm.assert_index_equal(union, expected)
else:
- expected = Index(vals, name=expected_name)
+ expected = Index(vals, name=expected_name, dtype=expected_dtype)
tm.assert_index_equal(union.sort_values(), expected.sort_values())
@pytest.mark.parametrize(
diff --git a/pandas/tests/indexes/datetimes/methods/test_astype.py b/pandas/tests/indexes/datetimes/methods/test_astype.py
index c0bc6601769b1..a9bcae625e494 100644
--- a/pandas/tests/indexes/datetimes/methods/test_astype.py
+++ b/pandas/tests/indexes/datetimes/methods/test_astype.py
@@ -102,13 +102,16 @@ def test_astype_tznaive_to_tzaware(self):
# dt64->dt64tz deprecated
idx._data.astype("datetime64[ns, US/Eastern]")
- def test_astype_str_nat(self):
+ def test_astype_str_nat(self, using_infer_string):
# GH 13149, GH 13209
# verify that we are returning NaT as a string (and not unicode)
idx = DatetimeIndex(["2016-05-16", "NaT", NaT, np.nan])
result = idx.astype(str)
- expected = Index(["2016-05-16", "NaT", "NaT", "NaT"], dtype=object)
+ if using_infer_string:
+ expected = Index(["2016-05-16", None, None, None], dtype="str")
+ else:
+ expected = Index(["2016-05-16", "NaT", "NaT", "NaT"], dtype=object)
tm.assert_index_equal(result, expected)
def test_astype_str(self):
@@ -118,7 +121,7 @@ def test_astype_str(self):
expected = Index(
["2012-01-01", "2012-01-02", "2012-01-03", "2012-01-04"],
name="test_name",
- dtype=object,
+ dtype="str",
)
tm.assert_index_equal(result, expected)
@@ -133,7 +136,7 @@ def test_astype_str_tz_and_name(self):
"2012-01-03 00:00:00-05:00",
],
name="test_name",
- dtype=object,
+ dtype="str",
)
tm.assert_index_equal(result, expected)
@@ -144,7 +147,7 @@ def test_astype_str_freq_and_name(self):
expected = Index(
["2011-01-01 00:00:00", "2011-01-01 01:00:00", "2011-01-01 02:00:00"],
name="test_name",
- dtype=object,
+ dtype="str",
)
tm.assert_index_equal(result, expected)
@@ -156,7 +159,7 @@ def test_astype_str_freq_and_tz(self):
result = dti.astype(str)
expected = Index(
["2012-03-06 00:00:00+00:00", "2012-03-06 01:00:00+00:00"],
- dtype=object,
+ dtype="str",
name="test_name",
)
tm.assert_index_equal(result, expected)
diff --git a/pandas/tests/indexes/object/test_astype.py b/pandas/tests/indexes/object/test_astype.py
index 9c1ef302c5b51..ce05b5e9f2238 100644
--- a/pandas/tests/indexes/object/test_astype.py
+++ b/pandas/tests/indexes/object/test_astype.py
@@ -15,12 +15,12 @@ def test_astype_str_from_bytes():
# ensure_string_array which does f"{val}"
idx = Index(["あ", b"a"], dtype="object")
result = idx.astype(str)
- expected = Index(["あ", "a"], dtype="object")
+ expected = Index(["あ", "a"], dtype="str")
tm.assert_index_equal(result, expected)
# while we're here, check that Series.astype behaves the same
result = Series(idx).astype(str)
- expected = Series(expected, dtype=object)
+ expected = Series(expected, dtype="str")
tm.assert_series_equal(result, expected)
diff --git a/pandas/tests/indexes/object/test_indexing.py b/pandas/tests/indexes/object/test_indexing.py
index add2f3f18b348..57e5c5e3b6abb 100644
--- a/pandas/tests/indexes/object/test_indexing.py
+++ b/pandas/tests/indexes/object/test_indexing.py
@@ -3,14 +3,10 @@
import numpy as np
import pytest
-from pandas._config import using_string_dtype
-
from pandas._libs.missing import (
NA,
is_matching_na,
)
-from pandas.compat import HAS_PYARROW
-import pandas.util._test_decorators as td
import pandas as pd
from pandas import Index
@@ -31,39 +27,25 @@ def test_get_indexer_strings(self, method, expected):
tm.assert_numpy_array_equal(actual, expected)
- @pytest.mark.xfail(
- using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)"
- )
def test_get_indexer_strings_raises(self, using_infer_string):
index = Index(["b", "c"])
- if using_infer_string:
- import pyarrow as pa
-
- msg = "has no kernel"
- with pytest.raises(pa.lib.ArrowNotImplementedError, match=msg):
- index.get_indexer(["a", "b", "c", "d"], method="nearest")
-
- with pytest.raises(pa.lib.ArrowNotImplementedError, match=msg):
- index.get_indexer(["a", "b", "c", "d"], method="pad", tolerance=2)
-
- with pytest.raises(pa.lib.ArrowNotImplementedError, match=msg):
- index.get_indexer(
- ["a", "b", "c", "d"], method="pad", tolerance=[2, 2, 2, 2]
- )
-
- else:
- msg = r"unsupported operand type\(s\) for -: 'str' and 'str'"
- with pytest.raises(TypeError, match=msg):
- index.get_indexer(["a", "b", "c", "d"], method="nearest")
+ msg = "|".join(
+ [
+ "operation 'sub' not supported for dtype 'str'",
+ r"unsupported operand type\(s\) for -: 'str' and 'str'",
+ ]
+ )
+ with pytest.raises(TypeError, match=msg):
+ index.get_indexer(["a", "b", "c", "d"], method="nearest")
- with pytest.raises(TypeError, match=msg):
- index.get_indexer(["a", "b", "c", "d"], method="pad", tolerance=2)
+ with pytest.raises(TypeError, match=msg):
+ index.get_indexer(["a", "b", "c", "d"], method="pad", tolerance=2)
- with pytest.raises(TypeError, match=msg):
- index.get_indexer(
- ["a", "b", "c", "d"], method="pad", tolerance=[2, 2, 2, 2]
- )
+ with pytest.raises(TypeError, match=msg):
+ index.get_indexer(
+ ["a", "b", "c", "d"], method="pad", tolerance=[2, 2, 2, 2]
+ )
def test_get_indexer_with_NA_values(
self, unique_nulls_fixture, unique_nulls_fixture2
@@ -176,14 +158,6 @@ def test_get_indexer_non_unique_np_nats(self, np_nat_fixture, np_nat_fixture2):
class TestSliceLocs:
- # TODO(infer_string) parametrize over multiple string dtypes
- @pytest.mark.parametrize(
- "dtype",
- [
- "object",
- pytest.param("string[pyarrow_numpy]", marks=td.skip_if_no("pyarrow")),
- ],
- )
@pytest.mark.parametrize(
"in_slice,expected",
[
@@ -207,24 +181,22 @@ class TestSliceLocs:
(pd.IndexSlice["m":"m":-1], ""), # type: ignore[misc]
],
)
- def test_slice_locs_negative_step(self, in_slice, expected, dtype):
- index = Index(list("bcdxy"), dtype=dtype)
+ def test_slice_locs_negative_step(self, in_slice, expected, any_string_dtype):
+ index = Index(list("bcdxy"), dtype=any_string_dtype)
s_start, s_stop = index.slice_locs(in_slice.start, in_slice.stop, in_slice.step)
result = index[s_start : s_stop : in_slice.step]
- expected = Index(list(expected), dtype=dtype)
+ expected = Index(list(expected), dtype=any_string_dtype)
tm.assert_index_equal(result, expected)
- # TODO(infer_string) parametrize over multiple string dtypes
- @td.skip_if_no("pyarrow")
- def test_slice_locs_negative_step_oob(self):
- index = Index(list("bcdxy"), dtype="string[pyarrow_numpy]")
+ def test_slice_locs_negative_step_oob(self, any_string_dtype):
+ index = Index(list("bcdxy"), dtype=any_string_dtype)
result = index[-10:5:1]
tm.assert_index_equal(result, index)
result = index[4:-10:-1]
- expected = Index(list("yxdcb"), dtype="string[pyarrow_numpy]")
+ expected = Index(list("yxdcb"), dtype=any_string_dtype)
tm.assert_index_equal(result, expected)
def test_slice_locs_dup(self):
diff --git a/pandas/tests/indexes/period/methods/test_astype.py b/pandas/tests/indexes/period/methods/test_astype.py
index d545bfd2fae0f..af3c2667f51b4 100644
--- a/pandas/tests/indexes/period/methods/test_astype.py
+++ b/pandas/tests/indexes/period/methods/test_astype.py
@@ -22,7 +22,7 @@ def test_astype_raises(self, dtype):
with pytest.raises(TypeError, match=msg):
idx.astype(dtype)
- def test_astype_conversion(self):
+ def test_astype_conversion(self, using_infer_string):
# GH#13149, GH#13209
idx = PeriodIndex(["2016-05-16", "NaT", NaT, np.nan], freq="D", name="idx")
@@ -41,7 +41,12 @@ def test_astype_conversion(self):
tm.assert_index_equal(result, expected)
result = idx.astype(str)
- expected = Index([str(x) for x in idx], name="idx", dtype=object)
+ if using_infer_string:
+ expected = Index(
+ [str(x) if x is not NaT else None for x in idx], name="idx", dtype="str"
+ )
+ else:
+ expected = Index([str(x) for x in idx], name="idx", dtype=object)
tm.assert_index_equal(result, expected)
idx = period_range("1990", "2009", freq="Y", name="idx")
diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py
index cf75f95d17b0a..3bcc62445f0ac 100644
--- a/pandas/tests/indexes/test_base.py
+++ b/pandas/tests/indexes/test_base.py
@@ -76,9 +76,6 @@ def test_constructor_casting(self, index):
tm.assert_contains_all(arr, new_index)
tm.assert_index_equal(index, new_index)
- @pytest.mark.xfail(
- using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)"
- )
def test_constructor_copy(self, using_infer_string):
index = Index(list("abc"), name="name")
arr = np.array(index)
@@ -346,11 +343,6 @@ def test_constructor_empty_special(self, empty, klass):
def test_view_with_args(self, index):
index.view("i8")
- @pytest.mark.xfail(
- using_string_dtype() and not HAS_PYARROW,
- reason="TODO(infer_string)",
- strict=False,
- )
@pytest.mark.parametrize(
"index",
[
@@ -367,7 +359,8 @@ def test_view_with_args_object_array_raises(self, index):
msg = "When changing to a larger dtype"
with pytest.raises(ValueError, match=msg):
index.view("i8")
- elif index.dtype == "string":
+ elif index.dtype == "str" and not index.dtype.storage == "python":
+ # TODO(infer_string): Make the errors consistent
with pytest.raises(NotImplementedError, match="i8"):
index.view("i8")
else:
@@ -978,10 +971,9 @@ def test_isin_empty(self, empty):
result = index.isin(empty)
tm.assert_numpy_array_equal(expected, result)
- @td.skip_if_no("pyarrow")
- def test_isin_arrow_string_null(self):
+ def test_isin_string_null(self, string_dtype_no_object):
# GH#55821
- index = Index(["a", "b"], dtype="string[pyarrow_numpy]")
+ index = Index(["a", "b"], dtype=string_dtype_no_object)
result = index.isin([None])
expected = np.array([False, False])
tm.assert_numpy_array_equal(result, expected)
diff --git a/pandas/tests/indexes/test_old_base.py b/pandas/tests/indexes/test_old_base.py
index 8d859a61a2bd5..176bf893cafa8 100644
--- a/pandas/tests/indexes/test_old_base.py
+++ b/pandas/tests/indexes/test_old_base.py
@@ -6,10 +6,7 @@
import numpy as np
import pytest
-from pandas._config import using_string_dtype
-
from pandas._libs.tslibs import Timestamp
-from pandas.compat import HAS_PYARROW
from pandas.core.dtypes.common import (
is_integer_dtype,
@@ -28,6 +25,7 @@
PeriodIndex,
RangeIndex,
Series,
+ StringDtype,
TimedeltaIndex,
isna,
period_range,
@@ -233,7 +231,6 @@ def test_logical_compat(self, simple_index):
with pytest.raises(TypeError, match=msg):
idx.any()
- @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
def test_repr_roundtrip(self, simple_index):
if isinstance(simple_index, IntervalIndex):
pytest.skip(f"Not a valid repr for {type(simple_index).__name__}")
@@ -250,11 +247,6 @@ def test_repr_max_seq_item_setting(self, simple_index):
repr(idx)
assert "..." not in str(idx)
- @pytest.mark.xfail(
- using_string_dtype() and not HAS_PYARROW,
- reason="TODO(infer_string)",
- strict=False,
- )
@pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning")
def test_ensure_copied_data(self, index):
# Check the "copy" argument of each Index.__new__ is honoured
@@ -302,12 +294,17 @@ def test_ensure_copied_data(self, index):
tm.assert_numpy_array_equal(
index._values._mask, result._values._mask, check_same="same"
)
- elif index.dtype == "string[python]":
+ elif (
+ isinstance(index.dtype, StringDtype) and index.dtype.storage == "python"
+ ):
assert np.shares_memory(index._values._ndarray, result._values._ndarray)
tm.assert_numpy_array_equal(
index._values._ndarray, result._values._ndarray, check_same="same"
)
- elif index.dtype in ("string[pyarrow]", "string[pyarrow_numpy]"):
+ elif (
+ isinstance(index.dtype, StringDtype)
+ and index.dtype.storage == "pyarrow"
+ ):
assert tm.shares_memory(result._values, index._values)
else:
raise NotImplementedError(index.dtype)
@@ -432,11 +429,7 @@ def test_insert_base(self, index):
result = trimmed.insert(0, index[0])
assert index[0:4].equals(result)
- @pytest.mark.skipif(
- using_string_dtype(),
- reason="completely different behavior, tested elsewher",
- )
- def test_insert_out_of_bounds(self, index):
+ def test_insert_out_of_bounds(self, index, using_infer_string):
# TypeError/IndexError matches what np.insert raises in these cases
if len(index) > 0:
@@ -448,6 +441,12 @@ def test_insert_out_of_bounds(self, index):
msg = "index (0|0.5) is out of bounds for axis 0 with size 0"
else:
msg = "slice indices must be integers or None or have an __index__ method"
+
+ if using_infer_string and (
+ index.dtype == "string" or index.dtype == "category" # noqa: PLR1714
+ ):
+ msg = "loc must be an integer between"
+
with pytest.raises(err, match=msg):
index.insert(0.5, "foo")
@@ -853,7 +852,6 @@ def test_append_preserves_dtype(self, simple_index):
alt = index.take(list(range(N)) * 2)
tm.assert_index_equal(result, alt, check_exact=True)
- @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
def test_inv(self, simple_index, using_infer_string):
idx = simple_index
@@ -867,21 +865,14 @@ def test_inv(self, simple_index, using_infer_string):
tm.assert_series_equal(res2, Series(expected))
else:
if idx.dtype.kind == "f":
- err = TypeError
msg = "ufunc 'invert' not supported for the input types"
- elif using_infer_string and idx.dtype == "string":
- import pyarrow as pa
-
- err = pa.lib.ArrowNotImplementedError
- msg = "has no kernel"
else:
- err = TypeError
- msg = "bad operand"
- with pytest.raises(err, match=msg):
+ msg = "bad operand|__invert__ is not supported for string dtype"
+ with pytest.raises(TypeError, match=msg):
~idx
# check that we get the same behavior with Series
- with pytest.raises(err, match=msg):
+ with pytest.raises(TypeError, match=msg):
~Series(idx)
def test_is_boolean_is_deprecated(self, simple_index):
diff --git a/pandas/tests/indexes/test_setops.py b/pandas/tests/indexes/test_setops.py
index 4a6982cf98670..72c3396f124b8 100644
--- a/pandas/tests/indexes/test_setops.py
+++ b/pandas/tests/indexes/test_setops.py
@@ -240,9 +240,6 @@ def test_intersection_base(self, index):
with pytest.raises(TypeError, match=msg):
first.intersection([1, 2, 3])
- @pytest.mark.filterwarnings(
- "ignore:Falling back on a non-pyarrow:pandas.errors.PerformanceWarning"
- )
@pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning")
def test_union_base(self, index):
index = index.unique()
@@ -270,9 +267,6 @@ def test_union_base(self, index):
first.union([1, 2, 3])
@pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning")
- @pytest.mark.filterwarnings(
- "ignore:Falling back on a non-pyarrow:pandas.errors.PerformanceWarning"
- )
def test_difference_base(self, sort, index):
first = index[2:]
second = index[:4]
@@ -299,9 +293,6 @@ def test_difference_base(self, sort, index):
first.difference([1, 2, 3], sort)
@pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning")
- @pytest.mark.filterwarnings(
- "ignore:Falling back on a non-pyarrow:pandas.errors.PerformanceWarning"
- )
def test_symmetric_difference(self, index):
if isinstance(index, CategoricalIndex):
pytest.skip(f"Not relevant for {type(index).__name__}")
@@ -523,9 +514,6 @@ def test_intersection_difference_match_empty(self, index, sort):
@pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning")
-@pytest.mark.filterwarnings(
- "ignore:Falling back on a non-pyarrow:pandas.errors.PerformanceWarning"
-)
@pytest.mark.parametrize(
"method", ["intersection", "union", "difference", "symmetric_difference"]
)
diff --git a/pandas/tests/indexes/timedeltas/methods/test_astype.py b/pandas/tests/indexes/timedeltas/methods/test_astype.py
index 311f2b5c9aa59..5166cadae499e 100644
--- a/pandas/tests/indexes/timedeltas/methods/test_astype.py
+++ b/pandas/tests/indexes/timedeltas/methods/test_astype.py
@@ -44,7 +44,7 @@ def test_astype_object_with_nat(self):
tm.assert_index_equal(result, expected)
assert idx.tolist() == expected_list
- def test_astype(self):
+ def test_astype(self, using_infer_string):
# GH 13149, GH 13209
idx = TimedeltaIndex([1e14, "NaT", NaT, np.nan], name="idx")
@@ -61,7 +61,12 @@ def test_astype(self):
tm.assert_index_equal(result, expected)
result = idx.astype(str)
- expected = Index([str(x) for x in idx], name="idx", dtype=object)
+ if using_infer_string:
+ expected = Index(
+ [str(x) if x is not NaT else None for x in idx], name="idx", dtype="str"
+ )
+ else:
+ expected = Index([str(x) for x in idx], name="idx", dtype=object)
tm.assert_index_equal(result, expected)
rng = timedelta_range("1 days", periods=10)
diff --git a/pandas/tests/indexing/test_iloc.py b/pandas/tests/indexing/test_iloc.py
index 3fd9498e21a73..c2742f42e3a92 100644
--- a/pandas/tests/indexing/test_iloc.py
+++ b/pandas/tests/indexing/test_iloc.py
@@ -6,8 +6,6 @@
import numpy as np
import pytest
-from pandas._config import using_string_dtype
-
from pandas.errors import IndexingError
import pandas.util._test_decorators as td
@@ -1218,22 +1216,27 @@ def test_iloc_getitem_int_single_ea_block_view(self):
arr[2] = arr[-1]
assert ser[0] == arr[-1]
- @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
- def test_iloc_setitem_multicolumn_to_datetime(self):
+ def test_iloc_setitem_multicolumn_to_datetime(self, using_infer_string):
# GH#20511
df = DataFrame({"A": ["2022-01-01", "2022-01-02"], "B": ["2021", "2022"]})
- df.iloc[:, [0]] = DataFrame({"A": to_datetime(["2021", "2022"])})
- expected = DataFrame(
- {
- "A": [
- Timestamp("2021-01-01 00:00:00"),
- Timestamp("2022-01-01 00:00:00"),
- ],
- "B": ["2021", "2022"],
- }
- )
- tm.assert_frame_equal(df, expected, check_dtype=False)
+ if using_infer_string:
+ with tm.assert_produces_warning(
+ FutureWarning, match="Setting an item of incompatible dtype"
+ ):
+ df.iloc[:, [0]] = DataFrame({"A": to_datetime(["2021", "2022"])})
+ else:
+ df.iloc[:, [0]] = DataFrame({"A": to_datetime(["2021", "2022"])})
+ expected = DataFrame(
+ {
+ "A": [
+ Timestamp("2021-01-01 00:00:00"),
+ Timestamp("2022-01-01 00:00:00"),
+ ],
+ "B": ["2021", "2022"],
+ }
+ )
+ tm.assert_frame_equal(df, expected, check_dtype=False)
class TestILocErrors:
diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py
index e57598cfc2be1..0ff33ba88b16f 100644
--- a/pandas/tests/indexing/test_indexing.py
+++ b/pandas/tests/indexing/test_indexing.py
@@ -8,8 +8,6 @@
import numpy as np
import pytest
-from pandas._config import using_string_dtype
-
from pandas.errors import IndexingError
from pandas.core.dtypes.common import (
@@ -563,12 +561,12 @@ def test_string_slice_empty(self):
with pytest.raises(KeyError, match="^0$"):
df.loc["2011", 0]
- @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
def test_astype_assignment(self, using_infer_string):
# GH4312 (iloc)
df_orig = DataFrame(
[["1", "2", "3", ".4", 5, 6.0, "foo"]], columns=list("ABCDEFG")
)
+ df_orig[list("ABCDG")] = df_orig[list("ABCDG")].astype(object)
df = df_orig.copy()
@@ -578,9 +576,9 @@ def test_astype_assignment(self, using_infer_string):
expected = DataFrame(
[[1, 2, "3", ".4", 5, 6.0, "foo"]], columns=list("ABCDEFG")
)
- if not using_infer_string:
- expected["A"] = expected["A"].astype(object)
- expected["B"] = expected["B"].astype(object)
+ expected[list("CDG")] = expected[list("CDG")].astype(object)
+ expected["A"] = expected["A"].astype(object)
+ expected["B"] = expected["B"].astype(object)
tm.assert_frame_equal(df, expected)
# GH5702 (loc)
@@ -589,18 +587,16 @@ def test_astype_assignment(self, using_infer_string):
expected = DataFrame(
[[1, "2", "3", ".4", 5, 6.0, "foo"]], columns=list("ABCDEFG")
)
- if not using_infer_string:
- expected["A"] = expected["A"].astype(object)
+ expected[list("ABCDG")] = expected[list("ABCDG")].astype(object)
tm.assert_frame_equal(df, expected)
df = df_orig.copy()
+
df.loc[:, ["B", "C"]] = df.loc[:, ["B", "C"]].astype(np.int64)
expected = DataFrame(
[["1", 2, 3, ".4", 5, 6.0, "foo"]], columns=list("ABCDEFG")
)
- if not using_infer_string:
- expected["B"] = expected["B"].astype(object)
- expected["C"] = expected["C"].astype(object)
+ expected[list("ABCDG")] = expected[list("ABCDG")].astype(object)
tm.assert_frame_equal(df, expected)
def test_astype_assignment_full_replacements(self):
diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py
index d61b2ea642439..bdc6d9aff6f4e 100644
--- a/pandas/tests/indexing/test_loc.py
+++ b/pandas/tests/indexing/test_loc.py
@@ -1,5 +1,6 @@
""" test label based indexing with loc """
from collections import namedtuple
+import contextlib
from datetime import (
date,
datetime,
@@ -15,7 +16,6 @@
from pandas._config import using_string_dtype
from pandas._libs import index as libindex
-from pandas.compat import HAS_PYARROW
from pandas.compat.numpy import np_version_gt2
from pandas.errors import IndexingError
import pandas.util._test_decorators as td
@@ -648,8 +648,9 @@ def test_loc_setitem_consistency_empty(self):
expected["x"] = expected["x"].astype(np.int64)
tm.assert_frame_equal(df, expected)
+ # incompatible dtype warning
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
- def test_loc_setitem_consistency_slice_column_len(self):
+ def test_loc_setitem_consistency_slice_column_len(self, using_infer_string):
# .loc[:,column] setting with slice == len of the column
# GH10408
levels = [
@@ -673,13 +674,24 @@ def test_loc_setitem_consistency_slice_column_len(self):
]
df = DataFrame(values, index=mi, columns=cols)
- df.loc[:, ("Respondent", "StartDate")] = to_datetime(
- df.loc[:, ("Respondent", "StartDate")]
- )
- df.loc[:, ("Respondent", "EndDate")] = to_datetime(
- df.loc[:, ("Respondent", "EndDate")]
- )
- df = df.infer_objects(copy=False)
+ ctx = contextlib.nullcontext()
+ if using_infer_string:
+ ctx = pytest.raises(TypeError, match="Invalid value")
+
+ with ctx:
+ df.loc[:, ("Respondent", "StartDate")] = to_datetime(
+ df.loc[:, ("Respondent", "StartDate")]
+ )
+ with ctx:
+ df.loc[:, ("Respondent", "EndDate")] = to_datetime(
+ df.loc[:, ("Respondent", "EndDate")]
+ )
+
+ if using_infer_string:
+ # infer-objects won't infer stuff anymore
+ return
+
+ df = df.infer_objects()
# Adding a new key
df.loc[:, ("Respondent", "Duration")] = (
@@ -1269,20 +1281,23 @@ def test_loc_reverse_assignment(self):
tm.assert_series_equal(result, expected)
- @pytest.mark.xfail(using_string_dtype(), reason="can't set int into string")
- def test_loc_setitem_str_to_small_float_conversion_type(self):
+ def test_loc_setitem_str_to_small_float_conversion_type(self, using_infer_string):
# GH#20388
col_data = [str(np.random.default_rng(2).random() * 1e-12) for _ in range(5)]
result = DataFrame(col_data, columns=["A"])
- expected = DataFrame(col_data, columns=["A"], dtype=object)
+ expected = DataFrame(col_data, columns=["A"])
tm.assert_frame_equal(result, expected)
# assigning with loc/iloc attempts to set the values inplace, which
# in this case is successful
- result.loc[result.index, "A"] = [float(x) for x in col_data]
- expected = DataFrame(col_data, columns=["A"], dtype=float).astype(object)
- tm.assert_frame_equal(result, expected)
+ if using_infer_string:
+ with pytest.raises(TypeError, match="Must provide strings"):
+ result.loc[result.index, "A"] = [float(x) for x in col_data]
+ else:
+ result.loc[result.index, "A"] = [float(x) for x in col_data]
+ expected = DataFrame(col_data, columns=["A"], dtype=float).astype(object)
+ tm.assert_frame_equal(result, expected)
# assigning the entire column using __setitem__ swaps in the new array
# GH#???
@@ -1443,9 +1458,6 @@ def test_loc_setitem_listlike_with_timedelta64index(self, indexer, expected):
tm.assert_frame_equal(expected, df)
- @pytest.mark.xfail(
- using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)"
- )
def test_loc_setitem_categorical_values_partial_column_slice(self):
# Assigning a Category to parts of a int/... column uses the values of
# the Categorical
@@ -1458,9 +1470,6 @@ def test_loc_setitem_categorical_values_partial_column_slice(self):
df.loc[2:3, "b"] = Categorical(["b", "b"], categories=["a", "b"])
tm.assert_frame_equal(df, exp)
- @pytest.mark.xfail(
- using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)"
- )
def test_loc_setitem_single_row_categorical(self, using_infer_string):
# GH#25495
df = DataFrame({"Alpha": ["a"], "Numeric": [0]})
diff --git a/pandas/tests/interchange/test_impl.py b/pandas/tests/interchange/test_impl.py
index d1a15dc93f702..ef94c4c7aff2c 100644
--- a/pandas/tests/interchange/test_impl.py
+++ b/pandas/tests/interchange/test_impl.py
@@ -412,6 +412,7 @@ def test_interchange_from_corrected_buffer_dtypes(monkeypatch) -> None:
pd.api.interchange.from_dataframe(df)
+@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
def test_empty_string_column():
# https://github.com/pandas-dev/pandas/issues/56703
df = pd.DataFrame({"a": []}, dtype=str)
@@ -476,7 +477,7 @@ def test_non_str_names_w_duplicates():
([1.0, 2.25, None], "Float32[pyarrow]", "float32"),
([True, False, None], "boolean", "bool"),
([True, False, None], "boolean[pyarrow]", "bool"),
- (["much ado", "about", None], "string[pyarrow_numpy]", "large_string"),
+ (["much ado", "about", None], pd.StringDtype(na_value=np.nan), "large_string"),
(["much ado", "about", None], "string[pyarrow]", "large_string"),
(
[datetime(2020, 1, 1), datetime(2020, 1, 2), None],
@@ -539,7 +540,11 @@ def test_pandas_nullable_with_missing_values(
([1.0, 2.25, 5.0], "Float32[pyarrow]", "float32"),
([True, False, False], "boolean", "bool"),
([True, False, False], "boolean[pyarrow]", "bool"),
- (["much ado", "about", "nothing"], "string[pyarrow_numpy]", "large_string"),
+ (
+ ["much ado", "about", "nothing"],
+ pd.StringDtype(na_value=np.nan),
+ "large_string",
+ ),
(["much ado", "about", "nothing"], "string[pyarrow]", "large_string"),
(
[datetime(2020, 1, 1), datetime(2020, 1, 2), datetime(2020, 1, 3)],
diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py
index 8dc76d8f747cb..3c5e1e1cf5afb 100644
--- a/pandas/tests/io/excel/test_readers.py
+++ b/pandas/tests/io/excel/test_readers.py
@@ -550,7 +550,7 @@ def test_reader_dtype(self, read_ext):
expected["a"] = expected["a"].astype("float64")
expected["b"] = expected["b"].astype("float32")
- expected["c"] = Series(["001", "002", "003", "004"], dtype=object)
+ expected["c"] = Series(["001", "002", "003", "004"], dtype="str")
tm.assert_frame_equal(actual, expected)
msg = "Unable to convert column d to type int64"
@@ -577,9 +577,9 @@ def test_reader_dtype(self, read_ext):
{
"a": Series([1, 2, 3, 4], dtype="float64"),
"b": Series([2.5, 3.5, 4.5, 5.5], dtype="float32"),
- "c": Series(["001", "002", "003", "004"], dtype=object),
- "d": Series(["1", "2", np.nan, "4"], dtype=object),
- }
+ "c": Series(["001", "002", "003", "004"], dtype="str"),
+ "d": Series(["1", "2", np.nan, "4"], dtype="str"),
+ },
),
),
],
diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py
index de40441fe25dd..a8608434be5ee 100644
--- a/pandas/tests/io/json/test_pandas.py
+++ b/pandas/tests/io/json/test_pandas.py
@@ -2139,18 +2139,18 @@ def test_pyarrow_engine_lines_false():
def test_json_roundtrip_string_inference(orient):
- pytest.importorskip("pyarrow")
df = DataFrame(
[["a", "b"], ["c", "d"]], index=["row 1", "row 2"], columns=["col 1", "col 2"]
)
out = df.to_json()
with pd.option_context("future.infer_string", True):
result = read_json(StringIO(out))
+ dtype = pd.StringDtype(na_value=np.nan)
expected = DataFrame(
[["a", "b"], ["c", "d"]],
- dtype="string[pyarrow_numpy]",
- index=Index(["row 1", "row 2"], dtype="string[pyarrow_numpy]"),
- columns=Index(["col 1", "col 2"], dtype="string[pyarrow_numpy]"),
+ dtype=dtype,
+ index=Index(["row 1", "row 2"], dtype=dtype),
+ columns=Index(["col 1", "col 2"], dtype=dtype),
)
tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py
index 800ece5a409e1..787941c5d0376 100644
--- a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py
+++ b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py
@@ -28,7 +28,7 @@
@pytest.mark.parametrize("dtype", [str, object])
@pytest.mark.parametrize("check_orig", [True, False])
@pytest.mark.usefixtures("pyarrow_xfail")
-def test_dtype_all_columns(all_parsers, dtype, check_orig):
+def test_dtype_all_columns(all_parsers, dtype, check_orig, using_infer_string):
# see gh-3795, gh-6607
parser = all_parsers
@@ -46,8 +46,10 @@ def test_dtype_all_columns(all_parsers, dtype, check_orig):
if check_orig:
expected = df.copy()
result = result.astype(float)
- else:
+ elif using_infer_string and dtype is str:
expected = df.astype(str)
+ else:
+ expected = df.astype(str).astype(object)
tm.assert_frame_equal(result, expected)
@@ -300,7 +302,6 @@ def test_true_values_cast_to_bool(all_parsers):
tm.assert_frame_equal(result, expected)
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
@pytest.mark.usefixtures("pyarrow_xfail")
@pytest.mark.parametrize("dtypes, exp_value", [({}, "1"), ({"a.1": "int64"}, 1)])
def test_dtype_mangle_dup_cols(all_parsers, dtypes, exp_value):
@@ -316,7 +317,6 @@ def test_dtype_mangle_dup_cols(all_parsers, dtypes, exp_value):
tm.assert_frame_equal(result, expected)
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
@pytest.mark.usefixtures("pyarrow_xfail")
def test_dtype_mangle_dup_cols_single_dtype(all_parsers):
# GH#42022
@@ -547,8 +547,7 @@ def test_ea_int_avoid_overflow(all_parsers):
def test_string_inference(all_parsers):
# GH#54430
- pytest.importorskip("pyarrow")
- dtype = "string[pyarrow_numpy]"
+ dtype = pd.StringDtype(na_value=np.nan)
data = """a,b
x,1
@@ -566,10 +565,8 @@ def test_string_inference(all_parsers):
@pytest.mark.parametrize("dtype", ["O", object, "object", np.object_, str, np.str_])
-def test_string_inference_object_dtype(all_parsers, dtype):
+def test_string_inference_object_dtype(all_parsers, dtype, using_infer_string):
# GH#56047
- pytest.importorskip("pyarrow")
-
data = """a,b
x,a
y,a
@@ -578,12 +575,13 @@ def test_string_inference_object_dtype(all_parsers, dtype):
with pd.option_context("future.infer_string", True):
result = parser.read_csv(StringIO(data), dtype=dtype)
+ expected_dtype = pd.StringDtype(na_value=np.nan) if dtype is str else object
expected = DataFrame(
{
- "a": pd.Series(["x", "y", "z"], dtype=object),
- "b": pd.Series(["a", "a", "a"], dtype=object),
+ "a": pd.Series(["x", "y", "z"], dtype=expected_dtype),
+ "b": pd.Series(["a", "a", "a"], dtype=expected_dtype),
},
- columns=pd.Index(["a", "b"], dtype="string[pyarrow_numpy]"),
+ columns=pd.Index(["a", "b"], dtype=pd.StringDtype(na_value=np.nan)),
)
tm.assert_frame_equal(result, expected)
@@ -592,10 +590,10 @@ def test_string_inference_object_dtype(all_parsers, dtype):
expected = DataFrame(
{
- "a": pd.Series(["x", "y", "z"], dtype=object),
- "b": pd.Series(["a", "a", "a"], dtype="string[pyarrow_numpy]"),
+ "a": pd.Series(["x", "y", "z"], dtype=expected_dtype),
+ "b": pd.Series(["a", "a", "a"], dtype=pd.StringDtype(na_value=np.nan)),
},
- columns=pd.Index(["a", "b"], dtype="string[pyarrow_numpy]"),
+ columns=pd.Index(["a", "b"], dtype=pd.StringDtype(na_value=np.nan)),
)
tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/io/parser/test_na_values.py b/pandas/tests/io/parser/test_na_values.py
index 1a3b7b37bf66b..5f9823f7225f9 100644
--- a/pandas/tests/io/parser/test_na_values.py
+++ b/pandas/tests/io/parser/test_na_values.py
@@ -630,7 +630,6 @@ def test_inf_na_values_with_int_index(all_parsers):
tm.assert_frame_equal(out, expected)
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
@xfail_pyarrow # mismatched shape
@pytest.mark.parametrize("na_filter", [True, False])
def test_na_values_with_dtype_str_and_na_filter(all_parsers, na_filter):
@@ -682,7 +681,6 @@ def test_cast_NA_to_bool_raises_error(all_parsers, data, na_values):
# TODO: this test isn't about the na_values keyword, it is about the empty entries
# being returned with NaN entries, whereas the pyarrow engine returns "nan"
@xfail_pyarrow # mismatched shapes
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
def test_str_nan_dropped(all_parsers):
# see gh-21131
parser = all_parsers
diff --git a/pandas/tests/io/parser/test_python_parser_only.py b/pandas/tests/io/parser/test_python_parser_only.py
index 9e7530906afa3..5f2ddf7de9c6d 100644
--- a/pandas/tests/io/parser/test_python_parser_only.py
+++ b/pandas/tests/io/parser/test_python_parser_only.py
@@ -17,8 +17,6 @@
import numpy as np
import pytest
-from pandas._config import using_string_dtype
-
from pandas.errors import (
ParserError,
ParserWarning,
@@ -498,7 +496,6 @@ def test_header_int_do_not_infer_multiindex_names_on_different_line(python_parse
tm.assert_frame_equal(result, expected)
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
@pytest.mark.parametrize(
"dtype", [{"a": object}, {"a": str, "b": np.int64, "c": np.int64}]
)
@@ -523,10 +520,11 @@ def test_no_thousand_convert_with_dot_for_non_numeric_cols(python_parser_only, d
"c": [0, 4000, 131],
}
)
+ if dtype["a"] == object:
+ expected["a"] = expected["a"].astype(object)
tm.assert_frame_equal(result, expected)
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
@pytest.mark.parametrize(
"dtype,expected",
[
diff --git a/pandas/tests/io/pytables/test_read.py b/pandas/tests/io/pytables/test_read.py
index a04f02f0e052b..28cd8aea1defc 100644
--- a/pandas/tests/io/pytables/test_read.py
+++ b/pandas/tests/io/pytables/test_read.py
@@ -403,7 +403,6 @@ def test_read_py2_hdf_file_in_py3(datapath):
def test_read_infer_string(tmp_path, setup_path):
# GH#54431
- pytest.importorskip("pyarrow")
df = DataFrame({"a": ["a", "b", None]})
path = tmp_path / setup_path
df.to_hdf(path, key="data", format="table")
@@ -411,7 +410,7 @@ def test_read_infer_string(tmp_path, setup_path):
result = read_hdf(path, key="data", mode="r")
expected = DataFrame(
{"a": ["a", "b", None]},
- dtype="string[pyarrow_numpy]",
- columns=Index(["a"], dtype="string[pyarrow_numpy]"),
+ dtype=pd.StringDtype(na_value=np.nan),
+ columns=Index(["a"], dtype=pd.StringDtype(na_value=np.nan)),
)
tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/io/test_feather.py b/pandas/tests/io/test_feather.py
index d1201686edefa..3b4484e44e155 100644
--- a/pandas/tests/io/test_feather.py
+++ b/pandas/tests/io/test_feather.py
@@ -2,19 +2,15 @@
import numpy as np
import pytest
-from pandas._config import using_string_dtype
-
import pandas as pd
import pandas._testing as tm
from pandas.io.feather_format import read_feather, to_feather # isort:skip
-pytestmark = [
- pytest.mark.filterwarnings(
- "ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
- ),
- pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False),
-]
+pytestmark = pytest.mark.filterwarnings(
+ "ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
+)
+
pa = pytest.importorskip("pyarrow")
@@ -150,12 +146,11 @@ def test_path_localpath(self):
result = tm.round_trip_localpath(df.to_feather, read_feather)
tm.assert_frame_equal(df, result)
- @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
def test_passthrough_keywords(self):
df = pd.DataFrame(
1.1 * np.arange(120).reshape((30, 4)),
- columns=pd.Index(list("ABCD"), dtype=object),
- index=pd.Index([f"i-{i}" for i in range(30)], dtype=object),
+ columns=pd.Index(list("ABCD")),
+ index=pd.Index([f"i-{i}" for i in range(30)]),
).reset_index()
self.check_round_trip(df, write_kwargs={"version": 1})
@@ -169,7 +164,9 @@ def test_http_path(self, feather_file, httpserver):
res = read_feather(httpserver.url)
tm.assert_frame_equal(expected, res)
- def test_read_feather_dtype_backend(self, string_storage, dtype_backend):
+ def test_read_feather_dtype_backend(
+ self, string_storage, dtype_backend, using_infer_string
+ ):
# GH#50765
df = pd.DataFrame(
{
@@ -191,7 +188,10 @@ def test_read_feather_dtype_backend(self, string_storage, dtype_backend):
if dtype_backend == "pyarrow":
pa = pytest.importorskip("pyarrow")
- string_dtype = pd.ArrowDtype(pa.string())
+ if using_infer_string:
+ string_dtype = pd.ArrowDtype(pa.large_string())
+ else:
+ string_dtype = pd.ArrowDtype(pa.string())
else:
string_dtype = pd.StringDtype(string_storage)
@@ -218,6 +218,10 @@ def test_read_feather_dtype_backend(self, string_storage, dtype_backend):
}
)
+ if using_infer_string:
+ expected.columns = expected.columns.astype(
+ pd.StringDtype(string_storage, na_value=np.nan)
+ )
tm.assert_frame_equal(result, expected)
def test_int_columns_and_index(self):
@@ -242,5 +246,7 @@ def test_string_inference(self, tmp_path):
df.to_feather(path)
with pd.option_context("future.infer_string", True):
result = read_feather(path)
- expected = pd.DataFrame(data={"a": ["x", "y"]}, dtype="string[pyarrow_numpy]")
+ expected = pd.DataFrame(
+ data={"a": ["x", "y"]}, dtype=pd.StringDtype(na_value=np.nan)
+ )
tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/io/test_fsspec.py b/pandas/tests/io/test_fsspec.py
index 19b60e17d3a92..5ed64e3eb0958 100644
--- a/pandas/tests/io/test_fsspec.py
+++ b/pandas/tests/io/test_fsspec.py
@@ -168,7 +168,7 @@ def test_excel_options(fsspectest):
assert fsspectest.test[0] == "read"
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
+@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string) fastparquet")
def test_to_parquet_new_file(cleared_fs, df1):
"""Regression test for writing to a not-yet-existent GCS Parquet file."""
pytest.importorskip("fastparquet")
@@ -198,7 +198,7 @@ def test_arrowparquet_options(fsspectest):
@td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) fastparquet
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
+@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string) fastparquet")
def test_fastparquet_options(fsspectest):
"""Regression test for writing to a not-yet-existent GCS Parquet file."""
pytest.importorskip("fastparquet")
@@ -256,7 +256,7 @@ def test_s3_protocols(s3_public_bucket_with_data, tips_file, protocol, s3so):
)
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
+@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string) fastparquet")
@pytest.mark.single_cpu
@td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) fastparquet
def test_s3_parquet(s3_public_bucket, s3so, df1):
diff --git a/pandas/tests/io/test_gcs.py b/pandas/tests/io/test_gcs.py
index 96bc0326b23ab..81f951b3958b0 100644
--- a/pandas/tests/io/test_gcs.py
+++ b/pandas/tests/io/test_gcs.py
@@ -197,7 +197,7 @@ def test_to_csv_compression_encoding_gcs(
tm.assert_frame_equal(df, read_df)
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
+@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string) fastparquet")
def test_to_parquet_gcs_new_file(monkeypatch, tmpdir):
"""Regression test for writing to a not-yet-existent GCS Parquet file."""
pytest.importorskip("fastparquet")
diff --git a/pandas/tests/io/test_orc.py b/pandas/tests/io/test_orc.py
index 52d6850483418..4c4d7461e4ac5 100644
--- a/pandas/tests/io/test_orc.py
+++ b/pandas/tests/io/test_orc.py
@@ -8,8 +8,6 @@
import numpy as np
import pytest
-from pandas._config import using_string_dtype
-
import pandas as pd
from pandas import read_orc
import pandas._testing as tm
@@ -19,12 +17,9 @@
import pyarrow as pa
-pytestmark = [
- pytest.mark.filterwarnings(
- "ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
- ),
- pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False),
-]
+pytestmark = pytest.mark.filterwarnings(
+ "ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
+)
@pytest.fixture
@@ -47,7 +42,7 @@ def orc_writer_dtypes_not_supported(request):
return pd.DataFrame({"unimpl": request.param})
-def test_orc_reader_empty(dirpath):
+def test_orc_reader_empty(dirpath, using_infer_string):
columns = [
"boolean1",
"byte1",
@@ -68,11 +63,12 @@ def test_orc_reader_empty(dirpath):
"float32",
"float64",
"object",
- "object",
+ "str" if using_infer_string else "object",
]
expected = pd.DataFrame(index=pd.RangeIndex(0))
for colname, dtype in zip(columns, dtypes):
expected[colname] = pd.Series(dtype=dtype)
+ expected.columns = expected.columns.astype("str")
inputfile = os.path.join(dirpath, "TestOrcFile.emptyFile.orc")
got = read_orc(inputfile, columns=columns)
@@ -309,7 +305,7 @@ def test_orc_writer_dtypes_not_supported(orc_writer_dtypes_not_supported):
orc_writer_dtypes_not_supported.to_orc()
-def test_orc_dtype_backend_pyarrow():
+def test_orc_dtype_backend_pyarrow(using_infer_string):
pytest.importorskip("pyarrow")
df = pd.DataFrame(
{
@@ -340,6 +336,13 @@ def test_orc_dtype_backend_pyarrow():
for col in df.columns
}
)
+ if using_infer_string:
+ # ORC does not preserve distinction between string and large string
+ # -> the default large string comes back as string
+ string_dtype = pd.ArrowDtype(pa.string())
+ expected["string"] = expected["string"].astype(string_dtype)
+ expected["string_with_nan"] = expected["string_with_nan"].astype(string_dtype)
+ expected["string_with_none"] = expected["string_with_none"].astype(string_dtype)
tm.assert_frame_equal(result, expected)
@@ -435,7 +438,7 @@ def test_string_inference(tmp_path):
result = read_orc(path)
expected = pd.DataFrame(
data={"a": ["x", "y"]},
- dtype="string[pyarrow_numpy]",
- columns=pd.Index(["a"], dtype="string[pyarrow_numpy]"),
+ dtype=pd.StringDtype(na_value=np.nan),
+ columns=pd.Index(["a"], dtype=pd.StringDtype(na_value=np.nan)),
)
tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py
index 59662ec77d52f..746ca3cf6534d 100644
--- a/pandas/tests/io/test_parquet.py
+++ b/pandas/tests/io/test_parquet.py
@@ -55,7 +55,6 @@
pytest.mark.filterwarnings(
"ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
),
- pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False),
]
@@ -64,11 +63,18 @@
params=[
pytest.param(
"fastparquet",
- marks=pytest.mark.skipif(
- not _HAVE_FASTPARQUET
- or _get_option("mode.data_manager", silent=True) == "array",
- reason="fastparquet is not installed or ArrayManager is used",
- ),
+ marks=[
+ pytest.mark.skipif(
+ not _HAVE_FASTPARQUET
+ or _get_option("mode.data_manager", silent=True) == "array",
+ reason="fastparquet is not installed or ArrayManager is used",
+ ),
+ pytest.mark.xfail(
+ using_string_dtype(),
+ reason="TODO(infer_string) fastparquet",
+ strict=False,
+ ),
+ ],
),
pytest.param(
"pyarrow",
@@ -90,17 +96,24 @@ def pa():
@pytest.fixture
-def fp():
+def fp(request):
if not _HAVE_FASTPARQUET:
pytest.skip("fastparquet is not installed")
elif _get_option("mode.data_manager", silent=True) == "array":
pytest.skip("ArrayManager is not supported with fastparquet")
+ if using_string_dtype():
+ request.applymarker(
+ pytest.mark.xfail(reason="TODO(infer_string) fastparquet", strict=False)
+ )
return "fastparquet"
@pytest.fixture
def df_compat():
- return pd.DataFrame({"A": [1, 2, 3], "B": "foo"})
+ # TODO(infer_string) should this give str columns?
+ return pd.DataFrame(
+ {"A": [1, 2, 3], "B": "foo"}, columns=pd.Index(["A", "B"], dtype=object)
+ )
@pytest.fixture
@@ -389,16 +402,6 @@ def check_external_error_on_write(self, df, engine, exc):
with tm.external_error_raised(exc):
to_parquet(df, path, engine, compression=None)
- @pytest.mark.network
- @pytest.mark.single_cpu
- def test_parquet_read_from_url(self, httpserver, datapath, df_compat, engine):
- if engine != "auto":
- pytest.importorskip(engine)
- with open(datapath("io", "data", "parquet", "simple.parquet"), mode="rb") as f:
- httpserver.serve_content(content=f.read())
- df = read_parquet(httpserver.url)
- tm.assert_frame_equal(df, df_compat)
-
class TestBasic(Base):
def test_error(self, engine):
@@ -696,6 +699,16 @@ def test_read_empty_array(self, pa, dtype):
df, pa, read_kwargs={"dtype_backend": "numpy_nullable"}, expected=expected
)
+ @pytest.mark.network
+ @pytest.mark.single_cpu
+ def test_parquet_read_from_url(self, httpserver, datapath, df_compat, engine):
+ if engine != "auto":
+ pytest.importorskip(engine)
+ with open(datapath("io", "data", "parquet", "simple.parquet"), mode="rb") as f:
+ httpserver.serve_content(content=f.read())
+ df = read_parquet(httpserver.url, engine=engine)
+ tm.assert_frame_equal(df, df_compat)
+
class TestParquetPyArrow(Base):
def test_basic(self, pa, df_full):
@@ -925,7 +938,7 @@ def test_write_with_schema(self, pa):
out_df = df.astype(bool)
check_round_trip(df, pa, write_kwargs={"schema": schema}, expected=out_df)
- def test_additional_extension_arrays(self, pa):
+ def test_additional_extension_arrays(self, pa, using_infer_string):
# test additional ExtensionArrays that are supported through the
# __arrow_array__ protocol
pytest.importorskip("pyarrow")
@@ -936,17 +949,25 @@ def test_additional_extension_arrays(self, pa):
"c": pd.Series(["a", None, "c"], dtype="string"),
}
)
- check_round_trip(df, pa)
+ if using_infer_string:
+ check_round_trip(df, pa, expected=df.astype({"c": "str"}))
+ else:
+ check_round_trip(df, pa)
df = pd.DataFrame({"a": pd.Series([1, 2, 3, None], dtype="Int64")})
check_round_trip(df, pa)
- def test_pyarrow_backed_string_array(self, pa, string_storage):
+ def test_pyarrow_backed_string_array(self, pa, string_storage, using_infer_string):
# test ArrowStringArray supported through the __arrow_array__ protocol
pytest.importorskip("pyarrow")
df = pd.DataFrame({"a": pd.Series(["a", None, "c"], dtype="string[pyarrow]")})
with pd.option_context("string_storage", string_storage):
- check_round_trip(df, pa, expected=df.astype(f"string[{string_storage}]"))
+ if using_infer_string:
+ expected = df.astype("str")
+ expected.columns = expected.columns.astype("str")
+ else:
+ expected = df.astype(f"string[{string_storage}]")
+ check_round_trip(df, pa, expected=expected)
def test_additional_extension_types(self, pa):
# test additional ExtensionArrays that are supported through the
@@ -1107,8 +1128,8 @@ def test_string_inference(self, tmp_path, pa):
result = read_parquet(path, engine="pyarrow")
expected = pd.DataFrame(
data={"a": ["x", "y"]},
- dtype="string[pyarrow_numpy]",
- index=pd.Index(["a", "b"], dtype="string[pyarrow_numpy]"),
+ dtype=pd.StringDtype(na_value=np.nan),
+ index=pd.Index(["a", "b"], dtype=pd.StringDtype(na_value=np.nan)),
)
tm.assert_frame_equal(result, expected)
@@ -1138,8 +1159,8 @@ def test_infer_string_large_string_type(self, tmp_path, pa):
result = read_parquet(path)
expected = pd.DataFrame(
data={"a": [None, "b", "c"]},
- dtype="string[pyarrow_numpy]",
- columns=pd.Index(["a"], dtype="string[pyarrow_numpy]"),
+ dtype=pd.StringDtype(na_value=np.nan),
+ columns=pd.Index(["a"], dtype=pd.StringDtype(na_value=np.nan)),
)
tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py
index b1557d71f15e4..514eaceaccbe6 100644
--- a/pandas/tests/io/test_sql.py
+++ b/pandas/tests/io/test_sql.py
@@ -3840,7 +3840,6 @@ class Test(BaseModel):
def test_read_sql_string_inference(sqlite_engine):
conn = sqlite_engine
# GH#54430
- pytest.importorskip("pyarrow")
table = "test"
df = DataFrame({"a": ["x", "y"]})
df.to_sql(table, con=conn, index=False, if_exists="replace")
@@ -3848,7 +3847,7 @@ def test_read_sql_string_inference(sqlite_engine):
with pd.option_context("future.infer_string", True):
result = read_sql_table(table, conn)
- dtype = "string[pyarrow_numpy]"
+ dtype = pd.StringDtype(na_value=np.nan)
expected = DataFrame(
{"a": ["x", "y"]}, dtype=dtype, columns=Index(["a"], dtype=dtype)
)
diff --git a/pandas/tests/libs/test_lib.py b/pandas/tests/libs/test_lib.py
index 8583d8bcc052c..17dae1879f3b8 100644
--- a/pandas/tests/libs/test_lib.py
+++ b/pandas/tests/libs/test_lib.py
@@ -1,3 +1,5 @@
+import pickle
+
import numpy as np
import pytest
@@ -283,3 +285,15 @@ def test_no_default_pickle():
# GH#40397
obj = tm.round_trip_pickle(lib.no_default)
assert obj is lib.no_default
+
+
+def test_ensure_string_array_copy():
+ # ensure the original array is not modified in case of copy=False with
+ # pickle-roundtripped object dtype array
+ # https://github.com/pandas-dev/pandas/issues/54654
+ arr = np.array(["a", None], dtype=object)
+ arr = pickle.loads(pickle.dumps(arr))
+ result = lib.ensure_string_array(arr, copy=False)
+ assert not np.shares_memory(arr, result)
+ assert arr[1] is None
+ assert result[1] is np.nan
diff --git a/pandas/tests/reshape/concat/test_concat.py b/pandas/tests/reshape/concat/test_concat.py
index 2a52d3060e4b9..77c45cf36894b 100644
--- a/pandas/tests/reshape/concat/test_concat.py
+++ b/pandas/tests/reshape/concat/test_concat.py
@@ -46,6 +46,7 @@ def test_append_concat(self):
assert isinstance(result.index, PeriodIndex)
assert result.index[0] == s1.index[0]
+ # test is not written to work with string dtype (checks .base)
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
def test_concat_copy(self, using_array_manager, using_copy_on_write):
df = DataFrame(np.random.default_rng(2).standard_normal((4, 3)))
@@ -80,6 +81,7 @@ def test_concat_copy(self, using_array_manager, using_copy_on_write):
assert arr is df3._mgr.arrays[0]
else:
assert arr.base is not None
+ assert arr.base is not None
# Float block was consolidated.
df4 = DataFrame(np.random.default_rng(2).standard_normal((4, 1)))
diff --git a/pandas/tests/reshape/merge/test_merge_asof.py b/pandas/tests/reshape/merge/test_merge_asof.py
index 11e29f4e10dc4..77a3d64415ace 100644
--- a/pandas/tests/reshape/merge/test_merge_asof.py
+++ b/pandas/tests/reshape/merge/test_merge_asof.py
@@ -4,8 +4,6 @@
import pytest
import pytz
-from pandas._config import using_string_dtype
-
import pandas.util._test_decorators as td
import pandas as pd
@@ -3083,12 +3081,8 @@ def test_on_float_by_int(self):
tm.assert_frame_equal(result, expected)
- @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
- def test_merge_datatype_error_raises(self, using_infer_string):
- if using_infer_string:
- msg = "incompatible merge keys"
- else:
- msg = r"Incompatible merge dtype, .*, both sides must have numeric dtype"
+ def test_merge_datatype_error_raises(self):
+ msg = r"Incompatible merge dtype, .*, both sides must have numeric dtype"
left = pd.DataFrame({"left_val": [1, 5, 10], "a": ["a", "b", "c"]})
right = pd.DataFrame({"right_val": [1, 2, 3, 6, 7], "a": [1, 2, 3, 6, 7]})
diff --git a/pandas/tests/reshape/test_get_dummies.py b/pandas/tests/reshape/test_get_dummies.py
index 2c17b7f6a5a47..637bce59e9e2c 100644
--- a/pandas/tests/reshape/test_get_dummies.py
+++ b/pandas/tests/reshape/test_get_dummies.py
@@ -4,8 +4,6 @@
import numpy as np
import pytest
-from pandas._config import using_string_dtype
-
import pandas.util._test_decorators as td
from pandas.core.dtypes.common import is_integer_dtype
@@ -216,11 +214,10 @@ def test_dataframe_dummies_all_obj(self, df, sparse):
tm.assert_frame_equal(result, expected)
- @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
- def test_dataframe_dummies_string_dtype(self, df, using_infer_string):
+ def test_dataframe_dummies_string_dtype(self, df, any_string_dtype):
# GH44965
df = df[["A", "B"]]
- df = df.astype({"A": "object", "B": "string"})
+ df = df.astype({"A": "str", "B": any_string_dtype})
result = get_dummies(df)
expected = DataFrame(
{
@@ -231,8 +228,7 @@ def test_dataframe_dummies_string_dtype(self, df, using_infer_string):
},
dtype=bool,
)
- if not using_infer_string:
- # infer_string returns numpy bools
+ if any_string_dtype == "string" and any_string_dtype.na_value is pd.NA:
expected[["B_b", "B_c"]] = expected[["B_b", "B_c"]].astype("boolean")
tm.assert_frame_equal(result, expected)
@@ -711,19 +707,17 @@ def test_get_dummies_ea_dtype_dataframe(self, any_numeric_ea_and_arrow_dtype):
)
tm.assert_frame_equal(result, expected)
- @td.skip_if_no("pyarrow")
- def test_get_dummies_ea_dtype(self):
+ @pytest.mark.parametrize("dtype_type", ["string", "category"])
+ def test_get_dummies_ea_dtype(self, dtype_type, string_dtype_no_object):
# GH#56273
- for dtype, exp_dtype in [
- ("string[pyarrow]", "boolean"),
- ("string[pyarrow_numpy]", "bool"),
- (CategoricalDtype(Index(["a"], dtype="string[pyarrow]")), "boolean"),
- (CategoricalDtype(Index(["a"], dtype="string[pyarrow_numpy]")), "bool"),
- ]:
- df = DataFrame({"name": Series(["a"], dtype=dtype), "x": 1})
- result = get_dummies(df)
- expected = DataFrame({"x": 1, "name_a": Series([True], dtype=exp_dtype)})
- tm.assert_frame_equal(result, expected)
+ dtype = string_dtype_no_object
+ exp_dtype = "boolean" if dtype.na_value is pd.NA else "bool"
+ if dtype_type == "category":
+ dtype = CategoricalDtype(Index(["a"], dtype))
+ df = DataFrame({"name": Series(["a"], dtype=dtype), "x": 1})
+ result = get_dummies(df)
+ expected = DataFrame({"x": 1, "name_a": Series([True], dtype=exp_dtype)})
+ tm.assert_frame_equal(result, expected)
@td.skip_if_no("pyarrow")
def test_get_dummies_arrow_dtype(self):
diff --git a/pandas/tests/reshape/test_melt.py b/pandas/tests/reshape/test_melt.py
index cbe2c9b931ee3..72fd72df60761 100644
--- a/pandas/tests/reshape/test_melt.py
+++ b/pandas/tests/reshape/test_melt.py
@@ -3,8 +3,6 @@
import numpy as np
import pytest
-from pandas._config import using_string_dtype
-
import pandas as pd
from pandas import (
DataFrame,
@@ -21,7 +19,7 @@
def df():
res = DataFrame(
np.random.default_rng(2).standard_normal((10, 4)),
- columns=Index(list("ABCD"), dtype=object),
+ columns=Index(list("ABCD")),
index=date_range("2000-01-01", periods=10, freq="B"),
)
res["id1"] = (res["A"] > 0).astype(np.int64)
@@ -83,7 +81,6 @@ def test_default_col_names(self, df):
result2 = df.melt(id_vars=["id1", "id2"])
assert result2.columns.tolist() == ["id1", "id2", "variable", "value"]
- @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
def test_value_vars(self, df):
result3 = df.melt(id_vars=["id1", "id2"], value_vars="A")
assert len(result3) == 10
@@ -100,7 +97,6 @@ def test_value_vars(self, df):
)
tm.assert_frame_equal(result4, expected4)
- @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
@pytest.mark.parametrize("type_", (tuple, list, np.array))
def test_value_vars_types(self, type_, df):
# GH 15348
@@ -181,7 +177,6 @@ def test_tuple_vars_fail_with_multiindex(self, id_vars, value_vars, df1):
with pytest.raises(ValueError, match=msg):
df1.melt(id_vars=id_vars, value_vars=value_vars)
- @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
def test_custom_var_name(self, df, var_name):
result5 = df.melt(var_name=var_name)
assert result5.columns.tolist() == ["var", "value"]
@@ -209,7 +204,6 @@ def test_custom_var_name(self, df, var_name):
)
tm.assert_frame_equal(result9, expected9)
- @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
def test_custom_value_name(self, df, value_name):
result10 = df.melt(value_name=value_name)
assert result10.columns.tolist() == ["variable", "val"]
@@ -239,7 +233,6 @@ def test_custom_value_name(self, df, value_name):
)
tm.assert_frame_equal(result14, expected14)
- @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
def test_custom_var_and_value_name(self, df, value_name, var_name):
result15 = df.melt(var_name=var_name, value_name=value_name)
assert result15.columns.tolist() == ["var", "val"]
@@ -364,7 +357,6 @@ def test_melt_missing_columns_raises(self):
with pytest.raises(KeyError, match=msg):
multi.melt(["A"], ["F"], col_level=0)
- @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
def test_melt_mixed_int_str_id_vars(self):
# GH 29718
df = DataFrame({0: ["foo"], "a": ["bar"], "b": [1], "d": [2]})
@@ -372,6 +364,8 @@ def test_melt_mixed_int_str_id_vars(self):
expected = DataFrame(
{0: ["foo"] * 2, "a": ["bar"] * 2, "variable": list("bd"), "value": [1, 2]}
)
+ # the df's columns are mixed type and thus object -> preserves object dtype
+ expected["variable"] = expected["variable"].astype(object)
tm.assert_frame_equal(result, expected)
def test_melt_mixed_int_str_value_vars(self):
@@ -1205,12 +1199,13 @@ def test_raise_of_column_name_value(self):
):
df.melt(id_vars="value", value_name="value")
- @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
- @pytest.mark.parametrize("dtype", ["O", "string"])
- def test_missing_stubname(self, dtype):
+ def test_missing_stubname(self, request, any_string_dtype, using_infer_string):
+ if using_infer_string and any_string_dtype == "object":
+ # triggers object dtype inference warning of dtype=object
+ request.applymarker(pytest.mark.xfail(reason="TODO(infer_string)"))
# GH46044
df = DataFrame({"id": ["1", "2"], "a-1": [100, 200], "a-2": [300, 400]})
- df = df.astype({"id": dtype})
+ df = df.astype({"id": any_string_dtype})
result = wide_to_long(
df,
stubnames=["a", "b"],
@@ -1226,15 +1221,16 @@ def test_missing_stubname(self, dtype):
{"a": [100, 200, 300, 400], "b": [np.nan] * 4},
index=index,
)
- new_level = expected.index.levels[0].astype(dtype)
+ new_level = expected.index.levels[0].astype(any_string_dtype)
+ if any_string_dtype == "object":
+ new_level = expected.index.levels[0].astype("str")
expected.index = expected.index.set_levels(new_level, level=0)
tm.assert_frame_equal(result, expected)
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
-def test_wide_to_long_pyarrow_string_columns():
+def test_wide_to_long_string_columns(string_storage):
# GH 57066
- pytest.importorskip("pyarrow")
+ string_dtype = pd.StringDtype(string_storage, na_value=np.nan)
df = DataFrame(
{
"ID": {0: 1},
@@ -1244,17 +1240,17 @@ def test_wide_to_long_pyarrow_string_columns():
"D": {0: 1},
}
)
- df.columns = df.columns.astype("string[pyarrow_numpy]")
+ df.columns = df.columns.astype(string_dtype)
result = wide_to_long(
df, stubnames="R", i="ID", j="UNPIVOTED", sep="_", suffix=".*"
)
expected = DataFrame(
[[1, 1], [1, 1], [1, 2]],
- columns=Index(["D", "R"], dtype=object),
+ columns=Index(["D", "R"]),
index=pd.MultiIndex.from_arrays(
[
[1, 1, 1],
- Index(["test1", "test2", "test3"], dtype="string[pyarrow_numpy]"),
+ Index(["test1", "test2", "test3"], dtype=string_dtype),
],
names=["ID", "UNPIVOTED"],
),
diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py
index 9aa13d59a586b..d0858a0ea5558 100644
--- a/pandas/tests/reshape/test_pivot.py
+++ b/pandas/tests/reshape/test_pivot.py
@@ -1081,7 +1081,6 @@ def test_margins_dtype_len(self, data):
tm.assert_frame_equal(expected, result)
- @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
@pytest.mark.parametrize("cols", [(1, 2), ("a", "b"), (1, "b"), ("a", 1)])
def test_pivot_table_multiindex_only(self, cols):
# GH 17038
@@ -1091,7 +1090,7 @@ def test_pivot_table_multiindex_only(self, cols):
expected = DataFrame(
[[4.0, 5.0, 6.0]],
columns=MultiIndex.from_tuples([(1, 1), (2, 2), (3, 3)], names=cols),
- index=Index(["v"], dtype=object),
+ index=Index(["v"], dtype="str" if cols == ("a", "b") else "object"),
)
tm.assert_frame_equal(result, expected)
@@ -2525,13 +2524,16 @@ def test_pivot_empty(self):
expected = DataFrame(index=[], columns=[])
tm.assert_frame_equal(result, expected, check_names=False)
- @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
- @pytest.mark.parametrize("dtype", [object, "string"])
- def test_pivot_integer_bug(self, dtype):
- df = DataFrame(data=[("A", "1", "A1"), ("B", "2", "B2")], dtype=dtype)
+ def test_pivot_integer_bug(self, any_string_dtype):
+ df = DataFrame(
+ data=[("A", "1", "A1"), ("B", "2", "B2")], dtype=any_string_dtype
+ )
result = df.pivot(index=1, columns=0, values=2)
- tm.assert_index_equal(result.columns, Index(["A", "B"], name=0, dtype=dtype))
+ expected_columns = Index(["A", "B"], name=0, dtype=any_string_dtype)
+ if any_string_dtype == "object":
+ expected_columns = expected_columns.astype("str")
+ tm.assert_index_equal(result.columns, expected_columns)
def test_pivot_index_none(self):
# GH#3962
@@ -2613,7 +2615,9 @@ def test_pivot_columns_not_given(self):
with pytest.raises(TypeError, match="missing 1 required keyword-only argument"):
df.pivot() # pylint: disable=missing-kwoa
- @pytest.mark.xfail(using_string_dtype(), reason="None is cast to NaN")
+ @pytest.mark.xfail(
+ using_string_dtype(), reason="TODO(infer_string) None is cast to NaN"
+ )
def test_pivot_columns_is_none(self):
# GH#48293
df = DataFrame({None: [1], "b": 2, "c": 3})
@@ -2629,7 +2633,9 @@ def test_pivot_columns_is_none(self):
expected = DataFrame({1: 3}, index=Index([2], name="b"))
tm.assert_frame_equal(result, expected)
- @pytest.mark.xfail(using_string_dtype(), reason="None is cast to NaN")
+ @pytest.mark.xfail(
+ using_string_dtype(), reason="TODO(infer_string) None is cast to NaN"
+ )
def test_pivot_index_is_none(self):
# GH#48293
df = DataFrame({None: [1], "b": 2, "c": 3})
@@ -2643,7 +2649,9 @@ def test_pivot_index_is_none(self):
expected = DataFrame(3, index=[1], columns=Index([2], name="b"))
tm.assert_frame_equal(result, expected)
- @pytest.mark.xfail(using_string_dtype(), reason="None is cast to NaN")
+ @pytest.mark.xfail(
+ using_string_dtype(), reason="TODO(infer_string) None is cast to NaN"
+ )
def test_pivot_values_is_none(self):
# GH#48293
df = DataFrame({None: [1], "b": 2, "c": 3})
diff --git a/pandas/tests/series/methods/test_astype.py b/pandas/tests/series/methods/test_astype.py
index ef0757ffe4aa8..b9ba03d1e9f41 100644
--- a/pandas/tests/series/methods/test_astype.py
+++ b/pandas/tests/series/methods/test_astype.py
@@ -76,7 +76,7 @@ def test_astype_dict_like(self, dtype_class):
dt1 = dtype_class({"abc": str})
result = ser.astype(dt1)
- expected = Series(["0", "2", "4", "6", "8"], name="abc", dtype=object)
+ expected = Series(["0", "2", "4", "6", "8"], name="abc", dtype="str")
tm.assert_series_equal(result, expected)
dt2 = dtype_class({"abc": "float64"})
@@ -172,10 +172,14 @@ def test_astype_empty_constructor_equality(self, dtype):
)
def test_astype_str_map(self, dtype, series, using_infer_string):
# see GH#4405
+ using_string_dtype = using_infer_string and dtype is str
result = series.astype(dtype)
- expected = series.map(str)
- if using_infer_string:
- expected = expected.astype(object)
+ if using_string_dtype:
+ expected = series.map(lambda val: str(val) if val is not np.nan else np.nan)
+ else:
+ expected = series.map(str)
+ if using_infer_string:
+ expected = expected.astype(object)
tm.assert_series_equal(result, expected)
def test_astype_float_to_period(self):
@@ -212,7 +216,7 @@ def test_astype_dt64_to_str(self):
# GH#10442 : testing astype(str) is correct for Series/DatetimeIndex
dti = date_range("2012-01-01", periods=3)
result = Series(dti).astype(str)
- expected = Series(["2012-01-01", "2012-01-02", "2012-01-03"], dtype=object)
+ expected = Series(["2012-01-01", "2012-01-02", "2012-01-03"], dtype="str")
tm.assert_series_equal(result, expected)
def test_astype_dt64tz_to_str(self):
@@ -225,7 +229,7 @@ def test_astype_dt64tz_to_str(self):
"2012-01-02 00:00:00-05:00",
"2012-01-03 00:00:00-05:00",
],
- dtype=object,
+ dtype="str",
)
tm.assert_series_equal(result, expected)
@@ -285,13 +289,13 @@ def test_astype_str_cast_dt64(self):
ts = Series([Timestamp("2010-01-04 00:00:00")])
res = ts.astype(str)
- expected = Series(["2010-01-04"], dtype=object)
+ expected = Series(["2010-01-04"], dtype="str")
tm.assert_series_equal(res, expected)
ts = Series([Timestamp("2010-01-04 00:00:00", tz="US/Eastern")])
res = ts.astype(str)
- expected = Series(["2010-01-04 00:00:00-05:00"], dtype=object)
+ expected = Series(["2010-01-04 00:00:00-05:00"], dtype="str")
tm.assert_series_equal(res, expected)
def test_astype_str_cast_td64(self):
@@ -300,7 +304,7 @@ def test_astype_str_cast_td64(self):
td = Series([Timedelta(1, unit="d")])
ser = td.astype(str)
- expected = Series(["1 days"], dtype=object)
+ expected = Series(["1 days"], dtype="str")
tm.assert_series_equal(ser, expected)
def test_dt64_series_astype_object(self):
@@ -347,7 +351,7 @@ def test_astype_from_float_to_str(self, dtype):
# https://github.com/pandas-dev/pandas/issues/36451
ser = Series([0.1], dtype=dtype)
result = ser.astype(str)
- expected = Series(["0.1"], dtype=object)
+ expected = Series(["0.1"], dtype="str")
tm.assert_series_equal(result, expected)
@pytest.mark.parametrize(
@@ -358,11 +362,13 @@ def test_astype_from_float_to_str(self, dtype):
(NA, ""),
],
)
- def test_astype_to_str_preserves_na(self, value, string_value):
+ def test_astype_to_str_preserves_na(self, value, string_value, using_infer_string):
# https://github.com/pandas-dev/pandas/issues/36904
ser = Series(["a", "b", value], dtype=object)
result = ser.astype(str)
- expected = Series(["a", "b", string_value], dtype=object)
+ expected = Series(
+ ["a", "b", None if using_infer_string else string_value], dtype="str"
+ )
tm.assert_series_equal(result, expected)
@pytest.mark.parametrize("dtype", ["float32", "float64", "int64", "int32"])
diff --git a/pandas/tests/series/methods/test_map.py b/pandas/tests/series/methods/test_map.py
index ac489b2579e05..e5281a18236da 100644
--- a/pandas/tests/series/methods/test_map.py
+++ b/pandas/tests/series/methods/test_map.py
@@ -553,13 +553,11 @@ def f(x):
(list(range(3)), {0: 42}, [42] + [np.nan] * 3),
],
)
-def test_map_missing_mixed(vals, mapping, exp, using_infer_string):
+def test_map_missing_mixed(vals, mapping, exp):
# GH20495
s = Series(vals + [np.nan])
result = s.map(mapping)
exp = Series(exp)
- if using_infer_string and mapping == {np.nan: "not NaN"}:
- exp.iloc[-1] = np.nan
tm.assert_series_equal(result, exp)
diff --git a/pandas/tests/series/methods/test_rank.py b/pandas/tests/series/methods/test_rank.py
index 24cf97c05c0a8..1c3ebe5653ce3 100644
--- a/pandas/tests/series/methods/test_rank.py
+++ b/pandas/tests/series/methods/test_rank.py
@@ -33,7 +33,8 @@ def ser():
["max", np.array([2, 6, 7, 4, np.nan, 4, 2, 8, np.nan, 6])],
["first", np.array([1, 5, 7, 3, np.nan, 4, 2, 8, np.nan, 6])],
["dense", np.array([1, 3, 4, 2, np.nan, 2, 1, 5, np.nan, 3])],
- ]
+ ],
+ ids=lambda x: x[0],
)
def results(request):
return request.param
@@ -48,12 +49,29 @@ def results(request):
"Int64",
pytest.param("float64[pyarrow]", marks=td.skip_if_no("pyarrow")),
pytest.param("int64[pyarrow]", marks=td.skip_if_no("pyarrow")),
+ pytest.param("string[pyarrow]", marks=td.skip_if_no("pyarrow")),
+ "string[python]",
+ "str",
]
)
def dtype(request):
return request.param
+def expected_dtype(dtype, method, pct=False):
+ exp_dtype = "float64"
+ # elif dtype in ["Int64", "Float64", "string[pyarrow]", "string[python]"]:
+ if dtype in ["string[pyarrow]"]:
+ exp_dtype = "Float64"
+ elif dtype in ["float64[pyarrow]", "int64[pyarrow]"]:
+ if method == "average" or pct:
+ exp_dtype = "double[pyarrow]"
+ else:
+ exp_dtype = "uint64[pyarrow]"
+
+ return exp_dtype
+
+
class TestSeriesRank:
def test_rank(self, datetime_series):
sp_stats = pytest.importorskip("scipy.stats")
@@ -241,12 +259,18 @@ def test_rank_signature(self):
with pytest.raises(ValueError, match=msg):
s.rank("average")
- @pytest.mark.parametrize("dtype", [None, object])
- def test_rank_tie_methods(self, ser, results, dtype):
+ def test_rank_tie_methods(self, ser, results, dtype, using_infer_string):
method, exp = results
+ if (
+ dtype == "int64"
+ or dtype == "Int64"
+ or (not using_infer_string and dtype == "str")
+ ):
+ pytest.skip("int64/str does not support NaN")
+
ser = ser if dtype is None else ser.astype(dtype)
result = ser.rank(method=method)
- tm.assert_series_equal(result, Series(exp))
+ tm.assert_series_equal(result, Series(exp, dtype=expected_dtype(dtype, method)))
@pytest.mark.parametrize("ascending", [True, False])
@pytest.mark.parametrize("method", ["average", "min", "max", "first", "dense"])
@@ -346,25 +370,35 @@ def test_rank_methods_series(self, method, op, value):
],
)
def test_rank_dense_method(self, dtype, ser, exp):
+ if ser[0] < 0 and dtype.startswith("str"):
+ exp = exp[::-1]
s = Series(ser).astype(dtype)
result = s.rank(method="dense")
- expected = Series(exp).astype(result.dtype)
+ expected = Series(exp).astype(expected_dtype(dtype, "dense"))
tm.assert_series_equal(result, expected)
- def test_rank_descending(self, ser, results, dtype):
+ def test_rank_descending(self, ser, results, dtype, using_infer_string):
method, _ = results
- if "i" in dtype:
+ if dtype == "int64" or (not using_infer_string and dtype == "str"):
s = ser.dropna()
else:
s = ser.astype(dtype)
res = s.rank(ascending=False)
- expected = (s.max() - s).rank()
- tm.assert_series_equal(res, expected)
+ if dtype.startswith("str"):
+ expected = (s.astype("float64").max() - s.astype("float64")).rank()
+ else:
+ expected = (s.max() - s).rank()
+ tm.assert_series_equal(res, expected.astype(expected_dtype(dtype, "average")))
- expected = (s.max() - s).rank(method=method)
+ if dtype.startswith("str"):
+ expected = (s.astype("float64").max() - s.astype("float64")).rank(
+ method=method
+ )
+ else:
+ expected = (s.max() - s).rank(method=method)
res2 = s.rank(method=method, ascending=False)
- tm.assert_series_equal(res2, expected)
+ tm.assert_series_equal(res2, expected.astype(expected_dtype(dtype, method)))
def test_rank_int(self, ser, results):
method, exp = results
@@ -421,9 +455,11 @@ def test_rank_ea_small_values(self):
],
)
def test_rank_dense_pct(dtype, ser, exp):
+ if ser[0] < 0 and dtype.startswith("str"):
+ exp = exp[::-1]
s = Series(ser).astype(dtype)
result = s.rank(method="dense", pct=True)
- expected = Series(exp).astype(result.dtype)
+ expected = Series(exp).astype(expected_dtype(dtype, "dense", pct=True))
tm.assert_series_equal(result, expected)
@@ -442,9 +478,11 @@ def test_rank_dense_pct(dtype, ser, exp):
],
)
def test_rank_min_pct(dtype, ser, exp):
+ if ser[0] < 0 and dtype.startswith("str"):
+ exp = exp[::-1]
s = Series(ser).astype(dtype)
result = s.rank(method="min", pct=True)
- expected = Series(exp).astype(result.dtype)
+ expected = Series(exp).astype(expected_dtype(dtype, "min", pct=True))
tm.assert_series_equal(result, expected)
@@ -463,9 +501,11 @@ def test_rank_min_pct(dtype, ser, exp):
],
)
def test_rank_max_pct(dtype, ser, exp):
+ if ser[0] < 0 and dtype.startswith("str"):
+ exp = exp[::-1]
s = Series(ser).astype(dtype)
result = s.rank(method="max", pct=True)
- expected = Series(exp).astype(result.dtype)
+ expected = Series(exp).astype(expected_dtype(dtype, "max", pct=True))
tm.assert_series_equal(result, expected)
@@ -484,9 +524,11 @@ def test_rank_max_pct(dtype, ser, exp):
],
)
def test_rank_average_pct(dtype, ser, exp):
+ if ser[0] < 0 and dtype.startswith("str"):
+ exp = exp[::-1]
s = Series(ser).astype(dtype)
result = s.rank(method="average", pct=True)
- expected = Series(exp).astype(result.dtype)
+ expected = Series(exp).astype(expected_dtype(dtype, "average", pct=True))
tm.assert_series_equal(result, expected)
@@ -505,9 +547,11 @@ def test_rank_average_pct(dtype, ser, exp):
],
)
def test_rank_first_pct(dtype, ser, exp):
+ if ser[0] < 0 and dtype.startswith("str"):
+ exp = exp[::-1]
s = Series(ser).astype(dtype)
result = s.rank(method="first", pct=True)
- expected = Series(exp).astype(result.dtype)
+ expected = Series(exp).astype(expected_dtype(dtype, "first", pct=True))
tm.assert_series_equal(result, expected)
diff --git a/pandas/tests/series/test_arithmetic.py b/pandas/tests/series/test_arithmetic.py
index 1ffc9ddca5adf..a65d7687cfb06 100644
--- a/pandas/tests/series/test_arithmetic.py
+++ b/pandas/tests/series/test_arithmetic.py
@@ -9,8 +9,6 @@
import numpy as np
import pytest
-from pandas._config import using_string_dtype
-
from pandas._libs import lib
from pandas._libs.tslibs import IncompatibleFrequency
@@ -214,9 +212,9 @@ def test_series_integer_mod(self, index):
s1 = Series(range(1, 10))
s2 = Series("foo", index=index)
- msg = "not all arguments converted during string formatting|mod not"
+ msg = "not all arguments converted during string formatting|'mod' not supported"
- with pytest.raises((TypeError, NotImplementedError), match=msg):
+ with pytest.raises(TypeError, match=msg):
s2 % s1
def test_add_with_duplicate_index(self):
@@ -501,28 +499,14 @@ def test_ser_cmp_result_names(self, names, comparison_op):
result = op(ser, cidx)
assert result.name == names[2]
- @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
- def test_comparisons(self, using_infer_string):
+ def test_comparisons(self):
s = Series(["a", "b", "c"])
s2 = Series([False, True, False])
# it works!
exp = Series([False, False, False])
- if using_infer_string:
- import pyarrow as pa
-
- msg = "has no kernel"
- # TODO(3.0) GH56008
- with pytest.raises(pa.lib.ArrowNotImplementedError, match=msg):
- s == s2
- with tm.assert_produces_warning(
- DeprecationWarning, match="comparison", check_stacklevel=False
- ):
- with pytest.raises(pa.lib.ArrowNotImplementedError, match=msg):
- s2 == s
- else:
- tm.assert_series_equal(s == s2, exp)
- tm.assert_series_equal(s2 == s, exp)
+ tm.assert_series_equal(s == s2, exp)
+ tm.assert_series_equal(s2 == s, exp)
# -----------------------------------------------------------------
# Categorical Dtype Comparisons
diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py
index 6efe0bcb8b45d..60b2ec7b6912d 100644
--- a/pandas/tests/series/test_constructors.py
+++ b/pandas/tests/series/test_constructors.py
@@ -230,7 +230,7 @@ def test_constructor_empty(self, input_class, using_infer_string):
# GH 19853 : with empty string, index and dtype str
empty = Series("", dtype=str, index=range(3))
if using_infer_string:
- empty2 = Series("", index=range(3), dtype=object)
+ empty2 = Series("", index=range(3), dtype="str")
else:
empty2 = Series("", index=range(3))
tm.assert_series_equal(empty, empty2)
diff --git a/pandas/tests/series/test_logical_ops.py b/pandas/tests/series/test_logical_ops.py
index a9f1726afc942..26bdfcbc6ec56 100644
--- a/pandas/tests/series/test_logical_ops.py
+++ b/pandas/tests/series/test_logical_ops.py
@@ -6,13 +6,12 @@
from pandas._config import using_string_dtype
-from pandas.compat import HAS_PYARROW
-
from pandas import (
ArrowDtype,
DataFrame,
Index,
Series,
+ StringDtype,
bdate_range,
)
import pandas._testing as tm
@@ -151,10 +150,7 @@ def test_logical_operators_int_dtype_with_bool(self):
expected = Series([False, True, True, True])
tm.assert_series_equal(result, expected)
- @pytest.mark.xfail(
- using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)"
- )
- def test_logical_operators_int_dtype_with_object(self, using_infer_string):
+ def test_logical_operators_int_dtype_with_object(self):
# GH#9016: support bitwise op for integer types
s_0123 = Series(range(4), dtype="int64")
@@ -163,14 +159,10 @@ def test_logical_operators_int_dtype_with_object(self, using_infer_string):
tm.assert_series_equal(result, expected)
s_abNd = Series(["a", "b", np.nan, "d"])
- if using_infer_string:
- import pyarrow as pa
-
- with pytest.raises(pa.lib.ArrowNotImplementedError, match="has no kernel"):
- s_0123 & s_abNd
- else:
- with pytest.raises(TypeError, match="unsupported.* 'int' and 'str'"):
- s_0123 & s_abNd
+ with pytest.raises(
+ TypeError, match="unsupported.* 'int' and 'str'|'rand_' not supported"
+ ):
+ s_0123 & s_abNd
def test_logical_operators_bool_dtype_with_int(self):
index = list("bca")
@@ -368,9 +360,7 @@ def test_reverse_ops_with_index(self, op, expected):
result = op(ser, idx)
tm.assert_series_equal(result, expected)
- @pytest.mark.xfail(
- using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)"
- )
+ @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
def test_logical_ops_label_based(self, using_infer_string):
# GH#4947
# logical ops should be label based
@@ -439,15 +429,12 @@ def test_logical_ops_label_based(self, using_infer_string):
tm.assert_series_equal(result, a[a])
for e in [Series(["z"])]:
- warn = FutureWarning if using_infer_string else None
if using_infer_string:
- import pyarrow as pa
-
- with tm.assert_produces_warning(warn, match="Operation between non"):
- with pytest.raises(
- pa.lib.ArrowNotImplementedError, match="has no kernel"
- ):
- result = a[a | e]
+ # TODO(infer_string) should this behave differently?
+ with pytest.raises(
+ TypeError, match="not supported for dtype|unsupported operand type"
+ ):
+ result = a[a | e]
else:
result = a[a | e]
tm.assert_series_equal(result, a[a])
@@ -547,7 +534,7 @@ def test_pyarrow_numpy_string_invalid(self):
# GH#56008
pa = pytest.importorskip("pyarrow")
ser = Series([False, True])
- ser2 = Series(["a", "b"], dtype="string[pyarrow_numpy]")
+ ser2 = Series(["a", "b"], dtype=StringDtype(na_value=np.nan))
result = ser == ser2
expected_eq = Series(False, index=ser.index)
tm.assert_series_equal(result, expected_eq)
diff --git a/pandas/tests/strings/test_case_justify.py b/pandas/tests/strings/test_case_justify.py
index 41aedae90ca76..819556f961fa3 100644
--- a/pandas/tests/strings/test_case_justify.py
+++ b/pandas/tests/strings/test_case_justify.py
@@ -291,11 +291,7 @@ def test_center_ljust_rjust_mixed_object():
def test_center_ljust_rjust_fillchar(any_string_dtype):
- if any_string_dtype == "string[pyarrow_numpy]":
- pytest.skip(
- "Arrow logic is different, "
- "see https://github.com/pandas-dev/pandas/pull/54533/files#r1299808126",
- )
+ # GH#54533, GH#54792
s = Series(["a", "bb", "cccc", "ddddd", "eeeeee"], dtype=any_string_dtype)
result = s.str.center(5, fillchar="X")
diff --git a/pandas/tests/strings/test_find_replace.py b/pandas/tests/strings/test_find_replace.py
index 78ce1d7418886..2742c5b67e57e 100644
--- a/pandas/tests/strings/test_find_replace.py
+++ b/pandas/tests/strings/test_find_replace.py
@@ -4,7 +4,6 @@
import numpy as np
import pytest
-from pandas.errors import PerformanceWarning
import pandas.util._test_decorators as td
import pandas as pd
@@ -22,10 +21,6 @@
# --------------------------------------------------------------------------------------
-def using_pyarrow(dtype):
- return dtype in ("string[pyarrow]", "string[pyarrow_numpy]")
-
-
def test_contains(any_string_dtype):
values = np.array(
["foo", np.nan, "fooommm__foo", "mmm_", "foommm[_]+bar"], dtype=np.object_
@@ -167,7 +162,16 @@ def test_contains_na_kwarg_for_nullable_string_dtype(
# https://github.com/pandas-dev/pandas/pull/41025#issuecomment-824062416
values = Series(["a", "b", "c", "a", np.nan], dtype=nullable_string_dtype)
- result = values.str.contains("a", na=na, regex=regex)
+
+ msg = (
+ "Allowing a non-bool 'na' in obj.str.contains is deprecated and "
+ "will raise in a future version"
+ )
+ warn = None
+ if not pd.isna(na) and not isinstance(na, bool):
+ warn = FutureWarning
+ with tm.assert_produces_warning(warn, match=msg):
+ result = values.str.contains("a", na=na, regex=regex)
expected = Series([True, False, False, True, expected], dtype="boolean")
tm.assert_series_equal(result, expected)
@@ -233,6 +237,7 @@ def test_contains_nan(any_string_dtype):
expected = Series([True, True, True], dtype=expected_dtype)
tm.assert_series_equal(result, expected)
+ # TODO(infer_string)
# this particular combination of events is broken on 2.3
# would require cherry picking #58483, which in turn requires #57481
# which introduce many behavioral changes
@@ -241,14 +246,19 @@ def test_contains_nan(any_string_dtype):
and any_string_dtype.storage == "python"
and any_string_dtype.na_value is np.nan
):
- result = s.str.contains("foo", na="foo")
+ msg = (
+ "Allowing a non-bool 'na' in obj.str.contains is deprecated and "
+ "will raise in a future version"
+ )
+ with tm.assert_produces_warning(FutureWarning, match=msg):
+ result = s.str.contains("foo", na="foo")
if any_string_dtype == "object":
expected = Series(["foo", "foo", "foo"], dtype=np.object_)
elif any_string_dtype.na_value is np.nan:
expected = Series([True, True, True], dtype=np.bool_)
else:
expected = Series([True, True, True], dtype="boolean")
- tm.assert_series_equal(result, expected)
+ tm.assert_series_equal(result, expected)
result = s.str.contains("foo")
expected_dtype = (
@@ -263,6 +273,40 @@ def test_contains_nan(any_string_dtype):
# --------------------------------------------------------------------------------------
+def test_startswith_endswith_validate_na(request, any_string_dtype):
+ if (
+ any_string_dtype == "string"
+ and any_string_dtype.na_value is np.nan
+ and any_string_dtype.storage == "python"
+ ):
+ request.applymarker(pytest.mark.xfail(reason="TODO(infer_string)"))
+ # GH#59615
+ ser = Series(
+ ["om", np.nan, "foo_nom", "nom", "bar_foo", np.nan, "foo"],
+ dtype=any_string_dtype,
+ )
+
+ dtype = ser.dtype
+ if (
+ isinstance(dtype, pd.StringDtype) and dtype.storage == "python"
+ ) or dtype == np.dtype("object"):
+ msg = "Allowing a non-bool 'na' in obj.str.startswith is deprecated"
+ with tm.assert_produces_warning(FutureWarning, match=msg):
+ ser.str.startswith("kapow", na="baz")
+ msg = "Allowing a non-bool 'na' in obj.str.endswith is deprecated"
+ with tm.assert_produces_warning(FutureWarning, match=msg):
+ ser.str.endswith("bar", na="baz")
+ else:
+ # TODO(infer_string): don't surface pyarrow errors
+ import pyarrow as pa
+
+ msg = "Could not convert 'baz' with type str: tried to convert to boolean"
+ with pytest.raises(pa.lib.ArrowInvalid, match=msg):
+ ser.str.startswith("kapow", na="baz")
+ with pytest.raises(pa.lib.ArrowInvalid, match=msg):
+ ser.str.endswith("kapow", na="baz")
+
+
@pytest.mark.parametrize("pat", ["foo", ("foo", "baz")])
@pytest.mark.parametrize("dtype", ["object", "category"])
@pytest.mark.parametrize("null_value", [None, np.nan, pd.NA])
@@ -413,8 +457,7 @@ def test_replace_mixed_object():
def test_replace_unicode(any_string_dtype):
ser = Series([b"abcd,\xc3\xa0".decode("utf-8")], dtype=any_string_dtype)
expected = Series([b"abcd, \xc3\xa0".decode("utf-8")], dtype=any_string_dtype)
- with tm.maybe_produces_warning(PerformanceWarning, using_pyarrow(any_string_dtype)):
- result = ser.str.replace(r"(?<=\w),(?=\w)", ", ", flags=re.UNICODE, regex=True)
+ result = ser.str.replace(r"(?<=\w),(?=\w)", ", ", flags=re.UNICODE, regex=True)
tm.assert_series_equal(result, expected)
@@ -434,8 +477,7 @@ def test_replace_callable(any_string_dtype):
# test with callable
repl = lambda m: m.group(0).swapcase()
- with tm.maybe_produces_warning(PerformanceWarning, using_pyarrow(any_string_dtype)):
- result = ser.str.replace("[a-z][A-Z]{2}", repl, n=2, regex=True)
+ result = ser.str.replace("[a-z][A-Z]{2}", repl, n=2, regex=True)
expected = Series(["foObaD__baRbaD", np.nan], dtype=any_string_dtype)
tm.assert_series_equal(result, expected)
@@ -453,10 +495,7 @@ def test_replace_callable_raises(any_string_dtype, repl):
r"(?(3)required )positional arguments?"
)
with pytest.raises(TypeError, match=msg):
- with tm.maybe_produces_warning(
- PerformanceWarning, using_pyarrow(any_string_dtype)
- ):
- values.str.replace("a", repl, regex=True)
+ values.str.replace("a", repl, regex=True)
def test_replace_callable_named_groups(any_string_dtype):
@@ -464,8 +503,7 @@ def test_replace_callable_named_groups(any_string_dtype):
ser = Series(["Foo Bar Baz", np.nan], dtype=any_string_dtype)
pat = r"(?P\w+) (?P\w+) (?P\w+)"
repl = lambda m: m.group("middle").swapcase()
- with tm.maybe_produces_warning(PerformanceWarning, using_pyarrow(any_string_dtype)):
- result = ser.str.replace(pat, repl, regex=True)
+ result = ser.str.replace(pat, repl, regex=True)
expected = Series(["bAR", np.nan], dtype=any_string_dtype)
tm.assert_series_equal(result, expected)
@@ -476,13 +514,11 @@ def test_replace_compiled_regex(any_string_dtype):
# test with compiled regex
pat = re.compile(r"BAD_*")
- with tm.maybe_produces_warning(PerformanceWarning, using_pyarrow(any_string_dtype)):
- result = ser.str.replace(pat, "", regex=True)
+ result = ser.str.replace(pat, "", regex=True)
expected = Series(["foobar", np.nan], dtype=any_string_dtype)
tm.assert_series_equal(result, expected)
- with tm.maybe_produces_warning(PerformanceWarning, using_pyarrow(any_string_dtype)):
- result = ser.str.replace(pat, "", n=1, regex=True)
+ result = ser.str.replace(pat, "", n=1, regex=True)
expected = Series(["foobarBAD", np.nan], dtype=any_string_dtype)
tm.assert_series_equal(result, expected)
@@ -503,8 +539,7 @@ def test_replace_compiled_regex_unicode(any_string_dtype):
ser = Series([b"abcd,\xc3\xa0".decode("utf-8")], dtype=any_string_dtype)
expected = Series([b"abcd, \xc3\xa0".decode("utf-8")], dtype=any_string_dtype)
pat = re.compile(r"(?<=\w),(?=\w)", flags=re.UNICODE)
- with tm.maybe_produces_warning(PerformanceWarning, using_pyarrow(any_string_dtype)):
- result = ser.str.replace(pat, ", ", regex=True)
+ result = ser.str.replace(pat, ", ", regex=True)
tm.assert_series_equal(result, expected)
@@ -531,8 +566,7 @@ def test_replace_compiled_regex_callable(any_string_dtype):
ser = Series(["fooBAD__barBAD", np.nan], dtype=any_string_dtype)
repl = lambda m: m.group(0).swapcase()
pat = re.compile("[a-z][A-Z]{2}")
- with tm.maybe_produces_warning(PerformanceWarning, using_pyarrow(any_string_dtype)):
- result = ser.str.replace(pat, repl, n=2, regex=True)
+ result = ser.str.replace(pat, repl, n=2, regex=True)
expected = Series(["foObaD__baRbaD", np.nan], dtype=any_string_dtype)
tm.assert_series_equal(result, expected)
@@ -580,8 +614,7 @@ def test_replace_moar(any_string_dtype):
)
tm.assert_series_equal(result, expected)
- with tm.maybe_produces_warning(PerformanceWarning, using_pyarrow(any_string_dtype)):
- result = ser.str.replace("A", "YYY", case=False)
+ result = ser.str.replace("A", "YYY", case=False)
expected = Series(
[
"YYY",
@@ -599,8 +632,7 @@ def test_replace_moar(any_string_dtype):
)
tm.assert_series_equal(result, expected)
- with tm.maybe_produces_warning(PerformanceWarning, using_pyarrow(any_string_dtype)):
- result = ser.str.replace("^.a|dog", "XX-XX ", case=False, regex=True)
+ result = ser.str.replace("^.a|dog", "XX-XX ", case=False, regex=True)
expected = Series(
[
"A",
@@ -623,13 +655,11 @@ def test_replace_not_case_sensitive_not_regex(any_string_dtype):
# https://github.com/pandas-dev/pandas/issues/41602
ser = Series(["A.", "a.", "Ab", "ab", np.nan], dtype=any_string_dtype)
- with tm.maybe_produces_warning(PerformanceWarning, using_pyarrow(any_string_dtype)):
- result = ser.str.replace("a", "c", case=False, regex=False)
+ result = ser.str.replace("a", "c", case=False, regex=False)
expected = Series(["c.", "c.", "cb", "cb", np.nan], dtype=any_string_dtype)
tm.assert_series_equal(result, expected)
- with tm.maybe_produces_warning(PerformanceWarning, using_pyarrow(any_string_dtype)):
- result = ser.str.replace("a.", "c.", case=False, regex=False)
+ result = ser.str.replace("a.", "c.", case=False, regex=False)
expected = Series(["c.", "c.", "Ab", "ab", np.nan], dtype=any_string_dtype)
tm.assert_series_equal(result, expected)
@@ -801,8 +831,7 @@ def test_fullmatch_case_kwarg(any_string_dtype):
result = ser.str.fullmatch("ab", case=False)
tm.assert_series_equal(result, expected)
- with tm.maybe_produces_warning(PerformanceWarning, using_pyarrow(any_string_dtype)):
- result = ser.str.fullmatch("ab", flags=re.IGNORECASE)
+ result = ser.str.fullmatch("ab", flags=re.IGNORECASE)
tm.assert_series_equal(result, expected)
@@ -987,17 +1016,13 @@ def test_flags_kwarg(any_string_dtype):
pat = r"([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})"
- use_pyarrow = using_pyarrow(any_string_dtype)
-
result = data.str.extract(pat, flags=re.IGNORECASE, expand=True)
assert result.iloc[0].tolist() == ["dave", "google", "com"]
- with tm.maybe_produces_warning(PerformanceWarning, use_pyarrow):
- result = data.str.match(pat, flags=re.IGNORECASE)
+ result = data.str.match(pat, flags=re.IGNORECASE)
assert result.iloc[0]
- with tm.maybe_produces_warning(PerformanceWarning, use_pyarrow):
- result = data.str.fullmatch(pat, flags=re.IGNORECASE)
+ result = data.str.fullmatch(pat, flags=re.IGNORECASE)
assert result.iloc[0]
result = data.str.findall(pat, flags=re.IGNORECASE)
@@ -1007,8 +1032,6 @@ def test_flags_kwarg(any_string_dtype):
assert result.iloc[0] == 1
msg = "has match groups"
- with tm.assert_produces_warning(
- UserWarning, match=msg, raise_on_extra_warnings=not use_pyarrow
- ):
+ with tm.assert_produces_warning(UserWarning, match=msg):
result = data.str.contains(pat, flags=re.IGNORECASE)
assert result.iloc[0]
diff --git a/pandas/tests/strings/test_string_array.py b/pandas/tests/strings/test_string_array.py
index 0b3f368afea5e..517ddb164985c 100644
--- a/pandas/tests/strings/test_string_array.py
+++ b/pandas/tests/strings/test_string_array.py
@@ -12,7 +12,6 @@
)
-@pytest.mark.filterwarnings("ignore:Falling back")
def test_string_array(nullable_string_dtype, any_string_method):
method_name, args, kwargs = any_string_method
diff --git a/pandas/tests/strings/test_strings.py b/pandas/tests/strings/test_strings.py
index 015df18221b40..40b6c69dc8025 100644
--- a/pandas/tests/strings/test_strings.py
+++ b/pandas/tests/strings/test_strings.py
@@ -393,6 +393,7 @@ def test_pipe_failures(any_string_dtype):
(2, 5, None, ["foo", "bar", np.nan, "baz"]),
(0, 3, -1, ["", "", np.nan, ""]),
(None, None, -1, ["owtoofaa", "owtrabaa", np.nan, "xuqzabaa"]),
+ (None, 2, -1, ["owtoo", "owtra", np.nan, "xuqza"]),
(3, 10, 2, ["oto", "ato", np.nan, "aqx"]),
(3, 0, -1, ["ofa", "aba", np.nan, "aba"]),
],
diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py
index f3a7ba2607f4a..a7c2ec5acb7c2 100644
--- a/pandas/tests/test_algos.py
+++ b/pandas/tests/test_algos.py
@@ -1900,13 +1900,16 @@ def test_strobj_mode(self):
tm.assert_series_equal(ser.mode(), exp)
@pytest.mark.parametrize("dt", [str, object])
- def test_strobj_multi_char(self, dt):
+ def test_strobj_multi_char(self, dt, using_infer_string):
exp = ["bar"]
data = ["foo"] * 2 + ["bar"] * 3
ser = Series(data, dtype=dt)
exp = Series(exp, dtype=dt)
- tm.assert_numpy_array_equal(algos.mode(ser.values), exp.values)
+ if using_infer_string and dt is str:
+ tm.assert_extension_array_equal(algos.mode(ser.values), exp.values)
+ else:
+ tm.assert_numpy_array_equal(algos.mode(ser.values), exp.values)
tm.assert_series_equal(ser.mode(), exp)
def test_datelike_mode(self):
diff --git a/pandas/tests/util/test_shares_memory.py b/pandas/tests/util/test_shares_memory.py
index 00a897d574a07..8f1ac93b40247 100644
--- a/pandas/tests/util/test_shares_memory.py
+++ b/pandas/tests/util/test_shares_memory.py
@@ -1,3 +1,5 @@
+import numpy as np
+
import pandas.util._test_decorators as td
import pandas as pd
@@ -20,10 +22,10 @@ def test_shares_memory_string():
# GH#55823
import pyarrow as pa
- obj = pd.array(["a", "b"], dtype="string[pyarrow]")
+ obj = pd.array(["a", "b"], dtype=pd.StringDtype("pyarrow", na_value=pd.NA))
assert tm.shares_memory(obj, obj)
- obj = pd.array(["a", "b"], dtype="string[pyarrow_numpy]")
+ obj = pd.array(["a", "b"], dtype=pd.StringDtype("pyarrow", na_value=np.nan))
assert tm.shares_memory(obj, obj)
obj = pd.array(["a", "b"], dtype=pd.ArrowDtype(pa.string()))
diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py
index acf636616421f..f353a7fa2f0fe 100644
--- a/pandas/tests/window/test_rolling.py
+++ b/pandas/tests/window/test_rolling.py
@@ -6,10 +6,7 @@
import numpy as np
import pytest
-from pandas._config import using_string_dtype
-
from pandas.compat import (
- HAS_PYARROW,
IS64,
is_platform_arm,
is_platform_power,
@@ -1423,9 +1420,6 @@ def test_rolling_corr_timedelta_index(index, window):
tm.assert_almost_equal(result, expected)
-@pytest.mark.xfail(
- using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)"
-)
def test_groupby_rolling_nan_included():
# GH 35542
data = {"group": ["g1", np.nan, "g1", "g2", np.nan], "B": [0, 1, 2, 3, 4]}