Skip to content
Merged
1 change: 1 addition & 0 deletions .github/workflows/unit-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,7 @@ jobs:
- name: "Pyarrow Nightly"
env_file: actions-311-pyarrownightly.yaml
pattern: "not slow and not network and not single_cpu"
pandas_future_infer_string: "1"
platform: ubuntu-22.04
fail-fast: false
name: ${{ matrix.name || format('ubuntu-latest {0}', matrix.env_file) }}-${{ matrix.platform }}
Expand Down
2 changes: 1 addition & 1 deletion ci/deps/actions-311-pyarrownightly.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ dependencies:

- pip:
- "tzdata>=2022.7"
- "--extra-index-url https://pypi.fury.io/arrow-nightlies/"
- "--extra-index-url https://pypi.anaconda.org/scientific-python-nightly-wheels/simple"
- "--prefer-binary"
- "--pre"
- "pyarrow"
2 changes: 2 additions & 0 deletions pandas/compat/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
pa_version_under16p0,
pa_version_under17p0,
pa_version_under18p0,
pa_version_under19p0,
)

if TYPE_CHECKING:
Expand Down Expand Up @@ -166,4 +167,5 @@ def is_ci_environment() -> bool:
"pa_version_under16p0",
"pa_version_under17p0",
"pa_version_under18p0",
"pa_version_under19p0",
]
2 changes: 2 additions & 0 deletions pandas/compat/pyarrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
pa_version_under16p0 = _palv < Version("16.0.0")
pa_version_under17p0 = _palv < Version("17.0.0")
pa_version_under18p0 = _palv < Version("18.0.0")
pa_version_under19p0 = _palv < Version("19.0.0")
HAS_PYARROW = True
except ImportError:
pa_version_under10p1 = True
Expand All @@ -30,4 +31,5 @@
pa_version_under16p0 = True
pa_version_under17p0 = True
pa_version_under18p0 = True
pa_version_under19p0 = True
HAS_PYARROW = False
10 changes: 8 additions & 2 deletions pandas/io/_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,10 @@
from pandas._config import using_string_dtype

from pandas._libs import lib
from pandas.compat import pa_version_under18p0
from pandas.compat import (
pa_version_under18p0,
pa_version_under19p0,
)
from pandas.compat._optional import import_optional_dependency

import pandas as pd
Expand Down Expand Up @@ -77,7 +80,10 @@ def arrow_table_to_pandas(
elif dtype_backend == "pyarrow":
types_mapper = pd.ArrowDtype
elif using_string_dtype():
types_mapper = _arrow_string_types_mapper()
if pa_version_under19p0:
types_mapper = _arrow_string_types_mapper()
else:
types_mapper = None
elif dtype_backend is lib.no_default or dtype_backend == "numpy":
types_mapper = None
else:
Expand Down
22 changes: 20 additions & 2 deletions pandas/tests/arrays/string_/test_string.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,10 @@

from pandas._config import using_string_dtype

from pandas.compat.pyarrow import pa_version_under12p0
from pandas.compat.pyarrow import (
pa_version_under12p0,
pa_version_under19p0,
)

from pandas.core.dtypes.common import is_dtype_equal

Expand Down Expand Up @@ -539,7 +542,7 @@ def test_arrow_roundtrip(dtype, string_storage, using_infer_string):
assert table.field("a").type == "large_string"
with pd.option_context("string_storage", string_storage):
result = table.to_pandas()
if dtype.na_value is np.nan and not using_string_dtype():
if dtype.na_value is np.nan and not using_infer_string:
assert result["a"].dtype == "object"
else:
assert isinstance(result["a"].dtype, pd.StringDtype)
Expand All @@ -553,6 +556,21 @@ def test_arrow_roundtrip(dtype, string_storage, using_infer_string):
assert result.loc[2, "a"] is result["a"].dtype.na_value


@pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning")
def test_arrow_from_string(using_infer_string):
    """Convert a plain pyarrow string table (no pandas metadata) to pandas.

    This is not a roundtrip: the table is built directly with pyarrow.
    The resulting column is expected to be ``str`` only when string
    inference is enabled and pyarrow is at least 19.0; in every other
    case it is expected to be ``object``.
    """
    pa = pytest.importorskip("pyarrow")

    values = ["a", "b", None]
    arrow_table = pa.table({"a": pa.array(values, type=pa.string())})
    result = arrow_table.to_pandas()

    # The "str" expectation is gated on both the infer-string option and
    # a pyarrow version of 19.0 or newer.
    infers_str = using_infer_string and not pa_version_under19p0
    expected_dtype = "str" if infers_str else "object"
    expected = pd.DataFrame({"a": values}, dtype=expected_dtype)

    tm.assert_frame_equal(result, expected)


@pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning")
def test_arrow_load_from_zero_chunks(dtype, string_storage, using_infer_string):
# GH-41040
Expand Down
5 changes: 3 additions & 2 deletions pandas/tests/io/test_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
WASM,
is_platform_windows,
)
from pandas.compat.pyarrow import pa_version_under19p0
import pandas.util._test_decorators as td

import pandas as pd
Expand Down Expand Up @@ -152,8 +153,8 @@ def test_get_handle_pyarrow_compat(self):
s = StringIO(data)
with icom.get_handle(s, "rb", is_text=False) as handles:
df = pa_csv.read_csv(handles.handle).to_pandas()
# TODO will have to update this when pyarrow' to_pandas() is fixed
expected = expected.astype("object")
if pa_version_under19p0:
expected = expected.astype("object")
tm.assert_frame_equal(df, expected)
assert not s.closed

Expand Down
18 changes: 16 additions & 2 deletions pandas/tests/io/test_feather.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,10 @@
import numpy as np
import pytest

from pandas.compat.pyarrow import pa_version_under18p0
from pandas.compat.pyarrow import (
pa_version_under18p0,
pa_version_under19p0,
)

import pandas as pd
import pandas._testing as tm
Expand Down Expand Up @@ -239,16 +242,27 @@ def test_invalid_dtype_backend(self):
with pytest.raises(ValueError, match=msg):
read_feather(path, dtype_backend="numpy")

def test_string_inference(self, tmp_path):
def test_string_inference(self, tmp_path, using_infer_string):
# GH#54431
path = tmp_path / "test_string_inference.p"
df = pd.DataFrame(data={"a": ["x", "y"]})
df.to_feather(path)
with pd.option_context("future.infer_string", True):
result = read_feather(path)
dtype = pd.StringDtype(na_value=np.nan)
expected = pd.DataFrame(
data={"a": ["x", "y"]}, dtype=pd.StringDtype(na_value=np.nan)
)
expected = pd.DataFrame(
data={"a": ["x", "y"]},
dtype=dtype,
columns=pd.Index(
["a"],
dtype=object
if pa_version_under19p0 and not using_infer_string
else dtype,
),
)
tm.assert_frame_equal(result, expected)

@pytest.mark.skipif(pa_version_under18p0, reason="not supported before 18.0")
Expand Down
65 changes: 43 additions & 22 deletions pandas/tests/io/test_parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
pa_version_under13p0,
pa_version_under15p0,
pa_version_under17p0,
pa_version_under19p0,
)

import pandas as pd
Expand Down Expand Up @@ -254,8 +255,10 @@ def test_invalid_engine(df_compat):
check_round_trip(df_compat, "foo", "bar")


def test_options_py(df_compat, pa):
def test_options_py(df_compat, pa, using_infer_string):
# use the set option
if using_infer_string and not pa_version_under19p0:
df_compat.columns = df_compat.columns.astype("str")

with pd.option_context("io.parquet.engine", "pyarrow"):
check_round_trip(df_compat)
Expand Down Expand Up @@ -784,18 +787,21 @@ def test_unsupported_float16_cleanup(self, pa, path_type):

def test_categorical(self, pa):
# supported in >= 0.7.0
df = pd.DataFrame()
df["a"] = pd.Categorical(list("abcdef"))

# test for null, out-of-order values, and unobserved category
df["b"] = pd.Categorical(
["bar", "foo", "foo", "bar", None, "bar"],
dtype=pd.CategoricalDtype(["foo", "bar", "baz"]),
)

# test for ordered flag
df["c"] = pd.Categorical(
["a", "b", "c", "a", "c", "b"], categories=["b", "c", "d"], ordered=True
df = pd.DataFrame(
{
"a": pd.Categorical(list("abcdef")),
# test for null, out-of-order values, and unobserved category
"b": pd.Categorical(
["bar", "foo", "foo", "bar", None, "bar"],
dtype=pd.CategoricalDtype(["foo", "bar", "baz"]),
),
# test for ordered flag
"c": pd.Categorical(
["a", "b", "c", "a", "c", "b"],
categories=["b", "c", "d"],
ordered=True,
),
}
)

check_round_trip(df, pa)
Expand Down Expand Up @@ -858,11 +864,13 @@ def test_s3_roundtrip_for_dir(
repeat=1,
)

def test_read_file_like_obj_support(self, df_compat):
def test_read_file_like_obj_support(self, df_compat, using_infer_string):
pytest.importorskip("pyarrow")
buffer = BytesIO()
df_compat.to_parquet(buffer)
df_from_buf = read_parquet(buffer)
if using_infer_string and not pa_version_under19p0:
df_compat.columns = df_compat.columns.astype("str")
tm.assert_frame_equal(df_compat, df_from_buf)

def test_expand_user(self, df_compat, monkeypatch):
Expand Down Expand Up @@ -929,7 +937,7 @@ def test_additional_extension_arrays(self, pa, using_infer_string):
"c": pd.Series(["a", None, "c"], dtype="string"),
}
)
if using_infer_string:
if using_infer_string and pa_version_under19p0:
check_round_trip(df, pa, expected=df.astype({"c": "str"}))
else:
check_round_trip(df, pa)
Expand All @@ -943,7 +951,10 @@ def test_pyarrow_backed_string_array(self, pa, string_storage, using_infer_strin
df = pd.DataFrame({"a": pd.Series(["a", None, "c"], dtype="string[pyarrow]")})
with pd.option_context("string_storage", string_storage):
if using_infer_string:
expected = df.astype("str")
if pa_version_under19p0:
expected = df.astype("str")
else:
expected = df.astype(f"string[{string_storage}]")
expected.columns = expected.columns.astype("str")
else:
expected = df.astype(f"string[{string_storage}]")
Expand Down Expand Up @@ -1099,17 +1110,24 @@ def test_df_attrs_persistence(self, tmp_path, pa):
new_df = read_parquet(path, engine=pa)
assert new_df.attrs == df.attrs

def test_string_inference(self, tmp_path, pa):
def test_string_inference(self, tmp_path, pa, using_infer_string):
# GH#54431
path = tmp_path / "test_string_inference.p"
df = pd.DataFrame(data={"a": ["x", "y"]}, index=["a", "b"])
df.to_parquet(path, engine="pyarrow")
df.to_parquet(path, engine=pa)
with pd.option_context("future.infer_string", True):
result = read_parquet(path, engine="pyarrow")
result = read_parquet(path, engine=pa)
dtype = pd.StringDtype(na_value=np.nan)
expected = pd.DataFrame(
data={"a": ["x", "y"]},
dtype=pd.StringDtype(na_value=np.nan),
index=pd.Index(["a", "b"], dtype=pd.StringDtype(na_value=np.nan)),
dtype=dtype,
index=pd.Index(["a", "b"], dtype=dtype),
columns=pd.Index(
["a"],
dtype=object
if pa_version_under19p0 and not using_infer_string
else dtype,
),
)
tm.assert_frame_equal(result, expected)

Expand All @@ -1122,7 +1140,10 @@ def test_roundtrip_decimal(self, tmp_path, pa):
df = pd.DataFrame({"a": [Decimal("123.00")]}, dtype="string[pyarrow]")
df.to_parquet(path, schema=pa.schema([("a", pa.decimal128(5))]))
result = read_parquet(path)
expected = pd.DataFrame({"a": ["123"]}, dtype="string[python]")
if pa_version_under19p0:
expected = pd.DataFrame({"a": ["123"]}, dtype="string[python]")
else:
expected = pd.DataFrame({"a": [Decimal("123.00")]}, dtype="object")
tm.assert_frame_equal(result, expected)

def test_infer_string_large_string_type(self, tmp_path, pa):
Expand Down
Loading