From 71da3b098d2e0ba93ae432b6a7f8486350c4f9fb Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sun, 11 Aug 2024 10:56:18 +0200 Subject: [PATCH 1/3] String dtype: fix pyarrow-based IO + update tests --- pandas/core/arrays/string_arrow.py | 16 +++++--- pandas/io/_util.py | 2 + pandas/tests/io/test_feather.py | 51 ++++++++++-------------- pandas/tests/io/test_fsspec.py | 6 +-- pandas/tests/io/test_gcs.py | 2 +- pandas/tests/io/test_orc.py | 25 ++++++------ pandas/tests/io/test_parquet.py | 62 ++++++++++++++++++++---------- 7 files changed, 91 insertions(+), 73 deletions(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index cc37995969f0a..f48aec19685d3 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -130,18 +130,22 @@ class ArrowStringArray(ObjectStringArrayMixin, ArrowExtensionArray, BaseStringAr def __init__(self, values) -> None: _chk_pyarrow_available() - if isinstance(values, (pa.Array, pa.ChunkedArray)) and pa.types.is_string( - values.type + if isinstance(values, (pa.Array, pa.ChunkedArray)) and ( + pa.types.is_string(values.type) + or ( + pa.types.is_dictionary(values.type) + and ( + pa.types.is_string(values.type.value_type) + or pa.types.is_large_string(values.type.value_type) + ) + ) ): values = pc.cast(values, pa.large_string()) super().__init__(values) self._dtype = StringDtype(storage=self._storage, na_value=self._na_value) - if not pa.types.is_large_string(self._pa_array.type) and not ( - pa.types.is_dictionary(self._pa_array.type) - and pa.types.is_large_string(self._pa_array.type.value_type) - ): + if not pa.types.is_large_string(self._pa_array.type): raise ValueError( "ArrowStringArray requires a PyArrow (chunked) array of " "large_string type" diff --git a/pandas/io/_util.py b/pandas/io/_util.py index f502f827faa4e..a1c3318f04466 100644 --- a/pandas/io/_util.py +++ b/pandas/io/_util.py @@ -27,6 +27,8 @@ def _arrow_dtype_mapping() -> dict: pa.string(): pd.StringDtype(), pa.float32(): pd.Float32Dtype(), pa.float64(): pd.Float64Dtype(), + pa.string(): pd.StringDtype(), + pa.large_string(): pd.StringDtype(), } diff --git a/pandas/tests/io/test_feather.py b/pandas/tests/io/test_feather.py index 5aa8f1c69fe44..16037f0e7dcdf 100644 --- a/pandas/tests/io/test_feather.py +++ b/pandas/tests/io/test_feather.py @@ -5,23 +5,15 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - import pandas as pd import pandas._testing as tm -from pandas.core.arrays import ( - ArrowStringArray, - StringArray, -) from pandas.io.feather_format import read_feather, to_feather # isort:skip -pytestmark = [ - pytest.mark.filterwarnings( - "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" - ), - pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False), -] +pytestmark = pytest.mark.filterwarnings( + "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" +) + pa = pytest.importorskip("pyarrow") @@ -154,8 +146,8 @@ def test_path_pathlib(self): def test_passthrough_keywords(self): df = pd.DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=pd.Index(list("ABCD"), dtype=object), - index=pd.Index([f"i-{i}" for i in range(30)], dtype=object), + columns=pd.Index(list("ABCD")), + index=pd.Index([f"i-{i}" for i in range(30)]), ).reset_index() self.check_round_trip(df, write_kwargs={"version": 1}) @@ -169,7 +161,9 @@ def test_http_path(self, feather_file, httpserver): res = read_feather(httpserver.url) tm.assert_frame_equal(expected, res) - def test_read_feather_dtype_backend(self, string_storage, dtype_backend): + def test_read_feather_dtype_backend( + self, string_storage, dtype_backend, using_infer_string + ): # GH#50765 df = pd.DataFrame( { @@ -184,25 +178,20 @@ def test_read_feather_dtype_backend(self, string_storage, dtype_backend): } ) - if string_storage == "python": - string_array = StringArray(np.array(["a", "b", "c"], dtype=np.object_)) - string_array_na = StringArray(np.array(["a", "b", pd.NA], dtype=np.object_)) - - elif dtype_backend == "pyarrow": - from pandas.arrays import ArrowExtensionArray - - string_array = ArrowExtensionArray(pa.array(["a", "b", "c"])) - string_array_na = ArrowExtensionArray(pa.array(["a", "b", None])) - - else: - string_array = ArrowStringArray(pa.array(["a", "b", "c"])) - string_array_na = ArrowStringArray(pa.array(["a", "b", None])) - with tm.ensure_clean() as path: to_feather(df, path) with pd.option_context("mode.string_storage", string_storage): result = read_feather(path, dtype_backend=dtype_backend) + if dtype_backend == "pyarrow": + pa = pytest.importorskip("pyarrow") + if using_infer_string: + string_dtype = pd.ArrowDtype(pa.large_string()) + else: + string_dtype = pd.ArrowDtype(pa.string()) + else: + string_dtype = pd.StringDtype(string_storage) + expected = pd.DataFrame( { "a": pd.Series([1, np.nan, 3], dtype="Int64"), @@ -211,8 +200,8 @@ def test_read_feather_dtype_backend(self, string_storage, dtype_backend): "d": pd.Series([1.5, 2.0, 2.5], dtype="Float64"), "e": pd.Series([True, False, pd.NA], dtype="boolean"), "f": pd.Series([True, False, True], dtype="boolean"), - "g": string_array, - "h": string_array_na, + "g": pd.Series(["a", "b", "c"], dtype=string_dtype), + "h": pd.Series(["a", "b", None], dtype=string_dtype), } ) diff --git a/pandas/tests/io/test_fsspec.py b/pandas/tests/io/test_fsspec.py index 45e0cab2165a7..aa9c47ea0e63c 100644 --- a/pandas/tests/io/test_fsspec.py +++ b/pandas/tests/io/test_fsspec.py @@ -176,7 +176,7 @@ def test_excel_options(fsspectest): assert fsspectest.test[0] == "read" -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string) fastparquet") def test_to_parquet_new_file(cleared_fs, df1): """Regression test for writing to a not-yet-existent GCS Parquet file.""" pytest.importorskip("fastparquet") @@ -205,7 +205,7 @@ def test_arrowparquet_options(fsspectest): assert fsspectest.test[0] == "parquet_read" -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string) fastparquet") def test_fastparquet_options(fsspectest): """Regression test for writing to a not-yet-existent GCS Parquet file.""" pytest.importorskip("fastparquet") @@ -263,7 +263,7 @@ def test_s3_protocols(s3_public_bucket_with_data, tips_file, protocol, s3so): ) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string) fastparquet") @pytest.mark.single_cpu def test_s3_parquet(s3_public_bucket, s3so, df1): pytest.importorskip("fastparquet") diff --git a/pandas/tests/io/test_gcs.py b/pandas/tests/io/test_gcs.py index bf56a5781f7cd..a9e7b2da03a4d 100644 --- a/pandas/tests/io/test_gcs.py +++ b/pandas/tests/io/test_gcs.py @@ -208,7 +208,7 @@ def test_to_csv_compression_encoding_gcs( tm.assert_frame_equal(df, read_df) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string) fastparquet") def test_to_parquet_gcs_new_file(monkeypatch, tmpdir): """Regression test for writing to a not-yet-existent GCS Parquet file.""" pytest.importorskip("fastparquet") diff --git a/pandas/tests/io/test_orc.py b/pandas/tests/io/test_orc.py index a189afbac070d..90133344fdfc9 100644 --- a/pandas/tests/io/test_orc.py +++ b/pandas/tests/io/test_orc.py @@ -9,8 +9,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - import pandas as pd from pandas import read_orc import pandas._testing as tm @@ -20,12 +18,9 @@ import pyarrow as pa -pytestmark = [ - pytest.mark.filterwarnings( - "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" - ), - pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False), -] +pytestmark = pytest.mark.filterwarnings( + "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" +) @pytest.fixture @@ -33,7 +28,7 @@ def dirpath(datapath): return datapath("io", "data", "orc") -def test_orc_reader_empty(dirpath): +def test_orc_reader_empty(dirpath, using_infer_string): columns = [ "boolean1", "byte1", @@ -54,11 +49,12 @@ def test_orc_reader_empty(dirpath): "float32", "float64", "object", - "object", + "str" if using_infer_string else "object", ] expected = pd.DataFrame(index=pd.RangeIndex(0)) for colname, dtype in zip(columns, dtypes): expected[colname] = pd.Series(dtype=dtype) + expected.columns = expected.columns.astype("str") inputfile = os.path.join(dirpath, "TestOrcFile.emptyFile.orc") got = read_orc(inputfile, columns=columns) @@ -305,7 +301,7 @@ def test_orc_writer_dtypes_not_supported(orc_writer_dtypes_not_supported): df.to_orc() -def test_orc_dtype_backend_pyarrow(): +def test_orc_dtype_backend_pyarrow(using_infer_string): pytest.importorskip("pyarrow") df = pd.DataFrame( { @@ -338,6 +334,13 @@ def test_orc_dtype_backend_pyarrow(): for col in df.columns } ) + if using_infer_string: + # ORC does not preserve distinction between string and large string + # -> the default large string comes back as string + string_dtype = pd.ArrowDtype(pa.string()) + expected["string"] = expected["string"].astype(string_dtype) + expected["string_with_nan"] = expected["string_with_nan"].astype(string_dtype) + expected["string_with_none"] = expected["string_with_none"].astype(string_dtype) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 561c718ea5851..00bf86cf9f36a 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -51,7 +51,6 @@ pytest.mark.filterwarnings( "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" ), - pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False), ] @@ -60,10 +59,17 @@ params=[ pytest.param( "fastparquet", - marks=pytest.mark.skipif( - not _HAVE_FASTPARQUET, - reason="fastparquet is not installed", - ), + marks=[ + pytest.mark.skipif( + not _HAVE_FASTPARQUET, + reason="fastparquet is not installed", + ), + pytest.mark.xfail( + using_string_dtype(), + reason="TODO(infer_string) fastparquet", + strict=False, + ), + ], ), pytest.param( "pyarrow", @@ -85,15 +91,22 @@ def pa(): @pytest.fixture -def fp(): +def fp(request): if not _HAVE_FASTPARQUET: pytest.skip("fastparquet is not installed") + if using_string_dtype(): + request.applymarker( + pytest.mark.xfail(reason="TODO(infer_string) fastparquet", strict=False) + ) return "fastparquet" @pytest.fixture def df_compat(): - return pd.DataFrame({"A": [1, 2, 3], "B": "foo"}) + # TODO(infer_string) should this give str columns? + return pd.DataFrame( + {"A": [1, 2, 3], "B": "foo"}, columns=pd.Index(["A", "B"], dtype=object) + ) @pytest.fixture @@ -365,16 +378,6 @@ def check_external_error_on_write(self, df, engine, exc): with tm.external_error_raised(exc): to_parquet(df, path, engine, compression=None) - @pytest.mark.network - @pytest.mark.single_cpu - def test_parquet_read_from_url(self, httpserver, datapath, df_compat, engine): - if engine != "auto": - pytest.importorskip(engine) - with open(datapath("io", "data", "parquet", "simple.parquet"), mode="rb") as f: - httpserver.serve_content(content=f.read()) - df = read_parquet(httpserver.url) - tm.assert_frame_equal(df, df_compat) - class TestBasic(Base): def test_error(self, engine): @@ -672,6 +675,16 @@ def test_read_empty_array(self, pa, dtype): df, pa, read_kwargs={"dtype_backend": "numpy_nullable"}, expected=expected ) + @pytest.mark.network + @pytest.mark.single_cpu + def test_parquet_read_from_url(self, httpserver, datapath, df_compat, engine): + if engine != "auto": + pytest.importorskip(engine) + with open(datapath("io", "data", "parquet", "simple.parquet"), mode="rb") as f: + httpserver.serve_content(content=f.read()) + df = read_parquet(httpserver.url, engine=engine) + tm.assert_frame_equal(df, df_compat) + class TestParquetPyArrow(Base): @pytest.mark.xfail(reason="datetime_with_nat unit doesn't round-trip") @@ -905,7 +918,7 @@ def test_write_with_schema(self, pa): out_df = df.astype(bool) check_round_trip(df, pa, write_kwargs={"schema": schema}, expected=out_df) - def test_additional_extension_arrays(self, pa): + def test_additional_extension_arrays(self, pa, using_infer_string): # test additional ExtensionArrays that are supported through the # __arrow_array__ protocol pytest.importorskip("pyarrow") @@ -916,17 +929,24 @@ def test_additional_extension_arrays(self, pa): "c": pd.Series(["a", None, "c"], dtype="string"), } ) - check_round_trip(df, pa) + if using_infer_string: + check_round_trip(df, pa, expected=df.astype({"c": "str"})) + else: + check_round_trip(df, pa) df = pd.DataFrame({"a": pd.Series([1, 2, 3, None], dtype="Int64")}) check_round_trip(df, pa) - def test_pyarrow_backed_string_array(self, pa, string_storage): + def test_pyarrow_backed_string_array(self, pa, string_storage, using_infer_string): # test ArrowStringArray supported through the __arrow_array__ protocol pytest.importorskip("pyarrow") df = pd.DataFrame({"a": pd.Series(["a", None, "c"], dtype="string[pyarrow]")}) with pd.option_context("string_storage", string_storage): - check_round_trip(df, pa, expected=df.astype(f"string[{string_storage}]")) + if using_infer_string: + expected = df.astype("str") + else: + expected = df.astype(f"string[{string_storage}]") + check_round_trip(df, pa, expected=expected) def test_additional_extension_types(self, pa): # test additional ExtensionArrays that are supported through the From 3ef26fec9003f93392f50b0a89d400fb359c0445 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 20 Aug 2024 21:02:49 +0200 Subject: [PATCH 2/3] fix expected columns dtype --- pandas/tests/io/test_feather.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pandas/tests/io/test_feather.py b/pandas/tests/io/test_feather.py index 16037f0e7dcdf..a1f3babb1ae3b 100644 --- a/pandas/tests/io/test_feather.py +++ b/pandas/tests/io/test_feather.py @@ -215,6 +215,10 @@ def test_read_feather_dtype_backend( } ) + if using_infer_string: + expected.columns = expected.columns.astype( + pd.StringDtype(string_storage, na_value=np.nan) + ) tm.assert_frame_equal(result, expected) def test_int_columns_and_index(self): From ef2f6cba04bcdfc6ac0a575294396595114b6b32 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 21 Aug 2024 21:38:06 +0200 Subject: [PATCH 3/3] another columns fix --- pandas/tests/io/test_parquet.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 339feaba9f4f7..0d0eae25781f1 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -945,6 +945,7 @@ def test_pyarrow_backed_string_array(self, pa, string_storage, using_infer_strin with pd.option_context("string_storage", string_storage): if using_infer_string: expected = df.astype("str") + expected.columns = expected.columns.astype("str") else: expected = df.astype(f"string[{string_storage}]") check_round_trip(df, pa, expected=expected)