From 5f84f2b10fc7c5678fbdabed08684de7638a1094 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 16 Mar 2023 14:39:57 -0700 Subject: [PATCH 1/3] BUG: df.to_parquet with empty columns --- pandas/io/parquet.py | 6 +++++- pandas/tests/io/test_parquet.py | 14 ++++++++++++++ 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index a606cb9287d16..2ce481e295a65 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -163,7 +163,11 @@ def validate_dataframe(df: DataFrame) -> None: each level of the MultiIndex """ ) - elif df.columns.inferred_type not in {"string", "empty"}: + elif not df.columns.empty and df.columns.inferred_type not in { + "string", + "empty", + }: + # GH 52034: RangeIndex.inferred_dtype is always "integer" if empty raise ValueError("parquet must have string column names") # index level names must be strings diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 4ba3776bf6063..9cac89647f794 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -1041,6 +1041,11 @@ def test_read_dtype_backend_pyarrow_config_index(self, pa): expected=expected, ) + def test_empty_columns(self, pa): + # GH 52034 + df = pd.DataFrame(index=pd.Index(["a", "b", "c"], name="custom name")) + check_round_trip(df, pa) + class TestParquetFastParquet(Base): def test_basic(self, fp, df_full): @@ -1281,3 +1286,12 @@ def test_invalid_dtype_backend(self, engine): df.to_parquet(path) with pytest.raises(ValueError, match=msg): read_parquet(path, dtype_backend="numpy") + + def test_empty_columns(self, fp): + # GH 52034 + df = pd.DataFrame(index=pd.Index(["a", "b", "c"], name="custom name")) + expected = pd.DataFrame( + columns=pd.Index([], dtype=object), + index=pd.Index(["a", "b", "c"], name="custom name"), + ) + check_round_trip(df, fp, expected=expected) From 88b82f9c0322824b1f1436db61d9bf1e556d593e Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 16 Mar 2023 17:06:08 -0700 Subject: [PATCH 2/3] Remove validation for non-string index or columns --- doc/source/whatsnew/v2.0.0.rst | 1 + pandas/io/parquet.py | 26 --------- pandas/tests/io/test_parquet.py | 97 ++++++++++++++++++++------------- 3 files changed, 61 insertions(+), 63 deletions(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 55185afc0a098..74319b0444659 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -1295,6 +1295,7 @@ I/O - Bug in :func:`read_csv` when ``engine="pyarrow"`` where ``encoding`` parameter was not handled correctly (:issue:`51302`) - Bug in :func:`read_xml` ignored repeated elements when iterparse is used (:issue:`51183`) - Bug in :class:`ExcelWriter` leaving file handles open if an exception occurred during instantiation (:issue:`51443`) +- Bug in :meth:`DataFrame.to_parquet` where non-string index or columns were raising a ``ValueError`` when ``engine="pyarrow"`` (:issue:`52036`) Period ^^^^^^ diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index 2ce481e295a65..7791ca53a6447 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -21,7 +21,6 @@ import pandas as pd from pandas import ( DataFrame, - MultiIndex, get_option, ) from pandas.core.shared_docs import _shared_docs @@ -152,31 +151,6 @@ def validate_dataframe(df: DataFrame) -> None: if not isinstance(df, DataFrame): raise ValueError("to_parquet only supports IO with DataFrames") - # must have value column names for all index levels (strings only) - if isinstance(df.columns, MultiIndex): - if not all( - x.inferred_type in {"string", "empty"} for x in df.columns.levels - ): - raise ValueError( - """ - parquet must have string column names for all values in - each level of the MultiIndex - """ - ) - elif not df.columns.empty and df.columns.inferred_type not in { - "string", - "empty", - }: - # GH 52034: RangeIndex.inferred_dtype is always "integer" if empty - raise ValueError("parquet must have string column names") - - # index level names must be strings - valid_names = all( - isinstance(name, str) for name in df.index.names if name is not None - ) - if not valid_names: - raise ValueError("Index level names must be strings") - def write(self, df: DataFrame, path, compression, **kwargs): raise AbstractMethodError(self) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 9cac89647f794..18f24ce091482 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -404,25 +404,6 @@ def test_columns_dtypes(self, engine): df.columns = ["foo", "bar"] check_round_trip(df, engine) - def test_columns_dtypes_invalid(self, engine): - df = pd.DataFrame({"string": list("abc"), "int": list(range(1, 4))}) - - msg = "parquet must have string column names" - # numeric - df.columns = [0, 1] - self.check_error_on_write(df, engine, ValueError, msg) - - # bytes - df.columns = [b"foo", b"bar"] - self.check_error_on_write(df, engine, ValueError, msg) - - # python object - df.columns = [ - datetime.datetime(2011, 1, 1, 0, 0), - datetime.datetime(2011, 1, 1, 1, 1), - ] - self.check_error_on_write(df, engine, ValueError, msg) - @pytest.mark.parametrize("compression", [None, "gzip", "snappy", "brotli"]) def test_compression(self, engine, compression): if compression == "snappy": @@ -528,16 +509,16 @@ def test_write_column_multiindex(self, engine): # Not able to write column multi-indexes with non-string column names. mi_columns = pd.MultiIndex.from_tuples([("a", 1), ("a", 2), ("b", 1)]) df = pd.DataFrame(np.random.randn(4, 3), columns=mi_columns) - msg = ( - r"\s*parquet must have string column names for all values in\s*" - "each level of the MultiIndex" - ) - self.check_error_on_write(df, engine, ValueError, msg) - def test_write_column_multiindex_nonstring(self, pa): + if engine == "fastparquet": + self.check_error_on_write( + df, engine, TypeError, "Column name must be a string" + ) + elif engine == "pyarrow": + check_round_trip(df, engine) + + def test_write_column_multiindex_nonstring(self, engine): # GH #34777 - # Not supported in fastparquet as of 0.1.3 - engine = pa # Not able to write column multi-indexes with non-string column names arrays = [ @@ -546,11 +527,10 @@ def test_write_column_multiindex_nonstring(self, pa): ] df = pd.DataFrame(np.random.randn(8, 8), columns=arrays) df.columns.names = ["Level1", "Level2"] - msg = ( - r"\s*parquet must have string column names for all values in\s*" - "each level of the MultiIndex" - ) - self.check_error_on_write(df, engine, ValueError, msg) + if engine == "fastparquet": + self.check_error_on_write(df, engine, ValueError, "Column names must") + elif engine == "pyarrow": + check_round_trip(df, engine) def test_write_column_multiindex_string(self, pa): # GH #34777 @@ -579,17 +559,19 @@ def test_write_column_index_string(self, pa): check_round_trip(df, engine) - def test_write_column_index_nonstring(self, pa): + def test_write_column_index_nonstring(self, engine): # GH #34777 - # Not supported in fastparquet as of 0.1.3 - engine = pa # Write column indexes with string column names arrays = [1, 2, 3, 4] df = pd.DataFrame(np.random.randn(8, 4), columns=arrays) df.columns.name = "NonStringCol" - msg = r"parquet must have string column names" - self.check_error_on_write(df, engine, ValueError, msg) + if engine == "fastparquet": + self.check_error_on_write( + df, engine, TypeError, "Column name must be a string" + ) + else: + check_round_trip(df, engine) @pytest.mark.skipif(pa_version_under7p0, reason="minimum pyarrow not installed") def test_dtype_backend(self, engine, request): @@ -1041,6 +1023,26 @@ def test_read_dtype_backend_pyarrow_config_index(self, pa): expected=expected, ) + def test_columns_dtypes_not_invalid(self, pa): + df = pd.DataFrame({"string": list("abc"), "int": list(range(1, 4))}) + + # numeric + df.columns = [0, 1] + check_round_trip(df, pa) + + # bytes + df.columns = [b"foo", b"bar"] + with pytest.raises(NotImplementedError, match="|S3"): + # Bytes fails on read_parquet + check_round_trip(df, pa) + + # python object + df.columns = [ + datetime.datetime(2011, 1, 1, 0, 0), + datetime.datetime(2011, 1, 1, 1, 1), + ] + check_round_trip(df, pa) + def test_empty_columns(self, pa): # GH 52034 df = pd.DataFrame(index=pd.Index(["a", "b", "c"], name="custom name")) @@ -1057,6 +1059,27 @@ def test_basic(self, fp, df_full): df["timedelta"] = pd.timedelta_range("1 day", periods=3) check_round_trip(df, fp) + def test_columns_dtypes_invalid(self, fp): + df = pd.DataFrame({"string": list("abc"), "int": list(range(1, 4))}) + + err = TypeError + msg = "Column name must be a string" + + # numeric + df.columns = [0, 1] + self.check_error_on_write(df, fp, err, msg) + + # bytes + df.columns = [b"foo", b"bar"] + self.check_error_on_write(df, fp, err, msg) + + # python object + df.columns = [ + datetime.datetime(2011, 1, 1, 0, 0), + datetime.datetime(2011, 1, 1, 1, 1), + ] + self.check_error_on_write(df, fp, err, msg) + def test_duplicate_columns(self, fp): # not currently able to handle duplicate columns df = pd.DataFrame(np.arange(12).reshape(4, 3), columns=list("aaa")).copy() From bacbb52f5a9b934f76e76e29bf946b7e76a51de3 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 16 Mar 2023 19:32:17 -0700 Subject: [PATCH 3/3] Min version compat --- pandas/tests/io/test_parquet.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 18f24ce091482..b55e97a4fe0ae 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -528,7 +528,11 @@ def test_write_column_multiindex_nonstring(self, engine): df = pd.DataFrame(np.random.randn(8, 8), columns=arrays) df.columns.names = ["Level1", "Level2"] if engine == "fastparquet": - self.check_error_on_write(df, engine, ValueError, "Column names must") + if Version(fastparquet.__version__) < Version("0.7.0"): + err = TypeError + else: + err = ValueError + self.check_error_on_write(df, engine, err, "Column name") elif engine == "pyarrow": check_round_trip(df, engine)