From 5f84f2b10fc7c5678fbdabed08684de7638a1094 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Thu, 16 Mar 2023 14:39:57 -0700
Subject: [PATCH 1/3] BUG: df.to_parquet with empty columns

---
 pandas/io/parquet.py            |  6 +++++-
 pandas/tests/io/test_parquet.py | 14 ++++++++++++++
 2 files changed, 19 insertions(+), 1 deletion(-)

diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py
index a606cb9287d16..2ce481e295a65 100644
--- a/pandas/io/parquet.py
+++ b/pandas/io/parquet.py
@@ -163,7 +163,11 @@ def validate_dataframe(df: DataFrame) -> None:
                      each level of the MultiIndex
                     """
                 )
-        elif df.columns.inferred_type not in {"string", "empty"}:
+        elif not df.columns.empty and df.columns.inferred_type not in {
+            "string",
+            "empty",
+        }:
+            # GH 52034: RangeIndex.inferred_dtype is always "integer" if empty
             raise ValueError("parquet must have string column names")
 
         # index level names must be strings
diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py
index 4ba3776bf6063..9cac89647f794 100644
--- a/pandas/tests/io/test_parquet.py
+++ b/pandas/tests/io/test_parquet.py
@@ -1041,6 +1041,11 @@ def test_read_dtype_backend_pyarrow_config_index(self, pa):
             expected=expected,
         )
 
+    def test_empty_columns(self, pa):
+        # GH 52034
+        df = pd.DataFrame(index=pd.Index(["a", "b", "c"], name="custom name"))
+        check_round_trip(df, pa)
+
 
 class TestParquetFastParquet(Base):
     def test_basic(self, fp, df_full):
@@ -1281,3 +1286,12 @@ def test_invalid_dtype_backend(self, engine):
             df.to_parquet(path)
             with pytest.raises(ValueError, match=msg):
                 read_parquet(path, dtype_backend="numpy")
+
+    def test_empty_columns(self, fp):
+        # GH 52034
+        df = pd.DataFrame(index=pd.Index(["a", "b", "c"], name="custom name"))
+        expected = pd.DataFrame(
+            columns=pd.Index([], dtype=object),
+            index=pd.Index(["a", "b", "c"], name="custom name"),
+        )
+        check_round_trip(df, fp, expected=expected)

From 88b82f9c0322824b1f1436db61d9bf1e556d593e Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Thu, 16 Mar 2023 17:06:08 -0700
Subject: [PATCH 2/3] Remove validation for non-string index or columns

---
 doc/source/whatsnew/v2.0.0.rst  |  1 +
 pandas/io/parquet.py            | 26 ---------
 pandas/tests/io/test_parquet.py | 97 ++++++++++++++++++++-------------
 3 files changed, 61 insertions(+), 63 deletions(-)

diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst
index 55185afc0a098..74319b0444659 100644
--- a/doc/source/whatsnew/v2.0.0.rst
+++ b/doc/source/whatsnew/v2.0.0.rst
@@ -1295,6 +1295,7 @@ I/O
 - Bug in :func:`read_csv` when ``engine="pyarrow"`` where ``encoding`` parameter was not handled correctly (:issue:`51302`)
 - Bug in :func:`read_xml` ignored repeated elements when iterparse is used (:issue:`51183`)
 - Bug in :class:`ExcelWriter` leaving file handles open if an exception occurred during instantiation (:issue:`51443`)
+- Bug in :meth:`DataFrame.to_parquet` where non-string index or columns were raising a ``ValueError`` when ``engine="pyarrow"`` (:issue:`52036`)
 
 Period
 ^^^^^^
diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py
index 2ce481e295a65..7791ca53a6447 100644
--- a/pandas/io/parquet.py
+++ b/pandas/io/parquet.py
@@ -21,7 +21,6 @@
 import pandas as pd
 from pandas import (
     DataFrame,
-    MultiIndex,
     get_option,
 )
 from pandas.core.shared_docs import _shared_docs
@@ -152,31 +151,6 @@ def validate_dataframe(df: DataFrame) -> None:
         if not isinstance(df, DataFrame):
             raise ValueError("to_parquet only supports IO with DataFrames")
 
-        # must have value column names for all index levels (strings only)
-        if isinstance(df.columns, MultiIndex):
-            if not all(
-                x.inferred_type in {"string", "empty"} for x in df.columns.levels
-            ):
-                raise ValueError(
-                    """
-                    parquet must have string column names for all values in
-                     each level of the MultiIndex
-                    """
-                )
-        elif not df.columns.empty and df.columns.inferred_type not in {
-            "string",
-            "empty",
-        }:
-            # GH 52034: RangeIndex.inferred_dtype is always "integer" if empty
-            raise ValueError("parquet must have string column names")
-
-        # index level names must be strings
-        valid_names = all(
-            isinstance(name, str) for name in df.index.names if name is not None
-        )
-        if not valid_names:
-            raise ValueError("Index level names must be strings")
-
     def write(self, df: DataFrame, path, compression, **kwargs):
         raise AbstractMethodError(self)
 
diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py
index 9cac89647f794..18f24ce091482 100644
--- a/pandas/tests/io/test_parquet.py
+++ b/pandas/tests/io/test_parquet.py
@@ -404,25 +404,6 @@ def test_columns_dtypes(self, engine):
         df.columns = ["foo", "bar"]
         check_round_trip(df, engine)
 
-    def test_columns_dtypes_invalid(self, engine):
-        df = pd.DataFrame({"string": list("abc"), "int": list(range(1, 4))})
-
-        msg = "parquet must have string column names"
-        # numeric
-        df.columns = [0, 1]
-        self.check_error_on_write(df, engine, ValueError, msg)
-
-        # bytes
-        df.columns = [b"foo", b"bar"]
-        self.check_error_on_write(df, engine, ValueError, msg)
-
-        # python object
-        df.columns = [
-            datetime.datetime(2011, 1, 1, 0, 0),
-            datetime.datetime(2011, 1, 1, 1, 1),
-        ]
-        self.check_error_on_write(df, engine, ValueError, msg)
-
     @pytest.mark.parametrize("compression", [None, "gzip", "snappy", "brotli"])
     def test_compression(self, engine, compression):
         if compression == "snappy":
@@ -528,16 +509,16 @@ def test_write_column_multiindex(self, engine):
         # Not able to write column multi-indexes with non-string column names.
         mi_columns = pd.MultiIndex.from_tuples([("a", 1), ("a", 2), ("b", 1)])
         df = pd.DataFrame(np.random.randn(4, 3), columns=mi_columns)
-        msg = (
-            r"\s*parquet must have string column names for all values in\s*"
-            "each level of the MultiIndex"
-        )
-        self.check_error_on_write(df, engine, ValueError, msg)
 
-    def test_write_column_multiindex_nonstring(self, pa):
+        if engine == "fastparquet":
+            self.check_error_on_write(
+                df, engine, TypeError, "Column name must be a string"
+            )
+        elif engine == "pyarrow":
+            check_round_trip(df, engine)
+
+    def test_write_column_multiindex_nonstring(self, engine):
         # GH #34777
-        # Not supported in fastparquet as of 0.1.3
-        engine = pa
 
         # Not able to write column multi-indexes with non-string column names
         arrays = [
@@ -546,11 +527,10 @@ def test_write_column_multiindex_nonstring(self, pa):
         ]
         df = pd.DataFrame(np.random.randn(8, 8), columns=arrays)
         df.columns.names = ["Level1", "Level2"]
-        msg = (
-            r"\s*parquet must have string column names for all values in\s*"
-            "each level of the MultiIndex"
-        )
-        self.check_error_on_write(df, engine, ValueError, msg)
+        if engine == "fastparquet":
+            self.check_error_on_write(df, engine, ValueError, "Column names must")
+        elif engine == "pyarrow":
+            check_round_trip(df, engine)
 
     def test_write_column_multiindex_string(self, pa):
         # GH #34777
@@ -579,17 +559,19 @@ def test_write_column_index_string(self, pa):
 
         check_round_trip(df, engine)
 
-    def test_write_column_index_nonstring(self, pa):
+    def test_write_column_index_nonstring(self, engine):
         # GH #34777
-        # Not supported in fastparquet as of 0.1.3
-        engine = pa
 
         # Write column indexes with string column names
         arrays = [1, 2, 3, 4]
         df = pd.DataFrame(np.random.randn(8, 4), columns=arrays)
         df.columns.name = "NonStringCol"
-        msg = r"parquet must have string column names"
-        self.check_error_on_write(df, engine, ValueError, msg)
+        if engine == "fastparquet":
+            self.check_error_on_write(
+                df, engine, TypeError, "Column name must be a string"
+            )
+        else:
+            check_round_trip(df, engine)
 
     @pytest.mark.skipif(pa_version_under7p0, reason="minimum pyarrow not installed")
     def test_dtype_backend(self, engine, request):
@@ -1041,6 +1023,26 @@ def test_read_dtype_backend_pyarrow_config_index(self, pa):
             expected=expected,
         )
 
+    def test_columns_dtypes_not_invalid(self, pa):
+        df = pd.DataFrame({"string": list("abc"), "int": list(range(1, 4))})
+
+        # numeric
+        df.columns = [0, 1]
+        check_round_trip(df, pa)
+
+        # bytes
+        df.columns = [b"foo", b"bar"]
+        with pytest.raises(NotImplementedError, match="|S3"):
+            # Bytes fails on read_parquet
+            check_round_trip(df, pa)
+
+        # python object
+        df.columns = [
+            datetime.datetime(2011, 1, 1, 0, 0),
+            datetime.datetime(2011, 1, 1, 1, 1),
+        ]
+        check_round_trip(df, pa)
+
     def test_empty_columns(self, pa):
         # GH 52034
         df = pd.DataFrame(index=pd.Index(["a", "b", "c"], name="custom name"))
@@ -1057,6 +1059,27 @@ def test_basic(self, fp, df_full):
         df["timedelta"] = pd.timedelta_range("1 day", periods=3)
         check_round_trip(df, fp)
 
+    def test_columns_dtypes_invalid(self, fp):
+        df = pd.DataFrame({"string": list("abc"), "int": list(range(1, 4))})
+
+        err = TypeError
+        msg = "Column name must be a string"
+
+        # numeric
+        df.columns = [0, 1]
+        self.check_error_on_write(df, fp, err, msg)
+
+        # bytes
+        df.columns = [b"foo", b"bar"]
+        self.check_error_on_write(df, fp, err, msg)
+
+        # python object
+        df.columns = [
+            datetime.datetime(2011, 1, 1, 0, 0),
+            datetime.datetime(2011, 1, 1, 1, 1),
+        ]
+        self.check_error_on_write(df, fp, err, msg)
+
     def test_duplicate_columns(self, fp):
         # not currently able to handle duplicate columns
         df = pd.DataFrame(np.arange(12).reshape(4, 3), columns=list("aaa")).copy()

From bacbb52f5a9b934f76e76e29bf946b7e76a51de3 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Thu, 16 Mar 2023 19:32:17 -0700
Subject: [PATCH 3/3] Min version compat

---
 pandas/tests/io/test_parquet.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py
index 18f24ce091482..b55e97a4fe0ae 100644
--- a/pandas/tests/io/test_parquet.py
+++ b/pandas/tests/io/test_parquet.py
@@ -528,7 +528,11 @@ def test_write_column_multiindex_nonstring(self, engine):
         df = pd.DataFrame(np.random.randn(8, 8), columns=arrays)
         df.columns.names = ["Level1", "Level2"]
         if engine == "fastparquet":
-            self.check_error_on_write(df, engine, ValueError, "Column names must")
+            if Version(fastparquet.__version__) < Version("0.7.0"):
+                err = TypeError
+            else:
+                err = ValueError
+            self.check_error_on_write(df, engine, err, "Column name")
         elif engine == "pyarrow":
             check_round_trip(df, engine)