diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst
index 0be4ebc627b30..e504a29748b38 100644
--- a/doc/source/whatsnew/v1.0.0.rst
+++ b/doc/source/whatsnew/v1.0.0.rst
@@ -29,7 +29,7 @@ Enhancements
 Other enhancements
 ^^^^^^^^^^^^^^^^^^

--
+- String support for parameter ``partition_cols`` in :func:`pandas.to_parquet` (:issue:`27117`)
 -

 .. _whatsnew_1000.api_breaking:
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 603a615c1f8cb..ca63e7452b873 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -2156,9 +2156,10 @@ def to_parquet(

             .. versionadded:: 0.24.0

-        partition_cols : list, optional, default None
-            Column names by which to partition the dataset
-            Columns are partitioned in the order they are given
+        partition_cols : list or string, optional, default None
+            Column names by which to partition the dataset.
+            Columns are partitioned in the order they are given.
+            String identifies a single column to be partitioned.

             .. versionadded:: 0.24.0

@@ -2166,6 +2167,11 @@ def to_parquet(
             Additional arguments passed to the parquet library.
             See :ref:`pandas io <io.parquet>` for more details.

+        .. versionchanged:: 1.0.0
+
+            partition_cols
+                Added ability to pass in a string for a single column name
+
         See Also
         --------
         read_parquet : Read a parquet file.
diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py
index 6fc70e9f4a737..acf97e4b7a161 100644
--- a/pandas/io/parquet.py
+++ b/pandas/io/parquet.py
@@ -235,15 +235,23 @@ def to_parquet(

         .. versionadded:: 0.24.0

-    partition_cols : list, optional, default None
-        Column names by which to partition the dataset
-        Columns are partitioned in the order they are given
+    partition_cols : list or string, optional, default None
+        Column names by which to partition the dataset.
+        Columns are partitioned in the order they are given.
+        String identifies a single column to be partitioned.

         .. versionadded:: 0.24.0

     kwargs
         Additional keyword arguments passed to the engine
+
+    .. versionchanged:: 1.0.0
+
+        partition_cols
+            Added ability to pass in a string for a single column name
     """
+    if isinstance(partition_cols, str):
+        partition_cols = [partition_cols]
     impl = get_engine(engine)
     return impl.write(
         df,
diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py
index d634859e72d7b..0b2d3a07980fa 100644
--- a/pandas/tests/io/test_parquet.py
+++ b/pandas/tests/io/test_parquet.py
@@ -473,6 +473,19 @@ def test_partition_cols_supported(self, pa, df_full):
         assert len(dataset.partitions.partition_names) == 2
         assert dataset.partitions.partition_names == set(partition_cols)

+    def test_partition_cols_string(self, pa, df_full):
+        # GH #27117
+        partition_cols = "bool"
+        partition_cols_list = [partition_cols]
+        df = df_full
+        with tm.ensure_clean_dir() as path:
+            df.to_parquet(path, partition_cols=partition_cols, compression=None)
+            import pyarrow.parquet as pq
+
+            dataset = pq.ParquetDataset(path, validate_schema=False)
+            assert len(dataset.partitions.partition_names) == 1
+            assert dataset.partitions.partition_names == set(partition_cols_list)
+
     def test_empty_dataframe(self, pa):
         # GH #27339
         df = pd.DataFrame()
@@ -543,6 +556,23 @@ def test_partition_cols_supported(self, fp, df_full):
         actual_partition_cols = fastparquet.ParquetFile(path, False).cats
         assert len(actual_partition_cols) == 2

+    def test_partition_cols_string(self, fp, df_full):
+        # GH #27117
+        partition_cols = "bool"
+        df = df_full
+        with tm.ensure_clean_dir() as path:
+            df.to_parquet(
+                path,
+                engine="fastparquet",
+                partition_cols=partition_cols,
+                compression=None,
+            )
+            assert os.path.exists(path)
+            import fastparquet  # noqa: F811
+
+            actual_partition_cols = fastparquet.ParquetFile(path, False).cats
+            assert len(actual_partition_cols) == 1
+
     def test_partition_on_supported(self, fp, df_full):
         # GH #23283
         partition_cols = ["bool", "int"]