Skip to content
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v1.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ Enhancements
Other enhancements
^^^^^^^^^^^^^^^^^^

-
- String support for parameter ``partition_cols`` in :func:`pandas.to_parquet` (:issue:`27117`)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
- String support for paramater partition_cols in the :func:`pandas.to_parquet` (:issue:`27117`)
- String support for parameter ``partition_cols`` in :func:`pandas.to_parquet` (:issue:`27117`)

I would also try to make it a bit clearer to say that a string for a single column name is now also accepted instead of a list

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we really want to accept a string here (and not a list)? I think we would rather be strict about this; though I suppose this is convenient, as a partition col is often a single str.

-

.. _whatsnew_1000.api_breaking:
Expand Down
12 changes: 9 additions & 3 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -2156,16 +2156,22 @@ def to_parquet(

.. versionadded:: 0.24.0

partition_cols : list, optional, default None
Column names by which to partition the dataset
Columns are partitioned in the order they are given
partition_cols : list or string, optional, default None
Column names by which to partition the dataset.
Columns are partitioned in the order they are given.
String identifies a single column to be partitioned.

.. versionadded:: 0.24.0
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you add a .. versionchanged:: 1.0.0 explaining that passing a single string was added in 1.0?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think I added it correctly. This is my first contribution so bear with me please :)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You can put the .. versionchanged:: ... right below the parameter explanation (so since there is already a ..versionadded:: 0.24.0, it would go just below that


**kwargs
Additional arguments passed to the parquet library. See
:ref:`pandas io <io.parquet>` for more details.

.. versionchanged:: 1.0.0

partition_cols
Added ability to pass in a string for a single column name

See Also
--------
read_parquet : Read a parquet file.
Expand Down
14 changes: 11 additions & 3 deletions pandas/io/parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -235,15 +235,23 @@ def to_parquet(

.. versionadded:: 0.24.0

partition_cols : list, optional, default None
Column names by which to partition the dataset
Columns are partitioned in the order they are given
partition_cols : list or string, optional, default None
Column names by which to partition the dataset.
Columns are partitioned in the order they are given.
String identifies a single column to be partitioned.

.. versionadded:: 0.24.0

kwargs
Additional keyword arguments passed to the engine

.. versionchanged:: 1.0.0

partition_cols
Added ability to pass in a string for a single column name
"""
if isinstance(partition_cols, str):
partition_cols = [partition_cols]
impl = get_engine(engine)
return impl.write(
df,
Expand Down
30 changes: 30 additions & 0 deletions pandas/tests/io/test_parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -473,6 +473,19 @@ def test_partition_cols_supported(self, pa, df_full):
assert len(dataset.partitions.partition_names) == 2
assert dataset.partitions.partition_names == set(partition_cols)

def test_partition_cols_string(self, pa, df_full):
    # GH #23283: a single column name passed as a bare string should
    # partition the dataset exactly like a one-element list would.
    import pyarrow.parquet as pq

    col = "bool"
    with tm.ensure_clean_dir() as path:
        df_full.to_parquet(path, partition_cols=col, compression=None)
        dataset = pq.ParquetDataset(path, validate_schema=False)
        names = dataset.partitions.partition_names
        assert len(names) == 1
        assert names == {col}

def test_empty_dataframe(self, pa):
# GH #27339
df = pd.DataFrame()
Expand Down Expand Up @@ -543,6 +556,23 @@ def test_partition_cols_supported(self, fp, df_full):
actual_partition_cols = fastparquet.ParquetFile(path, False).cats
assert len(actual_partition_cols) == 2

def test_partition_cols_string(self, fp, df_full):
    # GH #23283: the fastparquet engine should also accept a bare string
    # for partition_cols, treating it as a single partition column.
    import fastparquet  # noqa: F811

    col = "bool"
    with tm.ensure_clean_dir() as path:
        df_full.to_parquet(
            path,
            engine="fastparquet",
            partition_cols=col,
            compression=None,
        )
        assert os.path.exists(path)
        cats = fastparquet.ParquetFile(path, False).cats
        assert len(cats) == 1

def test_partition_on_supported(self, fp, df_full):
# GH #23283
partition_cols = ["bool", "int"]
Expand Down