From 3995d5a07bd6aaac76ad442c5c8788141aba1a22 Mon Sep 17 00:00:00 2001
From: Dalynn Hatch
Date: Sat, 17 Aug 2019 18:58:17 -0500
Subject: [PATCH 1/4] Added check for string in partition_cols

---
 pandas/io/parquet.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py
index 6fc70e9f4a737..685a9a637cbca 100644
--- a/pandas/io/parquet.py
+++ b/pandas/io/parquet.py
@@ -244,6 +244,8 @@ def to_parquet(
     kwargs
         Additional keyword arguments passed to the engine
     """
+    if isinstance(partition_cols, str):
+        partition_cols = [partition_cols]
     impl = get_engine(engine)
     return impl.write(
         df,

From 93c461321c9a99c92d7b9ec70681f50e39810a80 Mon Sep 17 00:00:00 2001
From: Dalynn Hatch
Date: Sat, 17 Aug 2019 19:19:08 -0500
Subject: [PATCH 2/4] Added unit tests for PyArrow and FastParquet where
 partition_cols is string

---
 pandas/tests/io/test_parquet.py | 33 +++++++++++++++++++++++++++++++--
 1 file changed, 31 insertions(+), 2 deletions(-)

diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py
index d634859e72d7b..f58ae25003b99 100644
--- a/pandas/tests/io/test_parquet.py
+++ b/pandas/tests/io/test_parquet.py
@@ -330,7 +330,7 @@ def test_write_index(self, engine):
         # non-default index
         for index in indexes:
             df.index = index
-            check_round_trip(df, engine, check_names=check_names)
+            check_round_trip(df, engine, check_names=chetest_partition_cols_supportedck_names)

         # index with meta-data
         df.index = [0, 1, 2]
@@ -416,7 +416,7 @@ def test_basic_subset_columns(self, pa, df_full):
         # GH18628
         df = df_full

-        # additional supported types for pyarrow
+        # additional supported types for pyarrowtest_partition_cols_supported
         df["datetime_tz"] = pd.date_range("20130101", periods=3, tz="Europe/Brussels")

         check_round_trip(
@@ -473,6 +473,18 @@ def test_partition_cols_supported(self, pa, df_full):
             assert len(dataset.partitions.partition_names) == 2
             assert dataset.partitions.partition_names == set(partition_cols)

+    def test_partition_cols_string(self, pa, df_full):
+        # GH #23283
+        partition_cols = 'bool'
+        df = df_full
+        with tm.ensure_clean_dir() as path:
+            df.to_parquet(path, partition_cols=partition_cols, compression=None)
+            import pyarrow.parquet as pq
+
+            dataset = pq.ParquetDataset(path, validate_schema=False)
+            assert len(dataset.partitions.partition_names) == 1
+            assert dataset.partitions.partition_names == set([partition_cols])
+
     def test_empty_dataframe(self, pa):
         # GH #27339
         df = pd.DataFrame()
@@ -543,6 +555,23 @@ def test_partition_cols_supported(self, fp, df_full):
             actual_partition_cols = fastparquet.ParquetFile(path, False).cats
             assert len(actual_partition_cols) == 2

+    def test_partition_cols_string(self, fp, df_full):
+        # GH #23283
+        partition_cols = 'bool'
+        df = df_full
+        with tm.ensure_clean_dir() as path:
+            df.to_parquet(
+                path,
+                engine="fastparquet",
+                partition_cols=partition_cols,
+                compression=None,
+            )
+            assert os.path.exists(path)
+            import fastparquet  # noqa: F811
+
+            actual_partition_cols = fastparquet.ParquetFile(path, False).cats
+            assert len(actual_partition_cols) == 1
+
     def test_partition_on_supported(self, fp, df_full):
         # GH #23283
         partition_cols = ["bool", "int"]
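A minimal usage sketch of the behavior added in PATCH 1/4 and exercised by the
tests above; the frame and the output paths are illustrative, not part of the
series:

    import pandas as pd

    df = pd.DataFrame({"bool": [True, False, True], "int": [1, 2, 3]})

    # The isinstance check normalizes a bare string to a one-element list,
    # so these two calls write the same partitioned layout (a parquet
    # engine such as pyarrow or fastparquet must be installed):
    df.to_parquet("out_str", partition_cols="bool", compression=None)
    df.to_parquet("out_list", partition_cols=["bool"], compression=None)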
From afd10c467ecf41f673646267ecf5c0e62d16dfed Mon Sep 17 00:00:00 2001
From: Dalynn Hatch
Date: Sat, 17 Aug 2019 19:55:48 -0500
Subject: [PATCH 3/4] Updated to_parquet docstrings for string partition_cols

---
 pandas/core/frame.py | 7 ++++---
 pandas/io/parquet.py | 7 ++++---
 2 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 603a615c1f8cb..627acea4951f9 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -2156,9 +2156,10 @@ def to_parquet(

             .. versionadded:: 0.24.0

-        partition_cols : list, optional, default None
-            Column names by which to partition the dataset
-            Columns are partitioned in the order they are given
+        partition_cols : list or string, optional, default None
+            Column names by which to partition the dataset.
+            Columns are partitioned in the order they are given.
+            A string identifies a single column to be partitioned.

             .. versionadded:: 0.24.0

diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py
index 685a9a637cbca..c01163cfd237c 100644
--- a/pandas/io/parquet.py
+++ b/pandas/io/parquet.py
@@ -235,9 +235,10 @@ def to_parquet(

     .. versionadded:: 0.24.0

-    partition_cols : list, optional, default None
-        Column names by which to partition the dataset
-        Columns are partitioned in the order they are given
+    partition_cols : list or string, optional, default None
+        Column names by which to partition the dataset.
+        Columns are partitioned in the order they are given.
+        A string identifies a single column to be partitioned.

     .. versionadded:: 0.24.0

From ec927c36760ae0165ea23581c52ca858b69a464c Mon Sep 17 00:00:00 2001
From: Dalynn Hatch
Date: Sat, 17 Aug 2019 20:22:20 -0500
Subject: [PATCH 4/4] Deleted misadded code

---
 pandas/tests/io/test_parquet.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py
index f58ae25003b99..830e93104c110 100644
--- a/pandas/tests/io/test_parquet.py
+++ b/pandas/tests/io/test_parquet.py
@@ -330,7 +330,7 @@ def test_write_index(self, engine):
         # non-default index
         for index in indexes:
             df.index = index
-            check_round_trip(df, engine, check_names=chetest_partition_cols_supportedck_names)
+            check_round_trip(df, engine, check_names=check_names)

         # index with meta-data
         df.index = [0, 1, 2]
@@ -416,7 +416,7 @@ def test_basic_subset_columns(self, pa, df_full):
         # GH18628
         df = df_full

-        # additional supported types for pyarrowtest_partition_cols_supported
+        # additional supported types for pyarrow
         df["datetime_tz"] = pd.date_range("20130101", periods=3, tz="Europe/Brussels")

         check_round_trip(
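With the series applied, partition_cols="bool" and partition_cols=["bool"] are
interchangeable. A read-back sketch mirroring the assertions in the new
pyarrow test; the path reuses the illustrative "out_str" directory from the
sketch above:

    import pyarrow.parquet as pq

    # pyarrow writes hive-style key=value directories for each partition
    # column, e.g. out_str/bool=True/ and out_str/bool=False/
    dataset = pq.ParquetDataset("out_str", validate_schema=False)
    assert len(dataset.partitions.partition_names) == 1
    assert dataset.partitions.partition_names == {"bool"}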