""" test parquet compat """
import datetime
from distutils.version import LooseVersion
-import locale
import os
from warnings import catch_warnings
@@ -130,6 +129,7 @@ def check_round_trip(
    read_kwargs=None,
    expected=None,
    check_names=True,
+    check_like=False,
    repeat=2,
):
    """Verify parquet serializer and deserializer produce the same results.
@@ -149,6 +149,8 @@ def check_round_trip(
        Expected deserialization result, otherwise will be equal to `df`
    check_names: list of str, optional
        Closed set of column names to be compared
+    check_like: bool, optional
+        If True, ignore the order of index & columns.
    repeat: int, optional
        How many times to repeat the test
    """
@@ -169,7 +171,9 @@ def compare(repeat):
            with catch_warnings(record=True):
                actual = read_parquet(path, **read_kwargs)

-            tm.assert_frame_equal(expected, actual, check_names=check_names)
+            tm.assert_frame_equal(
+                expected, actual, check_names=check_names, check_like=check_like
+            )

    if path is None:
        with tm.ensure_clean() as path:
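
For reference, a minimal sketch (separate from the diff) of what the new check_like flag changes: it is forwarded to tm.assert_frame_equal, which, with check_like=True, ignores the order of index and columns. The toy frames below are illustrative only, not taken from the test suite.

import pandas as pd
from pandas.testing import assert_frame_equal

df = pd.DataFrame({"A": [1, 2], "B": ["x", "y"]})
shuffled = df[["B", "A"]]  # same data, columns in a different order

# Without check_like, this raises because the column order differs:
# assert_frame_equal(df, shuffled)

# With check_like=True, the order of index/columns is ignored.
assert_frame_equal(df, shuffled, check_like=True)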
@@ -485,15 +489,37 @@ def test_categorical(self, pa):
        expected = df.astype(object)
        check_round_trip(df, pa, expected=expected)

-    # GH#33077 2020-03-27
-    @pytest.mark.xfail(
-        locale.getlocale()[0] == "zh_CN",
-        reason="dateutil cannot parse e.g. '五, 27 3月 2020 21:45:38 GMT'",
-    )
    def test_s3_roundtrip(self, df_compat, s3_resource, pa):
        # GH #19134
        check_round_trip(df_compat, pa, path="s3://pandas-test/pyarrow.parquet")
+    @td.skip_if_no("s3fs")
+    @pytest.mark.parametrize("partition_col", [["A"], []])
+    def test_s3_roundtrip_for_dir(self, df_compat, s3_resource, pa, partition_col):
+        from pandas.io.s3 import get_fs as get_s3_fs
+
+        # GH #26388
+        # https://github.com/apache/arrow/blob/master/python/pyarrow/tests/test_parquet.py#L2716
+        # As per pyarrow, partitioned columns become 'categorical' dtypes
+        # and are added to the back of the dataframe on read
+
+        expected_df = df_compat.copy()
+        if partition_col:
+            expected_df[partition_col] = expected_df[partition_col].astype("category")
+        check_round_trip(
+            df_compat,
+            pa,
+            expected=expected_df,
+            path="s3://pandas-test/parquet_dir",
+            write_kwargs={
+                "partition_cols": partition_col,
+                "compression": None,
+                "filesystem": get_s3_fs(),
+            },
+            check_like=True,
+            repeat=1,
+        )
+
    def test_partition_cols_supported(self, pa, df_full):
        # GH #23283
        partition_cols = ["bool", "int"]
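
The behavior the new test's comment relies on is easy to reproduce locally. A minimal sketch, assuming pyarrow is installed; a temporary directory stands in for the s3://pandas-test/parquet_dir bucket path, and the frame is a stand-in for the df_compat fixture:

import tempfile

import pandas as pd

df = pd.DataFrame({"A": [1, 1, 2], "B": ["x", "y", "z"]})

with tempfile.TemporaryDirectory() as path:
    # partition_cols writes a hive-style directory layout,
    # one subdirectory per value of the partition column.
    df.to_parquet(path, partition_cols=["A"], compression=None)

    # On read, the partition column comes back as a categorical
    # dtype and is appended after the other columns, which is why
    # the test casts the expected column with astype("category")
    # and compares with check_like=True.
    result = pd.read_parquet(path)
    print(result.dtypes)  # B: object, A: category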