11""" test parquet compat """
22import datetime
33from distutils .version import LooseVersion
4- import locale
54import os
65from warnings import catch_warnings
76
@@ -131,6 +130,7 @@ def check_round_trip(
     read_kwargs=None,
     expected=None,
     check_names=True,
+    check_like=False,
     repeat=2,
 ):
     """Verify parquet serializer and deserializer produce the same results.
@@ -150,6 +150,8 @@ def check_round_trip(
         Expected deserialization result, otherwise will be equal to `df`
     check_names: list of str, optional
         Closed set of column names to be compared
+    check_like: bool, optional
+        If True, ignore the order of index & columns.
     repeat: int, optional
         How many times to repeat the test
     """
@@ -169,7 +171,9 @@ def compare(repeat):
             with catch_warnings(record=True):
                 actual = read_parquet(path, **read_kwargs)
 
-                tm.assert_frame_equal(expected, actual, check_names=check_names)
+                tm.assert_frame_equal(
+                    expected, actual, check_names=check_names, check_like=check_like
+                )
 
     if path is None:
         with tm.ensure_clean() as path:
@@ -532,15 +536,37 @@ def test_categorical(self, pa):
         expected = df.astype(object)
         check_round_trip(df, pa, expected=expected)
 
-    # GH#33077 2020-03-27
-    @pytest.mark.xfail(
-        locale.getlocale()[0] == "zh_CN",
-        reason="dateutil cannot parse e.g. '五, 27 3月 2020 21:45:38 GMT'",
-    )
     def test_s3_roundtrip(self, df_compat, s3_resource, pa):
         # GH #19134
         check_round_trip(df_compat, pa, path="s3://pandas-test/pyarrow.parquet")
 
+    @td.skip_if_no("s3fs")
+    @pytest.mark.parametrize("partition_col", [["A"], []])
+    def test_s3_roundtrip_for_dir(self, df_compat, s3_resource, pa, partition_col):
+        from pandas.io.s3 import get_fs as get_s3_fs
+
+        # GH #26388
+        # https://github.com/apache/arrow/blob/master/python/pyarrow/tests/test_parquet.py#L2716
+        # As per pyarrow, partitioned columns become 'categorical' dtypes
+        # and are appended to the back of the dataframe on read.
+
+        expected_df = df_compat.copy()
+        if partition_col:
+            expected_df[partition_col] = expected_df[partition_col].astype("category")
+        check_round_trip(
+            df_compat,
+            pa,
+            expected=expected_df,
+            path="s3://pandas-test/parquet_dir",
+            write_kwargs={
+                "partition_cols": partition_col,
+                "compression": None,
+                "filesystem": get_s3_fs(),
+            },
+            check_like=True,
+            repeat=1,
+        )
+
     def test_partition_cols_supported(self, pa, df_full):
         # GH #23283
         partition_cols = ["bool", "int"]
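
A minimal, self-contained sketch of the pyarrow behavior the new test encodes, assuming a local `demo_dir` in place of the S3 path and the public `pandas.testing` helper instead of the test module's `tm` alias: partition columns round-trip as `category` dtype and land at the back of the frame, which is why the test casts `expected_df` to `"category"` and compares with `check_like=True`.

```python
import pandas as pd
from pandas.testing import assert_frame_equal

# "demo_dir" is a hypothetical local directory standing in for
# "s3://pandas-test/parquet_dir"; partition_cols splits the data into one
# hive-style subdirectory per value of "A" (demo_dir/A=x/..., demo_dir/A=y/...).
df = pd.DataFrame({"A": ["x", "x", "y"], "B": [1, 2, 3]})
df.to_parquet("demo_dir", engine="pyarrow", partition_cols=["A"], compression=None)

result = pd.read_parquet("demo_dir", engine="pyarrow")

# Reading the dataset back reconstructs "A" from the directory names, so it
# comes back as a categorical column appended after the data columns.
expected = df.copy()
expected["A"] = expected["A"].astype("category")

# A plain assert_frame_equal would fail on column order ("B" now precedes
# "A"); check_like=True reindexes before comparing, ignoring that ordering.
assert_frame_equal(expected, result, check_like=True)
```

The `repeat=1` in the test likely matters for the same reason the sketch writes only once: a second write into an existing partitioned directory adds new files alongside the old ones rather than overwriting them, so the read-back would contain duplicated rows.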