diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py
index cde7a98eb42ae..de9a14c82b3cb 100644
--- a/pandas/io/parquet.py
+++ b/pandas/io/parquet.py
@@ -122,11 +122,20 @@ def write(
             file_obj_or_path.close()

     def read(self, path, columns=None, **kwargs):
-        parquet_ds = self.api.parquet.ParquetDataset(
-            path, filesystem=get_fs_for_path(path), **kwargs
-        )
-        kwargs["columns"] = columns
-        result = parquet_ds.read_pandas(**kwargs).to_pandas()
+        fs = get_fs_for_path(path)
+        should_close = None
+        # Avoid calling get_filepath_or_buffer for s3/gcs URLs
+        # since it returns an S3File which doesn't support dir reads in arrow
+        if not fs:
+            path, _, _, should_close = get_filepath_or_buffer(path)
+
+        kwargs["use_pandas_metadata"] = True
+        result = self.api.parquet.read_table(
+            path, columns=columns, filesystem=fs, **kwargs
+        ).to_pandas()
+        if should_close:
+            path.close()
+
         return result


diff --git a/pandas/tests/io/data/parquet/simple.parquet b/pandas/tests/io/data/parquet/simple.parquet
new file mode 100644
index 0000000000000..2862a91f508ea
Binary files /dev/null and b/pandas/tests/io/data/parquet/simple.parquet differ
diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py
index 8a43d4079159b..7ee551194bf76 100644
--- a/pandas/tests/io/test_parquet.py
+++ b/pandas/tests/io/test_parquet.py
@@ -1,6 +1,7 @@
 """ test parquet compat """
 import datetime
 from distutils.version import LooseVersion
+from io import BytesIO
 import os
 from warnings import catch_warnings

@@ -567,6 +568,23 @@ def test_s3_roundtrip_for_dir(self, df_compat, s3_resource, pa, partition_col):
             repeat=1,
         )

+    @tm.network
+    @td.skip_if_no("pyarrow")
+    def test_parquet_read_from_url(self, df_compat):
+        url = (
+            "https://raw.githubusercontent.com/pandas-dev/pandas/"
+            "master/pandas/tests/io/data/parquet/simple.parquet"
+        )
+        df = pd.read_parquet(url)
+        tm.assert_frame_equal(df, df_compat)
+
+    @td.skip_if_no("pyarrow")
+    def test_read_file_like_obj_support(self, df_compat):
+        buffer = BytesIO()
+        df_compat.to_parquet(buffer)
+        df_from_buf = pd.read_parquet(buffer)
+        tm.assert_frame_equal(df_compat, df_from_buf)
+
     def test_partition_cols_supported(self, pa, df_full):
         # GH #23283
         partition_cols = ["bool", "int"]
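
For reference, a minimal usage sketch of the two read paths the new tests exercise: a file-like object and a plain HTTPS URL. The snippet is illustrative, not part of the patch; it assumes pyarrow is installed (and network access for the URL read), the small DataFrame stands in for the df_compat fixture, and the URL is the raw-GitHub location of the simple.parquet fixture added above.

from io import BytesIO

import pandas as pd

# A small frame standing in for the df_compat test fixture.
df = pd.DataFrame({"A": [1, 2, 3], "B": "foo"})

# File-like objects: write parquet to an in-memory buffer, then read it back.
buffer = BytesIO()
df.to_parquet(buffer)
roundtripped = pd.read_parquet(buffer)
pd.testing.assert_frame_equal(df, roundtripped)

# URLs: get_filepath_or_buffer opens the remote file and pyarrow's
# read_table consumes the resulting buffer.
url = (
    "https://raw.githubusercontent.com/pandas-dev/pandas/"
    "master/pandas/tests/io/data/parquet/simple.parquet"
)
from_url = pd.read_parquet(url)
pd.testing.assert_frame_equal(df, from_url)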