-
Notifications
You must be signed in to change notification settings - Fork 6
feat: support conversion from pyarrow RecordBatch to pandas DataFrame #39
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 1 commit
fad69fb
7b95de3
a009fc4
b4dd5cd
9c239a4
045d0e6
fe021fd
f042e83
5480c11
a6b7396
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -13,160 +13,176 @@ | |
# limitations under the License. | ||
|
||
import datetime as dt | ||
from typing import Optional | ||
|
||
import pandas | ||
import pandas.api.extensions | ||
import pandas.testing | ||
import pyarrow | ||
import pytest | ||
|
||
# To register the types. | ||
import db_dtypes # noqa | ||
import db_dtypes | ||
|
||
|
||
def types_mapper(
    pyarrow_type: pyarrow.DataType,
) -> Optional[pandas.api.extensions.ExtensionDtype]:
    """Map pyarrow temporal types to the db_dtypes extension dtypes.

    Passed as the ``types_mapper`` argument of ``RecordBatch.to_pandas`` so
    that date columns load as ``dbdate`` and time columns as ``dbtime``.

    Args:
        pyarrow_type: The Arrow type of a column being converted.

    Returns:
        The matching db_dtypes dtype, or None to let pandas apply its
        default type mapping.
    """
    # pyarrow.types predicates cover both widths (date32/date64 and
    # time32/time64), replacing the previous str(...).startswith checks.
    if pyarrow.types.is_date(pyarrow_type):
        return db_dtypes.DateDtype
    if pyarrow.types.is_time(pyarrow_type):
        return db_dtypes.TimeDtype
    # Use default type mapping.
    return None
|
||
|
||
# Shared parametrize cases: (pandas Series, equivalent pyarrow Array) pairs
# using pyarrow's *default* Arrow types for dbdate/dbtime (date32, time64[ns]).
SERIES_ARRAYS_DEFAULT_TYPES = [
    (pandas.Series([], dtype="dbdate"), pyarrow.array([], type=pyarrow.date32())),
    (
        pandas.Series([None, None, None], dtype="dbdate"),
        pyarrow.array([None, None, None], type=pyarrow.date32()),
    ),
    (
        pandas.Series(
            [dt.date(2021, 9, 27), None, dt.date(2011, 9, 27)], dtype="dbdate"
        ),
        pyarrow.array(
            [dt.date(2021, 9, 27), None, dt.date(2011, 9, 27)],
            type=pyarrow.date32(),
        ),
    ),
    (
        # Boundary dates of the pandas datetime64[ns] range.
        pandas.Series(
            [dt.date(1677, 9, 22), dt.date(1970, 1, 1), dt.date(2262, 4, 11)],
            dtype="dbdate",
        ),
        pyarrow.array(
            [dt.date(1677, 9, 22), dt.date(1970, 1, 1), dt.date(2262, 4, 11)],
            type=pyarrow.date32(),
        ),
    ),
    (
        pandas.Series([], dtype="dbtime"),
        pyarrow.array([], type=pyarrow.time64("ns")),
    ),
    (
        pandas.Series([None, None, None], dtype="dbtime"),
        pyarrow.array([None, None, None], type=pyarrow.time64("ns")),
    ),
    (
        pandas.Series(
            [dt.time(0, 0, 0, 0), None, dt.time(23, 59, 59, 999_999)],
            dtype="dbtime",
        ),
        pyarrow.array(
            [dt.time(0, 0, 0, 0), None, dt.time(23, 59, 59, 999_999)],
            type=pyarrow.time64("ns"),
        ),
    ),
    (
        pandas.Series(
            [
                dt.time(0, 0, 0, 0),
                dt.time(12, 30, 15, 125_000),
                dt.time(23, 59, 59, 999_999),
            ],
            dtype="dbtime",
        ),
        pyarrow.array(
            [
                dt.time(0, 0, 0, 0),
                dt.time(12, 30, 15, 125_000),
                dt.time(23, 59, 59, 999_999),
            ],
            type=pyarrow.time64("ns"),
        ),
    ),
]
# Same data, but with non-default Arrow types (date64, time32[ms], time64[us])
# to exercise explicit type conversion.
SERIES_ARRAYS_CUSTOM_ARROW_TYPES = [
    (pandas.Series([], dtype="dbdate"), pyarrow.array([], type=pyarrow.date64())),
    (
        pandas.Series([None, None, None], dtype="dbdate"),
        pyarrow.array([None, None, None], type=pyarrow.date64()),
    ),
    (
        pandas.Series(
            [dt.date(2021, 9, 27), None, dt.date(2011, 9, 27)], dtype="dbdate"
        ),
        pyarrow.array(
            [dt.date(2021, 9, 27), None, dt.date(2011, 9, 27)],
            type=pyarrow.date64(),
        ),
    ),
    (
        pandas.Series(
            [dt.date(1677, 9, 22), dt.date(1970, 1, 1), dt.date(2262, 4, 11)],
            dtype="dbdate",
        ),
        pyarrow.array(
            [dt.date(1677, 9, 22), dt.date(1970, 1, 1), dt.date(2262, 4, 11)],
            type=pyarrow.date64(),
        ),
    ),
    (
        pandas.Series([], dtype="dbtime"),
        pyarrow.array([], type=pyarrow.time32("ms")),
    ),
    (
        pandas.Series([None, None, None], dtype="dbtime"),
        pyarrow.array([None, None, None], type=pyarrow.time32("ms")),
    ),
    (
        # Millisecond precision only: time32[ms] cannot store microseconds.
        pandas.Series(
            [dt.time(0, 0, 0, 0), None, dt.time(23, 59, 59, 999_000)],
            dtype="dbtime",
        ),
        pyarrow.array(
            [dt.time(0, 0, 0, 0), None, dt.time(23, 59, 59, 999_000)],
            type=pyarrow.time32("ms"),
        ),
    ),
    (
        pandas.Series(
            [dt.time(0, 0, 0, 0), None, dt.time(23, 59, 59, 999_999)],
            dtype="dbtime",
        ),
        pyarrow.array(
            [dt.time(0, 0, 0, 0), None, dt.time(23, 59, 59, 999_999)],
            type=pyarrow.time64("us"),
        ),
    ),
    (
        pandas.Series(
            [
                dt.time(0, 0, 0, 0),
                dt.time(12, 30, 15, 125_000),
                dt.time(23, 59, 59, 999_999),
            ],
            dtype="dbtime",
        ),
        pyarrow.array(
            [
                dt.time(0, 0, 0, 0),
                dt.time(12, 30, 15, 125_000),
                dt.time(23, 59, 59, 999_999),
            ],
            type=pyarrow.time64("us"),
        ),
    ),
]
|
||
[Reviewer comment] Similarly, I'm curious what happens if the PyArrow type is one of the variants not handled here. [Reply] I assume you meant the fallback case — unhandled types return None, so pandas applies its default type mapping.
||
|
||
@pytest.mark.parametrize(("series", "expected"), SERIES_ARRAYS_DEFAULT_TYPES)
def test_to_arrow(series, expected):
    """A dbdate/dbtime Series converts to the default Arrow type."""
    converted = pyarrow.array(series)
    assert converted.equals(expected)
|
||
|
||
# NOTE: the inline parametrize tuple previously duplicated here was stale
# diff residue; the cases now live in SERIES_ARRAYS_CUSTOM_ARROW_TYPES.
@pytest.mark.parametrize(("series", "expected"), SERIES_ARRAYS_CUSTOM_ARROW_TYPES)
def test_to_arrow_w_arrow_type(series, expected):
    """A dbdate/dbtime Series converts when an explicit Arrow type is given."""
    array = pyarrow.array(series, type=expected.type)
    assert array.equals(expected)
|
||
|
||
@pytest.mark.parametrize(
    ["expected", "pyarrow_array"],
    SERIES_ARRAYS_DEFAULT_TYPES + SERIES_ARRAYS_CUSTOM_ARROW_TYPES,
)
def test_from_arrow(pyarrow_array: pyarrow.Array, expected: pandas.Series):
    """An Arrow date/time array round-trips to a dbdate/dbtime Series."""
    # Convert to RecordBatch because the types_mapper argument is ignored when
    # using a pyarrow.Array. https://issues.apache.org/jira/browse/ARROW-9664
    record_batch = pyarrow.RecordBatch.from_arrays([pyarrow_array], ["test_col"])
    dataframe = record_batch.to_pandas(date_as_object=False, types_mapper=types_mapper)
    series = dataframe["test_col"]
    # Both operands are pandas.Series, so compare with assert_series_equal
    # (assert_extension_array_equal expects ExtensionArray objects and would
    # reject Series). check_names=False because the extracted column is named
    # "test_col" while the expected Series has no name.
    pandas.testing.assert_series_equal(series, expected, check_names=False)
Uh oh!
There was an error while loading. Please reload this page.