Skip to content

TST: de-xfail some pyarrow tests #55918

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 9 commits into from
Nov 11, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 8 additions & 1 deletion pandas/io/parsers/arrow_parser_wrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
)
from pandas.util._exceptions import find_stack_level

from pandas.core.dtypes.common import pandas_dtype
from pandas.core.dtypes.inference import is_integer

import pandas as pd
Expand Down Expand Up @@ -203,7 +204,13 @@ def _finalize_pandas_output(self, frame: DataFrame) -> DataFrame:
# Ignore non-existent columns from dtype mapping
# like other parsers do
if isinstance(self.dtype, dict):
self.dtype = {k: v for k, v in self.dtype.items() if k in frame.columns}
self.dtype = {
k: pandas_dtype(v)
for k, v in self.dtype.items()
if k in frame.columns
}
else:
self.dtype = pandas_dtype(self.dtype)
try:
frame = frame.astype(self.dtype)
except TypeError as e:
Expand Down
72 changes: 63 additions & 9 deletions pandas/tests/io/parser/common/test_common_basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,6 @@ def test_read_csv_local(all_parsers, csv1):
tm.assert_frame_equal(result, expected)


@xfail_pyarrow
def test_1000_sep(all_parsers):
parser = all_parsers
data = """A|B|C
Expand All @@ -128,6 +127,12 @@ def test_1000_sep(all_parsers):
"""
expected = DataFrame({"A": [1, 10], "B": [2334, 13], "C": [5, 10.0]})

if parser.engine == "pyarrow":
msg = "The 'thousands' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), sep="|", thousands=",")
return

result = parser.read_csv(StringIO(data), sep="|", thousands=",")
tm.assert_frame_equal(result, expected)

Expand Down Expand Up @@ -161,7 +166,6 @@ def test_csv_mixed_type(all_parsers):
tm.assert_frame_equal(result, expected)


@xfail_pyarrow
def test_read_csv_low_memory_no_rows_with_index(all_parsers):
# see gh-21141
parser = all_parsers
Expand All @@ -174,6 +178,13 @@ def test_read_csv_low_memory_no_rows_with_index(all_parsers):
2,2,3,4
3,3,4,5
"""

if parser.engine == "pyarrow":
msg = "The 'nrows' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), low_memory=True, index_col=0, nrows=0)
return

result = parser.read_csv(StringIO(data), low_memory=True, index_col=0, nrows=0)
expected = DataFrame(columns=["A", "B", "C"])
tm.assert_frame_equal(result, expected)
Expand Down Expand Up @@ -212,7 +223,6 @@ def test_read_csv_dataframe(all_parsers, csv1):
tm.assert_frame_equal(result, expected)


@xfail_pyarrow
@pytest.mark.parametrize("nrows", [3, 3.0])
def test_read_nrows(all_parsers, nrows):
# see gh-10476
Expand All @@ -230,11 +240,16 @@ def test_read_nrows(all_parsers, nrows):
)
parser = all_parsers

if parser.engine == "pyarrow":
msg = "The 'nrows' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), nrows=nrows)
return

result = parser.read_csv(StringIO(data), nrows=nrows)
tm.assert_frame_equal(result, expected)


@xfail_pyarrow
@pytest.mark.parametrize("nrows", [1.2, "foo", -1])
def test_read_nrows_bad(all_parsers, nrows):
data = """index,A,B,C,D
Expand All @@ -247,6 +262,8 @@ def test_read_nrows_bad(all_parsers, nrows):
"""
msg = r"'nrows' must be an integer >=0"
parser = all_parsers
if parser.engine == "pyarrow":
msg = "The 'nrows' option is not supported with the 'pyarrow' engine"

with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), nrows=nrows)
Expand Down Expand Up @@ -277,7 +294,6 @@ def test_missing_trailing_delimiters(all_parsers):
tm.assert_frame_equal(result, expected)


@xfail_pyarrow
def test_skip_initial_space(all_parsers):
data = (
'"09-Apr-2012", "01:10:18.300", 2456026.548822908, 12849, '
Expand All @@ -289,6 +305,18 @@ def test_skip_initial_space(all_parsers):
)
parser = all_parsers

if parser.engine == "pyarrow":
msg = "The 'skipinitialspace' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
parser.read_csv(
StringIO(data),
names=list(range(33)),
header=None,
na_values=["-9999.0"],
skipinitialspace=True,
)
return

result = parser.read_csv(
StringIO(data),
names=list(range(33)),
Expand Down Expand Up @@ -437,7 +465,6 @@ def test_read_empty_with_usecols(all_parsers, data, kwargs, expected):
tm.assert_frame_equal(result, expected)


@xfail_pyarrow
@pytest.mark.parametrize(
"kwargs,expected",
[
Expand Down Expand Up @@ -467,6 +494,12 @@ def test_trailing_spaces(all_parsers, kwargs, expected):
data = "A B C \nrandom line with trailing spaces \nskip\n1,2,3\n1,2.,4.\nrandom line with trailing tabs\t\t\t\n \n5.1,NaN,10.0\n" # noqa: E501
parser = all_parsers

if parser.engine == "pyarrow":
msg = "The 'delim_whitespace' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data.replace(",", " ")), **kwargs)
return

result = parser.read_csv(StringIO(data.replace(",", " ")), **kwargs)
tm.assert_frame_equal(result, expected)

Expand All @@ -488,7 +521,6 @@ def test_read_filepath_or_buffer(all_parsers):
parser.read_csv(filepath_or_buffer=b"input")


@xfail_pyarrow
@pytest.mark.parametrize("delim_whitespace", [True, False])
def test_single_char_leading_whitespace(all_parsers, delim_whitespace):
# see gh-9710
Expand All @@ -501,6 +533,15 @@ def test_single_char_leading_whitespace(all_parsers, delim_whitespace):
b\n"""

expected = DataFrame({"MyColumn": list("abab")})

if parser.engine == "pyarrow":
msg = "The 'skipinitialspace' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
parser.read_csv(
StringIO(data), skipinitialspace=True, delim_whitespace=delim_whitespace
)
return

result = parser.read_csv(
StringIO(data), skipinitialspace=True, delim_whitespace=delim_whitespace
)
Expand Down Expand Up @@ -688,7 +729,6 @@ def test_first_row_bom_unquoted(all_parsers):
tm.assert_frame_equal(result, expected)


@xfail_pyarrow
@pytest.mark.parametrize("nrows", range(1, 6))
def test_blank_lines_between_header_and_data_rows(all_parsers, nrows):
# GH 28071
Expand All @@ -698,6 +738,15 @@ def test_blank_lines_between_header_and_data_rows(all_parsers, nrows):
)
csv = "\nheader\n\na,b\n\n\n1,2\n\n3,4"
parser = all_parsers

if parser.engine == "pyarrow":
msg = "The 'nrows' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
parser.read_csv(
StringIO(csv), header=3, nrows=nrows, skip_blank_lines=False
)
return

df = parser.read_csv(StringIO(csv), header=3, nrows=nrows, skip_blank_lines=False)
tm.assert_frame_equal(df, ref[:nrows])

Expand Down Expand Up @@ -731,11 +780,16 @@ def test_read_csv_names_not_accepting_sets(all_parsers):
parser.read_csv(StringIO(data), names=set("QAZ"))


@xfail_pyarrow
def test_read_table_delim_whitespace_default_sep(all_parsers):
    # GH: 35958
    parser = all_parsers
    buf = StringIO("a b c\n1 -2 -3\n4 5 6")

    if parser.engine == "pyarrow":
        # The pyarrow engine rejects delim_whitespace outright, so assert
        # the ValueError instead of comparing frames.
        msg = "The 'delim_whitespace' option is not supported with the 'pyarrow' engine"
        with pytest.raises(ValueError, match=msg):
            parser.read_table(buf, delim_whitespace=True)
        return

    expected = DataFrame({"a": [1, 4], "b": [-2, 5], "c": [-3, 6]})
    result = parser.read_table(buf, delim_whitespace=True)
    tm.assert_frame_equal(result, expected)
Expand Down
11 changes: 8 additions & 3 deletions pandas/tests/io/parser/common/test_decimal.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,7 @@
"ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
)

xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")


@xfail_pyarrow
@pytest.mark.parametrize(
"data,thousands,decimal",
[
Expand All @@ -42,6 +39,14 @@ def test_1000_sep_with_decimal(all_parsers, data, thousands, decimal):
parser = all_parsers
expected = DataFrame({"A": [1, 10], "B": [2334.01, 13], "C": [5, 10.0]})

if parser.engine == "pyarrow":
msg = "The 'thousands' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
parser.read_csv(
StringIO(data), sep="|", thousands=thousands, decimal=decimal
)
return

result = parser.read_csv(
StringIO(data), sep="|", thousands=thousands, decimal=decimal
)
Expand Down
46 changes: 37 additions & 9 deletions pandas/tests/io/parser/common/test_file_buffer_url.py
Original file line number Diff line number Diff line change
Expand Up @@ -214,8 +214,14 @@ def test_eof_states(all_parsers, data, kwargs, expected, msg, request):
# see gh-10728, gh-10548
parser = all_parsers

if parser.engine == "pyarrow" and "comment" in kwargs:
msg = "The 'comment' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), **kwargs)
return

if parser.engine == "pyarrow" and "\r" not in data:
mark = pytest.mark.xfail(reason="The 'comment' option is not supported")
mark = pytest.mark.xfail(reason="Mismatched exception type/message")
request.applymarker(mark)

if expected is None:
Expand Down Expand Up @@ -356,7 +362,6 @@ def test_read_csv_file_handle(all_parsers, io_class, encoding):
assert not handle.closed


@xfail_pyarrow # ValueError: The 'memory_map' option is not supported
def test_memory_map_compression(all_parsers, compression):
"""
Support memory map for compressed files.
Expand All @@ -369,19 +374,32 @@ def test_memory_map_compression(all_parsers, compression):
with tm.ensure_clean() as path:
expected.to_csv(path, index=False, compression=compression)

tm.assert_frame_equal(
parser.read_csv(path, memory_map=True, compression=compression),
expected,
)
if parser.engine == "pyarrow":
msg = "The 'memory_map' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
parser.read_csv(path, memory_map=True, compression=compression)
return

result = parser.read_csv(path, memory_map=True, compression=compression)

tm.assert_frame_equal(
result,
expected,
)


@xfail_pyarrow # ValueError: The 'chunksize' option is not supported
def test_context_manager(all_parsers, datapath):
# make sure that opened files are closed
parser = all_parsers

path = datapath("io", "data", "csv", "iris.csv")

if parser.engine == "pyarrow":
msg = "The 'chunksize' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
parser.read_csv(path, chunksize=1)
return

reader = parser.read_csv(path, chunksize=1)
assert not reader.handles.handle.closed
try:
Expand All @@ -392,12 +410,17 @@ def test_context_manager(all_parsers, datapath):
assert reader.handles.handle.closed


@xfail_pyarrow # ValueError: The 'chunksize' option is not supported
def test_context_manageri_user_provided(all_parsers, datapath):
# make sure that user-provided handles are not closed
parser = all_parsers

with open(datapath("io", "data", "csv", "iris.csv"), encoding="utf-8") as path:
if parser.engine == "pyarrow":
msg = "The 'chunksize' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
parser.read_csv(path, chunksize=1)
return

reader = parser.read_csv(path, chunksize=1)
assert not reader.handles.handle.closed
try:
Expand All @@ -417,7 +440,6 @@ def test_file_descriptor_leak(all_parsers, using_copy_on_write):
parser.read_csv(path)


@xfail_pyarrow # ValueError: The 'memory_map' option is not supported
def test_memory_map(all_parsers, csv_dir_path):
mmap_file = os.path.join(csv_dir_path, "test_mmap.csv")
parser = all_parsers
Expand All @@ -426,5 +448,11 @@ def test_memory_map(all_parsers, csv_dir_path):
{"a": [1, 2, 3], "b": ["one", "two", "three"], "c": ["I", "II", "III"]}
)

if parser.engine == "pyarrow":
msg = "The 'memory_map' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
parser.read_csv(mmap_file, memory_map=True)
return

result = parser.read_csv(mmap_file, memory_map=True)
tm.assert_frame_equal(result, expected)
24 changes: 16 additions & 8 deletions pandas/tests/io/parser/common/test_ints.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,10 +126,8 @@ def test_int64_min_issues(all_parsers):
tm.assert_frame_equal(result, expected)


# ValueError: The 'converters' option is not supported with the 'pyarrow' engine
@xfail_pyarrow
@pytest.mark.parametrize("conv", [None, np.int64, np.uint64])
def test_int64_overflow(all_parsers, conv):
def test_int64_overflow(all_parsers, conv, request):
data = """ID
00013007854817840016671868
00013007854817840016749251
Expand All @@ -143,6 +141,10 @@ def test_int64_overflow(all_parsers, conv):
if conv is None:
# 13007854817840016671868 > UINT64_MAX, so this
# will overflow and return object as the dtype.
if parser.engine == "pyarrow":
mark = pytest.mark.xfail(reason="parses to float64")
request.applymarker(mark)

result = parser.read_csv(StringIO(data))
expected = DataFrame(
[
Expand All @@ -161,13 +163,19 @@ def test_int64_overflow(all_parsers, conv):
# 13007854817840016671868 > UINT64_MAX, so attempts
# to cast to either int64 or uint64 will result in
# an OverflowError being raised.
msg = (
"(Python int too large to convert to C long)|"
"(long too big to convert)|"
"(int too big to convert)"
msg = "|".join(
[
"Python int too large to convert to C long",
"long too big to convert",
"int too big to convert",
]
)
err = OverflowError
if parser.engine == "pyarrow":
err = ValueError
msg = "The 'converters' option is not supported with the 'pyarrow' engine"

with pytest.raises(OverflowError, match=msg):
with pytest.raises(err, match=msg):
parser.read_csv(StringIO(data), converters={"ID": conv})


Expand Down
Loading