Skip to content

Commit ce4169a

Browse files
authored
Fix new string dtype tests for frame folder (#55409)
* Start fixing string tests
* BUG: interpolate raising wrong error for ea
* Fix more tests
* REGR: join segfaulting for arrow string with nulls
* Fix more tests
* Fix more tests
* BUG: rank raising for arrow string dtypes
* BUG: eq not implemented for categorical and arrow backed strings
* More tests
* BUG: ndim of string block incorrect with string inference
* Fix test
* Fix tests
* Fix tests
* Fix more indexing tests
* BUG: Index.insert raising when inserting None into new string dtype
* Fix tests
* BUG: Inserting ndim=0 array does not infer string dtype
* Fix tests
* Fix tests
* Fix more tests
* Fix more tests
* BUG: idxmax raising for arrow strings
* Fix
* Fix more tests
* Fix more tests
* Fix more tests
* Fix remaining tests
* Fix remaining tests
* Change default
* BUG: Groupby not keeping string dtype for empty objects
* Start fixing gb tests
* Fix tests
* Merge main
* Update config_init.py
* Fixup
* Update
1 parent 71a3e3c commit ce4169a

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

45 files changed

+477
-170
lines changed

pandas/tests/frame/constructors/test_from_dict.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33
import numpy as np
44
import pytest
55

6+
from pandas._config import using_pyarrow_string_dtype
7+
68
from pandas import (
79
DataFrame,
810
Index,
@@ -42,6 +44,9 @@ def test_constructor_single_row(self):
4244
)
4345
tm.assert_frame_equal(result, expected)
4446

47+
@pytest.mark.skipif(
48+
using_pyarrow_string_dtype(), reason="columns inferring logic broken"
49+
)
4550
def test_constructor_list_of_series(self):
4651
data = [
4752
OrderedDict([["a", 1.5], ["b", 3.0], ["c", 4.0]]),

pandas/tests/frame/constructors/test_from_records.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@
66
import pytest
77
import pytz
88

9+
from pandas._config import using_pyarrow_string_dtype
10+
911
from pandas.compat import is_platform_little_endian
1012

1113
from pandas import (
@@ -56,6 +58,9 @@ def test_from_records_with_datetimes(self):
5658
expected["EXPIRY"] = expected["EXPIRY"].astype("M8[s]")
5759
tm.assert_frame_equal(result, expected)
5860

61+
@pytest.mark.skipif(
62+
using_pyarrow_string_dtype(), reason="dtype checking logic doesn't work"
63+
)
5964
def test_from_records_sequencelike(self):
6065
df = DataFrame(
6166
{

pandas/tests/frame/indexing/test_getitem.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -103,7 +103,7 @@ def test_getitem_list_duplicates(self):
103103

104104
def test_getitem_dupe_cols(self):
105105
df = DataFrame([[1, 2, 3], [4, 5, 6]], columns=["a", "a", "b"])
106-
msg = "\"None of [Index(['baf'], dtype='object')] are in the [columns]\""
106+
msg = "\"None of [Index(['baf'], dtype="
107107
with pytest.raises(KeyError, match=re.escape(msg)):
108108
df[["baf"]]
109109

pandas/tests/frame/indexing/test_indexing.py

Lines changed: 25 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -288,7 +288,9 @@ def test_setattr_column(self):
288288
df.foobar = 5
289289
assert (df.foobar == 5).all()
290290

291-
def test_setitem(self, float_frame, using_copy_on_write, warn_copy_on_write):
291+
def test_setitem(
292+
self, float_frame, using_copy_on_write, warn_copy_on_write, using_infer_string
293+
):
292294
# not sure what else to do here
293295
series = float_frame["A"][::2]
294296
float_frame["col5"] = series
@@ -331,7 +333,10 @@ def test_setitem(self, float_frame, using_copy_on_write, warn_copy_on_write):
331333
with pytest.raises(SettingWithCopyError, match=msg):
332334
smaller["col10"] = ["1", "2"]
333335

334-
assert smaller["col10"].dtype == np.object_
336+
if using_infer_string:
337+
assert smaller["col10"].dtype == "string"
338+
else:
339+
assert smaller["col10"].dtype == np.object_
335340
assert (smaller["col10"] == ["1", "2"]).all()
336341

337342
def test_setitem2(self):
@@ -426,7 +431,7 @@ def test_setitem_cast(self, float_frame):
426431
float_frame["something"] = 2.5
427432
assert float_frame["something"].dtype == np.float64
428433

429-
def test_setitem_corner(self, float_frame):
434+
def test_setitem_corner(self, float_frame, using_infer_string):
430435
# corner case
431436
df = DataFrame({"B": [1.0, 2.0, 3.0], "C": ["a", "b", "c"]}, index=np.arange(3))
432437
del df["B"]
@@ -463,10 +468,16 @@ def test_setitem_corner(self, float_frame):
463468
dm["foo"] = "bar"
464469
del dm["foo"]
465470
dm["foo"] = "bar"
466-
assert dm["foo"].dtype == np.object_
471+
if using_infer_string:
472+
assert dm["foo"].dtype == "string"
473+
else:
474+
assert dm["foo"].dtype == np.object_
467475

468476
dm["coercible"] = ["1", "2", "3"]
469-
assert dm["coercible"].dtype == np.object_
477+
if using_infer_string:
478+
assert dm["coercible"].dtype == "string"
479+
else:
480+
assert dm["coercible"].dtype == np.object_
470481

471482
def test_setitem_corner2(self):
472483
data = {
@@ -483,7 +494,7 @@ def test_setitem_corner2(self):
483494
assert df.loc[1, "title"] == "foobar"
484495
assert df.loc[1, "cruft"] == 0
485496

486-
def test_setitem_ambig(self):
497+
def test_setitem_ambig(self, using_infer_string):
487498
# Difficulties with mixed-type data
488499
# Created as float type
489500
dm = DataFrame(index=range(3), columns=range(3))
@@ -499,18 +510,22 @@ def test_setitem_ambig(self):
499510

500511
dm[2] = uncoercable_series
501512
assert len(dm.columns) == 3
502-
assert dm[2].dtype == np.object_
513+
if using_infer_string:
514+
assert dm[2].dtype == "string"
515+
else:
516+
assert dm[2].dtype == np.object_
503517

504-
def test_setitem_None(self, float_frame):
518+
def test_setitem_None(self, float_frame, using_infer_string):
505519
# GH #766
506520
float_frame[None] = float_frame["A"]
521+
key = None if not using_infer_string else np.nan
507522
tm.assert_series_equal(
508523
float_frame.iloc[:, -1], float_frame["A"], check_names=False
509524
)
510525
tm.assert_series_equal(
511-
float_frame.loc[:, None], float_frame["A"], check_names=False
526+
float_frame.loc[:, key], float_frame["A"], check_names=False
512527
)
513-
tm.assert_series_equal(float_frame[None], float_frame["A"], check_names=False)
528+
tm.assert_series_equal(float_frame[key], float_frame["A"], check_names=False)
514529

515530
def test_loc_setitem_boolean_mask_allfalse(self):
516531
# GH 9596

pandas/tests/frame/indexing/test_set_value.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ def test_set_value(self, float_frame):
1616
float_frame._set_value(idx, col, 1)
1717
assert float_frame[col][idx] == 1
1818

19-
def test_set_value_resize(self, float_frame):
19+
def test_set_value_resize(self, float_frame, using_infer_string):
2020
res = float_frame._set_value("foobar", "B", 0)
2121
assert res is None
2222
assert float_frame.index[-1] == "foobar"
@@ -27,8 +27,10 @@ def test_set_value_resize(self, float_frame):
2727

2828
res = float_frame.copy()
2929
res._set_value("foobar", "baz", "sam")
30-
assert res["baz"].dtype == np.object_
31-
30+
if using_infer_string:
31+
assert res["baz"].dtype == "string"
32+
else:
33+
assert res["baz"].dtype == np.object_
3234
res = float_frame.copy()
3335
with tm.assert_produces_warning(
3436
FutureWarning, match="Setting an item of incompatible dtype"

pandas/tests/frame/indexing/test_setitem.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1319,7 +1319,7 @@ def test_setitem_column_frame_as_category(self):
13191319
df["col2"] = Series([1, 2, 3], dtype="category")
13201320

13211321
expected_types = Series(
1322-
["int64", "category", "category"], index=[0, "col1", "col2"]
1322+
["int64", "category", "category"], index=[0, "col1", "col2"], dtype=object
13231323
)
13241324
tm.assert_series_equal(df.dtypes, expected_types)
13251325

pandas/tests/frame/indexing/test_where.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1077,9 +1077,13 @@ def test_where_producing_ea_cond_for_np_dtype():
10771077
@pytest.mark.parametrize(
10781078
"replacement", [0.001, True, "snake", None, datetime(2022, 5, 4)]
10791079
)
1080-
def test_where_int_overflow(replacement):
1080+
def test_where_int_overflow(replacement, using_infer_string, request):
10811081
# GH 31687
10821082
df = DataFrame([[1.0, 2e25, "nine"], [np.nan, 0.1, None]])
1083+
if using_infer_string and replacement not in (None, "snake"):
1084+
request.node.add_marker(
1085+
pytest.mark.xfail(reason="Can't set non-string into string column")
1086+
)
10831087
result = df.where(pd.notnull(df), replacement)
10841088
expected = DataFrame([[1.0, 2e25, "nine"], [replacement, 0.1, replacement]])
10851089

pandas/tests/frame/methods/test_align.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -107,7 +107,7 @@ def test_align_float(self, float_frame, using_copy_on_write):
107107
af, bf = float_frame.align(
108108
other.iloc[:, 0], join="inner", axis=1, method=None, fill_value=None
109109
)
110-
tm.assert_index_equal(bf.index, Index([]))
110+
tm.assert_index_equal(bf.index, Index([]).astype(bf.index.dtype))
111111

112112
msg = (
113113
"The 'method', 'limit', and 'fill_axis' keywords in DataFrame.align "
@@ -117,7 +117,7 @@ def test_align_float(self, float_frame, using_copy_on_write):
117117
af, bf = float_frame.align(
118118
other.iloc[:, 0], join="inner", axis=1, method=None, fill_value=0
119119
)
120-
tm.assert_index_equal(bf.index, Index([]))
120+
tm.assert_index_equal(bf.index, Index([]).astype(bf.index.dtype))
121121

122122
# Try to align DataFrame to Series along bad axis
123123
msg = "No axis named 2 for object type DataFrame"

pandas/tests/frame/methods/test_astype.py

Lines changed: 15 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -166,21 +166,22 @@ def test_astype_str(self):
166166
"c": [Timedelta(x)._repr_base() for x in c._values],
167167
"d": list(map(str, d._values)),
168168
"e": list(map(str, e._values)),
169-
}
169+
},
170+
dtype="object",
170171
)
171172

172173
tm.assert_frame_equal(result, expected)
173174

174175
def test_astype_str_float(self):
175176
# see GH#11302
176177
result = DataFrame([np.nan]).astype(str)
177-
expected = DataFrame(["nan"])
178+
expected = DataFrame(["nan"], dtype="object")
178179

179180
tm.assert_frame_equal(result, expected)
180181
result = DataFrame([1.12345678901234567890]).astype(str)
181182

182183
val = "1.1234567890123457"
183-
expected = DataFrame([val])
184+
expected = DataFrame([val], dtype="object")
184185
tm.assert_frame_equal(result, expected)
185186

186187
@pytest.mark.parametrize("dtype_class", [dict, Series])
@@ -199,7 +200,7 @@ def test_astype_dict_like(self, dtype_class):
199200
expected = DataFrame(
200201
{
201202
"a": a,
202-
"b": Series(["0", "1", "2", "3", "4"]),
203+
"b": Series(["0", "1", "2", "3", "4"], dtype="object"),
203204
"c": c,
204205
"d": Series([1.0, 2.0, 3.14, 4.0, 5.4], dtype="float32"),
205206
}
@@ -282,7 +283,7 @@ def test_astype_duplicate_col_series_arg(self):
282283
result = df.astype(dtypes)
283284
expected = DataFrame(
284285
{
285-
0: vals[:, 0].astype(str),
286+
0: Series(vals[:, 0].astype(str), dtype=object),
286287
1: vals[:, 1],
287288
2: pd.array(vals[:, 2], dtype="Float64"),
288289
3: vals[:, 3],
@@ -620,6 +621,7 @@ def test_astype_arg_for_errors_dictlist(self):
620621
{"a": 2.2, "b": "15.3", "c": "another_test"},
621622
]
622623
)
624+
expected["c"] = expected["c"].astype("object")
623625
type_dict = {"a": "float64", "b": "float64", "c": "object"}
624626

625627
result = df.astype(dtype=type_dict, errors="ignore")
@@ -680,6 +682,7 @@ def test_astype_dt64tz_to_str(self, timezone_frame):
680682
],
681683
],
682684
columns=timezone_frame.columns,
685+
dtype="object",
683686
)
684687
tm.assert_frame_equal(result, expected)
685688

@@ -754,7 +757,9 @@ def test_astype_tz_object_conversion(self, tz):
754757
result = result.astype({"tz": "datetime64[ns, Europe/London]"})
755758
tm.assert_frame_equal(result, expected)
756759

757-
def test_astype_dt64_to_string(self, frame_or_series, tz_naive_fixture):
760+
def test_astype_dt64_to_string(
761+
self, frame_or_series, tz_naive_fixture, using_infer_string
762+
):
758763
# GH#41409
759764
tz = tz_naive_fixture
760765

@@ -772,7 +777,10 @@ def test_astype_dt64_to_string(self, frame_or_series, tz_naive_fixture):
772777
item = result.iloc[0]
773778
if frame_or_series is DataFrame:
774779
item = item.iloc[0]
775-
assert item is pd.NA
780+
if using_infer_string:
781+
assert item is np.nan
782+
else:
783+
assert item is pd.NA
776784

777785
# For non-NA values, we should match what we get for non-EA str
778786
alt = obj.astype(str)

pandas/tests/frame/methods/test_combine_first.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ def test_combine_first_mixed(self):
3030
combined = f.combine_first(g)
3131
tm.assert_frame_equal(combined, exp)
3232

33-
def test_combine_first(self, float_frame):
33+
def test_combine_first(self, float_frame, using_infer_string):
3434
# disjoint
3535
head, tail = float_frame[:5], float_frame[5:]
3636

@@ -76,7 +76,9 @@ def test_combine_first(self, float_frame):
7676
tm.assert_series_equal(combined["A"].reindex(g.index), g["A"])
7777

7878
# corner cases
79-
comb = float_frame.combine_first(DataFrame())
79+
warning = FutureWarning if using_infer_string else None
80+
with tm.assert_produces_warning(warning, match="empty entries"):
81+
comb = float_frame.combine_first(DataFrame())
8082
tm.assert_frame_equal(comb, float_frame)
8183

8284
comb = DataFrame().combine_first(float_frame)

0 commit comments

Comments
 (0)