-
-
Notifications
You must be signed in to change notification settings - Fork 18.5k
Fix StringArray.astype for category dtype #40450
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 4 commits
03b8b47
333e24b
c29c9dd
0c7bc59
3777066
94a8b58
f71afa2
d8ff716
c0f007d
57727df
857d2e2
98747f4
21c55d6
d9bb52d
61a7d81
b93d628
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -350,6 +350,39 @@ def test_astype_bytes(self): | |
assert result.dtypes == np.dtype("S3") | ||
|
||
|
||
class TestAstypeString:
    """Round-trip tests for casting a Series through the ``"string"`` dtype."""

    @pytest.mark.parametrize(
        "data, dtype",
        [
            (["A", NA], "category"),
            (["2020-10-10", "2020-10-10"], "datetime64[ns]"),
            (["2020-10-10", "2020-10-10", NaT], "datetime64[ns]"),
            (
                ["2012-01-01 00:00:00-05:00", NaT],
                "datetime64[ns, US/Eastern]",
            ),
            ([1, None], "UInt16"),
            (["1/1/2021", "2/1/2021"], "period[M]"),
            (["1/1/2021", "2/1/2021", NaT], "period[M]"),
            (["1 Day", "59 Days", NaT], "timedelta64[ns]"),
            # currently no way to parse BooleanArray, IntervalArray from a
            # list of strings
        ],
    )
    def test_astype_string_to_extension_dtype_roundtrip(self, data, dtype, request):
        """
        Check that ``Series -> "string" -> original dtype`` returns the input.

        Parameters
        ----------
        data : list
            Values used to build the Series under test.
        dtype : str
            Target dtype the Series is built with and cast back to.
        request : pytest.FixtureRequest
            Used to attach ``xfail`` markers to the known-failing cases.
        """
        # Compare with ``==``: the original ``dtype in ("timedelta64[ns]")``
        # had no trailing comma, so the parentheses did not build a tuple and
        # the check was a substring test against a plain string.
        if dtype == "timedelta64[ns]":
            mark = pytest.mark.xfail(reason="TODO fix is_extension_array_dtype GH40478")
            request.node.add_marker(mark)
        if NaT in data and dtype in ("period[M]", "datetime64[ns]"):
            # Follow-up for this missing-value conversion is tracked in GH-40566.
            mark = pytest.mark.xfail(
                reason="TODO StringArray.astype() None to dtype.na_value conversion"
            )
            request.node.add_marker(mark)
        # GH-40351
        s = Series(data, dtype=dtype)
        tm.assert_series_equal(s, s.astype("string").astype(dtype))
|
||
|
||
class TestAstypeCategorical: | ||
def test_astype_categorical_to_other(self): | ||
cat = Categorical([f"{i} - {i + 499}" for i in range(0, 10000, 500)]) | ||
|
@@ -470,6 +503,18 @@ def test_astype_categories_raises(self): | |
with pytest.raises(TypeError, match="got an unexpected"): | ||
s.astype("category", categories=["a", "b"], ordered=True) | ||
|
||
def test_astype_str_to_extension_dtype(self):
    """
    Check that a ``"string"`` Series can be cast to extension dtypes.

    Covers the ``"category"`` and ``"period[M]"`` targets; each result is
    compared against a Series built directly with the target dtype.
    """
    # GH-40351
    # Review note: coverage for further extension dtypes (e.g. timedelta,
    # tz-aware datetime) was requested and added as a separate test.

    # ``np.nan`` instead of the ``np.NaN`` alias, which NumPy 2.0 removed.
    s = Series(["A", np.nan], dtype="string")
    result = s.astype("category")
    expected = Series(["A", np.nan], dtype="category")
    tm.assert_series_equal(result, expected)

    s = Series(["1/1/2021", "2/1/2021"], dtype="string")
    result = s.astype("period[M]")
    expected = Series(["1/1/2021", "2/1/2021"], dtype="period[M]")
    tm.assert_series_equal(result, expected)
|
||
@pytest.mark.parametrize("items", [["a", "b", "c", "a"], [1, 2, 3, 1]]) | ||
def test_astype_from_categorical(self, items): | ||
ser = Series(items) | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
BooleanArray can be parsed from string (see _from_sequence_of_strings, the general method)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I had an implementation that used `_from_sequence_of_strings` instead of `_from_sequence` in `StringArray.astype()`. That required bigger code changes. I'd like to merge this regression PR first and then implement `_from_sequence_of_strings` as part of #40566.