diff --git a/pandas/tests/strings/test_strings.py b/pandas/tests/strings/test_strings.py index 42d81154dea0f..f77a78b8c4c49 100644 --- a/pandas/tests/strings/test_strings.py +++ b/pandas/tests/strings/test_strings.py @@ -26,16 +26,16 @@ def assert_series_or_index_equal(left, right): def test_iter(): # GH3638 strs = "google", "wikimedia", "wikipedia", "wikitravel" - ds = Series(strs) + ser = Series(strs) with tm.assert_produces_warning(FutureWarning): - for s in ds.str: + for s in ser.str: # iter must yield a Series assert isinstance(s, Series) # indices of each yielded Series should be equal to the index of # the original Series - tm.assert_index_equal(s.index, ds.index) + tm.assert_index_equal(s.index, ser.index) for el in s: # each element of the series is either a basestring/str or nan @@ -48,12 +48,12 @@ def test_iter(): def test_iter_empty(): - ds = Series([], dtype=object) + ser = Series([], dtype=object) i, s = 100, 1 with tm.assert_produces_warning(FutureWarning): - for i, s in enumerate(ds.str): + for i, s in enumerate(ser.str): pass # nothing to iterate over so nothing defined values should remain @@ -63,18 +63,18 @@ def test_iter_empty(): def test_iter_single_element(): - ds = Series(["a"]) + ser = Series(["a"]) with tm.assert_produces_warning(FutureWarning): - for i, s in enumerate(ds.str): + for i, s in enumerate(ser.str): pass assert not i - tm.assert_series_equal(ds, s) + tm.assert_series_equal(ser, s) def test_iter_object_try_string(): - ds = Series( + ser = Series( [ slice(None, np.random.randint(10), np.random.randint(10, 20)) for _ in range(4) @@ -84,7 +84,7 @@ def test_iter_object_try_string(): i, s = 100, "h" with tm.assert_produces_warning(FutureWarning): - for i, s in enumerate(ds.str): + for i, s in enumerate(ser.str): pass assert i == 100 @@ -95,44 +95,41 @@ def test_iter_object_try_string(): def test_count(): - values = np.array(["foo", "foofoo", np.nan, "foooofooofommmfoo"], dtype=np.object_) + ser = Series(["foo", "foofoo", np.nan, "foooofooofommmfoo"], dtype=np.object_) + result = ser.str.count("f[o]+") + expected = Series([1, 2, np.nan, 4]) + tm.assert_series_equal(result, expected) - result = Series(values).str.count("f[o]+") - exp = Series([1, 2, np.nan, 4]) - assert isinstance(result, Series) - tm.assert_series_equal(result, exp) - # mixed - mixed = np.array( +def test_count_mixed_object(): + ser = Series( ["a", np.nan, "b", True, datetime.today(), "foo", None, 1, 2.0], dtype=object, ) - rs = Series(mixed).str.count("a") - xp = Series([1, np.nan, 0, np.nan, np.nan, 0, np.nan, np.nan, np.nan]) - assert isinstance(rs, Series) - tm.assert_series_equal(rs, xp) + result = ser.str.count("a") + expected = Series([1, np.nan, 0, np.nan, np.nan, 0, np.nan, np.nan, np.nan]) + tm.assert_series_equal(result, expected) def test_repeat(): - values = Series(["a", "b", np.nan, "c", np.nan, "d"]) + ser = Series(["a", "b", np.nan, "c", np.nan, "d"]) - result = values.str.repeat(3) - exp = Series(["aaa", "bbb", np.nan, "ccc", np.nan, "ddd"]) - tm.assert_series_equal(result, exp) + result = ser.str.repeat(3) + expected = Series(["aaa", "bbb", np.nan, "ccc", np.nan, "ddd"]) + tm.assert_series_equal(result, expected) - result = values.str.repeat([1, 2, 3, 4, 5, 6]) - exp = Series(["a", "bb", np.nan, "cccc", np.nan, "dddddd"]) - tm.assert_series_equal(result, exp) + result = ser.str.repeat([1, 2, 3, 4, 5, 6]) + expected = Series(["a", "bb", np.nan, "cccc", np.nan, "dddddd"]) + tm.assert_series_equal(result, expected) - # mixed - mixed = Series(["a", np.nan, "b", True, datetime.today(), "foo", None, 1, 2.0]) - rs = Series(mixed).str.repeat(3) - xp = Series( +def test_repeat_mixed_object(): + ser = Series(["a", np.nan, "b", True, datetime.today(), "foo", None, 1, 2.0]) + result = ser.str.repeat(3) + expected = Series( ["aaa", np.nan, "bbb", np.nan, np.nan, "foofoofoo", np.nan, np.nan, np.nan] ) - assert isinstance(rs, Series) - tm.assert_series_equal(rs, xp) + tm.assert_series_equal(result, expected) def test_repeat_with_null(nullable_string_dtype): @@ -227,514 +224,494 @@ def test_empty_str_methods(any_string_dtype): def test_empty_str_methods_to_frame(): - empty = Series(dtype=str) - empty_df = DataFrame() - tm.assert_frame_equal(empty_df, empty.str.partition("a")) - tm.assert_frame_equal(empty_df, empty.str.rpartition("a")) + ser = Series(dtype=str) + expected = DataFrame() + + result = ser.str.partition("a") + tm.assert_frame_equal(result, expected) + + result = ser.str.rpartition("a") + tm.assert_frame_equal(result, expected) -def test_ismethods(any_string_dtype): +@pytest.mark.parametrize( + "method, expected", + [ + ("isalnum", [True, True, True, True, True, False, True, True, False, False]), + ("isalpha", [True, True, True, False, False, False, True, False, False, False]), + ( + "isdigit", + [False, False, False, True, False, False, False, True, False, False], + ), + ( + "isnumeric", + [False, False, False, True, False, False, False, True, False, False], + ), + ( + "isspace", + [False, False, False, False, False, False, False, False, False, True], + ), + ( + "islower", + [False, True, False, False, False, False, False, False, False, False], + ), + ( + "isupper", + [True, False, False, False, True, False, True, False, False, False], + ), + ( + "istitle", + [True, False, True, False, True, False, False, False, False, False], + ), + ], +) +def test_ismethods(method, expected, any_string_dtype): values = ["A", "b", "Xy", "4", "3A", "", "TT", "55", "-", " "] - str_s = Series(values, dtype=any_string_dtype) - alnum_e = [True, True, True, True, True, False, True, True, False, False] - alpha_e = [True, True, True, False, False, False, True, False, False, False] - digit_e = [False, False, False, True, False, False, False, True, False, False] - - # TODO: unused - num_e = [ # noqa - False, - False, - False, - True, - False, - False, - False, - True, - False, - False, - ] - - space_e = [False, False, False, False, False, False, False, False, False, True] - lower_e = [False, True, False, False, False, False, False, False, False, False] - upper_e = [True, False, False, False, True, False, True, False, False, False] - title_e = [True, False, True, False, True, False, False, False, False, False] - - dtype = "bool" if any_string_dtype == "object" else "boolean" - tm.assert_series_equal(str_s.str.isalnum(), Series(alnum_e, dtype=dtype)) - tm.assert_series_equal(str_s.str.isalpha(), Series(alpha_e, dtype=dtype)) - tm.assert_series_equal(str_s.str.isdigit(), Series(digit_e, dtype=dtype)) - tm.assert_series_equal(str_s.str.isspace(), Series(space_e, dtype=dtype)) - tm.assert_series_equal(str_s.str.islower(), Series(lower_e, dtype=dtype)) - tm.assert_series_equal(str_s.str.isupper(), Series(upper_e, dtype=dtype)) - tm.assert_series_equal(str_s.str.istitle(), Series(title_e, dtype=dtype)) - - assert str_s.str.isalnum().tolist() == [v.isalnum() for v in values] - assert str_s.str.isalpha().tolist() == [v.isalpha() for v in values] - assert str_s.str.isdigit().tolist() == [v.isdigit() for v in values] - assert str_s.str.isspace().tolist() == [v.isspace() for v in values] - assert str_s.str.islower().tolist() == [v.islower() for v in values] - assert str_s.str.isupper().tolist() == [v.isupper() for v in values] - assert str_s.str.istitle().tolist() == [v.istitle() for v in values] - - -def test_isnumeric(any_string_dtype): + ser = Series(values, dtype=any_string_dtype) + + expected_dtype = "bool" if any_string_dtype == "object" else "boolean" + expected = Series(expected, dtype=expected_dtype) + result = getattr(ser.str, method)() + tm.assert_series_equal(result, expected) + + # compare with standard library + expected = [getattr(v, method)() for v in values] + result = result.tolist() + assert result == expected + + +@pytest.mark.parametrize( + "method, expected", + [ + ("isnumeric", [False, True, True, False, True, True, False]), + ("isdecimal", [False, True, False, False, False, True, False]), + ], +) +def test_isnumeric_unicode(method, expected, any_string_dtype): # 0x00bc: ¼ VULGAR FRACTION ONE QUARTER # 0x2605: ★ not number # 0x1378: ፸ ETHIOPIC NUMBER SEVENTY # 0xFF13: 3 Em 3 values = ["A", "3", "¼", "★", "፸", "3", "four"] - s = Series(values, dtype=any_string_dtype) - numeric_e = [False, True, True, False, True, True, False] - decimal_e = [False, True, False, False, False, True, False] - dtype = "bool" if any_string_dtype == "object" else "boolean" - tm.assert_series_equal(s.str.isnumeric(), Series(numeric_e, dtype=dtype)) - tm.assert_series_equal(s.str.isdecimal(), Series(decimal_e, dtype=dtype)) + ser = Series(values, dtype=any_string_dtype) + expected_dtype = "bool" if any_string_dtype == "object" else "boolean" + expected = Series(expected, dtype=expected_dtype) + result = getattr(ser.str, method)() + tm.assert_series_equal(result, expected) - unicodes = ["A", "3", "¼", "★", "፸", "3", "four"] - assert s.str.isnumeric().tolist() == [v.isnumeric() for v in unicodes] - assert s.str.isdecimal().tolist() == [v.isdecimal() for v in unicodes] + # compare with standard library + expected = [getattr(v, method)() for v in values] + result = result.tolist() + assert result == expected + +@pytest.mark.parametrize( + "method, expected", + [ + ("isnumeric", [False, np.nan, True, False, np.nan, True, False]), + ("isdecimal", [False, np.nan, False, False, np.nan, True, False]), + ], +) +def test_isnumeric_unicode_missing(method, expected, any_string_dtype): values = ["A", np.nan, "¼", "★", np.nan, "3", "four"] - s = Series(values, dtype=any_string_dtype) - numeric_e = [False, np.nan, True, False, np.nan, True, False] - decimal_e = [False, np.nan, False, False, np.nan, True, False] - dtype = "object" if any_string_dtype == "object" else "boolean" - tm.assert_series_equal(s.str.isnumeric(), Series(numeric_e, dtype=dtype)) - tm.assert_series_equal(s.str.isdecimal(), Series(decimal_e, dtype=dtype)) + ser = Series(values, dtype=any_string_dtype) + expected_dtype = "object" if any_string_dtype == "object" else "boolean" + expected = Series(expected, dtype=expected_dtype) + result = getattr(ser.str, method)() + tm.assert_series_equal(result, expected) -def test_join(): - values = Series(["a_b_c", "c_d_e", np.nan, "f_g_h"]) - result = values.str.split("_").str.join("_") - tm.assert_series_equal(values, result) +def test_spilt_join_roundtrip(): + ser = Series(["a_b_c", "c_d_e", np.nan, "f_g_h"]) + result = ser.str.split("_").str.join("_") + tm.assert_series_equal(result, ser) - # mixed - mixed = Series( - [ - "a_b", - np.nan, - "asdf_cas_asdf", - True, - datetime.today(), - "foo", - None, - 1, - 2.0, - ] - ) - rs = Series(mixed).str.split("_").str.join("_") - xp = Series( - [ - "a_b", - np.nan, - "asdf_cas_asdf", - np.nan, - np.nan, - "foo", - np.nan, - np.nan, - np.nan, - ] +def test_spilt_join_roundtrip_mixed_object(): + ser = Series( + ["a_b", np.nan, "asdf_cas_asdf", True, datetime.today(), "foo", None, 1, 2.0] ) - - assert isinstance(rs, Series) - tm.assert_almost_equal(rs, xp) + result = ser.str.split("_").str.join("_") + expected = Series( + ["a_b", np.nan, "asdf_cas_asdf", np.nan, np.nan, "foo", np.nan, np.nan, np.nan] + ) + tm.assert_series_equal(result, expected) def test_len(any_string_dtype): - values = Series( + ser = Series( ["foo", "fooo", "fooooo", np.nan, "fooooooo", "foo\n", "あ"], dtype=any_string_dtype, ) - - result = values.str.len() + result = ser.str.len() expected_dtype = "float64" if any_string_dtype == "object" else "Int64" expected = Series([3, 4, 6, np.nan, 8, 4, 1], dtype=expected_dtype) tm.assert_series_equal(result, expected) def test_len_mixed(): - mixed = Series( - [ - "a_b", - np.nan, - "asdf_cas_asdf", - True, - datetime.today(), - "foo", - None, - 1, - 2.0, - ] + ser = Series( + ["a_b", np.nan, "asdf_cas_asdf", True, datetime.today(), "foo", None, 1, 2.0] ) + result = ser.str.len() + expected = Series([3, np.nan, 13, np.nan, np.nan, 3, np.nan, np.nan, np.nan]) + tm.assert_series_equal(result, expected) - rs = Series(mixed).str.len() - xp = Series([3, np.nan, 13, np.nan, np.nan, 3, np.nan, np.nan, np.nan]) - assert isinstance(rs, Series) - tm.assert_almost_equal(rs, xp) +def test_index(index_or_series): + if index_or_series is Series: + _check = tm.assert_series_equal + else: + _check = tm.assert_index_equal + obj = index_or_series(["ABCDEFG", "BCDEFEF", "DEFGHIJEF", "EFGHEF"]) -def test_index(): - def _check(result, expected): - if isinstance(result, Series): - tm.assert_series_equal(result, expected) - else: - tm.assert_index_equal(result, expected) + result = obj.str.index("EF") + _check(result, index_or_series([4, 3, 1, 0])) + expected = np.array([v.index("EF") for v in obj.values], dtype=np.int64) + tm.assert_numpy_array_equal(result.values, expected) - for klass in [Series, Index]: - s = klass(["ABCDEFG", "BCDEFEF", "DEFGHIJEF", "EFGHEF"]) + result = obj.str.rindex("EF") + _check(result, index_or_series([4, 5, 7, 4])) + expected = np.array([v.rindex("EF") for v in obj.values], dtype=np.int64) + tm.assert_numpy_array_equal(result.values, expected) - result = s.str.index("EF") - _check(result, klass([4, 3, 1, 0])) - expected = np.array([v.index("EF") for v in s.values], dtype=np.int64) - tm.assert_numpy_array_equal(result.values, expected) + result = obj.str.index("EF", 3) + _check(result, index_or_series([4, 3, 7, 4])) + expected = np.array([v.index("EF", 3) for v in obj.values], dtype=np.int64) + tm.assert_numpy_array_equal(result.values, expected) - result = s.str.rindex("EF") - _check(result, klass([4, 5, 7, 4])) - expected = np.array([v.rindex("EF") for v in s.values], dtype=np.int64) - tm.assert_numpy_array_equal(result.values, expected) + result = obj.str.rindex("EF", 3) + _check(result, index_or_series([4, 5, 7, 4])) + expected = np.array([v.rindex("EF", 3) for v in obj.values], dtype=np.int64) + tm.assert_numpy_array_equal(result.values, expected) - result = s.str.index("EF", 3) - _check(result, klass([4, 3, 7, 4])) - expected = np.array([v.index("EF", 3) for v in s.values], dtype=np.int64) - tm.assert_numpy_array_equal(result.values, expected) + result = obj.str.index("E", 4, 8) + _check(result, index_or_series([4, 5, 7, 4])) + expected = np.array([v.index("E", 4, 8) for v in obj.values], dtype=np.int64) + tm.assert_numpy_array_equal(result.values, expected) - result = s.str.rindex("EF", 3) - _check(result, klass([4, 5, 7, 4])) - expected = np.array([v.rindex("EF", 3) for v in s.values], dtype=np.int64) - tm.assert_numpy_array_equal(result.values, expected) + result = obj.str.rindex("E", 0, 5) + _check(result, index_or_series([4, 3, 1, 4])) + expected = np.array([v.rindex("E", 0, 5) for v in obj.values], dtype=np.int64) + tm.assert_numpy_array_equal(result.values, expected) - result = s.str.index("E", 4, 8) - _check(result, klass([4, 5, 7, 4])) - expected = np.array([v.index("E", 4, 8) for v in s.values], dtype=np.int64) - tm.assert_numpy_array_equal(result.values, expected) - result = s.str.rindex("E", 0, 5) - _check(result, klass([4, 3, 1, 4])) - expected = np.array([v.rindex("E", 0, 5) for v in s.values], dtype=np.int64) - tm.assert_numpy_array_equal(result.values, expected) +def test_index_not_found(index_or_series): + obj = index_or_series(["ABCDEFG", "BCDEFEF", "DEFGHIJEF", "EFGHEF"]) + with pytest.raises(ValueError, match="substring not found"): + obj.str.index("DE") - with pytest.raises(ValueError, match="substring not found"): - result = s.str.index("DE") - msg = "expected a string object, not int" - with pytest.raises(TypeError, match=msg): - result = s.str.index(0) +def test_index_wrong_type_raises(index_or_series): + obj = index_or_series([], dtype=object) + msg = "expected a string object, not int" - with pytest.raises(TypeError, match=msg): - result = s.str.rindex(0) + with pytest.raises(TypeError, match=msg): + obj.str.index(0) - # test with nan - s = Series(["abcb", "ab", "bcbe", np.nan]) - result = s.str.index("b") - tm.assert_series_equal(result, Series([1, 1, 0, np.nan])) - result = s.str.rindex("b") - tm.assert_series_equal(result, Series([3, 1, 2, np.nan])) + with pytest.raises(TypeError, match=msg): + obj.str.rindex(0) -def test_pipe_failures(): - # #2119 - s = Series(["A|B|C"]) +def test_index_missing(): + ser = Series(["abcb", "ab", "bcbe", np.nan]) - result = s.str.split("|") - exp = Series([["A", "B", "C"]]) + result = ser.str.index("b") + expected = Series([1, 1, 0, np.nan]) + tm.assert_series_equal(result, expected) - tm.assert_series_equal(result, exp) + result = ser.str.rindex("b") + expected = Series([3, 1, 2, np.nan]) + tm.assert_series_equal(result, expected) - result = s.str.replace("|", " ", regex=False) - exp = Series(["A B C"]) - tm.assert_series_equal(result, exp) +def test_pipe_failures(): + # #2119 + ser = Series(["A|B|C"]) + + result = ser.str.split("|") + expected = Series([["A", "B", "C"]]) + tm.assert_series_equal(result, expected) + + result = ser.str.replace("|", " ", regex=False) + expected = Series(["A B C"]) + tm.assert_series_equal(result, expected) @pytest.mark.parametrize( "start, stop, step, expected", [ - (2, 5, None, Series(["foo", "bar", np.nan, "baz"])), - (0, 3, -1, Series(["", "", np.nan, ""])), - (None, None, -1, Series(["owtoofaa", "owtrabaa", np.nan, "xuqzabaa"])), - (3, 10, 2, Series(["oto", "ato", np.nan, "aqx"])), - (3, 0, -1, Series(["ofa", "aba", np.nan, "aba"])), + (2, 5, None, ["foo", "bar", np.nan, "baz"]), + (0, 3, -1, ["", "", np.nan, ""]), + (None, None, -1, ["owtoofaa", "owtrabaa", np.nan, "xuqzabaa"]), + (3, 10, 2, ["oto", "ato", np.nan, "aqx"]), + (3, 0, -1, ["ofa", "aba", np.nan, "aba"]), ], ) def test_slice(start, stop, step, expected): - values = Series(["aafootwo", "aabartwo", np.nan, "aabazqux"]) - result = values.str.slice(start, stop, step) + ser = Series(["aafootwo", "aabartwo", np.nan, "aabazqux"]) + result = ser.str.slice(start, stop, step) + expected = Series(expected) tm.assert_series_equal(result, expected) - # mixed - mixed = Series( - ["aafootwo", np.nan, "aabartwo", True, datetime.today(), None, 1, 2.0] - ) - - rs = Series(mixed).str.slice(2, 5) - xp = Series(["foo", np.nan, "bar", np.nan, np.nan, np.nan, np.nan, np.nan]) - - assert isinstance(rs, Series) - tm.assert_almost_equal(rs, xp) - rs = Series(mixed).str.slice(2, 5, -1) - xp = Series(["oof", np.nan, "rab", np.nan, np.nan, np.nan, np.nan, np.nan]) +@pytest.mark.parametrize( + "start, stop, step, expected", + [ + (2, 5, None, ["foo", np.nan, "bar", np.nan, np.nan, np.nan, np.nan, np.nan]), + (4, 1, -1, ["oof", np.nan, "rab", np.nan, np.nan, np.nan, np.nan, np.nan]), + ], +) +def test_slice_mixed_object(start, stop, step, expected): + ser = Series(["aafootwo", np.nan, "aabartwo", True, datetime.today(), None, 1, 2.0]) + result = ser.str.slice(start, stop, step) + expected = Series(expected) + tm.assert_series_equal(result, expected) def test_slice_replace(): - values = Series(["short", "a bit longer", "evenlongerthanthat", "", np.nan]) + ser = Series(["short", "a bit longer", "evenlongerthanthat", "", np.nan]) - exp = Series(["shrt", "a it longer", "evnlongerthanthat", "", np.nan]) - result = values.str.slice_replace(2, 3) - tm.assert_series_equal(result, exp) + expected = Series(["shrt", "a it longer", "evnlongerthanthat", "", np.nan]) + result = ser.str.slice_replace(2, 3) + tm.assert_series_equal(result, expected) - exp = Series(["shzrt", "a zit longer", "evznlongerthanthat", "z", np.nan]) - result = values.str.slice_replace(2, 3, "z") - tm.assert_series_equal(result, exp) + expected = Series(["shzrt", "a zit longer", "evznlongerthanthat", "z", np.nan]) + result = ser.str.slice_replace(2, 3, "z") + tm.assert_series_equal(result, expected) - exp = Series(["shzort", "a zbit longer", "evzenlongerthanthat", "z", np.nan]) - result = values.str.slice_replace(2, 2, "z") - tm.assert_series_equal(result, exp) + expected = Series(["shzort", "a zbit longer", "evzenlongerthanthat", "z", np.nan]) + result = ser.str.slice_replace(2, 2, "z") + tm.assert_series_equal(result, expected) - exp = Series(["shzort", "a zbit longer", "evzenlongerthanthat", "z", np.nan]) - result = values.str.slice_replace(2, 1, "z") - tm.assert_series_equal(result, exp) + expected = Series(["shzort", "a zbit longer", "evzenlongerthanthat", "z", np.nan]) + result = ser.str.slice_replace(2, 1, "z") + tm.assert_series_equal(result, expected) - exp = Series(["shorz", "a bit longez", "evenlongerthanthaz", "z", np.nan]) - result = values.str.slice_replace(-1, None, "z") - tm.assert_series_equal(result, exp) + expected = Series(["shorz", "a bit longez", "evenlongerthanthaz", "z", np.nan]) + result = ser.str.slice_replace(-1, None, "z") + tm.assert_series_equal(result, expected) - exp = Series(["zrt", "zer", "zat", "z", np.nan]) - result = values.str.slice_replace(None, -2, "z") - tm.assert_series_equal(result, exp) + expected = Series(["zrt", "zer", "zat", "z", np.nan]) + result = ser.str.slice_replace(None, -2, "z") + tm.assert_series_equal(result, expected) - exp = Series(["shortz", "a bit znger", "evenlozerthanthat", "z", np.nan]) - result = values.str.slice_replace(6, 8, "z") - tm.assert_series_equal(result, exp) + expected = Series(["shortz", "a bit znger", "evenlozerthanthat", "z", np.nan]) + result = ser.str.slice_replace(6, 8, "z") + tm.assert_series_equal(result, expected) - exp = Series(["zrt", "a zit longer", "evenlongzerthanthat", "z", np.nan]) - result = values.str.slice_replace(-10, 3, "z") - tm.assert_series_equal(result, exp) + expected = Series(["zrt", "a zit longer", "evenlongzerthanthat", "z", np.nan]) + result = ser.str.slice_replace(-10, 3, "z") + tm.assert_series_equal(result, expected) def test_strip_lstrip_rstrip(any_string_dtype): - values = Series([" aa ", " bb \n", np.nan, "cc "], dtype=any_string_dtype) - - result = values.str.strip() - exp = Series(["aa", "bb", np.nan, "cc"], dtype=any_string_dtype) - tm.assert_series_equal(result, exp) - - result = values.str.lstrip() - exp = Series(["aa ", "bb \n", np.nan, "cc "], dtype=any_string_dtype) - tm.assert_series_equal(result, exp) - - result = values.str.rstrip() - exp = Series([" aa", " bb", np.nan, "cc"], dtype=any_string_dtype) - tm.assert_series_equal(result, exp) + ser = Series([" aa ", " bb \n", np.nan, "cc "], dtype=any_string_dtype) + result = ser.str.strip() + expected = Series(["aa", "bb", np.nan, "cc"], dtype=any_string_dtype) + tm.assert_series_equal(result, expected) -def test_strip_lstrip_rstrip_mixed(): - # mixed - mixed = Series([" aa ", np.nan, " bb \t\n", True, datetime.today(), None, 1, 2.0]) + result = ser.str.lstrip() + expected = Series(["aa ", "bb \n", np.nan, "cc "], dtype=any_string_dtype) + tm.assert_series_equal(result, expected) - rs = Series(mixed).str.strip() - xp = Series(["aa", np.nan, "bb", np.nan, np.nan, np.nan, np.nan, np.nan]) + result = ser.str.rstrip() + expected = Series([" aa", " bb", np.nan, "cc"], dtype=any_string_dtype) + tm.assert_series_equal(result, expected) - assert isinstance(rs, Series) - tm.assert_almost_equal(rs, xp) - rs = Series(mixed).str.lstrip() - xp = Series(["aa ", np.nan, "bb \t\n", np.nan, np.nan, np.nan, np.nan, np.nan]) +def test_strip_lstrip_rstrip_mixed_object(): + ser = Series([" aa ", np.nan, " bb \t\n", True, datetime.today(), None, 1, 2.0]) - assert isinstance(rs, Series) - tm.assert_almost_equal(rs, xp) + result = ser.str.strip() + expected = Series(["aa", np.nan, "bb", np.nan, np.nan, np.nan, np.nan, np.nan]) + tm.assert_series_equal(result, expected) - rs = Series(mixed).str.rstrip() - xp = Series([" aa", np.nan, " bb", np.nan, np.nan, np.nan, np.nan, np.nan]) + result = ser.str.lstrip() + expected = Series( + ["aa ", np.nan, "bb \t\n", np.nan, np.nan, np.nan, np.nan, np.nan] + ) + tm.assert_series_equal(result, expected) - assert isinstance(rs, Series) - tm.assert_almost_equal(rs, xp) + result = ser.str.rstrip() + expected = Series([" aa", np.nan, " bb", np.nan, np.nan, np.nan, np.nan, np.nan]) + tm.assert_series_equal(result, expected) def test_strip_lstrip_rstrip_args(any_string_dtype): - values = Series(["xxABCxx", "xx BNSD", "LDFJH xx"], dtype=any_string_dtype) + ser = Series(["xxABCxx", "xx BNSD", "LDFJH xx"], dtype=any_string_dtype) - rs = values.str.strip("x") - xp = Series(["ABC", " BNSD", "LDFJH "], dtype=any_string_dtype) - tm.assert_series_equal(rs, xp) + result = ser.str.strip("x") + expected = Series(["ABC", " BNSD", "LDFJH "], dtype=any_string_dtype) + tm.assert_series_equal(result, expected) - rs = values.str.lstrip("x") - xp = Series(["ABCxx", " BNSD", "LDFJH xx"], dtype=any_string_dtype) - tm.assert_series_equal(rs, xp) + result = ser.str.lstrip("x") + expected = Series(["ABCxx", " BNSD", "LDFJH xx"], dtype=any_string_dtype) + tm.assert_series_equal(result, expected) - rs = values.str.rstrip("x") - xp = Series(["xxABC", "xx BNSD", "LDFJH "], dtype=any_string_dtype) - tm.assert_series_equal(rs, xp) + result = ser.str.rstrip("x") + expected = Series(["xxABC", "xx BNSD", "LDFJH "], dtype=any_string_dtype) + tm.assert_series_equal(result, expected) def test_string_slice_get_syntax(): - s = Series( - [ - "YYY", - "B", - "C", - "YYYYYYbYYY", - "BYYYcYYY", - np.nan, - "CYYYBYYY", - "dog", - "cYYYt", - ] + ser = Series( + ["YYY", "B", "C", "YYYYYYbYYY", "BYYYcYYY", np.nan, "CYYYBYYY", "dog", "cYYYt"] ) - result = s.str[0] - expected = s.str.get(0) + result = ser.str[0] + expected = ser.str.get(0) tm.assert_series_equal(result, expected) - result = s.str[:3] - expected = s.str.slice(stop=3) + result = ser.str[:3] + expected = ser.str.slice(stop=3) tm.assert_series_equal(result, expected) - result = s.str[2::-1] - expected = s.str.slice(start=2, step=-1) + result = ser.str[2::-1] + expected = ser.str.slice(start=2, step=-1) tm.assert_series_equal(result, expected) def test_string_slice_out_of_bounds(): - s = Series([(1, 2), (1,), (3, 4, 5)]) - - result = s.str[1] + ser = Series([(1, 2), (1,), (3, 4, 5)]) + result = ser.str[1] expected = Series([2, np.nan, 4]) - tm.assert_series_equal(result, expected) - s = Series(["foo", "b", "ba"]) - result = s.str[1] + ser = Series(["foo", "b", "ba"]) + result = ser.str[1] expected = Series(["o", np.nan, "a"]) tm.assert_series_equal(result, expected) def test_encode_decode(): - base = Series(["a", "b", "a\xe4"]) - series = base.str.encode("utf-8") - - f = lambda x: x.decode("utf-8") - result = series.str.decode("utf-8") - exp = series.map(f) - - tm.assert_series_equal(result, exp) + ser = Series(["a", "b", "a\xe4"]).str.encode("utf-8") + result = ser.str.decode("utf-8") + expected = ser.map(lambda x: x.decode("utf-8")) + tm.assert_series_equal(result, expected) -def test_encode_decode_errors(): - encodeBase = Series(["a", "b", "a\x9d"]) +def test_encode_errors_kwarg(): + ser = Series(["a", "b", "a\x9d"]) msg = ( r"'charmap' codec can't encode character '\\x9d' in position 1: " "character maps to " ) with pytest.raises(UnicodeEncodeError, match=msg): - encodeBase.str.encode("cp1252") + ser.str.encode("cp1252") + + result = ser.str.encode("cp1252", "ignore") + expected = ser.map(lambda x: x.encode("cp1252", "ignore")) + tm.assert_series_equal(result, expected) - f = lambda x: x.encode("cp1252", "ignore") - result = encodeBase.str.encode("cp1252", "ignore") - exp = encodeBase.map(f) - tm.assert_series_equal(result, exp) - decodeBase = Series([b"a", b"b", b"a\x9d"]) +def test_decode_errors_kwarg(): + ser = Series([b"a", b"b", b"a\x9d"]) msg = ( "'charmap' codec can't decode byte 0x9d in position 1: " "character maps to " ) with pytest.raises(UnicodeDecodeError, match=msg): - decodeBase.str.decode("cp1252") - - f = lambda x: x.decode("cp1252", "ignore") - result = decodeBase.str.decode("cp1252", "ignore") - exp = decodeBase.map(f) + ser.str.decode("cp1252") - tm.assert_series_equal(result, exp) - - -def test_normalize(): - values = ["ABC", "ABC", "123", np.nan, "アイエ"] - s = Series(values, index=["a", "b", "c", "d", "e"]) - - normed = ["ABC", "ABC", "123", np.nan, "アイエ"] - expected = Series(normed, index=["a", "b", "c", "d", "e"]) - - result = s.str.normalize("NFKC") + result = ser.str.decode("cp1252", "ignore") + expected = ser.map(lambda x: x.decode("cp1252", "ignore")) tm.assert_series_equal(result, expected) - expected = Series( - ["ABC", "ABC", "123", np.nan, "アイエ"], index=["a", "b", "c", "d", "e"] - ) - result = s.str.normalize("NFC") +@pytest.mark.parametrize( + "form, expected", + [ + ("NFKC", ["ABC", "ABC", "123", np.nan, "アイエ"]), + ("NFC", ["ABC", "ABC", "123", np.nan, "アイエ"]), + ], +) +def test_normalize(form, expected): + ser = Series(["ABC", "ABC", "123", np.nan, "アイエ"], index=["a", "b", "c", "d", "e"]) + expected = Series(expected, index=["a", "b", "c", "d", "e"]) + result = ser.str.normalize(form) tm.assert_series_equal(result, expected) + +def test_normalize_bad_arg_raises(): + ser = Series(["ABC", "ABC", "123", np.nan, "アイエ"], index=["a", "b", "c", "d", "e"]) with pytest.raises(ValueError, match="invalid normalization form"): - s.str.normalize("xxx") + ser.str.normalize("xxx") + - s = Index(["ABC", "123", "アイエ"]) +def test_normalize_index(): + idx = Index(["ABC", "123", "アイエ"]) expected = Index(["ABC", "123", "アイエ"]) - result = s.str.normalize("NFKC") + result = idx.str.normalize("NFKC") tm.assert_index_equal(result, expected) -def test_index_str_accessor_visibility(): - from pandas.core.strings import StringMethods - - cases = [ +@pytest.mark.parametrize( + "values,inferred_type", + [ (["a", "b"], "string"), (["a", "b", 1], "mixed-integer"), (["a", "b", 1.3], "mixed"), (["a", "b", 1.3, 1], "mixed-integer"), (["aa", datetime(2011, 1, 1)], "mixed"), - ] - for values, tp in cases: - idx = Index(values) - assert isinstance(Series(values).str, StringMethods) - assert isinstance(idx.str, StringMethods) - assert idx.inferred_type == tp - - for values, tp in cases: - idx = Index(values) - assert isinstance(Series(values).str, StringMethods) - assert isinstance(idx.str, StringMethods) - assert idx.inferred_type == tp - - cases = [ + ], +) +def test_index_str_accessor_visibility(values, inferred_type, index_or_series): + from pandas.core.strings import StringMethods + + obj = index_or_series(values) + if index_or_series is Index: + assert obj.inferred_type == inferred_type + + assert isinstance(obj.str, StringMethods) + + +@pytest.mark.parametrize( + "values,inferred_type", + [ ([1, np.nan], "floating"), ([datetime(2011, 1, 1)], "datetime64"), ([timedelta(1)], "timedelta64"), - ] - for values, tp in cases: - idx = Index(values) - message = "Can only use .str accessor with string values" - with pytest.raises(AttributeError, match=message): - Series(values).str - with pytest.raises(AttributeError, match=message): - idx.str - assert idx.inferred_type == tp + ], +) +def test_index_str_accessor_non_string_values_raises( + values, inferred_type, index_or_series +): + obj = index_or_series(values) + if index_or_series is Index: + assert obj.inferred_type == inferred_type + msg = "Can only use .str accessor with string values" + with pytest.raises(AttributeError, match=msg): + obj.str + + +def test_index_str_accessor_multiindex_raises(): # MultiIndex has mixed dtype, but not allow to use accessor idx = MultiIndex.from_tuples([("a", "b"), ("a", "b")]) assert idx.inferred_type == "mixed" - message = "Can only use .str accessor with Index, not MultiIndex" - with pytest.raises(AttributeError, match=message): + + msg = "Can only use .str accessor with Index, not MultiIndex" + with pytest.raises(AttributeError, match=msg): idx.str def test_str_accessor_no_new_attributes(): # https://github.com/pandas-dev/pandas/issues/10673 - s = Series(list("aabbcde")) + ser = Series(list("aabbcde")) with pytest.raises(AttributeError, match="You cannot add any new attribute"): - s.str.xlabel = "a" + ser.str.xlabel = "a" -def test_method_on_bytes(): +def test_cat_on_bytes_raises(): lhs = Series(np.array(list("abc"), "S1").astype(object)) rhs = Series(np.array(list("def"), "S1").astype(object)) - with pytest.raises(TypeError, match="Cannot use .str.cat with values of.*"): + msg = "Cannot use .str.cat with values of inferred dtype 'bytes'" + with pytest.raises(TypeError, match=msg): lhs.str.cat(rhs)