diff --git a/doc/source/api.rst b/doc/source/api.rst index 2d9fc0df5347d..364b3ba04aefb 100644 --- a/doc/source/api.rst +++ b/doc/source/api.rst @@ -544,10 +544,12 @@ strings and apply several methods to it. These can be acccessed like Series.str.match Series.str.normalize Series.str.pad + Series.str.partition Series.str.repeat Series.str.replace Series.str.rfind Series.str.rjust + Series.str.rpartition Series.str.rstrip Series.str.slice Series.str.slice_replace diff --git a/doc/source/text.rst b/doc/source/text.rst index 359b6d61dbb64..bb27fe52ba7a5 100644 --- a/doc/source/text.rst +++ b/doc/source/text.rst @@ -262,6 +262,8 @@ Method Summary :meth:`~Series.str.strip`,Equivalent to ``str.strip`` :meth:`~Series.str.rstrip`,Equivalent to ``str.rstrip`` :meth:`~Series.str.lstrip`,Equivalent to ``str.lstrip`` + :meth:`~Series.str.partition`,Equivalent to ``str.partition`` + :meth:`~Series.str.rpartition`,Equivalent to ``str.rpartition`` :meth:`~Series.str.lower`,Equivalent to ``str.lower`` :meth:`~Series.str.upper`,Equivalent to ``str.upper`` :meth:`~Series.str.find`,Equivalent to ``str.find`` diff --git a/doc/source/whatsnew/v0.16.1.txt b/doc/source/whatsnew/v0.16.1.txt index 1c2dbaa48832b..493f299b2bf32 100755 --- a/doc/source/whatsnew/v0.16.1.txt +++ b/doc/source/whatsnew/v0.16.1.txt @@ -42,6 +42,7 @@ Enhancements - Added ``StringMethods`` (.str accessor) to ``Index`` (:issue:`9068`) - Added ``StringMethods.normalize()`` which behaves the same as standard :func:`unicodedata.normalizes` (:issue:`10031`) +- Added ``StringMethods.partition()`` and ``rpartition()`` which behave the same as standard ``str`` methods (:issue:`9773`) - Allow clip, clip_lower, and clip_upper to accept array-like arguments as thresholds (:issue:`6966`). These methods now have an ``axis`` parameter which determines how the Series or DataFrame will be aligned with the threshold(s). The ``.str`` accessor is now available for both ``Series`` and ``Index``. 
diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 5cea4c4afe8cc..62e9e0fbc41ae 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -992,6 +992,8 @@ def __iter__(self): g = self.get(i) def _wrap_result(self, result): + # leave as it is to keep extract and get_dummies results + # can be merged to _wrap_result_expand in v0.17 from pandas.core.series import Series from pandas.core.frame import DataFrame from pandas.core.index import Index @@ -1012,6 +1014,33 @@ def _wrap_result(self, result): assert result.ndim < 3 return DataFrame(result, index=self.series.index) + def _wrap_result_expand(self, result, expand=False): + from pandas.core.index import Index + if not hasattr(result, 'ndim'): + return result + + if isinstance(self.series, Index): + name = getattr(result, 'name', None) + # if result is a boolean np.array, return the np.array + # instead of wrapping it into a boolean Index (GH 8875) + if hasattr(result, 'dtype') and is_bool_dtype(result): + return result + + if expand: + result = list(result) + return Index(result, name=name) + else: + index = self.series.index + if expand: + cons_row = self.series._constructor + cons = self.series._constructor_expanddim + data = [cons_row(x) for x in result] + return cons(data, index=index) + else: + name = getattr(result, 'name', None) + cons = self.series._constructor + return cons(result, name=name, index=index) + @copy(str_cat) def cat(self, others=None, sep=None, na_rep=None): result = str_cat(self.series, others=others, sep=sep, na_rep=na_rep) @@ -1022,6 +1051,65 @@ def split(self, pat=None, n=-1, return_type='series'): result = str_split(self.series, pat, n=n, return_type=return_type) return self._wrap_result(result) + _shared_docs['str_partition'] = (""" + Split the string at the %(side)s occurrence of `pat`, and return 3 elements + containing the part before the separator, the separator itself, + and the part after the separator. + If the separator is not found, return %(return)s. 
+ + Parameters + ---------- + pat : string, default ' ' + String to split on. + expand : bool, default True + * If True, return DataFrame/MultiIndex expanding dimensionality. + * If False, return Series/Index + + Returns + ------- + split : DataFrame/MultiIndex or Series/Index of objects + + See Also + -------- + %(also)s + + Examples + -------- + + >>> s = Series(['A_B_C', 'D_E_F', 'X']) + 0 A_B_C + 1 D_E_F + 2 X + dtype: object + + >>> s.str.partition('_') + 0 1 2 + 0 A _ B_C + 1 D _ E_F + 2 X + + >>> s.str.rpartition('_') + 0 1 2 + 0 A_B _ C + 1 D_E _ F + 2 X + """) + @Appender(_shared_docs['str_partition'] % {'side': 'first', + 'return': '3 elements containing the string itself, followed by two empty strings', + 'also': 'rpartition : Split the string at the last occurrence of `pat`'}) + def partition(self, pat=' ', expand=True): + f = lambda x: x.partition(pat) + result = _na_map(f, self.series) + return self._wrap_result_expand(result, expand=expand) + + @Appender(_shared_docs['str_partition'] % {'side': 'last', + 'return': '3 elements containing two empty strings, followed by the string itself', + 'also': 'partition : Split the string at the first occurrence of `pat`'}) + def rpartition(self, pat=' ', expand=True): + f = lambda x: x.rpartition(pat) + result = _na_map(f, self.series) + return self._wrap_result_expand(result, expand=expand) + @copy(str_get) def get(self, i): result = str_get(self.series, i) diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index d3875f0675e9f..1f84e1dc4d155 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -664,6 +664,8 @@ def test_empty_str_methods(self): tm.assert_series_equal(empty_str, empty.str.pad(42)) tm.assert_series_equal(empty_str, empty.str.center(42)) tm.assert_series_equal(empty_list, empty.str.split('a')) + tm.assert_series_equal(empty_list, empty.str.partition('a', expand=False)) + tm.assert_series_equal(empty_list, empty.str.rpartition('a', 
expand=False)) tm.assert_series_equal(empty_str, empty.str.slice(stop=1)) tm.assert_series_equal(empty_str, empty.str.slice(step=1)) tm.assert_series_equal(empty_str, empty.str.strip()) @@ -687,6 +689,12 @@ def test_empty_str_methods(self): tm.assert_series_equal(empty_str, empty.str.swapcase()) tm.assert_series_equal(empty_str, empty.str.normalize('NFC')) + def test_empty_str_methods_to_frame(self): + empty_str = empty = Series(dtype=str) + empty_df = DataFrame([]) + tm.assert_frame_equal(empty_df, empty.str.partition('a')) + tm.assert_frame_equal(empty_df, empty.str.rpartition('a')) + def test_ismethods(self): values = ['A', 'b', 'Xy', '4', '3A', '', 'TT', '55', '-', ' '] str_s = Series(values) @@ -1175,6 +1183,119 @@ def test_split_to_dataframe(self): with tm.assertRaisesRegexp(ValueError, "return_type must be"): s.str.split('_', return_type="some_invalid_type") + def test_partition_series(self): + values = Series(['a_b_c', 'c_d_e', NA, 'f_g_h']) + + result = values.str.partition('_', expand=False) + exp = Series([['a', '_', 'b_c'], ['c', '_', 'd_e'], NA, ['f', '_', 'g_h']]) + tm.assert_series_equal(result, exp) + + result = values.str.rpartition('_', expand=False) + exp = Series([['a_b', '_', 'c'], ['c_d', '_', 'e'], NA, ['f_g', '_', 'h']]) + tm.assert_series_equal(result, exp) + + # more than one char + values = Series(['a__b__c', 'c__d__e', NA, 'f__g__h']) + result = values.str.partition('__', expand=False) + exp = Series([['a', '__', 'b__c'], ['c', '__', 'd__e'], NA, ['f', '__', 'g__h']]) + tm.assert_series_equal(result, exp) + + result = values.str.rpartition('__', expand=False) + exp = Series([['a__b', '__', 'c'], ['c__d', '__', 'e'], NA, ['f__g', '__', 'h']]) + tm.assert_series_equal(result, exp) + + # None + values = Series(['a b c', 'c d e', NA, 'f g h']) + result = values.str.partition(expand=False) + exp = Series([['a', ' ', 'b c'], ['c', ' ', 'd e'], NA, ['f', ' ', 'g h']]) + tm.assert_series_equal(result, exp) + + result = 
values.str.rpartition(expand=False) + exp = Series([['a b', ' ', 'c'], ['c d', ' ', 'e'], NA, ['f g', ' ', 'h']]) + tm.assert_series_equal(result, exp) + + # Not split + values = Series(['abc', 'cde', NA, 'fgh']) + result = values.str.partition('_', expand=False) + exp = Series([['abc', '', ''], ['cde', '', ''], NA, ['fgh', '', '']]) + tm.assert_series_equal(result, exp) + + result = values.str.rpartition('_', expand=False) + exp = Series([['', '', 'abc'], ['', '', 'cde'], NA, ['', '', 'fgh']]) + tm.assert_series_equal(result, exp) + + # unicode + values = Series([u('a_b_c'), u('c_d_e'), NA, u('f_g_h')]) + + result = values.str.partition('_', expand=False) + exp = Series([[u('a'), u('_'), u('b_c')], [u('c'), u('_'), u('d_e')], + NA, [u('f'), u('_'), u('g_h')]]) + tm.assert_series_equal(result, exp) + + result = values.str.rpartition('_', expand=False) + exp = Series([[u('a_b'), u('_'), u('c')], [u('c_d'), u('_'), u('e')], + NA, [u('f_g'), u('_'), u('h')]]) + tm.assert_series_equal(result, exp) + + # compare to standard lib + values = Series(['A_B_C', 'B_C_D', 'E_F_G', 'EFGHEF']) + result = values.str.partition('_', expand=False).tolist() + self.assertEqual(result, [v.partition('_') for v in values]) + result = values.str.rpartition('_', expand=False).tolist() + self.assertEqual(result, [v.rpartition('_') for v in values]) + + def test_partition_index(self): + values = Index(['a_b_c', 'c_d_e', 'f_g_h']) + + result = values.str.partition('_', expand=False) + exp = Index(np.array([('a', '_', 'b_c'), ('c', '_', 'd_e'), ('f', '_', 'g_h')])) + tm.assert_index_equal(result, exp) + self.assertEqual(result.nlevels, 1) + + result = values.str.rpartition('_', expand=False) + exp = Index(np.array([('a_b', '_', 'c'), ('c_d', '_', 'e'), ('f_g', '_', 'h')])) + tm.assert_index_equal(result, exp) + self.assertEqual(result.nlevels, 1) + + result = values.str.partition('_') + exp = Index([('a', '_', 'b_c'), ('c', '_', 'd_e'), ('f', '_', 'g_h')]) + tm.assert_index_equal(result, exp) 
+ self.assertTrue(isinstance(result, MultiIndex)) + self.assertEqual(result.nlevels, 3) + + result = values.str.rpartition('_') + exp = Index([('a_b', '_', 'c'), ('c_d', '_', 'e'), ('f_g', '_', 'h')]) + tm.assert_index_equal(result, exp) + self.assertTrue(isinstance(result, MultiIndex)) + self.assertEqual(result.nlevels, 3) + + def test_partition_to_dataframe(self): + values = Series(['a_b_c', 'c_d_e', NA, 'f_g_h']) + result = values.str.partition('_') + exp = DataFrame({0: ['a', 'c', np.nan, 'f'], + 1: ['_', '_', np.nan, '_'], + 2: ['b_c', 'd_e', np.nan, 'g_h']}) + tm.assert_frame_equal(result, exp) + + result = values.str.rpartition('_') + exp = DataFrame({0: ['a_b', 'c_d', np.nan, 'f_g'], + 1: ['_', '_', np.nan, '_'], + 2: ['c', 'e', np.nan, 'h']}) + tm.assert_frame_equal(result, exp) + + values = Series(['a_b_c', 'c_d_e', NA, 'f_g_h']) + result = values.str.partition('_', expand=True) + exp = DataFrame({0: ['a', 'c', np.nan, 'f'], + 1: ['_', '_', np.nan, '_'], + 2: ['b_c', 'd_e', np.nan, 'g_h']}) + tm.assert_frame_equal(result, exp) + + result = values.str.rpartition('_', expand=True) + exp = DataFrame({0: ['a_b', 'c_d', np.nan, 'f_g'], + 1: ['_', '_', np.nan, '_'], + 2: ['c', 'e', np.nan, 'h']}) + tm.assert_frame_equal(result, exp) + def test_pipe_failures(self): # #2119 s = Series(['A|B|C'])