diff --git a/bigframes/core/compile/scalar_op_compiler.py b/bigframes/core/compile/scalar_op_compiler.py index 2b9208137b..e2dfa38ce1 100644 --- a/bigframes/core/compile/scalar_op_compiler.py +++ b/bigframes/core/compile/scalar_op_compiler.py @@ -456,9 +456,19 @@ def upper_op_impl(x: ibis_types.Value): return typing.cast(ibis_types.StringValue, x).upper() -@scalar_op_compiler.register_unary_op(ops.strip_op) -def strip_op_impl(x: ibis_types.Value): - return typing.cast(ibis_types.StringValue, x).strip() +@scalar_op_compiler.register_unary_op(ops.StrLstripOp, pass_op=True) +def str_lstrip_op_impl(x: ibis_types.Value, op: ops.StrLstripOp): + return str_lstrip_op(x, to_strip=op.to_strip) + + +@scalar_op_compiler.register_unary_op(ops.StrRstripOp, pass_op=True) +def str_rstrip_op_impl(x: ibis_types.Value, op: ops.StrRstripOp): + return str_rstrip_op(x, to_strip=op.to_strip) + + +@scalar_op_compiler.register_unary_op(ops.StrStripOp, pass_op=True) +def str_strip_op_impl(x: ibis_types.Value, op: ops.StrStripOp): + return str_strip_op(x, to_strip=op.to_strip) @scalar_op_compiler.register_unary_op(ops.isnumeric_op) @@ -519,16 +529,6 @@ def isupper_op_impl(x: ibis_types.Value): ).re_search(r"\p{Ll}|\p{Lt}") -@scalar_op_compiler.register_unary_op(ops.rstrip_op) -def rstrip_op_impl(x: ibis_types.Value): - return typing.cast(ibis_types.StringValue, x).rstrip() - - -@scalar_op_compiler.register_unary_op(ops.lstrip_op) -def lstrip_op_impl(x: ibis_types.Value): - return typing.cast(ibis_types.StringValue, x).lstrip() - - @scalar_op_compiler.register_unary_op(ops.capitalize_op) def capitalize_op_impl(x: ibis_types.Value): return typing.cast(ibis_types.StringValue, x).capitalize() @@ -2077,3 +2077,24 @@ def obj_make_ref(uri: str, authorizer: str) -> _OBJ_REF_IBIS_DTYPE: # type: ign @ibis_udf.scalar.builtin(name="OBJ.GET_ACCESS_URL") def obj_get_access_url(obj_ref: _OBJ_REF_IBIS_DTYPE, mode: ibis_dtypes.String) -> ibis_dtypes.JSON: # type: ignore """Get access url (as ObjectRefRumtime 
JSON) from ObjectRef.""" + + +@ibis_udf.scalar.builtin(name="ltrim") +def str_lstrip_op( # type: ignore[empty-body] + x: ibis_dtypes.String, to_strip: ibis_dtypes.String +) -> ibis_dtypes.String: + """Remove leading characters.""" + + +@ibis_udf.scalar.builtin(name="rtrim") +def str_rstrip_op( # type: ignore[empty-body] + x: ibis_dtypes.String, to_strip: ibis_dtypes.String +) -> ibis_dtypes.String: + """Remove trailing characters.""" + + +@ibis_udf.scalar.builtin(name="trim") +def str_strip_op( # type: ignore[empty-body] + x: ibis_dtypes.String, to_strip: ibis_dtypes.String +) -> ibis_dtypes.String: + """Remove leading and trailing characters.""" diff --git a/bigframes/operations/__init__.py b/bigframes/operations/__init__.py index c8ccaf2a25..0f9b64b760 100644 --- a/bigframes/operations/__init__.py +++ b/bigframes/operations/__init__.py @@ -167,11 +167,9 @@ isupper_op, len_op, lower_op, - lstrip_op, RegexReplaceStrOp, ReplaceStrOp, reverse_op, - rstrip_op, StartsWithOp, strconcat_op, StrContainsOp, @@ -180,10 +178,12 @@ StrFindOp, StrGetOp, StringSplitOp, - strip_op, + StrLstripOp, StrPadOp, StrRepeatOp, + StrRstripOp, StrSliceOp, + StrStripOp, upper_op, ZfillOp, ) @@ -237,11 +237,9 @@ "isupper_op", "len_op", "lower_op", - "lstrip_op", "RegexReplaceStrOp", "ReplaceStrOp", "reverse_op", - "rstrip_op", "StartsWithOp", "strconcat_op", "StrContainsOp", @@ -249,11 +247,13 @@ "StrExtractOp", "StrFindOp", "StrGetOp", "StringSplitOp", - "strip_op", + "StrLstripOp", "StrPadOp", "StrRepeatOp", + "StrRstripOp", "StrSliceOp", + "StrStripOp", "upper_op", "ZfillOp", # Date ops diff --git a/bigframes/operations/string_ops.py b/bigframes/operations/string_ops.py index b2ce0706ce..a2755f6654 100644 --- a/bigframes/operations/string_ops.py +++ b/bigframes/operations/string_ops.py @@ -41,10 +41,6 @@ name="upper", type_signature=op_typing.STRING_TRANSFORM ) -strip_op = base_ops.create_unary_op( - name="strip", type_signature=op_typing.STRING_TRANSFORM -) - 
isalnum_op = base_ops.create_unary_op( name="isalnum", type_signature=op_typing.STRING_PREDICATE ) @@ -77,14 +73,6 @@ name="isupper", type_signature=op_typing.STRING_PREDICATE ) -rstrip_op = base_ops.create_unary_op( - name="rstrip", type_signature=op_typing.STRING_TRANSFORM -) - -lstrip_op = base_ops.create_unary_op( - name="lstrip", type_signature=op_typing.STRING_TRANSFORM -) - capitalize_op = base_ops.create_unary_op( name="capitalize", type_signature=op_typing.STRING_TRANSFORM ) @@ -128,6 +116,33 @@ def output_type(self, *input_types): return op_typing.STRING_TRANSFORM.output_type(input_types[0]) +@dataclasses.dataclass(frozen=True) +class StrStripOp(base_ops.UnaryOp): + name: typing.ClassVar[str] = "str_strip" + to_strip: str + + def output_type(self, *input_types): + return op_typing.STRING_TRANSFORM.output_type(input_types[0]) + + +@dataclasses.dataclass(frozen=True) +class StrLstripOp(base_ops.UnaryOp): + name: typing.ClassVar[str] = "str_lstrip" + to_strip: str + + def output_type(self, *input_types): + return op_typing.STRING_TRANSFORM.output_type(input_types[0]) + + +@dataclasses.dataclass(frozen=True) +class StrRstripOp(base_ops.UnaryOp): + name: typing.ClassVar[str] = "str_rstrip" + to_strip: str + + def output_type(self, *input_types): + return op_typing.STRING_TRANSFORM.output_type(input_types[0]) + + @dataclasses.dataclass(frozen=True) class ReplaceStrOp(base_ops.UnaryOp): name: typing.ClassVar[str] = "str_replace" diff --git a/bigframes/operations/strings.py b/bigframes/operations/strings.py index 529dd87797..a8430b0b0e 100644 --- a/bigframes/operations/strings.py +++ b/bigframes/operations/strings.py @@ -91,8 +91,10 @@ def slice( ) -> series.Series: return self._apply_unary_op(ops.StrSliceOp(start=start, end=stop)) - def strip(self) -> series.Series: - return self._apply_unary_op(ops.strip_op) + def strip(self, to_strip: Optional[str] = None) -> series.Series: + return self._apply_unary_op( + ops.StrStripOp(to_strip=" \n\t" if to_strip is None 
else to_strip) + ) def upper(self) -> series.Series: return self._apply_unary_op(ops.upper_op) @@ -135,11 +137,15 @@ def isupper( ) -> series.Series: return self._apply_unary_op(ops.isupper_op) - def rstrip(self) -> series.Series: - return self._apply_unary_op(ops.rstrip_op) + def rstrip(self, to_strip: Optional[str] = None) -> series.Series: + return self._apply_unary_op( + ops.StrRstripOp(to_strip=" \n\t" if to_strip is None else to_strip) + ) - def lstrip(self) -> series.Series: - return self._apply_unary_op(ops.lstrip_op) + def lstrip(self, to_strip: Optional[str] = None) -> series.Series: + return self._apply_unary_op( + ops.StrLstripOp(to_strip=" \n\t" if to_strip is None else to_strip) + ) def repeat(self, repeats: int) -> series.Series: return self._apply_unary_op(ops.StrRepeatOp(repeats=repeats)) diff --git a/tests/system/small/operations/test_strings.py b/tests/system/small/operations/test_strings.py index bb328360ee..e4824875b4 100644 --- a/tests/system/small/operations/test_strings.py +++ b/tests/system/small/operations/test_strings.py @@ -265,6 +265,28 @@ def test_strip(scalars_dfs): ) +@pytest.mark.parametrize( + ("to_strip"), + [ + pytest.param(None, id="none"), + pytest.param(" ", id="space"), + pytest.param(" \n", id="space_newline"), + pytest.param("123.!? \n\t", id="multiple_chars"), + ], +) +def test_strip_w_to_strip(to_strip): + s = bpd.Series(["1. Ant. ", "2. Bee!\n", "3. Cat?\t", bpd.NA]) + pd_s = s.to_pandas() + + bf_result = s.str.strip(to_strip=to_strip).to_pandas() + pd_result = pd_s.str.strip(to_strip=to_strip) + + assert_series_equal( + pd_result, + bf_result, + ) + + def test_upper(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs col_name = "string_col" @@ -387,6 +409,28 @@ def test_rstrip(scalars_dfs): ) +@pytest.mark.parametrize( + ("to_strip"), + [ + pytest.param(None, id="none"), + pytest.param(" ", id="space"), + pytest.param(" \n", id="space_newline"), + pytest.param("123.!? 
\n\t", id="multiple_chars"), + ], +) +def test_rstrip_w_to_strip(to_strip): + s = bpd.Series(["1. Ant. ", "2. Bee!\n", "3. Cat?\t", bpd.NA]) + pd_s = s.to_pandas() + + bf_result = s.str.rstrip(to_strip=to_strip).to_pandas() + pd_result = pd_s.str.rstrip(to_strip=to_strip) + + assert_series_equal( + pd_result, + bf_result, + ) + + def test_lstrip(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs col_name = "string_col" @@ -400,6 +444,28 @@ def test_lstrip(scalars_dfs): ) +@pytest.mark.parametrize( + ("to_strip"), + [ + pytest.param(None, id="none"), + pytest.param(" ", id="space"), + pytest.param(" \n", id="space_newline"), + pytest.param("123.!? \n\t", id="multiple_chars"), + ], +) +def test_lstrip_w_to_strip(to_strip): + s = bpd.Series(["1. Ant. ", "2. Bee!\n", "3. Cat?\t", bpd.NA]) + pd_s = s.to_pandas() + + bf_result = s.str.lstrip(to_strip=to_strip).to_pandas() + pd_result = pd_s.str.lstrip(to_strip=to_strip) + + assert_series_equal( + pd_result, + bf_result, + ) + + @pytest.mark.parametrize(["repeats"], [(5,), (0,), (1,)]) def test_repeat(scalars_dfs, repeats): scalars_df, scalars_pandas_df = scalars_dfs diff --git a/third_party/bigframes_vendored/pandas/core/strings/accessor.py b/third_party/bigframes_vendored/pandas/core/strings/accessor.py index bd5e78f415..9f3d87ecb7 100644 --- a/third_party/bigframes_vendored/pandas/core/strings/accessor.py +++ b/third_party/bigframes_vendored/pandas/core/strings/accessor.py @@ -239,7 +239,7 @@ def slice(self, start=None, stop=None): raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) - def strip(self): + def strip(self, to_strip: typing.Optional[str] = None): """Remove leading and trailing characters. Strip whitespaces (including newlines) or a set of specified characters @@ -252,22 +252,35 @@ def strip(self): >>> import bigframes.pandas as bpd >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series(['Ant', ' Bee ', '\\tCat\\n', bpd.NA]) + >>> s = bpd.Series(['1. Ant.', ' 2. Bee? ', '\\t3. 
Cat!\\n', bpd.NA]) >>> s - 0 Ant - 1 Bee - 2 Cat + 0 1. Ant. + 1 2. Bee? + 2 3. Cat! 3 dtype: string >>> s.str.strip() + 0 1. Ant. + 1 2. Bee? + 2 3. Cat! + 3 + dtype: string + + >>> s.str.strip('123.!? \\n\\t') 0 Ant 1 Bee 2 Cat 3 dtype: string + Args: + to_strip (str, default None): + Specifying the set of characters to be removed. All combinations + of this set of characters will be stripped. If None then + whitespaces are removed. + Returns: bigframes.series.Series: Series or Index without leading and trailing characters. @@ -529,7 +542,7 @@ def isdecimal(self): raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) - def rstrip(self): + def rstrip(self, to_strip: typing.Optional[str] = None): """Remove trailing characters. Strip whitespaces (including newlines) or a set of specified characters @@ -558,13 +571,19 @@ def rstrip(self): 3 dtype: string + Args: + to_strip (str, default None): + Specifying the set of characters to be removed. All combinations + of this set of characters will be stripped. If None then + whitespaces are removed. + Returns: bigframes.series.Series: Series without trailing characters. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) - def lstrip(self): + def lstrip(self, to_strip: typing.Optional[str] = None): """Remove leading characters. Strip whitespaces (including newlines) or a set of specified characters @@ -594,6 +613,12 @@ def lstrip(self): 3 dtype: string + Args: + to_strip (str, default None): + Specifying the set of characters to be removed. All combinations + of this set of characters will be stripped. If None then + whitespaces are removed. + Returns: bigframes.series.Series: Series without leading characters. """