Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 34 additions & 13 deletions bigframes/core/compile/scalar_op_compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -456,9 +456,19 @@ def upper_op_impl(x: ibis_types.Value):
return typing.cast(ibis_types.StringValue, x).upper()


@scalar_op_compiler.register_unary_op(ops.strip_op)
def strip_op_impl(x: ibis_types.Value):
    """Implementation registered for ``ops.strip_op``.

    Treats ``x`` as an ibis string value and returns the result of its
    ``strip()`` method.
    """
    string_value = typing.cast(ibis_types.StringValue, x)
    return string_value.strip()
@scalar_op_compiler.register_unary_op(ops.StrLstripOp, pass_op=True)
def str_lstrip_op_impl(x: ibis_types.Value, op: ops.StrLstripOp):
    """Implementation registered for ``ops.StrLstripOp``.

    Args:
        x: The operand expression.
        op: The op instance (passed because of ``pass_op=True``); its
            ``to_strip`` attribute is forwarded unchanged.

    Returns:
        The result of calling ``str_lstrip_op`` on ``x`` with
        ``to_strip=op.to_strip``.
    """
    # The annotation names the op class this function is registered for
    # (StrLstripOp), matching the rstrip/strip sibling implementations.
    return str_lstrip_op(x, to_strip=op.to_strip)


@scalar_op_compiler.register_unary_op(ops.StrRstripOp, pass_op=True)
def str_rstrip_op_impl(x: ibis_types.Value, op: ops.StrRstripOp):
    """Implementation registered for ``ops.StrRstripOp``.

    Forwards ``x`` and the op's ``to_strip`` attribute to ``str_rstrip_op``
    and returns its result.
    """
    chars = op.to_strip
    return str_rstrip_op(x, to_strip=chars)


@scalar_op_compiler.register_unary_op(ops.StrStripOp, pass_op=True)
def str_strip_op_impl(x: ibis_types.Value, op: ops.StrStripOp):
    """Implementation registered for ``ops.StrStripOp``.

    Forwards ``x`` and the op's ``to_strip`` attribute to ``str_strip_op``
    and returns its result.
    """
    chars = op.to_strip
    return str_strip_op(x, to_strip=chars)


@scalar_op_compiler.register_unary_op(ops.isnumeric_op)
Expand Down Expand Up @@ -519,16 +529,6 @@ def isupper_op_impl(x: ibis_types.Value):
).re_search(r"\p{Ll}|\p{Lt}")


@scalar_op_compiler.register_unary_op(ops.rstrip_op)
def rstrip_op_impl(x: ibis_types.Value):
    """Implementation registered for ``ops.rstrip_op``.

    Treats ``x`` as an ibis string value and returns the result of its
    ``rstrip()`` method.
    """
    string_value = typing.cast(ibis_types.StringValue, x)
    return string_value.rstrip()


@scalar_op_compiler.register_unary_op(ops.lstrip_op)
def lstrip_op_impl(x: ibis_types.Value):
    """Implementation registered for ``ops.lstrip_op``.

    Treats ``x`` as an ibis string value and returns the result of its
    ``lstrip()`` method.
    """
    string_value = typing.cast(ibis_types.StringValue, x)
    return string_value.lstrip()


@scalar_op_compiler.register_unary_op(ops.capitalize_op)
def capitalize_op_impl(x: ibis_types.Value):
    """Implementation registered for ``ops.capitalize_op``.

    Treats ``x`` as an ibis string value and returns the result of its
    ``capitalize()`` method.
    """
    string_value = typing.cast(ibis_types.StringValue, x)
    return string_value.capitalize()
Expand Down Expand Up @@ -2077,3 +2077,24 @@ def obj_make_ref(uri: str, authorizer: str) -> _OBJ_REF_IBIS_DTYPE: # type: ign
@ibis_udf.scalar.builtin(name="OBJ.GET_ACCESS_URL")
def obj_get_access_url(obj_ref: _OBJ_REF_IBIS_DTYPE, mode: ibis_dtypes.String) -> ibis_dtypes.JSON: # type: ignore
    """Get access url (as ObjectRefRuntime JSON) from ObjectRef."""


@ibis_udf.scalar.builtin(name="ltrim")
def str_lstrip_op(  # type: ignore[empty-body]
    x: ibis_dtypes.String, to_strip: ibis_dtypes.String
) -> ibis_dtypes.String:
    """Remove leading characters.

    Declared as the builtin scalar function ``ltrim``; the Python body is
    intentionally empty.

    Args:
        x: The string value to trim.
        to_strip: The characters to remove from the start of ``x``.
    """


@ibis_udf.scalar.builtin(name="rtrim")
def str_rstrip_op(  # type: ignore[empty-body]
    x: ibis_dtypes.String, to_strip: ibis_dtypes.String
) -> ibis_dtypes.String:
    """Remove trailing characters.

    Declared as the builtin scalar function ``rtrim``; the Python body is
    intentionally empty.

    Args:
        x: The string value to trim.
        to_strip: The characters to remove from the end of ``x``.
    """


@ibis_udf.scalar.builtin(name="trim")
def str_strip_op(  # type: ignore[empty-body]
    x: ibis_dtypes.String, to_strip: ibis_dtypes.String
) -> ibis_dtypes.String:
    """Remove leading and trailing characters.

    Declared as the builtin scalar function ``trim``; the Python body is
    intentionally empty.

    Args:
        x: The string value to trim.
        to_strip: The characters to remove from both ends of ``x``.
    """
11 changes: 6 additions & 5 deletions bigframes/operations/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -167,11 +167,9 @@
isupper_op,
len_op,
lower_op,
lstrip_op,
RegexReplaceStrOp,
ReplaceStrOp,
reverse_op,
rstrip_op,
StartsWithOp,
strconcat_op,
StrContainsOp,
Expand All @@ -180,10 +178,12 @@
StrFindOp,
StrGetOp,
StringSplitOp,
strip_op,
StrLstripOp,
StrPadOp,
StrRepeatOp,
StrRstripOp,
StrSliceOp,
StrStripOp,
upper_op,
ZfillOp,
)
Expand Down Expand Up @@ -237,23 +237,24 @@
"isupper_op",
"len_op",
"lower_op",
"lstrip_op",
"RegexReplaceStrOp",
"ReplaceStrOp",
"reverse_op",
"rstrip_op",
"StartsWithOp",
"strconcat_op",
"StrContainsOp",
"StrContainsRegexOp",
"StrExtractOp",
"StrFindOp",
"StrGetOp",
"StrLstripOp",
"StringSplitOp",
"strip_op",
"StrPadOp",
"StrRepeatOp",
"StrRstripOp",
"StrSliceOp",
"StrStripOp",
"upper_op",
"ZfillOp",
# Date ops
Expand Down
39 changes: 27 additions & 12 deletions bigframes/operations/string_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,10 +41,6 @@
name="upper", type_signature=op_typing.STRING_TRANSFORM
)

# Unary op named "strip", created with the STRING_TRANSFORM type signature.
strip_op = base_ops.create_unary_op(
    name="strip", type_signature=op_typing.STRING_TRANSFORM
)

# Unary op named "isalnum", created with the STRING_PREDICATE type signature.
isalnum_op = base_ops.create_unary_op(
    name="isalnum", type_signature=op_typing.STRING_PREDICATE
)
Expand Down Expand Up @@ -77,14 +73,6 @@
name="isupper", type_signature=op_typing.STRING_PREDICATE
)

# Unary op named "rstrip", created with the STRING_TRANSFORM type signature.
rstrip_op = base_ops.create_unary_op(
    name="rstrip", type_signature=op_typing.STRING_TRANSFORM
)

# Unary op named "lstrip", created with the STRING_TRANSFORM type signature.
lstrip_op = base_ops.create_unary_op(
    name="lstrip", type_signature=op_typing.STRING_TRANSFORM
)

# Unary op named "capitalize", created with the STRING_TRANSFORM type signature.
capitalize_op = base_ops.create_unary_op(
    name="capitalize", type_signature=op_typing.STRING_TRANSFORM
)
Expand Down Expand Up @@ -128,6 +116,33 @@ def output_type(self, *input_types):
return op_typing.STRING_TRANSFORM.output_type(input_types[0])


@dataclasses.dataclass(frozen=True)
class StrStripOp(base_ops.UnaryOp):
    """Frozen unary op named ``str_strip`` that carries a ``to_strip`` string."""

    name: typing.ClassVar[str] = "str_strip"
    # Characters to strip; consumed by the compiler via ``op.to_strip``.
    to_strip: str

    def output_type(self, *input_types):
        """Return the STRING_TRANSFORM output type for the first input type."""
        operand_type = input_types[0]
        return op_typing.STRING_TRANSFORM.output_type(operand_type)


@dataclasses.dataclass(frozen=True)
class StrLstripOp(base_ops.UnaryOp):
    """Frozen unary op named ``str_lstrip`` that carries a ``to_strip`` string."""

    name: typing.ClassVar[str] = "str_lstrip"
    # Characters to strip; consumed by the compiler via ``op.to_strip``.
    to_strip: str

    def output_type(self, *input_types):
        """Return the STRING_TRANSFORM output type for the first input type."""
        operand_type = input_types[0]
        return op_typing.STRING_TRANSFORM.output_type(operand_type)


@dataclasses.dataclass(frozen=True)
class StrRstripOp(base_ops.UnaryOp):
    """Frozen unary op named ``str_rstrip`` that carries a ``to_strip`` string."""

    name: typing.ClassVar[str] = "str_rstrip"
    # Characters to strip; consumed by the compiler via ``op.to_strip``.
    to_strip: str

    def output_type(self, *input_types):
        """Return the STRING_TRANSFORM output type for the first input type."""
        operand_type = input_types[0]
        return op_typing.STRING_TRANSFORM.output_type(operand_type)


@dataclasses.dataclass(frozen=True)
class ReplaceStrOp(base_ops.UnaryOp):
name: typing.ClassVar[str] = "str_replace"
Expand Down
18 changes: 12 additions & 6 deletions bigframes/operations/strings.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,8 +91,10 @@ def slice(
) -> series.Series:
return self._apply_unary_op(ops.StrSliceOp(start=start, end=stop))

def strip(self) -> series.Series:
    """Apply ``ops.strip_op`` as a unary op and return the resulting series."""
    strip_operation = ops.strip_op
    return self._apply_unary_op(strip_operation)
def strip(self, to_strip: Optional[str] = None) -> series.Series:
    """Remove leading and trailing characters from each string.

    Args:
        to_strip: Set of characters to remove. When ``None``, ASCII
            whitespace (space, tab, newline, carriage return, vertical tab,
            form feed) is removed.

    Returns:
        The series produced by applying ``ops.StrStripOp``.
    """
    if to_strip is None:
        # pandas strips every whitespace character by default, so the
        # fallback set must include "\r", "\v" and "\f" as well as
        # space/newline/tab; otherwise e.g. "abc\r\n" would keep its "\r".
        # NOTE: non-ASCII Unicode whitespace is still not covered.
        to_strip = " \t\n\r\v\f"
    return self._apply_unary_op(ops.StrStripOp(to_strip=to_strip))

def upper(self) -> series.Series:
    """Apply ``ops.upper_op`` as a unary op and return the resulting series."""
    upper_operation = ops.upper_op
    return self._apply_unary_op(upper_operation)
Expand Down Expand Up @@ -135,11 +137,15 @@ def isupper(
) -> series.Series:
return self._apply_unary_op(ops.isupper_op)

def rstrip(self) -> series.Series:
    """Apply ``ops.rstrip_op`` as a unary op and return the resulting series."""
    rstrip_operation = ops.rstrip_op
    return self._apply_unary_op(rstrip_operation)
def rstrip(self, to_strip: Optional[str] = None) -> series.Series:
    """Remove trailing characters from each string.

    Args:
        to_strip: Set of characters to remove. When ``None``, ASCII
            whitespace (space, tab, newline, carriage return, vertical tab,
            form feed) is removed.

    Returns:
        The series produced by applying ``ops.StrRstripOp``.
    """
    if to_strip is None:
        # pandas strips every whitespace character by default, so the
        # fallback set must include "\r", "\v" and "\f" as well as
        # space/newline/tab; otherwise e.g. "abc\r\n" would keep its "\r".
        # NOTE: non-ASCII Unicode whitespace is still not covered.
        to_strip = " \t\n\r\v\f"
    return self._apply_unary_op(ops.StrRstripOp(to_strip=to_strip))

def lstrip(self) -> series.Series:
    """Apply ``ops.lstrip_op`` as a unary op and return the resulting series."""
    lstrip_operation = ops.lstrip_op
    return self._apply_unary_op(lstrip_operation)
def lstrip(self, to_strip: Optional[str] = None) -> series.Series:
    """Remove leading characters from each string.

    Args:
        to_strip: Set of characters to remove. When ``None``, ASCII
            whitespace (space, tab, newline, carriage return, vertical tab,
            form feed) is removed.

    Returns:
        The series produced by applying ``ops.StrLstripOp``.
    """
    if to_strip is None:
        # pandas strips every whitespace character by default, so the
        # fallback set must include "\r", "\v" and "\f" as well as
        # space/newline/tab; otherwise e.g. "\r\nabc" would keep its "\r".
        # NOTE: non-ASCII Unicode whitespace is still not covered.
        to_strip = " \t\n\r\v\f"
    return self._apply_unary_op(ops.StrLstripOp(to_strip=to_strip))

def repeat(self, repeats: int) -> series.Series:
    """Apply ``ops.StrRepeatOp`` built from ``repeats`` and return the result."""
    repeat_operation = ops.StrRepeatOp(repeats=repeats)
    return self._apply_unary_op(repeat_operation)
Expand Down
66 changes: 66 additions & 0 deletions tests/system/small/operations/test_strings.py
Original file line number Diff line number Diff line change
Expand Up @@ -265,6 +265,28 @@ def test_strip(scalars_dfs):
)


@pytest.mark.parametrize(
    "to_strip",
    [
        pytest.param(None, id="none"),
        pytest.param(" ", id="space"),
        pytest.param(" \n", id="space_newline"),
        pytest.param("123.!? \n\t", id="multiple_chars"),
    ],
)
def test_strip_w_to_strip(to_strip):
    """``Series.str.strip(to_strip=...)`` matches pandas on the same data."""
    bf_series = bpd.Series(["1. Ant. ", "2. Bee!\n", "3. Cat?\t", bpd.NA])
    pd_series = bf_series.to_pandas()

    actual = bf_series.str.strip(to_strip=to_strip).to_pandas()
    expected = pd_series.str.strip(to_strip=to_strip)

    assert_series_equal(expected, actual)


def test_upper(scalars_dfs):
scalars_df, scalars_pandas_df = scalars_dfs
col_name = "string_col"
Expand Down Expand Up @@ -387,6 +409,28 @@ def test_rstrip(scalars_dfs):
)


@pytest.mark.parametrize(
    "to_strip",
    [
        pytest.param(None, id="none"),
        pytest.param(" ", id="space"),
        pytest.param(" \n", id="space_newline"),
        pytest.param("123.!? \n\t", id="multiple_chars"),
    ],
)
def test_rstrip_w_to_strip(to_strip):
    """``Series.str.rstrip(to_strip=...)`` matches pandas on the same data."""
    bf_series = bpd.Series(["1. Ant. ", "2. Bee!\n", "3. Cat?\t", bpd.NA])
    pd_series = bf_series.to_pandas()

    actual = bf_series.str.rstrip(to_strip=to_strip).to_pandas()
    expected = pd_series.str.rstrip(to_strip=to_strip)

    assert_series_equal(expected, actual)


def test_lstrip(scalars_dfs):
scalars_df, scalars_pandas_df = scalars_dfs
col_name = "string_col"
Expand All @@ -400,6 +444,28 @@ def test_lstrip(scalars_dfs):
)


@pytest.mark.parametrize(
    "to_strip",
    [
        pytest.param(None, id="none"),
        pytest.param(" ", id="space"),
        pytest.param(" \n", id="space_newline"),
        pytest.param("123.!? \n\t", id="multiple_chars"),
    ],
)
def test_lstrip_w_to_strip(to_strip):
    """``Series.str.lstrip(to_strip=...)`` matches pandas on the same data."""
    bf_series = bpd.Series(["1. Ant. ", "2. Bee!\n", "3. Cat?\t", bpd.NA])
    pd_series = bf_series.to_pandas()

    actual = bf_series.str.lstrip(to_strip=to_strip).to_pandas()
    expected = pd_series.str.lstrip(to_strip=to_strip)

    assert_series_equal(expected, actual)


@pytest.mark.parametrize(["repeats"], [(5,), (0,), (1,)])
def test_repeat(scalars_dfs, repeats):
scalars_df, scalars_pandas_df = scalars_dfs
Expand Down
39 changes: 32 additions & 7 deletions third_party/bigframes_vendored/pandas/core/strings/accessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -239,7 +239,7 @@ def slice(self, start=None, stop=None):

raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)

def strip(self):
def strip(self, to_strip: typing.Optional[str] = None):
"""Remove leading and trailing characters.

Strip whitespaces (including newlines) or a set of specified characters
Expand All @@ -252,22 +252,35 @@ def strip(self):
>>> import bigframes.pandas as bpd
>>> bpd.options.display.progress_bar = None

>>> s = bpd.Series(['Ant', ' Bee ', '\\tCat\\n', bpd.NA])
>>> s = bpd.Series(['1. Ant.', ' 2. Bee? ', '\\t3. Cat!\\n', bpd.NA])
>>> s
0 Ant
1 Bee
2 Cat
0 1. Ant.
1 2. Bee?
2 3. Cat!
<BLANKLINE>
3 <NA>
dtype: string

>>> s.str.strip()
0 1. Ant.
1 2. Bee?
2 3. Cat!
3 <NA>
dtype: string

>>> s.str.strip('123.!? \\n\\t')
0 Ant
1 Bee
2 Cat
3 <NA>
dtype: string

Args:
to_strip (str, default None):
Specifying the set of characters to be removed. All combinations
of this set of characters will be stripped. If None then
whitespaces are removed.

Returns:
bigframes.series.Series: Series or Index without leading
and trailing characters.
Expand Down Expand Up @@ -529,7 +542,7 @@ def isdecimal(self):

raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)

def rstrip(self):
def rstrip(self, to_strip: typing.Optional[str] = None):
"""Remove trailing characters.

Strip whitespaces (including newlines) or a set of specified characters
Expand Down Expand Up @@ -558,13 +571,19 @@ def rstrip(self):
3 <NA>
dtype: string

Args:
to_strip (str, default None):
Specifying the set of characters to be removed. All combinations
of this set of characters will be stripped. If None then
whitespaces are removed.

Returns:
bigframes.series.Series: Series without trailing characters.
"""

raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)

def lstrip(self):
def lstrip(self, to_strip: typing.Optional[str] = None):
"""Remove leading characters.

Strip whitespaces (including newlines) or a set of specified characters
Expand Down Expand Up @@ -594,6 +613,12 @@ def lstrip(self):
3 <NA>
dtype: string

Args:
to_strip (str, default None):
Specifying the set of characters to be removed. All combinations
of this set of characters will be stripped. If None then
whitespaces are removed.

Returns:
bigframes.series.Series: Series without leading characters.
"""
Expand Down