Skip to content

Commit bd54203

Browse files
committed
feat: add to_strip for xstrip methods
1 parent 8f115e7 commit bd54203

File tree

6 files changed

+177
-43
lines changed

6 files changed

+177
-43
lines changed

bigframes/core/compile/scalar_op_compiler.py

Lines changed: 34 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -456,9 +456,19 @@ def upper_op_impl(x: ibis_types.Value):
456456
return typing.cast(ibis_types.StringValue, x).upper()
457457

458458

459-
@scalar_op_compiler.register_unary_op(ops.strip_op)
460-
def strip_op_impl(x: ibis_types.Value):
461-
return typing.cast(ibis_types.StringValue, x).strip()
459+
@scalar_op_compiler.register_unary_op(ops.StrLstripOp, pass_op=True)
460+
def str_lstrip_op_impl(x: ibis_types.Value, op: ops.StrLstripOp):
461+
return str_lstrip_op(x, to_strip=op.to_strip)
462+
463+
464+
@scalar_op_compiler.register_unary_op(ops.StrRstripOp, pass_op=True)
465+
def str_rstrip_op_impl(x: ibis_types.Value, op: ops.StrRstripOp):
466+
return str_rstrip_op(x, to_strip=op.to_strip)
467+
468+
469+
@scalar_op_compiler.register_unary_op(ops.StrStripOp, pass_op=True)
470+
def str_strip_op_impl(x: ibis_types.Value, op: ops.StrStripOp):
471+
return str_strip_op(x, to_strip=op.to_strip)
462472

463473

464474
@scalar_op_compiler.register_unary_op(ops.isnumeric_op)
@@ -519,16 +529,6 @@ def isupper_op_impl(x: ibis_types.Value):
519529
).re_search(r"\p{Ll}|\p{Lt}")
520530

521531

522-
@scalar_op_compiler.register_unary_op(ops.rstrip_op)
523-
def rstrip_op_impl(x: ibis_types.Value):
524-
return typing.cast(ibis_types.StringValue, x).rstrip()
525-
526-
527-
@scalar_op_compiler.register_unary_op(ops.lstrip_op)
528-
def lstrip_op_impl(x: ibis_types.Value):
529-
return typing.cast(ibis_types.StringValue, x).lstrip()
530-
531-
532532
@scalar_op_compiler.register_unary_op(ops.capitalize_op)
533533
def capitalize_op_impl(x: ibis_types.Value):
534534
return typing.cast(ibis_types.StringValue, x).capitalize()
@@ -2077,3 +2077,24 @@ def obj_make_ref(uri: str, authorizer: str) -> _OBJ_REF_IBIS_DTYPE: # type: ign
20772077
@ibis_udf.scalar.builtin(name="OBJ.GET_ACCESS_URL")
20782078
def obj_get_access_url(obj_ref: _OBJ_REF_IBIS_DTYPE, mode: ibis_dtypes.String) -> ibis_dtypes.JSON: # type: ignore
20792079
"""Get access url (as ObjectRefRuntime JSON) from ObjectRef."""
2080+
2081+
2082+
@ibis_udf.scalar.builtin(name="ltrim")
2083+
def str_lstrip_op( # type: ignore[empty-body]
2084+
x: ibis_dtypes.String, to_strip: ibis_dtypes.String
2085+
) -> ibis_dtypes.String:
2086+
"""Remove leading characters."""
2087+
2088+
2089+
@ibis_udf.scalar.builtin(name="rtrim")
2090+
def str_rstrip_op( # type: ignore[empty-body]
2091+
x: ibis_dtypes.String, to_strip: ibis_dtypes.String
2092+
) -> ibis_dtypes.String:
2093+
"""Remove trailing characters."""
2094+
2095+
2096+
@ibis_udf.scalar.builtin(name="trim")
2097+
def str_strip_op( # type: ignore[empty-body]
2098+
x: ibis_dtypes.String, to_strip: ibis_dtypes.String
2099+
) -> ibis_dtypes.String:
2100+
"""Remove leading and trailing characters."""

bigframes/operations/__init__.py

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -167,11 +167,9 @@
167167
isupper_op,
168168
len_op,
169169
lower_op,
170-
lstrip_op,
171170
RegexReplaceStrOp,
172171
ReplaceStrOp,
173172
reverse_op,
174-
rstrip_op,
175173
StartsWithOp,
176174
strconcat_op,
177175
StrContainsOp,
@@ -180,10 +178,12 @@
180178
StrFindOp,
181179
StrGetOp,
182180
StringSplitOp,
183-
strip_op,
181+
StrLstripOp,
184182
StrPadOp,
185183
StrRepeatOp,
184+
StrRstripOp,
186185
StrSliceOp,
186+
StrStripOp,
187187
upper_op,
188188
ZfillOp,
189189
)
@@ -237,23 +237,24 @@
237237
"isupper_op",
238238
"len_op",
239239
"lower_op",
240-
"lstrip_op",
241240
"RegexReplaceStrOp",
242241
"ReplaceStrOp",
243242
"reverse_op",
244-
"rstrip_op",
245243
"StartsWithOp",
246244
"strconcat_op",
247245
"StrContainsOp",
248246
"StrContainsRegexOp",
249247
"StrExtractOp",
250248
"StrFindOp",
251249
"StrGetOp",
250+
"StrLstripOp",
252251
"StringSplitOp",
253252
"strip_op",
254253
"StrPadOp",
255254
"StrRepeatOp",
255+
"StrRstripOp",
256256
"StrSliceOp",
257+
"StrStripOp",
257258
"upper_op",
258259
"ZfillOp",
259260
# Date ops

bigframes/operations/string_ops.py

Lines changed: 27 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -41,10 +41,6 @@
4141
name="upper", type_signature=op_typing.STRING_TRANSFORM
4242
)
4343

44-
strip_op = base_ops.create_unary_op(
45-
name="strip", type_signature=op_typing.STRING_TRANSFORM
46-
)
47-
4844
isalnum_op = base_ops.create_unary_op(
4945
name="isalnum", type_signature=op_typing.STRING_PREDICATE
5046
)
@@ -77,14 +73,6 @@
7773
name="isupper", type_signature=op_typing.STRING_PREDICATE
7874
)
7975

80-
rstrip_op = base_ops.create_unary_op(
81-
name="rstrip", type_signature=op_typing.STRING_TRANSFORM
82-
)
83-
84-
lstrip_op = base_ops.create_unary_op(
85-
name="lstrip", type_signature=op_typing.STRING_TRANSFORM
86-
)
87-
8876
capitalize_op = base_ops.create_unary_op(
8977
name="capitalize", type_signature=op_typing.STRING_TRANSFORM
9078
)
@@ -128,6 +116,33 @@ def output_type(self, *input_types):
128116
return op_typing.STRING_TRANSFORM.output_type(input_types[0])
129117

130118

119+
@dataclasses.dataclass(frozen=True)
120+
class StrStripOp(base_ops.UnaryOp):
121+
name: typing.ClassVar[str] = "str_strip"
122+
to_strip: str
123+
124+
def output_type(self, *input_types):
125+
return op_typing.STRING_TRANSFORM.output_type(input_types[0])
126+
127+
128+
@dataclasses.dataclass(frozen=True)
129+
class StrLstripOp(base_ops.UnaryOp):
130+
name: typing.ClassVar[str] = "str_lstrip"
131+
to_strip: str
132+
133+
def output_type(self, *input_types):
134+
return op_typing.STRING_TRANSFORM.output_type(input_types[0])
135+
136+
137+
@dataclasses.dataclass(frozen=True)
138+
class StrRstripOp(base_ops.UnaryOp):
139+
name: typing.ClassVar[str] = "str_rstrip"
140+
to_strip: str
141+
142+
def output_type(self, *input_types):
143+
return op_typing.STRING_TRANSFORM.output_type(input_types[0])
144+
145+
131146
@dataclasses.dataclass(frozen=True)
132147
class ReplaceStrOp(base_ops.UnaryOp):
133148
name: typing.ClassVar[str] = "str_replace"

bigframes/operations/strings.py

Lines changed: 12 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -91,8 +91,10 @@ def slice(
9191
) -> series.Series:
9292
return self._apply_unary_op(ops.StrSliceOp(start=start, end=stop))
9393

94-
def strip(self) -> series.Series:
95-
return self._apply_unary_op(ops.strip_op)
94+
def strip(self, to_strip: Optional[str] = None) -> series.Series:
95+
return self._apply_unary_op(
96+
ops.StrStripOp(to_strip=" \n\t" if to_strip is None else to_strip)
97+
)
9698

9799
def upper(self) -> series.Series:
98100
return self._apply_unary_op(ops.upper_op)
@@ -135,11 +137,15 @@ def isupper(
135137
) -> series.Series:
136138
return self._apply_unary_op(ops.isupper_op)
137139

138-
def rstrip(self) -> series.Series:
139-
return self._apply_unary_op(ops.rstrip_op)
140+
def rstrip(self, to_strip: Optional[str] = None) -> series.Series:
141+
return self._apply_unary_op(
142+
ops.StrRstripOp(to_strip=" \n\t" if to_strip is None else to_strip)
143+
)
140144

141-
def lstrip(self) -> series.Series:
142-
return self._apply_unary_op(ops.lstrip_op)
145+
def lstrip(self, to_strip: Optional[str] = None) -> series.Series:
146+
return self._apply_unary_op(
147+
ops.StrLstripOp(to_strip=" \n\t" if to_strip is None else to_strip)
148+
)
143149

144150
def repeat(self, repeats: int) -> series.Series:
145151
return self._apply_unary_op(ops.StrRepeatOp(repeats=repeats))

tests/system/small/operations/test_strings.py

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -265,6 +265,28 @@ def test_strip(scalars_dfs):
265265
)
266266

267267

268+
@pytest.mark.parametrize(
269+
("to_strip"),
270+
[
271+
pytest.param(None, id="none"),
272+
pytest.param(" ", id="space"),
273+
pytest.param(" \n", id="space_newline"),
274+
pytest.param("123.!? \n\t", id="multiple_chars"),
275+
],
276+
)
277+
def test_strip_w_to_strip(to_strip):
278+
s = bpd.Series(["1. Ant. ", "2. Bee!\n", "3. Cat?\t", bpd.NA])
279+
pd_s = s.to_pandas()
280+
281+
bf_result = s.str.strip(to_strip=to_strip).to_pandas()
282+
pd_result = pd_s.str.strip(to_strip=to_strip)
283+
284+
assert_series_equal(
285+
pd_result,
286+
bf_result,
287+
)
288+
289+
268290
def test_upper(scalars_dfs):
269291
scalars_df, scalars_pandas_df = scalars_dfs
270292
col_name = "string_col"
@@ -387,6 +409,28 @@ def test_rstrip(scalars_dfs):
387409
)
388410

389411

412+
@pytest.mark.parametrize(
413+
("to_strip"),
414+
[
415+
pytest.param(None, id="none"),
416+
pytest.param(" ", id="space"),
417+
pytest.param(" \n", id="space_newline"),
418+
pytest.param("123.!? \n\t", id="multiple_chars"),
419+
],
420+
)
421+
def test_rstrip_w_to_strip(to_strip):
422+
s = bpd.Series(["1. Ant. ", "2. Bee!\n", "3. Cat?\t", bpd.NA])
423+
pd_s = s.to_pandas()
424+
425+
bf_result = s.str.rstrip(to_strip=to_strip).to_pandas()
426+
pd_result = pd_s.str.rstrip(to_strip=to_strip)
427+
428+
assert_series_equal(
429+
pd_result,
430+
bf_result,
431+
)
432+
433+
390434
def test_lstrip(scalars_dfs):
391435
scalars_df, scalars_pandas_df = scalars_dfs
392436
col_name = "string_col"
@@ -400,6 +444,28 @@ def test_lstrip(scalars_dfs):
400444
)
401445

402446

447+
@pytest.mark.parametrize(
448+
("to_strip"),
449+
[
450+
pytest.param(None, id="none"),
451+
pytest.param(" ", id="space"),
452+
pytest.param(" \n", id="space_newline"),
453+
pytest.param("123.!? \n\t", id="multiple_chars"),
454+
],
455+
)
456+
def test_lstrip_w_to_strip(to_strip):
457+
s = bpd.Series(["1. Ant. ", "2. Bee!\n", "3. Cat?\t", bpd.NA])
458+
pd_s = s.to_pandas()
459+
460+
bf_result = s.str.lstrip(to_strip=to_strip).to_pandas()
461+
pd_result = pd_s.str.lstrip(to_strip=to_strip)
462+
463+
assert_series_equal(
464+
pd_result,
465+
bf_result,
466+
)
467+
468+
403469
@pytest.mark.parametrize(["repeats"], [(5,), (0,), (1,)])
404470
def test_repeat(scalars_dfs, repeats):
405471
scalars_df, scalars_pandas_df = scalars_dfs

third_party/bigframes_vendored/pandas/core/strings/accessor.py

Lines changed: 32 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -239,7 +239,7 @@ def slice(self, start=None, stop=None):
239239

240240
raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
241241

242-
def strip(self):
242+
def strip(self, to_strip: typing.Optional[str] = None):
243243
"""Remove leading and trailing characters.
244244
245245
Strip whitespaces (including newlines) or a set of specified characters
@@ -252,22 +252,35 @@ def strip(self):
252252
>>> import bigframes.pandas as bpd
253253
>>> bpd.options.display.progress_bar = None
254254
255-
>>> s = bpd.Series(['Ant', ' Bee ', '\\tCat\\n', bpd.NA])
255+
>>> s = bpd.Series(['1. Ant.', ' 2. Bee? ', '\\t3. Cat!\\n', bpd.NA])
256256
>>> s
257-
0 Ant
258-
1 Bee
259-
2 Cat
257+
0 1. Ant.
258+
1 2. Bee?
259+
2 3. Cat!
260260
<BLANKLINE>
261261
3 <NA>
262262
dtype: string
263263
264264
>>> s.str.strip()
265+
0 1. Ant.
266+
1 2. Bee?
267+
2 3. Cat!
268+
3 <NA>
269+
dtype: string
270+
271+
>>> s.str.strip('123.!? \\n\\t')
265272
0 Ant
266273
1 Bee
267274
2 Cat
268275
3 <NA>
269276
dtype: string
270277
278+
Args:
279+
to_strip (str, default None):
280+
Specifying the set of characters to be removed. All combinations
281+
of this set of characters will be stripped. If None then
282+
whitespaces are removed.
283+
271284
Returns:
272285
bigframes.series.Series: Series or Index without leading
273286
and trailing characters.
@@ -529,7 +542,7 @@ def isdecimal(self):
529542

530543
raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
531544

532-
def rstrip(self):
545+
def rstrip(self, to_strip: typing.Optional[str] = None):
533546
"""Remove trailing characters.
534547
535548
Strip whitespaces (including newlines) or a set of specified characters
@@ -558,13 +571,19 @@ def rstrip(self):
558571
3 <NA>
559572
dtype: string
560573
574+
Args:
575+
to_strip (str, default None):
576+
Specifying the set of characters to be removed. All combinations
577+
of this set of characters will be stripped. If None then
578+
whitespaces are removed.
579+
561580
Returns:
562581
bigframes.series.Series: Series without trailing characters.
563582
"""
564583

565584
raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
566585

567-
def lstrip(self):
586+
def lstrip(self, to_strip: typing.Optional[str] = None):
568587
"""Remove leading characters.
569588
570589
Strip whitespaces (including newlines) or a set of specified characters
@@ -594,6 +613,12 @@ def lstrip(self):
594613
3 <NA>
595614
dtype: string
596615
616+
Args:
617+
to_strip (str, default None):
618+
Specifying the set of characters to be removed. All combinations
619+
of this set of characters will be stripped. If None then
620+
whitespaces are removed.
621+
597622
Returns:
598623
bigframes.series.Series: Series without leading characters.
599624
"""

0 commit comments

Comments
 (0)