Skip to content

Commit 637e860

Browse files
authored
fix!: make dataset and name params mandatory in udf (#1619)
This change forces the user to provide a dataset and a name for any BigQuery managed Python UDF created through BigFrames, so that the function is easy to discover and clean up when necessary.
1 parent ec6b8ce commit 637e860

File tree

5 files changed

+52
-29
lines changed

5 files changed

+52
-29
lines changed

README.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ Version 2.0 introduces breaking changes for improved security and performance. K
2525
``cloud_function_service_account="default"``. And network ingress now defaults to ``"internal-only"``.
2626
* **@remote_function Argument Passing:** Arguments other than ``input_types``, ``output_type``, and ``dataset``
2727
to ``remote_function`` must now be passed using keyword syntax, as positional arguments are no longer supported.
28+
* **@udf Argument Passing:** Arguments ``dataset`` and ``name`` to ``udf`` are now mandatory.
2829
* **Endpoint Connections:** Automatic fallback to locational endpoints in certain regions is removed.
2930
* **LLM Updates (Gemini Integration):** Integrations now default to the ``gemini-2.0-flash-001`` model.
3031
PaLM2 support has been removed; please migrate any existing PaLM2 usage to Gemini. **Note:** The current default

bigframes/functions/_function_session.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -793,7 +793,7 @@ def udf(
793793
``bigframes.pandas.reset_session``/
794794
``bigframes.pandas.clean_up_by_session_id``) does not clean up
795795
the function, and leaves it for the user to manage the function
796-
and the associated cloud function directly.
796+
directly.
797797
packages (str[], Optional):
798798
Explicit name of the external package dependencies. Each
799799
dependency is added to the `requirements.txt` as is, and can be

bigframes/pandas/__init__.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -117,9 +117,9 @@ def udf(
117117
*,
118118
input_types: Union[None, type, Sequence[type]] = None,
119119
output_type: Optional[type] = None,
120-
dataset: Optional[str] = None,
120+
dataset: str,
121121
bigquery_connection: Optional[str] = None,
122-
name: Optional[str] = None,
122+
name: str,
123123
packages: Optional[Sequence[str]] = None,
124124
):
125125
return global_session.with_default_session(

bigframes/session/__init__.py

Lines changed: 9 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1441,9 +1441,9 @@ def udf(
14411441
*,
14421442
input_types: Union[None, type, Sequence[type]] = None,
14431443
output_type: Optional[type] = None,
1444-
dataset: Optional[str] = None,
1444+
dataset: str,
14451445
bigquery_connection: Optional[str] = None,
1446-
name: Optional[str] = None,
1446+
name: str,
14471447
packages: Optional[Sequence[str]] = None,
14481448
):
14491449
"""Decorator to turn a Python user defined function (udf) into a
@@ -1470,11 +1470,10 @@ def udf(
14701470
be specified. The supported output types are `bool`, `bytes`,
14711471
`float`, `int`, `str`, `list[bool]`, `list[float]`, `list[int]`
14721472
and `list[str]`.
1473-
dataset (str, Optional):
1473+
dataset (str):
14741474
Dataset in which to create a BigQuery managed function. It
14751475
should be in `<project_id>.<dataset_name>` or `<dataset_name>`
1476-
format. If this parameter is not provided then session dataset
1477-
id is used.
1476+
format.
14781477
bigquery_connection (str, Optional):
14791478
Name of the BigQuery connection. It is used to provide an
14801479
identity to the serverless instances running the user code. It
@@ -1486,18 +1485,18 @@ def udf(
14861485
will be created without any connection. A udf without a
14871486
connection has no internet access and no access to other GCP
14881487
services.
1489-
name (str, Optional):
1488+
name (str):
14901489
Explicit name of the persisted BigQuery managed function. Use it
14911490
with caution, because more than one user working in the same
14921491
project and dataset could overwrite each other's managed
1493-
functions if they use the same persistent name. When an explicit
1494-
name is provided, any session specific clean up (
1492+
functions if they use the same persistent name. Please note that
1493+
any session specific clean up (
14951494
``bigframes.session.Session.close``/
14961495
``bigframes.pandas.close_session``/
14971496
``bigframes.pandas.reset_session``/
14981497
``bigframes.pandas.clean_up_by_session_id``) does not clean up
1499-
the function, and leaves it for the user to manage the function
1500-
and the associated cloud function directly.
1498+
this function, and leaves it for the user to manage the function
1499+
directly.
15011500
packages (str[], Optional):
15021501
Explicit name of the external package dependencies. Each
15031502
dependency is added to the `requirements.txt` as is, and can be

tests/system/large/functions/test_managed_function.py

Lines changed: 39 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -16,12 +16,15 @@
1616
import pandas
1717
import pyarrow
1818
import pytest
19+
import test_utils.prefixer
1920

2021
import bigframes
2122
import bigframes.exceptions as bfe
2223
import bigframes.pandas as bpd
2324
from tests.system.utils import cleanup_function_assets
2425

26+
prefixer = test_utils.prefixer.Prefixer("bigframes", "")
27+
2528

2629
def test_managed_function_multiply_with_ibis(
2730
session,
@@ -37,6 +40,7 @@ def test_managed_function_multiply_with_ibis(
3740
input_types=[int, int],
3841
output_type=int,
3942
dataset=dataset_id,
43+
name=prefixer.create_prefix(),
4044
)
4145
def multiply(x, y):
4246
return x * y
@@ -87,6 +91,7 @@ def test_managed_function_stringify_with_ibis(
8791
input_types=[int],
8892
output_type=str,
8993
dataset=dataset_id,
94+
name=prefixer.create_prefix(),
9095
)
9196
def stringify(x):
9297
return f"I got {x}"
@@ -123,7 +128,10 @@ def stringify(x):
123128
def test_managed_function_array_output(session, scalars_dfs, dataset_id):
124129
try:
125130

126-
@session.udf(dataset=dataset_id)
131+
@session.udf(
132+
dataset=dataset_id,
133+
name=prefixer.create_prefix(),
134+
)
127135
def featurize(x: int) -> list[float]:
128136
return [float(i) for i in [x, x + 1, x + 2]]
129137

@@ -160,10 +168,10 @@ def featurize(x: int) -> list[float]:
160168
cleanup_function_assets(featurize, session.bqclient, ignore_failures=False)
161169

162170

163-
def test_managed_function_series_apply(session, scalars_dfs):
171+
def test_managed_function_series_apply(session, dataset_id, scalars_dfs):
164172
try:
165173

166-
@session.udf()
174+
@session.udf(dataset=dataset_id, name=prefixer.create_prefix())
167175
def foo(x: int) -> bytes:
168176
return bytes(abs(x))
169177

@@ -214,13 +222,14 @@ def foo(x: int) -> bytes:
214222

215223
def test_managed_function_series_apply_array_output(
216224
session,
225+
dataset_id,
217226
scalars_dfs,
218227
):
219228
try:
220229

221230
with pytest.warns(bfe.PreviewWarning, match="udf is in preview."):
222231

223-
@session.udf()
232+
@session.udf(dataset=dataset_id, name=prefixer.create_prefix())
224233
def foo_list(x: int) -> list[float]:
225234
return [float(abs(x)), float(abs(x) + 1)]
226235

@@ -243,7 +252,7 @@ def foo_list(x: int) -> list[float]:
243252
cleanup_function_assets(foo_list, session.bqclient, ignore_failures=False)
244253

245254

246-
def test_managed_function_series_combine(session, scalars_dfs):
255+
def test_managed_function_series_combine(session, dataset_id, scalars_dfs):
247256
try:
248257
# This function is deliberately written to not work with NA input.
249258
def add(x: int, y: int) -> int:
@@ -258,7 +267,9 @@ def add(x: int, y: int) -> int:
258267
# make sure there are NA values in the test column.
259268
assert any([pandas.isna(val) for val in bf_df[int_col_name_with_nulls]])
260269

261-
add_managed_func = session.udf()(add)
270+
add_managed_func = session.udf(
271+
dataset=dataset_id, name=prefixer.create_prefix()
272+
)(add)
262273

263274
# with nulls in the series the managed function application would fail.
264275
with pytest.raises(
@@ -301,7 +312,7 @@ def add(x: int, y: int) -> int:
301312
)
302313

303314

304-
def test_managed_function_series_combine_array_output(session, scalars_dfs):
315+
def test_managed_function_series_combine_array_output(session, dataset_id, scalars_dfs):
305316
try:
306317

307318
def add_list(x: int, y: int) -> list[int]:
@@ -316,7 +327,9 @@ def add_list(x: int, y: int) -> list[int]:
316327
# Make sure there are NA values in the test column.
317328
assert any([pandas.isna(val) for val in bf_df[int_col_name_with_nulls]])
318329

319-
add_list_managed_func = session.udf()(add_list)
330+
add_list_managed_func = session.udf(
331+
dataset=dataset_id, name=prefixer.create_prefix()
332+
)(add_list)
320333

321334
# After filtering out nulls the managed function application should work
322335
# similar to pandas.
@@ -364,7 +377,7 @@ def add_list(x: int, y: int) -> list[int]:
364377
)
365378

366379

367-
def test_managed_function_dataframe_map(session, scalars_dfs):
380+
def test_managed_function_dataframe_map(session, dataset_id, scalars_dfs):
368381
try:
369382

370383
def add_one(x):
@@ -373,6 +386,8 @@ def add_one(x):
373386
mf_add_one = session.udf(
374387
input_types=[int],
375388
output_type=int,
389+
dataset=dataset_id,
390+
name=prefixer.create_prefix(),
376391
)(add_one)
377392

378393
scalars_df, scalars_pandas_df = scalars_dfs
@@ -398,9 +413,7 @@ def add_one(x):
398413
cleanup_function_assets(mf_add_one, session.bqclient, ignore_failures=False)
399414

400415

401-
def test_managed_function_dataframe_map_array_output(
402-
session, scalars_dfs, dataset_id_permanent
403-
):
416+
def test_managed_function_dataframe_map_array_output(session, scalars_dfs, dataset_id):
404417
try:
405418

406419
def add_one_list(x):
@@ -409,6 +422,8 @@ def add_one_list(x):
409422
mf_add_one_list = session.udf(
410423
input_types=[int],
411424
output_type=list[int],
425+
dataset=dataset_id,
426+
name=prefixer.create_prefix(),
412427
)(add_one_list)
413428

414429
scalars_df, scalars_pandas_df = scalars_dfs
@@ -439,7 +454,7 @@ def add_one_list(x):
439454
)
440455

441456

442-
def test_managed_function_dataframe_apply_axis_1(session, scalars_dfs):
457+
def test_managed_function_dataframe_apply_axis_1(session, dataset_id, scalars_dfs):
443458
try:
444459
scalars_df, scalars_pandas_df = scalars_dfs
445460
series = scalars_df["int64_too"]
@@ -451,6 +466,8 @@ def add_ints(x, y):
451466
add_ints_mf = session.udf(
452467
input_types=[int, int],
453468
output_type=int,
469+
dataset=dataset_id,
470+
name=prefixer.create_prefix(),
454471
)(add_ints)
455472
assert add_ints_mf.bigframes_bigquery_function # type: ignore
456473

@@ -475,7 +492,7 @@ def add_ints(x, y):
475492
cleanup_function_assets(add_ints_mf, session.bqclient, ignore_failures=False)
476493

477494

478-
def test_managed_function_dataframe_apply_axis_1_array_output(session):
495+
def test_managed_function_dataframe_apply_axis_1_array_output(session, dataset_id):
479496
bf_df = bigframes.dataframe.DataFrame(
480497
{
481498
"Id": [1, 2, 3],
@@ -498,6 +515,8 @@ def test_managed_function_dataframe_apply_axis_1_array_output(session):
498515
@session.udf(
499516
input_types=[int, float, str],
500517
output_type=list[str],
518+
dataset=dataset_id,
519+
name=prefixer.create_prefix(),
501520
)
502521
def foo(x, y, z):
503522
return [str(x), str(y), z]
@@ -591,12 +610,16 @@ def foo(x, y, z):
591610
],
592611
)
593612
def test_managed_function_with_connection(
594-
session, scalars_dfs, request, connection_fixture
613+
session, scalars_dfs, dataset_id, request, connection_fixture
595614
):
596615
try:
597616
bigquery_connection = request.getfixturevalue(connection_fixture)
598617

599-
@session.udf(bigquery_connection=bigquery_connection)
618+
@session.udf(
619+
bigquery_connection=bigquery_connection,
620+
dataset=dataset_id,
621+
name=prefixer.create_prefix(),
622+
)
600623
def foo(x: int) -> int:
601624
return x + 10
602625

0 commit comments

Comments
 (0)