From 830d77d77809d304e4e86e3f62fdf4b5bde80e12 Mon Sep 17 00:00:00 2001 From: Shobhit Singh Date: Wed, 16 Apr 2025 00:20:33 +0000 Subject: [PATCH] chore!: make `dataset` and `name` params mandatory in `udf` This change will force the user to provide a dataset and name for the BigQuery managed python udf created through BigFrames, for easy discovery and cleanup when necessary. --- README.rst | 1 + bigframes/functions/_function_session.py | 2 +- bigframes/pandas/__init__.py | 4 +- bigframes/session/__init__.py | 19 +++---- .../large/functions/test_managed_function.py | 55 +++++++++++++------ 5 files changed, 52 insertions(+), 29 deletions(-) diff --git a/README.rst b/README.rst index 2419029c29..7f487b9077 100644 --- a/README.rst +++ b/README.rst @@ -25,6 +25,7 @@ Version 2.0 introduces breaking changes for improved security and performance. K ``cloud_function_service_account="default"``. And network ingress now defaults to ``"internal-only"``. * **@remote_function Argument Passing:** Arguments other than ``input_types``, ``output_type``, and ``dataset`` to ``remote_function`` must now be passed using keyword syntax, as positional arguments are no longer supported. +* **@udf Argument Passing:** Arguments ``dataset`` and ``name`` to ``udf`` are now mandatory. * **Endpoint Connections:** Automatic fallback to locational endpoints in certain regions is removed. * **LLM Updates (Gemini Integration):** Integrations now default to the ``gemini-2.0-flash-001`` model. PaLM2 support has been removed; please migrate any existing PaLM2 usage to Gemini. **Note:** The current default diff --git a/bigframes/functions/_function_session.py b/bigframes/functions/_function_session.py index dba03029c8..ec0e977782 100644 --- a/bigframes/functions/_function_session.py +++ b/bigframes/functions/_function_session.py @@ -793,7 +793,7 @@ def udf( ``bigframes.pandas.reset_session``/ ``bigframes.pandas.clean_up_by_session_id``) does not clean up the function, and leaves it for the user to manage the function - and the associated cloud function directly. + directly. packages (str[], Optional): Explicit name of the external package dependencies. Each dependency is added to the `requirements.txt` as is, and can be diff --git a/bigframes/pandas/__init__.py b/bigframes/pandas/__init__.py index 5df69e3da5..8e1e03e024 100644 --- a/bigframes/pandas/__init__.py +++ b/bigframes/pandas/__init__.py @@ -117,9 +117,9 @@ def udf( *, input_types: Union[None, type, Sequence[type]] = None, output_type: Optional[type] = None, - dataset: Optional[str] = None, + dataset: str, bigquery_connection: Optional[str] = None, - name: Optional[str] = None, + name: str, packages: Optional[Sequence[str]] = None, ): return global_session.with_default_session( diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index b94e6985c3..41894c2189 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -1444,9 +1444,9 @@ def udf( *, input_types: Union[None, type, Sequence[type]] = None, output_type: Optional[type] = None, - dataset: Optional[str] = None, + dataset: str, bigquery_connection: Optional[str] = None, - name: Optional[str] = None, + name: str, packages: Optional[Sequence[str]] = None, ): """Decorator to turn a Python user defined function (udf) into a @@ -1473,11 +1473,10 @@ def udf( be specified. The supported output types are `bool`, `bytes`, `float`, `int`, `str`, `list[bool]`, `list[float]`, `list[int]` and `list[str]`. - dataset (str, Optional): + dataset (str): Dataset in which to create a BigQuery managed function. It should be in `.` or `` - format. If this parameter is not provided then session dataset - id is used. + format. bigquery_connection (str, Optional): Name of the BigQuery connection. It is used to provide an identity to the serverless instances running the user code. It @@ -1489,18 +1488,18 @@ def udf( will be created without any connection. A udf without a connection has no internet access and no access to other GCP services. - name (str, Optional): + name (str): Explicit name of the persisted BigQuery managed function. Use it with caution, because more than one users working in the same project and dataset could overwrite each other's managed - functions if they use the same persistent name. When an explicit - name is provided, any session specific clean up ( + functions if they use the same persistent name. Please note that + any session specific clean up ( ``bigframes.session.Session.close``/ ``bigframes.pandas.close_session``/ ``bigframes.pandas.reset_session``/ ``bigframes.pandas.clean_up_by_session_id``) does not clean up - the function, and leaves it for the user to manage the function - and the associated cloud function directly. + this function, and leaves it for the user to manage the function + directly. packages (str[], Optional): Explicit name of the external package dependencies. Each dependency is added to the `requirements.txt` as is, and can be diff --git a/tests/system/large/functions/test_managed_function.py b/tests/system/large/functions/test_managed_function.py index 4902489ac7..a15bce83ad 100644 --- a/tests/system/large/functions/test_managed_function.py +++ b/tests/system/large/functions/test_managed_function.py @@ -16,12 +16,15 @@ import pandas import pyarrow import pytest +import test_utils.prefixer import bigframes import bigframes.exceptions as bfe import bigframes.pandas as bpd from tests.system.utils import cleanup_function_assets +prefixer = test_utils.prefixer.Prefixer("bigframes", "") + def test_managed_function_multiply_with_ibis( session, @@ -37,6 +40,7 @@ def test_managed_function_multiply_with_ibis( input_types=[int, int], output_type=int, dataset=dataset_id, + name=prefixer.create_prefix(), ) def multiply(x, y): return x * y @@ -87,6 +91,7 @@ def test_managed_function_stringify_with_ibis( input_types=[int], output_type=str, dataset=dataset_id, + name=prefixer.create_prefix(), ) def stringify(x): return f"I got {x}" @@ -123,7 +128,10 @@ def stringify(x): def test_managed_function_array_output(session, scalars_dfs, dataset_id): try: - @session.udf(dataset=dataset_id) + @session.udf( + dataset=dataset_id, + name=prefixer.create_prefix(), + ) def featurize(x: int) -> list[float]: return [float(i) for i in [x, x + 1, x + 2]] @@ -160,10 +168,10 @@ def featurize(x: int) -> list[float]: cleanup_function_assets(featurize, session.bqclient, ignore_failures=False) -def test_managed_function_series_apply(session, scalars_dfs): +def test_managed_function_series_apply(session, dataset_id, scalars_dfs): try: - @session.udf() + @session.udf(dataset=dataset_id, name=prefixer.create_prefix()) def foo(x: int) -> bytes: return bytes(abs(x)) @@ -214,13 +222,14 @@ def foo(x: int) -> bytes: def test_managed_function_series_apply_array_output( session, + dataset_id, scalars_dfs, ): try: with pytest.warns(bfe.PreviewWarning, match="udf is in preview."): - @session.udf() + @session.udf(dataset=dataset_id, name=prefixer.create_prefix()) def foo_list(x: int) -> list[float]: return [float(abs(x)), float(abs(x) + 1)] @@ -243,7 +252,7 @@ def foo_list(x: int) -> list[float]: cleanup_function_assets(foo_list, session.bqclient, ignore_failures=False) -def test_managed_function_series_combine(session, scalars_dfs): +def test_managed_function_series_combine(session, dataset_id, scalars_dfs): try: # This function is deliberately written to not work with NA input. def add(x: int, y: int) -> int: @@ -258,7 +267,9 @@ def add(x: int, y: int) -> int: # make sure there are NA values in the test column. assert any([pandas.isna(val) for val in bf_df[int_col_name_with_nulls]]) - add_managed_func = session.udf()(add) + add_managed_func = session.udf( + dataset=dataset_id, name=prefixer.create_prefix() + )(add) # with nulls in the series the managed function application would fail. with pytest.raises( @@ -301,7 +312,7 @@ def add(x: int, y: int) -> int: ) -def test_managed_function_series_combine_array_output(session, scalars_dfs): +def test_managed_function_series_combine_array_output(session, dataset_id, scalars_dfs): try: def add_list(x: int, y: int) -> list[int]: @@ -316,7 +327,9 @@ def add_list(x: int, y: int) -> list[int]: # Make sure there are NA values in the test column. assert any([pandas.isna(val) for val in bf_df[int_col_name_with_nulls]]) - add_list_managed_func = session.udf()(add_list) + add_list_managed_func = session.udf( + dataset=dataset_id, name=prefixer.create_prefix() + )(add_list) # After filtering out nulls the managed function application should work # similar to pandas. @@ -364,7 +377,7 @@ def add_list(x: int, y: int) -> list[int]: ) -def test_managed_function_dataframe_map(session, scalars_dfs): +def test_managed_function_dataframe_map(session, dataset_id, scalars_dfs): try: def add_one(x): @@ -373,6 +386,8 @@ def add_one(x): mf_add_one = session.udf( input_types=[int], output_type=int, + dataset=dataset_id, + name=prefixer.create_prefix(), )(add_one) scalars_df, scalars_pandas_df = scalars_dfs @@ -398,9 +413,7 @@ def add_one(x): cleanup_function_assets(mf_add_one, session.bqclient, ignore_failures=False) -def test_managed_function_dataframe_map_array_output( - session, scalars_dfs, dataset_id_permanent -): +def test_managed_function_dataframe_map_array_output(session, scalars_dfs, dataset_id): try: def add_one_list(x): @@ -409,6 +422,8 @@ def add_one_list(x): mf_add_one_list = session.udf( input_types=[int], output_type=list[int], + dataset=dataset_id, + name=prefixer.create_prefix(), )(add_one_list) scalars_df, scalars_pandas_df = scalars_dfs @@ -439,7 +454,7 @@ def add_one_list(x): ) -def test_managed_function_dataframe_apply_axis_1(session, scalars_dfs): +def test_managed_function_dataframe_apply_axis_1(session, dataset_id, scalars_dfs): try: scalars_df, scalars_pandas_df = scalars_dfs series = scalars_df["int64_too"] @@ -451,6 +466,8 @@ def add_ints(x, y): add_ints_mf = session.udf( input_types=[int, int], output_type=int, + dataset=dataset_id, + name=prefixer.create_prefix(), )(add_ints) assert add_ints_mf.bigframes_bigquery_function # type: ignore @@ -475,7 +492,7 @@ def add_ints(x, y): cleanup_function_assets(add_ints_mf, session.bqclient, ignore_failures=False) -def test_managed_function_dataframe_apply_axis_1_array_output(session): +def test_managed_function_dataframe_apply_axis_1_array_output(session, dataset_id): bf_df = bigframes.dataframe.DataFrame( { "Id": [1, 2, 3], @@ -498,6 +515,8 @@ def test_managed_function_dataframe_apply_axis_1_array_output(session): @session.udf( input_types=[int, float, str], output_type=list[str], + dataset=dataset_id, + name=prefixer.create_prefix(), ) def foo(x, y, z): return [str(x), str(y), z] @@ -591,12 +610,16 @@ def foo(x, y, z): ], ) def test_managed_function_with_connection( - session, scalars_dfs, request, connection_fixture + session, scalars_dfs, dataset_id, request, connection_fixture ): try: bigquery_connection = request.getfixturevalue(connection_fixture) - @session.udf(bigquery_connection=bigquery_connection) + @session.udf( + bigquery_connection=bigquery_connection, + dataset=dataset_id, + name=prefixer.create_prefix(), + ) def foo(x: int) -> int: return x + 10