fix!: make dataset and name params mandatory in udf (#1619)

shobsi · web-flow · commit 637e860d3cea · 2025-04-16T09:20:35.000-05:00
This change will force the user to provide a dataset and name for the
BigQuery managed python udf created through BigFrames, for easy
discovery and cleanup when necessary.
diff --git a/README.rst b/README.rst
@@ -25,6 +25,7 @@ Version 2.0 introduces breaking changes for improved security and performance. K
   ``cloud_function_service_account="default"``. And network ingress now defaults to ``"internal-only"``.
 * **@remote_function Argument Passing:** Arguments other than ``input_types``, ``output_type``, and ``dataset``
   to ``remote_function`` must now be passed using keyword syntax, as positional arguments are no longer supported.
+* **@udf Argument Passing:** Arguments ``dataset`` and ``name`` to ``udf`` are now mandatory.
 * **Endpoint Connections:** Automatic fallback to locational endpoints in certain regions is removed.
 * **LLM Updates (Gemini Integration):** Integrations now default to the ``gemini-2.0-flash-001`` model.
   PaLM2 support has been removed; please migrate any existing PaLM2 usage to Gemini. **Note:** The current default
diff --git a/bigframes/functions/_function_session.py b/bigframes/functions/_function_session.py
@@ -793,7 +793,7 @@ def udf(
                 ``bigframes.pandas.reset_session``/
                 ``bigframes.pandas.clean_up_by_session_id``) does not clean up
                 the function, and leaves it for the user to manage the function
-                and the associated cloud function directly.
+                directly.
             packages (str[], Optional):
                 Explicit name of the external package dependencies. Each
                 dependency is added to the `requirements.txt` as is, and can be
diff --git a/bigframes/pandas/__init__.py b/bigframes/pandas/__init__.py
@@ -117,9 +117,9 @@ def udf(
     *,
     input_types: Union[None, type, Sequence[type]] = None,
     output_type: Optional[type] = None,
-    dataset: Optional[str] = None,
+    dataset: str,
     bigquery_connection: Optional[str] = None,
-    name: Optional[str] = None,
+    name: str,
     packages: Optional[Sequence[str]] = None,
 ):
     return global_session.with_default_session(
diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py
@@ -1441,9 +1441,9 @@ def udf(
         *,
         input_types: Union[None, type, Sequence[type]] = None,
         output_type: Optional[type] = None,
-        dataset: Optional[str] = None,
+        dataset: str,
         bigquery_connection: Optional[str] = None,
-        name: Optional[str] = None,
+        name: str,
         packages: Optional[Sequence[str]] = None,
     ):
         """Decorator to turn a Python user defined function (udf) into a
@@ -1470,11 +1470,10 @@ def udf(
                 be specified. The supported output types are `bool`, `bytes`,
                 `float`, `int`, `str`, `list[bool]`, `list[float]`, `list[int]`
                 and `list[str]`.
-            dataset (str, Optional):
+            dataset (str):
                 Dataset in which to create a BigQuery managed function. It
                 should be in `<project_id>.<dataset_name>` or `<dataset_name>`
-                format. If this parameter is not provided then session dataset
-                id is used.
+                format.
             bigquery_connection (str, Optional):
                 Name of the BigQuery connection. It is used to provide an
                 identity to the serverless instances running the user code. It
@@ -1486,18 +1485,18 @@ def udf(
                 will be created without any connection. A udf without a
                 connection has no internet access and no access to other GCP
                 services.
-            name (str, Optional):
+            name (str):
                 Explicit name of the persisted BigQuery managed function. Use it
                 with caution, because more than one users working in the same
                 project and dataset could overwrite each other's managed
-                functions if they use the same persistent name. When an explicit
-                name is provided, any session specific clean up (
+                functions if they use the same persistent name. Please note that
+                any session specific clean up (
                 ``bigframes.session.Session.close``/
                 ``bigframes.pandas.close_session``/
                 ``bigframes.pandas.reset_session``/
                 ``bigframes.pandas.clean_up_by_session_id``) does not clean up
-                the function, and leaves it for the user to manage the function
-                and the associated cloud function directly.
+                this function, and leaves it for the user to manage the function
+                directly.
             packages (str[], Optional):
                 Explicit name of the external package dependencies. Each
                 dependency is added to the `requirements.txt` as is, and can be
diff --git a/tests/system/large/functions/test_managed_function.py b/tests/system/large/functions/test_managed_function.py
@@ -16,12 +16,15 @@
 import pandas
 import pyarrow
 import pytest
+import test_utils.prefixer
 
 import bigframes
 import bigframes.exceptions as bfe
 import bigframes.pandas as bpd
 from tests.system.utils import cleanup_function_assets
 
+prefixer = test_utils.prefixer.Prefixer("bigframes", "")
+
 
 def test_managed_function_multiply_with_ibis(
     session,
@@ -37,6 +40,7 @@ def test_managed_function_multiply_with_ibis(
             input_types=[int, int],
             output_type=int,
             dataset=dataset_id,
+            name=prefixer.create_prefix(),
         )
         def multiply(x, y):
             return x * y
@@ -87,6 +91,7 @@ def test_managed_function_stringify_with_ibis(
             input_types=[int],
             output_type=str,
             dataset=dataset_id,
+            name=prefixer.create_prefix(),
         )
         def stringify(x):
             return f"I got {x}"
@@ -123,7 +128,10 @@ def stringify(x):
 def test_managed_function_array_output(session, scalars_dfs, dataset_id):
     try:
 
-        @session.udf(dataset=dataset_id)
+        @session.udf(
+            dataset=dataset_id,
+            name=prefixer.create_prefix(),
+        )
         def featurize(x: int) -> list[float]:
             return [float(i) for i in [x, x + 1, x + 2]]
 
@@ -160,10 +168,10 @@ def featurize(x: int) -> list[float]:
         cleanup_function_assets(featurize, session.bqclient, ignore_failures=False)
 
 
-def test_managed_function_series_apply(session, scalars_dfs):
+def test_managed_function_series_apply(session, dataset_id, scalars_dfs):
     try:
 
-        @session.udf()
+        @session.udf(dataset=dataset_id, name=prefixer.create_prefix())
         def foo(x: int) -> bytes:
             return bytes(abs(x))
 
@@ -214,13 +222,14 @@ def foo(x: int) -> bytes:
 
 def test_managed_function_series_apply_array_output(
     session,
+    dataset_id,
     scalars_dfs,
 ):
     try:
 
         with pytest.warns(bfe.PreviewWarning, match="udf is in preview."):
 
-            @session.udf()
+            @session.udf(dataset=dataset_id, name=prefixer.create_prefix())
             def foo_list(x: int) -> list[float]:
                 return [float(abs(x)), float(abs(x) + 1)]
 
@@ -243,7 +252,7 @@ def foo_list(x: int) -> list[float]:
         cleanup_function_assets(foo_list, session.bqclient, ignore_failures=False)
 
 
-def test_managed_function_series_combine(session, scalars_dfs):
+def test_managed_function_series_combine(session, dataset_id, scalars_dfs):
     try:
         # This function is deliberately written to not work with NA input.
         def add(x: int, y: int) -> int:
@@ -258,7 +267,9 @@ def add(x: int, y: int) -> int:
         # make sure there are NA values in the test column.
         assert any([pandas.isna(val) for val in bf_df[int_col_name_with_nulls]])
 
-        add_managed_func = session.udf()(add)
+        add_managed_func = session.udf(
+            dataset=dataset_id, name=prefixer.create_prefix()
+        )(add)
 
         # with nulls in the series the managed function application would fail.
         with pytest.raises(
@@ -301,7 +312,7 @@ def add(x: int, y: int) -> int:
         )
 
 
-def test_managed_function_series_combine_array_output(session, scalars_dfs):
+def test_managed_function_series_combine_array_output(session, dataset_id, scalars_dfs):
     try:
 
         def add_list(x: int, y: int) -> list[int]:
@@ -316,7 +327,9 @@ def add_list(x: int, y: int) -> list[int]:
         # Make sure there are NA values in the test column.
         assert any([pandas.isna(val) for val in bf_df[int_col_name_with_nulls]])
 
-        add_list_managed_func = session.udf()(add_list)
+        add_list_managed_func = session.udf(
+            dataset=dataset_id, name=prefixer.create_prefix()
+        )(add_list)
 
         # After filtering out nulls the managed function application should work
         # similar to pandas.
@@ -364,7 +377,7 @@ def add_list(x: int, y: int) -> list[int]:
         )
 
 
-def test_managed_function_dataframe_map(session, scalars_dfs):
+def test_managed_function_dataframe_map(session, dataset_id, scalars_dfs):
     try:
 
         def add_one(x):
@@ -373,6 +386,8 @@ def add_one(x):
         mf_add_one = session.udf(
             input_types=[int],
             output_type=int,
+            dataset=dataset_id,
+            name=prefixer.create_prefix(),
         )(add_one)
 
         scalars_df, scalars_pandas_df = scalars_dfs
@@ -398,9 +413,7 @@ def add_one(x):
         cleanup_function_assets(mf_add_one, session.bqclient, ignore_failures=False)
 
 
-def test_managed_function_dataframe_map_array_output(
-    session, scalars_dfs, dataset_id_permanent
-):
+def test_managed_function_dataframe_map_array_output(session, scalars_dfs, dataset_id):
     try:
 
         def add_one_list(x):
@@ -409,6 +422,8 @@ def add_one_list(x):
         mf_add_one_list = session.udf(
             input_types=[int],
             output_type=list[int],
+            dataset=dataset_id,
+            name=prefixer.create_prefix(),
         )(add_one_list)
 
         scalars_df, scalars_pandas_df = scalars_dfs
@@ -439,7 +454,7 @@ def add_one_list(x):
         )
 
 
-def test_managed_function_dataframe_apply_axis_1(session, scalars_dfs):
+def test_managed_function_dataframe_apply_axis_1(session, dataset_id, scalars_dfs):
     try:
         scalars_df, scalars_pandas_df = scalars_dfs
         series = scalars_df["int64_too"]
@@ -451,6 +466,8 @@ def add_ints(x, y):
         add_ints_mf = session.udf(
             input_types=[int, int],
             output_type=int,
+            dataset=dataset_id,
+            name=prefixer.create_prefix(),
         )(add_ints)
         assert add_ints_mf.bigframes_bigquery_function  # type: ignore
 
@@ -475,7 +492,7 @@ def add_ints(x, y):
         cleanup_function_assets(add_ints_mf, session.bqclient, ignore_failures=False)
 
 
-def test_managed_function_dataframe_apply_axis_1_array_output(session):
+def test_managed_function_dataframe_apply_axis_1_array_output(session, dataset_id):
     bf_df = bigframes.dataframe.DataFrame(
         {
             "Id": [1, 2, 3],
@@ -498,6 +515,8 @@ def test_managed_function_dataframe_apply_axis_1_array_output(session):
         @session.udf(
             input_types=[int, float, str],
             output_type=list[str],
+            dataset=dataset_id,
+            name=prefixer.create_prefix(),
         )
         def foo(x, y, z):
             return [str(x), str(y), z]
@@ -591,12 +610,16 @@ def foo(x, y, z):
     ],
 )
 def test_managed_function_with_connection(
-    session, scalars_dfs, request, connection_fixture
+    session, scalars_dfs, dataset_id, request, connection_fixture
 ):
     try:
         bigquery_connection = request.getfixturevalue(connection_fixture)
 
-        @session.udf(bigquery_connection=bigquery_connection)
+        @session.udf(
+            bigquery_connection=bigquery_connection,
+            dataset=dataset_id,
+            name=prefixer.create_prefix(),
+        )
         def foo(x: int) -> int:
             return x + 10