Commit 53caa8d

docs: add sample code snippets for udf (#1649)
* docs: add sample code snippets for `udf`
* remove connection cleanup, not needed for udf
* use bigframes project for doctest
* restore python version agnostic logic for udf
1 parent 6199023 commit 53caa8d

5 files changed, +184 -5 lines changed

bigframes/functions/_function_session.py

Lines changed: 12 additions & 3 deletions
@@ -838,9 +838,18 @@ def wrapper(func):
                 TypeError, f"func must be a callable, got {func}"
             )
 
-            # Managed function supports version >= 3.11.
-            signature_kwargs: Mapping[str, Any] = {"eval_str": True}
-            signature = inspect.signature(func, **signature_kwargs)
+            if sys.version_info >= (3, 10):
+                # Add `eval_str = True` so that deferred annotations are turned into their
+                # corresponding type objects. Need Python 3.10 for eval_str parameter.
+                # https://docs.python.org/3/library/inspect.html#inspect.signature
+                signature_kwargs: Mapping[str, Any] = {"eval_str": True}
+            else:
+                signature_kwargs = {}  # type: ignore
+
+            signature = inspect.signature(
+                func,
+                **signature_kwargs,
+            )
 
             # Try to get input types via type annotations.
             if input_types is None:

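For context on this hunk: starting with Python 3.10, inspect.signature accepts an eval_str parameter that evaluates string (deferred) annotations into real type objects, which the function-session code needs in order to infer udf input and output types; on older interpreters it falls back to the default behavior. A minimal standalone sketch (not part of this commit) of what eval_str=True changes:

from __future__ import annotations  # stores all annotations as strings (PEP 563)

import inspect
import sys


def get_bucket(num: float) -> str:
    return "at_or_above_4000" if num >= 4000 else "below_4000"


if sys.version_info >= (3, 10):
    # Evaluates the string annotation "float" back into the float type object.
    sig = inspect.signature(get_bucket, eval_str=True)
else:
    # Older Pythons leave the annotation as the literal string "float".
    sig = inspect.signature(get_bucket)

print(sig.parameters["num"].annotation)  # <class 'float'> on Python >= 3.10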
samples/snippets/conftest.py

Lines changed: 11 additions & 0 deletions
@@ -24,6 +24,8 @@
     "python-bigquery-dataframes", "samples/snippets"
 )
 
+routine_prefixer = test_utils.prefixer.Prefixer("bigframes", "")
+
 
 @pytest.fixture(scope="session", autouse=True)
 def cleanup_datasets(bigquery_client: bigquery.Client) -> None:

@@ -106,3 +108,12 @@ def random_model_id_eu(
     full_model_id = f"{project_id}.{dataset_id_eu}.{random_model_id_eu}"
     yield full_model_id
     bigquery_client.delete_model(full_model_id, not_found_ok=True)
+
+
+@pytest.fixture
+def routine_id() -> Iterator[str]:
+    """Create a new BQ routine ID each time, so random_routine_id can be used
+    as the target for udf creation.
+    """
+    random_routine_id = routine_prefixer.create_prefix()
+    yield random_routine_id

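For context, the Prefixer helper generates unique, timestamped routine IDs so that concurrent test runs do not collide and stale test routines can be identified for cleanup. A rough, hypothetical sketch of what a create_prefix()-style helper produces (the real implementation lives in test_utils.prefixer.Prefixer and may differ):

import datetime
import random
import string


def create_prefix(product: str = "bigframes") -> str:
    # Embed the product name and a UTC timestamp so stale resources are easy
    # to spot; the random suffix avoids collisions between parallel runs.
    timestamp = datetime.datetime.now(datetime.timezone.utc).strftime("%Y%m%d%H%M%S")
    suffix = "".join(random.choices(string.ascii_lowercase, k=8))
    return f"{product}_{timestamp}_{suffix}"


print(create_prefix())  # e.g. bigframes_20250101120000_qwzxcvbn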
samples/snippets/remote_function.py

Lines changed: 2 additions & 2 deletions
@@ -21,7 +21,7 @@ def run_remote_function_and_read_gbq_function(project_id: str) -> None:
 
     # Set BigQuery DataFrames options
     bpd.options.bigquery.project = your_gcp_project_id
-    bpd.options.bigquery.location = "us"
+    bpd.options.bigquery.location = "US"
 
     # BigQuery DataFrames gives you the ability to turn your custom scalar
     # functions into a BigQuery remote function. It requires the GCP project to

@@ -56,7 +56,7 @@ def get_bucket(num: float) -> str:
         boundary = 4000
         return "at_or_above_4000" if num >= boundary else "below_4000"
 
-    # Then we can apply the remote function on the `Series`` of interest via
+    # Then we can apply the remote function on the `Series` of interest via
     # `apply` API and store the result in a new column in the DataFrame.
     df = df.assign(body_mass_bucket=df["body_mass_g"].apply(get_bucket))

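A note on the one-character fix above: BigQuery multi-region locations are conventionally written in uppercase ("US", "EU"), matching how the API reports dataset locations back, while single regions are lowercase like "us-central1". Illustration only; the project id below is a placeholder:

import bigframes.pandas as bpd

bpd.options.bigquery.project = "your-project-id"  # placeholder
bpd.options.bigquery.location = "US"  # multi-region; a single region would be e.g. "us-central1"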
samples/snippets/udf.py

Lines changed: 121 additions & 0 deletions
@@ -0,0 +1,121 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+def run_udf_and_read_gbq_function(
+    project_id: str, dataset_id: str, routine_id: str
+) -> None:
+    your_gcp_project_id = project_id
+    your_bq_dataset_id = dataset_id
+    your_bq_routine_id = routine_id
+
+    # [START bigquery_dataframes_udf]
+    import bigframes.pandas as bpd
+
+    # Set BigQuery DataFrames options
+    bpd.options.bigquery.project = your_gcp_project_id
+    bpd.options.bigquery.location = "US"
+
+    # BigQuery DataFrames gives you the ability to turn your custom functions
+    # into a BigQuery Python UDF. One can find more details about the usage
+    # and the requirements via the `help` command.
+    help(bpd.udf)
+
+    # Read a table and inspect the column of interest.
+    df = bpd.read_gbq("bigquery-public-data.ml_datasets.penguins")
+    df["body_mass_g"].peek(10)
+
+    # Define a custom function, and specify the intent to turn it into a
+    # BigQuery Python UDF. Let's try a `pandas`-like use case in which we
+    # want to apply a user-defined function to every value in a `Series`,
+    # more specifically bucketize the `body_mass_g` value of the penguins,
+    # which is a real number, into a category, which is a string.
+    @bpd.udf(
+        dataset=your_bq_dataset_id,
+        name=your_bq_routine_id,
+    )
+    def get_bucket(num: float) -> str:
+        if not num:
+            return "NA"
+        boundary = 4000
+        return "at_or_above_4000" if num >= boundary else "below_4000"
+
+    # Then we can apply the udf on the `Series` of interest via the `apply`
+    # API and store the result in a new column in the DataFrame.
+    df = df.assign(body_mass_bucket=df["body_mass_g"].apply(get_bucket))
+
+    # This will add a new column `body_mass_bucket` in the DataFrame. You can
+    # preview the original value and the bucketized value side by side.
+    df[["body_mass_g", "body_mass_bucket"]].peek(10)
+
+    # The above operation was possible by doing all the computation on the
+    # cloud through an underlying BigQuery Python UDF that was created to
+    # support the user's operations in the Python code.
+
+    # The BigQuery Python UDF created to support the BigQuery DataFrames
+    # udf can be located via the `bigframes_bigquery_function` property
+    # set on the udf object.
+    print(f"Created BQ Python UDF: {get_bucket.bigframes_bigquery_function}")
+
+    # If you have already defined a custom function in BigQuery, either via
+    # the BigQuery Google Cloud Console or with the `udf` decorator, or
+    # otherwise, you may use it with BigQuery DataFrames via the
+    # `read_gbq_function` method. More details are available via the `help`
+    # command.
+    help(bpd.read_gbq_function)
+
+    existing_get_bucket_bq_udf = get_bucket.bigframes_bigquery_function
+
+    # Here is an example of using `read_gbq_function` to load an existing
+    # BigQuery Python UDF.
+    df = bpd.read_gbq("bigquery-public-data.ml_datasets.penguins")
+    get_bucket_function = bpd.read_gbq_function(existing_get_bucket_bq_udf)
+
+    df = df.assign(body_mass_bucket=df["body_mass_g"].apply(get_bucket_function))
+    df.peek(10)
+
+    # Let's continue trying other potential use cases of udf. Let's say we
+    # consider the `species`, `island` and `sex` of the penguins sensitive
+    # information and want to redact it by replacing it with a hash code
+    # instead. Let's define another scalar custom function and decorate it
+    # as a udf. The custom function in this example has an external package
+    # dependency, which can be specified via the `packages` parameter.
+    @bpd.udf(
+        dataset=your_bq_dataset_id,
+        name=your_bq_routine_id,
+        packages=["cryptography"],
+    )
+    def get_hash(input: str) -> str:
+        from cryptography.fernet import Fernet
+
+        # handle missing value
+        if input is None:
+            input = ""
+
+        key = Fernet.generate_key()
+        f = Fernet(key)
+        return f.encrypt(input.encode()).decode()
+
+    # We can use this udf in another `pandas`-like API, `map`, which can be
+    # applied on a DataFrame.
+    df_redacted = df[["species", "island", "sex"]].map(get_hash)
+    df_redacted.peek(10)
+
+    # [END bigquery_dataframes_udf]
+
+    # Clean up cloud artifacts
+    session = bpd.get_global_session()
+    session.bqclient.delete_routine(
+        f"{your_bq_dataset_id}.{your_bq_routine_id}", not_found_ok=True
+    )

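Since @bpd.udf materializes a regular BigQuery routine, it can also be invoked from plain SQL outside BigQuery DataFrames. A hedged illustration (not part of this commit; the dataset and routine names are placeholders for the values used in the sample):

import bigframes.pandas as bpd

sql = """
SELECT
  body_mass_g,
  your_bq_dataset_id.your_bq_routine_id(body_mass_g) AS body_mass_bucket
FROM
  `bigquery-public-data.ml_datasets.penguins`
"""
bpd.read_gbq(sql).peek(10)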
samples/snippets/udf_test.py

Lines changed: 38 additions & 0 deletions
@@ -0,0 +1,38 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pytest
+
+import bigframes.pandas
+
+from . import udf
+
+
+def test_udf_and_read_gbq_function(
+    capsys: pytest.CaptureFixture[str],
+    dataset_id: str,
+    routine_id: str,
+) -> None:
+    # We need a fresh session since we're modifying connection options.
+    bigframes.pandas.close_session()
+
+    # Determine the project id; prefer the one set in the environment
+    # variable GOOGLE_CLOUD_PROJECT (if any).
+    import os
+
+    your_project_id = os.getenv("GOOGLE_CLOUD_PROJECT", "bigframes-dev")
+
+    udf.run_udf_and_read_gbq_function(your_project_id, dataset_id, routine_id)
+    out, _ = capsys.readouterr()
+    assert "Created BQ Python UDF:" in out
