12
12
# See the License for the specific language governing permissions and
13
13
# limitations under the License.
14
14
15
+ from __future__ import annotations
16
+
15
17
import re
16
18
import typing
17
- from typing import List , Optional
19
+ from typing import Dict , List , Optional
18
20
import warnings
19
21
20
22
import numpy as np
@@ -34,7 +36,13 @@ def __init__(self, df) -> None:
34
36
35
37
self ._df : bigframes .dataframe .DataFrame = df
36
38
37
- def filter (self , instruction : str , model , ground_with_google_search : bool = False ):
39
+ def filter (
40
+ self ,
41
+ instruction : str ,
42
+ model ,
43
+ ground_with_google_search : bool = False ,
44
+ attach_logprobs : bool = False ,
45
+ ):
38
46
"""
39
47
Filters the DataFrame with the semantics of the user instruction.
40
48
@@ -74,6 +82,10 @@ def filter(self, instruction: str, model, ground_with_google_search: bool = Fals
74
82
page for details: https://cloud.google.com/vertex-ai/generative-ai/pricing#google_models
75
83
The default is `False`.
76
84
85
+ attach_logprobs (bool, default False):
86
+ Controls whether to attach an additional "logprob" column for each result. Logprobs are floating-point values reflecting the confidence level
87
+ of the LLM in its responses. Higher values indicate more confidence. The value ranges from negative infinity to 0.
88
+
77
89
Returns:
78
90
bigframes.pandas.DataFrame: DataFrame filtered by the instruction.
79
91
@@ -82,72 +94,27 @@ def filter(self, instruction: str, model, ground_with_google_search: bool = Fals
82
94
ValueError: when the instruction refers to a non-existing column, or when no
83
95
columns are referred to.
84
96
"""
85
- import bigframes .dataframe
86
- import bigframes .series
87
97
88
- self ._validate_model (model )
89
- columns = self ._parse_columns (instruction )
90
- for column in columns :
91
- if column not in self ._df .columns :
92
- raise ValueError (f"Column { column } not found." )
98
+ answer_col = "answer"
93
99
94
- if ground_with_google_search :
95
- msg = exceptions .format_message (
96
- "Enables Grounding with Google Search may impact billing cost. See pricing "
97
- "details: https://cloud.google.com/vertex-ai/generative-ai/pricing#google_models"
98
- )
99
- warnings .warn (msg , category = UserWarning )
100
-
101
- self ._confirm_operation (len (self ._df ))
102
-
103
- df : bigframes .dataframe .DataFrame = self ._df [columns ].copy ()
104
- has_blob_column = False
105
- for column in columns :
106
- if df [column ].dtype == dtypes .OBJ_REF_DTYPE :
107
- # Don't cast blob columns to string
108
- has_blob_column = True
109
- continue
110
-
111
- if df [column ].dtype != dtypes .STRING_DTYPE :
112
- df [column ] = df [column ].astype (dtypes .STRING_DTYPE )
113
-
114
- user_instruction = self ._format_instruction (instruction , columns )
115
- output_instruction = "Based on the provided context, reply to the following claim by only True or False:"
116
-
117
- if has_blob_column :
118
- results = typing .cast (
119
- bigframes .dataframe .DataFrame ,
120
- model .predict (
121
- df ,
122
- prompt = self ._make_multimodel_prompt (
123
- df , columns , user_instruction , output_instruction
124
- ),
125
- temperature = 0.0 ,
126
- ground_with_google_search = ground_with_google_search ,
127
- ),
128
- )
129
- else :
130
- results = typing .cast (
131
- bigframes .dataframe .DataFrame ,
132
- model .predict (
133
- self ._make_text_prompt (
134
- df , columns , user_instruction , output_instruction
135
- ),
136
- temperature = 0.0 ,
137
- ground_with_google_search = ground_with_google_search ,
138
- ),
139
- )
100
+ output_schema = {answer_col : "bool" }
101
+ result = self .map (
102
+ instruction ,
103
+ model ,
104
+ output_schema ,
105
+ ground_with_google_search ,
106
+ attach_logprobs ,
107
+ )
140
108
141
- return self ._df [
142
- results ["ml_generate_text_llm_result" ].str .lower ().str .contains ("true" )
143
- ]
109
+ return result [result [answer_col ]].drop (answer_col , axis = 1 )
144
110
145
111
def map (
146
112
self ,
147
113
instruction : str ,
148
- output_column : str ,
149
114
model ,
115
+ output_schema : Dict [str , str ] | None = None ,
150
116
ground_with_google_search : bool = False ,
117
+ attach_logprobs = False ,
151
118
):
152
119
"""
153
120
Maps the DataFrame with the semantics of the user instruction.
@@ -163,7 +130,7 @@ def map(
163
130
>>> model = llm.GeminiTextGenerator(model_name="gemini-2.0-flash-001")
164
131
165
132
>>> df = bpd.DataFrame({"ingredient_1": ["Burger Bun", "Soy Bean"], "ingredient_2": ["Beef Patty", "Bittern"]})
166
- >>> df.ai.map("What is the food made from {ingredient_1} and {ingredient_2}? One word only.", output_column= "food", model=model )
133
+ >>> df.ai.map("What is the food made from {ingredient_1} and {ingredient_2}? One word only.", model=model, output_schema={ "food": "string"} )
167
134
ingredient_1 ingredient_2 food
168
135
0 Burger Bun Beef Patty Burger
169
136
<BLANKLINE>
@@ -180,12 +147,14 @@ def map(
180
147
in the instructions like:
181
148
"Get the ingredients of {food}."
182
149
183
- output_column (str):
184
- The column name of the mapping result.
185
-
186
150
model (bigframes.ml.llm.GeminiTextGenerator):
187
151
A GeminiTextGenerator provided by Bigframes ML package.
188
152
153
+ output_schema (Dict[str, str] or None, default None):
154
+ The schema used to generate structured output as a bigframes DataFrame. The schema is a string key-value pair of <column_name>:<type>.
155
+ Supported types are int64, float64, bool, string, array<type> and struct<column type>. If None, generate string result under the column
156
+ "ml_generate_text_llm_result".
157
+
189
158
ground_with_google_search (bool, default False):
190
159
Enables Grounding with Google Search for the GeminiTextGenerator model.
191
160
When set to True, the model incorporates relevant information from Google
@@ -194,6 +163,11 @@ def map(
194
163
page for details: https://cloud.google.com/vertex-ai/generative-ai/pricing#google_models
195
164
The default is `False`.
196
165
166
+ attach_logprobs (bool, default False):
167
+ Controls whether to attach an additional "logprob" column for each result. Logprobs are floating-point values reflecting the confidence level
168
+ of the LLM in its responses. Higher values indicate more confidence. The value ranges from negative infinity to 0.
169
+
170
+
197
171
Returns:
198
172
bigframes.pandas.DataFrame: DataFrame with attached mapping results.
199
173
@@ -236,6 +210,9 @@ def map(
236
210
"Based on the provided contenxt, answer the following instruction:"
237
211
)
238
212
213
+ if output_schema is None :
214
+ output_schema = {"ml_generate_text_llm_result" : "string" }
215
+
239
216
if has_blob_column :
240
217
results = typing .cast (
241
218
bigframes .series .Series ,
@@ -246,7 +223,8 @@ def map(
246
223
),
247
224
temperature = 0.0 ,
248
225
ground_with_google_search = ground_with_google_search ,
249
- )["ml_generate_text_llm_result" ],
226
+ output_schema = output_schema ,
227
+ ),
250
228
)
251
229
else :
252
230
results = typing .cast (
@@ -257,19 +235,36 @@ def map(
257
235
),
258
236
temperature = 0.0 ,
259
237
ground_with_google_search = ground_with_google_search ,
260
- )["ml_generate_text_llm_result" ],
238
+ output_schema = output_schema ,
239
+ ),
240
+ )
241
+
242
+ attach_columns = [results [col ] for col , _ in output_schema .items ()]
243
+
244
+ def extract_logprob (s : bigframes .series .Series ) -> bigframes .series .Series :
245
+ from bigframes import bigquery as bbq
246
+
247
+ logprob_jsons = bbq .json_extract_array (s , "$.candidates" ).list [0 ]
248
+ logprobs = bbq .json_extract (logprob_jsons , "$.avg_logprobs" ).astype (
249
+ "Float64"
261
250
)
251
+ logprobs .name = "logprob"
252
+ return logprobs
253
+
254
+ if attach_logprobs :
255
+ attach_columns .append (extract_logprob (results ["full_response" ]))
262
256
263
257
from bigframes .core .reshape .api import concat
264
258
265
- return concat ([self ._df , results . rename ( output_column ) ], axis = 1 )
259
+ return concat ([self ._df , * attach_columns ], axis = 1 )
266
260
267
261
def join (
268
262
self ,
269
263
other ,
270
264
instruction : str ,
271
265
model ,
272
266
ground_with_google_search : bool = False ,
267
+ attach_logprobs = False ,
273
268
):
274
269
"""
275
270
Joins two dataframes by applying the instruction over each pair of rows from
@@ -313,10 +308,6 @@ def join(
313
308
model (bigframes.ml.llm.GeminiTextGenerator):
314
309
A GeminiTextGenerator provided by Bigframes ML package.
315
310
316
- max_rows (int, default 1000):
317
- The maximum number of rows allowed to be sent to the model per call. If the result is too large, the method
318
- call will end early with an error.
319
-
320
311
ground_with_google_search (bool, default False):
321
312
Enables Grounding with Google Search for the GeminiTextGenerator model.
322
313
When set to True, the model incorporates relevant information from Google
@@ -325,6 +316,10 @@ def join(
325
316
page for details: https://cloud.google.com/vertex-ai/generative-ai/pricing#google_models
326
317
The default is `False`.
327
318
319
+ attach_logprobs (bool, default False):
320
+ Controls whether to attach an additional "logprob" column for each result. Logprobs are floating-point values reflecting the confidence level
321
+ of the LLM in its responses. Higher values indicate more confidence. The value ranges from negative infinity to 0.
322
+
328
323
Returns:
329
324
bigframes.pandas.DataFrame: The joined dataframe.
330
325
@@ -400,7 +395,10 @@ def join(
400
395
joined_df = self ._df .merge (other , how = "cross" , suffixes = ("_left" , "_right" ))
401
396
402
397
return joined_df .ai .filter (
403
- instruction , model , ground_with_google_search = ground_with_google_search
398
+ instruction ,
399
+ model ,
400
+ ground_with_google_search = ground_with_google_search ,
401
+ attach_logprobs = attach_logprobs ,
404
402
).reset_index (drop = True )
405
403
406
404
def search (
0 commit comments