feat: support score() in GeminiTextGenerator #740

Merged (4 commits) on Jun 4, 2024
61 changes: 61 additions & 0 deletions bigframes/ml/llm.py
@@ -732,6 +732,67 @@ def predict(

return df

def score(
self,
X: Union[bpd.DataFrame, bpd.Series],
y: Union[bpd.DataFrame, bpd.Series],
task_type: Literal[
"text_generation", "classification", "summarization", "question_answering"
] = "text_generation",
) -> bpd.DataFrame:
"""Calculate evaluation metrics of the model. Only "gemini-pro" model is supported for now.

.. note::

This product or feature is subject to the "Pre-GA Offerings Terms" in the General Service Terms section of the
Service Specific Terms (https://cloud.google.com/terms/service-terms#1). Pre-GA products and features are available "as is"
and might have limited support. For more information, see the launch stage descriptions
(https://cloud.google.com/products#product-launch-stages).

.. note::

Output matches that of the BigQuery ML.EVALUATE function.
See: https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-evaluate#remote-model-llm
for the outputs relevant to this model type.

Args:
X (bigframes.dataframe.DataFrame or bigframes.series.Series):
A BigQuery DataFrame as evaluation data, which contains only one column, input_text,
holding the prompt text to use when evaluating the model.
y (bigframes.dataframe.DataFrame or bigframes.series.Series):
A BigQuery DataFrame as evaluation labels, which contains only one column, output_text,
holding the text you expect the model to return.
task_type (str):
The type of task for the LLM model. Defaults to "text_generation".
Possible values: "text_generation", "classification", "summarization", and "question_answering".

Returns:
bigframes.dataframe.DataFrame: The DataFrame as evaluation result.
"""
if not self._bqml_model:
raise RuntimeError("A model must be fitted before score")

# TODO(ashleyxu): Support gemini-1.5 when the rollout is ready. b/344891364.
if self._bqml_model.model_name.startswith("gemini-1.5"):
raise NotImplementedError("Score is not supported for gemini-1.5 model.")

X, y = utils.convert_to_dataframe(X, y)

if len(X.columns) != 1 or len(y.columns) != 1:
raise ValueError(
f"Only support one column as input for X and y. {constants.FEEDBACK_LINK}"
)

# BQML identifies the columns by name
X_col_label = cast(blocks.Label, X.columns[0])
y_col_label = cast(blocks.Label, y.columns[0])
X = X.rename(columns={X_col_label: "input_text"})
y = y.rename(columns={y_col_label: "output_text"})

input_data = X.join(y, how="outer")

return self._bqml_model.llm_evaluate(input_data, task_type)

def to_gbq(self, model_name: str, replace: bool = False) -> GeminiTextGenerator:
"""Save the model to BigQuery.

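For context, a minimal usage sketch of the new method. The table and column names below are illustrative, not from this PR; only the GeminiTextGenerator constructor and the score() signature come from the diff above.

import bigframes.pandas as bpd
from bigframes.ml import llm

# Hypothetical evaluation table with one prompt column and one expected-output column.
df = bpd.read_gbq("my_project.my_dataset.eval_table")

model = llm.GeminiTextGenerator(model_name="gemini-pro")
metrics = model.score(
    X=df[["prompt"]],  # renamed internally to "input_text"
    y=df[["label"]],   # renamed internally to "output_text"
    task_type="classification",
)
print(metrics.to_pandas())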
12 changes: 12 additions & 0 deletions tests/system/small/ml/conftest.py
@@ -171,6 +171,18 @@ def llm_text_pandas_df():
)


@pytest.fixture(scope="session")
def llm_fine_tune_df_default_index(
session: bigframes.Session,
) -> bigframes.dataframe.DataFrame:
training_table_name = "llm_tuning.emotion_classification_train"
df = session.read_gbq(training_table_name)
prefix = "Please do sentiment analysis on the following text and only output a number from 0 to 5 where 0 means sadness, 1 means joy, 2 means love, 3 means anger, 4 means fear, and 5 means surprise. Text: "
df["prompt"] = prefix + df["text"]
df["label"] = df["label"].astype("string")
return df


@pytest.fixture(scope="session")
def onnx_iris_pandas_df():
"""Data matching the iris dataset."""
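The training table above lives in the test project. For local experimentation, a stand-in with the same shape can be built from an in-memory frame (rows invented for illustration; the prefix string is the one the fixture uses):

import pandas as pd
import bigframes.pandas as bpd

# Invented rows mirroring the schema of llm_tuning.emotion_classification_train.
pandas_df = pd.DataFrame(
    {
        "text": ["i am feeling great today", "that noise was terrifying"],
        "label": [1, 4],
    }
)
df = bpd.read_pandas(pandas_df)
prefix = "Please do sentiment analysis on the following text and only output a number from 0 to 5 where 0 means sadness, 1 means joy, 2 means love, 3 means anger, 4 means fear, and 5 means surprise. Text: "
df["prompt"] = prefix + df["text"]
df["label"] = df["label"].astype("string")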
46 changes: 46 additions & 0 deletions tests/system/small/ml/test_llm.py
@@ -15,6 +15,7 @@
import pytest

from bigframes.ml import llm
from tests.system import utils


def test_create_text_generator_model(
@@ -366,3 +367,48 @@ def test_gemini_text_generator_predict_with_params_success(
assert "ml_generate_text_llm_result" in df.columns
series = df["ml_generate_text_llm_result"]
assert all(series.str.len() > 20)


@pytest.mark.flaky(retries=2)
def test_llm_gemini_pro_score(llm_fine_tune_df_default_index):
model = llm.GeminiTextGenerator(model_name="gemini-pro")

# Check that score() runs and returns the expected schema
score_result = model.score(
X=llm_fine_tune_df_default_index[["prompt"]],
y=llm_fine_tune_df_default_index[["label"]],
).to_pandas()
utils.check_pandas_df_schema_and_index(
score_result,
columns=[
"bleu4_score",
"rouge-l_precision",
"rouge-l_recall",
"rouge-l_f1_score",
"evaluation_status",
],
index=1,
)


@pytest.mark.flaky(retries=2)
def test_llm_gemini_pro_score_params(llm_fine_tune_df_default_index):
model = llm.GeminiTextGenerator(model_name="gemini-pro")

# Check that score() accepts Series inputs and the task_type parameter
score_result = model.score(
X=llm_fine_tune_df_default_index["prompt"],
y=llm_fine_tune_df_default_index["label"],
task_type="classification",
).to_pandas()
utils.check_pandas_df_schema_and_index(
score_result,
columns=[
"precision",
"recall",
"f1_score",
"label",
"evaluation_status",
],
index=6,
)
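The classification result carries one row per distinct label (hence index=6 for the six emotion classes), each with its own precision, recall, and f1_score. A small sketch of reading them back, assuming score_result is the pandas DataFrame produced in the test above:

# One metrics row per label; column names match the schema check above.
for _, row in score_result.iterrows():
    print(
        f"label={row['label']}: precision={row['precision']:.3f}, "
        f"recall={row['recall']:.3f}, f1={row['f1_score']:.3f}"
    )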