diff --git a/README.rst b/README.rst
index 38f7d0ab7a..a977a9f31e 100644
--- a/README.rst
+++ b/README.rst
@@ -172,6 +172,8 @@ Create estimators for imported models by using the `bigframes.ml.imported
   module to import Open Neural Network Exchange (ONNX) models.
 * Use the `TensorFlowModel class `_
   to import TensorFlow models.
+* Use the `XGBoostModel class `_
+  to import XGBoost models.
 
 **Linear models**
 
diff --git a/bigframes/ml/core.py b/bigframes/ml/core.py
index 266ab1b058..51fed05901 100644
--- a/bigframes/ml/core.py
+++ b/bigframes/ml/core.py
@@ -355,3 +355,33 @@ def create_imported_model(
         )
 
         return self._create_model_with_sql(session=session, sql=sql)
+
+    def create_xgboost_imported_model(
+        self,
+        session: bigframes.Session,
+        input: Mapping[str, str] = {},
+        output: Mapping[str, str] = {},
+        options: Mapping[str, Union[str, int, float, Iterable[str]]] = {},
+    ) -> BqmlModel:
+        """Create a session-temporary BQML imported model with the CREATE OR REPLACE MODEL statement.
+
+        Args:
+            input:
+                input schema for imported XGBoost models
+            output:
+                output schema for imported XGBoost models
+            options: a dict of options to configure the model. Generates a BQML OPTIONS
+                clause
+
+        Returns: a BqmlModel wrapping an imported model in BigQuery
+        """
+        model_ref = self._create_model_ref(session._anonymous_dataset)
+
+        sql = self._model_creation_sql_generator.create_xgboost_imported_model(
+            model_ref=model_ref,
+            input=input,
+            output=output,
+            options=options,
+        )
+
+        return self._create_model_with_sql(session=session, sql=sql)
diff --git a/bigframes/ml/globals.py b/bigframes/ml/globals.py
index c139476daa..44e9463727 100644
--- a/bigframes/ml/globals.py
+++ b/bigframes/ml/globals.py
@@ -19,6 +19,17 @@
 _BASE_SQL_GENERATOR = sql.BaseSqlGenerator()
 _BQML_MODEL_FACTORY = core.BqmlModelFactory()
 
+_SUPPORTED_DTYPES = (
+    "bool",
+    "string",
+    "int64",
+    "float64",
+    "array<bool>",
+    "array<string>",
+    "array<int64>",
+    "array<float64>",
+)
+
 
 def base_sql_generator() -> sql.BaseSqlGenerator:
     """Base SQL Generator."""
diff --git a/bigframes/ml/imported.py b/bigframes/ml/imported.py
index 4ae0a8ea4d..e2be154703 100644
--- a/bigframes/ml/imported.py
+++ b/bigframes/ml/imported.py
@@ -16,13 +16,14 @@
 
 from __future__ import annotations
 
-from typing import cast, Optional, Union
+from typing import cast, Mapping, Optional, Union
 
 from google.cloud import bigquery
 
 import bigframes
 from bigframes.core import log_adapter
 from bigframes.ml import base, core, globals, utils
+from bigframes.ml.globals import _SUPPORTED_DTYPES
 import bigframes.pandas as bpd
 
 
@@ -176,3 +177,117 @@ def to_gbq(self, model_name: str, replace: bool = False) -> ONNXModel:
 
         new_model = self._bqml_model.copy(model_name, replace)
         return new_model.session.read_gbq_model(model_name)
+
+
+@log_adapter.class_logger
+class XGBoostModel(base.Predictor):
+    """Imported XGBoost model.
+
+    .. warning::
+
+        Imported XGBoost models have several limitations. See:
+        https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-create-xgboost#limitations
+
+    Args:
+        session (BigQuery Session):
+            BQ session to create the model
+        input (Dict, default None):
+            Specify the model input schema information when you
+            create the XGBoost model. The input should be in the format of
+            {field_name: field_type}. Input is optional only if feature_names
+            and feature_types are both specified in the model file. Supported types
+            are "bool", "string", "int64", "float64", "array<bool>", "array<string>", "array<int64>", "array<float64>".
+        output (Dict, default None):
+            Specify the model output schema information when you
+            create the XGBoost model. The output should be in the format of
+            {field_name: field_type}. Output is optional only if feature_names
+            and feature_types are both specified in the model file. Supported types
+            are "bool", "string", "int64", "float64", "array<bool>", "array<string>", "array<int64>", "array<float64>".
+        model_path (str):
+            Cloud Storage path that holds the model files."""
+
+    def __init__(
+        self,
+        session: Optional[bigframes.Session] = None,
+        input: Mapping[str, str] = {},
+        output: Mapping[str, str] = {},
+        model_path: Optional[str] = None,
+    ):
+        self.session = session or bpd.get_global_session()
+        self.model_path = model_path
+        self.input = input
+        self.output = output
+        self._bqml_model: Optional[core.BqmlModel] = None
+        self._bqml_model_factory = globals.bqml_model_factory()
+
+    def _create_bqml_model(self):
+        options = {"model_type": "XGBOOST", "model_path": self.model_path}
+
+        if not self.input and not self.output:
+            return self._bqml_model_factory.create_imported_model(
+                session=self.session, options=options
+            )
+        else:
+            for io in (self.input, self.output):
+                for v in io.values():
+                    if v not in _SUPPORTED_DTYPES:
+                        raise ValueError(
+                            f"field_type {v} is not supported. We only support {', '.join(_SUPPORTED_DTYPES)}."
+                        )
+
+            return self._bqml_model_factory.create_xgboost_imported_model(
+                session=self.session,
+                input=self.input,
+                output=self.output,
+                options=options,
+            )
+
+    @classmethod
+    def _from_bq(
+        cls, session: bigframes.Session, model: bigquery.Model
+    ) -> XGBoostModel:
+        assert model.model_type == "XGBOOST"
+
+        xgboost_model = cls(session=session, model_path=None)
+        xgboost_model._bqml_model = core.BqmlModel(session, model)
+        return xgboost_model
+
+    def predict(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame:
+        """Predict the result from input DataFrame.
+
+        Args:
+            X (bigframes.dataframe.DataFrame or bigframes.series.Series):
+                Input DataFrame or Series, whose schema is defined by the model.
+
+        Returns:
+            bigframes.dataframe.DataFrame: Output DataFrame, whose schema is defined by the model."""
+
+        if not self._bqml_model:
+            if self.model_path is None:
+                raise ValueError("Model GCS path must be provided.")
+            self._bqml_model = self._create_bqml_model()
+            self._bqml_model = cast(core.BqmlModel, self._bqml_model)
+
+        (X,) = utils.convert_to_dataframe(X)
+
+        return self._bqml_model.predict(X)
+
+    def to_gbq(self, model_name: str, replace: bool = False) -> XGBoostModel:
+        """Save the model to BigQuery.
+
+        Args:
+            model_name (str):
+                The name of the model.
+            replace (bool, default False):
+                Whether to replace the model if it already exists. Defaults to False.
+
+        Returns:
+            XGBoostModel: saved model."""
+        if not self._bqml_model:
+            if self.model_path is None:
+                raise ValueError("Model GCS path must be provided.")
+            self._bqml_model = self._create_bqml_model()
+            self._bqml_model = cast(core.BqmlModel, self._bqml_model)
+
+        new_model = self._bqml_model.copy(model_name, replace)
+        return new_model.session.read_gbq_model(model_name)
diff --git a/bigframes/ml/loader.py b/bigframes/ml/loader.py
index 4ffde43543..99a31922d8 100644
--- a/bigframes/ml/loader.py
+++ b/bigframes/ml/loader.py
@@ -45,6 +45,7 @@
         "RANDOM_FOREST_CLASSIFIER": ensemble.RandomForestClassifier,
         "TENSORFLOW": imported.TensorFlowModel,
         "ONNX": imported.ONNXModel,
+        "XGBOOST": imported.XGBoostModel,
     }
 )
 
@@ -72,6 +73,7 @@ def from_bq(
         ensemble.RandomForestClassifier,
         imported.TensorFlowModel,
         imported.ONNXModel,
+        imported.XGBoostModel,
         llm.PaLM2TextGenerator,
         llm.PaLM2TextEmbeddingGenerator,
         pipeline.Pipeline,
diff --git a/bigframes/ml/remote.py b/bigframes/ml/remote.py
index 8da073802d..44fde4f32f 100644
--- a/bigframes/ml/remote.py
+++ b/bigframes/ml/remote.py
@@ -23,19 +23,9 @@
 from bigframes import clients
 from bigframes.core import log_adapter
 from bigframes.ml import base, core, globals, utils
+from bigframes.ml.globals import _SUPPORTED_DTYPES
 import bigframes.pandas as bpd
 
-_SUPPORTED_DTYPES = (
-    "bool",
-    "string",
-    "int64",
-    "float64",
-    "array<bool>",
-    "array<string>",
-    "array<int64>",
-    "array<float64>",
-)
-
 _REMOTE_MODEL_STATUS = "remote_model_status"
 
diff --git a/bigframes/ml/sql.py b/bigframes/ml/sql.py
index 152f881ec0..f726317bb4 100644
--- a/bigframes/ml/sql.py
+++ b/bigframes/ml/sql.py
@@ -193,6 +193,24 @@ def create_imported_model(
             parts.append(self.options(**options))
         return "\n".join(parts)
 
+    def create_xgboost_imported_model(
+        self,
+        model_ref: google.cloud.bigquery.ModelReference,
+        input: Mapping[str, str] = {},
+        output: Mapping[str, str] = {},
+        options: Mapping[str, Union[str, int, float, Iterable[str]]] = {},
+    ) -> str:
+        """Encode the CREATE OR REPLACE MODEL statement for a BQML imported XGBoost model."""
+
+        parts = [f"CREATE OR REPLACE MODEL {self._model_id_sql(model_ref)}"]
+        if input:
+            parts.append(self.input(**input))
+        if output:
+            parts.append(self.output(**output))
+        if options:
+            parts.append(self.options(**options))
+        return "\n".join(parts)
+
 
 class ModelManipulationSqlGenerator(BaseSqlGenerator):
     """Sql generator for manipulating a model entity.
     Model name is the full model path of project_id.dataset_id.model_id."""
diff --git a/docs/templates/toc.yml b/docs/templates/toc.yml
index b680a5fc1a..d387a513a1 100644
--- a/docs/templates/toc.yml
+++ b/docs/templates/toc.yml
@@ -91,6 +91,8 @@
         uid: bigframes.ml.imported.ONNXModel
       - name: TensorFlowModel
         uid: bigframes.ml.imported.TensorFlowModel
+      - name: XGBoostModel
+        uid: bigframes.ml.imported.XGBoostModel
       name: imported
     - items:
       - name: Overview
diff --git a/tests/system/small/ml/conftest.py b/tests/system/small/ml/conftest.py
index e3180d2892..422ea6f1f4 100644
--- a/tests/system/small/ml/conftest.py
+++ b/tests/system/small/ml/conftest.py
@@ -190,11 +190,29 @@ def onnx_iris_pandas_df():
     )
 
 
+@pytest.fixture(scope="session")
+def xgboost_iris_pandas_df():
+    """Data matching the iris dataset."""
+    return pd.DataFrame(
+        {
+            "sepal_length": [4.9, 5.1, 34.7],
+            "sepal_width": [3.0, 5.1, 24.7],
+            "petal_length": [1.4, 1.5, 13.3],
+            "petal_width": [0.4, 0.2, 18.3],
+        }
+    )
+
+
 @pytest.fixture(scope="session")
 def onnx_iris_df(session, onnx_iris_pandas_df):
     return session.read_pandas(onnx_iris_pandas_df)
 
 
+@pytest.fixture(scope="session")
+def xgboost_iris_df(session, xgboost_iris_pandas_df):
+    return session.read_pandas(xgboost_iris_pandas_df)
+
+
 @pytest.fixture(scope="session")
 def llm_text_df(session, llm_text_pandas_df):
     return session.read_pandas(llm_text_pandas_df)
@@ -322,6 +340,11 @@ def imported_onnx_model_path() -> str:
     return "gs://cloud-samples-data/bigquery/ml/onnx/pipeline_rf.onnx"
 
 
+@pytest.fixture(scope="session")
+def imported_xgboost_array_model_path() -> str:
+    return "gs://bigframes-dev-testing/xgboost-testdata/model.bst"
+
+
 @pytest.fixture(scope="session")
 def imported_tensorflow_model(
     session, imported_tensorflow_model_path
@@ -346,3 +369,20 @@ def imported_onnx_model(session, imported_onnx_model_path) -> imported.ONNXModel
     return imported.ONNXModel(
         session=session,
         model_path=imported_onnx_model_path,
     )
+
+
+@pytest.fixture(scope="session")
+def imported_xgboost_model(
+    session, imported_xgboost_array_model_path
+) -> imported.XGBoostModel:
+    return imported.XGBoostModel(
+        session=session,
+        input={
+            "petal_length": "float64",
+            "petal_width": "float64",
+            "sepal_length": "float64",
+            "sepal_width": "float64",
+        },
+        output={"predicted_label": "float64"},
+        model_path=imported_xgboost_array_model_path,
+    )
diff --git a/tests/system/small/ml/test_imported.py b/tests/system/small/ml/test_imported.py
index 8ffd9924e9..2b8d04c3ae 100644
--- a/tests/system/small/ml/test_imported.py
+++ b/tests/system/small/ml/test_imported.py
@@ -70,7 +70,7 @@ def test_onnx_create_model(imported_onnx_model):
 
 
 def test_onnx_create_model_default_session(imported_onnx_model_path):
-    model = imported.TensorFlowModel(model_path=imported_onnx_model_path)
+    model = imported.ONNXModel(model_path=imported_onnx_model_path)
     assert model is not None
 
 
@@ -100,3 +100,43 @@ def test_onnx_model_to_gbq(imported_onnx_model: imported.ONNXModel, dataset_id:
     imported_onnx_model.to_gbq(f"{dataset_id}.test_onnx_model", replace=True)
     with pytest.raises(google.api_core.exceptions.Conflict):
         imported_onnx_model.to_gbq(f"{dataset_id}.test_onnx_model")
+
+
+def test_xgboost_create_model(imported_xgboost_model):
+    # Model creation doesn't return error
+    assert imported_xgboost_model is not None
+
+
+def test_xgboost_create_model_default_session(imported_xgboost_array_model_path):
+    model = imported.XGBoostModel(model_path=imported_xgboost_array_model_path)
+    assert model is not None
+
+
+def test_xgboost_model_predict(imported_xgboost_model, xgboost_iris_df):
+    predictions = imported_xgboost_model.predict(xgboost_iris_df).to_pandas()
+    assert predictions.shape == (3, 5)
+    result = predictions[["predicted_label"]]
+    value1 = np.array([0.00362173, 0.01580198, 0.98057634])
+    value2 = np.array([0.00349651, 0.00999565, 0.98650789])
+    value3 = np.array([0.00561748, 0.0108124, 0.98357016])
+    expected = pd.DataFrame(
+        {
+            "predicted_label": [value1, value2, value3],
+        },
+        index=pd.Index([0, 1, 2], dtype="Int64"),
+    )
+    pd.testing.assert_frame_equal(
+        result,
+        expected,
+        check_exact=False,
+        check_dtype=False,
+        atol=0.1,
+    )
+
+
+def test_xgboost_model_to_gbq(
+    imported_xgboost_model: imported.XGBoostModel, dataset_id: str
+):
+    imported_xgboost_model.to_gbq(f"{dataset_id}.test_xgboost_model", replace=True)
+    with pytest.raises(google.api_core.exceptions.Conflict):
+        imported_xgboost_model.to_gbq(f"{dataset_id}.test_xgboost_model")
diff --git a/tests/unit/ml/test_sql.py b/tests/unit/ml/test_sql.py
index 37cc33d33e..de80dad710 100644
--- a/tests/unit/ml/test_sql.py
+++ b/tests/unit/ml/test_sql.py
@@ -234,6 +234,30 @@ def test_create_imported_model_produces_correct_sql(
     )
 
 
+def test_create_xgboost_imported_model_produces_correct_sql(
+    model_creation_sql_generator: ml_sql.ModelCreationSqlGenerator,
+):
+    sql = model_creation_sql_generator.create_xgboost_imported_model(
+        model_ref=bigquery.ModelReference.from_string(
+            "test-proj._anonXYZ.create_xgboost_imported_model"
+        ),
+        input={"column1": "int64"},
+        output={"result": "array<float64>"},
+        options={"option_key1": "option_value1", "option_key2": 2},
+    )
+    assert (
+        sql
+        == """CREATE OR REPLACE MODEL `test-proj`.`_anonXYZ`.`create_xgboost_imported_model`
+INPUT(
+  column1 int64)
+OUTPUT(
+  result array<float64>)
+OPTIONS(
+  option_key1="option_value1",
+  option_key2=2)"""
+    )
+
+
 def test_alter_model_correct_sql(
     model_manipulation_sql_generator: ml_sql.ModelManipulationSqlGenerator,
 ):
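
For reviewers, a minimal usage sketch of the API this change adds (not part of the diff itself). The GCS path and dataset name are placeholders, and the column schema mirrors the iris test fixtures above; explicit input/output schemas are only needed when feature_names/feature_types are not embedded in the booster file.

    import pandas as pd

    import bigframes.pandas as bpd
    from bigframes.ml import imported

    # Field types must come from bigframes.ml.globals._SUPPORTED_DTYPES;
    # anything else is rejected before the CREATE MODEL statement is built.
    model = imported.XGBoostModel(
        input={
            "sepal_length": "float64",
            "sepal_width": "float64",
            "petal_length": "float64",
            "petal_width": "float64",
        },
        output={"predicted_label": "float64"},
        model_path="gs://your-bucket/xgboost/model.bst",  # placeholder GCS path
    )

    # The session-temporary BQML model is created lazily on the first predict() call.
    features = bpd.read_pandas(
        pd.DataFrame(
            {
                "sepal_length": [5.1],
                "sepal_width": [3.5],
                "petal_length": [1.4],
                "petal_width": [0.2],
            }
        )
    )
    predictions = model.predict(features).to_pandas()

    # Persist the model to a permanent BigQuery location (placeholder dataset).
    model.to_gbq("your_dataset.imported_xgboost", replace=True)

Note that, per the new _create_bqml_model logic, omitting both input and output falls back to the plain create_imported_model path, so the schema arguments are strictly optional when the booster file carries its own feature metadata.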