# --- bigframes/bigquery/__init__.py ---------------------------------------


def vector_search(
    base_table: str,
    column_to_search: str,
    query: "Union[dataframe.DataFrame, series.Series]",
    *,
    query_column_to_search: Optional[str] = None,
    top_k: Optional[int] = 10,
    distance_type: Literal["euclidean", "cosine"] = "euclidean",
    fraction_lists_to_search: Optional[float] = None,
    use_brute_force: bool = False,
) -> "dataframe.DataFrame":
    """
    Conduct vector search which searches embeddings to find semantically similar entities.

    **Examples:**


        >>> import bigframes.pandas as bpd
        >>> import bigframes.bigquery as bbq
        >>> bpd.options.display.progress_bar = None

    DataFrame embeddings for which to find nearest neighbors. The ``ARRAY`` column
    is used as the search query:

        >>> search_query = bpd.DataFrame({"query_id": ["dog", "cat"],
        ...                               "embedding": [[1.0, 2.0], [3.0, 5.2]]})
        >>> bbq.vector_search(
        ...     base_table="bigframes-dev.bigframes_tests_sys.base_table",
        ...     column_to_search="my_embedding",
        ...     query=search_query,
        ...     top_k=2)
          query_id  embedding  id my_embedding  distance
        1      cat   [3. 5.2]   5     [5. 5.4]  2.009975
        0      dog    [1. 2.]   1      [1. 2.]       0.0
        0      dog    [1. 2.]   4     [1. 3.2]       1.2
        1      cat   [3. 5.2]   2      [2. 4.]   1.56205
        <BLANKLINE>
        [4 rows x 5 columns]

    Series embeddings for which to find nearest neighbors:

        >>> search_query = bpd.Series([[1.0, 2.0], [3.0, 5.2]],
        ...                           index=["dog", "cat"],
        ...                           name="embedding")
        >>> bbq.vector_search(
        ...     base_table="bigframes-dev.bigframes_tests_sys.base_table",
        ...     column_to_search="my_embedding",
        ...     query=search_query,
        ...     top_k=2)
             embedding  id my_embedding  distance
        dog    [1. 2.]   1      [1. 2.]       0.0
        cat   [3. 5.2]   5     [5. 5.4]  2.009975
        dog    [1. 2.]   4     [1. 3.2]       1.2
        cat   [3. 5.2]   2      [2. 4.]   1.56205
        <BLANKLINE>
        [4 rows x 4 columns]

    You can specify the name of the column in the query DataFrame embeddings and distance type.
    If you specify ``query_column_to_search``, it will use the provided column which contains
    the embeddings for which to find nearest neighbors. Otherwise, it uses the
    ``column_to_search`` value.

        >>> search_query = bpd.DataFrame({"query_id": ["dog", "cat"],
        ...                               "embedding": [[1.0, 2.0], [3.0, 5.2]],
        ...                               "another_embedding": [[0.7, 2.2], [3.3, 5.2]]})
        >>> bbq.vector_search(
        ...     base_table="bigframes-dev.bigframes_tests_sys.base_table",
        ...     column_to_search="my_embedding",
        ...     query=search_query,
        ...     distance_type="cosine",
        ...     query_column_to_search="another_embedding",
        ...     top_k=2)
          query_id  embedding another_embedding  id my_embedding  distance
        1      cat   [3. 5.2]         [3.3 5.2]   2      [2. 4.]  0.005181
        0      dog    [1. 2.]         [0.7 2.2]   4     [1. 3.2]  0.000013
        1      cat   [3. 5.2]         [3.3 5.2]   1      [1. 2.]  0.005181
        0      dog    [1. 2.]         [0.7 2.2]   3    [1.5 7. ]  0.004697
        <BLANKLINE>
        [4 rows x 6 columns]

    Args:
        base_table (str):
            The table to search for nearest neighbor embeddings.
        column_to_search (str):
            The name of the base table column to search for nearest neighbor embeddings.
            The column must have a type of ``ARRAY``. All elements in the array must be non-NULL.
        query (bigframes.dataframe.DataFrame | bigframes.series.Series):
            A Series or DataFrame that provides the embeddings for which to find nearest neighbors.
        query_column_to_search (str):
            Specifies the name of the column in the query that contains the embeddings for which to
            find nearest neighbors. The column must have a type of ``ARRAY``. All elements in
            the array must be non-NULL and all values in the column must have the same array dimensions
            as the values in the ``column_to_search`` column. Can only be set when query is a DataFrame.
        top_k (int, default 10):
            Specifies the number of nearest neighbors to return. Default to 10.
        distance_type (str, default "euclidean"):
            Specifies the type of metric to use to compute the distance between two vectors.
            Possible values are "euclidean" and "cosine". Default to "euclidean".
        fraction_lists_to_search (float, range in [0.0, 1.0]):
            Specifies the percentage of lists to search. Specifying a higher percentage leads to
            higher recall and slower performance, and the converse is true when specifying a lower
            percentage. It is only used when a vector index is also used. You can only specify
            ``fraction_lists_to_search`` when ``use_brute_force`` is set to False.
        use_brute_force (bool, default False):
            Determines whether to use brute force search by skipping the vector index if one is available.
            Default to False.

    Returns:
        bigframes.dataframe.DataFrame: A DataFrame containing vector search result.

    Raises:
        ValueError:
            If ``fraction_lists_to_search`` is given together with
            ``use_brute_force=True``, or if ``query_column_to_search`` is given
            while ``query`` is a Series.
        NotImplementedError:
            If either ``fraction_lists_to_search`` or ``use_brute_force`` is set;
            these options are not wired through to the generated SQL yet.
    """
    # The two options are mutually exclusive. Compare against None explicitly so
    # that an explicit 0.0 fraction is still recognised as "specified".
    if fraction_lists_to_search is not None and use_brute_force is True:
        raise ValueError(
            "You can't specify fraction_lists_to_search when use_brute_force is set to True."
        )
    if (
        isinstance(query, bigframes.series.Series)
        and query_column_to_search is not None
    ):
        raise ValueError(
            "You can't specify query_column_to_search when query is a Series."
        )
    # TODO(ashleyxu): Support options in vector search. b/344019989
    if fraction_lists_to_search is not None or use_brute_force is True:
        raise NotImplementedError(
            f"fraction_lists_to_search and use_brute_force is not supported. {constants.FEEDBACK_LINK}"
        )
    options = {
        "base_table": base_table,
        "column_to_search": column_to_search,
        "query_column_to_search": query_column_to_search,
        "distance_type": distance_type,
        "top_k": top_k,
        "fraction_lists_to_search": fraction_lists_to_search,
        "use_brute_force": use_brute_force,
    }

    # Normalise a Series query into a DataFrame so both inputs share one code path.
    (query,) = utils.convert_to_dataframe(query)
    sql_string, index_col_ids, index_labels = query._to_sql_query(include_index=True)

    sql = bigframes.core.sql.create_vector_search_sql(
        sql_string=sql_string, options=options  # type: ignore
    )
    if index_col_ids is not None:
        df = query._session.read_gbq(sql, index_col=index_col_ids)
    else:
        df = query._session.read_gbq(sql)
    # Re-apply the query's own index labels to the result read back from SQL.
    df.index.names = index_labels

    return df


# --- bigframes/core/sql.py -------------------------------------------------


def create_vector_search_sql(
    sql_string: str,
    options: Mapping[str, Union[str, int, bool, float]] = {},
) -> str:
    """Encode the VECTOR SEARCH statement for BigQuery Vector Search.

    Args:
        sql_string:
            SQL text of the query side; it is embedded as a parenthesised
            subquery argument of ``VECTOR_SEARCH``.
        options:
            Search settings. ``base_table``, ``column_to_search``,
            ``distance_type`` and ``top_k`` are read by subscript and are
            therefore required. ``query_column_to_search`` is optional and,
            when present and not None, is emitted as an extra positional
            argument. Any other keys are ignored by this function. The
            mapping is only read, never mutated.

    Returns:
        The ``SELECT query.*, base.*, distance FROM VECTOR_SEARCH(...)``
        statement as a string.
    """

    base_table = options["base_table"]
    column_to_search = options["column_to_search"]
    distance_type = options["distance_type"]
    top_k = options["top_k"]
    query_column_to_search = options.get("query_column_to_search", None)

    # NOTE(review): base_table is interpolated between backticks as-is, while
    # every other value goes through simple_literal.
    if query_column_to_search is not None:
        query_str = f"""
    SELECT
        query.*,
        base.*,
        distance,
    FROM VECTOR_SEARCH(
        TABLE `{base_table}`,
        {simple_literal(column_to_search)},
        ({sql_string}),
        {simple_literal(query_column_to_search)},
        distance_type => {simple_literal(distance_type)},
        top_k => {simple_literal(top_k)}
    )
    """
    else:
        query_str = f"""
    SELECT
        query.*,
        base.*,
        distance,
    FROM VECTOR_SEARCH(
        TABLE `{base_table}`,
        {simple_literal(column_to_search)},
        ({sql_string}),
        distance_type => {simple_literal(distance_type)},
        top_k => {simple_literal(top_k)}
    )
    """
    return query_str
import numpy as np
import pandas as pd

import bigframes.bigquery as bbq
import bigframes.pandas as bpd

# Table that every system test in this module searches against.
_BASE_TABLE = "bigframes-dev.bigframes_tests_sys.base_table"


def _vectors(*rows):
    """Return a list holding one freshly built ``np.array`` per given row."""
    return [np.array(row) for row in rows]


def _assert_frames_close(actual, expected):
    """Compare frames while ignoring dtypes and allowing 10% relative error."""
    pd.testing.assert_frame_equal(actual, expected, check_dtype=False, rtol=0.1)


def test_vector_search_basic_params_with_df():
    """DataFrame query, default distance type, two neighbours per row."""
    queries = bpd.DataFrame(
        {
            "query_id": ["dog", "cat"],
            "embedding": [[1.0, 2.0], [3.0, 5.2]],
        }
    )
    actual = bbq.vector_search(
        base_table=_BASE_TABLE,
        column_to_search="my_embedding",
        query=queries,
        top_k=2,
    ).to_pandas()  # type:ignore

    columns = {
        "query_id": ["cat", "dog", "dog", "cat"],
        "embedding": _vectors([3.0, 5.2], [1.0, 2.0], [1.0, 2.0], [3.0, 5.2]),
        "id": [5, 1, 4, 2],
        "my_embedding": _vectors([5.0, 5.4], [1.0, 2.0], [1.0, 3.2], [2.0, 4.0]),
        "distance": [2.009975, 0.0, 1.2, 1.56205],
    }
    expected = pd.DataFrame(columns, index=pd.Index([1, 0, 0, 1], dtype="Int64"))

    _assert_frames_close(actual, expected)


def test_vector_search_different_params_with_query():
    """Unnamed Series query searched with the cosine distance type."""
    queries = bpd.Series([[1.0, 2.0], [3.0, 5.2]])
    actual = bbq.vector_search(
        base_table=_BASE_TABLE,
        column_to_search="my_embedding",
        query=queries,
        distance_type="cosine",
        top_k=2,
    ).to_pandas()  # type:ignore

    columns = {
        "0": _vectors([1.0, 2.0], [1.0, 2.0], [3.0, 5.2], [3.0, 5.2]),
        "id": [2, 1, 1, 2],
        "my_embedding": _vectors([2.0, 4.0], [1.0, 2.0], [1.0, 2.0], [2.0, 4.0]),
        "distance": [0.0, 0.0, 0.001777, 0.001777],
    }
    expected = pd.DataFrame(columns, index=pd.Index([0, 0, 1, 1], dtype="Int64"))

    _assert_frames_close(actual, expected)


def test_vector_search_df_with_query_column_to_search():
    """DataFrame query whose embeddings come from an explicitly named column."""
    queries = bpd.DataFrame(
        {
            "query_id": ["dog", "cat"],
            "embedding": [[1.0, 2.0], [3.0, 5.2]],
            "another_embedding": [[1.0, 2.5], [3.3, 5.2]],
        }
    )
    actual = bbq.vector_search(
        base_table=_BASE_TABLE,
        column_to_search="my_embedding",
        query=queries,
        query_column_to_search="another_embedding",
        top_k=2,
    ).to_pandas()  # type:ignore

    columns = {
        "query_id": ["dog", "dog", "cat", "cat"],
        "embedding": _vectors([1.0, 2.0], [1.0, 2.0], [3.0, 5.2], [3.0, 5.2]),
        "another_embedding": _vectors(
            [1.0, 2.5], [1.0, 2.5], [3.3, 5.2], [3.3, 5.2]
        ),
        "id": [1, 4, 2, 5],
        "my_embedding": _vectors([1.0, 2.0], [1.0, 3.2], [2.0, 4.0], [5.0, 5.4]),
        "distance": [0.5, 0.7, 1.769181, 1.711724],
    }
    expected = pd.DataFrame(columns, index=pd.Index([0, 0, 1, 1], dtype="Int64"))

    _assert_frames_close(actual, expected)
from bigframes.core import sql

# Query-side SQL shared by both tests; it is embedded verbatim as a subquery.
_QUERY_SQL = "SELECT embedding FROM my_embeddings_table WHERE id = 1"


def _search_options(**extra):
    """Return the option mapping common to both tests, plus any extras."""
    options = dict(
        base_table="my_base_table",
        column_to_search="my_embedding_column",
        distance_type="COSINE",
        top_k=10,
        use_brute_force=False,
    )
    options.update(extra)
    return options


def test_create_vector_search_sql_simple():
    """Without a query column, only the base column is passed positionally."""
    wanted = f"""
    SELECT
        query.*,
        base.*,
        distance,
    FROM VECTOR_SEARCH(
        TABLE `my_base_table`,
        'my_embedding_column',
        ({_QUERY_SQL}),
        distance_type => 'COSINE',
        top_k => 10
    )
    """

    generated = sql.create_vector_search_sql(
        _QUERY_SQL, _search_options()  # type:ignore
    )

    assert generated == wanted


def test_create_vector_search_sql_query_column_to_search():
    """A query column name is emitted right after the query subquery."""
    wanted = f"""
    SELECT
        query.*,
        base.*,
        distance,
    FROM VECTOR_SEARCH(
        TABLE `my_base_table`,
        'my_embedding_column',
        ({_QUERY_SQL}),
        'new_embedding_column',
        distance_type => 'COSINE',
        top_k => 10
    )
    """

    generated = sql.create_vector_search_sql(
        _QUERY_SQL,
        _search_options(query_column_to_search="new_embedding_column"),  # type:ignore
    )

    assert generated == wanted