googleapis
diff --git a/‎bigframes/bigquery/__init__.py
Lines changed: 142 additions & 8 deletions b/‎bigframes/bigquery/__init__.py
Lines changed: 142 additions & 8 deletions
diff --git a/‎bigframes/bigquery/utils.py
Lines changed: 85 additions & 0 deletions b/‎bigframes/bigquery/utils.py
Lines changed: 85 additions & 0 deletions
@@ -20,19 +20,17 @@
 
 from __future__ import annotations
 
-import typing
+from typing import Literal, Optional, Union
 
+import bigframes.bigquery.utils as utils
 import bigframes.constants as constants
 import bigframes.core.groupby as groupby
 import bigframes.operations as ops
 import bigframes.operations.aggregations as agg_ops
+import bigframes.pandas as bpd
 
-if typing.TYPE_CHECKING:
-    import bigframes.dataframe as dataframe
-    import bigframes.series as series
 
-
-def array_length(series: series.Series) -> series.Series:
+def array_length(series: bpd.Series) -> bpd.Series:
     """Compute the length of each array element in the Series.
 
     **Examples:**
@@ -69,7 +67,7 @@ def array_length(series: series.Series) -> series.Series:
 
 def array_agg(
     obj: groupby.SeriesGroupBy | groupby.DataFrameGroupBy,
-) -> series.Series | dataframe.DataFrame:
+) -> bpd.Series | bpd.DataFrame:
     """Group data and create arrays from selected columns, omitting NULLs to avoid
     BigQuery errors (NULLs not allowed in arrays).
 
@@ -120,7 +118,7 @@ def array_agg(
         )
 
 
-def array_to_string(series: series.Series, delimiter: str) -> series.Series:
+def array_to_string(series: bpd.Series, delimiter: str) -> bpd.Series:
     """Converts array elements within a Series into delimited strings.
 
     **Examples:**
@@ -148,3 +146,139 @@ def array_to_string(series: series.Series, delimiter: str) -> series.Series:
 
     """
     return series._apply_unary_op(ops.ArrayToStringOp(delimiter=delimiter))
+
+
+def vector_search(
+    base_table: str,
+    column_to_search: str,
+    query: Union[bpd.DataFrame, bpd.Series],
+    *,
+    query_column_to_search: Optional[str] = None,
+    top_k: Optional[int] = 10,
+    distance_type: Literal["euclidean", "cosine"] = "euclidean",
+    fraction_lists_to_search: Optional[float] = None,
+    use_brute_force: bool = False,
+) -> bpd.DataFrame:
+    """
+    Conduct vector search which searches embeddings to find semantically similar entities.
+
+    **Examples:**
+
+        >>> import bigframes.pandas as bpd
+        >>> import bigframes.bigquery as bbq
+        >>> bpd.options.display.progress_bar = None
+
+    DataFrame embeddings for which to find nearest neighbors:
+
+        >>> search_query = bpd.DataFrame({"query_id": ["dog", "cat"],
+        ...                               "embedding": [[1.0, 2.0], [3.0, 5.2]]})
+        >>> bbq.vector_search(
+        ...             base_table="bigframes-dev.bigframes_tests_sys.base_table",
+        ...             column_to_search="my_embedding",
+        ...             query=search_query,
+        ...             top_k=2)
+            query_id  embedding  id my_embedding  distance
+        1      cat  [3.  5.2]   5    [5.  5.4]  2.009975
+        0      dog    [1. 2.]   1      [1. 2.]       0.0
+        0      dog    [1. 2.]   4    [1.  3.2]       1.2
+        1      cat  [3.  5.2]   2      [2. 4.]   1.56205
+        <BLANKLINE>
+        [4 rows x 5 columns]
+
+    Series embeddings for which to find nearest neighbors:
+
+        >>> search_query = bpd.Series([[1.0, 2.0], [3.0, 5.2]],
+        ...                            index=["dog", "cat"],
+        ...                            name="embedding")
+        >>> bbq.vector_search(
+        ...             base_table="bigframes-dev.bigframes_tests_sys.base_table",
+        ...             column_to_search="my_embedding",
+        ...             query=search_query,
+        ...             top_k=2)
+             embedding  id my_embedding  distance
+        dog    [1. 2.]   1      [1. 2.]       0.0
+        cat  [3.  5.2]   5    [5.  5.4]  2.009975
+        dog    [1. 2.]   4    [1.  3.2]       1.2
+        cat  [3.  5.2]   2      [2. 4.]   1.56205
+        <BLANKLINE>
+        [4 rows x 4 columns]
+
+    You can specify the name of the column in the query DataFrame embeddings and distance type:
+
+        >>> search_query = bpd.DataFrame({"query_id": ["dog", "cat"],
+        ...                               "embedding": [[1.0, 2.0], [3.0, 5.2]],
+        ...                               "another_embedding": [[0.7, 2.2], [3.3, 5.2]]})
+        >>> bbq.vector_search(
+        ...             base_table="bigframes-dev.bigframes_tests_sys.base_table",
+        ...             column_to_search="my_embedding",
+        ...             query=search_query,
+        ...             distance_type="cosine",
+        ...             query_column_to_search="another_embedding",
+        ...             top_k=2)
+          query_id  embedding another_embedding  id my_embedding  distance
+        1      cat  [3.  5.2]         [3.3 5.2]   2      [2. 4.]  0.005181
+        0      dog    [1. 2.]         [0.7 2.2]   4    [1.  3.2]  0.000013
+        1      cat  [3.  5.2]         [3.3 5.2]   1      [1. 2.]  0.005181
+        0      dog    [1. 2.]         [0.7 2.2]   3    [1.5 7. ]  0.004697
+        <BLANKLINE>
+        [4 rows x 6 columns]
+
+    Args:
+        base_table (str):
+            The table to search for nearest neighbor embeddings.
+        column_to_search (str):
+            The name of the base table column to search for nearest neighbor embeddings.
+            The column must have a type of ``ARRAY<FLOAT64>``. All elements in the array must be non-NULL.
+        query (bigframes.dataframe.DataFrame | bigframes.series.Series):
+            A Series or DataFrame that provides the embeddings for which to find nearest neighbors.
+        query_column_to_search (str):
+            Specifies the name of the column in the query that contains the embeddings for which to
+            find nearest neighbors. The column must have a type of ``ARRAY<FLOAT64>``. All elements in
+            the array must be non-NULL and all values in the column must have the same array dimensions
+            as the values in the ``column_to_search`` column. Can only be set when query is a DataFrame.
+        top_k (int, default 10):
+            Specifies the number of nearest neighbors to return. Default to 10.
+        distance_type (str, default "euclidean"):
+            Specifies the type of metric to use to compute the distance between two vectors.
+            Possible values are "euclidean" and "cosine". Default to "euclidean".
+        fraction_lists_to_search (float, range in [0.0, 1.0]):
+            Specifies the percentage of lists to search. Specifying a higher percentage leads to
+            higher recall and slower performance, and the converse is true when specifying a lower
+            percentage. It is only used when a vector index is also used. You can only specify
+            ``fraction_lists_to_search`` when ``use_brute_force`` is set to False.
+        use_brute_force (bool, default False):
+            Determines whether to use brute force search by skipping the vector index if one is available.
+            Default to False.
+
+    Returns:
+        bigframes.dataframe.DataFrame: A DataFrame containing vector search result.
+
+    Raises:
+        ValueError: If ``fraction_lists_to_search`` is set while ``use_brute_force`` is True,
+            or if ``query_column_to_search`` is set while ``query`` is a Series.
+        NotImplementedError: If ``fraction_lists_to_search`` or ``use_brute_force`` is set.
+    """
+    # fraction_lists_to_search tunes the vector index, which brute force skips entirely.
+    if fraction_lists_to_search is not None and use_brute_force is True:
+        raise ValueError(
+            "You can't specify fraction_lists_to_search when use_brute_force is set to True."
+        )
+    if isinstance(query, bpd.Series) and query_column_to_search is not None:
+        raise ValueError(
+            "You can't specify query_column_to_search when query is a Series."
+        )
+    ## (TODO: ashleyxu. Support options in vector search.)
+    if fraction_lists_to_search is not None or use_brute_force is True:
+        raise NotImplementedError(
+            f"fraction_lists_to_search and use_brute_force is not supported. {constants.FEEDBACK_LINK}"
+        )
+    options = {
+        "base_table": base_table,
+        "column_to_search": column_to_search,
+        "query_column_to_search": query_column_to_search,
+        "distance_type": distance_type,
+        "top_k": top_k,
+        "fraction_lists_to_search": fraction_lists_to_search,
+        "use_brute_force": use_brute_force,
+    }
+    return utils.apply_sql(query, options)  # type: ignore
@@ -0,0 +1,85 @@
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Iterable, Mapping, Union
+
+import bigframes.ml.utils as utils
+import bigframes.pandas as bpd
+
+
+def create_vector_search_sql(
+    sql_string: str,
+    options: Mapping[str, Union[str, int, float, Iterable[str]]] = {},
+) -> str:
+    """Render the BigQuery ``VECTOR_SEARCH`` statement for the given query SQL.
+
+    Args:
+        sql_string (str):
+            SQL text that is embedded, in parentheses, as the query argument.
+        options (Mapping):
+            ``base_table``, ``column_to_search``, ``distance_type`` and ``top_k`` are
+            read with subscript access and are therefore required.
+            ``query_column_to_search`` is optional; when absent or None it is left out.
+
+    Returns:
+        str: The statement selecting ``query.*``, ``base.*`` and ``distance``.
+    """
+    table = options["base_table"]
+    search_column = options["column_to_search"]
+    metric = options["distance_type"]
+    neighbor_count = options["top_k"]
+    query_column = options.get("query_column_to_search")
+
+    # One entry per argument line of the VECTOR_SEARCH(...) call, in call order.
+    arguments = [f"TABLE `{table}`", f'"{search_column}"', f"({sql_string})"]
+    if query_column is not None:
+        arguments.append(f'"{query_column}"')
+    arguments += [f'distance_type => "{metric}"', f"top_k => {neighbor_count}"]
+
+    # The first and last entries produce the leading newline and trailing indent.
+    statement_lines = [
+        "",
+        "    SELECT",
+        "        query.*,",
+        "        base.*,",
+        "        distance,",
+        "    FROM VECTOR_SEARCH(",
+        ",\n".join(f"        {argument}" for argument in arguments),
+        "    )",
+        "    ",
+    ]
+    return "\n".join(statement_lines)
+
+
+def apply_sql(
+    query: Union[bpd.DataFrame, bpd.Series],
+    options: Mapping[str, Union[str, int, float, Iterable[str]]] = {},
+) -> bpd.DataFrame:
+    """Wrap ``query`` in a vector search statement and read the result, keeping the index.
+
+    Args:
+        query (bigframes.dataframe.DataFrame | bigframes.series.Series):
+            The data to be wrapped; it is converted to a DataFrame first.
+        options (Mapping):
+            Search settings forwarded unchanged to ``create_vector_search_sql``.
+
+    Returns:
+        bigframes.dataframe.DataFrame: The statement's result, with the index names of ``query``.
+    """
+    (frame,) = utils.convert_to_dataframe(query)
+    sql_string, index_col_ids, index_labels = frame._to_sql_query(include_index=True)
+    search_sql = create_vector_search_sql(sql_string=sql_string, options=options)
+    # index_col is only passed when the generated SQL reports index columns.
+    read_kwargs = {} if index_col_ids is None else {"index_col": index_col_ids}
+    result = frame._session.read_gbq(search_sql, **read_kwargs)
+    result.index.names = index_labels
+    return result