|
21 | 21 | from __future__ import annotations
|
22 | 22 |
|
23 | 23 | import typing
|
| 24 | +from typing import Literal, Optional, Union |
24 | 25 |
|
25 | 26 | import bigframes.constants as constants
|
26 | 27 | import bigframes.core.groupby as groupby
|
| 28 | +import bigframes.core.sql |
| 29 | +import bigframes.ml.utils as utils |
27 | 30 | import bigframes.operations as ops
|
28 | 31 | import bigframes.operations.aggregations as agg_ops
|
| 32 | +import bigframes.series |
29 | 33 |
|
30 | 34 | if typing.TYPE_CHECKING:
|
31 | 35 | import bigframes.dataframe as dataframe
|
@@ -148,3 +152,153 @@ def array_to_string(series: series.Series, delimiter: str) -> series.Series:
|
148 | 152 |
|
149 | 153 | """
|
150 | 154 | return series._apply_unary_op(ops.ArrayToStringOp(delimiter=delimiter))
|
| 155 | + |
| 156 | + |
def vector_search(
    base_table: str,
    column_to_search: str,
    query: Union[dataframe.DataFrame, series.Series],
    *,
    query_column_to_search: Optional[str] = None,
    top_k: Optional[int] = 10,
    distance_type: Literal["euclidean", "cosine"] = "euclidean",
    fraction_lists_to_search: Optional[float] = None,
    use_brute_force: bool = False,
) -> dataframe.DataFrame:
    """
    Conduct vector search which searches embeddings to find semantically similar entities.

    **Examples:**


        >>> import bigframes.pandas as bpd
        >>> import bigframes.bigquery as bbq
        >>> bpd.options.display.progress_bar = None

    DataFrame embeddings for which to find nearest neighbors. The ``ARRAY<FLOAT64>`` column
    is used as the search query:

        >>> search_query = bpd.DataFrame({"query_id": ["dog", "cat"],
        ...                               "embedding": [[1.0, 2.0], [3.0, 5.2]]})
        >>> bbq.vector_search(
        ...     base_table="bigframes-dev.bigframes_tests_sys.base_table",
        ...     column_to_search="my_embedding",
        ...     query=search_query,
        ...     top_k=2)
        query_id embedding id my_embedding distance
        1 cat [3. 5.2] 5 [5. 5.4] 2.009975
        0 dog [1. 2.] 1 [1. 2.] 0.0
        0 dog [1. 2.] 4 [1. 3.2] 1.2
        1 cat [3. 5.2] 2 [2. 4.] 1.56205
        <BLANKLINE>
        [4 rows x 5 columns]

    Series embeddings for which to find nearest neighbors:

        >>> search_query = bpd.Series([[1.0, 2.0], [3.0, 5.2]],
        ...                            index=["dog", "cat"],
        ...                            name="embedding")
        >>> bbq.vector_search(
        ...     base_table="bigframes-dev.bigframes_tests_sys.base_table",
        ...     column_to_search="my_embedding",
        ...     query=search_query,
        ...     top_k=2)
        embedding id my_embedding distance
        dog [1. 2.] 1 [1. 2.] 0.0
        cat [3. 5.2] 5 [5. 5.4] 2.009975
        dog [1. 2.] 4 [1. 3.2] 1.2
        cat [3. 5.2] 2 [2. 4.] 1.56205
        <BLANKLINE>
        [4 rows x 4 columns]

    You can specify the name of the column in the query DataFrame embeddings and distance type.
    If you specify ``query_column_to_search``, it will use the provided column which contains
    the embeddings for which to find nearest neighbors. Otherwise, it uses the
    ``column_to_search`` value.

        >>> search_query = bpd.DataFrame({"query_id": ["dog", "cat"],
        ...                               "embedding": [[1.0, 2.0], [3.0, 5.2]],
        ...                               "another_embedding": [[0.7, 2.2], [3.3, 5.2]]})
        >>> bbq.vector_search(
        ...     base_table="bigframes-dev.bigframes_tests_sys.base_table",
        ...     column_to_search="my_embedding",
        ...     query=search_query,
        ...     distance_type="cosine",
        ...     query_column_to_search="another_embedding",
        ...     top_k=2)
        query_id embedding another_embedding id my_embedding distance
        1 cat [3. 5.2] [3.3 5.2] 2 [2. 4.] 0.005181
        0 dog [1. 2.] [0.7 2.2] 4 [1. 3.2] 0.000013
        1 cat [3. 5.2] [3.3 5.2] 1 [1. 2.] 0.005181
        0 dog [1. 2.] [0.7 2.2] 3 [1.5 7. ] 0.004697
        <BLANKLINE>
        [4 rows x 6 columns]

    Args:
        base_table (str):
            The table to search for nearest neighbor embeddings.
        column_to_search (str):
            The name of the base table column to search for nearest neighbor embeddings.
            The column must have a type of ``ARRAY<FLOAT64>``. All elements in the array must be non-NULL.
        query (bigframes.dataframe.DataFrame | bigframes.series.Series):
            A Series or DataFrame that provides the embeddings for which to find nearest neighbors.
        query_column_to_search (str):
            Specifies the name of the column in the query that contains the embeddings for which to
            find nearest neighbors. The column must have a type of ``ARRAY<FLOAT64>``. All elements in
            the array must be non-NULL and all values in the column must have the same array dimensions
            as the values in the ``column_to_search`` column. Can only be set when query is a DataFrame.
        top_k (int, default 10):
            Specifies the number of nearest neighbors to return. Default to 10.
        distance_type (str, default "euclidean"):
            Specifies the type of metric to use to compute the distance between two vectors.
            Possible values are "euclidean" and "cosine". Default to "euclidean".
        fraction_lists_to_search (float, range in [0.0, 1.0]):
            Specifies the percentage of lists to search. Specifying a higher percentage leads to
            higher recall and slower performance, and the converse is true when specifying a lower
            percentage. It is only used when a vector index is also used. You can only specify
            ``fraction_lists_to_search`` when ``use_brute_force`` is set to False.
        use_brute_force (bool, default False):
            Determines whether to use brute force search by skipping the vector index if one is available.
            Default to False.

    Returns:
        bigframes.dataframe.DataFrame: A DataFrame containing vector search result.

    Raises:
        ValueError:
            If ``fraction_lists_to_search`` is specified while ``use_brute_force`` is True,
            or if ``query_column_to_search`` is specified while ``query`` is a Series.
        NotImplementedError:
            If ``fraction_lists_to_search`` is specified or ``use_brute_force`` is True;
            these options are not supported yet.
    """
    # The two options are mutually exclusive. Compare against None (not
    # truthiness) so that an explicit 0.0 still counts as "specified" and an
    # unset value never triggers the error.
    if fraction_lists_to_search is not None and use_brute_force is True:
        raise ValueError(
            "You can't specify fraction_lists_to_search when use_brute_force is set to True."
        )
    if (
        isinstance(query, bigframes.series.Series)
        and query_column_to_search is not None
    ):
        raise ValueError(
            "You can't specify query_column_to_search when query is a Series."
        )
    # TODO(ashleyxu): Support options in vector search. b/344019989
    if fraction_lists_to_search is not None or use_brute_force is True:
        raise NotImplementedError(
            f"fraction_lists_to_search and use_brute_force is not supported. {constants.FEEDBACK_LINK}"
        )
    options = {
        "base_table": base_table,
        "column_to_search": column_to_search,
        "query_column_to_search": query_column_to_search,
        "distance_type": distance_type,
        "top_k": top_k,
        "fraction_lists_to_search": fraction_lists_to_search,
        "use_brute_force": use_brute_force,
    }

    # convert_to_dataframe returns one item per input; unpack the single result
    # so both Series and DataFrame queries go through the same path below.
    (query,) = utils.convert_to_dataframe(query)
    sql_string, index_col_ids, index_labels = query._to_sql_query(include_index=True)

    sql = bigframes.core.sql.create_vector_search_sql(
        sql_string=sql_string, options=options  # type: ignore
    )
    if index_col_ids is not None:
        df = query._session.read_gbq(sql, index_col=index_col_ids)
    else:
        df = query._session.read_gbq(sql)
    # Put the query's original index labels back on the result.
    df.index.names = index_labels

    return df
0 commit comments