diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 7e82ba125c..599546284b 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -306,6 +306,11 @@ def empty(self) -> bool: def values(self) -> numpy.ndarray: return self.to_numpy() + @property + def bqclient(self) -> bigquery.Client: + """BigQuery REST API Client the DataFrame uses for operations.""" + return self._session.bqclient + @property + def _session(self) -> bigframes.Session: return self._get_block().expr.session diff --git a/notebooks/dataframes/dataframe.ipynb b/notebooks/dataframes/dataframe.ipynb index 15da075552..de9bb1d04f 100644 --- a/notebooks/dataframes/dataframe.ipynb +++ b/notebooks/dataframes/dataframe.ipynb @@ -1,5 +1,27 @@ { "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "eeec3428", + "metadata": {}, + "outputs": [], + "source": [ + "# Copyright 2023 Google LLC\n", + "#\n", + "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", + "# you may not use this file except in compliance with the License.\n", + "# You may obtain a copy of the License at\n", + "#\n", + "# https://www.apache.org/licenses/LICENSE-2.0\n", + "#\n", + "# Unless required by applicable law or agreed to in writing, software\n", + "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", + "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "# See the License for the specific language governing permissions and\n", + "# limitations under the License." 
+ ] + }, { "attachments": {}, "cell_type": "markdown", diff --git a/notebooks/dataframes/integrations.ipynb b/notebooks/dataframes/integrations.ipynb new file mode 100644 index 0000000000..735e18d94e --- /dev/null +++ b/notebooks/dataframes/integrations.ipynb @@ -0,0 +1,635 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [], + "source": [ + "# Copyright 2024 Google LLC\n", + "#\n", + "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", + "# you may not use this file except in compliance with the License.\n", + "# You may obtain a copy of the License at\n", + "#\n", + "# https://www.apache.org/licenses/LICENSE-2.0\n", + "#\n", + "# Unless required by applicable law or agreed to in writing, software\n", + "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", + "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "# See the License for the specific language governing permissions and\n", + "# limitations under the License." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Integrating with BigQuery DataFrames\n", + "\n", + "This notebook demonstrates operations for building applications that integrate with BigQuery DataFrames. Follow these samples to build an integration that accepts a BigQuery DataFrames object or returns one." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [], + "source": [ + "import bigframes.pandas as bpd\n", + "\n", + "# Sample data\n", + "df = bpd.DataFrame({\n", + " \"index\": [0, 1, 2, 3, 4],\n", + " \"int_col\": [1, 2, 3, 4, 5],\n", + " \"float_col\": [1.0, -0.5, 0.25, -0.125, 0.0625],\n", + " \"string_col\": [\"a\", \"b\", \"c\", \"d\", \"e\"],\n", + "}).set_index(\"index\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Accepting a BigQuery DataFrames (bigframes) DataFrame\n", + "\n", + "The recommended serialization format for a BigQuery DataFrames (bigframes) DataFrame is a BigQuery table. To write a DataFrame to a BigQuery table, use the `DataFrame.to_gbq()` method. With no `destination_table`, BigQuery DataFrames creates a table in the anonymous dataset corresponding to the BigQuery user & location and returns the corresponding table ID." + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "Query job 00b5c727-f2bf-4265-be22-d7d505619db7 is DONE. 0 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "'swast-scratch._63cfa399614a54153cc386c27d6c0c6fdb249f9e.bqdf20240327_43bbc4c64fb947f7b69db570a5641506'" + ] + }, + "execution_count": 37, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "table_id = df.to_gbq()\n", + "table_id" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Sharing the table with your application's backend\n", + "\n", + "Tables created in the user's anonymous dataset are only queryable by the user who created them. 
Many applications authenticate with a [service account](https://cloud.google.com/iam/docs/service-account-overview), which may be different from the end-user running BigQuery DataFrames (bigframes).\n", + "\n", + "Grant your application access to this table by granting your application's service account associated with the customer the `roles/bigquery.dataViewer` role on the [BigQuery table with an IAM policy](https://cloud.google.com/bigquery/docs/control-access-to-resources-iam#grant_access_to_a_table_or_view)." + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "Query job f9c39ac2-a428-45c9-bb3a-643fc62a1c5b is DONE. 0 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " index int_col float_col string_col\n", + "0 2 3 0.2500 c\n", + "1 4 5 0.0625 e\n", + "2 0 1 1.0000 a\n", + "3 1 2 -0.5000 b\n", + "4 3 4 -0.1250 d\n" + ] + } + ], + "source": [ + "# This sample assumes the client code knows which service account to share with.\n", + "your_service_account_email = \"your-service-account@bigframes-samples.iam.gserviceaccount.com\"\n", + "\n", + "\n", + "def df_to_gbq_plus_workload(df):\n", + " table_id = df.to_gbq()\n", + "\n", + " bqclient = df.bqclient\n", + " policy = bqclient.get_iam_policy(table_id)\n", + " binding = {\n", + " \"role\": \"roles/bigquery.dataViewer\",\n", + " \"members\": {f\"serviceAccount:{your_service_account_email}\"},\n", + " }\n", + " policy.bindings.append(binding)\n", + " bqclient.set_iam_policy(table_id, policy)\n", + "\n", + " # TODO(developer): Pass table_id to your application and start your workload.\n", + " example_workload(table_id)\n", + "\n", + "\n", + "def example_workload(table_id):\n", + " # For example, for one node workloads, use the client library to read the table\n", + " # as a pandas DataFrame.\n", + " from 
google.cloud import bigquery\n", + "\n", + " # This sample assumes this client is authenticated as the user\n", + " # your_service_account_email.\n", + " client = bigquery.Client()\n", + " pandas_df = client.list_rows(table_id).to_dataframe()\n", + " print(pandas_df)\n", + "\n", + "\n", + "df_to_gbq_plus_workload(df)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "Query job ad53c7f2-e3bd-4667-b60b-b700c24b7a81 is DONE. 0 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " index int_col float_col string_col\n", + "0 4 5 0.0625 e\n", + "1 0 1 1.0000 a\n", + "2 2 3 0.2500 c\n", + "3 3 4 -0.1250 d\n", + "4 1 2 -0.5000 b\n" + ] + } + ], + "source": [ + "# This sample assumes the client code doesn't know which service account to share with.\n", + "\n", + "\n", + "def df_to_gbq_plus_workload(df):\n", + " table_id = df.to_gbq()\n", + "\n", + " bqclient = df.bqclient\n", + " token = bqclient._credentials.token\n", + " project_id = bqclient.project\n", + "\n", + " share_table_and_start_workload(table_id, token, project_id)\n", + "\n", + "\n", + "def share_table_and_start_workload(table_id, token, project_id):\n", + " # This code runs in the backend for your application.\n", + " from google.cloud import bigquery\n", + " import google.oauth2.credentials\n", + "\n", + " # Note: these credentials don't have any way to be refreshed,\n", + " # so only use them long enough to share the table with the\n", + " # service account.\n", + " credentials = google.oauth2.credentials.Credentials(token)\n", + " bqclient = bigquery.Client(\n", + " project=project_id,\n", + " credentials=credentials,\n", + " )\n", + "\n", + " # This is assumed to only be available on the backend.\n", + " your_service_account_email = 
\"your-service-account@bigframes-samples.iam.gserviceaccount.com\"\n", + " policy = bqclient.get_iam_policy(table_id)\n", + " binding = {\n", + " \"role\": \"roles/bigquery.dataViewer\",\n", + " \"members\": {f\"serviceAccount:{your_service_account_email}\"},\n", + " }\n", + " policy.bindings.append(binding)\n", + " bqclient.set_iam_policy(table_id, policy)\n", + "\n", + " # Now that the table has been shared, bqclient with the temporary token\n", + " # is no longer needed.\n", + " example_workload(table_id)\n", + "\n", + "\n", + "def example_workload(table_id):\n", + " # For example, for one node workloads, use the client library to read the table\n", + " # as a pandas DataFrame.\n", + " from google.cloud import bigquery\n", + "\n", + " # This sample assumes this client is authenticated as the user\n", + " # your_service_account_email.\n", + " client = bigquery.Client()\n", + " pandas_df = client.list_rows(table_id).to_dataframe()\n", + " print(pandas_df)\n", + "\n", + "\n", + "df_to_gbq_plus_workload(df)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Preserving order\n", + "\n", + "Depending on your use case, you may want to include the ordering so that it can be restored within your application." + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "Query job 2aa7033c-c547-4ae2-a9aa-33272be82b9c is DONE. 0 Bytes processed. 
Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "'swast-scratch._63cfa399614a54153cc386c27d6c0c6fdb249f9e.bqdf20240327_b484a3967fba4a41850f4eb21b4b3bd8'" + ] + }, + "execution_count": 40, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ordering_column = \"ordering_id_maybe_with_some_random_text_to_avoid_collisions\"\n", + "table_id = df.to_gbq(ordering_id=ordering_column)\n", + "table_id" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Creating clustered tables\n", + "\n", + "Large tables can be optimized by passing in `clustering_columns` to create a [clustered table](https://cloud.google.com/bigquery/docs/clustered-tables)." + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "Query job 1d489f94-2840-405e-9114-d439dcfcf7aa is DONE. 0 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "'swast-scratch._63cfa399614a54153cc386c27d6c0c6fdb249f9e.bqdf20240327_d00699eeeed743b487c870dca5bcf23b'" + ] + }, + "execution_count": 41, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "table_id = df.to_gbq(clustering_columns=(\"index\", \"int_col\"))\n", + "table_id" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Returning a BigQuery DataFrames (bigframes) DataFrame\n", + "\n", + "The recommended way to construct a DataFrame is from a BigQuery table which has a unique primary key. By default a primary key is used as the index, which allows for more efficient queries than the default index generation.\n", + "\n", + "This sample assumes there is a shared dataset that\n", + "\n", + "1. The application can write to and\n", + "2. 
the bigframes user can read from.\n", + "\n", + "There are many ways an application can [write to a BigQuery table](https://cloud.google.com/bigquery/docs/loading-data), including BigQuery load jobs, DML, streaming REST API, and the BigQuery Write API. Each has different costs, performance, and limitations. Choose the one that best suits your application's needs." + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Dataset(DatasetReference('swast-scratch', 'my_dataset'))" + ] + }, + "execution_count": 42, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# The assumption is that there is a shared dataset to work with.\n", + "from google.cloud import bigquery\n", + "\n", + "bqclient = bigquery.Client()\n", + "bqclient.create_dataset(\"my_dataset\", exists_ok=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "Query job 40977e60-97c3-4c93-89e2-d7334e5af71d is DONE. 0 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job 81e35bb8-2e27-4a18-b596-15a7805331f0 is DONE. 270 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
statepostal_codepop
unique_index
2MI48105669
3GA303092581
5TX787015373
7CO803012087
11MA021422592
13IL606072630
17MI482012
19NC27701801
23CA926121115
29WA980334952
\n", + "

10 rows × 3 columns

\n", + "
[10 rows x 3 columns in total]" + ], + "text/plain": [ + " state postal_code pop\n", + "unique_index \n", + "2 MI 48105 669\n", + "3 GA 30309 2581\n", + "5 TX 78701 5373\n", + "7 CO 80301 2087\n", + "11 MA 02142 2592\n", + "13 IL 60607 2630\n", + "17 MI 48201 2\n", + "19 NC 27701 801\n", + "23 CA 92612 1115\n", + "29 WA 98033 4952\n", + "\n", + "[10 rows x 3 columns]" + ] + }, + "execution_count": 43, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# For simplicity, this sample assumes your application uses\n", + "# a load job with the CSV file format.\n", + "# See: https://cloud.google.com/bigquery/docs/loading-data-cloud-storage-csv#python\n", + "import datetime\n", + "import io\n", + "import random\n", + "\n", + "\n", + "def create_table_for_bigframes():\n", + " # This code is assumed to run on the application's backend.\n", + " from google.cloud import bigquery\n", + "\n", + " client = bigquery.Client()\n", + "\n", + " # The end-user is expected to have read access to this table.\n", + " table_suffix = f\"{datetime.datetime.now().strftime('%Y%m%d_%H%M%S_%f')}_{random.randrange(1_000_000)}\"\n", + " table_id = f\"{client.project}.my_dataset.integrations_ipynb_{table_suffix}\"\n", + "\n", + " # Best practice: set the primary key to a unique column to use as the\n", + " # index and default ordering in a BigQuery DataFrames (bigframes) DataFrame.\n", + " # Having a unique identity column allows the DataFrame to be constructed\n", + " # more efficiently.\n", + " #\n", + " # Note 1: Even a random UUID would be helpful for efficiency.\n", + " #\n", + " # Note 2: Don't do this if you can't guarantee uniqueness, as the BigQuery\n", + " # query engine uses this property to optimize queries. 
Non-unique primary\n", + " # keys result in undefined behavior.\n", + " #\n", + " # Note 3: client.create_table doesn't support primary key, so instead\n", + " # use DDL to create the table.\n", + " create_table_ddl = f\"\"\"\n", + " CREATE OR REPLACE TABLE `{table_id}`\n", + " (\n", + " unique_index INT64,\n", + " state STRING,\n", + " postal_code STRING,\n", + " pop INT64,\n", + " PRIMARY KEY (unique_index) NOT ENFORCED\n", + " )\n", + " -- Clustering by the index column can make joins and loc operations more efficient.\n", + " -- Also cluster by columns which are expected to be used as common filters.\n", + " CLUSTER BY unique_index, state\n", + " \"\"\"\n", + " client.query_and_wait(create_table_ddl)\n", + "\n", + " csv_file = io.BytesIO(\n", + "b\"\"\"unique_index,state,postal_code,pop\n", + "2,MI,48105,669\n", + "3,GA,30309,2581\n", + "5,TX,78701,5373\n", + "7,CO,80301,2087\n", + "11,MA,02142,2592\n", + "13,IL,60607,2630\n", + "17,MI,48201,2\n", + "19,NC,27701,801\n", + "23,CA,92612,1115\n", + "29,WA,98033,4952\n", + "\"\"\"\n", + " )\n", + " job_config = bigquery.LoadJobConfig(\n", + " skip_leading_rows=1,\n", + " source_format=bigquery.SourceFormat.CSV,\n", + " )\n", + " load_job = client.load_table_from_file(\n", + " csv_file, table_id, job_config=job_config\n", + " )\n", + " load_job.result() # Waits for the job to complete.\n", + "\n", + " return table_id\n", + "\n", + "\n", + "table_id = create_table_for_bigframes()\n", + "\n", + "\n", + "# This is assumed to run on the client.\n", + "import bigframes.pandas as bpd\n", + "df = bpd.read_gbq_table(table_id, index_col=[\"unique_index\"])\n", + "df" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "bigframes", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + 
"version": "3.11.1" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}