diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py
index 7e82ba125c..599546284b 100644
--- a/bigframes/dataframe.py
+++ b/bigframes/dataframe.py
@@ -306,6 +306,11 @@ def empty(self) -> bool:
def values(self) -> numpy.ndarray:
return self.to_numpy()
+ @property
+ def bqclient(self) -> bigquery.Client:
+ """BigQuery REST API Client the DataFrame uses for operations."""
+ return self._session.bqclient
+
@property
def _session(self) -> bigframes.Session:
return self._get_block().expr.session
diff --git a/notebooks/dataframes/dataframe.ipynb b/notebooks/dataframes/dataframe.ipynb
index 15da075552..de9bb1d04f 100644
--- a/notebooks/dataframes/dataframe.ipynb
+++ b/notebooks/dataframes/dataframe.ipynb
@@ -1,5 +1,27 @@
{
"cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "eeec3428",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Copyright 2023 Google LLC\n",
+ "#\n",
+ "# Licensed under the Apache License, Version 2.0 (the \"License\");\n",
+ "# you may not use this file except in compliance with the License.\n",
+ "# You may obtain a copy of the License at\n",
+ "#\n",
+ "# https://www.apache.org/licenses/LICENSE-2.0\n",
+ "#\n",
+ "# Unless required by applicable law or agreed to in writing, software\n",
+ "# distributed under the License is distributed on an \"AS IS\" BASIS,\n",
+ "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
+ "# See the License for the specific language governing permissions and\n",
+ "# limitations under the License."
+ ]
+ },
{
"attachments": {},
"cell_type": "markdown",
diff --git a/notebooks/dataframes/integrations.ipynb b/notebooks/dataframes/integrations.ipynb
new file mode 100644
index 0000000000..735e18d94e
--- /dev/null
+++ b/notebooks/dataframes/integrations.ipynb
@@ -0,0 +1,635 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 35,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Copyright 2024 Google LLC\n",
+ "#\n",
+ "# Licensed under the Apache License, Version 2.0 (the \"License\");\n",
+ "# you may not use this file except in compliance with the License.\n",
+ "# You may obtain a copy of the License at\n",
+ "#\n",
+ "# https://www.apache.org/licenses/LICENSE-2.0\n",
+ "#\n",
+ "# Unless required by applicable law or agreed to in writing, software\n",
+ "# distributed under the License is distributed on an \"AS IS\" BASIS,\n",
+ "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
+ "# See the License for the specific language governing permissions and\n",
+ "# limitations under the License."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Integrating with BigQuery DataFrames\n",
+ "\n",
+ "This notebook demonstrates operations for building applications that integrate with BigQuery DataFrames. Follow these samples to build an integration that accepts a BigQuery DataFrames object or returns one."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 36,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import bigframes.pandas as bpd\n",
+ "\n",
+ "# Sample data\n",
+ "df = bpd.DataFrame({\n",
+ " \"index\": [0, 1, 2, 3, 4],\n",
+ " \"int_col\": [1, 2, 3, 4, 5],\n",
+ " \"float_col\": [1.0, -0.5, 0.25, -0.125, 0.0625],\n",
+ " \"string_col\": [\"a\", \"b\", \"c\", \"d\", \"e\"],\n",
+ "}).set_index(\"index\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Accepting a BigQuery DataFrames (bigframes) DataFrame\n",
+ "\n",
+ "The recommended serialization format for a BigQuery DataFrames (bigframes) DataFrame is a BigQuery table. To write a DataFrame to a BigQuery table, use the `DataFrame.to_gbq()` method. With no `destination_table`, BigQuery DataFrames creates a table in the anonymous dataset corresponding to the BigQuery user & location and returns the corresponding table ID."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 37,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "Query job 00b5c727-f2bf-4265-be22-d7d505619db7 is DONE. 0 Bytes processed. Open Job"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/plain": [
+ "'swast-scratch._63cfa399614a54153cc386c27d6c0c6fdb249f9e.bqdf20240327_43bbc4c64fb947f7b69db570a5641506'"
+ ]
+ },
+ "execution_count": 37,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "table_id = df.to_gbq()\n",
+ "table_id"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Sharing the table with your application's backend\n",
+ "\n",
+ "Tables created in the user's anonymous dataset are only queryable by the user who created them. Many applications authenticate with a [service account](https://cloud.google.com/iam/docs/service-account-overview), which may be different from the end-user running BigQuery DataFrames (bigframes).\n",
+ "\n",
+ "Grant your application access to this table by granting your application's service account associated with the customer the `roles/bigquery.dataViewer` role on the [BigQuery table with an IAM policy](https://cloud.google.com/bigquery/docs/control-access-to-resources-iam#grant_access_to_a_table_or_view)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 38,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "Query job f9c39ac2-a428-45c9-bb3a-643fc62a1c5b is DONE. 0 Bytes processed. Open Job"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " index int_col float_col string_col\n",
+ "0 2 3 0.2500 c\n",
+ "1 4 5 0.0625 e\n",
+ "2 0 1 1.0000 a\n",
+ "3 1 2 -0.5000 b\n",
+ "4 3 4 -0.1250 d\n"
+ ]
+ }
+ ],
+ "source": [
+ "# This sample assumes the client code knows which service account to share with.\n",
+ "your_service_account_email = \"your-service-account@bigframes-samples.iam.gserviceaccount.com\"\n",
+ "\n",
+ "\n",
+ "def df_to_gbq_plus_workoad(df):\n",
+ " table_id = df.to_gbq()\n",
+ "\n",
+ " bqclient = df.bqclient\n",
+ " policy = bqclient.get_iam_policy(table_id)\n",
+ " binding = {\n",
+ " \"role\": \"roles/bigquery.dataViewer\",\n",
+ " \"members\": {f\"serviceAccount:{your_service_account_email}\"},\n",
+ " }\n",
+ " policy.bindings.append(binding)\n",
+ " bqclient.set_iam_policy(table_id, policy)\n",
+ "\n",
+ " # TODO(developer): Pass table_id to your application and start your workload.\n",
+ " example_workload(table_id)\n",
+ "\n",
+ "\n",
+ "def example_workload(table_id):\n",
+ " # For example, for one node workloads, use the client library to read the table\n",
+ " # as a pandas DataFrame.\n",
+ " from google.cloud import bigquery\n",
+ "\n",
+ " # This sample assumes this client is authenticated as the user\n",
+ " # your_service_account_email.\n",
+ " client = bigquery.Client()\n",
+ " pandas_df = client.list_rows(table_id).to_dataframe()\n",
+ " print(pandas_df)\n",
+ "\n",
+ "\n",
+ "df_to_gbq_plus_workoad(df)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 39,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "Query job ad53c7f2-e3bd-4667-b60b-b700c24b7a81 is DONE. 0 Bytes processed. Open Job"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " index int_col float_col string_col\n",
+ "0 4 5 0.0625 e\n",
+ "1 0 1 1.0000 a\n",
+ "2 2 3 0.2500 c\n",
+ "3 3 4 -0.1250 d\n",
+ "4 1 2 -0.5000 b\n"
+ ]
+ }
+ ],
+ "source": [
+ "# This sample assumes the client code doesn't know which service account to share with.\n",
+ "\n",
+ "\n",
+ "def df_to_gbq_plus_workoad(df):\n",
+ " table_id = df.to_gbq()\n",
+ "\n",
+ " bqclient = df.bqclient\n",
+ " token = bqclient._credentials.token\n",
+ " project_id = bqclient.project\n",
+ "\n",
+ " share_table_and_start_workload(table_id, token, project_id)\n",
+ "\n",
+ "\n",
+ "def share_table_and_start_workload(table_id, token, project_id):\n",
+ " # This code runs in the backend for your application.\n",
+ " from google.cloud import bigquery\n",
+ " import google.oauth2.credentials\n",
+ "\n",
+ " # Note: these credentials don't have any way to be refreshed,\n",
+ " # so only use them long enough to share the table with the\n",
+ " # service account.\n",
+ " credentials = google.oauth2.credentials.Credentials(token)\n",
+ " bqclient = bigquery.Client(\n",
+ " project=project_id,\n",
+ " credentials=credentials,\n",
+ " )\n",
+ "\n",
+ " # This is assumed to only be available on the backend.\n",
+ " your_service_account_email = \"your-service-account@bigframes-samples.iam.gserviceaccount.com\"\n",
+ " policy = bqclient.get_iam_policy(table_id)\n",
+ " binding = {\n",
+ " \"role\": \"roles/bigquery.dataViewer\",\n",
+ " \"members\": {f\"serviceAccount:{your_service_account_email}\"},\n",
+ " }\n",
+ " policy.bindings.append(binding)\n",
+ " bqclient.set_iam_policy(table_id, policy)\n",
+ "\n",
+ " # Now that the table has been shared, bqclient with the temporary token\n",
+ " # is no longer needed.\n",
+ " example_workload(table_id)\n",
+ "\n",
+ "\n",
+ "def example_workload(table_id):\n",
+ " # For example, for one node workloads, use the client library to read the table\n",
+ " # as a pandas DataFrame.\n",
+ " from google.cloud import bigquery\n",
+ "\n",
+ " # This sample assumes this client is authenticated as the user\n",
+ " # your_service_account_email.\n",
+ " client = bigquery.Client()\n",
+ " pandas_df = client.list_rows(table_id).to_dataframe()\n",
+ " print(pandas_df)\n",
+ "\n",
+ "\n",
+ "df_to_gbq_plus_workoad(df)\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Preserving order\n",
+ "\n",
+ "Depending on your use case, you may want to include the ordering so that it can be restored withing your application."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 40,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "Query job 2aa7033c-c547-4ae2-a9aa-33272be82b9c is DONE. 0 Bytes processed. Open Job"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/plain": [
+ "'swast-scratch._63cfa399614a54153cc386c27d6c0c6fdb249f9e.bqdf20240327_b484a3967fba4a41850f4eb21b4b3bd8'"
+ ]
+ },
+ "execution_count": 40,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "ordering_column = \"ordering_id_maybe_with_some_random_text_to_avoid_collisions\"\n",
+ "table_id = df.to_gbq(ordering_id=ordering_column)\n",
+ "table_id"
+ ]
+ },
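+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The next cell is a minimal sketch of restoring the order on the application side, assuming the `table_id` and `ordering_column` created above and that the application is authorized to read the table. It reads the table with the BigQuery client library, sorts by the ordering column, and then drops it."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# A minimal sketch, assuming the application can read table_id.\n",
+ "from google.cloud import bigquery\n",
+ "\n",
+ "client = bigquery.Client()\n",
+ "\n",
+ "# Read the table, restore the original row order, and drop the helper column.\n",
+ "pandas_df = (\n",
+ " client.list_rows(table_id)\n",
+ " .to_dataframe()\n",
+ " .sort_values(ordering_column)\n",
+ " .drop(columns=[ordering_column])\n",
+ " .reset_index(drop=True)\n",
+ ")\n",
+ "pandas_df"
+ ]
+ },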
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Creating clustered tables\n",
+ "\n",
+ "Large tables can be optimized by passing in `clustering_columns` to create a [clustered table](https://cloud.google.com/bigquery/docs/clustered-tables)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 41,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "Query job 1d489f94-2840-405e-9114-d439dcfcf7aa is DONE. 0 Bytes processed. Open Job"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/plain": [
+ "'swast-scratch._63cfa399614a54153cc386c27d6c0c6fdb249f9e.bqdf20240327_d00699eeeed743b487c870dca5bcf23b'"
+ ]
+ },
+ "execution_count": 41,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "table_id = df.to_gbq(clustering_columns=(\"index\", \"int_col\"))\n",
+ "table_id"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Returning a BigQuery DataFrames (bigframes) DataFrame\n",
+ "\n",
+ "The recommended way to construct a DataFrame is from a BigQuery table which has a unique primary key. By default a primary key is used as the index, which allows for more efficient queries than the default index generation.\n",
+ "\n",
+ "This sample assumes there is a shared dataset that\n",
+ "\n",
+ "1. The application can write to and\n",
+ "2. the bigframes user can read from.\n",
+ "\n",
+ "There are many ways an application can [write to a BigQuery table](https://cloud.google.com/bigquery/docs/loading-data), including BigQuery load jobs, DML, streaming REST API, and the BigQuery Write API. Each has different costs, performance, and limitations. Choose the one that best suits your application's needs."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 42,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Dataset(DatasetReference('swast-scratch', 'my_dataset'))"
+ ]
+ },
+ "execution_count": 42,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# The assumption is that there is a shared dataset to work with.\n",
+ "from google.cloud import bigquery\n",
+ "\n",
+ "bqclient = bigquery.Client()\n",
+ "bqclient.create_dataset(\"my_dataset\", exists_ok=True)"
+ ]
+ },
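+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Before the load-job sample, the next cell is a minimal sketch of the DML path mentioned above, assuming the shared dataset created in the previous cell. The table name is hypothetical. Load jobs are usually a better fit for bulk data; DML is convenient for small writes."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# A minimal DML sketch. The table name is hypothetical.\n",
+ "dml_table_id = f\"{bqclient.project}.my_dataset.integrations_ipynb_dml_example\"\n",
+ "\n",
+ "# Create a table with a primary key, then insert a few rows with DML.\n",
+ "bqclient.query_and_wait(\n",
+ " f\"\"\"\n",
+ " CREATE TABLE IF NOT EXISTS `{dml_table_id}` (\n",
+ " unique_index INT64,\n",
+ " state STRING,\n",
+ " PRIMARY KEY (unique_index) NOT ENFORCED\n",
+ " )\n",
+ " \"\"\"\n",
+ ")\n",
+ "bqclient.query_and_wait(\n",
+ " f\"INSERT INTO `{dml_table_id}` (unique_index, state) VALUES (1, 'CA'), (2, 'WA')\"\n",
+ ")"
+ ]
+ },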
+ {
+ "cell_type": "code",
+ "execution_count": 43,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "Query job 40977e60-97c3-4c93-89e2-d7334e5af71d is DONE. 0 Bytes processed. Open Job"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "Query job 81e35bb8-2e27-4a18-b596-15a7805331f0 is DONE. 270 Bytes processed. Open Job"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " state | \n",
+ " postal_code | \n",
+ " pop | \n",
+ "
\n",
+ " \n",
+ " unique_index | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 2 | \n",
+ " MI | \n",
+ " 48105 | \n",
+ " 669 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " GA | \n",
+ " 30309 | \n",
+ " 2581 | \n",
+ "
\n",
+ " \n",
+ " 5 | \n",
+ " TX | \n",
+ " 78701 | \n",
+ " 5373 | \n",
+ "
\n",
+ " \n",
+ " 7 | \n",
+ " CO | \n",
+ " 80301 | \n",
+ " 2087 | \n",
+ "
\n",
+ " \n",
+ " 11 | \n",
+ " MA | \n",
+ " 02142 | \n",
+ " 2592 | \n",
+ "
\n",
+ " \n",
+ " 13 | \n",
+ " IL | \n",
+ " 60607 | \n",
+ " 2630 | \n",
+ "
\n",
+ " \n",
+ " 17 | \n",
+ " MI | \n",
+ " 48201 | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " 19 | \n",
+ " NC | \n",
+ " 27701 | \n",
+ " 801 | \n",
+ "
\n",
+ " \n",
+ " 23 | \n",
+ " CA | \n",
+ " 92612 | \n",
+ " 1115 | \n",
+ "
\n",
+ " \n",
+ " 29 | \n",
+ " WA | \n",
+ " 98033 | \n",
+ " 4952 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
10 rows × 3 columns
\n",
+ "
[10 rows x 3 columns in total]"
+ ],
+ "text/plain": [
+ " state postal_code pop\n",
+ "unique_index \n",
+ "2 MI 48105 669\n",
+ "3 GA 30309 2581\n",
+ "5 TX 78701 5373\n",
+ "7 CO 80301 2087\n",
+ "11 MA 02142 2592\n",
+ "13 IL 60607 2630\n",
+ "17 MI 48201 2\n",
+ "19 NC 27701 801\n",
+ "23 CA 92612 1115\n",
+ "29 WA 98033 4952\n",
+ "\n",
+ "[10 rows x 3 columns]"
+ ]
+ },
+ "execution_count": 43,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# For simplicity, this sample assumes your application uses\n",
+ "# a load job with the CSV file format.\n",
+ "# See: https://cloud.google.com/bigquery/docs/loading-data-cloud-storage-csv#python\n",
+ "import datetime\n",
+ "import io\n",
+ "import random\n",
+ "\n",
+ "\n",
+ "def create_table_for_bigframes():\n",
+ " # This code is assumed to run on the application's backend.\n",
+ " from google.cloud import bigquery\n",
+ "\n",
+ " client = bigquery.Client()\n",
+ "\n",
+ " # The end-user is expected to have read access to this table.\n",
+ " table_suffix = f\"{datetime.datetime.now().strftime('%Y%m%d_%H%M%S_%f')}_{random.randrange(1_000_000)}\"\n",
+ " table_id = f\"{client.project}.my_dataset.integrations_ipynb_{table_suffix}\"\n",
+ "\n",
+ " # Best practice: set the primary key to a unique column to use as the\n",
+ " # index and default ordering in a BigQuery DataFrames (bigframes) DataFrame.\n",
+ " # Having a unique identity column allows the DataFrame to be constructed\n",
+ " # more efficiently.\n",
+ " #\n",
+ " # Note 1: Even a random UUID would be helpful for efficiency.\n",
+ " #\n",
+ " # Note 2: Don't do this if you can't guarantee uniqueness, as the BigQuery\n",
+ " # query engine uses this property to optimize queries. Non-unique primary\n",
+ " # keys result in undefined behavior.\n",
+ " #\n",
+ " # Note 3: client.create_table doesn't support primary key, so instead\n",
+ " # use DDL to create the table.\n",
+ " create_table_ddl = f\"\"\"\n",
+ " CREATE OR REPLACE TABLE `{table_id}`\n",
+ " (\n",
+ " unique_index INT64,\n",
+ " state STRING,\n",
+ " postal_code STRING,\n",
+ " pop INT64,\n",
+ " PRIMARY KEY (unique_index) NOT ENFORCED\n",
+ " )\n",
+ " -- Clustering by the index column can make joins and loc operations more efficient.\n",
+ " -- Also cluster by columns which are expected to be used as common filters.\n",
+ " CLUSTER BY unique_index, state\n",
+ " \"\"\"\n",
+ " client.query_and_wait(create_table_ddl)\n",
+ "\n",
+ " csv_file = io.BytesIO(\n",
+ "b\"\"\"unique_index,state,postal_code,pop\n",
+ "2,MI,48105,669\n",
+ "3,GA,30309,2581\n",
+ "5,TX,78701,5373\n",
+ "7,CO,80301,2087\n",
+ "11,MA,02142,2592\n",
+ "13,IL,60607,2630\n",
+ "17,MI,48201,2\n",
+ "19,NC,27701,801\n",
+ "23,CA,92612,1115\n",
+ "29,WA,98033,4952\n",
+ "\"\"\"\n",
+ " )\n",
+ " job_config = bigquery.LoadJobConfig(\n",
+ " skip_leading_rows=1,\n",
+ " source_format=bigquery.SourceFormat.CSV,\n",
+ " )\n",
+ " load_job = client.load_table_from_file(\n",
+ " csv_file, table_id, job_config=job_config\n",
+ " )\n",
+ " load_job.result() # Waits for the job to complete.\n",
+ "\n",
+ " return table_id\n",
+ "\n",
+ "\n",
+ "table_id = create_table_for_bigframes()\n",
+ "\n",
+ "\n",
+ "# This is assumed to run on the client.\n",
+ "import bigframes.pandas as bpd\n",
+ "df = bpd.read_gbq_table(table_id, index_col=[\"unique_index\"])\n",
+ "df"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "bigframes",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.11.1"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}