Skip to content

Commit 97c9aaa

Browse files
vreyespue and tswast
authored
ENH: add project id to destination table in to_gbq() (#347)
ENH: add project id to destination table in to_gbq() (#347)

* ENH: add project id to destination table in to_gbq()
* ENH: fix non-callable client error when adding project id to destination table
* Update pandas_gbq/gbq.py (pass table reference)

  Co-authored-by: Tim Swast <[email protected]>
* Update pandas_gbq/load.py (pass destination table)

  Co-authored-by: Tim Swast <[email protected]>
* Update pandas_gbq/load.py (delete unnecessary variable)

  Co-authored-by: Tim Swast <[email protected]>
* Update pandas_gbq/gbq.py (pass destination_table_ref)

  Co-authored-by: Tim Swast <[email protected]>
* Fix call to load.load_chunks (now using only destination_table_ref)
* add assertions for project ID to unit test
* add to changelog
* use project from credentials if none provided

Co-authored-by: Tim Swast <[email protected]>
1 parent ac2d2fe commit 97c9aaa

File tree

4 files changed

+87
-18
lines changed

4 files changed

+87
-18
lines changed

docs/source/changelog.rst

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,19 @@
11
Changelog
22
=========
33

4+
.. _changelog-0.15.0:
5+
6+
0.15.0 / TBD
7+
------------
8+
9+
Features
10+
~~~~~~~~
11+
12+
- Load DataFrame with ``to_gbq`` to a table in a project different from the API
13+
client project. Specify the target table ID as ``project.dataset.table`` to
14+
use this feature. (:issue:`321`, :issue:`347`)
15+
16+
417
.. _changelog-0.14.1:
518

619
0.14.1 / 2020-11-10

pandas_gbq/gbq.py

Lines changed: 28 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -604,8 +604,7 @@ def _download_results(
604604
def load_data(
605605
self,
606606
dataframe,
607-
dataset_id,
608-
table_id,
607+
destination_table_ref,
609608
chunksize=None,
610609
schema=None,
611610
progress_bar=True,
@@ -618,8 +617,7 @@ def load_data(
618617
chunks = load.load_chunks(
619618
self.client,
620619
dataframe,
621-
dataset_id,
622-
table_id,
620+
destination_table_ref,
623621
chunksize=chunksize,
624622
schema=schema,
625623
location=self.location,
@@ -1037,7 +1035,8 @@ def to_gbq(
10371035
dataframe : pandas.DataFrame
10381036
DataFrame to be written to a Google BigQuery table.
10391037
destination_table : str
1040-
Name of table to be written, in the form ``dataset.tablename``.
1038+
Name of table to be written, in the form ``dataset.tablename`` or
1039+
``project.dataset.tablename``.
10411040
project_id : str, optional
10421041
Google BigQuery Account project ID. Optional when available from
10431042
the environment.
@@ -1133,7 +1132,8 @@ def to_gbq(
11331132

11341133
if "." not in destination_table:
11351134
raise NotFoundException(
1136-
"Invalid Table Name. Should be of the form 'datasetId.tableId' "
1135+
"Invalid Table Name. Should be of the form 'datasetId.tableId' or "
1136+
"'projectId.datasetId.tableId'"
11371137
)
11381138

11391139
connector = GbqConnector(
@@ -1145,7 +1145,14 @@ def to_gbq(
11451145
private_key=private_key,
11461146
)
11471147
bqclient = connector.client
1148-
dataset_id, table_id = destination_table.rsplit(".", 1)
1148+
1149+
destination_table_ref = bigquery.table.TableReference.from_string(
1150+
destination_table, default_project=connector.project_id
1151+
)
1152+
1153+
project_id_table = destination_table_ref.project
1154+
dataset_id = destination_table_ref.dataset_id
1155+
table_id = destination_table_ref.table_id
11491156

11501157
default_schema = _generate_bq_schema(dataframe)
11511158
if not table_schema:
@@ -1157,10 +1164,10 @@ def to_gbq(
11571164

11581165
# If table exists, check if_exists parameter
11591166
try:
1160-
table = bqclient.get_table(destination_table)
1167+
table = bqclient.get_table(destination_table_ref)
11611168
except google_exceptions.NotFound:
11621169
table_connector = _Table(
1163-
project_id,
1170+
project_id_table,
11641171
dataset_id,
11651172
location=location,
11661173
credentials=connector.credentials,
@@ -1203,8 +1210,7 @@ def to_gbq(
12031210

12041211
connector.load_data(
12051212
dataframe,
1206-
dataset_id,
1207-
table_id,
1213+
destination_table_ref,
12081214
chunksize=chunksize,
12091215
schema=table_schema,
12101216
progress_bar=progress_bar,
@@ -1279,8 +1285,12 @@ def exists(self, table_id):
12791285
true if table exists, otherwise false
12801286
"""
12811287
from google.api_core.exceptions import NotFound
1288+
from google.cloud.bigquery import DatasetReference
1289+
from google.cloud.bigquery import TableReference
12821290

1283-
table_ref = self.client.dataset(self.dataset_id).table(table_id)
1291+
table_ref = TableReference(
1292+
DatasetReference(self.project_id, self.dataset_id), table_id
1293+
)
12841294
try:
12851295
self.client.get_table(table_ref)
12861296
return True
@@ -1300,12 +1310,14 @@ def create(self, table_id, schema):
13001310
Use the generate_bq_schema to generate your table schema from a
13011311
dataframe.
13021312
"""
1313+
from google.cloud.bigquery import DatasetReference
13031314
from google.cloud.bigquery import SchemaField
13041315
from google.cloud.bigquery import Table
1316+
from google.cloud.bigquery import TableReference
13051317

13061318
if self.exists(table_id):
13071319
raise TableCreationError(
1308-
"Table {0} already " "exists".format(table_id)
1320+
"Table {0} already exists".format(table_id)
13091321
)
13101322

13111323
if not _Dataset(self.project_id, credentials=self.credentials).exists(
@@ -1317,7 +1329,9 @@ def create(self, table_id, schema):
13171329
location=self.location,
13181330
).create(self.dataset_id)
13191331

1320-
table_ref = self.client.dataset(self.dataset_id).table(table_id)
1332+
table_ref = TableReference(
1333+
DatasetReference(self.project_id, self.dataset_id), table_id
1334+
)
13211335
table = Table(table_ref)
13221336

13231337
schema = pandas_gbq.schema.add_default_nullable_mode(schema)

pandas_gbq/load.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -50,13 +50,11 @@ def encode_chunks(dataframe, chunksize=None):
5050
def load_chunks(
5151
client,
5252
dataframe,
53-
dataset_id,
54-
table_id,
53+
destination_table_ref,
5554
chunksize=None,
5655
schema=None,
5756
location=None,
5857
):
59-
destination_table = client.dataset(dataset_id).table(table_id)
6058
job_config = bigquery.LoadJobConfig()
6159
job_config.write_disposition = "WRITE_APPEND"
6260
job_config.source_format = "CSV"
@@ -77,7 +75,7 @@ def load_chunks(
7775
yield remaining_rows
7876
client.load_table_from_file(
7977
chunk_buffer,
80-
destination_table,
78+
destination_table_ref,
8179
job_config=job_config,
8280
location=location,
8381
).result()

tests/unit/test_gbq.py

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -257,6 +257,50 @@ def test_to_gbq_w_empty_df(mock_bigquery_client):
257257
mock_bigquery_client.load_table_from_file.assert_not_called()
258258

259259

260+
def test_to_gbq_w_default_project(mock_bigquery_client):
261+
"""If no project is specified, we should be able to use project from
262+
default credentials.
263+
"""
264+
import google.api_core.exceptions
265+
from google.cloud.bigquery.table import TableReference
266+
267+
mock_bigquery_client.get_table.side_effect = (
268+
google.api_core.exceptions.NotFound("my_table")
269+
)
270+
gbq.to_gbq(DataFrame(), "my_dataset.my_table")
271+
272+
mock_bigquery_client.get_table.assert_called_with(
273+
TableReference.from_string("default-project.my_dataset.my_table")
274+
)
275+
mock_bigquery_client.create_table.assert_called_with(mock.ANY)
276+
table = mock_bigquery_client.create_table.call_args[0][0]
277+
assert table.project == "default-project"
278+
279+
280+
def test_to_gbq_w_project_table(mock_bigquery_client):
281+
"""If a project is included in the table ID, use that instead of the client
282+
project. See: https://github.com/pydata/pandas-gbq/issues/321
283+
"""
284+
import google.api_core.exceptions
285+
from google.cloud.bigquery.table import TableReference
286+
287+
mock_bigquery_client.get_table.side_effect = (
288+
google.api_core.exceptions.NotFound("my_table")
289+
)
290+
gbq.to_gbq(
291+
DataFrame(),
292+
"project_table.my_dataset.my_table",
293+
project_id="project_client",
294+
)
295+
296+
mock_bigquery_client.get_table.assert_called_with(
297+
TableReference.from_string("project_table.my_dataset.my_table")
298+
)
299+
mock_bigquery_client.create_table.assert_called_with(mock.ANY)
300+
table = mock_bigquery_client.create_table.call_args[0][0]
301+
assert table.project == "project_table"
302+
303+
260304
def test_to_gbq_creates_dataset(mock_bigquery_client):
261305
import google.api_core.exceptions
262306

0 commit comments

Comments (0)