Skip to content

feat: support INFORMATION_SCHEMA views in read_gbq #1895

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 11 commits into
base: main
Choose a base branch
from
57 changes: 51 additions & 6 deletions bigframes/session/_io/bigquery/read_gbq_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,17 +42,53 @@
import bigframes.session


def get_information_schema_metadata(
    bqclient: bigquery.Client,
    table_id: str,
    default_project: Optional[str],
) -> bigquery.Table:
    """Construct table metadata for an ``INFORMATION_SCHEMA`` view.

    ``INFORMATION_SCHEMA`` views cannot be fetched through the ``tables.get``
    REST API, so this issues a dry-run ``SELECT *`` query to discover the
    view's schema and location, then assembles a synthetic
    ``bigquery.Table`` from the dotted ``table_id``.

    Args:
        bqclient:
            Client used to issue the dry-run query.
        table_id:
            Dotted identifier such as
            ``"region-US.INFORMATION_SCHEMA.JOBS_BY_USER"`` or
            ``"project.dataset.INFORMATION_SCHEMA.TABLES"``.
        default_project:
            Project to fall back on when ``table_id`` has fewer than three
            dot-separated parts (i.e. no explicit project component).

    Returns:
        A ``bigquery.Table`` carrying the view's schema and location. Its
        table reference is synthetic and not suitable for ``tables.get``.

    Raises:
        ValueError: If ``table_id`` contains a backtick, which would break
            out of the quoted identifier in the generated query.
    """
    # The ID is interpolated into a backtick-quoted identifier below, so a
    # backtick in the input could escape the quoting. Reject it outright
    # rather than attempting to escape.
    if "`" in table_id:
        raise ValueError(f"Invalid table ID: {repr(table_id)}")

    # A dry-run query costs nothing and returns the result schema and the
    # location without reading any data.
    job_config = bigquery.QueryJobConfig(dry_run=True)
    job = bqclient.query(
        f"SELECT * FROM `{table_id}`",
        job_config=job_config,
    )

    parts = table_id.split(".")
    if len(parts) < 3:
        # No explicit project component, e.g. "dataset.INFORMATION_SCHEMA.X"
        # would NOT land here, but "INFORMATION_SCHEMA.X" does.
        project = default_project
        dataset = parts[0]
        table_id_short = ".".join(parts[1:])
    else:
        # NOTE(review): for 3-part IDs such as
        # "region-US.INFORMATION_SCHEMA.JOBS_BY_USER" or
        # "dataset.INFORMATION_SCHEMA.TABLES", parts[0] is treated as the
        # project even though it may actually be a region qualifier or a
        # dataset. The reference is only used as a synthetic label, never
        # for a tables.get call — confirm this is acceptable.
        project = parts[0]
        dataset = parts[1]
        table_id_short = ".".join(parts[2:])

    table = bigquery.Table.from_api_repr(
        {
            "tableReference": {
                "projectId": project,
                "datasetId": dataset,
                "tableId": table_id_short,
            },
            "location": job.location,
        }
    )
    # Dry-run jobs expose the result schema; attach it so callers can read
    # column types without actually running the query.
    table.schema = job.schema
    return table


def get_table_metadata(
bqclient: bigquery.Client,
table_ref: google.cloud.bigquery.table.TableReference,
bq_time: datetime.datetime,
*,
cache: Dict[bigquery.TableReference, Tuple[datetime.datetime, bigquery.Table]],
table_id: str,
default_project: Optional[str],
bq_time: datetime.datetime,
cache: Dict[str, Tuple[datetime.datetime, bigquery.Table]],
use_cache: bool = True,
) -> Tuple[datetime.datetime, google.cloud.bigquery.table.Table]:
"""Get the table metadata, either from cache or via REST API."""

cached_table = cache.get(table_ref)
cached_table = cache.get(table_id)
if use_cache and cached_table is not None:
snapshot_timestamp, _ = cached_table

Expand All @@ -76,15 +112,24 @@ def get_table_metadata(
warnings.warn(msg, stacklevel=7)
return cached_table

table = bqclient.get_table(table_ref)
if "INFORMATION_SCHEMA".casefold() in table_id.casefold():
table = get_information_schema_metadata(
bqclient=bqclient, table_id=table_id, default_project=default_project
)
else:
table_ref = google.cloud.bigquery.table.TableReference.from_string(
table_id, default_project=default_project
)
table = bqclient.get_table(table_ref)

# local time will lag a little bit do to network latency
# make sure it is at least table creation time.
# This is relevant if the table was created immediately before loading it here.
if (table.created is not None) and (table.created > bq_time):
bq_time = table.created

cached_table = (bq_time, table)
cache[table_ref] = cached_table
cache[table_id] = cached_table
return cached_table


Expand Down
11 changes: 3 additions & 8 deletions bigframes/session/loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -268,9 +268,7 @@ def __init__(
self._default_index_type = default_index_type
self._scan_index_uniqueness = scan_index_uniqueness
self._force_total_order = force_total_order
self._df_snapshot: Dict[
bigquery.TableReference, Tuple[datetime.datetime, bigquery.Table]
] = {}
self._df_snapshot: Dict[str, Tuple[datetime.datetime, bigquery.Table]] = {}
self._metrics = metrics
# Unfortunate circular reference, but need to pass reference when constructing objects
self._session = session
Expand Down Expand Up @@ -617,10 +615,6 @@ def read_gbq_table(

_check_duplicates("columns", columns)

table_ref = google.cloud.bigquery.table.TableReference.from_string(
table_id, default_project=self._bqclient.project
)

columns = list(columns)
include_all_columns = columns is None or len(columns) == 0
filters = typing.cast(list, list(filters))
Expand All @@ -631,7 +625,8 @@ def read_gbq_table(

time_travel_timestamp, table = bf_read_gbq_table.get_table_metadata(
self._bqclient,
table_ref=table_ref,
table_id=table_id,
default_project=self._bqclient.project,
bq_time=self._clock.get_time(),
cache=self._df_snapshot,
use_cache=use_cache,
Expand Down
18 changes: 18 additions & 0 deletions tests/system/small/pandas/test_read_gbq_information_schema.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


def test_read_gbq_jobs_by_user_returns_schema(session):
    """read_gbq on a regional INFORMATION_SCHEMA view surfaces a real schema.

    INFORMATION_SCHEMA views are not available via the tables.get REST API,
    so this exercises the dry-run-query metadata path.
    """
    df = session.read_gbq("region-US.INFORMATION_SCHEMA.JOBS_BY_USER")
    # ``dtypes`` is never None on a DataFrame, so asserting non-emptiness is
    # the meaningful check that a schema was actually discovered.
    assert len(df.dtypes) > 0
Loading