1 change: 1 addition & 0 deletions .gitignore
@@ -22,6 +22,7 @@
.pytest_cache
.testmon*
.vscode/
.env

# Docs #
########
6 changes: 6 additions & 0 deletions docs/source/changelog.rst
@@ -6,6 +6,10 @@ Changelog
0.7.0 / [unreleased]
--------------------

- `int` columns that contain `NULL` are now cast to `float`, rather than
  `object` type. (:issue:`174`)
- `DATE`, `DATETIME` and `TIMESTAMP` columns are now parsed as pandas'
  `Timestamp` objects. (:issue:`224`)
- Add :class:`pandas_gbq.Context` to cache credentials in-memory, across
calls to ``read_gbq`` and ``to_gbq``. (:issue:`198`, :issue:`208`)
- Fast queries now do not log above ``DEBUG`` level. (:issue:`204`)
@@ -20,6 +24,8 @@ Internal changes
~~~~~~~~~~~~~~~~

- Avoid listing datasets and tables in system tests. (:issue:`215`)
- Improved performance by eliminating some duplicative parsing steps.
  (:issue:`224`)

.. _changelog-0.6.1:

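The dtype changes summarized in the changelog are easiest to see end to end. A minimal sketch, assuming default application credentials and a placeholder project ID; the query is illustrative only:

```python
import pandas_gbq

# Hypothetical query: a nullable INTEGER column plus a TIMESTAMP column.
df = pandas_gbq.read_gbq(
    """
    SELECT 1 AS nullable_integer, CURRENT_TIMESTAMP() AS ts
    UNION ALL
    SELECT NULL, CURRENT_TIMESTAMP()
    """,
    project_id="my-project",  # placeholder
    dialect="standard",
)

# As of 0.7.0: nullable_integer comes back as float64 (NULL -> NaN) rather
# than object, and ts as datetime64[ns] rather than object.
print(df.dtypes)
```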
8 changes: 1 addition & 7 deletions noxfile.py
@@ -77,13 +77,7 @@ def test_latest_deps(session, python=latest_python):
@nox.session
def lint(session, python=latest_python):
session.install("black")
session.run(
"black",
"--check",
"--exclude",
"(\.git|\.hg|\.mypy_cache|\.tox|\.nox|\.venv|_build|buck-out|build|dist)",
".",
)
session.run("black", "--check", ".")


@nox.session
40 changes: 22 additions & 18 deletions pandas_gbq/gbq.py
@@ -283,7 +283,7 @@ def __init__(

# BQ Queries costs $5 per TB. First 1 TB per month is free
# see here for more: https://cloud.google.com/bigquery/pricing
self.query_price_for_TB = 5. / 2 ** 40 # USD/TB
self.query_price_for_TB = 5.0 / 2 ** 40 # USD/TB

def _start_timer(self):
self.start = time.time()
@@ -577,24 +577,41 @@ def _parse_schema(schema_fields):
# see:
# http://pandas.pydata.org/pandas-docs/dev/missing_data.html
# #missing-data-casting-rules-and-indexing
dtype_map = {"FLOAT": np.dtype(float), "TIMESTAMP": "M8[ns]"}
dtype_map = {
"FLOAT": np.dtype(float),
"TIMESTAMP": "datetime64[ns]",
"TIME": "datetime64[ns]",
"DATE": "datetime64[ns]",
"DATETIME": "datetime64[ns]",
"BOOLEAN": bool,
"INTEGER": np.int64,
}

for field in schema_fields:
name = str(field["name"])
if field["mode"].upper() == "REPEATED":
yield name, object
else:
dtype = dtype_map.get(field["type"].upper(), object)
dtype = dtype_map.get(field["type"].upper())
yield name, dtype


def _parse_data(schema, rows):

column_dtypes = OrderedDict(_parse_schema(schema["fields"]))

df = DataFrame(data=(iter(r) for r in rows), columns=column_dtypes.keys())

for column in df:
df[column] = df[column].astype(column_dtypes[column])
dtype = column_dtypes[column]
null_safe = (
df[column].notnull().all()
or dtype == float
or dtype == "datetime64[ns]"
)
if dtype and null_safe:
df[column] = df[column].astype(
column_dtypes[column], errors="ignore"
)
return df
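
The guard in `_parse_data` is what produces the changelog behavior: pandas' constructor infers a dtype for each column first, and the schema dtype is only applied when the cast is lossless, i.e. every value is present, or the target (`float`, `datetime64[ns]`) represents missing data natively. A standalone sketch of the same rule, with hypothetical column names and data:

```python
from collections import OrderedDict

import numpy as np
from pandas import DataFrame

# Hypothetical schema-derived dtypes, mirroring _parse_schema's dtype_map.
column_dtypes = OrderedDict([("counts", np.int64), ("maybe_counts", np.int64)])

# pandas infers int64 for "counts" and float64 for "maybe_counts" (NULL -> NaN).
df = DataFrame({"counts": [1, 2], "maybe_counts": [1, None]})

for column, dtype in column_dtypes.items():
    null_safe = (
        df[column].notnull().all()
        or dtype == float
        or dtype == "datetime64[ns]"
    )
    if dtype and null_safe:
        # Safe to apply the schema dtype; otherwise keep pandas' inference,
        # which is how a nullable INTEGER column ends up float64, not object.
        df[column] = df[column].astype(dtype, errors="ignore")

print(df.dtypes)  # counts: int64, maybe_counts: float64
```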


@@ -747,19 +764,6 @@ def read_gbq(
"Column order does not match this DataFrame."
)

# cast BOOLEAN and INTEGER columns from object to bool/int
# if they dont have any nulls AND field mode is not repeated (i.e., array)
type_map = {"BOOLEAN": bool, "INTEGER": np.int64}
for field in schema["fields"]:
if (
field["type"].upper() in type_map
and final_df[field["name"]].notnull().all()
and field["mode"].lower() != "repeated"
):
final_df[field["name"]] = final_df[field["name"]].astype(
type_map[field["type"].upper()]
)

connector.log_elapsed_seconds(
"Total time taken",
datetime.now().strftime("s.\nFinished at %Y-%m-%d %H:%M:%S."),
1 change: 1 addition & 0 deletions pyproject.toml
@@ -4,4 +4,5 @@ exclude = '''
versioneer.py
| _version.py
| docs
| .nox
'''
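
With black's exclude patterns now centralized in pyproject.toml (including the `.nox` entry added here), the simplified lint session in noxfile.py needs no `--exclude` flag; black discovers the configuration itself. A quick local equivalent, assuming black is installed:

```python
# Roughly what the nox lint session runs; black reads pyproject.toml itself.
import subprocess

subprocess.run(["black", "--check", "."], check=True)
```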
60 changes: 33 additions & 27 deletions tests/system/test_gbq.py
@@ -1,8 +1,8 @@
# -*- coding: utf-8 -*-

import sys
from datetime import datetime
import uuid
from datetime import datetime

import numpy as np
import pandas.util.testing as tm
@@ -200,9 +200,7 @@ def test_should_properly_handle_nullable_integers(self, project_id):
private_key=self.credentials,
dialect="legacy",
)
tm.assert_frame_equal(
df, DataFrame({"nullable_integer": [1, None]}).astype(object)
)
tm.assert_frame_equal(df, DataFrame({"nullable_integer": [1, None]}))

def test_should_properly_handle_valid_longs(self, project_id):
query = "SELECT 1 << 62 AS valid_long"
@@ -225,7 +223,7 @@ def test_should_properly_handle_nullable_longs(self, project_id):
dialect="legacy",
)
tm.assert_frame_equal(
df, DataFrame({"nullable_long": [1 << 62, None]}).astype(object)
df, DataFrame({"nullable_long": [1 << 62, None]})
)

def test_should_properly_handle_null_integers(self, project_id):
@@ -338,35 +336,43 @@ def test_should_properly_handle_arbitrary_timestamp(self, project_id):
),
)

def test_should_properly_handle_null_timestamp(self, project_id):
query = "SELECT TIMESTAMP(NULL) AS null_timestamp"
df = gbq.read_gbq(
query,
project_id=project_id,
private_key=self.credentials,
dialect="legacy",
)
tm.assert_frame_equal(df, DataFrame({"null_timestamp": [NaT]}))
@pytest.mark.parametrize(
"expression, type_",
[
("current_date()", "<M8[ns]"),
("current_timestamp()", "<M8[ns]"),
("current_datetime()", "<M8[ns]"),
("TRUE", bool),
("FALSE", bool),
],
)
def test_return_correct_types(self, project_id, expression, type_):
"""
All type checks can be added to this function using additional
parameters, rather than creating additional functions.
We can consolidate the existing functions here over time.

def test_should_properly_handle_true_boolean(self, project_id):
query = "SELECT BOOLEAN(TRUE) AS true_boolean"
TODO: time doesn't currently parse
("time(12,30,00)", "<M8[ns]"),
"""
query = "SELECT {} AS _".format(expression)
df = gbq.read_gbq(
query,
project_id=project_id,
private_key=self.credentials,
dialect="legacy",
dialect="standard",
)
tm.assert_frame_equal(df, DataFrame({"true_boolean": [True]}))
assert df["_"].dtype == type_

def test_should_properly_handle_false_boolean(self, project_id):
query = "SELECT BOOLEAN(FALSE) AS false_boolean"
def test_should_properly_handle_null_timestamp(self, project_id):
query = "SELECT TIMESTAMP(NULL) AS null_timestamp"
df = gbq.read_gbq(
query,
project_id=project_id,
private_key=self.credentials,
dialect="legacy",
)
tm.assert_frame_equal(df, DataFrame({"false_boolean": [False]}))
tm.assert_frame_equal(df, DataFrame({"null_timestamp": [NaT]}))

def test_should_properly_handle_null_boolean(self, project_id):
query = "SELECT BOOLEAN(NULL) AS null_boolean"
@@ -741,12 +747,12 @@ def test_query_response_bytes(self):
assert self.gbq_connector.sizeof_fmt(1048576) == "1.0 MB"
assert self.gbq_connector.sizeof_fmt(1048576000) == "1000.0 MB"
assert self.gbq_connector.sizeof_fmt(1073741824) == "1.0 GB"
assert self.gbq_connector.sizeof_fmt(1.099512E12) == "1.0 TB"
assert self.gbq_connector.sizeof_fmt(1.125900E15) == "1.0 PB"
assert self.gbq_connector.sizeof_fmt(1.152922E18) == "1.0 EB"
assert self.gbq_connector.sizeof_fmt(1.180592E21) == "1.0 ZB"
assert self.gbq_connector.sizeof_fmt(1.208926E24) == "1.0 YB"
assert self.gbq_connector.sizeof_fmt(1.208926E28) == "10000.0 YB"
assert self.gbq_connector.sizeof_fmt(1.099512e12) == "1.0 TB"
assert self.gbq_connector.sizeof_fmt(1.125900e15) == "1.0 PB"
assert self.gbq_connector.sizeof_fmt(1.152922e18) == "1.0 EB"
assert self.gbq_connector.sizeof_fmt(1.180592e21) == "1.0 ZB"
assert self.gbq_connector.sizeof_fmt(1.208926e24) == "1.0 YB"
assert self.gbq_connector.sizeof_fmt(1.208926e28) == "10000.0 YB"
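
The expected strings in `test_query_response_bytes` follow the familiar humanized-bytes pattern: repeatedly divide by 1024 and format with one decimal place. A minimal sketch that would satisfy these assertions (an assumption about the shape of `GbqConnector.sizeof_fmt`, not its actual source):

```python
def sizeof_fmt(num, suffix="B"):
    # Step through the binary-prefixed units, dividing by 1024 each time.
    fmt = "%3.1f %s%s"
    for unit in ["", "K", "M", "G", "T", "P", "E", "Z"]:
        if abs(num) < 1024.0:
            return fmt % (num, unit, suffix)
        num /= 1024.0
    return fmt % (num, "Y", suffix)

print(sizeof_fmt(1048576))      # 1.0 MB
print(sizeof_fmt(1.208926e28))  # 10000.0 YB
```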

def test_struct(self, project_id):
query = """SELECT 1 int_field,
1 change: 0 additions & 1 deletion tests/unit/test_schema.py
@@ -1,4 +1,3 @@

import datetime

import pandas
Expand Down