diff --git a/docs/source/changelog.rst b/docs/source/changelog.rst
index c2c14cf5..d710b37f 100644
--- a/docs/source/changelog.rst
+++ b/docs/source/changelog.rst
@@ -12,6 +12,12 @@ Changelog
   version. This is required to use new functionality such as the BigQuery
   Storage API. (:issue:`267`)
 
+Documentation
+~~~~~~~~~~~~~
+
+- Document :ref:`BigQuery data type to pandas dtype conversion
+  <reading-dtypes>` for ``read_gbq``. (:issue:`269`)
+
 Dependency updates
 ~~~~~~~~~~~~~~~~~~
 
@@ -27,11 +33,14 @@ Internal changes
 
 Enhancements
 ~~~~~~~~~~~~
+
 - Allow ``table_schema`` in :func:`to_gbq` to contain only a subset of columns,
   with the rest being populated using the DataFrame dtypes (:issue:`218`)
   (contributed by @johnpaton)
 - Read ``project_id`` in :func:`to_gbq` from provided ``credentials`` if
   available (contributed by @daureg)
+- ``read_gbq`` uses the timezone-aware ``DatetimeTZDtype(unit='ns',
+  tz='UTC')`` dtype for BigQuery ``TIMESTAMP`` columns. (:issue:`269`)
 
 .. _changelog-0.9.0:
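For context, the enhancement above changes what ``read_gbq`` returns for
``TIMESTAMP`` columns. The sketch below (illustrative only, with placeholder
values; not part of the patch) shows the dtype difference under pandas 0.24+:

.. code-block:: python

    import pandas

    # Placeholder values standing in for a BigQuery TIMESTAMP column.
    naive = pandas.Series(pandas.to_datetime(["1970-01-01 00:00:00"]))
    aware = naive.dt.tz_localize("UTC")  # what read_gbq now returns

    print(naive.dtype)  # datetime64[ns]
    print(aware.dtype)  # datetime64[ns, UTC]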
diff --git a/docs/source/reading.rst b/docs/source/reading.rst
index add61ed2..4a7b9d66 100644
--- a/docs/source/reading.rst
+++ b/docs/source/reading.rst
@@ -9,21 +9,32 @@ Suppose you want to load all data from an existing BigQuery table
 
 .. code-block:: python
 
-    # Insert your BigQuery Project ID Here
-    # Can be found in the Google web console
+    import pandas_gbq
+
+    # TODO: Set your BigQuery Project ID.
     projectid = "xxxxxxxx"
 
-    data_frame = read_gbq('SELECT * FROM test_dataset.test_table', projectid)
+    data_frame = pandas_gbq.read_gbq(
+        'SELECT * FROM `test_dataset.test_table`',
+        project_id=projectid)
+
+.. note::
+    A project ID is sometimes optional if it can be inferred during
+    authentication, but it is required when authenticating with user
+    credentials. You can find your project ID in the `Google Cloud console
+    <https://console.cloud.google.com>`__.
 
 You can define which column from BigQuery to use as an index in the
 destination DataFrame as well as a preferred column order as follows:
 
 .. code-block:: python
 
-    data_frame = read_gbq('SELECT * FROM test_dataset.test_table',
-                          index_col='index_column_name',
-                          col_order=['col1', 'col2', 'col3'], projectid)
+    data_frame = pandas_gbq.read_gbq(
+        'SELECT * FROM `test_dataset.test_table`',
+        project_id=projectid,
+        index_col='index_column_name',
+        col_order=['col1', 'col2', 'col3'])
 
 
 You can specify the query config as parameter to use additional options of
@@ -37,20 +48,39 @@ your job. For more information about query configuration parameters see `here
         "useQueryCache": False
         }
     }
-    data_frame = read_gbq('SELECT * FROM test_dataset.test_table',
-                          configuration=configuration, projectid)
+    data_frame = read_gbq(
+        'SELECT * FROM `test_dataset.test_table`',
+        project_id=projectid,
+        configuration=configuration)
 
-.. note::
+The ``dialect`` argument can be used to indicate whether to use
+BigQuery's ``'legacy'`` SQL or BigQuery's ``'standard'`` SQL (beta). The
+default value is ``'standard'``. For more information on BigQuery's standard
+SQL, see `BigQuery SQL Reference
+<https://cloud.google.com/bigquery/sql-reference/>`__.
 
-    You can find your project id in the `Google developers console
-    <https://console.developers.google.com>`__.
+.. code-block:: python
 
+    data_frame = pandas_gbq.read_gbq(
+        'SELECT * FROM [test_dataset.test_table]',
+        project_id=projectid,
+        dialect='legacy')
 
-.. note::
-
-    The ``dialect`` argument can be used to indicate whether to use BigQuery's ``'legacy'`` SQL
-    or BigQuery's ``'standard'`` SQL (beta). The default value is ``'legacy'``, though this will change
-    in a subsequent release to ``'standard'``. For more information
-    on BigQuery's standard SQL, see `BigQuery SQL Reference
-    <https://cloud.google.com/bigquery/sql-reference/>`__
+.. _reading-dtypes:
+
+Inferring the DataFrame's dtypes
+--------------------------------
+
+The :func:`~pandas_gbq.read_gbq` method infers the pandas dtype for each
+column, based on the BigQuery table schema.
+
+================== =========================
+BigQuery Data Type dtype
+================== =========================
+FLOAT              float
+TIMESTAMP          :class:`~pandas.DatetimeTZDtype` with ``unit='ns'`` and ``tz='UTC'``
+DATETIME           datetime64[ns]
+TIME               datetime64[ns]
+DATE               datetime64[ns]
+================== =========================
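The dtype mapping documented above can be checked against a live query. This
is a sketch only: it assumes credentials are already configured, and
``my-project`` is a placeholder project ID:

.. code-block:: python

    import pandas_gbq

    # Any query returning a TIMESTAMP column behaves the same way.
    df = pandas_gbq.read_gbq(
        "SELECT CURRENT_TIMESTAMP() AS ts",
        project_id="my-project",
    )
    print(df["ts"].dtype)  # datetime64[ns, UTC]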
diff --git a/pandas_gbq/gbq.py b/pandas_gbq/gbq.py
index 17d18263..b9978887 100644
--- a/pandas_gbq/gbq.py
+++ b/pandas_gbq/gbq.py
@@ -494,6 +494,9 @@ def run_query(self, query, **kwargs):
         if df.empty:
             df = _cast_empty_df_dtypes(schema_fields, df)
 
+        # Ensure any TIMESTAMP columns are tz-aware.
+        df = _localize_df(schema_fields, df)
+
         logger.debug("Got {} rows.\n".format(rows_iter.total_rows))
 
         return df
@@ -644,17 +647,21 @@ def delete_and_recreate_table(self, dataset_id, table_id, table_schema):
 
 
 def _bqschema_to_nullsafe_dtypes(schema_fields):
-    # Only specify dtype when the dtype allows nulls. Otherwise, use pandas's
-    # default dtype choice.
-    #
-    # See:
-    # http://pandas.pydata.org/pandas-docs/dev/missing_data.html
-    # #missing-data-casting-rules-and-indexing
+    """Specify explicit dtypes based on BigQuery schema.
+
+    This function only specifies a dtype when the dtype allows nulls.
+    Otherwise, pandas's default dtype choice is used.
+
+    See: http://pandas.pydata.org/pandas-docs/dev/missing_data.html
+    #missing-data-casting-rules-and-indexing
+    """
+    # If you update this mapping, also update the table at
+    # `docs/source/reading.rst`.
     dtype_map = {
         "FLOAT": np.dtype(float),
-        # Even though TIMESTAMPs are timezone-aware in BigQuery, pandas doesn't
-        # support datetime64[ns, UTC] as dtype in DataFrame constructors. See:
-        # https://github.com/pandas-dev/pandas/issues/12513
+        # pandas doesn't support timezone-aware dtypes in DataFrame/Series
+        # constructors. It's more idiomatic to localize after construction.
+        # https://github.com/pandas-dev/pandas/issues/25843
         "TIMESTAMP": "datetime64[ns]",
         "TIME": "datetime64[ns]",
         "DATE": "datetime64[ns]",
@@ -702,6 +709,24 @@ def _cast_empty_df_dtypes(schema_fields, df):
     return df
 
 
+def _localize_df(schema_fields, df):
+    """Localize any TIMESTAMP columns to tz-aware type.
+
+    In pandas versions before 0.24.0, DatetimeTZDtype cannot be used as the
+    dtype in Series/DataFrame construction, so localize those columns after
+    the DataFrame is constructed.
+    """
+    for field in schema_fields:
+        column = str(field["name"])
+        if field["mode"].upper() == "REPEATED":
+            continue
+
+        if field["type"].upper() == "TIMESTAMP" and df[column].dt.tz is None:
+            df[column] = df[column].dt.tz_localize("UTC")
+
+    return df
+
+
 def read_gbq(
     query,
     project_id=None,
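The localization step added in ``_localize_df`` can be reproduced on a toy
DataFrame. This standalone sketch mirrors the patched logic with a
hypothetical one-column schema:

.. code-block:: python

    import pandas

    # Hypothetical schema entry, shaped like a BigQuery schema field.
    schema_fields = [{"name": "ts", "type": "TIMESTAMP", "mode": "NULLABLE"}]
    df = pandas.DataFrame({"ts": pandas.to_datetime(["2004-09-15 05:00:00"])})

    for field in schema_fields:
        column = str(field["name"])
        if field["mode"].upper() == "REPEATED":
            continue
        if field["type"].upper() == "TIMESTAMP" and df[column].dt.tz is None:
            df[column] = df[column].dt.tz_localize("UTC")

    print(df["ts"].dtype)  # datetime64[ns, UTC]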
diff --git a/tests/system/test_gbq.py b/tests/system/test_gbq.py
index 4480f203..6c876068 100644
--- a/tests/system/test_gbq.py
+++ b/tests/system/test_gbq.py
@@ -310,13 +310,15 @@ def test_should_properly_handle_timestamp_unix_epoch(self, project_id):
             credentials=self.credentials,
             dialect="legacy",
         )
-        tm.assert_frame_equal(
-            df,
-            DataFrame(
-                {"unix_epoch": ["1970-01-01T00:00:00.000000Z"]},
-                dtype="datetime64[ns]",
-            ),
+        expected = DataFrame(
+            {"unix_epoch": ["1970-01-01T00:00:00.000000Z"]},
+            dtype="datetime64[ns]",
         )
+        if expected["unix_epoch"].dt.tz is None:
+            expected["unix_epoch"] = expected["unix_epoch"].dt.tz_localize(
+                "UTC"
+            )
+        tm.assert_frame_equal(df, expected)
 
     def test_should_properly_handle_arbitrary_timestamp(self, project_id):
         query = 'SELECT TIMESTAMP("2004-09-15 05:00:00") AS valid_timestamp'
@@ -326,13 +328,15 @@ def test_should_properly_handle_arbitrary_timestamp(self, project_id):
             credentials=self.credentials,
             dialect="legacy",
         )
-        tm.assert_frame_equal(
-            df,
-            DataFrame(
-                {"valid_timestamp": ["2004-09-15T05:00:00.000000Z"]},
-                dtype="datetime64[ns]",
-            ),
+        expected = DataFrame(
+            {"valid_timestamp": ["2004-09-15T05:00:00.000000Z"]},
+            dtype="datetime64[ns]",
         )
+        if expected["valid_timestamp"].dt.tz is None:
+            expected["valid_timestamp"] = expected[
+                "valid_timestamp"
+            ].dt.tz_localize("UTC")
+        tm.assert_frame_equal(df, expected)
 
     def test_should_properly_handle_datetime_unix_epoch(self, project_id):
         query = 'SELECT DATETIME("1970-01-01 00:00:00") AS unix_epoch'
@@ -368,7 +372,7 @@ def test_should_properly_handle_arbitrary_datetime(self, project_id):
         "expression, is_expected_dtype",
         [
             ("current_date()", pandas.api.types.is_datetime64_ns_dtype),
-            ("current_timestamp()", pandas.api.types.is_datetime64_ns_dtype),
+            ("current_timestamp()", pandas.api.types.is_datetime64tz_dtype),
             ("current_datetime()", pandas.api.types.is_datetime64_ns_dtype),
             ("TRUE", pandas.api.types.is_bool_dtype),
             ("FALSE", pandas.api.types.is_bool_dtype),
@@ -402,9 +406,11 @@ def test_should_properly_handle_null_timestamp(self, project_id):
             credentials=self.credentials,
             dialect="legacy",
         )
-        tm.assert_frame_equal(
-            df, DataFrame({"null_timestamp": [NaT]}, dtype="datetime64[ns]")
+        expected = DataFrame({"null_timestamp": [NaT]}, dtype="datetime64[ns]")
+        expected["null_timestamp"] = expected["null_timestamp"].dt.tz_localize(
+            "UTC"
         )
+        tm.assert_frame_equal(df, expected)
 
     def test_should_properly_handle_null_datetime(self, project_id):
         query = "SELECT CAST(NULL AS DATETIME) AS null_datetime"
@@ -594,6 +600,7 @@ def test_zero_rows(self, project_id):
         expected_result = DataFrame(
             empty_columns, columns=["title", "id", "is_bot", "ts"]
         )
+        expected_result["ts"] = expected_result["ts"].dt.tz_localize("UTC")
         tm.assert_frame_equal(df, expected_result, check_index_type=False)
 
     def test_one_row_one_column(self, project_id):
diff --git a/tests/unit/test_gbq.py b/tests/unit/test_gbq.py
index 3a047741..6956be20 100644
--- a/tests/unit/test_gbq.py
+++ b/tests/unit/test_gbq.py
@@ -1,21 +1,26 @@
 # -*- coding: utf-8 -*-
 
-import pandas.util.testing as tm
-import pytest
+try:
+    import mock
+except ImportError:  # pragma: NO COVER
+    from unittest import mock
+
 import numpy
 from pandas import DataFrame
+import pandas.util.testing as tm
+import pkg_resources
+import pytest
 
 import pandas_gbq.exceptions
 from pandas_gbq import gbq
 
-try:
-    import mock
-except ImportError:  # pragma: NO COVER
-    from unittest import mock
 
 pytestmark = pytest.mark.filter_warnings(
     "ignore:credentials from Google Cloud SDK"
 )
+pandas_installed_version = pkg_resources.get_distribution(
+    "pandas"
+).parsed_version
 
 
 @pytest.fixture
@@ -90,6 +95,7 @@ def no_auth(monkeypatch):
         ("INTEGER", None),  # Can't handle NULL
         ("BOOLEAN", None),  # Can't handle NULL
         ("FLOAT", numpy.dtype(float)),
+        # TIMESTAMP will be localized after DataFrame construction.
         ("TIMESTAMP", "datetime64[ns]"),
         ("DATETIME", "datetime64[ns]"),
     ],
@@ -200,6 +206,10 @@ def test_to_gbq_with_verbose_old_pandas_no_warnings(recwarn, min_bq_version):
     assert len(recwarn) == 0
 
 
+@pytest.mark.skipif(
+    pandas_installed_version < pkg_resources.parse_version("0.24.0"),
+    reason="Requires pandas 0.24+",
+)
 def test_to_gbq_with_private_key_new_pandas_warns_deprecation(
     min_bq_version, monkeypatch
 ):
@@ -413,6 +423,10 @@ def test_read_gbq_with_verbose_old_pandas_no_warnings(recwarn, min_bq_version):
     assert len(recwarn) == 0
 
 
+@pytest.mark.skipif(
+    pandas_installed_version < pkg_resources.parse_version("0.24.0"),
+    reason="Requires pandas 0.24+",
+)
 def test_read_gbq_with_private_key_new_pandas_warns_deprecation(
     min_bq_version, monkeypatch
 ):
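The new ``skipif`` guards above gate the deprecation tests on the installed
pandas version. The same check can be exercised on its own (a minimal sketch
using the version-comparison calls from the patch):

.. code-block:: python

    import pkg_resources

    # tz-aware dtypes in DataFrame construction require pandas 0.24.0+.
    pandas_installed_version = pkg_resources.get_distribution(
        "pandas"
    ).parsed_version

    if pandas_installed_version < pkg_resources.parse_version("0.24.0"):
        print("pandas older than 0.24.0; tz-aware construction unsupported")
    else:
        print("pandas 0.24+ detected")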