diff --git a/docs/reading.rst b/docs/reading.rst index aaecf9a0..e3e3dc5a 100644 --- a/docs/reading.rst +++ b/docs/reading.rst @@ -59,11 +59,13 @@ column, based on the BigQuery table schema. ================== ========================= BigQuery Data Type dtype ================== ========================= -FLOAT float -TIMESTAMP :class:`~pandas.DatetimeTZDtype` with ``unit='ns'`` and ``tz='UTC'`` +DATE datetime64[ns] DATETIME datetime64[ns] +BOOL boolean +FLOAT float +INT64 Int64 TIME datetime64[ns] -DATE datetime64[ns] +TIMESTAMP :class:`~pandas.DatetimeTZDtype` with ``unit='ns'`` and ``tz='UTC'`` ================== ========================= .. _reading-bqstorage-api: diff --git a/pandas_gbq/features.py b/pandas_gbq/features.py index fc8ef568..4259eaf1 100644 --- a/pandas_gbq/features.py +++ b/pandas_gbq/features.py @@ -10,6 +10,7 @@ BIGQUERY_BQSTORAGE_VERSION = "1.24.0" BIGQUERY_FROM_DATAFRAME_CSV_VERSION = "2.6.0" PANDAS_VERBOSITY_DEPRECATION_VERSION = "0.23.0" +PANDAS_BOOLEAN_DTYPE_VERSION = "1.0.0" PANDAS_PARQUET_LOSSLESS_TIMESTAMP_VERSION = "1.1.0" @@ -90,6 +91,13 @@ def pandas_has_deprecated_verbose(self): ) return self.pandas_installed_version >= pandas_verbosity_deprecation + @property + def pandas_has_boolean_dtype(self): + import pkg_resources + + desired_version = pkg_resources.parse_version(PANDAS_BOOLEAN_DTYPE_VERSION) + return self.pandas_installed_version >= desired_version + @property def pandas_has_parquet_with_lossless_timestamp(self): import pkg_resources diff --git a/pandas_gbq/gbq.py b/pandas_gbq/gbq.py index 87c2327c..a1ae2896 100644 --- a/pandas_gbq/gbq.py +++ b/pandas_gbq/gbq.py @@ -579,12 +579,13 @@ def _bqschema_to_nullsafe_dtypes(schema_fields): #missing-data-casting-rules-and-indexing """ # If you update this mapping, also update the table at - # `docs/source/reading.rst`. + # `docs/reading.rst`. dtype_map = { "DATE": "datetime64[ns]", "DATETIME": "datetime64[ns]", "FLOAT": np.dtype(float), "GEOMETRY": "object", + "INTEGER": "Int64", "RECORD": "object", "STRING": "object", # datetime.time objects cannot be case to datetime64. @@ -596,6 +597,10 @@ def _bqschema_to_nullsafe_dtypes(schema_fields): "TIMESTAMP": "datetime64[ns]", } + # Amend dtype_map with newer extension types if pandas version allows. + if FEATURES.pandas_has_boolean_dtype: + dtype_map["BOOLEAN"] = "boolean" + dtypes = {} for field in schema_fields: name = str(field["name"]) diff --git a/setup.py b/setup.py index 283e5ea8..4be5a722 100644 --- a/setup.py +++ b/setup.py @@ -32,7 +32,7 @@ "google-auth-oauthlib", # 2.4.* has a bug where waiting for the query can hang indefinitely. # https://github.com/pydata/pandas-gbq/issues/343 - "google-cloud-bigquery[bqstorage,pandas] >=1.11.1,<3.0.0dev,!=2.4.*", + "google-cloud-bigquery[bqstorage,pandas] >=1.11.1,<4.0.0dev,!=2.4.*", ] extras = { "tqdm": "tqdm>=4.23.0", diff --git a/tests/system/test_gbq.py b/tests/system/test_gbq.py index f268a85d..812d2089 100644 --- a/tests/system/test_gbq.py +++ b/tests/system/test_gbq.py @@ -10,7 +10,7 @@ import numpy as np import pandas import pandas.api.types -import pandas.util.testing as tm +import pandas.testing as tm from pandas import DataFrame, NaT try: @@ -21,6 +21,7 @@ import pytz from pandas_gbq import gbq +from pandas_gbq.features import FEATURES import pandas_gbq.schema @@ -32,6 +33,18 @@ def test_imports(): gbq._test_google_api_imports() +def make_mixed_dataframe_v1(): + # Re-implementation of private pandas.util.testing.makeMixedDataFrame + return pandas.DataFrame( + { + "A": [0.0, 1.0, 2.0, 3.0, 4.0], + "B": [0.0, 1.0, 0.0, 1.0, 0.0], + "C": ["foo1", "foo2", "foo3", "foo4", "foo5"], + "D": pandas.bdate_range("1/1/2009", periods=5), + } + ) + + def make_mixed_dataframe_v2(test_size): # create df to test for all BQ datatypes except RECORD bools = np.random.randint(2, size=(1, test_size)).astype(bool) @@ -168,7 +181,7 @@ def test_should_properly_handle_valid_integers(self, project_id): credentials=self.credentials, dialect="standard", ) - tm.assert_frame_equal(df, DataFrame({"valid_integer": [3]})) + tm.assert_frame_equal(df, DataFrame({"valid_integer": [3]}, dtype="Int64")) def test_should_properly_handle_nullable_integers(self, project_id): query = """SELECT * FROM @@ -194,7 +207,7 @@ def test_should_properly_handle_valid_longs(self, project_id): credentials=self.credentials, dialect="standard", ) - tm.assert_frame_equal(df, DataFrame({"valid_long": [1 << 62]})) + tm.assert_frame_equal(df, DataFrame({"valid_long": [1 << 62]}, dtype="Int64")) def test_should_properly_handle_nullable_longs(self, project_id): query = """SELECT * FROM @@ -433,7 +446,10 @@ def test_should_properly_handle_null_boolean(self, project_id): credentials=self.credentials, dialect="legacy", ) - tm.assert_frame_equal(df, DataFrame({"null_boolean": [None]})) + expected_dtype = "boolean" if FEATURES.pandas_has_boolean_dtype else None + tm.assert_frame_equal( + df, DataFrame({"null_boolean": [None]}, dtype=expected_dtype) + ) def test_should_properly_handle_nullable_booleans(self, project_id): query = """SELECT * FROM @@ -445,8 +461,9 @@ def test_should_properly_handle_nullable_booleans(self, project_id): credentials=self.credentials, dialect="legacy", ) + expected_dtype = "boolean" if FEATURES.pandas_has_boolean_dtype else None tm.assert_frame_equal( - df, DataFrame({"nullable_boolean": [True, None]}).astype(object) + df, DataFrame({"nullable_boolean": [True, None]}, dtype=expected_dtype) ) def test_unicode_string_conversion_and_normalization(self, project_id): @@ -629,7 +646,7 @@ def test_one_row_one_column(self, project_id): credentials=self.credentials, dialect="standard", ) - expected_result = DataFrame(dict(v=[3])) + expected_result = DataFrame(dict(v=[3]), dtype="Int64") tm.assert_frame_equal(df, expected_result) def test_legacy_sql(self, project_id): @@ -719,7 +736,7 @@ def test_query_with_parameters(self, project_id): configuration=config, dialect="legacy", ) - tm.assert_frame_equal(df, DataFrame({"valid_result": [3]})) + tm.assert_frame_equal(df, DataFrame({"valid_result": [3]}, dtype="Int64")) def test_query_inside_configuration(self, project_id): query_no_use = 'SELECT "PI_WRONG" AS valid_string' @@ -842,7 +859,11 @@ def test_struct(self, project_id): dialect="standard", ) expected = DataFrame( - [[1, {"letter": "a", "num": 1}]], columns=["int_field", "struct_field"], + { + "int_field": pandas.Series([1], dtype="Int64"), + "struct_field": [{"letter": "a", "num": 1}], + }, + columns=["int_field", "struct_field"], ) tm.assert_frame_equal(df, expected) @@ -874,7 +895,12 @@ def test_array_length_zero(self, project_id): dialect="standard", ) expected = DataFrame( - [["a", [""], 1], ["b", [], 0]], columns=["letter", "array_field", "len"], + { + "letter": ["a", "b"], + "array_field": [[""], []], + "len": pandas.Series([1, 0], dtype="Int64"), + }, + columns=["letter", "array_field", "len"], ) tm.assert_frame_equal(df, expected) @@ -908,7 +934,13 @@ def test_array_of_floats(self, project_id): credentials=self.credentials, dialect="standard", ) - tm.assert_frame_equal(df, DataFrame([[[1.1, 2.2, 3.3], 4]], columns=["a", "b"])) + tm.assert_frame_equal( + df, + DataFrame( + {"a": [[1.1, 2.2, 3.3]], "b": pandas.Series([4], dtype="Int64")}, + columns=["a", "b"], + ), + ) def test_tokyo(self, tokyo_dataset, tokyo_table, project_id): df = gbq.read_gbq( @@ -1021,7 +1053,7 @@ def test_upload_data_if_table_exists_append(self, project_id): test_id = "3" test_size = 10 df = make_mixed_dataframe_v2(test_size) - df_different_schema = tm.makeMixedDataFrame() + df_different_schema = make_mixed_dataframe_v1() # Initialize table with sample data gbq.to_gbq( @@ -1101,7 +1133,7 @@ def test_upload_data_if_table_exists_replace(self, project_id): test_id = "4" test_size = 10 df = make_mixed_dataframe_v2(test_size) - df_different_schema = tm.makeMixedDataFrame() + df_different_schema = make_mixed_dataframe_v1() # Initialize table with sample data gbq.to_gbq( @@ -1225,7 +1257,7 @@ def test_upload_data_with_newlines(self, project_id): result = result_df["s"].sort_values() expected = df["s"].sort_values() - tm.assert_numpy_array_equal(expected.values, result.values) + tm.assert_series_equal(expected, result) def test_upload_data_flexible_column_order(self, project_id): test_id = "13" @@ -1254,7 +1286,7 @@ def test_upload_data_flexible_column_order(self, project_id): def test_upload_data_with_valid_user_schema(self, project_id): # Issue #46; tests test scenarios with user-provided # schemas - df = tm.makeMixedDataFrame() + df = make_mixed_dataframe_v1() test_id = "18" test_schema = [ {"name": "A", "type": "FLOAT"}, @@ -1276,7 +1308,7 @@ def test_upload_data_with_valid_user_schema(self, project_id): ) def test_upload_data_with_invalid_user_schema_raises_error(self, project_id): - df = tm.makeMixedDataFrame() + df = make_mixed_dataframe_v1() test_id = "19" test_schema = [ {"name": "A", "type": "FLOAT"}, @@ -1295,7 +1327,7 @@ def test_upload_data_with_invalid_user_schema_raises_error(self, project_id): ) def test_upload_data_with_missing_schema_fields_raises_error(self, project_id): - df = tm.makeMixedDataFrame() + df = make_mixed_dataframe_v1() test_id = "20" test_schema = [ {"name": "A", "type": "FLOAT"}, @@ -1351,7 +1383,7 @@ def test_upload_data_with_timestamp(self, project_id): tm.assert_series_equal(expected, result) def test_upload_data_with_different_df_and_user_schema(self, project_id): - df = tm.makeMixedDataFrame() + df = make_mixed_dataframe_v1() df["A"] = df["A"].astype(str) df["B"] = df["B"].astype(str) test_id = "22" @@ -1460,13 +1492,13 @@ def test_dataset_does_not_exist(gbq_dataset, random_dataset_id): def test_create_table(gbq_table): - schema = gbq._generate_bq_schema(tm.makeMixedDataFrame()) + schema = gbq._generate_bq_schema(make_mixed_dataframe_v1()) gbq_table.create("test_create_table", schema) assert gbq_table.exists("test_create_table") def test_create_table_already_exists(gbq_table): - schema = gbq._generate_bq_schema(tm.makeMixedDataFrame()) + schema = gbq._generate_bq_schema(make_mixed_dataframe_v1()) gbq_table.create("test_create_table_exists", schema) with pytest.raises(gbq.TableCreationError): gbq_table.create("test_create_table_exists", schema) diff --git a/tests/unit/test_gbq.py b/tests/unit/test_gbq.py index 0a5ecad2..8784a98b 100644 --- a/tests/unit/test_gbq.py +++ b/tests/unit/test_gbq.py @@ -64,8 +64,8 @@ def no_auth(monkeypatch): @pytest.mark.parametrize( ("type_", "expected"), [ - ("INTEGER", None), # Can't handle NULL - ("BOOLEAN", None), # Can't handle NULL + ("SOME_NEW_UNKNOWN_TYPE", None), + ("INTEGER", "Int64"), ("FLOAT", numpy.dtype(float)), # TIMESTAMP will be localized after DataFrame construction. ("TIMESTAMP", "datetime64[ns]"),