From d5753efecefd6699ac0496704b9d5e8c0d682c17 Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Wed, 12 Feb 2025 22:17:30 +0000 Subject: [PATCH 1/7] feat: use JSONArrowType for JSON data --- bigframes/core/compile/ibis_types.py | 2 +- bigframes/dtypes.py | 2 +- bigframes/session/_io/pandas.py | 3 --- tests/system/small/test_dataframe_io.py | 10 +++++----- tests/system/small/test_series.py | 2 +- 5 files changed, 8 insertions(+), 11 deletions(-) diff --git a/bigframes/core/compile/ibis_types.py b/bigframes/core/compile/ibis_types.py index 54a5a37736..54b0a1408a 100644 --- a/bigframes/core/compile/ibis_types.py +++ b/bigframes/core/compile/ibis_types.py @@ -75,7 +75,7 @@ IBIS_GEO_TYPE, gpd.array.GeometryDtype(), ), - (ibis_dtypes.json, db_dtypes.JSONDtype()), + (ibis_dtypes.json, pd.ArrowDtype(db_dtypes.JSONArrowType())), ) BIGFRAMES_TO_IBIS: Dict[bigframes.dtypes.Dtype, ibis_dtypes.DataType] = { diff --git a/bigframes/dtypes.py b/bigframes/dtypes.py index 5e9f1f108b..5f462222e0 100644 --- a/bigframes/dtypes.py +++ b/bigframes/dtypes.py @@ -62,7 +62,7 @@ # No arrow equivalent GEO_DTYPE = gpd.array.GeometryDtype() # JSON -JSON_DTYPE = db_dtypes.JSONDtype() +JSON_DTYPE = pd.ArrowDtype(db_dtypes.JSONArrowType()) OBJ_REF_DTYPE = pd.ArrowDtype( pa.struct( ( diff --git a/bigframes/session/_io/pandas.py b/bigframes/session/_io/pandas.py index a1549238b3..ca70ee774c 100644 --- a/bigframes/session/_io/pandas.py +++ b/bigframes/session/_io/pandas.py @@ -18,7 +18,6 @@ from typing import Collection, Union import bigframes_vendored.constants as constants -import db_dtypes # type: ignore import geopandas # type: ignore import numpy as np import pandas @@ -125,8 +124,6 @@ def arrow_to_pandas( ) elif isinstance(dtype, pandas.ArrowDtype): series = _arrow_to_pandas_arrowdtype(column, dtype) - elif isinstance(dtype, db_dtypes.JSONDtype): - series = db_dtypes.JSONArray(column) else: series = column.to_pandas(types_mapper=lambda _: dtype) diff --git 
a/tests/system/small/test_dataframe_io.py b/tests/system/small/test_dataframe_io.py index 4758c2d5b4..bfa920e940 100644 --- a/tests/system/small/test_dataframe_io.py +++ b/tests/system/small/test_dataframe_io.py @@ -304,7 +304,7 @@ def test_load_json_w_unboxed_py_value(session): """ df = session.read_gbq(sql, index_col="id") - assert df.dtypes["json_col"] == db_dtypes.JSONDtype() + assert df.dtypes["json_col"] == pd.ArrowDtype(db_dtypes.JSONArrowType()) assert isinstance(df["json_col"][0], dict) assert df["json_col"][0]["boolean"] @@ -321,13 +321,13 @@ def test_load_json_w_unboxed_py_value(session): def test_load_json_to_pandas_has_correct_result(session): df = session.read_gbq("SELECT JSON_OBJECT('foo', 10, 'bar', TRUE) AS json_col") - assert df.dtypes["json_col"] == db_dtypes.JSONDtype() + assert df.dtypes["json_col"] == pd.ArrowDtype(db_dtypes.JSONArrowType()) result = df.to_pandas() # The order of keys within the JSON object shouldn't matter for equality checks. pd_df = pd.DataFrame( {"json_col": [{"bar": True, "foo": 10}]}, - dtype=db_dtypes.JSONDtype(), + dtype=pd.ArrowDtype(db_dtypes.JSONArrowType()), ) pd_df.index = pd_df.index.astype("Int64") pd.testing.assert_series_equal(result.dtypes, pd_df.dtypes) @@ -365,7 +365,7 @@ def test_load_json_in_struct(session): assert isinstance(df.dtypes["struct_col"].pyarrow_dtype, pa.StructType) data = df["struct_col"].struct.field("data") - assert data.dtype == db_dtypes.JSONDtype() + assert data.dtype == pd.ArrowDtype(db_dtypes.JSONArrowType()) assert data[0]["boolean"] assert data[1]["int"] == 100 @@ -406,7 +406,7 @@ def test_load_json_in_array(session): data = df["array_col"].list assert data.len()[0] == 7 - assert data[0].dtype == db_dtypes.JSONDtype() + assert data[0].dtype == pd.ArrowDtype(db_dtypes.JSONArrowType()) assert data[0][0]["boolean"] assert data[1][0]["int"] == 100 diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index 980f2226b7..c1594983d1 100644 --- 
a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -383,7 +383,7 @@ def test_get_column(scalars_dfs, col_name, expected_dtype): def test_get_column_w_json(json_df, json_pandas_df): series = json_df["json_col"] series_pandas = series.to_pandas() - assert series.dtype == db_dtypes.JSONDtype() + assert series.dtype == pd.ArrowDtype(db_dtypes.JSONArrowType()) assert series_pandas.shape[0] == json_pandas_df.shape[0] From d80e641824a560200faaa2a3b4ae06bfa5ec78e9 Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Wed, 5 Mar 2025 22:19:17 +0000 Subject: [PATCH 2/7] fix related system tests --- bigframes/core/array_value.py | 4 +- setup.py | 3 +- tests/system/small/bigquery/test_json.py | 59 ++++++++++---------- tests/system/small/test_dataframe_io.py | 71 ++++++++++++------------ tests/system/small/test_series.py | 25 +++++---- tests/system/small/test_session.py | 24 ++++---- 6 files changed, 94 insertions(+), 92 deletions(-) diff --git a/bigframes/core/array_value.py b/bigframes/core/array_value.py index 9c44255941..431bc25464 100644 --- a/bigframes/core/array_value.py +++ b/bigframes/core/array_value.py @@ -108,8 +108,8 @@ def from_table( raise ValueError("must set at most one of 'offests', 'primary_key'") if any(i.field_type == "JSON" for i in table.schema if i.name in schema.names): msg = bfe.format_message( - "Interpreting JSON column(s) as the `db_dtypes.dbjson` extension type is " - "in preview; this behavior may change in future versions." + "JSON column interpretation as a custom PyArrow extention in `db_dtypes` " + "is a preview feature and subject to change." 
) warnings.warn(msg, bfe.PreviewWarning) # define data source only for needed columns, this makes row-hashing cheaper diff --git a/setup.py b/setup.py index 9ea563b3cb..cb61043375 100644 --- a/setup.py +++ b/setup.py @@ -60,7 +60,8 @@ "ipywidgets >=7.7.1", "humanize >=4.6.0", "matplotlib >=3.7.1", - "db-dtypes >=1.4.0", + "db-dtypes@ git+https://github.com/googleapis/python-db-dtypes-pandas.git@main", + # "db-dtypes >=1.4.0", # For vendored ibis-framework. "atpublic>=2.3,<6", "parsy>=2,<3", diff --git a/tests/system/small/bigquery/test_json.py b/tests/system/small/bigquery/test_json.py index 492c0cf9b6..bade725733 100644 --- a/tests/system/small/bigquery/test_json.py +++ b/tests/system/small/bigquery/test_json.py @@ -12,30 +12,29 @@ # See the License for the specific language governing permissions and # limitations under the License. -import db_dtypes # type: ignore import geopandas as gpd # type: ignore import pandas as pd import pyarrow as pa import pytest import bigframes.bigquery as bbq -import bigframes.dtypes +import bigframes.dtypes as dtypes import bigframes.pandas as bpd @pytest.mark.parametrize( ("json_path", "expected_json"), [ - pytest.param("$.a", [{"a": 10}], id="simple"), - pytest.param("$.a.b.c", [{"a": {"b": {"c": 10, "d": []}}}], id="nested"), + pytest.param("$.a", ['{"a": 10}'], id="simple"), + pytest.param("$.a.b.c", ['{"a": {"b": {"c": 10, "d": []}}}'], id="nested"), ], ) def test_json_set_at_json_path(json_path, expected_json): - original_json = [{"a": {"b": {"c": "tester", "d": []}}}] - s = bpd.Series(original_json, dtype=db_dtypes.JSONDtype()) + original_json = ['{"a": {"b": {"c": "tester", "d": []}}}'] + s = bpd.Series(original_json, dtype=dtypes.JSON_DTYPE) actual = bbq.json_set(s, json_path_value_pairs=[(json_path, 10)]) - expected = bpd.Series(expected_json, dtype=db_dtypes.JSONDtype()) + expected = bpd.Series(expected_json, dtype=dtypes.JSON_DTYPE) pd.testing.assert_series_equal( actual.to_pandas(), expected.to_pandas(), @@ -45,18 
+44,20 @@ def test_json_set_at_json_path(json_path, expected_json): @pytest.mark.parametrize( ("json_value", "expected_json"), [ - pytest.param(10, [{"a": {"b": 10}}, {"a": {"b": 10}}], id="int"), - pytest.param(0.333, [{"a": {"b": 0.333}}, {"a": {"b": 0.333}}], id="float"), - pytest.param("eng", [{"a": {"b": "eng"}}, {"a": {"b": "eng"}}], id="string"), - pytest.param([1, 2], [{"a": {"b": 1}}, {"a": {"b": 2}}], id="series"), + pytest.param(10, ['{"a": {"b": 10}}', '{"a": {"b": 10}}'], id="int"), + pytest.param(0.333, ['{"a": {"b": 0.333}}', '{"a": {"b": 0.333}}'], id="float"), + pytest.param( + "eng", ['{"a": {"b": "eng"}}', '{"a": {"b": "eng"}}'], id="string" + ), + pytest.param([1, 2], ['{"a": {"b": 1}}', '{"a": {"b": 2}}'], id="series"), ], ) def test_json_set_at_json_value_type(json_value, expected_json): - original_json = [{"a": {"b": "dev"}}, {"a": {"b": [1, 2]}}] - s = bpd.Series(original_json, dtype=db_dtypes.JSONDtype()) + original_json = ['{"a": {"b": "dev"}}', '{"a": {"b": [1, 2]}}'] + s = bpd.Series(original_json, dtype=dtypes.JSON_DTYPE) actual = bbq.json_set(s, json_path_value_pairs=[("$.a.b", json_value)]) - expected = bpd.Series(expected_json, dtype=db_dtypes.JSONDtype()) + expected = bpd.Series(expected_json, dtype=dtypes.JSON_DTYPE) pd.testing.assert_series_equal( actual.to_pandas(), expected.to_pandas(), @@ -64,14 +65,14 @@ def test_json_set_at_json_value_type(json_value, expected_json): def test_json_set_w_more_pairs(): - original_json = [{"a": 2}, {"b": 5}, {"c": 1}] - s = bpd.Series(original_json, dtype=db_dtypes.JSONDtype()) + original_json = ['{"a": 2}', '{"b": 5}', '{"c": 1}'] + s = bpd.Series(original_json, dtype=dtypes.JSON_DTYPE) actual = bbq.json_set( s, json_path_value_pairs=[("$.a", 1), ("$.b", 2), ("$.a", [3, 4, 5])] ) - expected_json = [{"a": 3, "b": 2}, {"a": 4, "b": 2}, {"a": 5, "b": 2, "c": 1}] - expected = bpd.Series(expected_json, dtype=db_dtypes.JSONDtype()) + expected_json = ['{"a": 3, "b": 2}', '{"a": 4, "b": 2}', '{"a": 5, 
"b": 2, "c": 1}'] + expected = bpd.Series(expected_json, dtype=dtypes.JSON_DTYPE) pd.testing.assert_series_equal( actual.to_pandas(), expected.to_pandas(), @@ -79,13 +80,13 @@ def test_json_set_w_more_pairs(): def test_json_set_w_invalid_json_path_value_pairs(): - s = bpd.Series([{"a": 10}], dtype=db_dtypes.JSONDtype()) + s = bpd.Series(['{"a": 10}'], dtype=dtypes.JSON_DTYPE) with pytest.raises(ValueError): bbq.json_set(s, json_path_value_pairs=[("$.a", 1, 100)]) # type: ignore def test_json_set_w_invalid_value_type(): - s = bpd.Series([{"a": 10}], dtype=db_dtypes.JSONDtype()) + s = bpd.Series(['{"a": 10}'], dtype=dtypes.JSON_DTYPE) with pytest.raises(TypeError): bbq.json_set( s, @@ -101,17 +102,18 @@ def test_json_set_w_invalid_value_type(): def test_json_set_w_invalid_series_type(): + s = bpd.Series([1, 2]) with pytest.raises(TypeError): - bbq.json_set(bpd.Series([1, 2]), json_path_value_pairs=[("$.a", 1)]) + bbq.json_set(s, json_path_value_pairs=[("$.a", 1)]) def test_json_extract_from_json(): s = bpd.Series( - [{"a": {"b": [1, 2]}}, {"a": {"c": 1}}, {"a": {"b": 0}}], - dtype=db_dtypes.JSONDtype(), + ['{"a": {"b": [1, 2]}}', '{"a": {"c": 1}}', '{"a": {"b": 0}}'], + dtype=dtypes.JSON_DTYPE, ) actual = bbq.json_extract(s, "$.a.b").to_pandas() - expected = bpd.Series([[1, 2], None, 0], dtype=db_dtypes.JSONDtype()).to_pandas() + expected = bpd.Series(["[1, 2]", None, "0"], dtype=dtypes.JSON_DTYPE).to_pandas() pd.testing.assert_series_equal( actual, expected, @@ -132,14 +134,15 @@ def test_json_extract_from_string(): def test_json_extract_w_invalid_series_type(): + s = bpd.Series([1, 2]) with pytest.raises(TypeError): - bbq.json_extract(bpd.Series([1, 2]), "$.a") + bbq.json_extract(s, "$.a") def test_json_extract_array_from_json(): s = bpd.Series( - [{"a": ["ab", "2", "3 xy"]}, {"a": []}, {"a": ["4", "5"]}, {}], - dtype=db_dtypes.JSONDtype(), + ['{"a": ["ab", "2", "3 xy"]}', '{"a": []}', '{"a": ["4", "5"]}', "{}"], + dtype=dtypes.JSON_DTYPE, ) actual = 
bbq.json_extract_array(s, "$.a") @@ -225,7 +228,7 @@ def test_json_extract_string_array_from_array_strings(): def test_json_extract_string_array_as_float_array_from_array_strings(): s = bpd.Series(["[1, 2.5, 3]", "[]", "[4,5]"]) - actual = bbq.json_extract_string_array(s, value_dtype=bigframes.dtypes.FLOAT_DTYPE) + actual = bbq.json_extract_string_array(s, value_dtype=dtypes.FLOAT_DTYPE) expected = bpd.Series([[1, 2.5, 3], [], [4, 5]]) pd.testing.assert_series_equal( actual.to_pandas(), diff --git a/tests/system/small/test_dataframe_io.py b/tests/system/small/test_dataframe_io.py index bfa920e940..149821a741 100644 --- a/tests/system/small/test_dataframe_io.py +++ b/tests/system/small/test_dataframe_io.py @@ -22,6 +22,7 @@ import pyarrow as pa import pytest +import bigframes.dtypes as dtypes from tests.system import utils try: @@ -278,7 +279,7 @@ def test_to_arrow_override_global_option(scalars_df_index): assert scalars_df_index._query_job.destination.table_id == table_id -def test_load_json_w_unboxed_py_value(session): +def test_load_json_w_json_string_items(session): sql = """ SELECT 0 AS id, JSON_OBJECT('boolean', True) AS json_col, UNION ALL @@ -292,31 +293,32 @@ def test_load_json_w_unboxed_py_value(session): UNION ALL SELECT 5, JSON_OBJECT('null', null), UNION ALL + SELECT 6, JSON_OBJECT('b', 2, 'a', 1), + UNION ALL SELECT - 6, + 7, JSON_OBJECT( 'dict', JSON_OBJECT( 'int', 1, - 'array', [JSON_OBJECT('bar', 'hello'), JSON_OBJECT('foo', 1)] + 'array', [JSON_OBJECT('foo', 1), JSON_OBJECT('bar', 'hello')] ) ), """ df = session.read_gbq(sql, index_col="id") assert df.dtypes["json_col"] == pd.ArrowDtype(db_dtypes.JSONArrowType()) - assert isinstance(df["json_col"][0], dict) - assert df["json_col"][0]["boolean"] - assert df["json_col"][1]["int"] == 100 - assert math.isclose(df["json_col"][2]["float"], 0.98) - assert df["json_col"][3]["string"] == "hello world" - assert df["json_col"][4]["array"] == [8, 9, 10] - assert df["json_col"][5]["null"] is None - assert 
df["json_col"][6]["dict"] == { - "int": 1, - "array": [{"bar": "hello"}, {"foo": 1}], - } + assert df["json_col"][0] == '{"boolean":true}' + assert df["json_col"][1] == '{"int":100}' + assert df["json_col"][2] == '{"float":0.98}' + assert df["json_col"][3] == '{"string":"hello world"}' + assert df["json_col"][4] == '{"array":[8,9,10]}' + assert df["json_col"][5] == '{"null":null}' + + # Verifies JSON strings preserve array order, regardless of dictionary key order. + assert df["json_col"][6] == '{"a":1,"b":2}' + assert df["json_col"][7] == '{"dict":{"array":[{"bar":"hello"},{"foo":1}],"int":1}}' def test_load_json_to_pandas_has_correct_result(session): @@ -324,9 +326,9 @@ def test_load_json_to_pandas_has_correct_result(session): assert df.dtypes["json_col"] == pd.ArrowDtype(db_dtypes.JSONArrowType()) result = df.to_pandas() - # The order of keys within the JSON object shouldn't matter for equality checks. + # These JSON strings are compatible with BigQuery's JSON storage, pd_df = pd.DataFrame( - {"json_col": [{"bar": True, "foo": 10}]}, + {"json_col": ['{"bar":true,"foo":10}']}, dtype=pd.ArrowDtype(db_dtypes.JSONArrowType()), ) pd_df.index = pd_df.index.astype("Int64") @@ -367,16 +369,13 @@ def test_load_json_in_struct(session): data = df["struct_col"].struct.field("data") assert data.dtype == pd.ArrowDtype(db_dtypes.JSONArrowType()) - assert data[0]["boolean"] - assert data[1]["int"] == 100 - assert math.isclose(data[2]["float"], 0.98) - assert data[3]["string"] == "hello world" - assert data[4]["array"] == [8, 9, 10] - assert data[5]["null"] is None - assert data[6]["dict"] == { - "int": 1, - "array": [{"bar": "hello"}, {"foo": 1}], - } + assert data[0] == '{"boolean":true}' + assert data[1] == '{"int":100}' + assert data[2] == '{"float":0.98}' + assert data[3] == '{"string":"hello world"}' + assert data[4] == '{"array":[8,9,10]}' + assert data[5] == '{"null":null}' + assert data[6] == '{"dict":{"array":[{"bar":"hello"},{"foo":1}],"int":1}}' def 
test_load_json_in_array(session): @@ -408,16 +407,13 @@ def test_load_json_in_array(session): assert data.len()[0] == 7 assert data[0].dtype == pd.ArrowDtype(db_dtypes.JSONArrowType()) - assert data[0][0]["boolean"] - assert data[1][0]["int"] == 100 - assert math.isclose(data[2][0]["float"], 0.98) - assert data[3][0]["string"] == "hello world" - assert data[4][0]["array"] == [8, 9, 10] - assert data[5][0]["null"] is None - assert data[6][0]["dict"] == { - "int": 1, - "array": [{"bar": "hello"}, {"foo": 1}], - } + assert data[0][0] == '{"boolean":true}' + assert data[1][0] == '{"int":100}' + assert data[2][0] == '{"float":0.98}' + assert data[3][0] == '{"string":"hello world"}' + assert data[4][0] == '{"array":[8,9,10]}' + assert data[5][0] == '{"null":null}' + assert data[6][0] == '{"dict":{"array":[{"bar":"hello"},{"foo":1}],"int":1}}' def test_to_pandas_batches_w_correct_dtypes(scalars_df_default_index): @@ -691,7 +687,8 @@ def test_to_gbq_w_json(bigquery_client): """Test the `to_gbq` API can get a JSON column.""" s1 = bpd.Series([1, 2, 3, 4]) s2 = bpd.Series( - ["a", 1, False, ["a", {"b": 1}], {"c": [1, 2, 3]}], dtype=db_dtypes.JSONDtype() + ['"a"', "1", "false", '["a", {"b": 1}]', '{"c": [1, 2, 3]}'], + dtype=dtypes.JSON_DTYPE, ) df = bpd.DataFrame({"id": s1, "json_col": s2}) diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index c1594983d1..7df115d5f7 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -26,6 +26,7 @@ import pytest import shapely # type: ignore +import bigframes.dtypes as dtypes import bigframes.features import bigframes.pandas import bigframes.series as series @@ -304,22 +305,22 @@ def test_series_construct_w_dtype_for_array_struct(): def test_series_construct_w_dtype_for_json(): data = [ - 1, - "str", - False, - ["a", {"b": 1}, None], + "1", + '"str"', + "false", + '["a", {"b": 1}, null]', None, - {"a": {"b": [1, 2, 3], "c": True}}, + '{"a": {"b": [1, 2, 3], "c": true}}', 
] - s = bigframes.pandas.Series(data, dtype=db_dtypes.JSONDtype()) + s = bigframes.pandas.Series(data, dtype=dtypes.JSON_DTYPE) - assert s[0] == 1 - assert s[1] == "str" - assert s[2] is False - assert s[3][0] == "a" - assert s[3][1]["b"] == 1 + assert s[0] == "1" + assert s[1] == '"str"' + assert s[2] == "false" + # TODO: check old branch results for null. + assert s[3] == '["a",{"b":1},null]' assert pd.isna(s[4]) - assert s[5]["a"] == {"b": [1, 2, 3], "c": True} + assert s[5] == '{"a":{"b":[1,2,3],"c":true}}' def test_series_keys(scalars_dfs): diff --git a/tests/system/small/test_session.py b/tests/system/small/test_session.py index 0c8da52774..c1a16492cc 100644 --- a/tests/system/small/test_session.py +++ b/tests/system/small/test_session.py @@ -759,13 +759,13 @@ def test_read_pandas_timedelta_index(session, write_engine): ) def test_read_pandas_json_dataframes(session, write_engine): json_data = [ - 1, + "1", None, - ["1", "3", "5"], - {"a": 1, "b": ["x", "y"], "c": {"z": False, "x": []}}, + '["1","3","5"]', + '{"a":1,"b":["x","y"],"c":{"x":[],"z":false}}', ] expected_df = pd.DataFrame( - {"my_col": pd.Series(json_data, dtype=db_dtypes.JSONDtype())} + {"my_col": pd.Series(json_data, dtype=bigframes.dtypes.JSON_DTYPE)} ) actual_result = session.read_pandas( @@ -783,12 +783,12 @@ def test_read_pandas_json_dataframes(session, write_engine): ) def test_read_pandas_json_series(session, write_engine): json_data = [ - 1, + "1", None, - ["1", "3", "5"], - {"a": 1, "b": ["x", "y"], "c": {"z": False, "x": []}}, + '["1","3","5"]', + '{"a":1,"b":["x","y"],"c":{"x":[],"z":false}}', ] - expected_series = pd.Series(json_data, dtype=db_dtypes.JSONDtype()) + expected_series = pd.Series(json_data, dtype=bigframes.dtypes.JSON_DTYPE) actual_result = session.read_pandas( expected_series, write_engine=write_engine @@ -807,12 +807,12 @@ def test_read_pandas_json_series(session, write_engine): ) def test_read_pandas_json_index(session, write_engine): json_data = [ - 1, + "1", None, - 
["1", "3", "5"], - {"a": 1, "b": ["x", "y"], "c": {"z": False, "x": []}}, + '["1","3","5"]', + '{"a":1,"b":["x","y"],"c":{"x":[],"z":false}}', ] - expected_index = pd.Index(json_data, dtype=db_dtypes.JSONDtype()) + expected_index: pd.Index = pd.Index(json_data, dtype=bigframes.dtypes.JSON_DTYPE) actual_result = session.read_pandas( expected_index, write_engine=write_engine ).to_pandas() From 416a2268a18639296b9417042de48a409d551722 Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Wed, 5 Mar 2025 23:09:50 +0000 Subject: [PATCH 3/7] fixes lint and doctest --- bigframes/bigquery/_operations/json.py | 4 ++-- bigframes/dtypes.py | 1 - tests/system/small/test_dataframe_io.py | 2 -- tests/system/small/test_session.py | 1 - 4 files changed, 2 insertions(+), 6 deletions(-) diff --git a/bigframes/bigquery/_operations/json.py b/bigframes/bigquery/_operations/json.py index 0223811ebc..07efc5fa51 100644 --- a/bigframes/bigquery/_operations/json.py +++ b/bigframes/bigquery/_operations/json.py @@ -53,7 +53,7 @@ def json_set( >>> s = bpd.read_gbq("SELECT JSON '{\\\"a\\\": 1}' AS data")["data"] >>> bbq.json_set(s, json_path_value_pairs=[("$.a", 100), ("$.b", "hi")]) 0 {"a":100,"b":"hi"} - Name: data, dtype: dbjson + Name: data, dtype: extension<dbjson<JSONArrowType>>[pyarrow] Args: input (bigframes.series.Series): @@ -253,7 +253,7 @@ def parse_json( dtype: string >>> bbq.parse_json(s) 0 {"class":{"students":[{"id":5},{"id":12}]}} - dtype: dbjson + dtype: extension<dbjson<JSONArrowType>>[pyarrow] Args: input (bigframes.series.Series): diff --git a/bigframes/dtypes.py b/bigframes/dtypes.py index 5f462222e0..6b17fe0736 100644 --- a/bigframes/dtypes.py +++ b/bigframes/dtypes.py @@ -301,7 +301,6 @@ def is_object_like(type_: Union[ExpressionType, str]) -> bool: return type_ in ("object", "O") or ( getattr(type_, "kind", None) == "O" and getattr(type_, "storage", None) != "pyarrow" - and getattr(type_, "name", None) != "dbjson" ) diff --git a/tests/system/small/test_dataframe_io.py b/tests/system/small/test_dataframe_io.py index 
149821a741..16c3d19801 100644 --- a/tests/system/small/test_dataframe_io.py +++ b/tests/system/small/test_dataframe_io.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -import math from typing import Tuple import db_dtypes # type:ignore @@ -36,7 +35,6 @@ from google.cloud import bigquery import bigframes -from bigframes import dtypes import bigframes.dataframe import bigframes.features import bigframes.pandas as bpd diff --git a/tests/system/small/test_session.py b/tests/system/small/test_session.py index c1a16492cc..4b7495694b 100644 --- a/tests/system/small/test_session.py +++ b/tests/system/small/test_session.py @@ -22,7 +22,6 @@ import warnings import bigframes_vendored.pandas.io.gbq as vendored_pandas_gbq -import db_dtypes # type: ignore import google import google.cloud.bigquery as bigquery import numpy as np From 4dd298532d21244f105d1e6b25cbb1b199f2ab29 Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Thu, 6 Mar 2025 00:00:57 +0000 Subject: [PATCH 4/7] switch db_dtypes into 1.4.2 --- setup.py | 3 +-- testing/constraints-3.9.txt | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/setup.py b/setup.py index cb61043375..34e013c9a3 100644 --- a/setup.py +++ b/setup.py @@ -60,8 +60,7 @@ "ipywidgets >=7.7.1", "humanize >=4.6.0", "matplotlib >=3.7.1", - "db-dtypes@ git+https://github.com/googleapis/python-db-dtypes-pandas.git@main", - # "db-dtypes >=1.4.0", + "db-dtypes >=1.4.2", # For vendored ibis-framework. "atpublic>=2.3,<6", "parsy>=2,<3", diff --git a/testing/constraints-3.9.txt b/testing/constraints-3.9.txt index b355e0915b..8c7c69efa7 100644 --- a/testing/constraints-3.9.txt +++ b/testing/constraints-3.9.txt @@ -25,7 +25,7 @@ tabulate==0.9 ipywidgets==7.7.1 humanize==4.6.0 matplotlib==3.7.1 -db-dtypes==1.4.0 +db-dtypes==1.4.2 # For vendored ibis-framework. 
atpublic==2.3 parsy==2.0 From fe263bbbde49b3cb7ca989f8067f7d8fb9e1df6b Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Thu, 6 Mar 2025 05:47:51 +0000 Subject: [PATCH 5/7] fix tests --- tests/system/small/test_dataframe_io.py | 6 +++--- tests/system/small/test_series.py | 1 - 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/tests/system/small/test_dataframe_io.py b/tests/system/small/test_dataframe_io.py index 16c3d19801..e80668939a 100644 --- a/tests/system/small/test_dataframe_io.py +++ b/tests/system/small/test_dataframe_io.py @@ -316,7 +316,7 @@ def test_load_json_w_json_string_items(session): # Verifies JSON strings preserve array order, regardless of dictionary key order. assert df["json_col"][6] == '{"a":1,"b":2}' - assert df["json_col"][7] == '{"dict":{"array":[{"bar":"hello"},{"foo":1}],"int":1}}' + assert df["json_col"][7] == '{"dict":{"array":[{"foo":1},{"bar":"hello"}],"int":1}}' def test_load_json_to_pandas_has_correct_result(session): @@ -355,7 +355,7 @@ def test_load_json_in_struct(session): 'dict', JSON_OBJECT( 'int', 1, - 'array', [JSON_OBJECT('bar', 'hello'), JSON_OBJECT('foo', 1)] + 'array', [JSON_OBJECT('foo', 1), JSON_OBJECT('bar', 'hello')] ) ), 7), """ @@ -373,7 +373,7 @@ def test_load_json_in_struct(session): assert data[3] == '{"string":"hello world"}' assert data[4] == '{"array":[8,9,10]}' assert data[5] == '{"null":null}' - assert data[6] == '{"dict":{"array":[{"bar":"hello"},{"foo":1}],"int":1}}' + assert data[6] == '{"dict":{"array":[{"foo":1},{"bar":"hello"}],"int":1}}' def test_load_json_in_array(session): diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index 7df115d5f7..d62af962fc 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -317,7 +317,6 @@ def test_series_construct_w_dtype_for_json(): assert s[0] == "1" assert s[1] == '"str"' assert s[2] == "false" - # TODO: check old branch results for null. 
assert s[3] == '["a",{"b":1},null]' assert pd.isna(s[4]) assert s[5] == '{"a":{"b":[1,2,3],"c":true}}' From a2edcbf650e9874116d8e642fa658d012c38226c Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Thu, 6 Mar 2025 23:55:13 +0000 Subject: [PATCH 6/7] fix test_df_drop_duplicates_w_json --- bigframes/core/array_value.py | 2 +- tests/system/small/test_dataframe.py | 13 ++++++++++++- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/bigframes/core/array_value.py b/bigframes/core/array_value.py index 431bc25464..7ede7b7e65 100644 --- a/bigframes/core/array_value.py +++ b/bigframes/core/array_value.py @@ -108,7 +108,7 @@ def from_table( raise ValueError("must set at most one of 'offests', 'primary_key'") if any(i.field_type == "JSON" for i in table.schema if i.name in schema.names): msg = bfe.format_message( - "JSON column interpretation as a custom PyArrow extention in `db_dtypes` " + "JSON column interpretation as a custom PyArrow extention in `db_dtypes` " "is a preview feature and subject to change." ) warnings.warn(msg, bfe.PreviewWarning) diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index f80b811217..9415f9657e 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -30,6 +30,7 @@ import bigframes._config.display_options as display_options import bigframes.core.indexes as bf_indexes import bigframes.dataframe as dataframe +import bigframes.dtypes as dtypes import bigframes.pandas as bpd import bigframes.series as series from tests.system.utils import ( @@ -4584,7 +4585,17 @@ def test_df_drop_duplicates(scalars_df_index, scalars_pandas_df_index, keep, sub ) def test_df_drop_duplicates_w_json(json_df, keep): bf_df = json_df.drop_duplicates(keep=keep).to_pandas() - pd_df = json_df.to_pandas().drop_duplicates(keep=keep) + + # drop_duplicates relies on pa.compute.dictionary_encode, which is incompatible + # with Arrow string extension types. 
Temporary conversion to standard Pandas + # strings is required. + json_pandas_df = json_df.to_pandas() + json_pandas_df["json_col"] = json_pandas_df["json_col"].astype( + pd.StringDtype(storage="pyarrow") + ) + + pd_df = json_pandas_df.drop_duplicates(keep=keep) + pd_df["json_col"] = pd_df["json_col"].astype(dtypes.JSON_DTYPE) pd.testing.assert_frame_equal( pd_df, bf_df, From d30c4b56dcd7a390e0019f8b4d25e1bc2a8f6c52 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=29?= Date: Mon, 10 Mar 2025 16:15:48 -0500 Subject: [PATCH 7/7] commit suggestion --- bigframes/dtypes.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/bigframes/dtypes.py b/bigframes/dtypes.py index 6b17fe0736..22cc521e8e 100644 --- a/bigframes/dtypes.py +++ b/bigframes/dtypes.py @@ -62,7 +62,9 @@ # No arrow equivalent GEO_DTYPE = gpd.array.GeometryDtype() # JSON -JSON_DTYPE = pd.ArrowDtype(db_dtypes.JSONArrowType()) +# TODO: switch to pyarrow.json_(pyarrow.string()) when available. +JSON_ARROW_TYPE = db_dtypes.JSONArrowType() +JSON_DTYPE = pd.ArrowDtype(JSON_ARROW_TYPE) OBJ_REF_DTYPE = pd.ArrowDtype( pa.struct( ( @@ -80,7 +82,7 @@ ), pa.field( "details", - db_dtypes.JSONArrowType(), + JSON_ARROW_TYPE, ), ) )