From 59a719adbd9a0e8394fedcf2fd50f9179c096cd5 Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Tue, 20 May 2025 23:44:52 +0000 Subject: [PATCH 1/2] fix: support JSON and STRUCT for bbq.sql_scalar --- bigframes/bigquery/_operations/sql.py | 13 +-- bigframes/core/compile/sqlglot/sqlglot_ir.py | 7 ++ bigframes/dtypes.py | 27 ------ tests/system/small/bigquery/test_sql.py | 82 ++++++++++++++++++- .../test_compile_readlocal/out.sql | 12 +-- .../ibis/backends/bigquery/datatypes.py | 2 + 6 files changed, 102 insertions(+), 41 deletions(-) diff --git a/bigframes/bigquery/_operations/sql.py b/bigframes/bigquery/_operations/sql.py index 7ccf63fcda..e202251996 100644 --- a/bigframes/bigquery/_operations/sql.py +++ b/bigframes/bigquery/_operations/sql.py @@ -20,6 +20,7 @@ import google.cloud.bigquery +import bigframes.core.compile.sqlglot.sqlglot_ir as sqlglot_ir import bigframes.core.sql import bigframes.dataframe import bigframes.dtypes @@ -72,16 +73,16 @@ def sql_scalar( # Another benefit of this is that if there is a syntax error in the SQL # template, then this will fail with an error earlier in the process, # aiding users in debugging. - base_series = columns[0] - literals = [ - bigframes.dtypes.bigframes_dtype_to_literal(column.dtype) for column in columns + literals_sql = [ + sqlglot_ir._literal(column.values[0], column.dtype).sql(dialect="bigquery") + for column in columns ] - literals_sql = [bigframes.core.sql.simple_literal(literal) for literal in literals] + select_sql = sql_template.format(*literals_sql) + dry_run_sql = f"SELECT {select_sql}" # Use the executor directly, because we want the original column IDs, not # the user-friendly column names that block.to_sql_query() would produce. - select_sql = sql_template.format(*literals_sql) - dry_run_sql = f"SELECT {select_sql}" + base_series = columns[0] bqclient = base_series._session.bqclient job = bqclient.query( dry_run_sql, job_config=google.cloud.bigquery.QueryJobConfig(dry_run=True) diff --git a/bigframes/core/compile/sqlglot/sqlglot_ir.py b/bigframes/core/compile/sqlglot/sqlglot_ir.py index 935ad393f8..bd1d225d65 100644 --- a/bigframes/core/compile/sqlglot/sqlglot_ir.py +++ b/bigframes/core/compile/sqlglot/sqlglot_ir.py @@ -18,6 +18,7 @@ import typing from google.cloud import bigquery +import numpy as np import pyarrow as pa import sqlglot as sg import sqlglot.dialects.bigquery @@ -213,7 +214,11 @@ def _literal(value: typing.Any, dtype: dtypes.Dtype) -> sge.Expression: elif dtype == dtypes.BYTES_DTYPE: return _cast(str(value), sqlglot_type) elif dtypes.is_time_like(dtype): + if isinstance(value, np.generic): + value = value.item() return _cast(sge.convert(value.isoformat()), sqlglot_type) + elif dtype in (dtypes.NUMERIC_DTYPE, dtypes.BIGNUMERIC_DTYPE): + return _cast(sge.convert(value), sqlglot_type) elif dtypes.is_geo_like(dtype): wkt = value if isinstance(value, str) else to_wkt(value) return sge.func("ST_GEOGFROMTEXT", sge.convert(wkt)) @@ -234,6 +239,8 @@ def _literal(value: typing.Any, dtype: dtypes.Dtype) -> sge.Expression: ) return values if len(value) > 0 else _cast(values, sqlglot_type) else: + if isinstance(value, np.generic): + value = value.item() return sge.convert(value) diff --git a/bigframes/dtypes.py b/bigframes/dtypes.py index ec115a93d0..262fa9dde7 100644 --- a/bigframes/dtypes.py +++ b/bigframes/dtypes.py @@ -499,33 +499,6 @@ def bigframes_dtype_to_arrow_dtype( ) -def bigframes_dtype_to_literal( - bigframes_dtype: Dtype, -) -> Any: - """Create a representative literal value for a bigframes dtype. - - The inverse of infer_literal_type(). - """ - if isinstance(bigframes_dtype, pd.ArrowDtype): - arrow_type = bigframes_dtype.pyarrow_dtype - return arrow_type_to_literal(arrow_type) - - if isinstance(bigframes_dtype, pd.Float64Dtype): - return 1.0 - if isinstance(bigframes_dtype, pd.Int64Dtype): - return 1 - if isinstance(bigframes_dtype, pd.BooleanDtype): - return True - if isinstance(bigframes_dtype, pd.StringDtype): - return "string" - if isinstance(bigframes_dtype, gpd.array.GeometryDtype): - return shapely.geometry.Point((0, 0)) - - raise TypeError( - f"No literal conversion for {bigframes_dtype}. {constants.FEEDBACK_LINK}" - ) - - def arrow_type_to_literal( arrow_type: pa.DataType, ) -> Any: diff --git a/tests/system/small/bigquery/test_sql.py b/tests/system/small/bigquery/test_sql.py index 283624100a..66337b9dee 100644 --- a/tests/system/small/bigquery/test_sql.py +++ b/tests/system/small/bigquery/test_sql.py @@ -12,11 +12,16 @@ # See the License for the specific language governing permissions and # limitations under the License. -import bigframes.bigquery +import pandas as pd +import pyarrow as pa + +import bigframes.bigquery as bbq +import bigframes.dtypes as dtypes +import bigframes.pandas as bpd def test_sql_scalar_on_scalars_null_index(scalars_df_null_index): - series = bigframes.bigquery.sql_scalar( + series = bbq.sql_scalar( """ CAST({0} AS INT64) + BYTE_LENGTH({1}) @@ -48,3 +53,76 @@ def test_sql_scalar_on_scalars_null_index(scalars_df_null_index): ) result = series.to_pandas() assert len(result) == len(scalars_df_null_index) + + +def test_sql_scalar_w_bool_series(scalars_df_index): + series: bpd.Series = scalars_df_index["bool_col"] + result = bbq.sql_scalar("CAST({0} AS INT64)", [series]) + expected = series.astype(dtypes.INT_DTYPE) + expected.name = None + pd.testing.assert_series_equal(result.to_pandas(), expected.to_pandas()) + + +def test_sql_scalar_w_array_series(repeated_df): + result = bbq.sql_scalar( + """ + ARRAY_LENGTH({0}) + ARRAY_LENGTH({1}) + ARRAY_LENGTH({2}) + + ARRAY_LENGTH({3}) + ARRAY_LENGTH({4}) + ARRAY_LENGTH({5}) + + ARRAY_LENGTH({6}) + """, + [ + repeated_df["int_list_col"], + repeated_df["bool_list_col"], + repeated_df["float_list_col"], + repeated_df["date_list_col"], + repeated_df["date_time_list_col"], + repeated_df["numeric_list_col"], + repeated_df["string_list_col"], + ], + ) + + expected = ( + repeated_df["int_list_col"].list.len() + + repeated_df["bool_list_col"].list.len() + + repeated_df["float_list_col"].list.len() + + repeated_df["date_list_col"].list.len() + + repeated_df["date_time_list_col"].list.len() + + repeated_df["numeric_list_col"].list.len() + + repeated_df["string_list_col"].list.len() + ) + pd.testing.assert_series_equal(result.to_pandas(), expected.to_pandas()) + + +def test_sql_scalar_w_struct_series(nested_structs_df): + result = bbq.sql_scalar( + "CHAR_LENGTH({0}.name) + {0}.age", + [nested_structs_df["person"]], + ) + expected = nested_structs_df["person"].struct.field( + "name" + ).str.len() + nested_structs_df["person"].struct.field("age") + pd.testing.assert_series_equal(result.to_pandas(), expected.to_pandas()) + + +def test_sql_scalar_w_json_series(json_df): + result = bbq.sql_scalar( + """JSON_VALUE({0}, '$.int_value')""", + [ + json_df["json_col"], + ], + ) + expected = bbq.json_value(json_df["json_col"], "$.int_value") + expected.name = None + pd.testing.assert_series_equal(result.to_pandas(), expected.to_pandas()) + + +def test_sql_scalar_w_array_output(json_df): + result = bbq.sql_scalar( + """JSON_VALUE_ARRAY({0}, '$.order.items')""", + [ + json_df["json_col"], + ], + ) + assert len(result) == len(json_df) + assert result.dtype == pd.ArrowDtype(pa.list_(pa.string())) + assert result[15] == ["book", "pen"] diff --git a/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal/out.sql b/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal/out.sql index f04f9ed023..f73ef34051 100644 --- a/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal/out.sql +++ b/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal/out.sql @@ -10,7 +10,7 @@ WITH `bfcte_0` AS ( ST_GEOGFROMTEXT('POINT (-122.0838511 37.3860517)'), 123456789, 0, - 1.234567890, + CAST(1.234567890 AS NUMERIC), 1.25, 0, 0, @@ -27,7 +27,7 @@ WITH `bfcte_0` AS ( ST_GEOGFROMTEXT('POINT (-71.104 42.315)'), -987654321, 1, - 1.234567890, + CAST(1.234567890 AS NUMERIC), 2.51, 1, 1, @@ -44,7 +44,7 @@ WITH `bfcte_0` AS ( ST_GEOGFROMTEXT('POINT (-0.124474760143016 51.5007826749545)'), 314159, 0, - 101.101010100, + CAST(101.101010100 AS NUMERIC), 25000000000.0, 2, 2, @@ -95,7 +95,7 @@ WITH `bfcte_0` AS ( CAST(NULL AS GEOGRAPHY), 55555, 0, - 5.555555000, + CAST(5.555555000 AS NUMERIC), 555.555, 5, 5, @@ -112,7 +112,7 @@ WITH `bfcte_0` AS ( ST_GEOGFROMTEXT('LINESTRING (-0.127959 51.507728, -0.127026 51.507473)'), 101202303, 2, - -10.090807000, + CAST(-10.090807000 AS NUMERIC), -123.456, 6, 6, @@ -129,7 +129,7 @@ WITH `bfcte_0` AS ( CAST(NULL AS GEOGRAPHY), -214748367, 2, - 11111111.100000000, + CAST(11111111.100000000 AS NUMERIC), 42.42, 7, 7, diff --git a/third_party/bigframes_vendored/ibis/backends/bigquery/datatypes.py b/third_party/bigframes_vendored/ibis/backends/bigquery/datatypes.py index 5b4e4d85a1..fba0339ae9 100644 --- a/third_party/bigframes_vendored/ibis/backends/bigquery/datatypes.py +++ b/third_party/bigframes_vendored/ibis/backends/bigquery/datatypes.py @@ -53,6 +53,8 @@ def from_ibis(cls, dtype: dt.DataType) -> str: ) elif dtype.is_integer(): return "INT64" + elif dtype.is_boolean(): + return "BOOLEAN" elif dtype.is_binary(): return "BYTES" elif dtype.is_string(): From 97b49e0d8f18f7d33289ed3a5900f85d403ffa67 Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Wed, 21 May 2025 20:13:43 +0000 Subject: [PATCH 2/2] generates literal from None --- bigframes/bigquery/_operations/sql.py | 2 +- tests/system/small/bigquery/test_sql.py | 65 +++++++++++++++++++------ 2 files changed, 50 insertions(+), 17 deletions(-) diff --git a/bigframes/bigquery/_operations/sql.py b/bigframes/bigquery/_operations/sql.py index e202251996..a84c074e01 100644 --- a/bigframes/bigquery/_operations/sql.py +++ b/bigframes/bigquery/_operations/sql.py @@ -74,7 +74,7 @@ def sql_scalar( # template, then this will fail with an error earlier in the process, # aiding users in debugging. literals_sql = [ - sqlglot_ir._literal(column.values[0], column.dtype).sql(dialect="bigquery") + sqlglot_ir._literal(None, column.dtype).sql(dialect="bigquery") for column in columns ] select_sql = sql_template.format(*literals_sql) diff --git a/tests/system/small/bigquery/test_sql.py b/tests/system/small/bigquery/test_sql.py index 66337b9dee..c519b427fa 100644 --- a/tests/system/small/bigquery/test_sql.py +++ b/tests/system/small/bigquery/test_sql.py @@ -13,14 +13,14 @@ # limitations under the License. import pandas as pd -import pyarrow as pa +import pytest import bigframes.bigquery as bbq import bigframes.dtypes as dtypes import bigframes.pandas as bpd -def test_sql_scalar_on_scalars_null_index(scalars_df_null_index): +def test_sql_scalar_for_all_scalar_types(scalars_df_null_index): series = bbq.sql_scalar( """ CAST({0} AS INT64) @@ -55,7 +55,7 @@ def test_sql_scalar_on_scalars_null_index(scalars_df_null_index): assert len(result) == len(scalars_df_null_index) -def test_sql_scalar_w_bool_series(scalars_df_index): +def test_sql_scalar_for_bool_series(scalars_df_index): series: bpd.Series = scalars_df_index["bool_col"] result = bbq.sql_scalar("CAST({0} AS INT64)", [series]) expected = series.astype(dtypes.INT_DTYPE) @@ -63,7 +63,31 @@ def test_sql_scalar_w_bool_series(scalars_df_index): pd.testing.assert_series_equal(result.to_pandas(), expected.to_pandas()) -def test_sql_scalar_w_array_series(repeated_df): +@pytest.mark.parametrize( + ("column_name"), + [ + pytest.param("bool_col"), + pytest.param("bytes_col"), + pytest.param("date_col"), + pytest.param("datetime_col"), + pytest.param("geography_col"), + pytest.param("int64_col"), + pytest.param("numeric_col"), + pytest.param("float64_col"), + pytest.param("string_col"), + pytest.param("time_col"), + pytest.param("timestamp_col"), + ], +) +def test_sql_scalar_outputs_all_scalar_types(scalars_df_index, column_name): + series: bpd.Series = scalars_df_index[column_name] + result = bbq.sql_scalar("{0}", [series]) + expected = series + expected.name = None + pd.testing.assert_series_equal(result.to_pandas(), expected.to_pandas()) + + +def test_sql_scalar_for_array_series(repeated_df): result = bbq.sql_scalar( """ ARRAY_LENGTH({0}) + ARRAY_LENGTH({1}) + ARRAY_LENGTH({2}) @@ -93,7 +117,14 @@ def test_sql_scalar_w_array_series(repeated_df): pd.testing.assert_series_equal(result.to_pandas(), expected.to_pandas()) -def test_sql_scalar_w_struct_series(nested_structs_df): +def test_sql_scalar_outputs_array_series(repeated_df): + result = bbq.sql_scalar("{0}", [repeated_df["int_list_col"]]) + expected = repeated_df["int_list_col"] + expected.name = None + pd.testing.assert_series_equal(result.to_pandas(), expected.to_pandas()) + + +def test_sql_scalar_for_struct_series(nested_structs_df): result = bbq.sql_scalar( "CHAR_LENGTH({0}.name) + {0}.age", [nested_structs_df["person"]], @@ -104,7 +135,14 @@ def test_sql_scalar_w_struct_series(nested_structs_df): pd.testing.assert_series_equal(result.to_pandas(), expected.to_pandas()) -def test_sql_scalar_w_json_series(json_df): +def test_sql_scalar_outputs_struct_series(nested_structs_df): + result = bbq.sql_scalar("{0}", [nested_structs_df["person"]]) + expected = nested_structs_df["person"] + expected.name = None + pd.testing.assert_series_equal(result.to_pandas(), expected.to_pandas()) + + +def test_sql_scalar_for_json_series(json_df): result = bbq.sql_scalar( """JSON_VALUE({0}, '$.int_value')""", [ @@ -116,13 +154,8 @@ def test_sql_scalar_w_json_series(json_df): pd.testing.assert_series_equal(result.to_pandas(), expected.to_pandas()) -def test_sql_scalar_w_array_output(json_df): - result = bbq.sql_scalar( - """JSON_VALUE_ARRAY({0}, '$.order.items')""", - [ - json_df["json_col"], - ], - ) - assert len(result) == len(json_df) - assert result.dtype == pd.ArrowDtype(pa.list_(pa.string())) - assert result[15] == ["book", "pen"] +def test_sql_scalar_outputs_json_series(json_df): + result = bbq.sql_scalar("{0}", [json_df["json_col"]]) + expected = json_df["json_col"] + expected.name = None + pd.testing.assert_series_equal(result.to_pandas(), expected.to_pandas())