From 59a719adbd9a0e8394fedcf2fd50f9179c096cd5 Mon Sep 17 00:00:00 2001
From: Chelsea Lin <chelsealin@google.com>
Date: Tue, 20 May 2025 23:44:52 +0000
Subject: [PATCH 1/2] fix: support JSON and STRUCT for bbq.sql_scalar

---
 bigframes/bigquery/_operations/sql.py         | 13 +--
 bigframes/core/compile/sqlglot/sqlglot_ir.py  |  7 ++
 bigframes/dtypes.py                           | 27 ------
 tests/system/small/bigquery/test_sql.py       | 82 ++++++++++++++++++-
 .../test_compile_readlocal/out.sql            | 12 +--
 .../ibis/backends/bigquery/datatypes.py       |  2 +
 6 files changed, 102 insertions(+), 41 deletions(-)

diff --git a/bigframes/bigquery/_operations/sql.py b/bigframes/bigquery/_operations/sql.py
index 7ccf63fcda..e202251996 100644
--- a/bigframes/bigquery/_operations/sql.py
+++ b/bigframes/bigquery/_operations/sql.py
@@ -20,6 +20,7 @@
 
 import google.cloud.bigquery
 
+import bigframes.core.compile.sqlglot.sqlglot_ir as sqlglot_ir
 import bigframes.core.sql
 import bigframes.dataframe
 import bigframes.dtypes
@@ -72,16 +73,16 @@ def sql_scalar(
     # Another benefit of this is that if there is a syntax error in the SQL
     # template, then this will fail with an error earlier in the process,
     # aiding users in debugging.
-    base_series = columns[0]
-    literals = [
-        bigframes.dtypes.bigframes_dtype_to_literal(column.dtype) for column in columns
+    literals_sql = [
+        sqlglot_ir._literal(column.values[0], column.dtype).sql(dialect="bigquery")
+        for column in columns
     ]
-    literals_sql = [bigframes.core.sql.simple_literal(literal) for literal in literals]
+    select_sql = sql_template.format(*literals_sql)
+    dry_run_sql = f"SELECT {select_sql}"
 
     # Use the executor directly, because we want the original column IDs, not
     # the user-friendly column names that block.to_sql_query() would produce.
-    select_sql = sql_template.format(*literals_sql)
-    dry_run_sql = f"SELECT {select_sql}"
+    base_series = columns[0]
     bqclient = base_series._session.bqclient
     job = bqclient.query(
         dry_run_sql, job_config=google.cloud.bigquery.QueryJobConfig(dry_run=True)
diff --git a/bigframes/core/compile/sqlglot/sqlglot_ir.py b/bigframes/core/compile/sqlglot/sqlglot_ir.py
index 935ad393f8..bd1d225d65 100644
--- a/bigframes/core/compile/sqlglot/sqlglot_ir.py
+++ b/bigframes/core/compile/sqlglot/sqlglot_ir.py
@@ -18,6 +18,7 @@
 import typing
 
 from google.cloud import bigquery
+import numpy as np
 import pyarrow as pa
 import sqlglot as sg
 import sqlglot.dialects.bigquery
@@ -213,7 +214,11 @@ def _literal(value: typing.Any, dtype: dtypes.Dtype) -> sge.Expression:
     elif dtype == dtypes.BYTES_DTYPE:
         return _cast(str(value), sqlglot_type)
     elif dtypes.is_time_like(dtype):
+        if isinstance(value, np.generic):
+            value = value.item()
         return _cast(sge.convert(value.isoformat()), sqlglot_type)
+    elif dtype in (dtypes.NUMERIC_DTYPE, dtypes.BIGNUMERIC_DTYPE):
+        return _cast(sge.convert(value), sqlglot_type)
     elif dtypes.is_geo_like(dtype):
         wkt = value if isinstance(value, str) else to_wkt(value)
         return sge.func("ST_GEOGFROMTEXT", sge.convert(wkt))
@@ -234,6 +239,8 @@ def _literal(value: typing.Any, dtype: dtypes.Dtype) -> sge.Expression:
         )
         return values if len(value) > 0 else _cast(values, sqlglot_type)
     else:
+        if isinstance(value, np.generic):
+            value = value.item()
         return sge.convert(value)
 
 
diff --git a/bigframes/dtypes.py b/bigframes/dtypes.py
index ec115a93d0..262fa9dde7 100644
--- a/bigframes/dtypes.py
+++ b/bigframes/dtypes.py
@@ -499,33 +499,6 @@ def bigframes_dtype_to_arrow_dtype(
         )
 
 
-def bigframes_dtype_to_literal(
-    bigframes_dtype: Dtype,
-) -> Any:
-    """Create a representative literal value for a bigframes dtype.
-
-    The inverse of infer_literal_type().
-    """
-    if isinstance(bigframes_dtype, pd.ArrowDtype):
-        arrow_type = bigframes_dtype.pyarrow_dtype
-        return arrow_type_to_literal(arrow_type)
-
-    if isinstance(bigframes_dtype, pd.Float64Dtype):
-        return 1.0
-    if isinstance(bigframes_dtype, pd.Int64Dtype):
-        return 1
-    if isinstance(bigframes_dtype, pd.BooleanDtype):
-        return True
-    if isinstance(bigframes_dtype, pd.StringDtype):
-        return "string"
-    if isinstance(bigframes_dtype, gpd.array.GeometryDtype):
-        return shapely.geometry.Point((0, 0))
-
-    raise TypeError(
-        f"No literal  conversion for {bigframes_dtype}. {constants.FEEDBACK_LINK}"
-    )
-
-
 def arrow_type_to_literal(
     arrow_type: pa.DataType,
 ) -> Any:
diff --git a/tests/system/small/bigquery/test_sql.py b/tests/system/small/bigquery/test_sql.py
index 283624100a..66337b9dee 100644
--- a/tests/system/small/bigquery/test_sql.py
+++ b/tests/system/small/bigquery/test_sql.py
@@ -12,11 +12,16 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import bigframes.bigquery
+import pandas as pd
+import pyarrow as pa
+
+import bigframes.bigquery as bbq
+import bigframes.dtypes as dtypes
+import bigframes.pandas as bpd
 
 
 def test_sql_scalar_on_scalars_null_index(scalars_df_null_index):
-    series = bigframes.bigquery.sql_scalar(
+    series = bbq.sql_scalar(
         """
         CAST({0} AS INT64)
         + BYTE_LENGTH({1})
@@ -48,3 +53,76 @@ def test_sql_scalar_on_scalars_null_index(scalars_df_null_index):
     )
     result = series.to_pandas()
     assert len(result) == len(scalars_df_null_index)
+
+
+def test_sql_scalar_w_bool_series(scalars_df_index):
+    series: bpd.Series = scalars_df_index["bool_col"]
+    result = bbq.sql_scalar("CAST({0} AS INT64)", [series])
+    expected = series.astype(dtypes.INT_DTYPE)
+    expected.name = None
+    pd.testing.assert_series_equal(result.to_pandas(), expected.to_pandas())
+
+
+def test_sql_scalar_w_array_series(repeated_df):
+    result = bbq.sql_scalar(
+        """
+        ARRAY_LENGTH({0}) + ARRAY_LENGTH({1}) + ARRAY_LENGTH({2})
+        + ARRAY_LENGTH({3}) + ARRAY_LENGTH({4}) + ARRAY_LENGTH({5})
+        + ARRAY_LENGTH({6})
+        """,
+        [
+            repeated_df["int_list_col"],
+            repeated_df["bool_list_col"],
+            repeated_df["float_list_col"],
+            repeated_df["date_list_col"],
+            repeated_df["date_time_list_col"],
+            repeated_df["numeric_list_col"],
+            repeated_df["string_list_col"],
+        ],
+    )
+
+    expected = (
+        repeated_df["int_list_col"].list.len()
+        + repeated_df["bool_list_col"].list.len()
+        + repeated_df["float_list_col"].list.len()
+        + repeated_df["date_list_col"].list.len()
+        + repeated_df["date_time_list_col"].list.len()
+        + repeated_df["numeric_list_col"].list.len()
+        + repeated_df["string_list_col"].list.len()
+    )
+    pd.testing.assert_series_equal(result.to_pandas(), expected.to_pandas())
+
+
+def test_sql_scalar_w_struct_series(nested_structs_df):
+    result = bbq.sql_scalar(
+        "CHAR_LENGTH({0}.name) + {0}.age",
+        [nested_structs_df["person"]],
+    )
+    expected = nested_structs_df["person"].struct.field(
+        "name"
+    ).str.len() + nested_structs_df["person"].struct.field("age")
+    pd.testing.assert_series_equal(result.to_pandas(), expected.to_pandas())
+
+
+def test_sql_scalar_w_json_series(json_df):
+    result = bbq.sql_scalar(
+        """JSON_VALUE({0}, '$.int_value')""",
+        [
+            json_df["json_col"],
+        ],
+    )
+    expected = bbq.json_value(json_df["json_col"], "$.int_value")
+    expected.name = None
+    pd.testing.assert_series_equal(result.to_pandas(), expected.to_pandas())
+
+
+def test_sql_scalar_w_array_output(json_df):
+    result = bbq.sql_scalar(
+        """JSON_VALUE_ARRAY({0}, '$.order.items')""",
+        [
+            json_df["json_col"],
+        ],
+    )
+    assert len(result) == len(json_df)
+    assert result.dtype == pd.ArrowDtype(pa.list_(pa.string()))
+    assert result[15] == ["book", "pen"]
diff --git a/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal/out.sql b/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal/out.sql
index f04f9ed023..f73ef34051 100644
--- a/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal/out.sql
+++ b/tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal/out.sql
@@ -10,7 +10,7 @@ WITH `bfcte_0` AS (
     ST_GEOGFROMTEXT('POINT (-122.0838511 37.3860517)'),
     123456789,
     0,
-    1.234567890,
+    CAST(1.234567890 AS NUMERIC),
     1.25,
     0,
     0,
@@ -27,7 +27,7 @@ WITH `bfcte_0` AS (
     ST_GEOGFROMTEXT('POINT (-71.104 42.315)'),
     -987654321,
     1,
-    1.234567890,
+    CAST(1.234567890 AS NUMERIC),
     2.51,
     1,
     1,
@@ -44,7 +44,7 @@ WITH `bfcte_0` AS (
     ST_GEOGFROMTEXT('POINT (-0.124474760143016 51.5007826749545)'),
     314159,
     0,
-    101.101010100,
+    CAST(101.101010100 AS NUMERIC),
     25000000000.0,
     2,
     2,
@@ -95,7 +95,7 @@ WITH `bfcte_0` AS (
     CAST(NULL AS GEOGRAPHY),
     55555,
     0,
-    5.555555000,
+    CAST(5.555555000 AS NUMERIC),
     555.555,
     5,
     5,
@@ -112,7 +112,7 @@ WITH `bfcte_0` AS (
     ST_GEOGFROMTEXT('LINESTRING (-0.127959 51.507728, -0.127026 51.507473)'),
     101202303,
     2,
-    -10.090807000,
+    CAST(-10.090807000 AS NUMERIC),
     -123.456,
     6,
     6,
@@ -129,7 +129,7 @@ WITH `bfcte_0` AS (
     CAST(NULL AS GEOGRAPHY),
     -214748367,
     2,
-    11111111.100000000,
+    CAST(11111111.100000000 AS NUMERIC),
     42.42,
     7,
     7,
diff --git a/third_party/bigframes_vendored/ibis/backends/bigquery/datatypes.py b/third_party/bigframes_vendored/ibis/backends/bigquery/datatypes.py
index 5b4e4d85a1..fba0339ae9 100644
--- a/third_party/bigframes_vendored/ibis/backends/bigquery/datatypes.py
+++ b/third_party/bigframes_vendored/ibis/backends/bigquery/datatypes.py
@@ -53,6 +53,8 @@ def from_ibis(cls, dtype: dt.DataType) -> str:
             )
         elif dtype.is_integer():
             return "INT64"
+        elif dtype.is_boolean():
+            return "BOOLEAN"
         elif dtype.is_binary():
             return "BYTES"
         elif dtype.is_string():

From 97b49e0d8f18f7d33289ed3a5900f85d403ffa67 Mon Sep 17 00:00:00 2001
From: Chelsea Lin <chelsealin@google.com>
Date: Wed, 21 May 2025 20:13:43 +0000
Subject: [PATCH 2/2] fix: generate dry-run literals from None instead of the first column value

---
 bigframes/bigquery/_operations/sql.py   |  2 +-
 tests/system/small/bigquery/test_sql.py | 65 +++++++++++++++++++------
 2 files changed, 50 insertions(+), 17 deletions(-)

diff --git a/bigframes/bigquery/_operations/sql.py b/bigframes/bigquery/_operations/sql.py
index e202251996..a84c074e01 100644
--- a/bigframes/bigquery/_operations/sql.py
+++ b/bigframes/bigquery/_operations/sql.py
@@ -74,7 +74,7 @@ def sql_scalar(
     # template, then this will fail with an error earlier in the process,
     # aiding users in debugging.
     literals_sql = [
-        sqlglot_ir._literal(column.values[0], column.dtype).sql(dialect="bigquery")
+        sqlglot_ir._literal(None, column.dtype).sql(dialect="bigquery")
         for column in columns
     ]
     select_sql = sql_template.format(*literals_sql)
diff --git a/tests/system/small/bigquery/test_sql.py b/tests/system/small/bigquery/test_sql.py
index 66337b9dee..c519b427fa 100644
--- a/tests/system/small/bigquery/test_sql.py
+++ b/tests/system/small/bigquery/test_sql.py
@@ -13,14 +13,14 @@
 # limitations under the License.
 
 import pandas as pd
-import pyarrow as pa
+import pytest
 
 import bigframes.bigquery as bbq
 import bigframes.dtypes as dtypes
 import bigframes.pandas as bpd
 
 
-def test_sql_scalar_on_scalars_null_index(scalars_df_null_index):
+def test_sql_scalar_for_all_scalar_types(scalars_df_null_index):
     series = bbq.sql_scalar(
         """
         CAST({0} AS INT64)
@@ -55,7 +55,7 @@ def test_sql_scalar_on_scalars_null_index(scalars_df_null_index):
     assert len(result) == len(scalars_df_null_index)
 
 
-def test_sql_scalar_w_bool_series(scalars_df_index):
+def test_sql_scalar_for_bool_series(scalars_df_index):
     series: bpd.Series = scalars_df_index["bool_col"]
     result = bbq.sql_scalar("CAST({0} AS INT64)", [series])
     expected = series.astype(dtypes.INT_DTYPE)
@@ -63,7 +63,31 @@ def test_sql_scalar_w_bool_series(scalars_df_index):
     pd.testing.assert_series_equal(result.to_pandas(), expected.to_pandas())
 
 
-def test_sql_scalar_w_array_series(repeated_df):
+@pytest.mark.parametrize(
+    ("column_name"),
+    [
+        pytest.param("bool_col"),
+        pytest.param("bytes_col"),
+        pytest.param("date_col"),
+        pytest.param("datetime_col"),
+        pytest.param("geography_col"),
+        pytest.param("int64_col"),
+        pytest.param("numeric_col"),
+        pytest.param("float64_col"),
+        pytest.param("string_col"),
+        pytest.param("time_col"),
+        pytest.param("timestamp_col"),
+    ],
+)
+def test_sql_scalar_outputs_all_scalar_types(scalars_df_index, column_name):
+    series: bpd.Series = scalars_df_index[column_name]
+    result = bbq.sql_scalar("{0}", [series])
+    expected = series
+    expected.name = None
+    pd.testing.assert_series_equal(result.to_pandas(), expected.to_pandas())
+
+
+def test_sql_scalar_for_array_series(repeated_df):
     result = bbq.sql_scalar(
         """
         ARRAY_LENGTH({0}) + ARRAY_LENGTH({1}) + ARRAY_LENGTH({2})
@@ -93,7 +117,14 @@ def test_sql_scalar_w_array_series(repeated_df):
     pd.testing.assert_series_equal(result.to_pandas(), expected.to_pandas())
 
 
-def test_sql_scalar_w_struct_series(nested_structs_df):
+def test_sql_scalar_outputs_array_series(repeated_df):
+    result = bbq.sql_scalar("{0}", [repeated_df["int_list_col"]])
+    expected = repeated_df["int_list_col"]
+    expected.name = None
+    pd.testing.assert_series_equal(result.to_pandas(), expected.to_pandas())
+
+
+def test_sql_scalar_for_struct_series(nested_structs_df):
     result = bbq.sql_scalar(
         "CHAR_LENGTH({0}.name) + {0}.age",
         [nested_structs_df["person"]],
@@ -104,7 +135,14 @@ def test_sql_scalar_w_struct_series(nested_structs_df):
     pd.testing.assert_series_equal(result.to_pandas(), expected.to_pandas())
 
 
-def test_sql_scalar_w_json_series(json_df):
+def test_sql_scalar_outputs_struct_series(nested_structs_df):
+    result = bbq.sql_scalar("{0}", [nested_structs_df["person"]])
+    expected = nested_structs_df["person"]
+    expected.name = None
+    pd.testing.assert_series_equal(result.to_pandas(), expected.to_pandas())
+
+
+def test_sql_scalar_for_json_series(json_df):
     result = bbq.sql_scalar(
         """JSON_VALUE({0}, '$.int_value')""",
         [
@@ -116,13 +154,8 @@ def test_sql_scalar_w_json_series(json_df):
     pd.testing.assert_series_equal(result.to_pandas(), expected.to_pandas())
 
 
-def test_sql_scalar_w_array_output(json_df):
-    result = bbq.sql_scalar(
-        """JSON_VALUE_ARRAY({0}, '$.order.items')""",
-        [
-            json_df["json_col"],
-        ],
-    )
-    assert len(result) == len(json_df)
-    assert result.dtype == pd.ArrowDtype(pa.list_(pa.string()))
-    assert result[15] == ["book", "pen"]
+def test_sql_scalar_outputs_json_series(json_df):
+    result = bbq.sql_scalar("{0}", [json_df["json_col"]])
+    expected = json_df["json_col"]
+    expected.name = None
+    pd.testing.assert_series_equal(result.to_pandas(), expected.to_pandas())