diff --git a/bigframes/core/compile/compiled.py b/bigframes/core/compile/compiled.py index 461c2c005a..537d9c8b52 100644 --- a/bigframes/core/compile/compiled.py +++ b/bigframes/core/compile/compiled.py @@ -21,6 +21,7 @@ import ibis import ibis.backends.bigquery as ibis_bigquery +import ibis.common.deferred # type: ignore import ibis.expr.datatypes as ibis_dtypes import ibis.expr.types as ibis_types import pandas @@ -62,7 +63,16 @@ def __init__( self._columns = tuple(columns) # To allow for more efficient lookup by column name, create a # dictionary mapping names to column values. - self._column_names = {column.get_name(): column for column in self._columns} + self._column_names = { + ( + column.resolve(table) + # TODO(https://github.com/ibis-project/ibis/issues/7613): use + # public API to refer to Deferred type. + if isinstance(column, ibis.common.deferred.Deferred) + else column + ).get_name(): column + for column in self._columns + } @property def columns(self) -> typing.Tuple[ibis_types.Value, ...]: @@ -643,7 +653,16 @@ def __init__( # To allow for more efficient lookup by column name, create a # dictionary mapping names to column values. - self._column_names = {column.get_name(): column for column in self._columns} + self._column_names = { + ( + column.resolve(table) + # TODO(https://github.com/ibis-project/ibis/issues/7613): use + # public API to refer to Deferred type. 
+ if isinstance(column, ibis.common.deferred.Deferred) + else column + ).get_name(): column + for column in self._columns + } self._hidden_ordering_column_names = { column.get_name(): column for column in self._hidden_ordering_columns } @@ -860,7 +879,7 @@ def project_window_op( case_statement = ibis.case() for clause in clauses: case_statement = case_statement.when(clause[0], clause[1]) - case_statement = case_statement.else_(window_op).end() + case_statement = case_statement.else_(window_op).end() # type: ignore window_op = case_statement result = self._set_or_replace_by_id(output_name or column_name, window_op) diff --git a/bigframes/core/reshape/__init__.py b/bigframes/core/reshape/__init__.py index dc61c3baad..24c1bff309 100644 --- a/bigframes/core/reshape/__init__.py +++ b/bigframes/core/reshape/__init__.py @@ -18,6 +18,7 @@ import bigframes.constants as constants import bigframes.core as core +import bigframes.core.ordering as order import bigframes.core.utils as utils import bigframes.dataframe import bigframes.operations as ops @@ -145,7 +146,10 @@ def qcut( block, result = block.apply_window_op( x._value_column, agg_ops.QcutOp(q), - window_spec=core.WindowSpec(grouping_keys=(nullity_id,)), + window_spec=core.WindowSpec( + grouping_keys=(nullity_id,), + ordering=(order.OrderingColumnReference(x._value_column),), + ), ) block, result = block.apply_binary_op( result, nullity_id, ops.partial_arg3(ops.where_op, None), result_label=label diff --git a/bigframes/operations/__init__.py b/bigframes/operations/__init__.py index a29dd36c72..0655aafdb3 100644 --- a/bigframes/operations/__init__.py +++ b/bigframes/operations/__init__.py @@ -18,6 +18,7 @@ import typing import ibis +import ibis.common.annotations import ibis.common.exceptions import ibis.expr.datatypes as ibis_dtypes import ibis.expr.operations.generic @@ -352,14 +353,23 @@ def _as_ibis(self, x: ibis_types.Value): str_val = typing.cast(ibis_types.StringValue, x) # SQL pad operations will truncate, we do 
not want to truncate though. - pad_length = ibis.greatest(str_val.length(), self._length) + pad_length = typing.cast( + ibis_types.IntegerValue, ibis.greatest(str_val.length(), self._length) + ) if self._side == "left": return str_val.lpad(pad_length, self._fillchar) elif self._side == "right": return str_val.rpad(pad_length, self._fillchar) else: # side == both # Pad more on right side if can't pad both sides equally - lpad_amount = ((pad_length - str_val.length()) // 2) + str_val.length() + lpad_amount = typing.cast( + ibis_types.IntegerValue, + ( + (pad_length - str_val.length()) + // typing.cast(ibis_types.NumericValue, ibis.literal(2)) + ) + + str_val.length(), + ) return str_val.lpad(lpad_amount, self._fillchar).rpad( pad_length, self._fillchar ) @@ -722,10 +732,29 @@ def ne_op( return x != y +def _null_or_value(value: ibis_types.Value, where_value: ibis_types.BooleanValue): + return ibis.where( + where_value, + value, + ibis.null(), + ) + + def and_op( x: ibis_types.Value, y: ibis_types.Value, ): + # Workaround issue https://github.com/ibis-project/ibis/issues/7775 by + # implementing three-valued logic ourselves. For AND, when we encounter a + # NULL value, we only know when the result is FALSE, otherwise the result + # is unknown (NULL). See: truth table at + # https://en.wikibooks.org/wiki/Structured_Query_Language/NULLs_and_the_Three_Valued_Logic#AND,_OR + if isinstance(x, ibis_types.NullScalar): + return _null_or_value(y, y == ibis.literal(False)) + + if isinstance(y, ibis_types.NullScalar): + return _null_or_value(x, x == ibis.literal(False)) + return typing.cast(ibis_types.BooleanValue, x) & typing.cast( ibis_types.BooleanValue, y ) @@ -735,6 +764,17 @@ def or_op( x: ibis_types.Value, y: ibis_types.Value, ): + # Workaround issue https://github.com/ibis-project/ibis/issues/7775 by + # implementing three-valued logic ourselves. For OR, when we encounter a + # NULL value, we only know when the result is TRUE, otherwise the result + # is unknown (NULL). 
See: truth table at + # https://en.wikibooks.org/wiki/Structured_Query_Language/NULLs_and_the_Three_Valued_Logic#AND,_OR + if isinstance(x, ibis_types.NullScalar): + return _null_or_value(y, y == ibis.literal(True)) + + if isinstance(y, ibis_types.NullScalar): + return _null_or_value(x, x == ibis.literal(True)) + return typing.cast(ibis_types.BooleanValue, x) | typing.cast( ibis_types.BooleanValue, y ) @@ -746,10 +786,16 @@ def add_op( y: ibis_types.Value, ): - if isinstance(x, ibis_types.NullScalar) or isinstance(x, ibis_types.NullScalar): - return - return typing.cast(ibis_types.NumericValue, x) + typing.cast( - ibis_types.NumericValue, y - ) + if isinstance(x, ibis_types.NullScalar) or isinstance(y, ibis_types.NullScalar): + return ibis.null() + try: + # Could be string concatenation or numeric addition. + return x + y # type: ignore + except ibis.common.annotations.SignatureValidationError as exc: + left_type = bigframes.dtypes.ibis_dtype_to_bigframes_dtype(x.type()) + right_type = bigframes.dtypes.ibis_dtype_to_bigframes_dtype(y.type()) + raise TypeError( + f"Cannot add {repr(left_type)} and {repr(right_type)}. {constants.FEEDBACK_LINK}" + ) from exc @short_circuit_nulls() @@ -1047,7 +1093,7 @@ def where_op( replacement: ibis_types.Value, ) -> ibis_types.Value: """Returns x if y is true, otherwise returns z.""" - return ibis.case().when(condition, original).else_(replacement).end() + return ibis.case().when(condition, original).else_(replacement).end() # type: ignore def clip_op( @@ -1060,7 +1106,7 @@ def clip_op( not isinstance(upper, ibis_types.NullScalar) ): return ( - ibis.case() + ibis.case() # type: ignore .when(upper.isnull() | (original > upper), upper) .else_(original) .end() @@ -1069,7 +1115,7 @@ def clip_op( upper, ibis_types.NullScalar ): return ( - ibis.case() + ibis.case() # type: ignore .when(lower.isnull() | (original < lower), lower) .else_(original) .end() @@ -1079,9 +1125,11 @@ def clip_op( ): return original else: - # Note: Pandas has unchanged behavior when upper bound and lower bound are flipped.
This implementation requires that lower_bound < upper_bound + # Note: Pandas has unchanged behavior when upper bound and lower bound + # are flipped. + # This implementation requires that lower_bound < upper_bound. return ( - ibis.case() + ibis.case() # type: ignore .when(lower.isnull() | (original < lower), lower) .when(upper.isnull() | (original > upper), upper) .else_(original) diff --git a/bigframes/operations/aggregations.py b/bigframes/operations/aggregations.py index 465d188724..363dfe819d 100644 --- a/bigframes/operations/aggregations.py +++ b/bigframes/operations/aggregations.py @@ -74,7 +74,7 @@ def _as_ibis( # Will be null if all inputs are null. Pandas defaults to zero sum though. bq_sum = _apply_window_if_present(column.sum(), window) return ( - ibis.case().when(bq_sum.isnull(), ibis_types.literal(0)).else_(bq_sum).end() + ibis.case().when(bq_sum.isnull(), ibis_types.literal(0)).else_(bq_sum).end() # type: ignore ) @@ -167,7 +167,7 @@ def _as_ibis( .else_(magnitude * pow(-1, negative_count_parity)) .end() ) - return float_result.cast(column.type()) + return float_result.cast(column.type()) # type: ignore class MaxOp(AggregateOp): @@ -290,7 +290,7 @@ def _as_ibis( dtypes.literal_to_ibis_scalar(bucket_n, force_dtype=Int64Dtype()), ) out = out.else_(None) - return out.end() + return out.end() # type: ignore @property def skips_nulls(self): @@ -482,7 +482,7 @@ def _map_to_literal( original: ibis_types.Value, literal: ibis_types.Scalar ) -> ibis_types.Column: # Hack required to perform aggregations on literals in ibis, even though bigquery will let you directly aggregate literals (eg. 
'SELECT COUNT(1) from table1') - return ibis.ifelse(original.isnull(), literal, literal) + return ibis.ifelse(original.isnull(), literal, literal) # type: ignore sum_op = SumOp() diff --git a/bigframes/remote_function.py b/bigframes/remote_function.py index a899ebd371..f54c26fa56 100644 --- a/bigframes/remote_function.py +++ b/bigframes/remote_function.py @@ -535,17 +535,14 @@ def remote_function_node( """Creates an Ibis node representing a remote function call.""" fields = { - name: rlz.value(type_) if type_ else rlz.any + name: rlz.ValueOf(None if type_ == "ANY TYPE" else type_) for name, type_ in zip( ibis_signature.parameter_names, ibis_signature.input_types ) } - try: - fields["output_type"] = rlz.shape_like("args", dtype=ibis_signature.output_type) # type: ignore - except TypeError: - fields["output_dtype"] = property(lambda _: ibis_signature.output_type) - fields["output_shape"] = rlz.shape_like("args") + fields["dtype"] = ibis_signature.output_type # type: ignore + fields["shape"] = rlz.shape_like("args") node = type(routine_ref_to_string_for_query(routine_ref), (ops.ValueOp,), fields) # type: ignore diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 5364060d1c..fb5fab86ce 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -79,9 +79,9 @@ import bigframes.session.clients import bigframes.version -# Even though the ibis.backends.bigquery.registry import is unused, it's needed +# Even though the ibis.backends.bigquery import is unused, it's needed # to register new and replacement ops with the Ibis BigQuery backend. 
-import third_party.bigframes_vendored.ibis.backends.bigquery.registry # noqa +import third_party.bigframes_vendored.ibis.backends.bigquery # noqa import third_party.bigframes_vendored.ibis.expr.operations as vendored_ibis_ops import third_party.bigframes_vendored.pandas.io.gbq as third_party_pandas_gbq import third_party.bigframes_vendored.pandas.io.parquet as third_party_pandas_parquet @@ -873,8 +873,9 @@ def _read_pandas( total_ordering_columns=frozenset([ordering_col]), integer_encoding=IntegerEncoding(True, is_sequential=True), ) - table_expression = self.ibis_client.table( + table_expression = self.ibis_client.table( # type: ignore load_table_destination.table_id, + # TODO: use "dataset_id" as the "schema" database=f"{load_table_destination.project}.{load_table_destination.dataset_id}", ) diff --git a/mypy.ini b/mypy.ini index 901394813a..3809f8e241 100644 --- a/mypy.ini +++ b/mypy.ini @@ -24,5 +24,8 @@ ignore_missing_imports = True [mypy-pyarrow] ignore_missing_imports = True +[mypy-ibis.*] +ignore_missing_imports = True + [mypy-ipywidgets] ignore_missing_imports = True diff --git a/noxfile.py b/noxfile.py index 2174e27529..c0ec3b0c54 100644 --- a/noxfile.py +++ b/noxfile.py @@ -524,23 +524,19 @@ def prerelease(session: nox.sessions.Session, tests_path): ) already_installed.add("pandas") - # TODO(shobs): - # Commit https://github.com/ibis-project/ibis/commit/c20ba7feab6bdea6c299721310e04dbc10551cc2 - # introduced breaking change that removed the following: - # ibis.expr.rules.column - # ibis.expr.rules.value - # ibis.expr.rules.any - # Let's exclude ibis head from prerelease install list for now. Instead, use - # a working ibis-framework version resolved via setup.by (currently resolves - # to version 6.2.0 due to version requirement "6.2.0,<7.0.0dev"). - # We should enable the head back once bigframes support a version that - # includes the above commit. + # Ibis has introduced breaking changes. 
Let's exclude ibis head + # from prerelease install list for now. We should enable the head back + # once bigframes supports the version at HEAD. # session.install( - # "--upgrade", - # "-e", # Use -e so that py.typed file is included. - # "git+https://github.com/ibis-project/ibis.git#egg=ibis-framework", + # "--upgrade", + # "-e", # Use -e so that py.typed file is included. + # "git+https://github.com/ibis-project/ibis.git@7.x.x#egg=ibis-framework", # ) - session.install("--no-deps", "ibis-framework==6.2.0") + session.install( + "--upgrade", + # "--pre", + "ibis-framework>=7.1.0,<8.0.0dev", + ) already_installed.add("ibis-framework") # Workaround https://github.com/googleapis/python-db-dtypes-pandas/issues/178 diff --git a/setup.py b/setup.py index 3351542985..1ad4bbd3eb 100644 --- a/setup.py +++ b/setup.py @@ -43,8 +43,8 @@ "google-cloud-iam >=2.12.1", "google-cloud-resource-manager >=1.10.3", "google-cloud-storage >=2.0.0", + "ibis-framework[bigquery] >=7.1.0,<8.0.0dev", # TODO: Relax upper bound once we have fixed `system_prerelease` tests. 
- "ibis-framework[bigquery] >=6.2.0,<7.0.0dev", "pandas >=1.5.0,<2.1.4", "pydata-google-auth >=1.8.2", "requests >=2.27.1", diff --git a/testing/constraints-3.9.txt b/testing/constraints-3.9.txt index f43d3b4ca0..218255c77e 100644 --- a/testing/constraints-3.9.txt +++ b/testing/constraints-3.9.txt @@ -45,7 +45,7 @@ greenlet==2.0.2 grpc-google-iam-v1==0.12.6 grpcio==1.53.0 grpcio-status==1.48.2 -ibis-framework==6.2.0 +ibis-framework==7.1.0 humanize==4.6.0 identify==2.5.22 idna==3.4 @@ -107,7 +107,7 @@ scikit-learn==1.2.2 SecretStorage==3.3.3 six==1.16.0 SQLAlchemy==1.4.0 -sqlglot==10.6.4 +sqlglot==18.12.0 tomli==2.0.1 toolz==0.12.0 tqdm==4.65.0 diff --git a/tests/system/small/test_ibis.py b/tests/system/small/test_ibis.py index 58b78e0048..9fe1176068 100644 --- a/tests/system/small/test_ibis.py +++ b/tests/system/small/test_ibis.py @@ -23,11 +23,16 @@ def test_approximate_quantiles(session: bigframes.Session, scalars_table_id: str): num_bins = 3 ibis_client = session.ibis_client - _, dataset, table_id = scalars_table_id.split(".") - ibis_table: ibis_types.Table = ibis_client.table(table_id, database=dataset) + project, dataset, table_id = scalars_table_id.split(".") + ibis_table: ibis_types.Table = ibis_client.table( # type: ignore + table_id, + schema=dataset, + database=project, + ) ibis_column: ibis_types.NumericColumn = ibis_table["int64_col"] - quantiles: ibis_types.ArrayScalar = vendored_ibis_ops.ApproximateMultiQuantile( # type: ignore - ibis_column, num_bins=num_bins + quantiles: ibis_types.ArrayScalar = vendored_ibis_ops.ApproximateMultiQuantile( + ibis_column, # type: ignore + num_bins=num_bins, # type: ignore ).to_expr() value = quantiles[1] num_edges = quantiles.length() diff --git a/tests/unit/resources.py b/tests/unit/resources.py index 8ba321d122..b239b04671 100644 --- a/tests/unit/resources.py +++ b/tests/unit/resources.py @@ -79,7 +79,7 @@ def create_dataframe( # might not actually be used. Mock out the global session, too. 
monkeypatch.setattr(bigframes.core.global_session, "_global_session", session) bigframes.options.bigquery._session_started = True - return bigframes.dataframe.DataFrame({}, session=session) + return bigframes.dataframe.DataFrame({"col": []}, session=session) def create_pandas_session(tables: Dict[str, pandas.DataFrame]) -> bigframes.Session: diff --git a/third_party/bigframes_vendored/ibis/backends/bigquery/__init__.py b/third_party/bigframes_vendored/ibis/backends/bigquery/__init__.py index e69de29bb2..43508fab11 100644 --- a/third_party/bigframes_vendored/ibis/backends/bigquery/__init__.py +++ b/third_party/bigframes_vendored/ibis/backends/bigquery/__init__.py @@ -0,0 +1,3 @@ +# Import all sub-modules to monkeypatch everything. +import third_party.bigframes_vendored.ibis.backends.bigquery.compiler # noqa +import third_party.bigframes_vendored.ibis.backends.bigquery.registry # noqa diff --git a/third_party/bigframes_vendored/ibis/backends/bigquery/compiler.py b/third_party/bigframes_vendored/ibis/backends/bigquery/compiler.py new file mode 100644 index 0000000000..414f0a7c81 --- /dev/null +++ b/third_party/bigframes_vendored/ibis/backends/bigquery/compiler.py @@ -0,0 +1,59 @@ +# Contains code from https://github.com/ibis-project/ibis/blob/master/ibis/backends/bigquery/compiler.py +"""Module to convert from Ibis expression to SQL string.""" + +from __future__ import annotations + +import re + +from ibis.backends.base.sql import compiler as sql_compiler +import ibis.backends.bigquery.compiler +from ibis.backends.bigquery.datatypes import BigQueryType +import ibis.expr.datatypes as dt +import ibis.expr.operations as ops + +_NAME_REGEX = re.compile(r'[^!"$()*,./;?@[\\\]^`{}~\n]+') +_EXACT_NAME_REGEX = re.compile(f"^{_NAME_REGEX.pattern}$") + + +class BigQueryTableSetFormatter(sql_compiler.TableSetFormatter): + def _quote_identifier(self, name): + """Restore 6.x version of identifier quoting. 
+ + 7.x uses sqlglot which as of December 2023 doesn't know about the + extended unicode names for BigQuery yet. + """ + if _EXACT_NAME_REGEX.match(name) is not None: + return name + return f"`{name}`" + + def _format_in_memory_table(self, op): + """Restore 6.x version of InMemoryTable. + + BigQuery DataFrames explicitly uses InMemoryTable only when we know + the data is small enough to embed in SQL. + """ + schema = op.schema + names = schema.names + types = schema.types + + raw_rows = [] + for row in op.data.to_frame().itertuples(index=False): + raw_row = ", ".join( + f"{self._translate(lit)} AS {name}" + for lit, name in zip( + map(ops.Literal, row, types), map(self._quote_identifier, names) + ) + ) + raw_rows.append(f"STRUCT({raw_row})") + array_type = BigQueryType.from_ibis(dt.Array(op.schema.as_struct())) + + return f"UNNEST({array_type}[{', '.join(raw_rows)}])" + + +# Override implementation. +ibis.backends.bigquery.compiler.BigQueryTableSetFormatter._quote_identifier = ( + BigQueryTableSetFormatter._quote_identifier +) +ibis.backends.bigquery.compiler.BigQueryTableSetFormatter._format_in_memory_table = ( + BigQueryTableSetFormatter._format_in_memory_table +) diff --git a/third_party/bigframes_vendored/ibis/expr/operations/analytic.py b/third_party/bigframes_vendored/ibis/expr/operations/analytic.py index 038987cac9..3d6a3b37b1 100644 --- a/third_party/bigframes_vendored/ibis/expr/operations/analytic.py +++ b/third_party/bigframes_vendored/ibis/expr/operations/analytic.py @@ -2,22 +2,22 @@ from __future__ import annotations -from ibis.expr.operations.analytic import Analytic +import ibis.expr.operations as ops import ibis.expr.rules as rlz -class FirstNonNullValue(Analytic): +class FirstNonNullValue(ops.Analytic): """Retrieve the first element.""" - arg = rlz.column(rlz.any) - output_dtype = rlz.dtype_like("arg") + arg: ops.Column + dtype = rlz.dtype_like("arg") -class LastNonNullValue(Analytic): +class LastNonNullValue(ops.Analytic): """Retrieve the last 
element.""" - arg = rlz.column(rlz.any) - output_dtype = rlz.dtype_like("arg") + arg: ops.Column + dtype = rlz.dtype_like("arg") __all__ = [ diff --git a/third_party/bigframes_vendored/ibis/expr/operations/json.py b/third_party/bigframes_vendored/ibis/expr/operations/json.py index dbb3fa3066..772c2e8ff4 100644 --- a/third_party/bigframes_vendored/ibis/expr/operations/json.py +++ b/third_party/bigframes_vendored/ibis/expr/operations/json.py @@ -6,4 +6,4 @@ class ToJsonString(Unary): - output_dtype = dt.string + dtype = dt.string diff --git a/third_party/bigframes_vendored/ibis/expr/operations/reductions.py b/third_party/bigframes_vendored/ibis/expr/operations/reductions.py index 5e6ad9ecf2..e6644f477a 100644 --- a/third_party/bigframes_vendored/ibis/expr/operations/reductions.py +++ b/third_party/bigframes_vendored/ibis/expr/operations/reductions.py @@ -3,8 +3,8 @@ from __future__ import annotations import ibis.expr.datatypes as dt +import ibis.expr.operations.core as ibis_ops_core from ibis.expr.operations.reductions import Filterable, Reduction -import ibis.expr.rules as rlz class ApproximateMultiQuantile(Filterable, Reduction): @@ -13,9 +13,9 @@ class ApproximateMultiQuantile(Filterable, Reduction): See: https://cloud.google.com/bigquery/docs/reference/standard-sql/approximate_aggregate_functions#approx_quantiles """ - arg = rlz.any - num_bins = rlz.value(dt.int64) - output_dtype = dt.Array(dt.float64) + arg: ibis_ops_core.Value + num_bins: ibis_ops_core.Value[dt.Int64] + dtype = dt.Array(dt.float64) __all__ = [