Skip to content

Commit 7666976

Browse files
committed
Merge remote-tracking branch 'origin/main' into b329460931-rename-inplace
2 parents 45f9232 + bb45db8 commit 7666976

File tree

78 files changed

+1993
-662
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

78 files changed

+1993
-662
lines changed

bigframes/_config/__init__.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,12 +56,21 @@ class Options:
5656
"""Global options affecting BigQuery DataFrames behavior."""
5757

5858
def __init__(self):
59+
self.reset()
60+
61+
def reset(self) -> Options:
62+
"""Reset the option settings to defaults.
63+
64+
Returns:
65+
bigframes._config.Options: Options object with default values.
66+
"""
5967
self._local = ThreadLocalConfig()
6068

6169
# BigQuery options are special because they can only be set once per
6270
# session, so we need an indicator as to whether we are using the
6371
# thread-local session or the global session.
6472
self._bigquery_options = bigquery_options.BigQueryOptions()
73+
return self
6574

6675
def _init_bigquery_thread_local(self):
6776
"""Initialize thread-local options, based on current global options."""

bigframes/_config/bigquery_options.py

Lines changed: 46 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,10 +16,11 @@
1616

1717
from __future__ import annotations
1818

19-
from typing import Literal, Optional
19+
from typing import Literal, Optional, Sequence, Tuple
2020
import warnings
2121

2222
import google.auth.credentials
23+
import requests.adapters
2324

2425
import bigframes.enums
2526
import bigframes.exceptions as bfe
@@ -90,6 +91,9 @@ def __init__(
9091
allow_large_results: bool = False,
9192
ordering_mode: Literal["strict", "partial"] = "strict",
9293
client_endpoints_override: Optional[dict] = None,
94+
requests_transport_adapters: Sequence[
95+
Tuple[str, requests.adapters.BaseAdapter]
96+
] = (),
9397
):
9498
self._credentials = credentials
9599
self._project = project
@@ -100,6 +104,7 @@ def __init__(
100104
self._kms_key_name = kms_key_name
101105
self._skip_bq_connection_check = skip_bq_connection_check
102106
self._allow_large_results = allow_large_results
107+
self._requests_transport_adapters = requests_transport_adapters
103108
self._session_started = False
104109
# Determines the ordering strictness for the session.
105110
self._ordering_mode = _validate_ordering_mode(ordering_mode)
@@ -379,3 +384,43 @@ def client_endpoints_override(self, value: dict):
379384
)
380385

381386
self._client_endpoints_override = value
387+
388+
@property
389+
def requests_transport_adapters(
390+
self,
391+
) -> Sequence[Tuple[str, requests.adapters.BaseAdapter]]:
392+
"""Transport adapters for requests-based REST clients such as the
393+
google-cloud-bigquery package.
394+
395+
For more details, see the explanation in `requests guide to transport
396+
adapters
397+
<https://requests.readthedocs.io/en/latest/user/advanced/#transport-adapters>`_.
398+
399+
**Examples:**
400+
401+
Increase the connection pool size using the requests `HTTPAdapter
402+
<https://requests.readthedocs.io/en/latest/api/#requests.adapters.HTTPAdapter>`_.
403+
404+
>>> import bigframes.pandas as bpd
405+
>>> bpd.options.bigquery.requests_transport_adapters = (
406+
... ("http://", requests.adapters.HTTPAdapter(pool_maxsize=100)),
407+
... ("https://", requests.adapters.HTTPAdapter(pool_maxsize=100)),
408+
... ) # doctest: +SKIP
409+
410+
Returns:
411+
Sequence[Tuple[str, requests.adapters.BaseAdapter]]:
412+
Prefixes and corresponding transport adapters to `mount
413+
<https://requests.readthedocs.io/en/latest/api/#requests.Session.mount>`_
414+
in requests-based REST clients.
415+
"""
416+
return self._requests_transport_adapters
417+
418+
@requests_transport_adapters.setter
419+
def requests_transport_adapters(
420+
self, value: Sequence[Tuple[str, requests.adapters.BaseAdapter]]
421+
) -> None:
422+
if self._session_started and self._requests_transport_adapters != value:
423+
raise ValueError(
424+
SESSION_STARTED_MESSAGE.format(attribute="requests_transport_adapters")
425+
)
426+
self._requests_transport_adapters = value

bigframes/bigquery/__init__.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@
3737
json_extract,
3838
json_extract_array,
3939
json_extract_string_array,
40+
json_query,
4041
json_set,
4142
json_value,
4243
parse_json,
@@ -58,10 +59,11 @@
5859
"st_distance",
5960
"st_intersection",
6061
# json ops
61-
"json_set",
6262
"json_extract",
6363
"json_extract_array",
6464
"json_extract_string_array",
65+
"json_query",
66+
"json_set",
6567
"json_value",
6668
"parse_json",
6769
# search ops

bigframes/bigquery/_operations/json.py

Lines changed: 45 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -22,9 +22,11 @@
2222
from __future__ import annotations
2323

2424
from typing import Any, cast, Optional, Sequence, Tuple, Union
25+
import warnings
2526

2627
import bigframes.core.utils as utils
2728
import bigframes.dtypes
29+
import bigframes.exceptions as bfe
2830
import bigframes.operations as ops
2931
import bigframes.series as series
3032

@@ -87,9 +89,13 @@ def json_extract(
8789
input: series.Series,
8890
json_path: str,
8991
) -> series.Series:
90-
"""Extracts a JSON value and converts it to a SQL JSON-formatted `STRING` or `JSON`
91-
value. This function uses single quotes and brackets to escape invalid JSONPath
92-
characters in JSON keys.
92+
"""Extracts a JSON value and converts it to a SQL JSON-formatted ``STRING`` or
93+
``JSON`` value. This function uses single quotes and brackets to escape invalid
94+
JSONPath characters in JSON keys.
95+
96+
.. deprecated:: 2.5.0
97+
The ``json_extract`` function is deprecated and will be removed in a future version.
98+
Use ``json_query`` instead.
9399
94100
**Examples:**
95101
@@ -111,6 +117,11 @@ def json_extract(
111117
Returns:
112118
bigframes.series.Series: A new Series with the JSON or JSON-formatted STRING.
113119
"""
120+
msg = (
121+
"The `json_extract` function is deprecated and will be removed in a future version. "
122+
"Use `json_query` instead."
123+
)
124+
warnings.warn(bfe.format_message(msg), category=UserWarning)
114125
return input._apply_unary_op(ops.JSONExtract(json_path=json_path))
115126

116127

@@ -231,6 +242,37 @@ def json_extract_string_array(
231242
return array_series
232243

233244

245+
def json_query(
246+
input: series.Series,
247+
json_path: str,
248+
) -> series.Series:
249+
"""Extracts a JSON value and converts it to a SQL JSON-formatted ``STRING``
250+
or ``JSON`` value. This function uses double quotes to escape invalid JSONPath
251+
characters in JSON keys. For example: ``"a.b"``.
252+
253+
**Examples:**
254+
255+
>>> import bigframes.pandas as bpd
256+
>>> import bigframes.bigquery as bbq
257+
>>> bpd.options.display.progress_bar = None
258+
259+
>>> s = bpd.Series(['{"class": {"students": [{"id": 5}, {"id": 12}]}}'])
260+
>>> bbq.json_query(s, json_path="$.class")
261+
0 {"students":[{"id":5},{"id":12}]}
262+
dtype: string
263+
264+
Args:
265+
input (bigframes.series.Series):
266+
The Series containing JSON data (as native JSON objects or JSON-formatted strings).
267+
json_path (str):
268+
The JSON path identifying the data that you want to obtain from the input.
269+
270+
Returns:
271+
bigframes.series.Series: A new Series with the JSON or JSON-formatted STRING.
272+
"""
273+
return input._apply_unary_op(ops.JSONQuery(json_path=json_path))
274+
275+
234276
def json_value(
235277
input: series.Series,
236278
json_path: str,

bigframes/bigquery/_operations/sql.py

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020

2121
import google.cloud.bigquery
2222

23+
import bigframes.core.compile.sqlglot.sqlglot_ir as sqlglot_ir
2324
import bigframes.core.sql
2425
import bigframes.dataframe
2526
import bigframes.dtypes
@@ -72,16 +73,16 @@ def sql_scalar(
7273
# Another benefit of this is that if there is a syntax error in the SQL
7374
# template, then this will fail with an error earlier in the process,
7475
# aiding users in debugging.
75-
base_series = columns[0]
76-
literals = [
77-
bigframes.dtypes.bigframes_dtype_to_literal(column.dtype) for column in columns
76+
literals_sql = [
77+
sqlglot_ir._literal(None, column.dtype).sql(dialect="bigquery")
78+
for column in columns
7879
]
79-
literals_sql = [bigframes.core.sql.simple_literal(literal) for literal in literals]
80+
select_sql = sql_template.format(*literals_sql)
81+
dry_run_sql = f"SELECT {select_sql}"
8082

8183
# Use the executor directly, because we want the original column IDs, not
8284
# the user-friendly column names that block.to_sql_query() would produce.
83-
select_sql = sql_template.format(*literals_sql)
84-
dry_run_sql = f"SELECT {select_sql}"
85+
base_series = columns[0]
8586
bqclient = base_series._session.bqclient
8687
job = bqclient.query(
8788
dry_run_sql, job_config=google.cloud.bigquery.QueryJobConfig(dry_run=True)

bigframes/constants.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -128,4 +128,8 @@
128128
# BigQuery default is 10000, leave 100 for overhead
129129
MAX_COLUMNS = 9900
130130

131+
# BigQuery has 1 MB query size limit. Don't want to take up more than a few % of that inlining a table.
132+
# Also must assume that text encoding as literals is much less efficient than in-memory representation.
133+
MAX_INLINE_BYTES = 5000
134+
131135
SUGGEST_PEEK_PREVIEW = "Use .peek(n) to preview n arbitrary rows."

bigframes/core/array_value.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -133,8 +133,17 @@ def from_table(
133133
ordering=ordering,
134134
n_rows=n_rows,
135135
)
136+
return cls.from_bq_data_source(source_def, scan_list, session)
137+
138+
@classmethod
139+
def from_bq_data_source(
140+
cls,
141+
source: nodes.BigqueryDataSource,
142+
scan_list: nodes.ScanList,
143+
session: Session,
144+
):
136145
node = nodes.ReadTableNode(
137-
source=source_def,
146+
source=source,
138147
scan_list=scan_list,
139148
table_session=session,
140149
)

bigframes/core/bigframe_node.py

Lines changed: 3 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@
2222
import typing
2323
from typing import Callable, Dict, Generator, Iterable, Mapping, Sequence, Set, Tuple
2424

25-
from bigframes.core import identifiers
25+
from bigframes.core import field, identifiers
2626
import bigframes.core.schema as schemata
2727
import bigframes.dtypes
2828

@@ -34,23 +34,6 @@
3434
T = typing.TypeVar("T")
3535

3636

37-
@dataclasses.dataclass(frozen=True)
38-
class Field:
39-
id: identifiers.ColumnId
40-
dtype: bigframes.dtypes.Dtype
41-
# Best effort, nullable=True if not certain
42-
nullable: bool = True
43-
44-
def with_nullable(self) -> Field:
45-
return Field(self.id, self.dtype, nullable=True)
46-
47-
def with_nonnull(self) -> Field:
48-
return Field(self.id, self.dtype, nullable=False)
49-
50-
def with_id(self, id: identifiers.ColumnId) -> Field:
51-
return Field(id, self.dtype, nullable=self.nullable)
52-
53-
5437
@dataclasses.dataclass(eq=False, frozen=True)
5538
class BigFrameNode:
5639
"""
@@ -162,7 +145,7 @@ def roots(self) -> typing.Set[BigFrameNode]:
162145
# TODO: Store some local data lazily for select, aggregate nodes.
163146
@property
164147
@abc.abstractmethod
165-
def fields(self) -> Sequence[Field]:
148+
def fields(self) -> Sequence[field.Field]:
166149
...
167150

168151
@property
@@ -292,7 +275,7 @@ def _dtype_lookup(self) -> dict[identifiers.ColumnId, bigframes.dtypes.Dtype]:
292275
return {field.id: field.dtype for field in self.fields}
293276

294277
@functools.cached_property
295-
def field_by_id(self) -> Mapping[identifiers.ColumnId, Field]:
278+
def field_by_id(self) -> Mapping[identifiers.ColumnId, field.Field]:
296279
return {field.id: field for field in self.fields}
297280

298281
# Plan algorithms

bigframes/core/blocks.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2166,7 +2166,7 @@ def merge(
21662166
result_columns.append(get_column_left[col_id])
21672167
for col_id in other.value_columns:
21682168
if col_id in right_join_ids:
2169-
if other.col_id_to_label[matching_right_id] in matching_join_labels:
2169+
if other.col_id_to_label[col_id] in matching_join_labels:
21702170
pass
21712171
else:
21722172
result_columns.append(get_column_right[col_id])

bigframes/core/compile/scalar_op_compiler.py

Lines changed: 16 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1356,6 +1356,19 @@ def json_extract_string_array_op_impl(
13561356
return json_extract_string_array(json_obj=x, json_path=op.json_path)
13571357

13581358

1359+
@scalar_op_compiler.register_unary_op(ops.JSONQuery, pass_op=True)
1360+
def json_query_op_impl(x: ibis_types.Value, op: ops.JSONQuery):
1361+
# Define a user-defined function whose returned type is dynamically matching the input.
1362+
def json_query(json_or_json_string, json_path: ibis_dtypes.str): # type: ignore
1363+
"""Extracts a JSON value and converts it to a SQL JSON-formatted STRING or JSON value."""
1364+
...
1365+
1366+
return_type = x.type()
1367+
json_query.__annotations__["return"] = return_type
1368+
json_query_op = ibis_udf.scalar.builtin(json_query)
1369+
return json_query_op(json_or_json_string=x, json_path=op.json_path)
1370+
1371+
13591372
@scalar_op_compiler.register_unary_op(ops.ParseJSON, pass_op=True)
13601373
def parse_json_op_impl(x: ibis_types.Value, op: ops.ParseJSON):
13611374
return parse_json(json_str=x)
@@ -1935,34 +1948,18 @@ def clip_op(
19351948
if isinstance(lower, ibis_types.NullScalar) and (
19361949
not isinstance(upper, ibis_types.NullScalar)
19371950
):
1938-
return (
1939-
ibis_api.case() # type: ignore
1940-
.when(upper.isnull() | (original > upper), upper)
1941-
.else_(original)
1942-
.end()
1943-
)
1951+
return ibis_api.least(original, upper)
19441952
elif (not isinstance(lower, ibis_types.NullScalar)) and isinstance(
19451953
upper, ibis_types.NullScalar
19461954
):
1947-
return (
1948-
ibis_api.case() # type: ignore
1949-
.when(lower.isnull() | (original < lower), lower)
1950-
.else_(original)
1951-
.end()
1952-
)
1955+
return ibis_api.greatest(original, lower)
19531956
elif isinstance(lower, ibis_types.NullScalar) and (
19541957
isinstance(upper, ibis_types.NullScalar)
19551958
):
19561959
return original
19571960
else:
19581961
# Note: Pandas has unchanged behavior when upper bound and lower bound are flipped. This implementation requires that lower_bound < upper_bound
1959-
return (
1960-
ibis_api.case() # type: ignore
1961-
.when(lower.isnull() | (original < lower), lower)
1962-
.when(upper.isnull() | (original > upper), upper)
1963-
.else_(original)
1964-
.end()
1965-
)
1962+
return ibis_api.greatest(ibis_api.least(original, upper), lower)
19661963

19671964

19681965
# N-ary Operations

0 commit comments

Comments
 (0)