
Commit 7a426d8

milkshakeiii authored and Genesis929 committed
feat: Support max_columns in repr and make repr more efficient (#515)
1 parent e21882e commit 7a426d8
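
In user terms, the change makes both display options bound the preview that repr() downloads. A minimal usage sketch (the public-dataset table name and option values are only illustrative, and a configured GCP project with application-default credentials is assumed):

    import bigframes
    import bigframes.pandas as bpd

    # Both display options now limit the preview that repr() fetches.
    bigframes.options.display.max_rows = 10      # illustrative value
    bigframes.options.display.max_columns = 5    # illustrative value

    df = bpd.read_gbq("bigquery-public-data.ml_datasets.penguins")  # example table
    print(repr(df))  # shows at most 10 rows x 5 columns; footer reports the full shape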

File tree

5 files changed: +70 -65 lines changed

bigframes/core/blocks.py

Lines changed: 29 additions & 13 deletions
@@ -464,6 +464,23 @@ def to_pandas_batches(self):
             self._copy_index_to_pandas(df)
             yield df
 
+    def download_pandas_preview(
+        self, max_rows: int
+    ) -> Tuple[pd.DataFrame, bigquery.QueryJob]:
+        """Download one page of results and return the query job."""
+        dtypes = dict(zip(self.index_columns, self.index.dtypes))
+        dtypes.update(zip(self.value_columns, self.dtypes))
+        results_iterator, query_job = self.session._execute(
+            self.expr, sorted=True, max_results=max_rows
+        )
+        arrow_results_iterator = results_iterator.to_arrow_iterable()
+        arrow_table = next(arrow_results_iterator)
+        downloaded_df = bigframes.session._io.pandas.arrow_to_pandas(
+            arrow_table, dtypes
+        )
+        self._copy_index_to_pandas(downloaded_df)
+        return downloaded_df, query_job
+
     def _copy_index_to_pandas(self, df: pd.DataFrame):
         """Set the index on pandas DataFrame to match this block.
 
@@ -1294,26 +1311,25 @@ def _forward_slice(self, start: int = 0, stop=None, step: int = 1):
     # queries.
     @functools.cache
     def retrieve_repr_request_results(
-        self, max_results: int
-    ) -> Tuple[pd.DataFrame, int, bigquery.QueryJob]:
+        self, max_results: int, max_columns: int
+    ) -> Tuple[pd.DataFrame, Tuple[int, int], bigquery.QueryJob]:
         """
         Retrieves a pandas dataframe containing only max_results many rows for use
         with printing methods.
 
-        Returns a tuple of the dataframe and the overall number of rows of the query.
+        Returns a tuple of the dataframe preview for printing and the overall number
+        of rows and columns of the table, as well as the query job used.
         """
-        # TODO(swast): Select a subset of columns if max_columns is less than the
-        # number of columns in the schema.
-        count = self.shape[0]
-        if count > max_results:
-            head_block = self.slice(0, max_results)
-        else:
-            head_block = self
-        computed_df, query_job = head_block.to_pandas()
-        formatted_df = computed_df.set_axis(self.column_labels, axis=1)
+        pandas_df, query_job = self.download_pandas_preview(max_results)
+        row_count = self.session._get_table_row_count(query_job.destination)
+        column_count = len(self.value_columns)
+
+        formatted_df = pandas_df.set_axis(self.column_labels, axis=1)
         # we reset the axis and substitute the bf index name for the default
         formatted_df.index.name = self.index.name
-        return formatted_df, count, query_job
+        # limit column count
+        formatted_df = formatted_df.iloc[:, 0:max_columns]
+        return formatted_df, (row_count, column_count), query_job
 
     def promote_offsets(self, label: Label = None) -> typing.Tuple[Block, str]:
         result_id = guid.generate_guid()
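
For orientation, a hedged sketch of how the reworked Block API above would be exercised; `block` stands in for any existing Block instance and the values are illustrative:

    # Sketch only: the new signature and return shape from the diff above.
    preview_df, (row_count, column_count), query_job = block.retrieve_repr_request_results(
        max_results=10, max_columns=5
    )
    # preview_df holds at most 10 rows and 5 columns (trimmed with .iloc),
    # while (row_count, column_count) is the full table shape taken from the
    # destination table's metadata and the block's value columns.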

bigframes/core/indexes/index.py

Lines changed: 5 additions & 5 deletions
@@ -205,17 +205,17 @@ def query_job(self) -> Optional[bigquery.QueryJob]:
         return self._query_job
 
     def __repr__(self) -> str:
-        # TODO(swast): Add a timeout here? If the query is taking a long time,
-        # maybe we just print the job metadata that we have so far?
-        # TODO(swast): Avoid downloading the whole series by using job
-        # metadata, like we do with DataFrame.
         opts = bigframes.options.display
         max_results = opts.max_rows
+        max_columns = opts.max_columns
         if opts.repr_mode == "deferred":
             return formatter.repr_query_job(self.query_job)
 
-        pandas_df, _, query_job = self._block.retrieve_repr_request_results(max_results)
+        pandas_df, _, query_job = self._block.retrieve_repr_request_results(
+            max_results, max_columns
+        )
         self._query_job = query_job
+
         return repr(pandas_df.index)
 
     def copy(self, name: Optional[Hashable] = None):

bigframes/dataframe.py

Lines changed: 26 additions & 40 deletions
@@ -579,71 +579,57 @@ def __setattr__(self, key: str, value):
         object.__setattr__(self, key, value)
 
     def __repr__(self) -> str:
-        """Converts a DataFrame to a string. Calls to_pandas.
+        """Converts a DataFrame to a string using pandas dataframe __repr__.
 
-        Only represents the first `bigframes.options.display.max_rows`.
+        Only represents the first `bigframes.options.display.max_rows`
+        and `bigframes.options.display.max_columns`.
         """
-        opts = bigframes.options.display
-        max_results = opts.max_rows
-        if opts.repr_mode == "deferred":
+        if bigframes.options.display.repr_mode == "deferred":
             return formatter.repr_query_job(self.query_job)
 
-        self._cached()
-        # TODO(swast): pass max_columns and get the true column count back. Maybe
-        # get 1 more column than we have requested so that pandas can add the
-        # ... for us?
-        pandas_df, row_count, query_job = self._block.retrieve_repr_request_results(
-            max_results
-        )
-
-        self._set_internal_query_job(query_job)
-
-        column_count = len(pandas_df.columns)
-
-        with display_options.pandas_repr(opts):
+        pandas_df, shape = self._perform_repr_request()
+        with display_options.pandas_repr(bigframes.options.display):
             repr_string = repr(pandas_df)
 
         # Modify the end of the string to reflect count.
         lines = repr_string.split("\n")
         pattern = re.compile("\\[[0-9]+ rows x [0-9]+ columns\\]")
         if pattern.match(lines[-1]):
             lines = lines[:-2]
-
-        if row_count > len(lines) - 1:
+        if shape[0] > len(lines) - 1:
             lines.append("...")
-
         lines.append("")
-        lines.append(f"[{row_count} rows x {column_count} columns]")
+        lines.append(f"[{shape[0]} rows x {shape[1]} columns]")
         return "\n".join(lines)
 
+    def _perform_repr_request(self) -> Tuple[pandas.DataFrame, Tuple[int, int]]:
+        max_results = bigframes.options.display.max_rows
+        max_columns = bigframes.options.display.max_columns
+        self._cached()
+        pandas_df, shape, query_job = self._block.retrieve_repr_request_results(
+            max_results, max_columns
+        )
+        self._set_internal_query_job(query_job)
+        return pandas_df, shape
+
     def _repr_html_(self) -> str:
         """
         Returns an html string primarily for use by notebooks for displaying
-        a representation of the DataFrame. Displays 20 rows by default since
-        many notebooks are not configured for large tables.
+        a representation of the DataFrame. Displays at most the number of rows
+        and columns given by `bigframes.options.display.max_rows` and
+        `bigframes.options.display.max_columns`.
         """
-        opts = bigframes.options.display
-        max_results = bigframes.options.display.max_rows
-        if opts.repr_mode == "deferred":
-            return formatter.repr_query_job_html(self.query_job)
 
-        self._cached()
-        # TODO(swast): pass max_columns and get the true column count back. Maybe
-        # get 1 more column than we have requested so that pandas can add the
-        # ... for us?
-        pandas_df, row_count, query_job = self._block.retrieve_repr_request_results(
-            max_results
-        )
-
-        self._set_internal_query_job(query_job)
+        if bigframes.options.display.repr_mode == "deferred":
+            return formatter.repr_query_job_html(self.query_job)
 
-        column_count = len(pandas_df.columns)
+        pandas_df, shape = self._perform_repr_request()
 
-        with display_options.pandas_repr(opts):
+        with display_options.pandas_repr(bigframes.options.display):
             # _repr_html_ stub is missing so mypy thinks it's a Series. Ignore mypy.
             html_string = pandas_df._repr_html_()  # type:ignore
 
-        html_string += f"[{row_count} rows x {column_count} columns in total]"
+        html_string += f"[{shape[0]} rows x {shape[1]} columns in total]"
         return html_string
 
     def __setitem__(self, key: str, value: SingleItemValue):
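
The footer rewriting in `__repr__` can be illustrated standalone. A minimal sketch with made-up counts, showing how the pandas-generated shape line is stripped and replaced with the true table shape:

    import re

    repr_string = "   a  b\n0  1  2\n1  3  4\n\n[2 rows x 2 columns]"  # what pandas printed
    shape = (1000, 7)  # made-up true (row_count, column_count) reported by the block

    lines = repr_string.split("\n")
    pattern = re.compile("\\[[0-9]+ rows x [0-9]+ columns\\]")
    if pattern.match(lines[-1]):
        lines = lines[:-2]           # drop the pandas footer and the blank line before it
    if shape[0] > len(lines) - 1:    # more rows exist than are shown (minus the header line)
        lines.append("...")
    lines.append("")
    lines.append(f"[{shape[0]} rows x {shape[1]} columns]")
    print("\n".join(lines))         # preview rows followed by "[1000 rows x 7 columns]"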

bigframes/series.py

Lines changed: 4 additions & 5 deletions
@@ -272,17 +272,16 @@ def reset_index(
         return bigframes.dataframe.DataFrame(block)
 
     def __repr__(self) -> str:
-        # TODO(swast): Add a timeout here? If the query is taking a long time,
-        # maybe we just print the job metadata that we have so far?
-        # TODO(swast): Avoid downloading the whole series by using job
-        # metadata, like we do with DataFrame.
         opts = bigframes.options.display
         max_results = opts.max_rows
+        max_columns = opts.max_columns
         if opts.repr_mode == "deferred":
             return formatter.repr_query_job(self.query_job)
 
         self._cached()
-        pandas_df, _, query_job = self._block.retrieve_repr_request_results(max_results)
+        pandas_df, _, query_job = self._block.retrieve_repr_request_results(
+            max_results, max_columns
+        )
         self._set_internal_query_job(query_job)
 
         return repr(pandas_df.iloc[:, 0])

bigframes/session/__init__.py

Lines changed: 6 additions & 2 deletions
@@ -1831,6 +1831,7 @@ def _execute(
         sorted: bool = True,
         dry_run=False,
         col_id_overrides: Mapping[str, str] = {},
+        max_results: Optional[int] = None,
     ) -> tuple[bigquery.table.RowIterator, bigquery.QueryJob]:
         sql = self._to_sql(
             array_value, sorted=sorted, col_id_overrides=col_id_overrides
@@ -1840,8 +1841,7 @@ def _execute(
         else:
             job_config.dry_run = dry_run
         return self._start_query(
-            sql=sql,
-            job_config=job_config,
+            sql=sql, job_config=job_config, max_results=max_results
         )
 
     def _peek(
@@ -1886,6 +1886,10 @@ def _get_table_size(self, destination_table):
         table = self.bqclient.get_table(destination_table)
         return table.num_bytes
 
+    def _get_table_row_count(self, destination_table) -> int:
+        table = self.bqclient.get_table(destination_table)
+        return table.num_rows
+
     def _rows_to_dataframe(
         self, row_iterator: bigquery.table.RowIterator, dtypes: Dict
     ) -> pandas.DataFrame:
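
The row count used by the repr footer now comes from metadata on the query's destination table rather than from a separate count over the data. A hedged standalone sketch of the same lookup with the BigQuery client (assumes application-default credentials and a default project):

    from google.cloud import bigquery

    client = bigquery.Client()
    query_job = client.query("SELECT 1 AS x")  # any query; results land in a temp table
    query_job.result()                         # wait so the destination table exists
    table = client.get_table(query_job.destination)
    print(table.num_rows)                      # row count from table metadata, no COUNT(*)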
