diff --git a/.kokoro/continuous/notebook.cfg b/.kokoro/continuous/notebook.cfg index 94e2a3c686..ca3d98b58b 100644 --- a/.kokoro/continuous/notebook.cfg +++ b/.kokoro/continuous/notebook.cfg @@ -6,6 +6,11 @@ env_vars: { value: "notebook" } +env_vars: { + key: "BENCHMARK_AND_PUBLISH" + value: "true" +} + env_vars: { key: "GOOGLE_CLOUD_PROJECT" value: "bigframes-load-testing" diff --git a/.kokoro/load/benchmark.cfg b/.kokoro/load/benchmark.cfg index a489e05bbc..bc2d7a2655 100644 --- a/.kokoro/load/benchmark.cfg +++ b/.kokoro/load/benchmark.cfg @@ -6,6 +6,11 @@ env_vars: { value: "benchmark" } +env_vars: { + key: "BENCHMARK_AND_PUBLISH" + value: "true" +} + env_vars: { key: "GOOGLE_CLOUD_PROJECT" value: "bigframes-load-testing" diff --git a/bigframes/session/metrics.py b/bigframes/session/metrics.py index 85a7f6aa4b..579cac1ac3 100644 --- a/bigframes/session/metrics.py +++ b/bigframes/session/metrics.py @@ -33,16 +33,18 @@ class ExecutionMetrics: def count_job_stats(self, query_job: bq_job.QueryJob): stats = get_performance_stats(query_job) if stats is not None: - bytes_processed, slot_millis = stats + bytes_processed, slot_millis, exec_seconds = stats self.execution_count += 1 self.bytes_processed += bytes_processed self.slot_millis += slot_millis if LOGGING_NAME_ENV_VAR in os.environ: # when running notebooks via pytest nbmake - write_stats_to_disk(bytes_processed, slot_millis) + write_stats_to_disk(bytes_processed, slot_millis, exec_seconds) -def get_performance_stats(query_job: bigquery.QueryJob) -> Optional[Tuple[int, int]]: +def get_performance_stats( + query_job: bigquery.QueryJob, +) -> Optional[Tuple[int, int, Optional[float]]]: """Parse the query job for performance stats. Return None if the stats do not reflect real work done in bigquery. @@ -57,14 +59,21 @@ def get_performance_stats(query_job: bigquery.QueryJob) -> Optional[Tuple[int, i slot_millis = query_job.slot_millis if not isinstance(slot_millis, int): return None # filter out mocks + if query_job.configuration.dry_run: # dry run stats are just predictions of the real run slot_millis = 0 - return bytes_processed, slot_millis + exec_seconds = ( + (query_job.ended - query_job.created).total_seconds() + if query_job.created is not None and query_job.ended is not None + else None + ) + + return bytes_processed, slot_millis, exec_seconds -def write_stats_to_disk(bytes_processed: int, slot_millis: int): +def write_stats_to_disk( + bytes_processed: int, slot_millis: int, exec_seconds: Optional[float] +): """For pytest runs only, log information about the query job to a file in order to create a performance report.
""" @@ -83,6 +92,13 @@ def write_stats_to_disk(bytes_processed: int, slot_millis: int): f.write(str(bytes_processed) + "\n") # store slot milliseconds - bytes_file = os.path.join(current_directory, test_name + ".slotmillis") - with open(bytes_file, "a") as f: + slot_file = os.path.join(current_directory, test_name + ".slotmillis") + with open(slot_file, "a") as f: f.write(str(slot_millis) + "\n") + + # store execution time seconds + exec_time_file = os.path.join( + current_directory, test_name + ".bq_exec_time_seconds" + ) + with open(exec_time_file, "a") as f: + f.write(str(exec_seconds) + "\n") diff --git a/noxfile.py b/noxfile.py index c464b47270..9ed85290fa 100644 --- a/noxfile.py +++ b/noxfile.py @@ -16,10 +16,8 @@ from __future__ import absolute_import -from multiprocessing import Process import os import pathlib -from pathlib import Path import re import shutil from typing import Dict, List @@ -42,7 +40,6 @@ "third_party", "noxfile.py", "setup.py", - os.path.join("scripts", "benchmark"), ] DEFAULT_PYTHON_VERSION = "3.10" @@ -686,7 +683,7 @@ def notebook(session: nox.Session): "seaborn", ) - notebooks_list = list(Path("notebooks/").glob("*/*.ipynb")) + notebooks_list = list(pathlib.Path("notebooks/").glob("*/*.ipynb")) denylist = [ # Regionalized testing is manually added later. @@ -698,7 +695,7 @@ def notebook(session: nox.Session): # With the notebooks_fill_params.py script, we are able to find and # replace the PROJECT_ID parameter, but not the others. # - # TODO(ashleyxu): Test these notebooks by replacing parameters with + # TODO(b/357904266): Test these notebooks by replacing parameters with # appropriate values and omitting cleanup logic that may break # our test infrastructure. "notebooks/getting_started/ml_fundamentals_bq_dataframes.ipynb", # Needs DATASET. @@ -748,17 +745,6 @@ def notebook(session: nox.Session): for nb in notebooks + list(notebooks_reg): assert os.path.exists(nb), nb - # TODO(shobs): For some reason --retries arg masks exceptions occurred in - # notebook failures, and shows unhelpful INTERNALERROR. Investigate that - # and enable retries if we can find a way to surface the real exception - # bacause the notebook is running against real GCP and something may fail - # due to transient issues. - pytest_command = [ - "py.test", - "--nbmake", - "--nbmake-timeout=900", # 15 minutes - ] - try: # Populate notebook parameters and make a backup so that the notebooks # are runnable. @@ -767,22 +753,23 @@ def notebook(session: nox.Session): CURRENT_DIRECTORY / "scripts" / "notebooks_fill_params.py", *notebooks, ) - - # Run notebooks in parallel session.run's, since each notebook - # takes an environment variable for performance logging - processes = [] for notebook in notebooks: - process = Process( - target=session.run, - args=(*pytest_command, notebook), - kwargs={"env": {LOGGING_NAME_ENV_VAR: os.path.basename(notebook)}}, + session.run( + "python", + "scripts/run_and_publish_benchmark.py", + "--notebook", + f"--benchmark-path={notebook}", ) - process.start() - processes.append(process) - - for process in processes: - process.join() + for notebook, regions in notebooks_reg.items(): + for region in regions: + session.run( + "python", + "scripts/run_and_publish_benchmark.py", + "--notebook", + f"--benchmark-path={notebook}", + f"--region={region}", + ) finally: # Prevent our notebook changes from getting checked in to git # accidentally. 
@@ -791,116 +778,37 @@ def notebook(session: nox.Session): CURRENT_DIRECTORY / "scripts" / "notebooks_restore_from_backup.py", *notebooks, ) - - # Additionally run regionalized notebooks in parallel session.run's. - # Each notebook takes a different region via env param. - processes = [] - for notebook, regions in notebooks_reg.items(): - for region in regions: - process = Process( - target=session.run, - args=(*pytest_command, notebook), - kwargs={ - "env": { - "BIGQUERY_LOCATION": region, - LOGGING_NAME_ENV_VAR: os.path.basename(notebook), - } - }, - ) - process.start() - processes.append(process) - - for process in processes: - process.join() - - # when the environment variable is set as it is above, - # notebooks output a .bytesprocessed and .slotmillis report - # collect those reports and print a summary - _print_performance_report("notebooks/") + session.run( + "python", + "scripts/run_and_publish_benchmark.py", + "--notebook", + "--publish-benchmarks=notebooks/", + ) @nox.session(python=DEFAULT_PYTHON_VERSION) def benchmark(session: nox.Session): session.install("-e", ".[all]") - base_path = os.path.join("scripts", "benchmark") - - benchmark_script_list = list(Path(base_path).rglob("*.py")) - # Run benchmarks in parallel session.run's, since each benchmark - # takes an environment variable for performance logging - processes = [] - for benchmark in benchmark_script_list: - process = Process( - target=session.run, - args=("python", benchmark), - kwargs={"env": {LOGGING_NAME_ENV_VAR: benchmark.as_posix()}}, - ) - process.start() - processes.append(process) - - for process in processes: - process.join() - - # when the environment variable is set as it is above, - # notebooks output a .bytesprocessed and .slotmillis report - # collect those reports and print a summary - _print_performance_report(base_path) + base_path = os.path.join("tests", "benchmark") + benchmark_script_list = list(pathlib.Path(base_path).rglob("*.py")) -def _print_performance_report(path: str): - """Add an informational report about http queries, bytes - processed, and slot time to the testlog output for purposes - of measuring bigquery-related performance changes. - - Looks specifically for output files in subfolders of the - passed path. (*/*.bytesprocessed and */*.slotmillis) - """ - print("---BIGQUERY USAGE REPORT---") - results_dict = {} - bytes_reports = sorted(Path(path).rglob("*.bytesprocessed")) - for bytes_report in bytes_reports: - with open(bytes_report, "r") as bytes_file: - filename = bytes_report.relative_to(path).with_suffix("") - lines = bytes_file.read().splitlines() - query_count = len(lines) - total_bytes = sum([int(line) for line in lines]) - results_dict[filename] = [query_count, total_bytes] - os.remove(bytes_report) - - millis_reports = sorted(Path(path).rglob("*.slotmillis")) - for millis_report in millis_reports: - with open(millis_report, "r") as millis_file: - filename = millis_report.relative_to(path).with_suffix("") - lines = millis_file.read().splitlines() - total_slot_millis = sum([int(line) for line in lines]) - results_dict[filename] += [total_slot_millis] - os.remove(millis_report) - - cumulative_queries = 0 - cumulative_bytes = 0 - cumulative_slot_millis = 0 - for name, results in results_dict.items(): - if len(results) != 3: - raise IOError( - "Mismatch in performance logging output. " - "Expected one .bytesprocessed and one .slotmillis " - "file for each notebook." 
+ try: + for benchmark in benchmark_script_list: + if benchmark.name in ("__init__.py", "utils.py"): + continue + session.run( + "python", + "scripts/run_and_publish_benchmark.py", + f"--benchmark-path={benchmark}", ) - query_count, total_bytes, total_slot_millis = results - cumulative_queries += query_count - cumulative_bytes += total_bytes - cumulative_slot_millis += total_slot_millis - print( - f"{name} - query count: {query_count}," - f" bytes processed sum: {total_bytes}," - f" slot millis sum: {total_slot_millis}" + finally: + session.run( + "python", + "scripts/run_and_publish_benchmark.py", + f"--publish-benchmarks={base_path}", ) - print( - f"---total queries: {cumulative_queries}, " - f"total bytes: {cumulative_bytes}, " - f"total slot millis: {cumulative_slot_millis}---" - ) - @nox.session(python="3.10") def release_dry_run(session): diff --git a/scripts/benchmark/db-benchmark/groupby/G1_1e9_1e2_5_0/q1.py b/scripts/benchmark/db-benchmark/groupby/G1_1e9_1e2_5_0/q1.py deleted file mode 100644 index cc5f77b49b..0000000000 --- a/scripts/benchmark/db-benchmark/groupby/G1_1e9_1e2_5_0/q1.py +++ /dev/null @@ -1,14 +0,0 @@ -# Contains code from https://github.com/duckdblabs/db-benchmark/blob/master/pandas/groupby-pandas.py - -import bigframes.pandas as bpd - -print("Groupby benchmark 1: sum v1 by id1") - -x = bpd.read_gbq("bigframes-dev-perf.dbbenchmark.G1_1e9_1e2_5_0") - -ans = x.groupby("id1", as_index=False, dropna=False).agg({"v1": "sum"}) -print(ans.shape) -chk = [ans["v1"].sum()] -print(chk) - -bpd.reset_session() diff --git a/scripts/benchmark/db-benchmark/groupby/G1_1e9_1e2_5_0/q10.py b/scripts/benchmark/db-benchmark/groupby/G1_1e9_1e2_5_0/q10.py deleted file mode 100644 index 83d5d4ee14..0000000000 --- a/scripts/benchmark/db-benchmark/groupby/G1_1e9_1e2_5_0/q10.py +++ /dev/null @@ -1,16 +0,0 @@ -# Contains code from https://github.com/duckdblabs/db-benchmark/blob/master/pandas/groupby-pandas.py - -import bigframes.pandas as bpd - -print("Groupby benchmark 10: sum v3 count by id1:id6") - -x = bpd.read_gbq("bigframes-dev-perf.dbbenchmark.G1_1e9_1e2_5_0") - -ans = x.groupby( - ["id1", "id2", "id3", "id4", "id5", "id6"], as_index=False, dropna=False -).agg({"v3": "sum", "v1": "size"}) -print(ans.shape) -chk = [ans["v3"].sum(), ans["v1"].sum()] -print(chk) - -bpd.reset_session() diff --git a/scripts/benchmark/db-benchmark/groupby/G1_1e9_1e2_5_0/q2.py b/scripts/benchmark/db-benchmark/groupby/G1_1e9_1e2_5_0/q2.py deleted file mode 100644 index 734a17242b..0000000000 --- a/scripts/benchmark/db-benchmark/groupby/G1_1e9_1e2_5_0/q2.py +++ /dev/null @@ -1,14 +0,0 @@ -# Contains code from https://github.com/duckdblabs/db-benchmark/blob/master/pandas/groupby-pandas.py - -import bigframes.pandas as bpd - -print("Groupby benchmark 2: sum v1 by id1:id2") - -x = bpd.read_gbq("bigframes-dev-perf.dbbenchmark.G1_1e9_1e2_5_0") - -ans = x.groupby(["id1", "id2"], as_index=False, dropna=False).agg({"v1": "sum"}) -print(ans.shape) -chk = [ans["v1"].sum()] -print(chk) - -bpd.reset_session() diff --git a/scripts/benchmark/db-benchmark/groupby/G1_1e9_1e2_5_0/q3.py b/scripts/benchmark/db-benchmark/groupby/G1_1e9_1e2_5_0/q3.py deleted file mode 100644 index 242902de64..0000000000 --- a/scripts/benchmark/db-benchmark/groupby/G1_1e9_1e2_5_0/q3.py +++ /dev/null @@ -1,14 +0,0 @@ -# Contains code from https://github.com/duckdblabs/db-benchmark/blob/master/pandas/groupby-pandas.py - -import bigframes.pandas as bpd - -print("Groupby benchmark 3: sum v1 mean v3 by id3") - -x = 
bpd.read_gbq("bigframes-dev-perf.dbbenchmark.G1_1e9_1e2_5_0") - -ans = x.groupby("id3", as_index=False, dropna=False).agg({"v1": "sum", "v3": "mean"}) -print(ans.shape) -chk = [ans["v1"].sum(), ans["v3"].sum()] -print(chk) - -bpd.reset_session() diff --git a/scripts/benchmark/db-benchmark/groupby/G1_1e9_1e2_5_0/q4.py b/scripts/benchmark/db-benchmark/groupby/G1_1e9_1e2_5_0/q4.py deleted file mode 100644 index e4f769545e..0000000000 --- a/scripts/benchmark/db-benchmark/groupby/G1_1e9_1e2_5_0/q4.py +++ /dev/null @@ -1,16 +0,0 @@ -# Contains code from https://github.com/duckdblabs/db-benchmark/blob/master/pandas/groupby-pandas.py - -import bigframes.pandas as bpd - -print("Groupby benchmark 4: mean v1:v3 by id4") - -x = bpd.read_gbq("bigframes-dev-perf.dbbenchmark.G1_1e9_1e2_5_0") - -ans = x.groupby("id4", as_index=False, dropna=False).agg( - {"v1": "mean", "v2": "mean", "v3": "mean"} -) -print(ans.shape) -chk = [ans["v1"].sum(), ans["v2"].sum(), ans["v3"].sum()] -print(chk) - -bpd.reset_session() diff --git a/scripts/benchmark/db-benchmark/groupby/G1_1e9_1e2_5_0/q5.py b/scripts/benchmark/db-benchmark/groupby/G1_1e9_1e2_5_0/q5.py deleted file mode 100644 index d34a6c055f..0000000000 --- a/scripts/benchmark/db-benchmark/groupby/G1_1e9_1e2_5_0/q5.py +++ /dev/null @@ -1,16 +0,0 @@ -# Contains code from https://github.com/duckdblabs/db-benchmark/blob/master/pandas/groupby-pandas.py - -import bigframes.pandas as bpd - -print("Groupby benchmark 5: sum v1:v3 by id6") - -x = bpd.read_gbq("bigframes-dev-perf.dbbenchmark.G1_1e9_1e2_5_0") - -ans = x.groupby("id6", as_index=False, dropna=False).agg( - {"v1": "sum", "v2": "sum", "v3": "sum"} -) -print(ans.shape) -chk = [ans["v1"].sum(), ans["v2"].sum(), ans["v3"].sum()] -print(chk) - -bpd.reset_session() diff --git a/scripts/benchmark/db-benchmark/groupby/G1_1e9_1e2_5_0/q6.py b/scripts/benchmark/db-benchmark/groupby/G1_1e9_1e2_5_0/q6.py deleted file mode 100644 index 0f3240a129..0000000000 --- a/scripts/benchmark/db-benchmark/groupby/G1_1e9_1e2_5_0/q6.py +++ /dev/null @@ -1,16 +0,0 @@ -# Contains code from https://github.com/duckdblabs/db-benchmark/blob/master/pandas/groupby-pandas.py - -import bigframes.pandas as bpd - -print("Groupby benchmark 6: median v3 sd v3 by id4 id5") - -x = bpd.read_gbq("bigframes-dev-perf.dbbenchmark.G1_1e9_1e2_5_0") - -ans = x.groupby(["id4", "id5"], as_index=False, dropna=False).agg( - {"v3": ["median", "std"]} -) -print(ans.shape) -chk = [ans["v3"]["median"].sum(), ans["v3"]["std"].sum()] -print(chk) - -bpd.reset_session() diff --git a/scripts/benchmark/db-benchmark/groupby/G1_1e9_1e2_5_0/q7.py b/scripts/benchmark/db-benchmark/groupby/G1_1e9_1e2_5_0/q7.py deleted file mode 100644 index 78e1e94b85..0000000000 --- a/scripts/benchmark/db-benchmark/groupby/G1_1e9_1e2_5_0/q7.py +++ /dev/null @@ -1,18 +0,0 @@ -# Contains code from https://github.com/duckdblabs/db-benchmark/blob/master/pandas/groupby-pandas.py - -import bigframes.pandas as bpd - -print("Groupby benchmark 7: max v1 - min v2 by id3") - -x = bpd.read_gbq("bigframes-dev-perf.dbbenchmark.G1_1e9_1e2_5_0") - -ans = ( - x.groupby("id3", as_index=False, dropna=False) - .agg({"v1": "max", "v2": "min"}) - .assign(range_v1_v2=lambda x: x["v1"] - x["v2"])[["id3", "range_v1_v2"]] -) -print(ans.shape) -chk = [ans["range_v1_v2"].sum()] -print(chk) - -bpd.reset_session() diff --git a/scripts/benchmark/db-benchmark/groupby/G1_1e9_1e2_5_0/q8.py b/scripts/benchmark/db-benchmark/groupby/G1_1e9_1e2_5_0/q8.py deleted file mode 100644 index 7a57d03efe..0000000000 --- 
a/scripts/benchmark/db-benchmark/groupby/G1_1e9_1e2_5_0/q8.py +++ /dev/null @@ -1,20 +0,0 @@ -# Contains code from https://github.com/duckdblabs/db-benchmark/blob/master/pandas/groupby-pandas.py - -import bigframes.pandas as bpd - -print("Groupby benchmark 8: largest two v3 by id6") - -x = bpd.read_gbq("bigframes-dev-perf.dbbenchmark.G1_1e9_1e2_5_0") - -ans = ( - x[~x["v3"].isna()][["id6", "v3"]] - .sort_values("v3", ascending=False) - .groupby("id6", as_index=False, dropna=False) - .head(2) -) -ans = ans.reset_index(drop=True) -print(ans.shape) -chk = [ans["v3"].sum()] -print(chk) - -bpd.reset_session() diff --git a/scripts/benchmark/db-benchmark/join/J1_1e9_NA_0_0/q1.py b/scripts/benchmark/db-benchmark/join/J1_1e9_NA_0_0/q1.py deleted file mode 100644 index 429dc72ad0..0000000000 --- a/scripts/benchmark/db-benchmark/join/J1_1e9_NA_0_0/q1.py +++ /dev/null @@ -1,16 +0,0 @@ -# Contains code from https://github.com/duckdblabs/db-benchmark/blob/master/pandas/join-pandas.py - -import bigframes.pandas as bpd - -print("Join benchmark 1: small inner on int") - -x = bpd.read_gbq("bigframes-dev-perf.dbbenchmark.J1_1e9_NA_0_0") -small = bpd.read_gbq("bigframes-dev-perf.dbbenchmark.J1_1e9_1e3_0_0") - -ans = x.merge(small, on="id1") -print(ans.shape) - -chk = [ans["v1"].sum(), ans["v2"].sum()] -print(chk) - -bpd.reset_session() diff --git a/scripts/benchmark/db-benchmark/join/J1_1e9_NA_0_0/q2.py b/scripts/benchmark/db-benchmark/join/J1_1e9_NA_0_0/q2.py deleted file mode 100644 index 210c29f844..0000000000 --- a/scripts/benchmark/db-benchmark/join/J1_1e9_NA_0_0/q2.py +++ /dev/null @@ -1,16 +0,0 @@ -# Contains code from https://github.com/duckdblabs/db-benchmark/blob/master/pandas/join-pandas.py - -import bigframes.pandas as bpd - -print("Join benchmark 2: medium inner on int") - -x = bpd.read_gbq("bigframes-dev-perf.dbbenchmark.J1_1e9_NA_0_0") -medium = bpd.read_gbq("bigframes-dev-perf.dbbenchmark.J1_1e9_1e6_0_0") - -ans = x.merge(medium, on="id2") -print(ans.shape) - -chk = [ans["v1"].sum(), ans["v2"].sum()] -print(chk) - -bpd.reset_session() diff --git a/scripts/benchmark/db-benchmark/join/J1_1e9_NA_0_0/q3.py b/scripts/benchmark/db-benchmark/join/J1_1e9_NA_0_0/q3.py deleted file mode 100644 index d88d943604..0000000000 --- a/scripts/benchmark/db-benchmark/join/J1_1e9_NA_0_0/q3.py +++ /dev/null @@ -1,16 +0,0 @@ -# Contains code from https://github.com/duckdblabs/db-benchmark/blob/master/pandas/join-pandas.py - -import bigframes.pandas as bpd - -print("Join benchmark 3: medium outer on int") - -x = bpd.read_gbq("bigframes-dev-perf.dbbenchmark.J1_1e9_NA_0_0") -medium = bpd.read_gbq("bigframes-dev-perf.dbbenchmark.J1_1e9_1e6_0_0") - -ans = x.merge(medium, how="left", on="id2") -print(ans.shape) - -chk = [ans["v1"].sum(), ans["v2"].sum()] -print(chk) - -bpd.reset_session() diff --git a/scripts/benchmark/db-benchmark/join/J1_1e9_NA_0_0/q4.py b/scripts/benchmark/db-benchmark/join/J1_1e9_NA_0_0/q4.py deleted file mode 100644 index 9167043d9a..0000000000 --- a/scripts/benchmark/db-benchmark/join/J1_1e9_NA_0_0/q4.py +++ /dev/null @@ -1,16 +0,0 @@ -# Contains code from https://github.com/duckdblabs/db-benchmark/blob/master/pandas/join-pandas.py - -import bigframes.pandas as bpd - -print("Join benchmark 4: medium inner on factor") - -x = bpd.read_gbq("bigframes-dev-perf.dbbenchmark.J1_1e9_NA_0_0") -medium = bpd.read_gbq("bigframes-dev-perf.dbbenchmark.J1_1e9_1e6_0_0") - -ans = x.merge(medium, on="id5") -print(ans.shape) - -chk = [ans["v1"].sum(), ans["v2"].sum()] -print(chk) - 
-bpd.reset_session() diff --git a/scripts/benchmark/db-benchmark/join/J1_1e9_NA_0_0/q5.py b/scripts/benchmark/db-benchmark/join/J1_1e9_NA_0_0/q5.py deleted file mode 100644 index 39eb23ac45..0000000000 --- a/scripts/benchmark/db-benchmark/join/J1_1e9_NA_0_0/q5.py +++ /dev/null @@ -1,16 +0,0 @@ -# Contains code from https://github.com/duckdblabs/db-benchmark/blob/master/pandas/join-pandas.py - -import bigframes.pandas as bpd - -print("Join benchmark 5: big inner on int") - -x = bpd.read_gbq("bigframes-dev-perf.dbbenchmark.J1_1e9_NA_0_0") -big = bpd.read_gbq("bigframes-dev-perf.dbbenchmark.J1_1e9_1e9_0_0") - -ans = x.merge(big, on="id3") -print(ans.shape) - -chk = [ans["v1"].sum(), ans["v2"].sum()] -print(chk) - -bpd.reset_session() diff --git a/scripts/benchmark/db-benchmark/sort/J1_1e9_NA_0_0/q1.py b/scripts/benchmark/db-benchmark/sort/J1_1e9_NA_0_0/q1.py deleted file mode 100644 index 45cac7b543..0000000000 --- a/scripts/benchmark/db-benchmark/sort/J1_1e9_NA_0_0/q1.py +++ /dev/null @@ -1,15 +0,0 @@ -# Contains code from https://github.com/duckdblabs/db-benchmark/blob/master/pandas/sort-pandas.py - -import bigframes.pandas as bpd - -print("Sort benchmark 1: sort by int id2") - -x = bpd.read_gbq("bigframes-dev-perf.dbbenchmark.J1_1e9_NA_0_0") - -ans = x.sort_values("id2") -print(ans.shape) - -chk = [ans["v1"].sum()] -print(chk) - -bpd.reset_session() diff --git a/scripts/run_and_publish_benchmark.py b/scripts/run_and_publish_benchmark.py new file mode 100644 index 0000000000..675db39493 --- /dev/null +++ b/scripts/run_and_publish_benchmark.py @@ -0,0 +1,371 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import datetime +import json +import os +import pathlib +import subprocess +import sys +from typing import Dict, List, Union + +import numpy as np +import pandas as pd +import pandas_gbq + +LOGGING_NAME_ENV_VAR = "BIGFRAMES_PERFORMANCE_LOG_NAME" +CURRENT_DIRECTORY = pathlib.Path(__file__).parent.absolute() + + +def run_benchmark_subprocess(args, log_env_name_var, filename=None, region=None): + """ + Runs a benchmark subprocess with configured environment variables. Adjusts PYTHONPATH, + sets the region-specific BigQuery location, and sets the performance-logging name + environment variable. + + This function terminates the benchmark session if the subprocess exits with an error, + due to `check=True` in subprocess.run, which raises CalledProcessError on non-zero + exit status.
+ """ + env = os.environ.copy() + current_pythonpath = env.get("PYTHONPATH", "") + env["PYTHONPATH"] = ( + os.path.join(os.getcwd(), "tests") + os.pathsep + current_pythonpath + ) + + if region: + env["BIGQUERY_LOCATION"] = region + env[LOGGING_NAME_ENV_VAR] = log_env_name_var + subprocess.run(args, env=env, check=True) + + +def collect_benchmark_result(benchmark_path: str) -> pd.DataFrame: + """Generate a DataFrame report on HTTP queries, bytes processed, slot time and execution time from log files.""" + path = pathlib.Path(benchmark_path) + try: + results_dict: Dict[str, List[Union[int, float, None]]] = {} + bytes_files = sorted(path.rglob("*.bytesprocessed")) + millis_files = sorted(path.rglob("*.slotmillis")) + bq_seconds_files = sorted(path.rglob("*.bq_exec_time_seconds")) + + local_seconds_files = sorted(path.rglob("*.local_exec_time_seconds")) + has_local_seconds = len(local_seconds_files) > 0 + + if has_local_seconds: + if not ( + len(bytes_files) + == len(millis_files) + == len(local_seconds_files) + == len(bq_seconds_files) + ): + raise ValueError( + "Mismatch in the number of report files for bytes, millis, and seconds." + ) + else: + if not (len(bytes_files) == len(millis_files) == len(bq_seconds_files)): + raise ValueError( + "Mismatch in the number of report files for bytes, millis, and seconds." + ) + + for idx in range(len(bytes_files)): + bytes_file = bytes_files[idx] + millis_file = millis_files[idx] + bq_seconds_file = bq_seconds_files[idx] + filename = bytes_file.relative_to(path).with_suffix("") + + if filename != millis_file.relative_to(path).with_suffix( + "" + ) or filename != bq_seconds_file.relative_to(path).with_suffix(""): + raise ValueError( + "File name mismatch among bytes, millis, and seconds reports." + ) + + if has_local_seconds: + local_seconds_file = local_seconds_files[idx] + if filename != local_seconds_file.relative_to(path).with_suffix(""): + raise ValueError( + "File name mismatch among bytes, millis, and seconds reports." + ) + + with open(bytes_file, "r") as file: + lines = file.read().splitlines() + query_count = len(lines) + total_bytes = sum(int(line) for line in lines) + + with open(millis_file, "r") as file: + lines = file.read().splitlines() + total_slot_millis = sum(int(line) for line in lines) + + if has_local_seconds: + # 'local_seconds' captures the total execution time for a benchmark as it + # starts timing immediately before the benchmark code begins and stops + # immediately after it ends. Unlike other metrics that might accumulate + # values proportional to the number of queries executed, 'local_seconds' is + # a singular measure of the time taken for the complete execution of the + # benchmark, from start to finish. 
+ with open(local_seconds_file, "r") as file: + local_seconds = float(file.readline().strip()) + else: + local_seconds = None + + with open(bq_seconds_file, "r") as file: + lines = file.read().splitlines() + bq_seconds = sum(float(line) for line in lines) + + results_dict[str(filename)] = [ + query_count, + total_bytes, + total_slot_millis, + local_seconds, + bq_seconds, + ] + finally: + for files_to_remove in ( + path.rglob("*.bytesprocessed"), + path.rglob("*.slotmillis"), + path.rglob("*.local_exec_time_seconds"), + path.rglob("*.bq_exec_time_seconds"), + ): + for log_file in files_to_remove: + log_file.unlink() + + columns = [ + "Query_Count", + "Bytes_Processed", + "Slot_Millis", + "Local_Execution_Time_Sec", + "BigQuery_Execution_Time_Sec", + ] + + benchmark_metrics = pd.DataFrame.from_dict( + results_dict, + orient="index", + columns=columns, + ) + + print("---BIGQUERY USAGE REPORT---") + for index, row in benchmark_metrics.iterrows(): + print( + f"{index} - query count: {row['Query_Count']}," + f" bytes processed sum: {row['Bytes_Processed']}," + f" slot millis sum: {row['Slot_Millis']}," + f" local execution time: {round(row['Local_Execution_Time_Sec'], 1)} seconds," + f" bigquery execution time: {round(row['BigQuery_Execution_Time_Sec'], 1)} seconds" + ) + + geometric_mean_queries = geometric_mean(benchmark_metrics["Query_Count"]) + geometric_mean_bytes = geometric_mean(benchmark_metrics["Bytes_Processed"]) + geometric_mean_slot_millis = geometric_mean(benchmark_metrics["Slot_Millis"]) + geometric_mean_local_seconds = geometric_mean( + benchmark_metrics["Local_Execution_Time_Sec"] + ) + geometric_mean_bq_seconds = geometric_mean( + benchmark_metrics["BigQuery_Execution_Time_Sec"] + ) + + print( + f"---Geometric mean of queries: {geometric_mean_queries}, " + f"Geometric mean of bytes processed: {geometric_mean_bytes}, " + f"Geometric mean of slot millis: {geometric_mean_slot_millis}, " + f"Geometric mean of local execution time: {geometric_mean_local_seconds} seconds, " + f"Geometric mean of BigQuery execution time: {geometric_mean_bq_seconds} seconds---" + ) + + return benchmark_metrics.reset_index().rename(columns={"index": "Benchmark_Name"}) + + +def geometric_mean(data): + """ + Calculate the geometric mean of a dataset, rounding the result to one decimal place. + Returns NaN if the dataset is empty or contains only NaN values. + """ + data = data.dropna() + if len(data) == 0: + return np.nan + log_data = np.log(data) + return round(np.exp(log_data.mean()), 1) + + +def get_repository_status(): + current_directory = os.getcwd() + subprocess.run( + ["git", "config", "--global", "--add", "safe.directory", current_directory], + check=True, + ) + + git_hash = subprocess.check_output( + ["git", "rev-parse", "--short", "HEAD"], text=True + ).strip() + bigframes_version = subprocess.check_output( + ["python", "-c", "import bigframes; print(bigframes.__version__)"], text=True + ).strip() + release_version = ( + f"{bigframes_version}dev{datetime.datetime.now().strftime('%Y%m%d')}+{git_hash}" + ) + + return { + "benchmark_start_time": datetime.datetime.now().isoformat(), + "git_hash": git_hash, + "bigframes_version": bigframes_version, + "release_version": release_version, + "python_version": sys.version, + } + + +def find_config(start_path): + """ + Searches for a 'config.jsonl' file starting from the given path and moving up to parent + directories. 
+ + This function ascends from the initial directory specified by `start_path` up to 3 + levels or until it reaches a directory named 'benchmark'. The search moves upwards + because if there are multiple 'config.jsonl' files in the path hierarchy, the closest + configuration to the starting directory (the lowest level) is expected to take effect. + It checks each directory for the presence of 'config.jsonl'. If found, it returns the + path to the configuration file. If not found within the limit or upon reaching + the 'benchmark' directory, it returns None. + """ + target_file = "config.jsonl" + current_path = pathlib.Path(start_path).resolve() + if current_path.is_file(): + current_path = current_path.parent + + levels_checked = 0 + while current_path.name != "benchmark" and levels_checked < 3: + config_path = current_path / target_file + if config_path.exists(): + return config_path + if current_path.parent == current_path: + break + current_path = current_path.parent + levels_checked += 1 + + return None + + +def run_benchmark_from_config(benchmark: str): + print(benchmark) + config_path = find_config(benchmark) + + if config_path: + benchmark_configs = [] + with open(config_path, "r") as f: + for line in f: + config = json.loads(line) + python_args = [f"--{key}={value}" for key, value in config.items()] + suffix = ( + config["benchmark_suffix"] + if "benchmark_suffix" in config + else "_".join(f"{key}_{value}" for key, value in config.items()) + ) + benchmark_configs.append((suffix, python_args)) + else: + benchmark_configs = [(None, [])] + + for benchmark_config in benchmark_configs: + args = ["python", str(benchmark)] + args.extend(benchmark_config[1]) + log_env_name_var = str(benchmark) + if benchmark_config[0] is not None: + log_env_name_var += f"_{benchmark_config[0]}" + run_benchmark_subprocess(args=args, log_env_name_var=log_env_name_var) + + +def run_notebook_benchmark(benchmark_file: str, region: str): + export_file = f"{benchmark_file}_{region}" if region else benchmark_file + log_env_name_var = os.path.basename(export_file) + # TODO(shobs): For some reason the --retries arg masks exceptions that occurred in + # notebook failures, and shows an unhelpful INTERNALERROR. Investigate that + # and enable retries if we can find a way to surface the real exception, + # because the notebook is running against real GCP and something may fail + # due to transient issues. + pytest_command = [ + "py.test", + "--nbmake", + "--nbmake-timeout=900", # 15 minutes + ] + benchmark_args = (*pytest_command, benchmark_file) + + run_benchmark_subprocess( + args=benchmark_args, + log_env_name_var=log_env_name_var, + filename=export_file, + region=region, + ) + + +def parse_arguments(): + parser = argparse.ArgumentParser( + description="Run benchmarks for different scenarios." + ) + parser.add_argument( + "--notebook", + action="store_true", + help="Set this flag to run the benchmark as a notebook. If not set, it assumes a Python (.py) file.", + ) + + parser.add_argument( + "--benchmark-path", + type=str, + default=None, + help="Specify the file path to the benchmark script, either a Jupyter notebook or a Python script.", + ) + + parser.add_argument( + "--region", + type=str, + default=None, + help="Specify the region where the benchmark will be executed or where the data resides.
This parameter is optional.", + ) + + parser.add_argument( + "--publish-benchmarks", + type=str, + default=None, + help="Set the benchmarks to be published to BigQuery.", + ) + + return parser.parse_args() + + +def main(): + args = parse_arguments() + + if args.publish_benchmarks: + bigquery_table = ( + "bigframes-metrics.benchmark_report.notebook_benchmark" + if args.notebook + else "bigframes-metrics.benchmark_report.benchmark" + ) + benchmark_metrics = collect_benchmark_result(args.publish_benchmarks) + + if os.getenv("BENCHMARK_AND_PUBLISH", "false") == "true": + repo_status = get_repository_status() + for idx, col in enumerate(repo_status.keys()): + benchmark_metrics.insert(idx, col, repo_status[col]) + + pandas_gbq.to_gbq( + dataframe=benchmark_metrics, + destination_table=bigquery_table, + if_exists="append", + ) + print("Results have been successfully uploaded to BigQuery.") + elif args.notebook: + run_notebook_benchmark(args.benchmark_path, args.region) + else: + run_benchmark_from_config(args.benchmark_path) + + +if __name__ == "__main__": + main() diff --git a/tests/benchmark/__init__.py b/tests/benchmark/__init__.py new file mode 100644 index 0000000000..6d5e14bcf4 --- /dev/null +++ b/tests/benchmark/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/tests/benchmark/db_benchmark/sort/config.jsonl b/tests/benchmark/db_benchmark/sort/config.jsonl new file mode 100644 index 0000000000..72884d6c5a --- /dev/null +++ b/tests/benchmark/db_benchmark/sort/config.jsonl @@ -0,0 +1,2 @@ +{"benchmark_suffix": "50g_ordered", "table_id": "J1_1e9_NA_0_0", "ordered": true} +{"benchmark_suffix": "50g_unordered", "table_id": "J1_1e9_NA_0_0", "ordered": false} diff --git a/scripts/benchmark/simple_benchmark.py b/tests/benchmark/db_benchmark/sort/q1.py similarity index 58% rename from scripts/benchmark/simple_benchmark.py rename to tests/benchmark/db_benchmark/sort/q1.py index 53b35c52ad..f17a843192 100644 --- a/scripts/benchmark/simple_benchmark.py +++ b/tests/benchmark/db_benchmark/sort/q1.py @@ -12,16 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. -import bigframes.pandas as bpd +import pathlib -# This is a placeholder benchmark. -# TODO(340278185): Add more data analysis tasks and benchmark files -# like this one. 
+import benchmark.utils as utils +import bigframes_vendored.db_benchmark.sort_queries as vendored_dbbenchmark_sort_queries -print("Performing simple benchmark.") -df = bpd.DataFrame() -df["column_1"] = bpd.Series([i for i in range(100000)]) -df["column_2"] = bpd.Series([i * 2 for i in range(100000)]) -df["column_3"] = df["column_1"] + df["column_2"] -df.__repr__() -bpd.reset_session() +if __name__ == "__main__": + table_id, session, suffix = utils.get_dbbenchmark_configuration() + current_path = pathlib.Path(__file__).absolute() + + utils.get_execution_time( + vendored_dbbenchmark_sort_queries.q1, current_path, suffix, table_id, session + ) diff --git a/tests/benchmark/utils.py b/tests/benchmark/utils.py new file mode 100644 index 0000000000..c286d4e229 --- /dev/null +++ b/tests/benchmark/utils.py @@ -0,0 +1,95 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import time + +import bigframes + + +def get_dbbenchmark_configuration(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--table_id", + type=str, + required=True, + help="The BigQuery table ID to query.", + ) + parser.add_argument( + "--ordered", + type=str, + default="True", + help="Set to True (default) to have an ordered session, or False for an unordered session.", + ) + parser.add_argument( + "--benchmark_suffix", + type=str, + help="Suffix to append to benchmark names for identification purposes.", + ) + args = parser.parse_args() + session = _initialize_session(_str_to_bool(args.ordered)) + return args.table_id, session, args.benchmark_suffix + + +def get_tpch_configuration(): + parser = argparse.ArgumentParser(description="Process TPC-H Query using BigFrames.") + parser.add_argument( + "--dataset_id", + type=str, + required=True, + help="The BigQuery dataset ID to query.", + ) + parser.add_argument( + "--ordered", + type=str, + default="True", + help="Set to True (default) to have an ordered session, or False for an unordered session.", + ) + parser.add_argument( + "--benchmark_suffix", + type=str, + help="Suffix to append to benchmark names for identification purposes.", + ) + + args = parser.parse_args() + session = _initialize_session(_str_to_bool(args.ordered)) + return args.dataset_id, session, args.benchmark_suffix + + +def get_execution_time(func, current_path, suffix, *args, **kwargs): + start_time = time.perf_counter() + func(*args, **kwargs) + end_time = time.perf_counter() + runtime = end_time - start_time + + clock_time_file_path = f"{current_path}_{suffix}.local_exec_time_seconds" + + with open(clock_time_file_path, "w") as log_file: + log_file.write(f"{runtime}\n") + + +def _str_to_bool(value): + if value == "True": + return True + elif value == "False": + return False + else: + raise argparse.ArgumentTypeError('Only "True" or "False" expected.') + + +def _initialize_session(ordered: bool): + context = bigframes.BigQueryOptions( + location="US", ordering_mode="strict" if ordered else "partial" + ) + session = bigframes.Session(context=context) + print(f"Initialized {'ordered' if
ordered else 'unordered'} session.") + return session diff --git a/third_party/bigframes_vendored/db_benchmark/LICENSE b/third_party/bigframes_vendored/db_benchmark/LICENSE new file mode 100644 index 0000000000..a612ad9813 --- /dev/null +++ b/third_party/bigframes_vendored/db_benchmark/LICENSE @@ -0,0 +1,373 @@ +Mozilla Public License Version 2.0 +================================== + +1. Definitions +-------------- + +1.1. "Contributor" + means each individual or legal entity that creates, contributes to + the creation of, or owns Covered Software. + +1.2. "Contributor Version" + means the combination of the Contributions of others (if any) used + by a Contributor and that particular Contributor's Contribution. + +1.3. "Contribution" + means Covered Software of a particular Contributor. + +1.4. "Covered Software" + means Source Code Form to which the initial Contributor has attached + the notice in Exhibit A, the Executable Form of such Source Code + Form, and Modifications of such Source Code Form, in each case + including portions thereof. + +1.5. "Incompatible With Secondary Licenses" + means + + (a) that the initial Contributor has attached the notice described + in Exhibit B to the Covered Software; or + + (b) that the Covered Software was made available under the terms of + version 1.1 or earlier of the License, but not also under the + terms of a Secondary License. + +1.6. "Executable Form" + means any form of the work other than Source Code Form. + +1.7. "Larger Work" + means a work that combines Covered Software with other material, in + a separate file or files, that is not Covered Software. + +1.8. "License" + means this document. + +1.9. "Licensable" + means having the right to grant, to the maximum extent possible, + whether at the time of the initial grant or subsequently, any and + all of the rights conveyed by this License. + +1.10. "Modifications" + means any of the following: + + (a) any file in Source Code Form that results from an addition to, + deletion from, or modification of the contents of Covered + Software; or + + (b) any new file in Source Code Form that contains any Covered + Software. + +1.11. "Patent Claims" of a Contributor + means any patent claim(s), including without limitation, method, + process, and apparatus claims, in any patent Licensable by such + Contributor that would be infringed, but for the grant of the + License, by the making, using, selling, offering for sale, having + made, import, or transfer of either its Contributions or its + Contributor Version. + +1.12. "Secondary License" + means either the GNU General Public License, Version 2.0, the GNU + Lesser General Public License, Version 2.1, the GNU Affero General + Public License, Version 3.0, or any later versions of those + licenses. + +1.13. "Source Code Form" + means the form of the work preferred for making modifications. + +1.14. "You" (or "Your") + means an individual or a legal entity exercising rights under this + License. For legal entities, "You" includes any entity that + controls, is controlled by, or is under common control with You. For + purposes of this definition, "control" means (a) the power, direct + or indirect, to cause the direction or management of such entity, + whether by contract or otherwise, or (b) ownership of more than + fifty percent (50%) of the outstanding shares or beneficial + ownership of such entity. + +2. License Grants and Conditions +-------------------------------- + +2.1. 
Grants + +Each Contributor hereby grants You a world-wide, royalty-free, +non-exclusive license: + +(a) under intellectual property rights (other than patent or trademark) + Licensable by such Contributor to use, reproduce, make available, + modify, display, perform, distribute, and otherwise exploit its + Contributions, either on an unmodified basis, with Modifications, or + as part of a Larger Work; and + +(b) under Patent Claims of such Contributor to make, use, sell, offer + for sale, have made, import, and otherwise transfer either its + Contributions or its Contributor Version. + +2.2. Effective Date + +The licenses granted in Section 2.1 with respect to any Contribution +become effective for each Contribution on the date the Contributor first +distributes such Contribution. + +2.3. Limitations on Grant Scope + +The licenses granted in this Section 2 are the only rights granted under +this License. No additional rights or licenses will be implied from the +distribution or licensing of Covered Software under this License. +Notwithstanding Section 2.1(b) above, no patent license is granted by a +Contributor: + +(a) for any code that a Contributor has removed from Covered Software; + or + +(b) for infringements caused by: (i) Your and any other third party's + modifications of Covered Software, or (ii) the combination of its + Contributions with other software (except as part of its Contributor + Version); or + +(c) under Patent Claims infringed by Covered Software in the absence of + its Contributions. + +This License does not grant any rights in the trademarks, service marks, +or logos of any Contributor (except as may be necessary to comply with +the notice requirements in Section 3.4). + +2.4. Subsequent Licenses + +No Contributor makes additional grants as a result of Your choice to +distribute the Covered Software under a subsequent version of this +License (see Section 10.2) or under the terms of a Secondary License (if +permitted under the terms of Section 3.3). + +2.5. Representation + +Each Contributor represents that the Contributor believes its +Contributions are its original creation(s) or it has sufficient rights +to grant the rights to its Contributions conveyed by this License. + +2.6. Fair Use + +This License is not intended to limit any rights You have under +applicable copyright doctrines of fair use, fair dealing, or other +equivalents. + +2.7. Conditions + +Sections 3.1, 3.2, 3.3, and 3.4 are conditions of the licenses granted +in Section 2.1. + +3. Responsibilities +------------------- + +3.1. Distribution of Source Form + +All distribution of Covered Software in Source Code Form, including any +Modifications that You create or to which You contribute, must be under +the terms of this License. You must inform recipients that the Source +Code Form of the Covered Software is governed by the terms of this +License, and how they can obtain a copy of this License. You may not +attempt to alter or restrict the recipients' rights in the Source Code +Form. + +3.2. 
Distribution of Executable Form + +If You distribute Covered Software in Executable Form then: + +(a) such Covered Software must also be made available in Source Code + Form, as described in Section 3.1, and You must inform recipients of + the Executable Form how they can obtain a copy of such Source Code + Form by reasonable means in a timely manner, at a charge no more + than the cost of distribution to the recipient; and + +(b) You may distribute such Executable Form under the terms of this + License, or sublicense it under different terms, provided that the + license for the Executable Form does not attempt to limit or alter + the recipients' rights in the Source Code Form under this License. + +3.3. Distribution of a Larger Work + +You may create and distribute a Larger Work under terms of Your choice, +provided that You also comply with the requirements of this License for +the Covered Software. If the Larger Work is a combination of Covered +Software with a work governed by one or more Secondary Licenses, and the +Covered Software is not Incompatible With Secondary Licenses, this +License permits You to additionally distribute such Covered Software +under the terms of such Secondary License(s), so that the recipient of +the Larger Work may, at their option, further distribute the Covered +Software under the terms of either this License or such Secondary +License(s). + +3.4. Notices + +You may not remove or alter the substance of any license notices +(including copyright notices, patent notices, disclaimers of warranty, +or limitations of liability) contained within the Source Code Form of +the Covered Software, except that You may alter any license notices to +the extent required to remedy known factual inaccuracies. + +3.5. Application of Additional Terms + +You may choose to offer, and to charge a fee for, warranty, support, +indemnity or liability obligations to one or more recipients of Covered +Software. However, You may do so only on Your own behalf, and not on +behalf of any Contributor. You must make it absolutely clear that any +such warranty, support, indemnity, or liability obligation is offered by +You alone, and You hereby agree to indemnify every Contributor for any +liability incurred by such Contributor as a result of warranty, support, +indemnity or liability terms You offer. You may include additional +disclaimers of warranty and limitations of liability specific to any +jurisdiction. + +4. Inability to Comply Due to Statute or Regulation +--------------------------------------------------- + +If it is impossible for You to comply with any of the terms of this +License with respect to some or all of the Covered Software due to +statute, judicial order, or regulation then You must: (a) comply with +the terms of this License to the maximum extent possible; and (b) +describe the limitations and the code they affect. Such description must +be placed in a text file included with all distributions of the Covered +Software under this License. Except to the extent prohibited by statute +or regulation, such description must be sufficiently detailed for a +recipient of ordinary skill to be able to understand it. + +5. Termination +-------------- + +5.1. The rights granted under this License will terminate automatically +if You fail to comply with any of its terms. 
However, if You become +compliant, then the rights granted under this License from a particular +Contributor are reinstated (a) provisionally, unless and until such +Contributor explicitly and finally terminates Your grants, and (b) on an +ongoing basis, if such Contributor fails to notify You of the +non-compliance by some reasonable means prior to 60 days after You have +come back into compliance. Moreover, Your grants from a particular +Contributor are reinstated on an ongoing basis if such Contributor +notifies You of the non-compliance by some reasonable means, this is the +first time You have received notice of non-compliance with this License +from such Contributor, and You become compliant prior to 30 days after +Your receipt of the notice. + +5.2. If You initiate litigation against any entity by asserting a patent +infringement claim (excluding declaratory judgment actions, +counter-claims, and cross-claims) alleging that a Contributor Version +directly or indirectly infringes any patent, then the rights granted to +You by any and all Contributors for the Covered Software under Section +2.1 of this License shall terminate. + +5.3. In the event of termination under Sections 5.1 or 5.2 above, all +end user license agreements (excluding distributors and resellers) which +have been validly granted by You or Your distributors under this License +prior to termination shall survive termination. + +************************************************************************ +* * +* 6. Disclaimer of Warranty * +* ------------------------- * +* * +* Covered Software is provided under this License on an "as is" * +* basis, without warranty of any kind, either expressed, implied, or * +* statutory, including, without limitation, warranties that the * +* Covered Software is free of defects, merchantable, fit for a * +* particular purpose or non-infringing. The entire risk as to the * +* quality and performance of the Covered Software is with You. * +* Should any Covered Software prove defective in any respect, You * +* (not any Contributor) assume the cost of any necessary servicing, * +* repair, or correction. This disclaimer of warranty constitutes an * +* essential part of this License. No use of any Covered Software is * +* authorized under this License except under this disclaimer. * +* * +************************************************************************ + +************************************************************************ +* * +* 7. Limitation of Liability * +* -------------------------- * +* * +* Under no circumstances and under no legal theory, whether tort * +* (including negligence), contract, or otherwise, shall any * +* Contributor, or anyone who distributes Covered Software as * +* permitted above, be liable to You for any direct, indirect, * +* special, incidental, or consequential damages of any character * +* including, without limitation, damages for lost profits, loss of * +* goodwill, work stoppage, computer failure or malfunction, or any * +* and all other commercial damages or losses, even if such party * +* shall have been informed of the possibility of such damages. This * +* limitation of liability shall not apply to liability for death or * +* personal injury resulting from such party's negligence to the * +* extent applicable law prohibits such limitation. Some * +* jurisdictions do not allow the exclusion or limitation of * +* incidental or consequential damages, so this exclusion and * +* limitation may not apply to You. 
* +* * +************************************************************************ + +8. Litigation +------------- + +Any litigation relating to this License may be brought only in the +courts of a jurisdiction where the defendant maintains its principal +place of business and such litigation shall be governed by laws of that +jurisdiction, without reference to its conflict-of-law provisions. +Nothing in this Section shall prevent a party's ability to bring +cross-claims or counter-claims. + +9. Miscellaneous +---------------- + +This License represents the complete agreement concerning the subject +matter hereof. If any provision of this License is held to be +unenforceable, such provision shall be reformed only to the extent +necessary to make it enforceable. Any law or regulation which provides +that the language of a contract shall be construed against the drafter +shall not be used to construe this License against a Contributor. + +10. Versions of the License +--------------------------- + +10.1. New Versions + +Mozilla Foundation is the license steward. Except as provided in Section +10.3, no one other than the license steward has the right to modify or +publish new versions of this License. Each version will be given a +distinguishing version number. + +10.2. Effect of New Versions + +You may distribute the Covered Software under the terms of the version +of the License under which You originally received the Covered Software, +or under the terms of any subsequent version published by the license +steward. + +10.3. Modified Versions + +If you create software not governed by this License, and you want to +create a new license for such software, you may create and use a +modified version of this License if you rename the license and remove +any references to the name of the license steward (except to note that +such modified license differs from this License). + +10.4. Distributing Source Code Form that is Incompatible With Secondary +Licenses + +If You choose to distribute Source Code Form that is Incompatible With +Secondary Licenses under the terms of this version of the License, the +notice described in Exhibit B of this License must be attached. + +Exhibit A - Source Code Form License Notice +------------------------------------------- + + This Source Code Form is subject to the terms of the Mozilla Public + License, v. 2.0. If a copy of the MPL was not distributed with this + file, You can obtain one at http://mozilla.org/MPL/2.0/. + +If it is not possible or desirable to put the notice in a particular +file, then You may include the notice in a location (such as a LICENSE +file in a relevant directory) where a recipient would be likely to look +for such a notice. + +You may add additional accurate notices of copyright ownership. + +Exhibit B - "Incompatible With Secondary Licenses" Notice +--------------------------------------------------------- + + This Source Code Form is "Incompatible With Secondary Licenses", as + defined by the Mozilla Public License, v. 2.0. diff --git a/third_party/bigframes_vendored/db_benchmark/METADATA b/third_party/bigframes_vendored/db_benchmark/METADATA new file mode 100644 index 0000000000..6163ac69b7 --- /dev/null +++ b/third_party/bigframes_vendored/db_benchmark/METADATA @@ -0,0 +1,18 @@ +name: "db-benchmark" +description: + "This repository contains a reproducible benchmarking suite for evaluating " + "database-like operations in single-node environments. It assesses " + "scalability across varying data volumes and complexities." 
+ +third_party { + identifier { + type: "Git" + value: "https://github.com/h2oai/db-benchmark" + primary_source: true + version: "Latest Commit on Main Branch as of Access" + } + version: "Latest Commit on Main Branch as of Access" + last_upgrade_date { year: 2024 month: 7 day: 12 } + license_type: RECIPROCAL + local_modifications: "Modified the queries to test and benchmark the BigFrames project" +} diff --git a/third_party/bigframes_vendored/db_benchmark/README.md b/third_party/bigframes_vendored/db_benchmark/README.md new file mode 100644 index 0000000000..aba227b0eb --- /dev/null +++ b/third_party/bigframes_vendored/db_benchmark/README.md @@ -0,0 +1,76 @@ +Repository for reproducible benchmarking of database-like operations in single-node environment. +Benchmark report is available at [h2oai.github.io/db-benchmark](https://h2oai.github.io/db-benchmark). +We focused mainly on portability and reproducibility. Benchmark is routinely re-run to present up-to-date timings. Most of solutions used are automatically upgraded to their stable or development versions. +This benchmark is meant to compare scalability both in data volume and data complexity. +Contribution and feedback are very welcome! + +# Tasks + + - [x] groupby + - [x] join + - [x] groupby2014 + +# Solutions + + - [x] [dask](https://github.com/dask/dask) + - [x] [data.table](https://github.com/Rdatatable/data.table) + - [x] [dplyr](https://github.com/tidyverse/dplyr) + - [x] [DataFrames.jl](https://github.com/JuliaData/DataFrames.jl) + - [x] [pandas](https://github.com/pandas-dev/pandas) + - [x] [(py)datatable](https://github.com/h2oai/datatable) + - [x] [spark](https://github.com/apache/spark) + - [x] [cuDF](https://github.com/rapidsai/cudf) + - [x] [ClickHouse](https://github.com/yandex/ClickHouse) + - [x] [Polars](https://github.com/ritchie46/polars) + - [x] [Arrow](https://github.com/apache/arrow) + - [x] [DuckDB](https://github.com/duckdb/duckdb) + +More solutions has been proposed. Status of those can be tracked in issues tracker of our project repository by using [_new solution_](https://github.com/h2oai/db-benchmark/issues?q=is%3Aissue+is%3Aopen+label%3A%22new+solution%22) label. 
+ +# Reproduce + +## Batch benchmark run + +- edit `path.env` and set `julia` and `java` paths +- if solution uses python create new `virtualenv` as `$solution/py-$solution`, example for `pandas` use `virtualenv pandas/py-pandas --python=/usr/bin/python3.6` +- install every solution, follow `$solution/setup-$solution.sh` scripts +- edit `run.conf` to define solutions and tasks to benchmark +- generate data, for `groupby` use `Rscript _data/groupby-datagen.R 1e7 1e2 0 0` to create `G1_1e7_1e2_0_0.csv`, re-save to binary format where needed (see below), create `data` directory and keep all data files there +- edit `_control/data.csv` to define data sizes to benchmark using `active` flag +- ensure SWAP is disabled and ClickHouse server is not yet running +- start benchmark with `./run.sh` + +## Single solution benchmark + +- install solution software + - for python we recommend to use `virtualenv` for better isolation + - for R ensure that library is installed in a solution subdirectory, so that `library("dplyr", lib.loc="./dplyr/r-dplyr")` or `library("data.table", lib.loc="./datatable/r-datatable")` works + - note that some solutions may require another to be installed to speed-up csv data load, for example, `dplyr` requires `data.table` and similarly `pandas` requires (py)`datatable` +- generate data using `_data/*-datagen.R` scripts, for example, `Rscript _data/groupby-datagen.R 1e7 1e2 0 0` creates `G1_1e7_1e2_0_0.csv`, put data files in `data` directory +- run benchmark for a single solution using `./_launcher/solution.R --solution=data.table --task=groupby --nrow=1e7` +- run other data cases by passing extra parameters `--k=1e2 --na=0 --sort=0` +- use `--quiet=true` to suppress script's output and print timings only, using `--print=question,run,time_sec` specify columns to be printed to console, to print all use `--print=*` +- use `--out=time.csv` to write timings to a file rather than console + +## Running script interactively + +- install software in expected location, details above +- ensure data name to be used in env var below is present in `./data` dir +- source python virtual environment if needed +- call `SRC_DATANAME=G1_1e7_1e2_0_0 R`, if desired replace `R` with `python` or `julia` +- proceed pasting code from benchmark script + +## Extra care needed + +- `cudf` uses `conda` instead of `virtualenv` + +# Example environment + +- setting up r3-8xlarge: 244GB RAM, 32 cores: [Amazon EC2 for beginners](https://github.com/Rdatatable/data.table/wiki/Amazon-EC2-for-beginners) +- (slightly outdated) full reproduce script on clean Ubuntu 16.04: [_utils/repro.sh](https://github.com/h2oai/db-benchmark/blob/master/_utils/repro.sh) + +# Acknowledgment + +Timings for some solutions might be missing for particular data sizes or questions. Some functions are not yet implemented in all solutions so we were unable to answer all questions in all solutions. Some solutions might also run out of memory when running benchmark script which results the process to be killed by OS. Lastly we also added timeout for single benchmark script to run, once timeout value is reached script is terminated. +Please check [_exceptions_](https://github.com/h2oai/db-benchmark/issues?q=is%3Aissue+is%3Aopen+label%3Aexceptions) label in our repository for a list of issues/defects in solutions, that makes us unable to provide all timings. 
+There is also [_no documentation_](https://github.com/h2oai/db-benchmark/labels/no%20documentation) label that lists issues that are blocked by missing documentation in solutions we are benchmarking. diff --git a/third_party/bigframes_vendored/db_benchmark/__init__.py b/third_party/bigframes_vendored/db_benchmark/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/third_party/bigframes_vendored/db_benchmark/sort_queries.py b/third_party/bigframes_vendored/db_benchmark/sort_queries.py new file mode 100644 index 0000000000..600df103cf --- /dev/null +++ b/third_party/bigframes_vendored/db_benchmark/sort_queries.py @@ -0,0 +1,16 @@ +# Contains code from https://github.com/duckdblabs/db-benchmark/blob/master/pandas/sort-pandas.py + +import bigframes +import bigframes.session + + +def q1(table_id: str, session: bigframes.Session) -> None: + print("Sort benchmark 1: sort by int id2") + + x = session.read_gbq(f"bigframes-dev-perf.dbbenchmark.{table_id}") + + ans = x.sort_values("id2") + print(ans.shape) + + chk = [ans["v1"].sum()] + print(chk)
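As a closing illustration of how the pieces fit together, the sketch below expands the first line of the `config.jsonl` added under `tests/benchmark/db_benchmark/sort/` into the argv that `run_benchmark_from_config` builds for `q1.py`. The expansion mirrors the script's own logic and shows why `utils._str_to_bool` accepts exactly the strings "True" and "False": Python renders the JSON boolean as `True` in the f-string.

```python
# Sketch of the config expansion performed by run_benchmark_from_config,
# applied to the first entry of tests/benchmark/db_benchmark/sort/config.jsonl.
import json

line = '{"benchmark_suffix": "50g_ordered", "table_id": "J1_1e9_NA_0_0", "ordered": true}'
config = json.loads(line)

# Each config key becomes a --key=value flag; the JSON boolean true renders
# as the string "True", which is what utils._str_to_bool expects.
python_args = [f"--{key}={value}" for key, value in config.items()]

# The suffix labels the per-benchmark log files (for example,
# q1.py_50g_ordered.local_exec_time_seconds) and hence the published rows.
suffix = (
    config["benchmark_suffix"]
    if "benchmark_suffix" in config
    else "_".join(f"{key}_{value}" for key, value in config.items())
)

argv = ["python", "tests/benchmark/db_benchmark/sort/q1.py", *python_args]
print(argv)
# ['python', 'tests/benchmark/db_benchmark/sort/q1.py',
#  '--benchmark_suffix=50g_ordered', '--table_id=J1_1e9_NA_0_0', '--ordered=True']
print(suffix)  # 50g_ordered
```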