chore: make benchmark read subfolders and add three groupby benchmarks. (#754)

Genesis929 · gcf-owl-bot[bot] · web-flow · commit 60f13e708566 · 2024-06-06T16:12:22.000-07:00
* chore: make benchmark read subfolders. * group_by test for 1E9 rows * update q1 * update noxfile for lint path. * update performance report logic * format update * 🦉 Updates from OwlBot post-processor See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md --------- Co-authored-by: Owl Bot <gcf-owl-bot[bot]@users.noreply.github.com>
diff --git a/noxfile.py b/noxfile.py
@@ -35,7 +35,15 @@
 # https://github.com/str0zzapreti/pytest-retry/issues/32
 PYTEST_VERSION = "pytest<8.0.0dev"
 SPHINX_VERSION = "sphinx==4.5.0"
-LINT_PATHS = ["docs", "bigframes", "tests", "third_party", "noxfile.py", "setup.py"]
+LINT_PATHS = [
+    "docs",
+    "bigframes",
+    "tests",
+    "third_party",
+    "noxfile.py",
+    "setup.py",
+    os.path.join("scripts", "benchmark"),
+]
 
 DEFAULT_PYTHON_VERSION = "3.10"
 
@@ -813,22 +821,17 @@ def notebook(session: nox.Session):
 @nox.session(python=SYSTEM_TEST_PYTHON_VERSIONS)
 def benchmark(session: nox.Session):
     session.install("-e", ".[all]")
+    base_path = os.path.join("scripts", "benchmark")
 
-    benchmark_script_list = list(Path("scripts/benchmark/").glob("*.py"))
-
+    benchmark_script_list = list(Path(base_path).rglob("*.py"))
     # Run benchmarks in parallel session.run's, since each benchmark
     # takes an environment variable for performance logging
     processes = []
     for benchmark in benchmark_script_list:
         process = Process(
             target=session.run,
             args=("python", benchmark),
-            kwargs={
-                "env": {
-                    LOGGING_NAME_ENV_VAR: "scripts/benchmark/"
-                    + os.path.basename(benchmark)
-                }
-            },
+            kwargs={"env": {LOGGING_NAME_ENV_VAR: benchmark.as_posix()}},
         )
         process.start()
         processes.append(process)
@@ -839,7 +842,7 @@ def benchmark(session: nox.Session):
     # when the environment variable is set as it is above,
     # notebooks output a .bytesprocessed and .slotmillis report
     # collect those reports and print a summary
-    _print_performance_report("scripts/")
+    _print_performance_report(base_path)
 
 
 def _print_performance_report(path: str):
@@ -852,19 +855,24 @@ def _print_performance_report(path: str):
     """
     print("---BIGQUERY USAGE REPORT---")
     results_dict = {}
-    for bytes_report in Path(path).glob("*/*.bytesprocessed"):
+    bytes_reports = sorted(Path(path).rglob("*.bytesprocessed"), key=lambda x: x.name)
+    for bytes_report in bytes_reports:
         with open(bytes_report, "r") as bytes_file:
-            filename = bytes_report.stem
+            filename = bytes_report.relative_to(path).with_suffix("")
             lines = bytes_file.read().splitlines()
             query_count = len(lines)
             total_bytes = sum([int(line) for line in lines])
             results_dict[filename] = [query_count, total_bytes]
-    for millis_report in Path(path).glob("*/*.slotmillis"):
+        os.remove(bytes_report)
+
+    millis_reports = sorted(Path(path).rglob("*.slotmillis"), key=lambda x: x.name)
+    for millis_report in millis_reports:
         with open(millis_report, "r") as millis_file:
-            filename = millis_report.stem
+            filename = millis_report.relative_to(path).with_suffix("")
             lines = millis_file.read().splitlines()
             total_slot_millis = sum([int(line) for line in lines])
             results_dict[filename] += [total_slot_millis]
+        os.remove(millis_report)
 
     cumulative_queries = 0
     cumulative_bytes = 0
diff --git a/scripts/benchmark/db-benchmark/groupby/G1_1e9_1e2_5_0/q1.py b/scripts/benchmark/db-benchmark/groupby/G1_1e9_1e2_5_0/q1.py
@@ -0,0 +1,14 @@
+# Contains code from https://github.com/duckdblabs/db-benchmark/blob/master/pandas/groupby-pandas.py
+
+import bigframes.pandas as bpd
+
+print("Groupby benchmark 1: sum v1 by id1")
+
+x = bpd.read_gbq("bigframes-dev-perf.dbbenchmark.G1_1e9_1e2_5_0")
+
+ans = x.groupby("id1", as_index=False, dropna=False).agg({"v1": "sum"})
+print(ans.shape)
+chk = [ans["v1"].sum()]
+print(chk)
+
+bpd.reset_session()
diff --git a/scripts/benchmark/db-benchmark/groupby/G1_1e9_1e2_5_0/q2.py b/scripts/benchmark/db-benchmark/groupby/G1_1e9_1e2_5_0/q2.py
@@ -0,0 +1,14 @@
+# Contains code from https://github.com/duckdblabs/db-benchmark/blob/master/pandas/groupby-pandas.py
+
+import bigframes.pandas as bpd
+
+print("Groupby benchmark 2: sum v1 by id1:id2")
+
+x = bpd.read_gbq("bigframes-dev-perf.dbbenchmark.G1_1e9_1e2_5_0")
+
+ans = x.groupby(["id1", "id2"], as_index=False, dropna=False).agg({"v1": "sum"})
+print(ans.shape)
+chk = [ans["v1"].sum()]
+print(chk)
+
+bpd.reset_session()
diff --git a/scripts/benchmark/db-benchmark/groupby/G1_1e9_1e2_5_0/q3.py b/scripts/benchmark/db-benchmark/groupby/G1_1e9_1e2_5_0/q3.py
@@ -0,0 +1,14 @@
+# Contains code from https://github.com/duckdblabs/db-benchmark/blob/master/pandas/groupby-pandas.py
+
+import bigframes.pandas as bpd
+
+print("Groupby benchmark 3: sum v1 mean v3 by id3")
+
+x = bpd.read_gbq("bigframes-dev-perf.dbbenchmark.G1_1e9_1e2_5_0")
+
+ans = x.groupby("id3", as_index=False, dropna=False).agg({"v1": "sum", "v3": "mean"})
+print(ans.shape)
+chk = [ans["v1"].sum(), ans["v3"].sum()]
+print(chk)
+
+bpd.reset_session()