tests: add benchmark and split nox sessions into multiple kokoro jobs (#734)

milkshakeiii · tswast · web-flow · commit 74d447956925 · 2024-06-04T09:28:31.000-05:00
* tests: add benchmark nox session and split nox sessions into multiple kokoro jobs

* remove accidentally added file

* Update dataframe.ipynb

* update noxfile

* revert noxfile

* remove benchmark notebook

* use regular python files for benchmark scripts

* add benchmark script

* remove accidentally added files

* remove stray line

* add to comment

* correct filepath

* fix filenames

---------

Co-authored-by: Tim Sweña (Swast) <swast@google.com>
diff --git a/.kokoro/continuous/doctest.cfg b/.kokoro/continuous/doctest.cfg
@@ -0,0 +1,17 @@
+# Format: //devtools/kokoro/config/proto/build.proto
+
+# Only run this nox session.
+env_vars: {
+    key: "NOX_SESSION"
+    value: "doctest"
+}
+
+env_vars: {
+    key: "GOOGLE_CLOUD_PROJECT"
+    value: "bigframes-load-testing"
+}
+
+env_vars: {
+    key: "BIGFRAMES_TEST_MODEL_VERTEX_ENDPOINT"
+    value: "https://us-central1-aiplatform.googleapis.com/v1/projects/272725758477/locations/us-central1/endpoints/590545496255234048"
+}
diff --git a/.kokoro/continuous/e2e.cfg b/.kokoro/continuous/e2e.cfg
@@ -3,7 +3,7 @@
 # Only run this nox session.
 env_vars: {
     key: "NOX_SESSION"
-    value: "e2e doctest notebook unit_prerelease system_prerelease system_noextras"
+    value: "e2e unit_prerelease system_prerelease system_noextras"
 }
 
 env_vars: {
diff --git a/.kokoro/continuous/notebook.cfg b/.kokoro/continuous/notebook.cfg
@@ -0,0 +1,17 @@
+# Format: //devtools/kokoro/config/proto/build.proto
+
+# Only run this nox session.
+env_vars: {
+    key: "NOX_SESSION"
+    value: "notebook"
+}
+
+env_vars: {
+    key: "GOOGLE_CLOUD_PROJECT"
+    value: "bigframes-load-testing"
+}
+
+env_vars: {
+    key: "BIGFRAMES_TEST_MODEL_VERTEX_ENDPOINT"
+    value: "https://us-central1-aiplatform.googleapis.com/v1/projects/272725758477/locations/us-central1/endpoints/590545496255234048"
+}
diff --git a/.kokoro/load/benchmark.cfg b/.kokoro/load/benchmark.cfg
@@ -0,0 +1,17 @@
+# Format: //devtools/kokoro/config/proto/build.proto
+
+# Only run this nox session.
+env_vars: {
+    key: "NOX_SESSION"
+    value: "benchmark"
+}
+
+env_vars: {
+    key: "GOOGLE_CLOUD_PROJECT"
+    value: "bigframes-load-testing"
+}
+
+env_vars: {
+    key: "BIGFRAMES_TEST_MODEL_VERTEX_ENDPOINT"
+    value: "https://us-central1-aiplatform.googleapis.com/v1/projects/272725758477/locations/us-central1/endpoints/590545496255234048"
+}
diff --git a/.kokoro/presubmit/doctest.cfg b/.kokoro/presubmit/doctest.cfg
@@ -0,0 +1,17 @@
+# Format: //devtools/kokoro/config/proto/build.proto
+
+# Only run this nox session.
+env_vars: {
+    key: "NOX_SESSION"
+    value: "doctest"
+}
+
+env_vars: {
+    key: "GOOGLE_CLOUD_PROJECT"
+    value: "bigframes-load-testing"
+}
+
+env_vars: {
+    key: "BIGFRAMES_TEST_MODEL_VERTEX_ENDPOINT"
+    value: "https://us-central1-aiplatform.googleapis.com/v1/projects/272725758477/locations/us-central1/endpoints/590545496255234048"
+}
diff --git a/.kokoro/presubmit/e2e.cfg b/.kokoro/presubmit/e2e.cfg
@@ -3,7 +3,7 @@
 # Only run this nox session.
 env_vars: {
     key: "NOX_SESSION"
-    value: "e2e doctest notebook unit_prerelease system_prerelease system_noextras"
+    value: "e2e unit_prerelease system_prerelease system_noextras"
 }
 
 env_vars: {
diff --git a/.kokoro/presubmit/notebook.cfg b/.kokoro/presubmit/notebook.cfg
@@ -0,0 +1,17 @@
+# Format: //devtools/kokoro/config/proto/build.proto
+
+# Only run this nox session.
+env_vars: {
+    key: "NOX_SESSION"
+    value: "notebook"
+}
+
+env_vars: {
+    key: "GOOGLE_CLOUD_PROJECT"
+    value: "bigframes-load-testing"
+}
+
+env_vars: {
+    key: "BIGFRAMES_TEST_MODEL_VERTEX_ENDPOINT"
+    value: "https://us-central1-aiplatform.googleapis.com/v1/projects/272725758477/locations/us-central1/endpoints/590545496255234048"
+}
diff --git a/noxfile.py b/noxfile.py
@@ -76,6 +76,8 @@
 SYSTEM_TEST_EXTRAS: List[str] = ["tests"]
 SYSTEM_TEST_EXTRAS_BY_PYTHON: Dict[str, List[str]] = {}
 
+LOGGING_NAME_ENV_VAR = "BIGFRAMES_PERFORMANCE_LOG_NAME"
+
 CURRENT_DIRECTORY = pathlib.Path(__file__).parent.absolute()
 
 # Sessions are executed in the order so putting the smaller sessions
@@ -748,8 +750,6 @@ def notebook(session: nox.Session):
         "--nbmake-timeout=900",  # 15 minutes
     ]
 
-    logging_name_env_var = "BIGFRAMES_PERFORMANCE_LOG_NAME"
-
     try:
         # Populate notebook parameters and make a backup so that the notebooks
         # are runnable.
@@ -763,10 +763,10 @@ def notebook(session: nox.Session):
         # takes an environment variable for performance logging
         processes = []
         for notebook in notebooks:
-            session.env[logging_name_env_var] = os.path.basename(notebook)
             process = Process(
                 target=session.run,
                 args=(*pytest_command, notebook),
+                kwargs={"env": {LOGGING_NAME_ENV_VAR: os.path.basename(notebook)}},
             )
             process.start()
             processes.append(process)
@@ -788,11 +788,15 @@ def notebook(session: nox.Session):
     processes = []
     for notebook, regions in notebooks_reg.items():
         for region in regions:
-            session.env[logging_name_env_var] = os.path.basename(notebook)
             process = Process(
                 target=session.run,
                 args=(*pytest_command, notebook),
-                kwargs={"env": {"BIGQUERY_LOCATION": region}},
+                kwargs={
+                    "env": {
+                        "BIGQUERY_LOCATION": region,
+                        LOGGING_NAME_ENV_VAR: os.path.basename(notebook),
+                    }
+                },
             )
             process.start()
             processes.append(process)
@@ -803,24 +807,59 @@ def notebook(session: nox.Session):
     # when the environment variable is set as it is above,
     # notebooks output a .bytesprocessed and .slotmillis report
     # collect those reports and print a summary
-    _print_performance_report()
+    _print_performance_report("notebooks/")
+
+
+@nox.session(python=SYSTEM_TEST_PYTHON_VERSIONS)
+def benchmark(session: nox.Session):
+    session.install("-e", ".[all]")
+
+    benchmark_script_list = list(Path("scripts/benchmark/").glob("*.py"))
+
+    # Run the benchmarks in parallel session.run calls, since each benchmark
+    # takes its own environment variable for performance logging
+    processes = []
+    for benchmark in benchmark_script_list:
+        process = Process(
+            target=session.run,
+            args=("python", benchmark),
+            kwargs={
+                "env": {
+                    LOGGING_NAME_ENV_VAR: "scripts/benchmark/"
+                    + os.path.basename(benchmark)
+                }
+            },
+        )
+        process.start()
+        processes.append(process)
+
+    for process in processes:
+        process.join()
+
+    # when the environment variable is set as it is above,
+    # benchmark scripts output a .bytesprocessed and .slotmillis report
+    # collect those reports and print a summary
+    _print_performance_report("scripts/")
 
 
-def _print_performance_report():
+def _print_performance_report(path: str):
     """Add an informational report about http queries, bytes
     processed, and slot time to the testlog output for purposes
     of measuring bigquery-related performance changes.
+
+    Looks specifically for output files in subfolders of the
+    passed path. (*/*.bytesprocessed and */*.slotmillis)
     """
     print("---BIGQUERY USAGE REPORT---")
     results_dict = {}
-    for bytes_report in Path("notebooks/").glob("*/*.bytesprocessed"):
+    for bytes_report in Path(path).glob("*/*.bytesprocessed"):
         with open(bytes_report, "r") as bytes_file:
             filename = bytes_report.stem
             lines = bytes_file.read().splitlines()
             query_count = len(lines)
             total_bytes = sum([int(line) for line in lines])
             results_dict[filename] = [query_count, total_bytes]
-    for millis_report in Path("notebooks/").glob("*/*.slotmillis"):
+    for millis_report in Path(path).glob("*/*.slotmillis"):
         with open(millis_report, "r") as millis_file:
             filename = millis_report.stem
             lines = millis_file.read().splitlines()
@@ -830,7 +869,7 @@ def _print_performance_report():
     cumulative_queries = 0
     cumulative_bytes = 0
     cumulative_slot_millis = 0
-    for results in results_dict.values():
+    for name, results in results_dict.items():
         if len(results) != 3:
             raise IOError(
                 "Mismatch in performance logging output. "
@@ -842,7 +881,7 @@ def _print_performance_report():
         cumulative_bytes += total_bytes
         cumulative_slot_millis += total_slot_millis
         print(
-            f"{filename} - query count: {query_count},"
+            f"{name} - query count: {query_count},"
             f" bytes processed sum: {total_bytes},"
             f" slot millis sum: {total_slot_millis}"
         )
diff --git a/scripts/benchmark/simple_benchmark.py b/scripts/benchmark/simple_benchmark.py
@@ -0,0 +1,27 @@
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import bigframes.pandas as bpd
+
+# This is a placeholder benchmark.
+# TODO(340278185): Add more data analysis tasks and benchmark files
+# like this one.
+
+print("Performing simple benchmark.")
+df = bpd.DataFrame()
+df["column_1"] = bpd.Series([i for i in range(100000)])
+df["column_2"] = bpd.Series([i * 2 for i in range(100000)])
+df["column_3"] = df["column_1"] + df["column_2"]
+df.__repr__()
+bpd.reset_session()

Original file line number	Diff line number	Diff line change
`@@ -3,7 +3,7 @@`
`3`	`3`	`# Only run this nox session.`
`4`	`4`	`env_vars: {`
`5`	`5`	`key: "NOX_SESSION"`
`6`		`- value: "e2e doctest notebook unit_prerelease system_prerelease system_noextras"`
	`6`	`+ value: "e2e unit_prerelease system_prerelease system_noextras"`
`7`	`7`	`}`
`8`	`8`
`9`	`9`	`env_vars: {`