Skip to content

Commit c661a34

Browse files
authored
Upload benchmark results in v3 schema to ossci-benchmarks S3 bucket (#5845)
After #5839, it's time to update the GHA to upload to S3. I'll update the S3 lambda replicator in a separate PR. ### Testing * Locally ``` # Backward compatibility, upload to both dynamoDB and S3 for v2 schema $python upload_benchmark_results.py --benchmark-results-dir benchmark-results-dir-for-testing/v2 --schema-version v2 --dry-run INFO:root:Uploading benchmark-results-dir-for-testing/v2/android-artifacts-31017223108.json to dynamoDB (v2) INFO:root:Writing 16 documents to DynamoDB torchci-oss-ci-benchmark INFO:root:Upload benchmark-results-dir-for-testing/v2/android-artifacts-31017223108.json to s3://ossci-benchmarks/v2/pytorch/executorch/12345/31017223108/android-artifacts-31017223108.json INFO:root:Uploading benchmark-results-dir-for-testing/v2/android-artifacts-31017223431.json to dynamoDB (v2) INFO:root:Writing 12 documents to DynamoDB torchci-oss-ci-benchmark INFO:root:Upload benchmark-results-dir-for-testing/v2/android-artifacts-31017223431.json to s3://ossci-benchmarks/v2/pytorch/executorch/12345/31017223431/android-artifacts-31017223431.json # We use only S3 for v3 schema $python upload_benchmark_results.py --benchmark-results-dir benchmark-results-dir-for-testing/v3 --schema-version v3 INFO:root:Upload benchmark-results-dir-for-testing/v3/mock.json to s3://ossci-benchmarks/v3/pytorch/pytorch/1/1/mock.json ``` * CI * v2 https://github.com/pytorch/test-infra/actions/runs/11606273442/job/32318017857?pr=5845#step:4:55 * v3 https://github.com/pytorch/test-infra/actions/runs/11606273442/job/32318017857?pr=5845#step:5:43 * Test PR on ExecuTorch to use the new version https://github.com/pytorch/executorch/actions/runs/11606339159 to see that the files are uploaded to S3 https://github.com/pytorch/executorch/actions/runs/11606339159/job/32318826449#step:8:87
1 parent 25dab1e commit c661a34

File tree

6 files changed

+110
-16
lines changed

6 files changed

+110
-16
lines changed

.github/actions/upload-benchmark-results/action.yml

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,9 @@ inputs:
66
required: True
77
dry-run:
88
default: 'true'
9+
# TODO (huydhn): Use this to gate the migration to oss_ci_benchmark_v3 on S3
10+
schema-version:
11+
default: 'v2'
912

1013
runs:
1114
using: composite
@@ -16,21 +19,22 @@ runs:
1619
set -eux
1720
python3 -mpip install boto3==1.35.33
1821
19-
# TODO (huydhn): Once the generic benchmark database is ready, this will be
20-
# uploaded to S3 instead
21-
- name: Upload benchmark results to DynamoDB
22+
- name: Upload benchmark results
2223
shell: bash
2324
env:
2425
BENCHMARK_RESULTS_DIR: ${{ inputs.benchmark-results-dir }}
2526
DRY_RUN: ${{ inputs.dry-run }}
27+
SCHEMA_VERSION: ${{ inputs.schema-version }}
2628
run: |
2729
set -eux
2830
2931
if [[ "${DRY_RUN}" == "true" ]]; then
3032
python3 "${GITHUB_ACTION_PATH}/../../scripts/upload_benchmark_results.py" \
3133
--benchmark-results-dir "${BENCHMARK_RESULTS_DIR}" \
34+
--schema-version "${SCHEMA_VERSION}" \
3235
--dry-run
3336
else
3437
python3 "${GITHUB_ACTION_PATH}/../../scripts/upload_benchmark_results.py" \
35-
--benchmark-results-dir "${BENCHMARK_RESULTS_DIR}"
38+
--benchmark-results-dir "${BENCHMARK_RESULTS_DIR}" \
39+
--schema-version "${SCHEMA_VERSION}"
3640
fi
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
[{}, {"repo": "pytorch/pytorch"}, {"repo": "pytorch/pytorch", "workflow_id": 1}, {"repo": "pytorch/pytorch", "workflow_id": 1, "job_id": 1}]

.github/scripts/upload_benchmark_results.py

Lines changed: 91 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
# This source code is licensed under the BSD-style license found in the
66
# LICENSE file in the root directory of this source tree.
77

8+
import gzip
89
import hashlib
910
import json
1011
import logging
@@ -21,6 +22,9 @@
2122
logging.basicConfig(level=logging.INFO)
2223

2324

25+
OSSCI_BENCHMARKS_BUCKET = "ossci-benchmarks"
26+
27+
2428
class ValidateDir(Action):
2529
def __call__(
2630
self,
@@ -57,6 +61,13 @@ def parse_args() -> Any:
5761
default="torchci-oss-ci-benchmark",
5862
help="the name of the DynamoDB table to upload to",
5963
)
64+
# v3 is defined at torchci/clickhouse_queries/oss_ci_benchmark_v3/query.sql
65+
parser.add_argument(
66+
"--schema-version",
67+
choices=["v2", "v3"],
68+
required=True,
69+
help="the database schema to use",
70+
)
6071

6172
return parser.parse_args()
6273

@@ -69,7 +80,6 @@ def default(self, o: Any) -> Any:
6980
return super().default(o)
7081

7182

72-
# TODO (huydhn): This can be replaced by S3 path once we move to S3
7383
def generate_partition_key(doc: Dict[str, Any]) -> str:
7484
"""
7585
Generate an unique partition key for the document on DynamoDB
@@ -106,24 +116,95 @@ def upload_to_dynamodb(
106116
batch.put_item(Item=doc)
107117

108118

119+
def generate_s3_path(filepath: str, schema_version: str) -> Optional[str]:
    """
    Build the S3 object key for a benchmark-results JSON file.

    The key has the shape {schema_version}/{repo}/{workflow_id}/{job_id}/{filename},
    where the repo/workflow/job metadata is taken from the first record in the
    file that carries all three fields.

    Returns "" (falsy) when the file is empty or no record has enough metadata
    to build a key, so callers can skip the upload.
    """
    with open(filepath) as f:
        docs = json.load(f)

    if not docs:
        info(f"{filepath} is empty")
        return ""

    for doc in docs:
        repo = doc.get("repo", "")
        workflow_id = doc.get("workflow_id", 0)
        job_id = doc.get("job_id", 0)
        servicelab_experiment_id = doc.get("servicelab_experiment_id", 0)
        servicelab_trial_id = doc.get("servicelab_trial_id", 0)

        # Also handle service lab records here: they identify a run by
        # experiment/trial instead of GitHub workflow/job ids
        workflow_id = workflow_id if workflow_id else servicelab_experiment_id
        job_id = job_id if job_id else servicelab_trial_id

        # We just need one record here to get some metadata to generate the s3 path
        if repo and workflow_id and job_id:
            break

    if not repo or not workflow_id or not job_id:
        info(
            f"{filepath} is without any information about the repo, workflow, or job id"
        )
        return ""

    filename = os.path.basename(filepath)
    # FIX: the key must end with the actual file name (previously the computed
    # `filename` was unused and a literal placeholder ended up in the key)
    return f"{schema_version}/{repo}/{workflow_id}/{job_id}/{filename}"
150+
151+
152+
def upload_to_s3(
    s3_bucket: str,
    filepath: str,
    schema_version: str,
    dry_run: bool = True,
) -> None:
    """
    Upload one benchmark-results JSON file to S3, gzip-compressed.

    Skips the file (with a log line) when no S3 key can be derived from its
    records; does nothing beyond logging when dry_run is set.
    """
    s3_path = generate_s3_path(filepath, schema_version)
    if not s3_path:
        info(f"Could not generate an S3 path for {filepath}, skipping...")
        return

    info(f"Upload {filepath} to s3://{s3_bucket}/{s3_path}")
    if dry_run:
        return

    # Copied from upload stats script
    with open(filepath) as f:
        body = gzip.compress(f.read().encode())
    target = boto3.resource("s3").Object(
        f"{s3_bucket}",
        f"{s3_path}",
    )
    target.put(
        Body=body,
        ContentEncoding="gzip",
        ContentType="application/json",
    )
178+
179+
109180
def main() -> None:
    """Upload every JSON results file in the input directory to the database(s)."""
    args = parse_args()
    schema_version = args.schema_version

    for entry in os.listdir(args.benchmark_results_dir):
        if not entry.endswith(".json"):
            continue

        filepath = os.path.join(args.benchmark_results_dir, entry)

        # NB: This is for backward compatibility before we move to schema v3;
        # v2 records keep going to DynamoDB in addition to S3
        if schema_version == "v2":
            with open(filepath) as f:
                info(f"Uploading {filepath} to dynamoDB ({schema_version})")
                # NB: DynamoDB only accepts decimal number, not float
                docs = json.load(f, parse_float=Decimal)
                upload_to_dynamodb(
                    dynamodb_table=args.dynamodb_table,
                    docs=docs,
                    generate_partition_key=generate_partition_key,
                    dry_run=args.dry_run,
                )

        # Both v2 and v3 results are mirrored to the S3 benchmarks bucket
        upload_to_s3(
            s3_bucket=OSSCI_BENCHMARKS_BUCKET,
            filepath=filepath,
            schema_version=schema_version,
            dry_run=args.dry_run,
        )
127208

128209

129210
if __name__ == "__main__":

.github/workflows/test_upload_benchmark_results.yml

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,16 @@ jobs:
1313
steps:
1414
- uses: actions/checkout@v3
1515

16-
- name: Test upload the benchmark results
16+
- name: Test upload the benchmark results (v2)
1717
uses: ./.github/actions/upload-benchmark-results
1818
with:
19-
benchmark-results-dir: .github/scripts/benchmark-results-dir-for-testing
19+
benchmark-results-dir: .github/scripts/benchmark-results-dir-for-testing/v2
20+
schema-version: v2
21+
dry-run: true
22+
23+
- name: Test upload the benchmark results (v3)
24+
uses: ./.github/actions/upload-benchmark-results
25+
with:
26+
benchmark-results-dir: .github/scripts/benchmark-results-dir-for-testing/v3
27+
schema-version: v3
2028
dry-run: true

0 commit comments

Comments
 (0)