Improve and simplify the microbenchmarks CI setup (#7571)

igoragoli · web-flow · commit 6e87745716a3 · 2025-10-02T18:33:37.000+02:00
- [x] Tackle all TODOs on `microbenchmarks.yml` before merging. - [ ] Merge [refactor: simplify dd-trace-dotnet microbenchmarks](DataDog/benchmarking-platform#200) before merging. ## Summary of changes - Add a dd-octo-sts policy allowing GitLab runners to access dd-trace-dotnet contents. - Improve and simplify microbenchmarks and macrobenchmarks CI setup. - Rename the AMI creation job from `update-bp-infra` to `build-dd-trace-dotnet-microbenchmarks-ami`. - Use `dd-octo-sts` for generating GitHub tokens. - Simplify and reduce the number of CI variables, standardizing on the `BP_INFRA_*` prefix. - Make instance cleanup conditional on a `CLEANUP` variable. This allows us to SSH/RDP into instances after benchmarks are run, if necessary. Related changes on benchmarking-platform: [refactor: simplify dd-trace-dotnet microbenchmarks](DataDog/benchmarking-platform#200) ## Reason for change https://datadoghq.atlassian.net/browse/APMSP-2282 and https://datadoghq.atlassian.net/browse/APMSP-1908. ## Implementation details ## Test coverage Benchmark run on the CI: https://gitlab.ddbuild.io/DataDog/apm-reliability/dd-trace-dotnet/-/jobs/1154611844 ## Other details
diff --git a/.github/chainguard/gitlab.github-access.read-contents.sts.yaml b/.github/chainguard/gitlab.github-access.read-contents.sts.yaml
@@ -0,0 +1,6 @@
+issuer: https://gitlab.ddbuild.io
+
+subject_pattern: "project_path:DataDog/apm-reliability/dd-trace-dotnet:ref_type:(branch|tag):ref:.*"
+
+permissions:
+  contents: read
diff --git a/.gitlab/benchmarks/macrobenchmarks.yml b/.gitlab/benchmarks/macrobenchmarks.yml
@@ -467,6 +467,13 @@ profiler_cpu_timer_create-arm64:
       - platform/artifacts/
     expire_in: 3 months
   variables:
+    AWS_REGION: "us-east-1"
+    
+    # Branch containing 1. scripts to launch Windows benchmarks on ephemeral 
+    # instances (to be used by GitLab CI runners) and 2. scripts to run Windows 
+    # benchmarks (to be used by the ephemeral instances).
+    BP_INFRA_BENCHMARKING_PLATFORM_BRANCH: "dd-trace-dotnet/macro"
+
     # Whether to cleanup ephemeral instances after benchmarks are run
     CLEANUP: "true"
 
@@ -489,14 +496,17 @@ profiler_cpu_timer_create-arm64:
   script:
     - source build-id.txt
     - echo "Building for the following build https://dev.azure.com/datadoghq/dd-trace-dotnet/_build/results?buildId=$buildId&view=results"
-    - export BP_INFRA_BENCHMARKING_PLATFORM_BRANCH=dd-trace-dotnet/macro
     - git clone --branch $BP_INFRA_BENCHMARKING_PLATFORM_BRANCH https://gitlab-ci-token:${CI_JOB_TOKEN}@gitlab.ddbuild.io/DataDog/benchmarking-platform platform && cd platform
     - ./ephemeral-infra/run-windows-benchmarks.sh
   after_script:
     - |
-      bp-infra cleanup --provision ./platform/ephemeral-infra/provisions/macrobenchmark-ephemeral-instance.yaml \
-        --region "${AWS_REGION}" \
-        --bypass-stack-destroy
+      if [ "$CLEANUP" = "true" ]; then # '=' (not '==') keeps the test valid under POSIX sh as well as bash
+        bp-infra cleanup --provision ./platform/ephemeral-infra/provisions/macrobenchmark-ephemeral-instance.yaml \
+          --region "${AWS_REGION}" \
+          --bypass-stack-destroy
+      else # reached for any value other than "true", so report the actual value
+        echo "'CLEANUP' is set to '${CLEANUP}'. Will not cleanup."
+      fi
 
 baseline-win:
   extends: .benchmarks-win
diff --git a/.gitlab/benchmarks/microbenchmarks.yml b/.gitlab/benchmarks/microbenchmarks.yml
@@ -1,101 +1,133 @@
-.setup:
-  script:
-    - mkdir -p ~/.aws
-    - /app/bp-infra/tools/fetch-ssm-parameter.sh $AWS_EPHEMERAL_INFRA_PROFILE_SSM_PARAMETER > ~/.aws/config || exit $?
-    - export AWS_PROFILE=ephemeral-infra-ci
-    - export BP_INFRA_KEY_PAIR_NAME=$(cat ~/.aws/key-pair-name.txt)
-    - export BP_INFRA_KEY_PAIR_PRIVATE_KEY_PATH=~/.aws/key-pair-private-key.pem
+.dd-octo-sts-setup:
+  before_script:
+    - |
+      set +e
+      error_output=$({ dd-octo-sts token --scope DataDog/dd-trace-dotnet --policy gitlab.github-access.read-contents > "/tmp/github-token"; } 2>&1)
+      exit_code=$?
+      if [ $exit_code -ne 0 ]; then
+        echo "ERROR: Failed to retrieve a GitHub token with dd-octo-sts gitlab.github-access.read-contents policy."
+        echo "Original error: $error_output"
+        echo "Continuing execution anyway..."
+      fi
+      set -e
 
 stages:
-  - infra-update
+  - build
   - benchmarks
 
-update-bp-infra:
-  stage: infra-update
+build-dd-trace-dotnet-microbenchmarks-ami:
+  stage: build
+  tags: ["arch:amd64"]
   timeout: 3h
-  tags: ["arch:amd64"]  
   allow_failure: true
-  # Image created in the following job https://gitlab.ddbuild.io/DataDog/benchmarking-platform-tools/-/jobs/869830045
+  when: manual
   image: registry.ddbuild.io/images/benchmarking-platform-tools-ubuntu:dd-trace-dotnet-micro
+  id_tokens:
+    DDOCTOSTS_ID_TOKEN:
+      aud: dd-octo-sts
+  variables:
+    AWS_REGION: "us-east-1"
+
+    # Branch containing a provision for building the AMI
+    BP_INFRA_BENCHMARKING_PLATFORM_BRANCH: "dd-trace-dotnet/micro"
+
+    PROVISION_FILE: "platform/ephemeral-infra/ami.yaml"
 
+    # Where AMI creation artifacts will be stored
+    BP_INFRA_ARTIFACTS_BUCKET_NAME: "windows-benchmarking-results-us-east-1"
+
+    # Whether to clean up instances after building the AMI, since the AMI is
+    # based on an instance that is created in this job.
+    CLEANUP: "true"
+  before_script:
+    - !reference [.dd-octo-sts-setup, before_script]
   script:
-    - git clone --branch dd-trace-dotnet/micro https://gitlab-ci-token:${CI_JOB_TOKEN}@gitlab.ddbuild.io/DataDog/benchmarking-platform platform
-    - mkdir -p ~/.aws
-    - /app/bp-infra/tools/fetch-ssm-parameter.sh $AWS_EPHEMERAL_INFRA_PROFILE_SSM_PARAMETER >> ~/.aws/config || exit $?
-    - aws ssm get-parameter --region "$AWS_REGION" --name "ci.${CI_PROJECT_NAME}.ephemeral-infra-ci.windows-benchmarking-key-pair-name" --with-decryption --query "Parameter.Value" --out text >> ~/.aws/key-pair-name.txt
-    - aws ssm get-parameter --region "$AWS_REGION" --name "ci.${CI_PROJECT_NAME}.ephemeral-infra-ci.windows-benchmarking-key-private-key" --with-decryption --query "Parameter.Value" --out text >> ~/.aws/key-pair-private-key.pem
-    - export AWS_PROFILE=ephemeral-infra-ci
-    - export BP_INFRA_KEY_PAIR_NAME=$(cat ~/.aws/key-pair-name.txt)
-    - export BP_INFRA_KEY_PAIR_PRIVATE_KEY_PATH=~/.aws/key-pair-private-key.pem
-    - bp-infra launch --provision ./platform/ephemeral-infra/base-instance.yaml --region "${AWS_REGION}" --bypass-stack-destroy
+    - git clone --branch $BP_INFRA_BENCHMARKING_PLATFORM_BRANCH https://gitlab-ci-token:${CI_JOB_TOKEN}@gitlab.ddbuild.io/DataDog/benchmarking-platform platform
+    - echo "GITHUB_TOKEN=$(cat /tmp/github-token)" > .env
+    - CLEANUP_ARG=$([[ "$CLEANUP" == "false" ]] && echo "--no-cleanup" || echo "")
+    - |
+      bp-infra launch --region "${AWS_REGION}" --os "windows" \
+        --provision "${PROVISION_FILE}" \
+        --bypass-stack-destroy \
+        --env .env \
+        $CLEANUP_ARG
   after_script:
-    - !reference [.setup, script]
+    # Makes sure the instance is cleaned up.
+    # Note: This does not clean up the created AMI.
     - |
-      bp-infra cleanup --provision ./platform/ephemeral-infra/base-instance.yaml \
-        --region "${AWS_REGION}" \
-        --bypass-stack-destroy
-    
-  rules:
-    - when: manual
-  variables:
-    AWS_REGION: "us-east-1"
-    CLEANUP: "false"
-    AWS_EPHEMERAL_INFRA_PROFILE_SSM_PARAMETER: "ci.dd-trace-dotnet.ephemeral-infra-ci.dd-trace-dotnet-profile"
-    AWS_EPHEMERAL_INFRA_PROFILE_NAME: "ephemeral-infra-ci"
-    AWS_EPHEMERAL_INFRA_ARTIFACTS_BUCKET_URI: "s3://windows-benchmarking-results/$CI_PROJECT_NAME/$CI_COMMIT_REF_NAME/$CI_JOB_ID"
-    AWS_EPHEMERAL_INFRA_REGION: "us-east-1"
+      if [ "$CLEANUP" = "true" ]; then # '=' (not '==') keeps the test valid under POSIX sh as well as bash
+        bp-infra cleanup --region "${AWS_REGION}" --os "windows" \
+          --provision "${PROVISION_FILE}" \
+          --bypass-stack-destroy
+      else # reached for any value other than "true", so report the actual value
+        echo "'CLEANUP' is set to '${CLEANUP}'. Will not cleanup."
+      fi
 
 run-benchmarks:
   stage: benchmarks
   tags: ["arch:amd64"]
   timeout: 2h
   # Image created in the following job https://gitlab.ddbuild.io/DataDog/benchmarking-platform-tools/-/jobs/869830045
   image: registry.ddbuild.io/images/benchmarking-platform-tools-ubuntu:dd-trace-dotnet-micro
+  id_tokens:
+    DDOCTOSTS_ID_TOKEN:
+      aud: dd-octo-sts
+  rules:
+    - when: on_success
+  variables:
+    AWS_REGION: "us-east-1"
+
+    # Branch containing 1. scripts to launch Windows benchmarks on ephemeral 
+    # instances (to be used by GitLab CI runners) and 2. scripts to run Windows 
+    # benchmarks (to be used by the ephemeral instances).
+    BP_INFRA_BENCHMARKING_PLATFORM_BRANCH: "dd-trace-dotnet/micro"
 
+    # Where benchmarking results will be stored
+    BP_INFRA_ARTIFACTS_BUCKET_NAME: "windows-benchmarking-results-us-east-1"
+
+    # Whether to cleanup ephemeral instances after benchmarks are run
+    CLEANUP: "true"
+
+  before_script:
+    - !reference [.dd-octo-sts-setup, before_script]
   script:
-    - git clone --branch dd-trace-dotnet/micro https://gitlab-ci-token:${CI_JOB_TOKEN}@gitlab.ddbuild.io/DataDog/benchmarking-platform platform
-    - AWS_REGION=${AWS_REGION} ./platform/steps/launch-instance.sh
+    - git clone --branch $BP_INFRA_BENCHMARKING_PLATFORM_BRANCH https://gitlab-ci-token:${CI_JOB_TOKEN}@gitlab.ddbuild.io/DataDog/benchmarking-platform platform
+    - ./platform/steps/run-windows-benchmarks.sh
   after_script:
+    # Future improvement: ideally, the cleanup step below should live in a script.
     - |
-      bp-infra cleanup --provision ./platform/ephemeral-infra/ephemeral-instance-main.yaml \
-        --region "${AWS_REGION}" \
-        --bypass-stack-destroy
+      if [ "$CLEANUP" = "true" ]; then # '=' (not '==') keeps the test valid under POSIX sh as well as bash
+        bp-infra cleanup --provision ./platform/ephemeral-infra/instance.yaml \
+          --region "${AWS_REGION}" \
+          --bypass-stack-destroy
+      else # reached for any value other than "true", so report the actual value
+        echo "'CLEANUP' is set to '${CLEANUP}'. Will not cleanup."
+      fi
     - ./platform/steps/post-pr-comment.sh
-    # Temporarily commented out pending issue resolution with sending files to backend
+    # TODO: Uncomment this when the issue with sending files to backend is resolved
     # - ./platform/steps/upload-to-bp-ui.sh
     
-  rules:
-    - when: on_success
-  variables:
-    AWS_REGION: "us-east-1"
-
+# TODO: Remove if unnecessary
 upload-to-bp-ui:
   stage: benchmarks
   tags: ["arch:amd64"]
   timeout: 1h
-  # Image created in the following job https://gitlab.ddbuild.io/DataDog/benchmarking-platform-tools/-/jobs/869830045
-  image: registry.ddbuild.io/images/benchmarking-platform-tools-ubuntu:dotnet-microbenchmarks
-
-  script:
-    - git clone --branch fayssal/test-micro-delivery https://gitlab-ci-token:${CI_JOB_TOKEN}@gitlab.ddbuild.io/DataDog/benchmarking-platform platform
-    # - ./platform/steps/launch-instance.sh
-    # - ./platform/steps/post-pr-comment.sh
-    # Temporarely commented out pending issue resolution with sending files to backend
-    - ./platform/steps/upload-to-bp-ui.sh
-
+  image: registry.ddbuild.io/images/benchmarking-platform-tools-ubuntu:dd-trace-dotnet-micro
+  when: manual
   artifacts:
     name: "artifacts"
     when: always
     paths:
       - candidate-results/
     expire_in: 3 months
-    
-  rules:
-    - when: manual
   variables:
     AWS_REGION: "us-east-1"
-    CLEANUP: "false"
-    AWS_EPHEMERAL_INFRA_PROFILE_SSM_PARAMETER: "ci.dd-trace-dotnet.ephemeral-infra-ci.dd-trace-dotnet-profile"
-    AWS_EPHEMERAL_INFRA_PROFILE_NAME: "ephemeral-infra-ci"
-    AWS_EPHEMERAL_INFRA_ARTIFACTS_BUCKET_URI: "s3://windows-benchmarking-results/$CI_PROJECT_NAME/$CI_COMMIT_REF_NAME/$CI_JOB_ID"
-    AWS_EPHEMERAL_INFRA_REGION: "us-east-1"
+
+    # This variable omits the `BP_INFRA_` prefix because this job does not invoke bp-infra.
+    BENCHMARKING_PLATFORM_BRANCH: "dd-trace-dotnet/micro"
+
+    # Where to fetch results from
+    BP_INFRA_ARTIFACTS_BUCKET_NAME: "windows-benchmarking-results-us-east-1"
+  script:
+    - git clone --branch $BENCHMARKING_PLATFORM_BRANCH https://gitlab-ci-token:${CI_JOB_TOKEN}@gitlab.ddbuild.io/DataDog/benchmarking-platform platform
+    - ./platform/steps/upload-to-bp-ui.sh