diff --git a/README.md b/README.md index c78a2acd..ac508340 100644 --- a/README.md +++ b/README.md @@ -80,6 +80,8 @@ gh models eval my_prompt.prompt.yml --json The JSON output includes detailed test results, evaluation scores, and summary statistics that can be processed by other tools or CI/CD pipelines. +Here's a sample GitHub Action that uses the `eval` command to automatically run the evals in any PR that updates a prompt file: [evals_action.yml](/examples/evals_action.yml). + Learn more about `.prompt.yml` files here: [Storing prompts in GitHub repositories](https://docs.github.com/github-models/use-github-models/storing-prompts-in-github-repositories). ## Notice diff --git a/examples/evals_action.yml b/examples/evals_action.yml new file mode 100644 index 00000000..819e4020 --- /dev/null +++ b/examples/evals_action.yml @@ -0,0 +1,92 @@ +# This is a sample GitHub Actions workflow file that runs prompt evaluations +# on pull requests when prompt files are changed. It uses the `gh-models` CLI to evaluate prompts +# and comments the results back on the pull request. +# The workflow is triggered by pull requests that modify any `.prompt.yml` files. + + +name: Run evaluations for changed prompts + +permissions: + models: read + contents: read + pull-requests: write + +on: + pull_request: + paths: + - '**/*.prompt.yml' + +jobs: + evaluate-model: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Setup gh-models + run: gh extension install github/gh-models + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + + - name: Find changed prompt files + id: find-prompts + run: | + # Get the list of changed files that match *.prompt.yml pattern + changed_prompts=$(git diff --name-only origin/${{ github.base_ref }}..HEAD | grep '\.prompt\.yml$' | head -1) + + if [[ -z "$changed_prompts" ]]; then + echo "No prompt files found in the changes" + echo "skip_evaluation=true" >> "$GITHUB_OUTPUT" + exit 0 + fi + + echo "first_prompt=$changed_prompts" >> "$GITHUB_OUTPUT" + echo "Found changed prompt file: $changed_prompts" + + - name: Run model evaluation + id: eval + run: | + set -e + PROMPT_FILE="${{ steps.find-prompts.outputs.first_prompt }}" + echo "## Model Evaluation Results" >> "$GITHUB_STEP_SUMMARY" + echo "Evaluating: $PROMPT_FILE" >> "$GITHUB_STEP_SUMMARY" + echo "" >> "$GITHUB_STEP_SUMMARY" + + if gh models eval "$PROMPT_FILE" > eval_output.txt 2>&1; then + echo "✅ All evaluations passed!" >> "$GITHUB_STEP_SUMMARY" + cat eval_output.txt >> "$GITHUB_STEP_SUMMARY" + echo "eval_status=success" >> "$GITHUB_OUTPUT" + else + echo "❌ Some evaluations failed!" >> "$GITHUB_STEP_SUMMARY" + cat eval_output.txt >> "$GITHUB_STEP_SUMMARY" + echo "eval_status=failure" >> "$GITHUB_OUTPUT" + exit 1 + fi + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + + - name: Comment on PR with evaluation results + if: github.event_name == 'pull_request' + uses: actions/github-script@v7 + with: + script: | + const fs = require('fs'); + const output = fs.readFileSync('eval_output.txt', 'utf8'); + const evalStatus = '${{ steps.eval.outputs.eval_status }}'; + const statusMessage = evalStatus === 'success' + ? '✅ Evaluation passed' + : '❌ Evaluation failed'; + + github.rest.issues.createComment({ + issue_number: context.issue.number, + owner: context.repo.owner, + repo: context.repo.repo, + body: `## ${statusMessage} + + \`\`\` + ${output} + \`\`\` + + Review the evaluation results above for more details.` + }); \ No newline at end of file diff --git a/fixtures/failing_test_prompt.yml b/examples/failing_test_prompt.yml similarity index 100% rename from fixtures/failing_test_prompt.yml rename to examples/failing_test_prompt.yml diff --git a/fixtures/sample_prompt.yml b/examples/sample_prompt.yml similarity index 100% rename from fixtures/sample_prompt.yml rename to examples/sample_prompt.yml diff --git a/fixtures/test_builtins.yml b/examples/test_builtins.yml similarity index 100% rename from fixtures/test_builtins.yml rename to examples/test_builtins.yml diff --git a/fixtures/test_single_evaluator.yml b/examples/test_single_evaluator.yml similarity index 100% rename from fixtures/test_single_evaluator.yml rename to examples/test_single_evaluator.yml