privacybenchrollup.py
import json
import re
import glob
from collections import defaultdict


def extract_numeric_score(entry):
    """
    Returns the numeric score from an entry.
    Prefer the 'numeric_score' field if present;
    otherwise, extract the grade from the 'score' string (e.g., "Grade: 4\n\nJustification: ...")
    and convert it from the 0-5 scale to a 0-100 scale.
    """
    if "numeric_score" in entry and entry["numeric_score"] is not None:
        return entry["numeric_score"]
    score_text = entry.get("score", "")
    match = re.search(r"Grade:\s*([0-9]+(?:\.[0-9]+)?)", score_text)
    if match:
        grade = float(match.group(1))
        return (grade / 5.0) * 100
    return None
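
# Illustrative only (values below are made up, not taken from real result files):
#   extract_numeric_score({"score": "Grade: 4\n\nJustification: ..."})   -> 80.0
#   extract_numeric_score({"numeric_score": 72.5, "score": "Grade: 3"})  -> 72.5
# i.e. a precomputed 'numeric_score' wins; otherwise the 0-5 grade is scaled to 0-100.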


def load_entries_from_files(prefix):
    """Loads and returns all JSON entries from files in the current directory starting with the given prefix."""
    entries = []
    for filename in glob.glob(f"{prefix}*.json"):
        try:
            with open(filename, "r", encoding="utf-8") as f:
                data = json.load(f)
            # Expecting data to be a list of entries.
            if isinstance(data, list):
                entries.extend(data)
            else:
                print(f"Warning: {filename} does not contain a JSON list.")
        except Exception as e:
            print(f"Error loading {filename}: {e}")
    return entries
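
# For reference, the fields this script reads from each entry are sketched below.
# The exact schema of the PrivacyBench result files is assumed, and the values are
# purely illustrative:
#
#   {
#       "id": 7,
#       "model_used": "example-model",
#       "question": "How does the service share my data?",
#       "score": "Grade: 4\n\nJustification: ...",
#       "numeric_score": 80.0
#   }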


def compile_score_report(all_entries):
    """
    Compiles a report mapping each model (from 'model_used') to its overall average numeric score
    and a breakdown per question (IDs 1 to 25). For each question, the report includes the average
    score and the question text.
    """
    # Structure: model -> { question_id -> {"scores": [list of scores], "question": question text} }
    model_data = defaultdict(lambda: defaultdict(lambda: {"scores": [], "question": None}))

    for entry in all_entries:
        numeric_score = extract_numeric_score(entry)
        if numeric_score is None:
            continue  # Skip entries without a valid score.
        question_id = entry.get("id")
        model_used = entry.get("model_used", "Unknown Model")
        question_text = entry.get("question", "No question text provided.")
        if question_id is None:
            continue
        model_entry = model_data[model_used][question_id]
        model_entry["scores"].append(numeric_score)
        # If the question text hasn't been set yet, assign it.
        if model_entry["question"] is None:
            model_entry["question"] = question_text

    # Build the final report.
    report = {}
    for model, questions in model_data.items():
        breakdown = {}
        all_scores = []
        for q in range(1, 26):  # Assuming questions 1 through 25.
            if q in questions and questions[q]["scores"]:
                avg_score = sum(questions[q]["scores"]) / len(questions[q]["scores"])
                breakdown[q] = {
                    "score": round(avg_score, 2),
                    "question": questions[q]["question"]
                }
                all_scores.append(avg_score)
            else:
                breakdown[q] = {
                    "score": None,
                    "question": None
                }
        overall = round(sum(all_scores) / len(all_scores), 2) if all_scores else None
        report[model] = {
            "overall_score": overall,
            "question_breakdown": breakdown
        }
    return report
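
# The returned structure looks roughly like the sketch below (model name and numbers are
# illustrative, not real results):
#
#   {
#       "example-model": {
#           "overall_score": 78.4,
#           "question_breakdown": {
#               1: {"score": 80.0, "question": "How does the service share my data?"},
#               2: {"score": None, "question": None},
#               ...
#           }
#       }
#   }
#
# Note that json.dump serializes the integer question IDs as string keys in the saved report.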


def main():
    # Load entries from both types of files.
    results_entries = load_entries_from_files("privacybench_results")
    numeric_entries = load_entries_from_files("privacybench_numeric_scores")

    # Combine entries. If an entry appears in both sets, that's acceptable since we're averaging per question.
    all_entries = results_entries + numeric_entries
    if not all_entries:
        print("No entries found in the specified files.")
        return

    score_report = compile_score_report(all_entries)

    # Print the report in a readable format.
    for model, scores in score_report.items():
        print(f"Model: {model}")
        print(f"  Overall Score: {scores['overall_score']}")
        print("  Question Breakdown:")
        for q in range(1, 26):
            qb = scores['question_breakdown'].get(q, {})
            score_val = qb.get("score")
            question_text = qb.get("question")
            print(f"    Q{q}: Score = {score_val} | Question: {question_text}")
        print("-" * 40)

    # Save the report to a JSON file.
    output_file = "privacybench_score_report.json"
    try:
        with open(output_file, "w", encoding="utf-8") as f:
            json.dump(score_report, f, indent=4)
        print(f"✅ Score report saved to {output_file}")
    except Exception as e:
        print(f"Error saving report: {e}")


if __name__ == "__main__":
    main()

# Summary: this script rolls up PrivacyBench evaluation results into per-model averages.
# It loads entries from the results and numeric-score JSON files, averages scores per
# question (handling multiple entries per model and question), prints a per-model
# breakdown for questions 1-25, and saves the report to "privacybench_score_report.json".
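
# A rough sketch of the console output, assuming result files for a model named
# "example-model" are present (name and numbers are purely illustrative):
#
#   Model: example-model
#     Overall Score: 78.4
#     Question Breakdown:
#       Q1: Score = 80.0 | Question: How does the service share my data?
#       Q2: Score = None | Question: None
#       ...
#   ----------------------------------------
#   ✅ Score report saved to privacybench_score_report.json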