303 changes: 217 additions & 86 deletions .github/ci_expected_accuracy/check_expected.py
@@ -1,98 +1,229 @@
#!/usr/bin/env python3
"""
Accuracy Check Script
Compares test results against reference data and calculates pass rates.
Reference last updated: https://github.com/intel/torch-xpu-ops/pull/1223
"""

import re
import json
import argparse
import pandas as pd
from pathlib import Path


def load_data(csv_file):
    """Load CSV file with comment support."""
    return pd.read_csv(csv_file, comment='#')


def find_model_row(dataframe, model_name):
    """Find row for a specific model in dataframe."""
    matches = dataframe[dataframe['name'] == model_name]
    return matches.iloc[0] if not matches.empty else None
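
# Minimal usage sketch (hypothetical data, not taken from this PR):
#   df = pd.DataFrame({"name": ["resnet50"], "accuracy": ["pass"]})
#   find_model_row(df, "resnet50")["accuracy"]   # -> "pass"
#   find_model_row(df, "alexnet")                # -> None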


def get_test_result(data, suite, dtype, mode, model):
    """
    Get test result for specific test configuration.

    Args:
        data: JSON data containing test results
        suite: Test suite name
        dtype: Data type
        mode: Inference or training mode
        model: Model name

    Returns:
        Test result or "N/A" if not found
    """
    for issue in data:
        for row in issue.get('table_rows', []):
            if len(row) >= 6 and row[:4] == [suite, dtype, mode, model]:
                return row[4]
    return "N/A"


def parse_file_name(filename):
    """
    Parse benchmark file name to extract suite, dtype, and mode.

    Args:
        filename: Input filename to parse

    Returns:
        tuple: (suite, dtype, mode) or ("N/A", "N/A", "N/A") if pattern not found
    """
    pattern = (
        r"_(huggingface|timm_models|torchbench)_"
        r"(float32|bfloat16|float16|amp_bf16|amp_fp16)_"
        r"(inference|training)_"
    )
    match = re.search(pattern, filename)
    return match.groups() if match else ("N/A", "N/A", "N/A")
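
# Example (hypothetical file name; only the "_<suite>_<dtype>_<mode>_" part matters):
#   parse_file_name("inductor_huggingface_amp_bf16_inference_xpu_accuracy.csv")
#   # -> ("huggingface", "amp_bf16", "inference")
#   parse_file_name("results.csv")
#   # -> ("N/A", "N/A", "N/A")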


def load_known_data(issue_file):
    """Load known test data from JSON file."""
    try:
        with open(issue_file, encoding='utf-8') as file:
            return json.load(file)
    except (FileNotFoundError, json.JSONDecodeError) as e:
        print(f"Error loading known data from {issue_file}: {e}")
        return []
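
# If the issue file is missing or not valid JSON, this degrades gracefully:
#   load_known_data("does_not_exist.json")   # prints an error and returns []
# so every subsequent get_test_result() lookup answers "N/A".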


def update_reference_dataframe(refer_data, model_name, dtype, accuracy):
    """
    Update reference dataframe with new or updated model results.

    Args:
        refer_data: Reference dataframe to update
        model_name: Name of the model to update
        dtype: Data type column to update
        accuracy: Accuracy value to set

    Returns:
        Updated dataframe
    """
    mask = refer_data['name'] == model_name
    if mask.any():
        refer_data.loc[mask, dtype] = accuracy
    else:
        new_row = {'name': model_name, dtype: accuracy}
        refer_data = pd.concat([refer_data, pd.DataFrame([new_row])], ignore_index=True)
    return refer_data
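
# Illustrative behaviour (hypothetical frame):
#   ref = pd.DataFrame({"name": ["resnet50"], "float32": ["pass"]})
#   ref = update_reference_dataframe(ref, "resnet50", "float32", "fail_accuracy")  # overwrites the cell
#   ref = update_reference_dataframe(ref, "new_model", "float32", "pass")          # appends a new row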


def categorize_model(test_accuracy, refer_accuracy, known_accuracy):
    """
    Categorize a model based on its test results.

    Returns:
        tuple: (category, should_update_reference)
    """
    if test_accuracy == "N/A":
        return "lost", False
    elif 'pass' in test_accuracy:
        if refer_accuracy == "N/A":
            return "new", True
        elif 'pass' not in refer_accuracy:
            return "new_pass", True
        return "passed", False
    elif 'timeout' in test_accuracy:
        if refer_accuracy == "N/A":
            return "new", True
        return "timeout", False
    else:  # Failed cases
        if refer_accuracy == "N/A":
            # Not failed for new models; record the result in the reference
            return "expected_failed", True
        elif "pass" in refer_accuracy and known_accuracy != test_accuracy:
            # Regression: the reference says pass and this is not a known issue
            return "real_failed", False
        else:
            if test_accuracy != refer_accuracy:
                # Expected failure whose recorded failure mode changed
                return "expected_failed", True
            return "expected_failed", False


def print_results_summary(suite, dtype, mode, categories):
    """Print formatted summary of results."""
    print(f"============ Summary for {suite} {dtype} {mode} accuracy ============")
    print(f"Total models: {len(categories['all_models'])}")
    print(f"Passed models: {len(categories['passed'])}")
    print(f"Real failed models: {len(categories['real_failed'])} , {categories['real_failed']}")
    print(f"Expected failed models: {len(categories['expected_failed'])} , {categories['expected_failed']}")
    print(f"Warning timeout models: {len(categories['timeout'])} , {categories['timeout']}")
    print(f"New models: {len(categories['new'])} , {categories['new']}")
    print(f"Failed to passed models: {len(categories['new_pass'])} , {categories['new_pass']}")
    print(f"Not run/in models: {len(categories['lost'])} , {categories['lost']}")

    total_models = len(categories['all_models'])
    if total_models > 0:
        pass_rate = len(categories['passed']) / total_models * 100
        print(f"Pass rate: {pass_rate:.2f}%")


def main():
    """Main function to run accuracy comparison."""
    parser = argparse.ArgumentParser(
        description="Accuracy Check",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    parser.add_argument("--driver", type=str, default="rolling", help="rolling or lts")
    parser.add_argument("--category", type=str, default="inductor", help="inductor")
    parser.add_argument("--suite", type=str, required=True, help="huggingface, timm_models or torchbench")
    parser.add_argument("--mode", type=str, required=True, help="inference or training")
    parser.add_argument("--dtype", type=str, required=True, help="float32, bfloat16, float16, amp_bf16 or amp_fp16")
    parser.add_argument("--csv_file", type=str, required=True, help="Test results CSV file path")
    parser.add_argument("--issue_file", type=str, required=True, help="Known test data JSON file path")
    parser.add_argument('--update', action='store_true', help="Whether to update new pass and new failed info")

    args = parser.parse_args()

    # Load data files
    test_data = load_data(args.csv_file)
    test_known_data = load_known_data(args.issue_file)
    suite, dtype, mode = parse_file_name(args.csv_file)

    # Load reference data
    current_path = Path(__file__).parent.resolve()
    refer_filename = f"{args.category}_{args.suite}_{args.mode}.csv"
    refer_file = current_path / args.driver / refer_filename
    refer_data = load_data(refer_file)

    # Get model names
    test_names = test_data['name'].tolist()
    refer_names = refer_data['name'].tolist()
    model_names = set(refer_names + test_names)

    # Initialize result categories
    categories = {
        'all_models': list(model_names),
        'passed': [],
        'real_failed': [],
        'expected_failed': [],
        'new': [],
        'new_pass': [],
        'lost': [],
        'timeout': []
    }

    needs_update = False

    # Process each model
    for model_name in model_names:
        test_row = find_model_row(test_data, model_name)
        refer_row = find_model_row(refer_data, model_name)

        test_accuracy = str(test_row['accuracy']) if test_row is not None else "N/A"
        refer_accuracy = str(refer_row[args.dtype]) if refer_row is not None else "N/A"
        known_accuracy = get_test_result(test_known_data, suite, dtype, mode, model_name)

        # Debug print (optional)
        # print(f"{model_name}: test={test_accuracy}, ref={refer_accuracy}, known={known_accuracy}")

        # Categorize model and determine if reference needs update
        category, should_update = categorize_model(
            test_accuracy, refer_accuracy, known_accuracy
        )

        categories[category].append([model_name, test_accuracy])

        # Update reference data if needed
        if should_update and args.update:
            refer_data = update_reference_dataframe(
                refer_data, model_name, args.dtype, test_accuracy
            )
            needs_update = True

    # Print summary
    print_results_summary(args.suite, args.dtype, args.mode, categories)

    # Update reference CSV if requested
    if needs_update:
        refer_data.to_csv(refer_file, sep=',', encoding='utf-8', index=False)
        print(f"Reference file updated: {refer_file}")


if __name__ == "__main__":
    main()
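
# Example invocation (file names are placeholders, not from this PR):
#   python .github/ci_expected_accuracy/check_expected.py \
#       --suite huggingface --mode inference --dtype float32 \
#       --csv_file inductor_huggingface_float32_inference_xpu_accuracy.csv \
#       --issue_file known_issues.json \
#       --update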