diff --git a/.github/ci_expected_accuracy/check_expected.py b/.github/ci_expected_accuracy/check_expected.py index 3c82666af0..abda5922f6 100644 --- a/.github/ci_expected_accuracy/check_expected.py +++ b/.github/ci_expected_accuracy/check_expected.py @@ -1,98 +1,229 @@ +#!/usr/bin/env python3 +""" +Accuracy Check Script +Compares test results against reference data and calculates pass rates. +Reference last updated: https://github.com/intel/torch-xpu-ops/pull/1223 +""" +import re +import json import argparse import pandas as pd -import pathlib - -# Reference last updated is https://github.com/intel/torch-xpu-ops/pull/1223 - -parser = argparse.ArgumentParser(description="Accuracy Check", formatter_class=argparse.ArgumentDefaultsHelpFormatter) -parser.add_argument("--driver", type=str, default="rolling", help="rolling or lts") -parser.add_argument("--category", type=str, default="inductor", help="inductor") -parser.add_argument("--suite", type=str, required=True, help="huggingface, timm_models or torchbench") -parser.add_argument("--mode", type=str, required=True, help="inference or training") -parser.add_argument("--dtype", type=str, required=True, help="float32, bfloat16, float16, amp_bf16 or amp_fp16") -# parser.add_argument("--scenario", type=str, required=True, help="accuracy or performance") -parser.add_argument("--csv_file", type=str, required=True, help="your result csv path") -parser.add_argument('--update', action='store_true', help="whether update new pass and new failed info") -args = parser.parse_args() - - -# load csv files -test_data = pd.read_csv(args.csv_file, comment='#') -# test_data = test_data.reset_index() # make sure indexes pair with number of rows -# test_data = test_data.sort_values(by=["name"], ascending=True) -test_names = [row["name"] for index, row in test_data.iterrows()] - -current_path = pathlib.Path(__file__).parent.resolve() -refer_file = str(current_path) + "/" + args.driver + "/" + args.category + "_" + args.suite + "_" + args.mode + ".csv" -refer_data = pd.read_csv(refer_file, comment='#') -# refer_data = refer_data.reset_index() # make sure indexes pair with number of rows -# refer_data = refer_data.sort_values(by=["name"], ascending=True) -refer_names = [row["name"] for index, row in refer_data.iterrows()] - -# summary -model_names = set(refer_names + test_names) -passed_models = [] -real_failed_models = [] -expected_failed_models = [] -new_models = [] -new_pass_models = [] -lost_models = [] -timeout_models = [] -# for index, row in refer_data.iterrows(): -for model_name in model_names: - test_row = next(([i, line] for i, line in test_data.iterrows() if line["name"] == model_name), "N/A") - refer_row = next(([i, line] for i, line in refer_data.iterrows() if line["name"] == model_name), "N/A") - test_accuracy = test_row[1]["accuracy"] if test_row != "N/A" else "N/A" - refer_accuracy = refer_row[1][args.dtype] if refer_row != "N/A" else "N/A" - test_accuracy = str(test_accuracy) - refer_accuracy = str(refer_accuracy) +from pathlib import Path + + +def load_data(csv_file): + """Load CSV file with comment support.""" + return pd.read_csv(csv_file, comment='#') + + +def find_model_row(dataframe, model_name): + """Find row for a specific model in dataframe.""" + matches = dataframe[dataframe['name'] == model_name] + return matches.iloc[0] if not matches.empty else None + + +def get_test_result(data, suite, dtype, mode, model): + """ + Get test result for specific test configuration. 
+
+    Args:
+        data: JSON data containing test results
+        suite: Test suite name
+        dtype: Data type
+        mode: Inference or training mode
+        model: Model name
+
+    Returns:
+        Test result or "N/A" if not found
+    """
+    for issue in data:
+        for row in issue.get('table_rows', []):
+            if len(row) >= 6 and row[:4] == [suite, dtype, mode, model]:
+                return row[4]
+    return "N/A"
+
+
+def parse_file_name(filename):
+    """
+    Parse benchmark file name to extract suite, dtype, and mode.
+
+    Args:
+        filename: Input filename to parse
+
+    Returns:
+        tuple: (suite, dtype, mode) or ("N/A", "N/A", "N/A") if pattern not found
+    """
+    pattern = (
+        r"_(huggingface|timm_models|torchbench)_"
+        r"(float32|bfloat16|float16|amp_bf16|amp_fp16)_"
+        r"(inference|training)_"
+    )
+    match = re.search(pattern, filename)
+    return match.groups() if match else ("N/A", "N/A", "N/A")
+
+
+def load_known_data(issue_file):
+    """Load known test data from JSON file."""
+    try:
+        with open(issue_file, encoding='utf-8') as file:
+            return json.load(file)
+    except (FileNotFoundError, json.JSONDecodeError) as e:
+        print(f"Error loading known data from {issue_file}: {e}")
+        return []
+
+
+def update_reference_dataframe(refer_data, model_name, dtype, accuracy):
+    """
+    Update reference dataframe with new or updated model results.
+
+    Args:
+        refer_data: Reference dataframe to update
+        model_name: Name of the model to update
+        dtype: Data type column to update
+        accuracy: Accuracy value to set
+
+    Returns:
+        Updated dataframe
+    """
+    mask = refer_data['name'] == model_name
+    if mask.any():
+        refer_data.loc[mask, dtype] = accuracy
+    else:
+        new_row = {'name': model_name, dtype: accuracy}
+        refer_data = pd.concat([refer_data, pd.DataFrame([new_row])], ignore_index=True)
+    return refer_data
+
+
+def categorize_model(test_accuracy, refer_accuracy, known_accuracy):
+    """
+    Categorize a model based on its test results.
+ + Returns: + tuple: (category, should_update_reference) + """ if test_accuracy == "N/A": - lost_models.append([model_name, test_accuracy]) + return "lost", False elif 'pass' in test_accuracy: - passed_models.append([model_name, test_accuracy]) if refer_accuracy == "N/A": - new_models.append([model_name, test_accuracy]) - refer_data.loc[len(refer_data), :] = "N/A" - refer_data.at[len(refer_data) - 1, "name"] = model_name - refer_data.at[len(refer_data) - 1, args.dtype] = test_accuracy + return "new", True elif 'pass' not in refer_accuracy: - new_pass_models.append([model_name, test_accuracy]) - refer_data.at[refer_row[0], args.dtype] = test_accuracy + return "new_pass", True + return "passed", False elif 'timeout' in test_accuracy: - timeout_models.append([model_name, test_accuracy]) if refer_accuracy == "N/A": - new_models.append([model_name, test_accuracy]) - refer_data.loc[len(refer_data), :] = "N/A" - refer_data.at[len(refer_data) - 1, "name"] = model_name - refer_data.at[len(refer_data) - 1, args.dtype] = test_accuracy - else: + return "new", True + return "timeout", False + else: # Failed cases if refer_accuracy == "N/A": - new_models.append([model_name, test_accuracy]) - # Not failed for new models - expected_failed_models.append([model_name, test_accuracy]) - refer_data.loc[len(refer_data), :] = "N/A" - refer_data.at[len(refer_data) - 1, "name"] = model_name - refer_data.at[len(refer_data) - 1, args.dtype] = test_accuracy - elif "pass" in refer_accuracy: - real_failed_models.append([model_name, test_accuracy]) + return "expected_failed", True + elif "pass" in refer_accuracy and known_accuracy != test_accuracy: + return "real_failed", False else: - expected_failed_models.append([model_name, test_accuracy]) if test_accuracy != refer_accuracy: - refer_data.at[refer_row[0], args.dtype] = test_accuracy - -# pass rate -print(f"============ Summary for {args.suite} {args.dtype} {args.mode} accuracy ============") -print("Total models:", len(model_names)) -print("Passed models:", len(passed_models)) -print("Real failed models:", len(real_failed_models), real_failed_models) -print("Expected failed models:", len(expected_failed_models), expected_failed_models) -print("Warning timeout models:", len(timeout_models), timeout_models) -print("New models:", len(new_models), new_models) -print("Failed to passed models:", len(new_pass_models), new_pass_models) -print("Not run/in models:", len(lost_models), lost_models) -print(f"Pass rate: {len(passed_models) / len(model_names) * 100:.2f}%") - -# update reference csv -if len(new_pass_models + new_models) > 0 and args.update: - refer_data.to_csv(refer_file, sep=',', encoding='utf-8', index=False) + return "expected_failed", True + return "expected_failed", False + + +def print_results_summary(suite, dtype, mode, categories): + """Print formatted summary of results.""" + print(f"============ Summary for {suite} {dtype} {mode} accuracy ============") + print(f"Total models: {len(categories['all_models'])}") + print(f"Passed models: {len(categories['passed'])}") + print(f"Real failed models: {len(categories['real_failed'])} , {categories['real_failed']}") + print(f"Expected failed models: {len(categories['expected_failed'])} , {categories['expected_failed']}") + print(f"Warning timeout models: {len(categories['timeout'])} , {categories['timeout']}") + print(f"New models: {len(categories['new'])} , {categories['new']}") + print(f"Failed to passed models: {len(categories['new_pass'])} , {categories['new_pass']}") + print(f"Not run/in models: 
{len(categories['lost'])} , {categories['lost']}") + + total_models = len(categories['all_models']) + if total_models > 0: + pass_rate = len(categories['passed']) / total_models * 100 + print(f"Pass rate: {pass_rate:.2f}%") + + +def main(): + """Main function to run accuracy comparison.""" + parser = argparse.ArgumentParser( + description="Accuracy Check", + formatter_class=argparse.ArgumentDefaultsHelpFormatter + ) + parser.add_argument("--driver", type=str, default="rolling", help="rolling or lts") + parser.add_argument("--category", type=str, default="inductor", help="inductor") + parser.add_argument("--suite", type=str, required=True, help="huggingface, timm_models or torchbench") + parser.add_argument("--mode", type=str, required=True, help="inference or training") + parser.add_argument("--dtype", type=str, required=True, help="float32, bfloat16, float16, amp_bf16 or amp_fp16") + parser.add_argument("--csv_file", type=str, required=True, help="Test results CSV file path") + parser.add_argument("--issue_file", type=str, required=True, help="Known test data JSON file path") + parser.add_argument('--update', action='store_true', help="Whether to update new pass and new failed info") + + args = parser.parse_args() + + # Load data files + test_data = load_data(args.csv_file) + test_known_data = load_known_data(args.issue_file) + suite, dtype, mode = parse_file_name(args.csv_file) + + # Load reference data + current_path = Path(__file__).parent.resolve() + refer_filename = f"{args.category}_{args.suite}_{args.mode}.csv" + refer_file = current_path / args.driver / refer_filename + refer_data = load_data(refer_file) + + # Get model names + test_names = test_data['name'].tolist() + refer_names = refer_data['name'].tolist() + model_names = set(refer_names + test_names) + + # Initialize result categories + categories = { + 'all_models': list(model_names), + 'passed': [], + 'real_failed': [], + 'expected_failed': [], + 'new': [], + 'new_pass': [], + 'lost': [], + 'timeout': [] + } + + needs_update = False + + # Process each model + for model_name in model_names: + test_row = find_model_row(test_data, model_name) + refer_row = find_model_row(refer_data, model_name) + + test_accuracy = str(test_row['accuracy']) if test_row is not None else "N/A" + refer_accuracy = str(refer_row[args.dtype]) if refer_row is not None else "N/A" + known_accuracy = get_test_result(test_known_data, suite, dtype, mode, model_name) + + # Debug print (optional) + # print(f"{model_name}: test={test_accuracy}, ref={refer_accuracy}, known={known_accuracy}") + + # Categorize model and determine if reference needs update + category, should_update = categorize_model( + test_accuracy, refer_accuracy, known_accuracy + ) + + categories[category].append([model_name, test_accuracy]) + + # Update reference data if needed + if should_update and args.update: + refer_data = update_reference_dataframe( + refer_data, model_name, args.dtype, test_accuracy + ) + needs_update = True + + # Print summary + print_results_summary(args.suite, args.dtype, args.mode, categories) + + # Update reference CSV if requested + if needs_update: + refer_data.to_csv(refer_file, sep=',', encoding='utf-8', index=False) + print(f"Reference file updated: {refer_file}") + + +if __name__ == "__main__": + main() diff --git a/.github/scripts/calculate_best_perf.py b/.github/scripts/calculate_best_perf.py index 3d2f532337..8acdc636bc 100644 --- a/.github/scripts/calculate_best_perf.py +++ b/.github/scripts/calculate_best_perf.py @@ -1,16 +1,22 @@ +#!/usr/bin/env 
python3 +""" +Performance Benchmark Analyzer -# To get the best performance number -# Usage: -# python calculate_best_perf.py --best /path/to/best.csv -# --new /path/to/new/measured/performance/result/dir -# --device -# --os -# --driver -# --oneapi -# --gcc -# --python -# --pytorch -# --torch-xpu-ops +Calculate and update best performance metrics by comparing new results +with historical best performance data. + +Usage: + python calculate_best_perf.py --best /path/to/best.csv + --new /path/to/new/measured/performance/result/dir + --device + --os + --driver + --oneapi + --gcc + --python + --pytorch + --torch-xpu-ops +""" import re import os @@ -19,88 +25,236 @@ import pandas as pd from datetime import date -parser = argparse.ArgumentParser(description="Get Best Performance", - formatter_class=argparse.ArgumentDefaultsHelpFormatter) -parser.add_argument("--best", required=True, help="Saved best performance file") -parser.add_argument("--new", required=True, help="New round launch") -parser.add_argument("--device", default=None, type=str, help="Device name, such as PVC1100") -parser.add_argument("--os", default=None, type=str, help="OS version, such as Ubuntu 22.04") -parser.add_argument("--driver", default=None, type=str, help="Driver version, such as 25.05.32567") -parser.add_argument("--oneapi", default=None, type=str, help="OneAPI version, such as 2025.1") -parser.add_argument("--gcc", default=None, type=str, help="GCC version, such as 11") -parser.add_argument("--python", default=None, type=str, help="Python version, such as 3.10") -parser.add_argument("--pytorch", default=None, type=str, help="PyTorch version") -parser.add_argument("--torch-xpu-ops", default=None, type=str, help="Torch XPU Ops version") -args = parser.parse_args() - def multiple_replace(text): - REGEX_REPLACEMENTS = [ + """ + Apply regex replacements to extract category from filename. 
+ + Args: + text: Input filename to process + + Returns: + Cleaned category name + """ + regex_replacements = [ (r".*inductor_", ""), (r"_xpu_performance.csv", ""), ] - for old, new in REGEX_REPLACEMENTS: - text = re.sub(old, new, text, flags=re.IGNORECASE) + for old_pattern, new_pattern in regex_replacements: + text = re.sub(old_pattern, new_pattern, text, flags=re.IGNORECASE) return text -def find_files(pattern, path): - result = [] - for root, dirs, files in os.walk(path): - for name in files: - if fnmatch.fnmatch(name, pattern): - result.append(os.path.join(root, name)) - return result - -# comparison result output -best_header = ["Category", "Model", "Value Type", "Value", - "Device", "OS", "Driver", "OneAPI", "GCC", "Python", - "PyTorch", "Torch XPU Ops", "Date"] -best_data = pd.read_csv(args.best) if os.path.isfile(args.best) else pd.DataFrame(columns=best_header) -best_data = best_data.reset_index(drop=True) -new_files = find_files("*_xpu_performance.csv", args.new) -for new_file in new_files: - category = multiple_replace(new_file) - new_data = pd.read_csv(new_file) - new_data = new_data.reset_index(drop=True) - for index, row in new_data.iterrows(): - # eager - new_eager = row["abs_latency"] * row["speedup"] - eager_row = next(([i, line] for i, line in best_data.iterrows() - if (line["Category"] == category - and line["Model"] == row["name"] - and line["Value Type"] == "eager")), "N/A") - best_eager_value = best_data.loc[ - (best_data["Category"] == category) & - (best_data["Model"] == row["name"]) & - (best_data["Value Type"] == "eager")] - if eager_row != "N/A": - if new_eager < best_eager_value["Value"].values[0]: - best_data.loc[eager_row[0]] = [category, row["name"], "eager", new_eager, - args.device, args.os, args.driver, args.oneapi, args.gcc, args.python, - args.pytorch, args.torch_xpu_ops, date.today().strftime('%F')] - else: - best_data.loc[len(best_data), :] = None - best_data.loc[len(best_data) - 1] = [category, row["name"], "eager", new_eager, - args.device, args.os, args.driver, args.oneapi, args.gcc, args.python, - args.pytorch, args.torch_xpu_ops, date.today().strftime('%F')] - # inductor - inductor_row = next(([i, line] for i, line in best_data.iterrows() - if (line["Category"] == category - and line["Model"] == row["name"] - and line["Value Type"] == "inductor")), "N/A") - best_inductor_value = best_data.loc[ - (best_data["Category"] == category) & - (best_data["Model"] == row["name"]) & - (best_data["Value Type"] == "inductor")] - if inductor_row != "N/A": - if row["abs_latency"] < best_inductor_value["Value"].values[0]: - best_data.loc[inductor_row[0]] = [category, row["name"], "inductor", row["abs_latency"], - args.device, args.os, args.driver, args.oneapi, args.gcc, args.python, - args.pytorch, args.torch_xpu_ops, date.today().strftime('%F')] - else: - best_data.loc[len(best_data), :] = None - best_data.loc[len(best_data) - 1] = [category, row["name"], "inductor", row["abs_latency"], - args.device, args.os, args.driver, args.oneapi, args.gcc, args.python, - args.pytorch, args.torch_xpu_ops, date.today().strftime('%F')] - -best_data.to_csv(args.best, sep=',', encoding='utf-8', index=False) + +def find_files(pattern, search_path): + """ + Recursively find files matching pattern in directory. 
+
+    Args:
+        pattern: File pattern to match (e.g., "*.csv")
+        search_path: Directory path to search in
+
+    Returns:
+        List of matching file paths
+    """
+    matched_files = []
+    for root, _, files in os.walk(search_path):
+        for filename in files:
+            if fnmatch.fnmatch(filename, pattern):
+                matched_files.append(os.path.join(root, filename))
+    return matched_files
+
+
+def find_best_row(dataframe, category, model, value_type):
+    """
+    Find the best performance row for given criteria.
+
+    Args:
+        dataframe: DataFrame to search in
+        category: Test category
+        model: Model name
+        value_type: 'eager' or 'inductor'
+
+    Returns:
+        Tuple of (row_index, row_data) or None if not found
+    """
+    matches = dataframe[
+        (dataframe["Category"] == category) &
+        (dataframe["Model"] == model) &
+        (dataframe["Value Type"] == value_type)
+    ]
+    if not matches.empty:
+        return matches.index[0], matches.iloc[0]
+    return None
+
+
+def create_new_row(category, model, value_type, value, args_dict):
+    """
+    Create a new row for best performance data.
+
+    Args:
+        category: Test category
+        model: Model name
+        value_type: 'eager' or 'inductor'
+        value: Performance value
+        args_dict: Dictionary of system configuration arguments
+
+    Returns:
+        Dictionary representing the new row
+    """
+    return {
+        "Category": category,
+        "Model": model,
+        "Value Type": value_type,
+        "Value": value,
+        "Device": args_dict["device"],
+        "OS": args_dict["os"],
+        "Driver": args_dict["driver"],
+        "OneAPI": args_dict["oneapi"],
+        "GCC": args_dict["gcc"],
+        "Python": args_dict["python"],
+        "PyTorch": args_dict["pytorch"],
+        "Torch XPU Ops": args_dict["torch_xpu_ops"],
+        "Date": date.today().strftime('%F')
+    }
+
+
+def update_best_performance(best_data, category, model, value_type,
+                            new_value, args_dict):
+    """
+    Update best performance data with new value if better.
+
+    Args:
+        best_data: DataFrame with best performance data
+        category: Test category
+        model: Model name
+        value_type: 'eager' or 'inductor'
+        new_value: New performance value to compare
+        args_dict: System configuration arguments
+
+    Returns:
+        Updated DataFrame
+    """
+    best_row = find_best_row(best_data, category, model, value_type)
+
+    # For performance metrics, lower values are better
+    current_best = best_row[1]["Value"] if best_row else float('inf')
+    is_better = new_value < current_best
+
+    if best_row and is_better:
+        # Update existing row
+        best_data.loc[best_row[0]] = create_new_row(
+            category, model, value_type, new_value, args_dict
+        )
+    elif not best_row:
+        # Add new row
+        new_row = create_new_row(category, model, value_type, new_value, args_dict)
+        best_data = pd.concat([
+            best_data,
+            pd.DataFrame([new_row])
+        ], ignore_index=True)
+
+    return best_data
+
+
+def main():
+    """Main function to calculate and update best performance metrics."""
+    parser = argparse.ArgumentParser(
+        description="Get Best Performance",
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter
+    )
+    parser.add_argument("--best", required=True,
+                        help="Saved best performance file")
+    parser.add_argument("--new", required=True,
+                        help="New performance results directory")
+    parser.add_argument("--device", type=str,
+                        help="Device name, such as PVC1100")
+    parser.add_argument("--os", type=str,
+                        help="OS version, such as Ubuntu 22.04")
+    parser.add_argument("--driver", type=str,
+                        help="Driver version, such as 25.05.32567")
+    parser.add_argument("--oneapi", type=str,
+                        help="OneAPI version, such as 2025.1")
+    parser.add_argument("--gcc", type=str,
+                        help="GCC version, such as 11")
+    parser.add_argument("--python", type=str,
+                        help="Python version, such as 3.10")
+    parser.add_argument("--pytorch", type=str,
+                        help="PyTorch version")
+    parser.add_argument("--torch-xpu-ops", type=str,
+                        help="Torch XPU Ops version")
+
+    args = parser.parse_args()
+
+    # Prepare system configuration dictionary (argparse stores --torch-xpu-ops as args.torch_xpu_ops)
+    system_config = {
+        "device": args.device,
+        "os": args.os,
+        "driver": args.driver,
+        "oneapi": args.oneapi,
+        "gcc": args.gcc,
+        "python": args.python,
+        "pytorch": args.pytorch,
+        "torch_xpu_ops": args.torch_xpu_ops
+    }
+
+    # Define output columns
+    best_columns = [
+        "Category", "Model", "Value Type", "Value",
+        "Device", "OS", "Driver", "OneAPI", "GCC", "Python",
+        "PyTorch", "Torch XPU Ops", "Date"
+    ]
+
+    # Load or initialize best performance data
+    if os.path.isfile(args.best):
+        best_data = pd.read_csv(args.best)
+    else:
+        best_data = pd.DataFrame(columns=best_columns)
+
+    best_data = best_data.reset_index(drop=True)
+
+    # Find and process new performance files
+    new_files = find_files("*_xpu_performance.csv", args.new)
+
+    if not new_files:
+        print(f"No performance files found in {args.new}")
+        return
+
+    print(f"Processing {len(new_files)} performance files...")
+
+    for new_file in new_files:
+        category = multiple_replace(new_file)
+        # print(f"Processing category: {category}")
+
+        try:
+            new_data = pd.read_csv(new_file)
+            new_data = new_data.reset_index(drop=True)
+
+            for _, row in new_data.iterrows():
+                model_name = row["name"]
+
+                # Process eager performance
+                eager_perf = row["abs_latency"] * row["speedup"]
+                best_data = update_best_performance(
+                    best_data, category, model_name, "eager",
+                    eager_perf, system_config
+                )
+
+                # Process inductor performance
+                inductor_perf = row["abs_latency"]
+                best_data = update_best_performance(
+                    best_data, category, model_name, "inductor",
+
inductor_perf, system_config + ) + + except Exception as e: + print(f"Error processing {new_file}: {e}") + continue + + # Save updated best performance data + best_data.to_csv(args.best, sep=',', encoding='utf-8', index=False) + print(f"Best performance data saved to: {args.best}") + print(f"Total records: {len(best_data)}") + + +if __name__ == "__main__": + main() diff --git a/.github/scripts/e2e_summary.sh b/.github/scripts/e2e_summary.sh index 7275a4a146..b5e2506a9b 100644 --- a/.github/scripts/e2e_summary.sh +++ b/.github/scripts/e2e_summary.sh @@ -1,235 +1,384 @@ -#!/bin/bash - -results_dir="$1" -reference_dir="$2" -rm -rf /tmp/tmp-*.txt - -function get_acc_details() { - echo -e "#### accuracy\n\n - - - - - - - - " |tee accuracy.details.html > accuracy.regression.html - suite_list=$( - find "${results_dir}" -name "*.csv" |grep -E "_xpu_accuracy.csv" |\ - sed "s/.*inductor_//;s/_[abf].*//" |sort |uniq - ) - for suite in ${suite_list} - do - model_list=$( - find "${results_dir}" -name "*.csv" |grep -E ".*${suite}.*_xpu_accuracy.csv" |\ - xargs cat |grep "^xpu," |cut -d, -f2 |sort |uniq - ) - for model in ${model_list} - do - for dtype in float32 bfloat16 float16 amp_bf16 amp_fp16 - do - for mode in training inference - do - colorful=$(grep -w "${model}" "/tmp/tmp-${suite}-${mode}-${dtype}.txt" 2>&1 |awk 'BEGIN{ - color = "black"; - exit_label = 0; - }{ - if ($0 ~/Real failed/){ - color="🔴"; - exit_label++; - }else if ($0 ~/Expected failed/){ - color="🔵"; - }else if ($0 ~/Warning timeout/){ - color="🟡"; - }else if ($0 ~/New models/){ - color="🔵"; - }else if ($0 ~/Failed to passed/){ - color="🟢"; - exit_label++; - } - }END{print color, exit_label}') - echo "${colorful}" >> /tmp/tmp-result.txt - context=$(find "${results_dir}" -name "*.csv" |\ - grep -E ".*${suite}_${dtype}_${mode}_xpu_accuracy.csv" |xargs grep ",${model}," |cut -d, -f4 |\ - awk -v c="${colorful/ *}" '{if(c=="black") {print $0}else {printf("%s%s", c, $1)}}') - eval "export ${mode}_${dtype}=\${context}" - done - done - accuracy_row="$(echo -e " - - - - - - - - - - - - - " - )" - if [[ "${accuracy_row}" =~ "red" ]];then - echo "${accuracy_row}" |tee -a accuracy.details.html >> accuracy.regression.html - accuracy_regression=1 - elif [[ "${accuracy_row}" =~ "green" ]];then - echo "${accuracy_row}" |tee -a accuracy.details.html >> accuracy.regression.html - accuracy_regression=1 - elif [[ "${accuracy_row}" =~ "orange" ]];then - echo "${accuracy_row}" |tee -a accuracy.details.html >> accuracy.regression.html - accuracy_regression=1 - else - echo "${accuracy_row}" >> accuracy.details.html - fi - done +#!/usr/bin/env bash + +set -euo pipefail + +# Script: test_results_processor.sh +# Description: Process accuracy and performance test results for XPU operations + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +readonly SCRIPT_DIR +SCRIPT_NAME="$(basename "$0")" +readonly SCRIPT_NAME + +# Constants +readonly RED='🔴' +readonly GREEN='🟢' +readonly BLUE='🔵' +readonly YELLOW='🟡' + +# Global state +accuracy_regression=0 +performance_regression=0 + +main() { + if [[ $# -ne 2 ]]; then + echo "Usage: $0 " >&2 + exit 1 + fi + + local results_dir="$1" + local reference_dir="$2" + + validate_directories "$results_dir" "$reference_dir" + cleanup_temp_files + + echo "Processing test results..." 
+ echo "Results: $results_dir, Reference: $reference_dir" + + process_accuracy "$results_dir" + process_performance "$results_dir" "$reference_dir" + generate_report + + echo "Processing completed" +} + +validate_directories() { + for dir in "$1" "$2"; do + if [[ ! -d "$dir" ]]; then + echo "Error: Directory not found: $dir" >&2 + exit 1 + fi done - echo -e "
Suite Model Training Inference
float32 bfloat16 float16 amp_bf16 amp_fp16 float32 bfloat16 float16 amp_bf16 amp_fp16
${suite}${model}${training_float32}${training_bfloat16}${training_float16}${training_amp_bf16}${training_amp_fp16}${inference_float32}${inference_bfloat16}${inference_float16}${inference_amp_bf16}${inference_amp_fp16}
\n" |tee -a accuracy.details.html >> accuracy.regression.html - if [ "${accuracy_regression}" -ne 1 ];then - echo > accuracy.regression.html +} + +cleanup_temp_files() { + rm -rf /tmp/tmp-*.txt /tmp/tmp-*.json + rm -rf accuracy.*.html performance.*.html +} + +# Accuracy Processing +process_accuracy() { + local results_dir="$1" + + if ! find "$results_dir" -name "*_xpu_accuracy.csv" -quit; then + return fi + + echo "Processing accuracy results..." + + # Get known issues + python "$SCRIPT_DIR/../scripts/get_issue.py" \ + --repo_owner intel \ + --repo_name torch-xpu-ops \ + --labels "module: infra" E2E Accuracy skipped \ + --output /tmp/tmp-known-issue.json + + generate_accuracy_summary "$results_dir" + generate_accuracy_details "$results_dir" } -# Accuracy summary -rm -rf accuracy.*.html -accuracy_regression=0 -accuracy=$(find "${results_dir}" -name "*_xpu_accuracy.csv" |wc -l) -if [ "${accuracy}" -gt 0 ];then +generate_accuracy_summary() { + local results_dir="$1" + local check_file="$SCRIPT_DIR/../ci_expected_accuracy/check_expected.py" + cat > accuracy.summary.html << EOF #### accuracy | Category | Total | Passed | Pass Rate | Failed | Xfailed | Timeout | New Passed | New Enabled | Not Run | -| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | +|----------|-------|--------|-----------|--------|---------|---------|------------|-------------|---------| EOF - check_file="$(dirname "$0")/../ci_expected_accuracy/check_expected.py" - for csv in $(find "${results_dir}" -name "*.csv" |grep -E "_xpu_accuracy.csv" |sort) - do - category="$(echo "${csv}" |sed 's/.*inductor_//;s/_xpu_accuracy.*//')" - suite="$(echo "${csv}" |sed 's/.*inductor_//;s/_.*//;s/timm/timm_models/')" - mode="$(echo "${csv}" |sed 's/_xpu_accuracy.*//;s/.*_//')" - dtype="$(echo "${csv}" |sed -E 's/.*inductor_[a-z]*_//;s/models_//;s/_infer.*|_train.*//')" - python "${check_file}" --suite "${suite}" --mode "${mode}" --dtype "${dtype}" --csv_file "${csv}" > "/tmp/tmp-${suite}-${mode}-${dtype}.txt" - test_result="$(sed 's/, /,/g' "/tmp/tmp-${suite}-${mode}-${dtype}.txt" |awk '{ - if($0 ~/Total/){ - total = $3; - } - if($0 ~/Passed/){ - passed = $3; - } - if($0 ~/Pass rate/){ - pass_rate = $3; - } - if($0 ~/Real failed/){ - failed = $4; - failed_models = $5; - if(failed > 0){ - failed = "🔴"$4; - } - } - if($0 ~/Expected failed/){ - xfail = $4; - xfail_models = $5; - if(xfail > 0){ - xfail = "🔵"$4; - } - } - if($0 ~/timeout/){ - timeout = $4; - timeout_models = $5; - if(timeout > 0){ - timeout = "🟡"$4; - } - } - if($0 ~/Failed to passed/){ - new_passed = $5; - new_passed_models = $6; - if(new_passed > 0){ - new_passed = "🟢"$4; - } - } - if($0 ~/Not run/){ - not_run = $4; - not_run_models = $5; - } - if($0 ~/New models/){ - new_enabled = $3; - new_enabled_models = $4; - if(new_enabled > 0){ - new_enabled = "🔵"$4; - } - } - }END { - printf(" %s | %s | %s | %s | %s | %s | %s | %s | %s\n", - total, passed, pass_rate, failed, xfail, timeout, new_passed, new_enabled, not_run); - }')" - echo "| ${category} | ${test_result} |" >> accuracy.summary.html + while IFS= read -r csv_file; do + process_csv_file "$check_file" "$csv_file" + done < <(find "$results_dir" -name "*_xpu_accuracy.csv" | sort) + echo -e "\n\n" >> accuracy.summary.html +} + +process_csv_file() { + local check_file="$1" csv_file="$2" + local category suite mode dtype + + category=$(basename "$csv_file" | sed 's/inductor_//;s/_xpu_accuracy.*//') + suite=$(echo "$csv_file" | sed 's/.*inductor_//;s/_.*//;s/timm/timm_models/') + mode=$(echo "$csv_file" | sed 
's/_xpu_accuracy.*//;s/.*_//') + dtype=$(echo "$csv_file" | sed -E 's/.*inductor_[a-z]*_//;s/models_//;s/_infer.*|_train.*//') + + local tmp_file="/tmp/tmp-${suite}-${mode}-${dtype}.txt" + + python "$check_file" \ + --suite "$suite" \ + --mode "$mode" \ + --dtype "$dtype" \ + --issue_file /tmp/tmp-known-issue.json \ + --csv_file "$csv_file" > "$tmp_file" + + local result + result=$(parse_test_results "$tmp_file") + echo "| $category | $result |" >> accuracy.summary.html +} + +parse_test_results() { + local tmp_file="$1" + sed 's/, /,/g' "$tmp_file" | awk ' + BEGIN { + total = passed = pass_rate = failed = xfail = timeout = 0 + new_passed = new_enabled = not_run = 0 + } + /Total models:/ { total = $3 } + /Passed models:/ { passed = $3 } + /Pass rate:/ { pass_rate = $3 } + /Real failed models:/ { failed = format_count($4, "🔴") } + /Expected failed models:/ { xfail = format_count($4, "🔵") } + /Warning timeout models:/ { timeout = format_count($4, "🟡") } + /Failed to passed models:/ { new_passed = format_count($5, "🟢") } + /Not run.in models:/ { not_run = $4 } + /New models:/ { new_enabled = format_count($3, "🔵") } + + function format_count(count, icon) { + return count > 0 ? icon count : count + } + + END { + printf "%s | %s | %s | %s | %s | %s | %s | %s | %s", + total, passed, pass_rate, failed, xfail, timeout, new_passed, new_enabled, not_run + }' +} + +generate_accuracy_details() { + local results_dir="$1" + + # Create table headers + cat > accuracy.details.html << EOF + +#### accuracy + + + + + + + + + + + + + + + +EOF + + cp accuracy.details.html accuracy.regression.html + + # Process all test suites + while IFS= read -r suite; do + process_suite "$results_dir" "$suite" + done < <(find "$results_dir" -name "*_xpu_accuracy.csv" | \ + sed 's/.*inductor_//;s/_[abf].*//' | sort | uniq) + + echo -e "
SuiteModelTrainingInference
float32bfloat16float16amp_bf16amp_fp16float32bfloat16float16amp_bf16amp_fp16
\n\n" >> accuracy.details.html + echo -e "\n\n" >> accuracy.regression.html + + # Clear regression file if no issues + if [[ $accuracy_regression -eq 0 ]]; then + rm -f accuracy.regression.html + fi +} + +process_suite() { + local results_dir="$1" suite="$2" + + while IFS= read -r model; do + process_model "$results_dir" "$suite" "$model" + done < <(get_models_for_suite "$results_dir" "$suite") +} + +get_models_for_suite() { + local results_dir="$1" suite="$2" + find "$results_dir" -name "*${suite}*_xpu_accuracy.csv" -exec cat {} \; | \ + grep "^xpu," | cut -d, -f2 | sort | uniq +} + +process_model() { + local results_dir="$1" suite="$2" model="$3" + local -A results=() + + # Collect results for all data types and modes + for dtype in float32 bfloat16 float16 amp_bf16 amp_fp16; do + for mode in training inference; do + local key="${mode}_${dtype}" + results[$key]=$(get_model_result "$results_dir" "$suite" "$model" "$dtype" "$mode") + done done - get_acc_details -fi -# Performance summary -rm -rf performance.*.html -performance_regression=0 -performance=$(find "${results_dir}" -name "*_xpu_performance.csv" |wc -l) -if [ "${performance}" -gt 0 ];then - if [ "${GITHUB_EVENT_NAME}" == "pull_request" ];then - python "$(dirname "$0")/perf_comparison.py" --target ${results_dir} --baseline ${reference_dir} --pr + local row + row=$(generate_html_row "$suite" "$model" "${results[@]}") + + if [[ "$row" =~ ${RED}|${GREEN}|${YELLOW} ]]; then + echo "$row" | tee -a accuracy.details.html >> accuracy.regression.html + accuracy_regression=1 + echo "acc 1" >> /tmp/tmp-acc-result.txt + else + echo "$row" >> accuracy.details.html + echo "acc 0" >> /tmp/tmp-acc-result.txt + fi +} + +get_model_result() { + local results_dir="$1" suite="$2" model="$3" dtype="$4" mode="$5" + local tmp_file="/tmp/tmp-${suite}-${mode}-${dtype}.txt" + local color="black" + + if [[ -f "$tmp_file" ]] && grep -q -w "$model" "$tmp_file"; then + color=$(determine_color "$tmp_file" "$model") + fi + + local value + value=$(find "$results_dir" -name "*${suite}_${dtype}_${mode}_xpu_accuracy.csv" -type f | \ + head -1 | xargs grep -h ",${model}," 2>/dev/null | cut -d, -f4 | head -1) + + if [[ "$color" != "black" ]]; then + echo "${color}${value}" else - python "$(dirname "$0")/perf_comparison.py" --target ${results_dir} --baseline ${reference_dir} + echo "${value}" fi - if [ -e performance.regression.html ];then +} + +determine_color() { + local tmp_file="$1" model="$2" + grep -w "$model" "$tmp_file" | awk ' + /Real failed models:/ { print "🔴"; exit } + /Expected failed models:|New models:/ { print "🔵"; exit } + /Warning timeout models:/ { print "🟡"; exit } + /Failed to passed models:/ { print "🟢"; exit } + { print "black" } + ' | head -1 +} + +generate_html_row() { + local suite="$1" model="$2" + shift 2 + local results=("$@") + + cat << EOF + + $suite + $model + ${results[0]}${results[1]}${results[2]}${results[3]}${results[4]} + ${results[5]}${results[6]}${results[7]}${results[8]}${results[9]} + +EOF +} + +# Performance Processing +process_performance() { + local results_dir="$1" reference_dir="$2" + + if ! find "$results_dir" -name "*_xpu_performance.csv" -quit; then + return + fi + + echo "Processing performance results..." 
+ + local perf_args=("--target" "$results_dir" "--baseline" "$reference_dir") + if [[ "${GITHUB_EVENT_NAME:-}" == "pull_request" ]]; then + perf_args+=("--pr") + fi + + python "$SCRIPT_DIR/perf_comparison.py" "${perf_args[@]}" + + if [[ -f "performance.regression.html" ]]; then performance_regression=1 fi - # fetch best performance value - cp ${reference_dir}/best.csv ${results_dir}/best.csv || true - python "$(dirname "$0")/calculate_best_perf.py" \ - --new ${results_dir} \ - --best ${results_dir}/best.csv \ - --device PVC1100 --os "${OS_PRETTY_NAME}" \ - --driver "${DRIVER_VERSION}" --oneapi "${BUNDLE_VERSION}" \ - --gcc "${GCC_VERSION}" --python "${python}" \ - --pytorch "${TORCH_BRANCH_ID}/${TORCH_COMMIT_ID}" --torch-xpu-ops "${TORCH_XPU_OPS_COMMIT:-"${GITHUB_SHA}"}" -fi -echo "performance_regression=${performance_regression}" >> ${GITHUB_OUTPUT} -# Show result -summary_file="e2e-test-result.html" -cat > ${summary_file} << EOF + update_best_performance "$results_dir" "$reference_dir" +} + +update_best_performance() { + local results_dir="$1" reference_dir="$2" + local best_file="$results_dir/best.csv" + local output_file="${GITHUB_OUTPUT:-/dev/null}" + + cp "$reference_dir/best.csv" "$best_file" 2>/dev/null || true + + python "$SCRIPT_DIR/calculate_best_perf.py" \ + --new "$results_dir" \ + --best "$best_file" \ + --device PVC1100 \ + --os "${OS_PRETTY_NAME:-}" \ + --driver "${DRIVER_VERSION:-}" \ + --oneapi "${BUNDLE_VERSION:-}" \ + --gcc "${GCC_VERSION:-}" \ + --python "${python:-}" \ + --pytorch "${TORCH_BRANCH_ID:-}/${TORCH_COMMIT_ID:-}" \ + --torch-xpu-ops "${TORCH_XPU_OPS_COMMIT:-${GITHUB_SHA:-}}" + + echo "performance_regression=$performance_regression" >> "$output_file" +} + +# Report Generation +generate_report() { + local summary_file="e2e-test-result.html" + + { + generate_header + generate_highlights + generate_summary + generate_details + } > "$summary_file" + + echo "Report generated: $summary_file" +} + +generate_header() { + cat << EOF #### Note: -🔴: the failed cases which need look into -🟢: the new passed cases which need update reference -🔵: the expected failed or new enabled cases -🟡: the warning cases -Empty means the cases NOT run - -$( - if ((accuracy_regression + performance_regression > 0));then - echo -e "\n### 🎯 Highlight regressions\n" - if (( accuracy_regression > 0 ));then - cat accuracy.regression.html - fi - if (( performance_regression > 0 ));then - cat performance.regression.html - fi - fi -) +🔴: Failed cases needing investigation +🟢: New passed cases needing reference update +🔵: Expected failed or new enabled cases +🟡: Warning cases +Empty: Cases not run -### 📊 Summary +EOF +} -$(cat accuracy.summary.html) +generate_highlights() { + if (( accuracy_regression + performance_regression > 0 )); then + echo -e "### 🎯 Highlight regressions\n" + [[ $accuracy_regression -gt 0 ]] && cat accuracy.regression.html + [[ $performance_regression -gt 0 ]] && cat performance.regression.html + else + echo -e "### ✅ No regressions detected\n" + fi +} -$(cat performance.summary.html) +generate_summary() { + echo "### 📊 Summary" + echo + cat accuracy.summary.html 2>/dev/null || echo "No accuracy data" + echo + cat performance.summary.html 2>/dev/null || echo "No performance data" + echo +} +generate_details() { + cat << EOF ### 📖 Details -
View detailed result +
+View detailed result -$(cat accuracy.details.html) +EOF -$(cat performance.details.html) + cat accuracy.details.html 2>/dev/null || echo "No accuracy details" + cat performance.details.html 2>/dev/null || echo "No performance details" -
-EOF + echo "
" +} + +# Run main if script is executed directly +if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then + main "$@" +fi diff --git a/.github/scripts/get_issue.py b/.github/scripts/get_issue.py new file mode 100644 index 0000000000..589c024a78 --- /dev/null +++ b/.github/scripts/get_issue.py @@ -0,0 +1,120 @@ +#!/usr/bin/env python3 +""" +GitHub Issues Data Extractor - Efficient Version +Fetches GitHub issues and extracts table data with optimized performance. +""" + +import os +import re +import json +import argparse +from github import Github + + +def get_github_issues(repo_owner, repo_name, labels, state='open'): + """ + Efficiently get GitHub issues and extract table data. + Uses batch processing and optimized filtering. + """ + repo = g.get_repo(f"{repo_owner}/{repo_name}") + issues_data = [] + + # Use generator to avoid loading all issues into memory at once + issues = repo.get_issues(state=state, labels=labels) + + for issue in issues: + # Quick filter for issues with body content + if not issue.body: + continue + + # Fast table extraction with pre-compiled regex + table_rows = fast_extract_table_rows(issue.body) + + if table_rows: + issues_data.append({ + 'issue_number': issue.number, + 'issue_title': issue.title, + 'table_rows': table_rows + }) + + return issues_data + + +def fast_extract_table_rows(issue_body): + """ + Fast table extraction using pre-compiled regex patterns. + """ + # Pre-compile regex patterns for better performance + TABLE_ROW_PATTERN = re.compile(r'^(?=.*\|)(?!.*Suite)(?=.*[a-z]).+$', re.MULTILINE) + WHITESPACE_PATTERN = re.compile(r'\s+') + + # Find all table rows in one pass + rows = TABLE_ROW_PATTERN.findall(issue_body) + + # Process rows in batch + clean_rows = [] + for row in rows: + # Fast cleaning: replace multiple spaces with single space and split + clean_cells = [cell.strip() for cell in WHITESPACE_PATTERN.sub(' ', row).split('|')] + clean_cells = [cell for cell in clean_cells if cell] # Filter empty cells + + if len(clean_cells) > 1: # Only add rows with multiple cells + clean_rows.append(clean_cells) + + return clean_rows + + +def save_issues_json(issues_data, output_file): + """ + Efficient JSON saving with direct file writing. 
+ """ + with open(output_file, 'w') as f: + # Write JSON manually for better control + f.write('[\n') + for i, issue in enumerate(issues_data): + if i > 0: + f.write(',\n') + json.dump(issue, f, separators=(',', ':')) # Compact JSON + f.write('\n]') + + +def main(): + """Optimized main function.""" + parser = argparse.ArgumentParser(description="Efficient GitHub issues exporter") + parser.add_argument("--repo_owner", default="intel", help="Repo owner") + parser.add_argument("--repo_name", default="torch-xpu-ops", help="Repo name") + parser.add_argument('--labels', nargs='*', help='Filter by labels') + parser.add_argument("--output", default="issues.json", help="Output file") + parser.add_argument("--state", default="open", help="Issue state") + + args = parser.parse_args() + + # Quick token check + token = os.getenv('GH_TOKEN') + global g + g = Github(token) # Increase page size for fewer API calls + + print(f"Fetching known issues from {args.repo_owner}/{args.repo_name}...") + + # Time the operation + issues_data = get_github_issues( + repo_owner=args.repo_owner, + repo_name=args.repo_name, + labels=args.labels, + state=args.state + ) + + # Save results + with open(args.output, "w") as f: + json.dump(issues_data, f, indent=2) + + # Results summary + print(f"✅ Done: {len(issues_data)} issues -> {args.output}") + + if issues_data: + total_rows = sum(len(issue['table_rows']) for issue in issues_data) + print(f"📊 {total_rows} table rows extracted") + + +if __name__ == "__main__": + main() diff --git a/.github/scripts/lintrunner.sh b/.github/scripts/lintrunner.sh index 644e0fc304..d731107c8d 100755 --- a/.github/scripts/lintrunner.sh +++ b/.github/scripts/lintrunner.sh @@ -2,13 +2,16 @@ set -ex # Creat a venv for lint check -python3 -m venv lint +if ! uv --help > /dev/null 2>&1; then + curl -LsSf https://astral.sh/uv/install.sh | sh + export PATH="$PATH:$HOME/.local/bin" +fi +uv venv lint --python 3.12 --clear source lint/bin/activate -python3 -m pip install -U pip setuptools wheel +uv pip install -U pip setuptools wheel # Use uv to speed up lintrunner init -python3 -m pip install uv==0.1.45 -python3 -m pip install ruamel.yaml +uv pip install ruamel.yaml CACHE_DIRECTORY="/tmp/.lintbin" # Try to recover the cached binaries @@ -20,11 +23,11 @@ fi # if lintrunner is not installed, install it if ! command -v lintrunner &> /dev/null; then - python3 -m pip install lintrunner==0.12.7 + uv pip install lintrunner fi # Ignoring errors in one specific run -export SHELLCHECK_OPTS="-e SC2154 -e SC2086 -e SC1091 -e SC2046 -e SC2076 -e SC2034" +export SHELLCHECK_OPTS="-e SC2154 -e SC2086 -e SC1091 -e SC2046 -e SC2076 -e SC2034 -e SC2190" # This has already been cached in the docker image lintrunner init 2> /dev/null @@ -37,8 +40,8 @@ if [[ "${CLANG}" == "1" ]]; then echo "Please run the checker under pytorch source code folder" fi fi -#python3 -m tools.generate_torch_version --is_debug=false -#python3 -m tools.pyi.gen_pyi \ +#uv tools.generate_torch_version --is_debug=false +#uv tools.pyi.gen_pyi \ # --native-functions-path aten/src/ATen/native/native_functions.yaml \ # --tags-path aten/src/ATen/native/tags.yaml \ # --deprecated-functions-path "tools/autograd/deprecated.yaml" diff --git a/.github/scripts/perf_comparison.py b/.github/scripts/perf_comparison.py index 455523e1ab..2e48f45cdd 100644 --- a/.github/scripts/perf_comparison.py +++ b/.github/scripts/perf_comparison.py @@ -2,9 +2,10 @@ """ Performance comparison script. 
-To compare the performance diff +Compare performance differences between target and baseline results. Usage: - python perf_comparison.py --target /path/to/xpu/performance/result/dir --baseline /path/to/reference/dir + python perf_comparison.py --target /path/to/xpu/performance/result/dir + --baseline /path/to/reference/dir """ import re @@ -13,86 +14,158 @@ import argparse import pandas as pd from statistics import geometric_mean +from typing import List, Tuple, Any, Optional -parser = argparse.ArgumentParser( - description="Analysis", - formatter_class=argparse.ArgumentDefaultsHelpFormatter -) -parser.add_argument( - "--target", - default=None, - help="XPU performance result csv files dir" -) -parser.add_argument( - "--baseline", - default=None, - help="XPU reference result csv files dir" -) -parser.add_argument( - "--pr", - action="store_true", - help="Only show results xpu has" -) -args = parser.parse_args() - - -def multiple_replace(text): - """Apply multiple regex replacements to text.""" + +def multiple_replace(text: str) -> str: + """ + Apply multiple regex replacements to text. + + Args: + text: Input text to process + + Returns: + Processed text with replacements applied + """ regex_replacements = [ (r".*inductor_", ""), (r"_xpu_performance.csv", ""), ] - for old, new in regex_replacements: - text = re.sub(old, new, text, flags=re.IGNORECASE) + for old_pattern, new_pattern in regex_replacements: + text = re.sub(old_pattern, new_pattern, text, flags=re.IGNORECASE) return text -def find_files(pattern, path): - """Find files matching pattern in directory tree.""" - result = [] - for root, dirs, files in os.walk(path): - for name in files: - if fnmatch.fnmatch(name, pattern): - result.append(os.path.join(root, name)) - return result +def find_files(pattern: str, search_path: str) -> List[str]: + """ + Find files matching pattern in directory tree. + + Args: + pattern: File pattern to match + search_path: Directory path to search + + Returns: + List of matching file paths + """ + matched_files = [] + for root, _, files in os.walk(search_path): + for filename in files: + if fnmatch.fnmatch(filename, pattern): + matched_files.append(os.path.join(root, filename)) + return matched_files -def color_result(input_val): - """Add color coding to performance results.""" +def color_result(input_val: float) -> str: + """ + Add color coding to performance results. + + Args: + input_val: Performance ratio value + + Returns: + Colored string representation + """ if input_val == -1: - output = input_val + return str(input_val) elif input_val < 0.8: - output = f"🔴{input_val}" + return f"🔴{input_val:.3f}" elif input_val < 0.9: - output = f"🟡{input_val}" - elif input_val > 1.1: # Fixed: 1 + 0.1 -> 1.1 - output = f"🟢{input_val}" + return f"🟡{input_val:.3f}" + elif input_val > 1.1: + return f"🟢{input_val:.3f}" else: - output = input_val - return output + return f"{input_val:.3f}" + +def write_report(cases: pd.DataFrame, filename: str, + message: str, write_mode: str) -> None: + """ + Helper function to write reports to HTML files. 
-def write_report(cases, filename, message, method): - """Helper function to write reports.""" + Args: + cases: DataFrame to write + filename: Output filename + message: Header message + write_mode: File write mode ('w' or 'a') + """ if not cases.empty: output = cases.to_html(index=False) - with open(filename, method, encoding='utf-8') as file: - file.write(f"\n\n{message}\n\n{output}") + with open(filename, write_mode, encoding='utf-8') as file: + file.write(f"\n\n{message}\n\n{output}\n\n") + + +def calculate_latencies(value: Optional[pd.Series]) -> Tuple[float, float, float]: + """ + Calculate eager and inductor latencies from value row. + Args: + value: DataFrame row with performance data -def calculate_latencies(value): - """Calculate eager and inductor latencies from value row.""" + Returns: + Tuple of (eager_latency, inductor_latency, inductor_vs_eager) + """ if value is None: - return -1, -1, -1 + return -1.0, -1.0, -1.0 + eager_latency = value["speedup"] * value["abs_latency"] inductor_latency = value["abs_latency"] inductor_vs_eager = value["speedup"] + return eager_latency, inductor_latency, inductor_vs_eager -def process_comparison_data(): - """Process and compare performance data between target and baseline.""" - # Comparison result output +def find_matching_row(dataframe: pd.DataFrame, model_name: str) -> Optional[pd.Series]: + """ + Find row for specific model in dataframe. + + Args: + dataframe: DataFrame to search + model_name: Model name to find + + Returns: + Matching row or None if not found + """ + matches = dataframe[dataframe["name"] == model_name] + return matches.iloc[0] if not matches.empty else None + + +def calculate_comparison_ratios(xpu_value: Optional[pd.Series], + refer_value: Optional[pd.Series]) -> Tuple[float, float]: + """ + Calculate performance comparison ratios between target and baseline. + + Args: + xpu_value: Target performance data + refer_value: Baseline performance data + + Returns: + Tuple of (eager_ratio, inductor_ratio) + """ + if xpu_value is None or refer_value is None: + return 0.0, 0.0 + + # Calculate eager comparison + xpu_eager = xpu_value["speedup"] * xpu_value["abs_latency"] + refer_eager = refer_value["speedup"] * refer_value["abs_latency"] + eager_ratio = refer_eager / xpu_eager if xpu_eager > 0 else 0.0 + + # Calculate inductor comparison + inductor_ratio = (refer_value["abs_latency"] / xpu_value["abs_latency"] + if xpu_value["abs_latency"] > 0 else 0.0) + + return eager_ratio, inductor_ratio + + +def process_comparison_data(args: argparse.Namespace) -> Tuple[List[List[Any]], List[str]]: + """ + Process and compare performance data between target and baseline. + + Args: + args: Command line arguments + + Returns: + Tuple of (output_data, output_header) + """ output_header = [ "Category", "Model", "Target eager", "Target inductor", "Inductor vs. 
Eager [Target]", "Baseline eager", "Baseline inductor", @@ -101,227 +174,276 @@ def process_comparison_data(): ] output_data = [] + # Process target files xpu_files = find_files("*_xpu_performance.csv", args.target) + for xpu_file in xpu_files: - xpu_data = pd.read_csv(xpu_file) - xpu_names = xpu_data["name"].tolist() - refer_file = re.sub( - args.target, - args.baseline + "/", - xpu_file, - flags=re.IGNORECASE, - count=1 - ) + try: + xpu_data = pd.read_csv(xpu_file) + category = multiple_replace(xpu_file) + + # Find corresponding baseline file + refer_file = xpu_file.replace(args.target, args.baseline) + + if os.path.isfile(refer_file): + refer_data = pd.read_csv(refer_file) + process_matching_models(xpu_data, refer_data, category, output_data) + else: + process_target_only_models(xpu_data, category, output_data) + + except Exception as e: + print(f"Error processing {xpu_file}: {e}") + continue + + # Process baseline-only files if not in PR mode + if not args.pr: + process_baseline_only_models(args, output_data) + + return output_data, output_header + + +def process_matching_models(xpu_data: pd.DataFrame, refer_data: pd.DataFrame, + category: str, output_data: List[List[Any]]) -> None: + """ + Process models that exist in both target and baseline. + """ + xpu_names = set(xpu_data["name"].tolist()) + refer_names = set(refer_data["name"].tolist()) + all_names = sorted(xpu_names | refer_names) + + for model_name in all_names: + xpu_value = find_matching_row(xpu_data, model_name) + refer_value = find_matching_row(refer_data, model_name) + + (xpu_eager, xpu_inductor, xpu_ratio) = calculate_latencies(xpu_value) + (refer_eager, refer_inductor, refer_ratio) = calculate_latencies(refer_value) + eager_ratio, inductor_ratio = calculate_comparison_ratios(xpu_value, refer_value) + + output_data.append([ + category, model_name, + xpu_eager, xpu_inductor, xpu_ratio, + refer_eager, refer_inductor, refer_ratio, + eager_ratio, inductor_ratio + ]) + + +def process_target_only_models(xpu_data: pd.DataFrame, category: str, + output_data: List[List[Any]]) -> None: + """ + Process models that only exist in target data. + """ + for model_name in sorted(xpu_data["name"].tolist()): + xpu_value = find_matching_row(xpu_data, model_name) + if xpu_value is not None: + xpu_eager = xpu_value["speedup"] * xpu_value["abs_latency"] + output_data.append([ + category, model_name, + xpu_eager, xpu_value["abs_latency"], xpu_value["speedup"], + -1, -1, -1, -1, -1 + ]) + + +def process_baseline_only_models(args: argparse.Namespace, + output_data: List[List[Any]]) -> None: + """ + Process models that only exist in baseline data. 
+ """ + refer_files = find_files("*_xpu_performance.csv", args.baseline) + + for refer_file in refer_files: + try: + # Find corresponding target file + xpu_file = refer_file.replace(args.baseline, args.target) + if os.path.isfile(xpu_file): + continue - if os.path.isfile(refer_file): refer_data = pd.read_csv(refer_file) - refer_names = [row["name"] for index, row in refer_data.iterrows()] - names = set(xpu_names) - names = sorted(names) - - for name in names: - # XPU info - xpu_value = next( - (row for index, row in xpu_data.iterrows() - if row["name"] == name), - None - ) - (xpu_eager_latency, xpu_inductor_latency, - xpu_inductor_vs_eager) = calculate_latencies(xpu_value) - - # Reference info - refer_value = next( - (row for index, row in refer_data.iterrows() - if row["name"] == name), - None - ) - (refer_eager_latency, refer_inductor_latency, - refer_inductor_vs_eager) = calculate_latencies(refer_value) - - # XPU vs reference comparisons - if (xpu_value is not None and refer_value is not None - and xpu_eager_latency > 0): - xpu_vs_refer_eager = (refer_eager_latency / xpu_eager_latency) - else: - xpu_vs_refer_eager = 0 - - if (xpu_value is not None and refer_value is not None - and xpu_inductor_latency > 0): - xpu_vs_refer_inductor = (float(refer_value["abs_latency"]) / - xpu_value["abs_latency"]) - else: - xpu_vs_refer_inductor = 0 - - # Output data - output_data.append([ - multiple_replace(xpu_file), name, - xpu_eager_latency, xpu_inductor_latency, - xpu_inductor_vs_eager, - refer_eager_latency, refer_inductor_latency, - refer_inductor_vs_eager, - xpu_vs_refer_eager, xpu_vs_refer_inductor - ]) - else: - names = set(xpu_names) - names = sorted(names) - for name in names: - xpu_value = next( - (row for index, row in xpu_data.iterrows() - if row["name"] == name), - None - ) - if xpu_value is not None: - xpu_eager_latency = (xpu_value["speedup"] * - xpu_value["abs_latency"]) + category = multiple_replace(refer_file) + + for model_name in sorted(refer_data["name"].tolist()): + refer_value = find_matching_row(refer_data, model_name) + if refer_value is not None: + refer_eager = refer_value["speedup"] * refer_value["abs_latency"] output_data.append([ - multiple_replace(xpu_file), name, - xpu_eager_latency, xpu_value["abs_latency"], - xpu_value["speedup"], -1, -1, -1, -1, -1 + category, model_name, + -1, -1, -1, + refer_eager, refer_value["abs_latency"], + refer_value["speedup"], -1, -1 ]) + except Exception as e: + print(f"Error processing baseline file {refer_file}: {e}") + continue - if not args.pr: - refer_files = find_files("*_xpu_performance.csv", args.baseline) - for refer_file in refer_files: - refer_data = pd.read_csv(refer_file) - refer_names = refer_data["name"].tolist() - xpu_file = re.sub( - args.baseline, - args.target + "/", - refer_file, - flags=re.IGNORECASE, - count=1 - ) - if not os.path.isfile(xpu_file): - names = set(refer_names) - names = sorted(names) - for name in names: - refer_value = next( - (row for index, row in refer_data.iterrows() - if row["name"] == name), - None - ) - if refer_value is not None: - refer_eager_latency = (refer_value["speedup"] * - refer_value["abs_latency"]) - output_data.append([ - multiple_replace(refer_file), name, - -1, -1, -1, - refer_eager_latency, refer_value["abs_latency"], - refer_value["speedup"], -1, -1 - ]) - return output_data, output_header +def generate_summary(output_data: pd.DataFrame, args: argparse.Namespace) -> pd.DataFrame: + """ + Generate performance summary statistics. 
+    Args:
+        output_data: Processed performance data
+        args: Command line arguments
-def generate_summary(output_data):
-    """Generate performance summary statistics."""
-    geomean_sum = {
+    Returns:
+        DataFrame with summary statistics
+    """
+    geomean_results = {
         "all": [], "huggingface": [], "timm_models": [], "torchbench": []
     }
-    columns = [
+    comparison_columns = [
         "Target vs. Baseline [Inductor]", "Target vs. Baseline [Eager]",
         "Inductor vs. Eager [Target]"
     ]
+    comparison_data = output_data.loc[
+        (output_data['Target inductor'] > 0) & (output_data['Baseline inductor'] > 0)
+    ]
-    for column_name in columns:
-        data = [
-            row[column_name] for index, row in output_data.iterrows()
+    for column_name in comparison_columns:
+        # Overall geometric mean
+        valid_data = [
+            row[column_name] for _, row in comparison_data.iterrows()
             if row[column_name] > 0
         ]
-        if data:
-            geomean_sum["all"].append(color_result(geometric_mean(data)))
-        else:
-            geomean_sum["all"].append("🔴")
-
-        for model_name in ["huggingface", "timm_models", "torchbench"]:
-            data = [
-                row[column_name] for index, row in output_data.iterrows()
+        geomean_results["all"].append(
+            color_result(geometric_mean(valid_data)) if valid_data else None
+        )
+
+        # Per-category geometric means
+        for category in ["huggingface", "timm_models", "torchbench"]:
+            category_data = [
+                row[column_name] for _, row in comparison_data.iterrows()
                 if (row[column_name] > 0 and
-                    re.match(model_name, row["Category"]))
+                    re.match(category, row["Category"]))
             ]
-            if os.path.exists(os.path.join(args.target, model_name)):
-                if data:
-                    geomean_sum[model_name].append(
-                        color_result(geometric_mean(data))
-                    )
-                else:
-                    geomean_sum[model_name].append("🔴")
-
-    geomean_sum = {k: v for k, v in geomean_sum.items() if v}
-    output_sum = pd.DataFrame(
-        geomean_sum,
-        index=columns
-    ).T
-    return output_sum
+            geomean_results[category].append(
+                color_result(geometric_mean(category_data)) if category_data else None
+            )
+
+    # Filter out empty categories
+    geomean_results = {k: v for k, v in geomean_results.items() if any(v)}
+
+    return pd.DataFrame(geomean_results, index=comparison_columns).T
+
+
+def generate_regression_reports(output_data: pd.DataFrame, args: argparse.Namespace) -> None:
+    """
+    Generate regression analysis reports.
+
+    Args:
+        output_data: Processed performance data
+        args: Command line arguments
+    """
+    criteria_high = 0.8
+    criteria_medium = 0.9
+
+    # Regression cases for full report
+    regression_cases = output_data.loc[
+        ((output_data['Target vs. Baseline [Inductor]'] < criteria_medium) |
+         (output_data['Target vs. Baseline [Eager]'] < criteria_medium)) &
+        (output_data['Target inductor'] > 0) &
+        (output_data['Baseline inductor'] > 0)
+    ]
+
+    write_report(regression_cases, 'performance.regression.html',
+                 "#### Performance Regression", "w")
+
+    # PR-specific reports
+    if args.pr:
+        generate_pr_report(output_data, criteria_high, criteria_medium)
+
+
+def generate_pr_report(output_data: pd.DataFrame, criteria_high: float,
+                       criteria_medium: float) -> None:
+    """
+    Generate PR-specific performance report.
+
+    Args:
+        output_data: Processed performance data
+        criteria_high: High regression threshold
+        criteria_medium: Medium regression threshold
+    """
+    pr_data = output_data.loc[(output_data['Target inductor'] > 0) & (output_data['Baseline inductor'] > 0)]
+    pr_data = pr_data[[
+        "Category", "Model", "Target vs. Baseline [Eager]",
+        "Target vs. Baseline [Inductor]"
+    ]]
+
+    # High regression cases
+    high_regression = pr_data.loc[
+        (pr_data['Target vs. Baseline [Inductor]'] < criteria_high) |
+        (pr_data['Target vs. Baseline [Eager]'] < criteria_high)
+    ]
+
+    # Medium regression cases
+    medium_regression = pr_data.loc[
+        ((pr_data['Target vs. Baseline [Inductor]'] < criteria_medium) |
+         (pr_data['Target vs. Baseline [Eager]'] < criteria_medium)) &
+        (pr_data['Target vs. Baseline [Inductor]'] >= criteria_high) &
+        (pr_data['Target vs. Baseline [Eager]'] >= criteria_high)
+    ]
+
+    if not high_regression.empty or not medium_regression.empty:
+        with open('performance.regression.pr.html', 'w', encoding='utf-8') as f:
+            f.write("\n### Performance outliers, please check!\n")
+
+        write_report(high_regression, 'performance.regression.pr.html',
+                     "- 🔴 [-1, 80%), should be regression", 'a')
+        write_report(medium_regression, 'performance.regression.pr.html',
+                     "- 🟡 [80%, 90%), may be fluctuations", 'a')
 def main():
     """Main function to run performance comparison."""
-    output_data, output_header = process_comparison_data()
+    args = argparse.ArgumentParser(
+        description="Performance Analysis",
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter
+    )
+    args.add_argument(
+        "--target",
+        required=True,
+        help="XPU performance result csv files directory"
+    )
+    args.add_argument(
+        "--baseline",
+        required=True,
+        help="XPU reference result csv files directory"
+    )
+    args.add_argument(
+        "--pr",
+        action="store_true",
+        help="Only show results that XPU has"
+    )
+    args = args.parse_args()
+
+    # Process comparison data
+    output_data, output_header = process_comparison_data(args)
+    output_df = pd.DataFrame(output_data, columns=output_header)
-    # Create DataFrame and sort
-    output_data = pd.DataFrame(output_data, columns=output_header)
-    output_data = output_data.sort_values(
+    # Sort by performance ratios
+    output_df = output_df.sort_values(
         ['Target vs. Baseline [Inductor]', 'Target vs. Baseline [Eager]'],
         ascending=[True, True]
     )
-    # Generate summary
-    output_sum = generate_summary(output_data)
-    output = output_sum.to_html(header=True)
+    # Generate summary report
+    summary_df = generate_summary(output_df, args)
     with open('performance.summary.html', 'w', encoding='utf-8') as f:
-        f.write("\n\n#### performance\n\n" + output)
+        f.write("\n\n#### Performance Summary\n\n" + summary_df.to_html(header=True) + "\n\n")
-    # Generate details
-    output = output_data.to_html(index=False)
+    # Generate detailed report
+    output_df_clean = output_df.replace(-1, '')
     with open('performance.details.html', 'w', encoding='utf-8') as f:
-        f.write("\n\n#### performance\n\n" + output)
-
-    # Regression analysis
-    CRITERIA_HIGH = 0.8
-    CRITERIA_MEDIUM = 0.9
-    PERFORMANCE_FILE = 'performance.regression.html'
-    PR_FILE = 'performance.regression.pr.html'
-
-    # Regression cases
-    cases_regression = output_data.loc[
-        ((output_data['Target vs. Baseline [Inductor]'] < CRITERIA_MEDIUM)
-         | (output_data['Target vs. Baseline [Eager]'] < CRITERIA_MEDIUM))
-        & (output_data['Baseline inductor'] > 0)
-    ]
-    write_report(cases_regression, PERFORMANCE_FILE, "#### performance", "w")
+        f.write("\n\n#### Performance Details\n\n" + output_df_clean.to_html(index=False) + "\n\n")
-    # Highlight in PR
-    if args.pr:
-        filtered_data = output_data.loc[(output_data['Baseline inductor'] > 0)]
-        filtered_data = filtered_data[[
-            "Category", "Model", "Target vs. Baseline [Eager]",
-            "Target vs. Baseline [Inductor]"
-        ]]
-        cases_h = filtered_data.loc[
-            ((filtered_data['Target vs. Baseline [Inductor]'] < CRITERIA_HIGH)
-             | (filtered_data['Target vs. Baseline [Eager]'] < CRITERIA_HIGH))
-        ]
-        cases_m = filtered_data.loc[
-            ((filtered_data['Target vs. Baseline [Inductor]'] < CRITERIA_MEDIUM)
-             | (filtered_data['Target vs. Baseline [Eager]'] < CRITERIA_MEDIUM))
-            & ((filtered_data['Target vs. Baseline [Inductor]'] >= CRITERIA_HIGH)
-               & (filtered_data['Target vs. Baseline [Eager]'] >= CRITERIA_HIGH))
-        ]
-        if not cases_h.empty or not cases_m.empty:
-            with open(PR_FILE, 'w', encoding='utf-8') as f:
-                f.write("\n### Performance outliers, please check!\n")
-        write_report(
-            cases_h, PR_FILE, "- 🔴 [-1, 80%), should be regression", 'a'
-        )
-        write_report(
-            cases_m, PR_FILE, "- 🟡 [80%, 90%), may be fluctuations", 'a'
-        )
+    # Generate regression reports
+    generate_regression_reports(output_df, args)
+
+    print("Performance comparison completed!")
+    print(f"Processed {len(output_df)} model comparisons")
 if __name__ == "__main__":
diff --git a/.github/workflows/_linux_e2e_summary.yml b/.github/workflows/_linux_e2e_summary.yml
index c7ff06e653..9b06b8da69 100644
--- a/.github/workflows/_linux_e2e_summary.yml
+++ b/.github/workflows/_linux_e2e_summary.yml
@@ -39,7 +39,7 @@ jobs:
       run: |
         sudo apt-get update
         sudo apt-get install gh rsync ca-certificates -y
-        pip install pandas requests
+        pip install pandas requests pygithub
     - name: Download Target Artifact
       run: |
        mkdir target/
@@ -76,7 +76,7 @@ jobs:
            cat e2e-test-result.html >> ${GITHUB_STEP_SUMMARY}
            e2e_result=1
          fi
-          acc_failed=$(awk 'BEGIN{sum=0}{if($2>0){sum++}}END{print sum}' /tmp/tmp-result.txt)
+          acc_failed=$(awk 'BEGIN{sum=0}{if($2>0){sum++}}END{print sum}' /tmp/tmp-acc-result.txt)
          fi
          echo "acc_failed=${acc_failed}" >> ${GITHUB_ENV}
          echo "e2e_result=${e2e_result}" >> ${GITHUB_OUTPUT}