Test userbenchmark

xuzhao9 · xuzhao9 · commit 551f9e9ce829 · 2025-01-27T00:51:12.000-05:00
diff --git a/.github/workflows/userbenchmark-a100.yml b/.github/workflows/userbenchmark-a100.yml
@@ -27,9 +27,6 @@ jobs:
       - name: Install Conda
         run: |
           bash ./.ci/torchbench/install-conda.sh
-      - name: Install TorchBench
-        run: |
-          bash ./.ci/torchbench/install.sh
       - name: Run user benchmark
         run: |
           set -x
diff --git a/userbenchmark/release-test/run.py b/userbenchmark/release-test/run.py
@@ -1,155 +1,12 @@
-import argparse
-import itertools
 import os
-import shutil
 import subprocess
-import time
-from datetime import datetime
-from pathlib import Path
-from typing import List
-
-import yaml
-from git import Repo
-
-from ..utils import dump_output, get_output_dir, get_output_json
-from .result_analyzer import analyze
 
-# Expected WORK_DIR structure
-# WORK_DIR/
-#  |---examples/
-#  |---pytorch-<ver1>-cuda<ver1>/
-#        |---run.sh
-#        |---mnist/
-#        |---mnist-hogwild/
-#        |---<other-benchmarks>
-#  |---pytorch-<ver2>-cuda<ver2>/
-#  |---summary.csv
+from typing import List
 
 BM_NAME = "release-test"
 EXAMPLE_URL = "https://github.com/pytorch/examples.git"
 CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
-DEFAULT_CONFIG_PATH = os.path.join(
-    os.path.dirname(os.path.abspath(__file__)), "configs"
-)
-RUN_TEMPLATE = """
-# GENERATED BY userbenchmark/release-test/__init__.py. DO NOT EDIT!
-bash {RELEASE_TEST_ROOT}/setup_env.sh '{CUDA_VERSION}' '{MAGMA_VERSION}' '{PYTORCH_VERSION}' '{PYTORCH_CHANNEL}' '{WORK_DIR}'
-bash {RELEASE_TEST_ROOT}/run_release_test.sh '{CUDA_VERSION}' '{RESULT_DIR}'
-"""
-
-
-def get_timestamp():
-    return datetime.fromtimestamp(time.time()).strftime("%Y%m%d%H%M%S")
-
-
-def get_work_dir(output_dir):
-    work_dir = output_dir.joinpath(f"run-{get_timestamp()}")
-    work_dir.mkdir(exist_ok=True, parents=True)
-    return work_dir
-
-
-def generate_test_scripts(config, work_dir):
-    assert "cuda" in config and isinstance(
-        config["cuda"], list
-    ), f"Expected CUDA config list, but not found."
-    assert "pytorch" in config and isinstance(
-        config["pytorch"], list
-    ), f"Exptected pytorch version list, but not found."
-    bm_matrix = [config["cuda"], config["pytorch"]]
-    run_scripts = {}
-    for cuda, pytorch in itertools.product(*bm_matrix):
-        run_key = f"pytorch-{pytorch['version']}-cuda-{cuda['version']}"
-        run_script = RUN_TEMPLATE.format(
-            RELEASE_TEST_ROOT=CURRENT_DIR,
-            CUDA_VERSION=cuda["version"],
-            MAGMA_VERSION=cuda["magma_version"],
-            PYTORCH_VERSION=pytorch["version"],
-            PYTORCH_CHANNEL=pytorch["conda_channel"],
-            WORK_DIR=work_dir,
-            RESULT_DIR=work_dir.joinpath(run_key),
-        )
-        run_scripts[run_key] = run_script
-    return run_scripts
-
-
-def dump_test_scripts(run_scripts, work_dir):
-    for run_key, run_script in run_scripts.items():
-        run_script_loc = work_dir.joinpath(run_key)
-        run_script_loc.mkdir(exist_ok=True)
-        with open(run_script_loc.joinpath("run.sh"), "w") as rs:
-            rs.write(run_script)
-
-
-def dump_result_to_json(metrics):
-    result = get_output_json(BM_NAME, metrics)
-    dump_output(BM_NAME, result)
-
-
-def run_benchmark(run_scripts, work_dir):
-    for run_key, _rscript in run_scripts.items():
-        run_script_path = work_dir.joinpath(run_key, "run.sh")
-        # run the benchmark
-        print(f"Running benchmark {run_key} ...")
-        subprocess.check_call(["bash", str(run_script_path)])
-
-
-def get_config(config_name: str):
-    if os.path.exists(os.path.join(DEFAULT_CONFIG_PATH, config_name)):
-        config_name = os.path.join(DEFAULT_CONFIG_PATH, config_name)
-    elif os.path.exists(os.path.join(DEFAULT_CONFIG_PATH, f"{config_name}.yaml")):
-        config_name = os.path.join(DEFAULT_CONFIG_PATH, f"{config_name}.yaml")
-    else:
-        raise ValueError(
-            f"Can't find config name {config_name} in config path {DEFAULT_CONFIG_PATH}."
-        )
-    with open(config_name, "r") as yfile:
-        config = yaml.safe_load(yfile)
-    return config
-
-
-def parse_args(args):
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "--config", "-c", default="1.12.1", type=str, help="Config for release testing"
-    )
-    parser.add_argument(
-        "--dry-run",
-        action="store_true",
-        help="Only generate the test scripts. Do not run the benchmark.",
-    )
-    parser.add_argument(
-        "--analyze",
-        type=str,
-        help="Only analyze the result of the specified work directory.",
-    )
-    args = parser.parse_args(args)
-    return args
-
-
-def prepare_release_tests(args: argparse.Namespace, work_dir: Path):
-    config = get_config(args.config)
-    run_scripts = generate_test_scripts(config, work_dir)
-    dump_test_scripts(run_scripts, work_dir)
-    # clone the examples repo
-    Repo.clone_from(EXAMPLE_URL, work_dir.joinpath("examples"))
-    return run_scripts
-
-
-def cleanup_release_tests(work_dir):
-    examples_path = work_dir.joinpath("examples")
-    if examples_path.exists():
-        shutil.rmtree(examples_path)
 
 
 def run(args: List[str]):
-    args = parse_args(args)
-    if args.analyze:
-        analyze(args.analyze)
-        return
-    work_dir = get_work_dir(get_output_dir(BM_NAME))
-    run_scripts = prepare_release_tests(args=args, work_dir=work_dir)
-    if not args.dry_run:
-        run_benchmark(run_scripts, work_dir)
-        metrics = analyze(work_dir)
-        dump_result_to_json(metrics)
-        cleanup_release_tests(work_dir)
+    subprocess.check_call(["bash", f"{CURRENT_DIR}/run_release_test.sh"])
diff --git a/userbenchmark/release-test/run_release_test.sh b/userbenchmark/release-test/run_release_test.sh
@@ -1,60 +1,20 @@
 #!/bin/bash
 
-set -xeuo pipefail
+set -euo pipefail
 
-CUDA_VERSION="$1"
-RESULT_DIR="$2"
-EXAMPLES_DIR="${RESULT_DIR}/../examples"
-# get the directory of the current script
-CURRENT_DIR=$(dirname -- "$0")
+# Launch the GPU workload in the background: it moves a 4096x4096 random tensor
+# to the GPU, sleeps for 60 seconds, then prints 'done!'. Output goes to log.txt.
+python -c "import torch; import time; a = torch.randn([4096, 4096]).cuda(); time.sleep(60); print('done!')"  > log.txt 2>&1 &
+WORKLOAD_PID=$!
 
-PREFIX=""
-if [[ ${PLATFORM_NAME} == "aws_t4_metal" ]]; then
- PREFIX="taskset -c 24-47";
- export GOMP_CPU_AFFINITY="24-47"
-fi
-
-. switch-cuda.sh "${CUDA_VERSION}"
-
-
-nvcc --version
-sudo apt update
-sudo apt-get install bc
-sudo apt-get install --reinstall time
-which time
-# run mnist
-mkdir -p "${RESULT_DIR}/mnist"
-pushd "${EXAMPLES_DIR}/mnist"
-export LOG_FILE=${RESULT_DIR}/mnist/result.log
-export MEM_FILE=${RESULT_DIR}/mnist/result_mem.log
-${PREFIX} bash "${CURRENT_DIR}/monitor_proc.sh" python main.py --epochs 3
-# run mnist-hogwild
-mkdir -p ${RESULT_DIR}/mnist_hogwild
-pushd "${EXAMPLES_DIR}/mnist_hogwild"
-export LOG_FILE=${RESULT_DIR}/mnist_hogwild/result.log
-export MEM_FILE=${RESULT_DIR}/mnist_hogwild/result_mem.log
-${PREFIX} bash "${CURRENT_DIR}/monitor_proc.sh" python main.py --epochs 3
-# run CPU WLM LSTM
-mkdir -p ${RESULT_DIR}/wlm_cpu_lstm
-pushd "${EXAMPLES_DIR}/word_language_model"
-export LOG_FILE=${RESULT_DIR}/wlm_cpu_lstm/result.log
-export MEM_FILE=${RESULT_DIR}/wlm_cpu_lstm/result_mem.log
-${PREFIX} bash "${CURRENT_DIR}/monitor_proc.sh" python main.py --epochs 3 --model LSTM
-# run GPU WLM LSTM
-mkdir -p ${RESULT_DIR}/wlm_gpu_lstm
-pushd "${EXAMPLES_DIR}/word_language_model"
-export LOG_FILE=${RESULT_DIR}/wlm_gpu_lstm/result.log
-export MEM_FILE=${RESULT_DIR}/wlm_gpu_lstm/result_mem.log
-${PREFIX} bash "${CURRENT_DIR}/monitor_proc.sh" python main.py --epochs 3 --model LSTM --cuda
-# run CPU WLM Transformer
-mkdir -p ${RESULT_DIR}/wlm_cpu_trans
-pushd "${EXAMPLES_DIR}/word_language_model"
-export LOG_FILE=${RESULT_DIR}/wlm_cpu_trans/result.log
-export MEM_FILE=${RESULT_DIR}/wlm_cpu_trans/result_mem.log
-${PREFIX} bash "${CURRENT_DIR}/monitor_proc.sh" python main.py --epochs 3 --model Transformer
-# run GPU WLM Transformer
-mkdir -p ${RESULT_DIR}/wlm_gpu_trans
-pushd "${EXAMPLES_DIR}/word_language_model"
-export LOG_FILE=${RESULT_DIR}/wlm_gpu_trans/result.log
-export MEM_FILE=${RESULT_DIR}/wlm_gpu_trans/result_mem.log
-${PREFIX} bash "${CURRENT_DIR}/monitor_proc.sh" python main.py --epochs 3 --model Transformer --cuda
+# Take 120 process-monitor samples, one per iteration, with memory stats
+# (-s m) and a timestamp column (-o T), pausing 0.5 seconds between samples.
+for i in {1..120}; do
+    nvidia-smi pmon -s m -c 1 -o T
+    sleep 0.5
+done
+
+# Reap the workload so that its failure fails this script, and always print
+# its captured output. '|| ...' keeps 'set -e' from aborting before the log
+# is shown.
+WORKLOAD_STATUS=0
+wait "${WORKLOAD_PID}" || WORKLOAD_STATUS=$?
+cat log.txt
+exit "${WORKLOAD_STATUS}"