@@ -317,6 +317,7 @@ test_inductor_distributed() {
317
317
python test/run_test.py -i distributed/test_c10d_functional_native.py --verbose
318
318
python test/run_test.py -i distributed/_tensor/test_dtensor_compile.py --verbose
319
319
python test/run_test.py -i distributed/tensor/parallel/test_fsdp_2d_parallel.py --verbose
320
+ python test/run_test.py -i distributed/tensor/parallel/test_micro_pipeline_tp.py --verbose
320
321
python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_comm.py --verbose
321
322
python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_training.py -k test_train_parity_multi_group --verbose
322
323
python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_training.py -k test_train_parity_with_activation_checkpointing --verbose
@@ -429,16 +430,13 @@ test_perf_for_dashboard() {
429
430
local targets=(accuracy performance)
430
431
431
432
local device=cuda
432
- local taskset=" "
433
433
if [[ " ${TEST_CONFIG} " == * cpu* ]]; then
434
434
if [[ " ${TEST_CONFIG} " == * cpu_x86* ]]; then
435
435
device=cpu_x86
436
436
elif [[ " ${TEST_CONFIG} " == * cpu_aarch64* ]]; then
437
437
device=cpu_aarch64
438
438
fi
439
439
test_inductor_set_cpu_affinity
440
- end_core=$(( $(test_inductor_get_core_number)- 1 ))
441
- taskset=" taskset -c 0-$end_core "
442
440
elif [[ " ${TEST_CONFIG} " == * cuda_a10g* ]]; then
443
441
device=cuda_a10g
444
442
fi
@@ -458,51 +456,51 @@ test_perf_for_dashboard() {
458
456
fi
459
457
460
458
if [[ " $DASHBOARD_TAG " == * default-true* ]]; then
461
- $taskset python " benchmarks/dynamo/$suite .py" \
459
+ $TASKSET python " benchmarks/dynamo/$suite .py" \
462
460
" ${target_flag[@]} " --" $mode " --" $dtype " --backend " $backend " --disable-cudagraphs " $@ " \
463
461
--output " $TEST_REPORTS_DIR /${backend} _no_cudagraphs_${suite} _${dtype} _${mode} _${device} _${target} .csv"
464
462
fi
465
463
if [[ " $DASHBOARD_TAG " == * cudagraphs-true* ]]; then
466
- $taskset python " benchmarks/dynamo/$suite .py" \
464
+ $TASKSET python " benchmarks/dynamo/$suite .py" \
467
465
" ${target_flag[@]} " --" $mode " --" $dtype " --backend " $backend " " $@ " \
468
466
--output " $TEST_REPORTS_DIR /${backend} _with_cudagraphs_${suite} _${dtype} _${mode} _${device} _${target} .csv"
469
467
fi
470
468
if [[ " $DASHBOARD_TAG " == * dynamic-true* ]]; then
471
- $taskset python " benchmarks/dynamo/$suite .py" \
469
+ $TASKSET python " benchmarks/dynamo/$suite .py" \
472
470
" ${target_flag[@]} " --" $mode " --" $dtype " --backend " $backend " --dynamic-shapes \
473
471
--dynamic-batch-only " $@ " \
474
472
--output " $TEST_REPORTS_DIR /${backend} _dynamic_${suite} _${dtype} _${mode} _${device} _${target} .csv"
475
473
fi
476
474
if [[ " $DASHBOARD_TAG " == * cppwrapper-true* ]] && [[ " $mode " == " inference" ]]; then
477
- TORCHINDUCTOR_CPP_WRAPPER=1 $taskset python " benchmarks/dynamo/$suite .py" \
475
+ TORCHINDUCTOR_CPP_WRAPPER=1 $TASKSET python " benchmarks/dynamo/$suite .py" \
478
476
" ${target_flag[@]} " --" $mode " --" $dtype " --backend " $backend " --disable-cudagraphs " $@ " \
479
477
--output " $TEST_REPORTS_DIR /${backend} _cpp_wrapper_${suite} _${dtype} _${mode} _${device} _${target} .csv"
480
478
fi
481
479
if [[ " $DASHBOARD_TAG " == * freezing_cudagraphs-true* ]] && [[ " $mode " == " inference" ]]; then
482
- $taskset python " benchmarks/dynamo/$suite .py" \
480
+ $TASKSET python " benchmarks/dynamo/$suite .py" \
483
481
" ${target_flag[@]} " --" $mode " --" $dtype " --backend " $backend " " $@ " --freezing \
484
482
--output " $TEST_REPORTS_DIR /${backend} _with_cudagraphs_freezing_${suite} _${dtype} _${mode} _${device} _${target} .csv"
485
483
fi
486
484
if [[ " $DASHBOARD_TAG " == * freeze_autotune_cudagraphs-true* ]] && [[ " $mode " == " inference" ]]; then
487
- TORCHINDUCTOR_MAX_AUTOTUNE=1 $taskset python " benchmarks/dynamo/$suite .py" \
485
+ TORCHINDUCTOR_MAX_AUTOTUNE=1 $TASKSET python " benchmarks/dynamo/$suite .py" \
488
486
" ${target_flag[@]} " --" $mode " --" $dtype " --backend " $backend " " $@ " --freezing \
489
487
--output " $TEST_REPORTS_DIR /${backend} _with_cudagraphs_freezing_autotune_${suite} _${dtype} _${mode} _${device} _${target} .csv"
490
488
fi
491
489
if [[ " $DASHBOARD_TAG " == * aotinductor-true* ]] && [[ " $mode " == " inference" ]]; then
492
- TORCHINDUCTOR_ABI_COMPATIBLE=1 $taskset python " benchmarks/dynamo/$suite .py" \
490
+ TORCHINDUCTOR_ABI_COMPATIBLE=1 $TASKSET python " benchmarks/dynamo/$suite .py" \
493
491
" ${target_flag[@]} " --" $mode " --" $dtype " --export-aot-inductor --disable-cudagraphs " $@ " \
494
492
--output " $TEST_REPORTS_DIR /${backend} _aot_inductor_${suite} _${dtype} _${mode} _${device} _${target} .csv"
495
493
fi
496
494
if [[ " $DASHBOARD_TAG " == * maxautotune-true* ]]; then
497
- TORCHINDUCTOR_MAX_AUTOTUNE=1 $taskset python " benchmarks/dynamo/$suite .py" \
495
+ TORCHINDUCTOR_MAX_AUTOTUNE=1 $TASKSET python " benchmarks/dynamo/$suite .py" \
498
496
" ${target_flag[@]} " --" $mode " --" $dtype " --backend " $backend " " $@ " \
499
497
--output " $TEST_REPORTS_DIR /${backend} _max_autotune_${suite} _${dtype} _${mode} _${device} _${target} .csv"
500
498
fi
501
499
if [[ " $DASHBOARD_TAG " == * cudagraphs_low_precision-true* ]] && [[ " $mode " == " inference" ]]; then
502
500
# TODO: This has a new dtype called quant and the benchmarks script needs to be updated to support this.
503
501
# The tentative command is as follows. It doesn't work now, but it's ok because we only need mock data
504
502
# to fill the dashboard.
505
- $taskset python " benchmarks/dynamo/$suite .py" \
503
+ $TASKSET python " benchmarks/dynamo/$suite .py" \
506
504
" ${target_flag[@]} " --" $mode " --quant --backend " $backend " " $@ " \
507
505
--output " $TEST_REPORTS_DIR /${backend} _cudagraphs_low_precision_${suite} _quant_${mode} _${device} _${target} .csv" || true
508
506
# Copy cudagraph results as mock data, easiest choice?
@@ -664,27 +662,32 @@ test_inductor_torchbench_smoketest_perf() {
664
662
}
665
663
666
664
test_inductor_get_core_number() {
  # Print the number of physical CPU cores.
  # On aarch64 runners lscpu reports topology as clusters x cores-per-cluster;
  # everywhere else it is sockets x cores-per-socket.
  local groups per_group
  if [[ "${TEST_CONFIG:-}" == *aarch64 ]]; then
    groups=$(lscpu | grep 'Cluster(s):' | awk '{print $2}')
    per_group=$(lscpu | grep 'Core(s) per cluster:' | awk '{print $4}')
  else
    groups=$(lscpu | grep 'Socket(s):' | awk '{print $2}')
    per_group=$(lscpu | grep 'Core(s) per socket:' | awk '{print $4}')
  fi
  if [[ "$groups" =~ ^[0-9]+$ && "$per_group" =~ ^[0-9]+$ ]]; then
    echo "$((groups * per_group))"
  else
    # lscpu output could not be parsed (e.g. minimal containers, missing
    # Cluster(s) rows) — fall back to the logical core count rather than
    # failing with an arithmetic syntax error on empty operands.
    nproc
  fi
}
669
671
670
672
test_inductor_set_cpu_affinity() {
  # Configure the CPU benchmarking environment:
  #  - preload jemalloc and Intel OpenMP (iomp5) with tuned allocator/affinity
  #    settings,
  #  - set OMP_NUM_THREADS to the physical core count,
  #  - export TASKSET, a command prefix that pins benchmark processes to
  #    cores 0..N-1 (consumed by test_perf_for_dashboard and the cpu
  #    smoketest).
  # Variables are deliberately left global: sibling functions read them.

  # jemalloc's install directory differs between x86_64 and aarch64 images,
  # so locate it instead of hard-coding the multiarch path.
  JEMALLOC_LIB="$(find /usr/lib -name libjemalloc.so.2)"
  # iomp5 ships inside the active Python environment's lib directory.
  IOMP_LIB="$(dirname "$(command -v python)")/../lib/libiomp5.so"
  export LD_PRELOAD="$JEMALLOC_LIB:$IOMP_LIB:${LD_PRELOAD:-}"
  export MALLOC_CONF="oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:-1,muzzy_decay_ms:-1"
  export KMP_AFFINITY=granularity=fine,compact,1,0
  export KMP_BLOCKTIME=1
  # One OpenMP thread per physical core; pin runs to those cores only.
  cores=$(test_inductor_get_core_number)
  export OMP_NUM_THREADS="$cores"
  end_core=$((cores - 1))
  export TASKSET="taskset -c 0-$end_core"
}
681
685
682
686
test_inductor_torchbench_cpu_smoketest_perf (){
683
687
TEST_REPORTS_DIR=$( pwd) /test/test-reports
684
688
mkdir -p " $TEST_REPORTS_DIR "
685
689
686
690
test_inductor_set_cpu_affinity
687
- end_core=$(( $(test_inductor_get_core_number)- 1 ))
688
691
MODELS_SPEEDUP_TARGET=benchmarks/dynamo/expected_ci_speedup_inductor_torchbench_cpu.csv
689
692
690
693
grep -v ' ^ *#' < " $MODELS_SPEEDUP_TARGET " | while IFS=' ,' read -r -a model_cfg
@@ -701,11 +704,11 @@ test_inductor_torchbench_cpu_smoketest_perf(){
701
704
local output_name=" $TEST_REPORTS_DIR /inductor_inference_${model_cfg[0]} _${model_cfg[1]} _${model_cfg[2]} _${model_cfg[3]} _cpu_smoketest.csv"
702
705
703
706
if [[ ${model_cfg[3]} == " dynamic" ]]; then
704
- taskset -c 0- " $end_core " python benchmarks/dynamo/torchbench.py \
707
+ $TASKSET python benchmarks/dynamo/torchbench.py \
705
708
--inference --performance --" $data_type " -dcpu -n50 --only " $model_name " --dynamic-shapes \
706
709
--dynamic-batch-only --freezing --timeout 9000 --" $backend " --output " $output_name "
707
710
else
708
- taskset -c 0- " $end_core " python benchmarks/dynamo/torchbench.py \
711
+ $TASKSET python benchmarks/dynamo/torchbench.py \
709
712
--inference --performance --" $data_type " -dcpu -n50 --only " $model_name " \
710
713
--freezing --timeout 9000 --" $backend " --output " $output_name "
711
714
fi
0 commit comments