
Commit 9a49e5c

Update base for Update on "[subclasses] Fix nested subclasses flattened tensors ordering"
get_plain_tensors() should return the leaves in depth-first order. The bug was that plain tensors (leaves) at a given level were returned before the flattened tensors of sibling subclasses, even when those subclasses come first in the "flatten" list. Original issue from AO: pytorch/ao#515. Test: TBD; needs an asymmetric subclass mixing dense tensors and subclasses. [ghstack-poisoned]
2 parents d039b14 + a94e507 commit 9a49e5c
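
To make the intended ordering concrete, below is a minimal sketch of the depth-first walk this fix enforces. It is not the actual PyTorch implementation: the helper name get_plain_tensors_sketch and the hasattr-based leaf test are illustrative assumptions; only the __tensor_flatten__() protocol (returning a list of inner-tensor attribute names plus a context object) is taken from PyTorch's traceable tensor subclass convention.

import torch

# Sketch only: collect dense leaf tensors of a (possibly nested) subclass
# in depth-first order, honoring the attribute order declared by
# __tensor_flatten__().
def get_plain_tensors_sketch(t, out=None):
    if out is None:
        out = []
    if not hasattr(t, "__tensor_flatten__"):
        # A plain (dense) tensor is a leaf; it is emitted exactly where
        # the walk reaches it, never hoisted ahead of earlier siblings.
        out.append(t)
        return out
    attrs, _ctx = t.__tensor_flatten__()
    for name in attrs:
        # Recurse immediately: a subclass listed before a plain tensor in
        # the "flatten" list contributes all of its leaves first. The bug
        # fixed here emitted same-level plain tensors ahead of them.
        get_plain_tensors_sketch(getattr(t, name), out)
    return out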

File tree

312 files changed: +5745, -14707 lines


.ci/pytorch/test.sh

Lines changed: 20 additions & 17 deletions
@@ -317,6 +317,7 @@ test_inductor_distributed() {
   python test/run_test.py -i distributed/test_c10d_functional_native.py --verbose
   python test/run_test.py -i distributed/_tensor/test_dtensor_compile.py --verbose
   python test/run_test.py -i distributed/tensor/parallel/test_fsdp_2d_parallel.py --verbose
+  python test/run_test.py -i distributed/tensor/parallel/test_micro_pipeline_tp.py --verbose
   python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_comm.py --verbose
   python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_training.py -k test_train_parity_multi_group --verbose
   python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_training.py -k test_train_parity_with_activation_checkpointing --verbose
@@ -429,16 +430,13 @@ test_perf_for_dashboard() {
   local targets=(accuracy performance)

   local device=cuda
-  local taskset=""
   if [[ "${TEST_CONFIG}" == *cpu* ]]; then
     if [[ "${TEST_CONFIG}" == *cpu_x86* ]]; then
       device=cpu_x86
     elif [[ "${TEST_CONFIG}" == *cpu_aarch64* ]]; then
       device=cpu_aarch64
     fi
     test_inductor_set_cpu_affinity
-    end_core=$(( $(test_inductor_get_core_number)-1 ))
-    taskset="taskset -c 0-$end_core"
   elif [[ "${TEST_CONFIG}" == *cuda_a10g* ]]; then
     device=cuda_a10g
   fi
@@ -458,51 +456,51 @@ test_perf_for_dashboard() {
   fi

   if [[ "$DASHBOARD_TAG" == *default-true* ]]; then
-    $taskset python "benchmarks/dynamo/$suite.py" \
+    $TASKSET python "benchmarks/dynamo/$suite.py" \
       "${target_flag[@]}" --"$mode" --"$dtype" --backend "$backend" --disable-cudagraphs "$@" \
       --output "$TEST_REPORTS_DIR/${backend}_no_cudagraphs_${suite}_${dtype}_${mode}_${device}_${target}.csv"
   fi
   if [[ "$DASHBOARD_TAG" == *cudagraphs-true* ]]; then
-    $taskset python "benchmarks/dynamo/$suite.py" \
+    $TASKSET python "benchmarks/dynamo/$suite.py" \
       "${target_flag[@]}" --"$mode" --"$dtype" --backend "$backend" "$@" \
       --output "$TEST_REPORTS_DIR/${backend}_with_cudagraphs_${suite}_${dtype}_${mode}_${device}_${target}.csv"
   fi
   if [[ "$DASHBOARD_TAG" == *dynamic-true* ]]; then
-    $taskset python "benchmarks/dynamo/$suite.py" \
+    $TASKSET python "benchmarks/dynamo/$suite.py" \
       "${target_flag[@]}" --"$mode" --"$dtype" --backend "$backend" --dynamic-shapes \
       --dynamic-batch-only "$@" \
       --output "$TEST_REPORTS_DIR/${backend}_dynamic_${suite}_${dtype}_${mode}_${device}_${target}.csv"
   fi
   if [[ "$DASHBOARD_TAG" == *cppwrapper-true* ]] && [[ "$mode" == "inference" ]]; then
-    TORCHINDUCTOR_CPP_WRAPPER=1 $taskset python "benchmarks/dynamo/$suite.py" \
+    TORCHINDUCTOR_CPP_WRAPPER=1 $TASKSET python "benchmarks/dynamo/$suite.py" \
       "${target_flag[@]}" --"$mode" --"$dtype" --backend "$backend" --disable-cudagraphs "$@" \
       --output "$TEST_REPORTS_DIR/${backend}_cpp_wrapper_${suite}_${dtype}_${mode}_${device}_${target}.csv"
   fi
   if [[ "$DASHBOARD_TAG" == *freezing_cudagraphs-true* ]] && [[ "$mode" == "inference" ]]; then
-    $taskset python "benchmarks/dynamo/$suite.py" \
+    $TASKSET python "benchmarks/dynamo/$suite.py" \
       "${target_flag[@]}" --"$mode" --"$dtype" --backend "$backend" "$@" --freezing \
       --output "$TEST_REPORTS_DIR/${backend}_with_cudagraphs_freezing_${suite}_${dtype}_${mode}_${device}_${target}.csv"
   fi
   if [[ "$DASHBOARD_TAG" == *freeze_autotune_cudagraphs-true* ]] && [[ "$mode" == "inference" ]]; then
-    TORCHINDUCTOR_MAX_AUTOTUNE=1 $taskset python "benchmarks/dynamo/$suite.py" \
+    TORCHINDUCTOR_MAX_AUTOTUNE=1 $TASKSET python "benchmarks/dynamo/$suite.py" \
       "${target_flag[@]}" --"$mode" --"$dtype" --backend "$backend" "$@" --freezing \
       --output "$TEST_REPORTS_DIR/${backend}_with_cudagraphs_freezing_autotune_${suite}_${dtype}_${mode}_${device}_${target}.csv"
   fi
   if [[ "$DASHBOARD_TAG" == *aotinductor-true* ]] && [[ "$mode" == "inference" ]]; then
-    TORCHINDUCTOR_ABI_COMPATIBLE=1 $taskset python "benchmarks/dynamo/$suite.py" \
+    TORCHINDUCTOR_ABI_COMPATIBLE=1 $TASKSET python "benchmarks/dynamo/$suite.py" \
       "${target_flag[@]}" --"$mode" --"$dtype" --export-aot-inductor --disable-cudagraphs "$@" \
       --output "$TEST_REPORTS_DIR/${backend}_aot_inductor_${suite}_${dtype}_${mode}_${device}_${target}.csv"
   fi
   if [[ "$DASHBOARD_TAG" == *maxautotune-true* ]]; then
-    TORCHINDUCTOR_MAX_AUTOTUNE=1 $taskset python "benchmarks/dynamo/$suite.py" \
+    TORCHINDUCTOR_MAX_AUTOTUNE=1 $TASKSET python "benchmarks/dynamo/$suite.py" \
       "${target_flag[@]}" --"$mode" --"$dtype" --backend "$backend" "$@" \
       --output "$TEST_REPORTS_DIR/${backend}_max_autotune_${suite}_${dtype}_${mode}_${device}_${target}.csv"
   fi
   if [[ "$DASHBOARD_TAG" == *cudagraphs_low_precision-true* ]] && [[ "$mode" == "inference" ]]; then
     # TODO: This has a new dtype called quant and the benchmarks script needs to be updated to support this.
     # The tentative command is as follows. It doesn't work now, but it's ok because we only need mock data
     # to fill the dashboard.
-    $taskset python "benchmarks/dynamo/$suite.py" \
+    $TASKSET python "benchmarks/dynamo/$suite.py" \
       "${target_flag[@]}" --"$mode" --quant --backend "$backend" "$@" \
       --output "$TEST_REPORTS_DIR/${backend}_cudagraphs_low_precision_${suite}_quant_${mode}_${device}_${target}.csv" || true
     # Copy cudagraph results as mock data, easiest choice?
@@ -664,27 +662,32 @@ test_inductor_torchbench_smoketest_perf() {
 }

 test_inductor_get_core_number() {
-  echo $(($(lscpu | grep 'Socket(s):' | awk '{print $2}') * $(lscpu | grep 'Core(s) per socket:' | awk '{print $4}')))
+  if [[ "${TEST_CONFIG}" == *aarch64 ]]; then
+    echo "$(($(lscpu | grep 'Cluster(s):' | awk '{print $2}') * $(lscpu | grep 'Core(s) per cluster:' | awk '{print $4}')))"
+  else
+    echo "$(($(lscpu | grep 'Socket(s):' | awk '{print $2}') * $(lscpu | grep 'Core(s) per socket:' | awk '{print $4}')))"
+  fi
 }

 test_inductor_set_cpu_affinity(){
   #set jemalloc
-  JEMALLOC_LIB="/usr/lib/x86_64-linux-gnu/libjemalloc.so.2"
+  JEMALLOC_LIB="$(find /usr/lib -name libjemalloc.so.2)"
   IOMP_LIB="$(dirname "$(which python)")/../lib/libiomp5.so"
   export LD_PRELOAD="$JEMALLOC_LIB":"$IOMP_LIB":"$LD_PRELOAD"
   export MALLOC_CONF="oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:-1,muzzy_decay_ms:-1"
   export KMP_AFFINITY=granularity=fine,compact,1,0
   export KMP_BLOCKTIME=1
   cores=$(test_inductor_get_core_number)
   export OMP_NUM_THREADS=$cores
+  end_core=$((cores-1))
+  export TASKSET="taskset -c 0-$end_core"
 }

 test_inductor_torchbench_cpu_smoketest_perf(){
   TEST_REPORTS_DIR=$(pwd)/test/test-reports
   mkdir -p "$TEST_REPORTS_DIR"

   test_inductor_set_cpu_affinity
-  end_core=$(( $(test_inductor_get_core_number)-1 ))
   MODELS_SPEEDUP_TARGET=benchmarks/dynamo/expected_ci_speedup_inductor_torchbench_cpu.csv

   grep -v '^ *#' < "$MODELS_SPEEDUP_TARGET" | while IFS=',' read -r -a model_cfg
@@ -701,11 +704,11 @@ test_inductor_torchbench_cpu_smoketest_perf(){
     local output_name="$TEST_REPORTS_DIR/inductor_inference_${model_cfg[0]}_${model_cfg[1]}_${model_cfg[2]}_${model_cfg[3]}_cpu_smoketest.csv"

     if [[ ${model_cfg[3]} == "dynamic" ]]; then
-      taskset -c 0-"$end_core" python benchmarks/dynamo/torchbench.py \
+      $TASKSET python benchmarks/dynamo/torchbench.py \
         --inference --performance --"$data_type" -dcpu -n50 --only "$model_name" --dynamic-shapes \
         --dynamic-batch-only --freezing --timeout 9000 --"$backend" --output "$output_name"
     else
-      taskset -c 0-"$end_core" python benchmarks/dynamo/torchbench.py \
+      $TASKSET python benchmarks/dynamo/torchbench.py \
         --inference --performance --"$data_type" -dcpu -n50 --only "$model_name" \
         --freezing --timeout 9000 --"$backend" --output "$output_name"
     fi

.github/ci_commit_pins/audio.txt

Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@
-69b2a0adc2ec03ab99990d7e8be3d4510438c148
+b3f6f511f2a1082bd56b13a3f6794e7fc3ba4862

.github/lf-canary-scale-config.yml

Lines changed: 2 additions & 2 deletions
@@ -35,7 +35,7 @@ runner_types:
     disk_size: 200
     instance_type: m4.10xlarge
     is_ephemeral: false
-    max_available: 60
+    max_available: 450
     os: linux
   lf.c.linux.24xl.spr-metal:
     disk_size: 200
@@ -200,7 +200,7 @@ runner_types:
     disk_size: 200
     instance_type: m4.10xlarge
     is_ephemeral: false
-    max_available: 60
+    max_available: 450
     os: linux
     ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
   lf.c.amz2023.linux.24xl.spr-metal:

.github/lf-scale-config.yml

Lines changed: 2 additions & 2 deletions
@@ -35,7 +35,7 @@ runner_types:
    disk_size: 200
    instance_type: m4.10xlarge
    is_ephemeral: false
-    max_available: 60
+    max_available: 450
    os: linux
  lf.linux.24xl.spr-metal:
    disk_size: 200
@@ -200,7 +200,7 @@ runner_types:
    disk_size: 200
    instance_type: m4.10xlarge
    is_ephemeral: false
-    max_available: 60
+    max_available: 450
    os: linux
    ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
  lf.amz2023.linux.24xl.spr-metal:

.github/scripts/generate_binary_build_matrix.py

Lines changed: 1 addition & 1 deletion
@@ -215,7 +215,7 @@ def arch_type(arch_version: str) -> str:
     ("cpu", CXX11_ABI): f"pytorch/libtorch-cxx11-builder:cpu-{DEFAULT_TAG}",
 }

-FULL_PYTHON_VERSIONS = ["3.8", "3.9", "3.10", "3.11", "3.12"]
+FULL_PYTHON_VERSIONS = ["3.9", "3.10", "3.11", "3.12"]


 def translate_desired_cuda(gpu_arch_type: str, gpu_arch_version: str) -> str:

.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml

Lines changed: 0 additions & 108 deletions
Some generated files are not rendered by default.
