
Commit 9a49e5c

Update base for Update on "[subclasses] Fix nested subclasses flattened tensors ordering"
get_plain_tensors() should return the leaves in depth-first order. The bug was that plain tensors (leaves) at a given level were returned before the flattened tensors of sibling subclasses, even when those subclasses come first in the "flatten" list. Original issue from AO: pytorch/ao#515. Test: TBD; needs an asymmetric subclass mixing dense tensors and subclasses. [ghstack-poisoned]
2 parents d039b14 + a94e507 commit 9a49e5c
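
To make the intended ordering concrete, below is a minimal sketch of the depth-first walk this fix enforces. It is not the actual PyTorch implementation: the helper name get_plain_tensors_sketch and the hasattr-based leaf test are illustrative assumptions; only the __tensor_flatten__() protocol (returning a list of inner-tensor attribute names plus a context object) is taken from PyTorch's traceable tensor subclass convention.

import torch

# Sketch only: collect dense leaf tensors of a (possibly nested) subclass
# in depth-first order, honoring the attribute order declared by
# __tensor_flatten__().
def get_plain_tensors_sketch(t, out=None):
    if out is None:
        out = []
    if not hasattr(t, "__tensor_flatten__"):
        # A plain (dense) tensor is a leaf; it is emitted exactly where
        # the walk reaches it, never hoisted ahead of earlier siblings.
        out.append(t)
        return out
    attrs, _ctx = t.__tensor_flatten__()
    for name in attrs:
        # Recurse immediately: a subclass listed before a plain tensor in
        # the "flatten" list contributes all of its leaves first. The bug
        # fixed here emitted same-level plain tensors ahead of them.
        get_plain_tensors_sketch(getattr(t, name), out)
    return out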

File tree

312 files changed: +5745, -14707 lines


.ci/pytorch/test.sh

Lines changed: 20 additions & 17 deletions
@@ -317,6 +317,7 @@ test_inductor_distributed() {
   python test/run_test.py -i distributed/test_c10d_functional_native.py --verbose
   python test/run_test.py -i distributed/_tensor/test_dtensor_compile.py --verbose
   python test/run_test.py -i distributed/tensor/parallel/test_fsdp_2d_parallel.py --verbose
+  python test/run_test.py -i distributed/tensor/parallel/test_micro_pipeline_tp.py --verbose
   python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_comm.py --verbose
   python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_training.py -k test_train_parity_multi_group --verbose
   python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_training.py -k test_train_parity_with_activation_checkpointing --verbose
@@ -429,16 +430,13 @@ test_perf_for_dashboard() {
   local targets=(accuracy performance)

   local device=cuda
-  local taskset=""
   if [[ "${TEST_CONFIG}" == *cpu* ]]; then
     if [[ "${TEST_CONFIG}" == *cpu_x86* ]]; then
       device=cpu_x86
     elif [[ "${TEST_CONFIG}" == *cpu_aarch64* ]]; then
       device=cpu_aarch64
     fi
     test_inductor_set_cpu_affinity
-    end_core=$(( $(test_inductor_get_core_number)-1 ))
-    taskset="taskset -c 0-$end_core"
   elif [[ "${TEST_CONFIG}" == *cuda_a10g* ]]; then
     device=cuda_a10g
   fi
@@ -458,51 +456,51 @@ test_perf_for_dashboard() {
   fi

   if [[ "$DASHBOARD_TAG" == *default-true* ]]; then
-    $taskset python "benchmarks/dynamo/$suite.py" \
+    $TASKSET python "benchmarks/dynamo/$suite.py" \
       "${target_flag[@]}" --"$mode" --"$dtype" --backend "$backend" --disable-cudagraphs "$@" \
       --output "$TEST_REPORTS_DIR/${backend}_no_cudagraphs_${suite}_${dtype}_${mode}_${device}_${target}.csv"
   fi
   if [[ "$DASHBOARD_TAG" == *cudagraphs-true* ]]; then
-    $taskset python "benchmarks/dynamo/$suite.py" \
+    $TASKSET python "benchmarks/dynamo/$suite.py" \
       "${target_flag[@]}" --"$mode" --"$dtype" --backend "$backend" "$@" \
       --output "$TEST_REPORTS_DIR/${backend}_with_cudagraphs_${suite}_${dtype}_${mode}_${device}_${target}.csv"
   fi
   if [[ "$DASHBOARD_TAG" == *dynamic-true* ]]; then
-    $taskset python "benchmarks/dynamo/$suite.py" \
+    $TASKSET python "benchmarks/dynamo/$suite.py" \
       "${target_flag[@]}" --"$mode" --"$dtype" --backend "$backend" --dynamic-shapes \
       --dynamic-batch-only "$@" \
       --output "$TEST_REPORTS_DIR/${backend}_dynamic_${suite}_${dtype}_${mode}_${device}_${target}.csv"
   fi
   if [[ "$DASHBOARD_TAG" == *cppwrapper-true* ]] && [[ "$mode" == "inference" ]]; then
-    TORCHINDUCTOR_CPP_WRAPPER=1 $taskset python "benchmarks/dynamo/$suite.py" \
+    TORCHINDUCTOR_CPP_WRAPPER=1 $TASKSET python "benchmarks/dynamo/$suite.py" \
       "${target_flag[@]}" --"$mode" --"$dtype" --backend "$backend" --disable-cudagraphs "$@" \
       --output "$TEST_REPORTS_DIR/${backend}_cpp_wrapper_${suite}_${dtype}_${mode}_${device}_${target}.csv"
   fi
   if [[ "$DASHBOARD_TAG" == *freezing_cudagraphs-true* ]] && [[ "$mode" == "inference" ]]; then
-    $taskset python "benchmarks/dynamo/$suite.py" \
+    $TASKSET python "benchmarks/dynamo/$suite.py" \
       "${target_flag[@]}" --"$mode" --"$dtype" --backend "$backend" "$@" --freezing \
       --output "$TEST_REPORTS_DIR/${backend}_with_cudagraphs_freezing_${suite}_${dtype}_${mode}_${device}_${target}.csv"
   fi
   if [[ "$DASHBOARD_TAG" == *freeze_autotune_cudagraphs-true* ]] && [[ "$mode" == "inference" ]]; then
-    TORCHINDUCTOR_MAX_AUTOTUNE=1 $taskset python "benchmarks/dynamo/$suite.py" \
+    TORCHINDUCTOR_MAX_AUTOTUNE=1 $TASKSET python "benchmarks/dynamo/$suite.py" \
       "${target_flag[@]}" --"$mode" --"$dtype" --backend "$backend" "$@" --freezing \
       --output "$TEST_REPORTS_DIR/${backend}_with_cudagraphs_freezing_autotune_${suite}_${dtype}_${mode}_${device}_${target}.csv"
   fi
   if [[ "$DASHBOARD_TAG" == *aotinductor-true* ]] && [[ "$mode" == "inference" ]]; then
-    TORCHINDUCTOR_ABI_COMPATIBLE=1 $taskset python "benchmarks/dynamo/$suite.py" \
+    TORCHINDUCTOR_ABI_COMPATIBLE=1 $TASKSET python "benchmarks/dynamo/$suite.py" \
       "${target_flag[@]}" --"$mode" --"$dtype" --export-aot-inductor --disable-cudagraphs "$@" \
       --output "$TEST_REPORTS_DIR/${backend}_aot_inductor_${suite}_${dtype}_${mode}_${device}_${target}.csv"
   fi
   if [[ "$DASHBOARD_TAG" == *maxautotune-true* ]]; then
-    TORCHINDUCTOR_MAX_AUTOTUNE=1 $taskset python "benchmarks/dynamo/$suite.py" \
+    TORCHINDUCTOR_MAX_AUTOTUNE=1 $TASKSET python "benchmarks/dynamo/$suite.py" \
       "${target_flag[@]}" --"$mode" --"$dtype" --backend "$backend" "$@" \
       --output "$TEST_REPORTS_DIR/${backend}_max_autotune_${suite}_${dtype}_${mode}_${device}_${target}.csv"
   fi
   if [[ "$DASHBOARD_TAG" == *cudagraphs_low_precision-true* ]] && [[ "$mode" == "inference" ]]; then
     # TODO: This has a new dtype called quant and the benchmarks script needs to be updated to support this.
     # The tentative command is as follows. It doesn't work now, but it's ok because we only need mock data
     # to fill the dashboard.
-    $taskset python "benchmarks/dynamo/$suite.py" \
+    $TASKSET python "benchmarks/dynamo/$suite.py" \
       "${target_flag[@]}" --"$mode" --quant --backend "$backend" "$@" \
       --output "$TEST_REPORTS_DIR/${backend}_cudagraphs_low_precision_${suite}_quant_${mode}_${device}_${target}.csv" || true
     # Copy cudagraph results as mock data, easiest choice?
@@ -664,27 +662,32 @@ test_inductor_torchbench_smoketest_perf() {
 }

 test_inductor_get_core_number() {
-  echo $(($(lscpu | grep 'Socket(s):' | awk '{print $2}') * $(lscpu | grep 'Core(s) per socket:' | awk '{print $4}')))
+  if [[ "${TEST_CONFIG}" == *aarch64 ]]; then
+    echo "$(($(lscpu | grep 'Cluster(s):' | awk '{print $2}') * $(lscpu | grep 'Core(s) per cluster:' | awk '{print $4}')))"
+  else
+    echo "$(($(lscpu | grep 'Socket(s):' | awk '{print $2}') * $(lscpu | grep 'Core(s) per socket:' | awk '{print $4}')))"
+  fi
 }

 test_inductor_set_cpu_affinity(){
   #set jemalloc
-  JEMALLOC_LIB="/usr/lib/x86_64-linux-gnu/libjemalloc.so.2"
+  JEMALLOC_LIB="$(find /usr/lib -name libjemalloc.so.2)"
   IOMP_LIB="$(dirname "$(which python)")/../lib/libiomp5.so"
   export LD_PRELOAD="$JEMALLOC_LIB":"$IOMP_LIB":"$LD_PRELOAD"
   export MALLOC_CONF="oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:-1,muzzy_decay_ms:-1"
   export KMP_AFFINITY=granularity=fine,compact,1,0
   export KMP_BLOCKTIME=1
   cores=$(test_inductor_get_core_number)
   export OMP_NUM_THREADS=$cores
+  end_core=$((cores-1))
+  export TASKSET="taskset -c 0-$end_core"
 }

 test_inductor_torchbench_cpu_smoketest_perf(){
   TEST_REPORTS_DIR=$(pwd)/test/test-reports
   mkdir -p "$TEST_REPORTS_DIR"

   test_inductor_set_cpu_affinity
-  end_core=$(( $(test_inductor_get_core_number)-1 ))
   MODELS_SPEEDUP_TARGET=benchmarks/dynamo/expected_ci_speedup_inductor_torchbench_cpu.csv

   grep -v '^ *#' < "$MODELS_SPEEDUP_TARGET" | while IFS=',' read -r -a model_cfg
@@ -701,11 +704,11 @@ test_inductor_torchbench_cpu_smoketest_perf(){
     local output_name="$TEST_REPORTS_DIR/inductor_inference_${model_cfg[0]}_${model_cfg[1]}_${model_cfg[2]}_${model_cfg[3]}_cpu_smoketest.csv"

     if [[ ${model_cfg[3]} == "dynamic" ]]; then
-      taskset -c 0-"$end_core" python benchmarks/dynamo/torchbench.py \
+      $TASKSET python benchmarks/dynamo/torchbench.py \
         --inference --performance --"$data_type" -dcpu -n50 --only "$model_name" --dynamic-shapes \
         --dynamic-batch-only --freezing --timeout 9000 --"$backend" --output "$output_name"
     else
-      taskset -c 0-"$end_core" python benchmarks/dynamo/torchbench.py \
+      $TASKSET python benchmarks/dynamo/torchbench.py \
         --inference --performance --"$data_type" -dcpu -n50 --only "$model_name" \
         --freezing --timeout 9000 --"$backend" --output "$output_name"
     fi

.github/ci_commit_pins/audio.txt

Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@
-69b2a0adc2ec03ab99990d7e8be3d4510438c148
+b3f6f511f2a1082bd56b13a3f6794e7fc3ba4862

.github/lf-canary-scale-config.yml

Lines changed: 2 additions & 2 deletions
@@ -35,7 +35,7 @@ runner_types:
     disk_size: 200
     instance_type: m4.10xlarge
     is_ephemeral: false
-    max_available: 60
+    max_available: 450
     os: linux
   lf.c.linux.24xl.spr-metal:
     disk_size: 200
@@ -200,7 +200,7 @@ runner_types:
     disk_size: 200
     instance_type: m4.10xlarge
     is_ephemeral: false
-    max_available: 60
+    max_available: 450
     os: linux
     ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
   lf.c.amz2023.linux.24xl.spr-metal:

.github/lf-scale-config.yml

Lines changed: 2 additions & 2 deletions
@@ -35,7 +35,7 @@ runner_types:
    disk_size: 200
    instance_type: m4.10xlarge
    is_ephemeral: false
-    max_available: 60
+    max_available: 450
    os: linux
  lf.linux.24xl.spr-metal:
    disk_size: 200
@@ -200,7 +200,7 @@ runner_types:
    disk_size: 200
    instance_type: m4.10xlarge
    is_ephemeral: false
-    max_available: 60
+    max_available: 450
    os: linux
    ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
  lf.amz2023.linux.24xl.spr-metal:

.github/scripts/generate_binary_build_matrix.py

Lines changed: 1 addition & 1 deletion
@@ -215,7 +215,7 @@ def arch_type(arch_version: str) -> str:
     ("cpu", CXX11_ABI): f"pytorch/libtorch-cxx11-builder:cpu-{DEFAULT_TAG}",
 }

-FULL_PYTHON_VERSIONS = ["3.8", "3.9", "3.10", "3.11", "3.12"]
+FULL_PYTHON_VERSIONS = ["3.9", "3.10", "3.11", "3.12"]


 def translate_desired_cuda(gpu_arch_type: str, gpu_arch_version: str) -> str:

.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml

Lines changed: 0 additions & 108 deletions
Some generated files are not rendered by default.
