
Commit 85614a4

Merge remote-tracking branch 'origin/main' into wengshiy/register
2 parents: d6d63f0 + 18dbe87

152 files changed: +5848 / −4213 lines

.github/scripts/torchao_model_releases/README.md

Lines changed: 12 additions & 1 deletion
@@ -18,6 +18,8 @@ Examples:
 ./release.sh --model_id Qwen/Qwen3-8B --quants INT4 FP8
 ```

+Note: for the initial release, please include `--populate_model_card_template` to populate the model card template.
+
 ### AWQ-INT4
 [AWQ](https://arxiv.org/abs/2306.00978) is a technique to improve accuracy for weight-only quantization. It preserves "salient" weight channels that have a high impact on the output: each such weight channel is multiplied by a scale, and the corresponding activation is divided by the same scale. Since the activation is not quantized, this introduces no additional activation error, while the quantization error from the weight is reduced.

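To make the scale-and-inverse-scale idea above concrete, here is a minimal sketch of the algebraic identity AWQ relies on (illustration only, not the torchao implementation; shapes and the scale value are made up):

```python
import torch

# Scaling the "salient" weight input channels up and the matching activation
# channels down leaves the linear output unchanged, but the scaled weight
# quantizes with less error.
x = torch.randn(4, 8)           # activations, kept in high precision
W = torch.randn(16, 8)          # linear weight (out_features x in_features)
s = torch.full((8,), 2.0)       # per-input-channel scales

y_ref = x @ W.t()
y_awq = (x / s) @ (W * s).t()   # scale weight channels, inverse-scale activations
print(torch.allclose(y_ref, y_awq, atol=1e-5))  # True
```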
@@ -30,6 +32,15 @@ Examples:
 python quantize_and_upload.py --model_id Qwen/Qwen3-8B --quant AWQ-INT4 --push_to_hub --task bbh --calibration_limit 2
 ```

+### Update checkpoints for a different user_id (e.g. pytorch)
+Sometimes we may want to update the checkpoints for a different user_id without changing the model card. For this we can use `--push_to_user_id`, e.g.
+
+```
+sh release.sh --model_id microsoft/Phi-4-mini-instruct --quants FP8 --push_to_hub --push_to_user_id pytorch
+```
+
+This will update `pytorch/Phi-4-mini-instruct-FP8` without changing the model card.
+
 ## Eval
 After we run the release script for a model, the new models appear on the Hugging Face Hub page for the user, e.g. https://huggingface.co/torchao-testing. Each model has a model card filled in with template content, such as information about the model and eval instructions. There are a few things we still need to fill in: 1. peak memory usage, 2. latency when running the model with vLLM, and 3. quality measured with lm-eval.

@@ -78,7 +89,7 @@ After environment is setup, we can run eval:
 sh eval.sh --eval_type quality --model_ids Qwen/Qwen3-8B --tasks hellaswag,mmlu
 ```

-# ### Summarize results
+#### Summarize results
 After we have finished all evals for each model, we can summarize the results with:
 ```
 sh summarize_results.sh --model_ids Qwen/Qwen3-8B pytorch/Qwen3-8B-INT4
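The Eval section above asks for peak memory usage, vLLM latency, and lm-eval quality numbers to be filled into the model card. As a minimal sketch, peak memory could be measured along these lines (assumes a CUDA machine with transformers and torchao installed; `pytorch/Qwen3-8B-INT4` is just the example checkpoint id used above):

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "pytorch/Qwen3-8B-INT4"  # example quantized checkpoint
model = AutoModelForCausalLM.from_pretrained(
    model_id, device_map="cuda:0", torch_dtype=torch.bfloat16
)
tokenizer = AutoTokenizer.from_pretrained(model_id)

torch.cuda.reset_peak_memory_stats()
inputs = tokenizer("What is the capital of France?", return_tensors="pt").to("cuda:0")
model.generate(**inputs, max_new_tokens=64)
print(f"peak memory: {torch.cuda.max_memory_allocated() / 1e9:.2f} GB")
```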

.github/scripts/torchao_model_releases/eval.sh

Lines changed: 1 addition & 1 deletion
@@ -110,5 +110,5 @@ done

 # Run summarize_results.sh with MODEL_IDS if eval_type is "all"
 if [[ "$EVAL_TYPE" == "all" ]]; then
-    sh summarize_results.sh --model_id "${MODEL_ID_ARRAY[@]}"
+    sh summarize_results.sh --model_ids "${MODEL_ID_ARRAY[@]}"
 fi

.github/scripts/torchao_model_releases/eval_latency.sh

Lines changed: 1 addition & 1 deletion
@@ -75,7 +75,7 @@ for MODEL_ID in "${MODEL_ID_ARRAY[@]}"; do
     for BATCH_SIZE in "${BATCH_SIZE_ARRAY[@]}"; do
         OUTPUT_FILE="$ORIG_DIR/${SAFE_MODEL_ID}_latency_batch${BATCH_SIZE}_in${INPUT_LEN}_out${OUTPUT_LEN}.log"
         echo "Running latency eval for model $MODEL_ID with batch size $BATCH_SIZE with input length: $INPUT_LEN and output length: $OUTPUT_LEN"
-        VLLM_DISABLE_COMPILE_CACHE=1 python benchmarks/benchmark_latency.py --input-len $INPUT_LEN --output-len $OUTPUT_LEN --model $MODEL_ID --batch-size $BATCH_SIZE > "$OUTPUT_FILE" 2>&1
+        VLLM_DISABLE_COMPILE_CACHE=1 vllm bench latency --input-len $INPUT_LEN --output-len $OUTPUT_LEN --model $MODEL_ID --batch-size $BATCH_SIZE > "$OUTPUT_FILE" 2>&1
         echo "Latency eval result saved to $OUTPUT_FILE"
     done
     echo "======================== Eval Latency $MODEL_ID End ========================="

.github/scripts/torchao_model_releases/quantize_and_upload.py

Lines changed: 52 additions & 27 deletions
@@ -5,6 +5,7 @@
 # LICENSE file in the root directory of this source tree.

 import argparse
+from typing import List

 import torch
 from huggingface_hub import ModelCard, get_token, whoami
@@ -206,7 +207,7 @@ def _untie_weights_and_save_locally(model_id):

 _int4_quant_code = """
 from torchao.quantization import Int4WeightOnlyConfig
-quant_config = Int4WeightOnlyConfig(group_size=128, packing_format="tile_packed_to_4d", int4_choose_qparams_algorithm="hqq", version=2)
+quant_config = Int4WeightOnlyConfig(group_size=128, int4_packing_format="tile_packed_to_4d", int4_choose_qparams_algorithm="hqq")
 quantization_config = TorchAoConfig(quant_type=quant_config)
 quantized_model = AutoModelForCausalLM.from_pretrained(model_to_quantize, device_map="auto", torch_dtype=torch.bfloat16, quantization_config=quantization_config)
 tokenizer = AutoTokenizer.from_pretrained(model_id)
@@ -230,12 +231,10 @@ def _untie_weights_and_save_locally(model_id):
 embedding_config = IntxWeightOnlyConfig(
     weight_dtype=torch.int8,
     granularity=PerAxis(0),
-    version=2,
 )
 linear_config = Int8DynamicActivationIntxWeightConfig(
     weight_dtype=torch.int4,
     weight_granularity=PerGroup(32),
-    version=2,
 )
 quant_config = ModuleFqnToConfig({{"_default": linear_config, "model.embed_tokens": embedding_config}})
 quantization_config = TorchAoConfig(quant_type=quant_config, include_input_output_embeddings=True, modules_to_not_convert=[])
@@ -256,7 +255,7 @@ def _untie_weights_and_save_locally(model_id):
 )
 tokenizer = AutoTokenizer.from_pretrained(model_id)

-base_config = Int4WeightOnlyConfig(group_size=128, version=2)
+base_config = Int4WeightOnlyConfig(group_size=128)
 quant_config = AWQConfig(base_config, step="prepare")
 quantize_(
     model,
585584
Once ExecuTorch is [set-up](https://pytorch.org/executorch/main/getting-started.html), exporting and running the model on device is a breeze.
586585
587586
ExecuTorch's LLM export scripts require the checkpoint keys and parameters have certain names, which differ from those used in Hugging Face.
588-
So we first use a conversion script that converts the Hugging Face checkpoint key names to ones that ExecuTorch expects:
587+
So we first use a script that converts the Hugging Face checkpoint key names to ones that ExecuTorch expects:
588+
The following script does this for you.
589589
590590
[TODO: fix command below where necessary]
591591
```Shell
592592
python -m executorch.examples.models.qwen3.convert_weights $(hf download {quantized_model}) pytorch_model_converted.bin
593593
```
594594
595-
Once we have the checkpoint, we export it to ExecuTorch with the XNNPACK backend as follows.
596-
(ExecuTorch LLM export script requires config.json have certain key names. The correct config to use for the LLM export script is located at [TODO: fill in, e.g., examples/models/qwen3/config/4b_config.json] within the ExecuTorch repo.)
595+
Once we have the checkpoint, we export it to ExecuTorch with a max_seq_length/max_context_length of 1024 to the XNNPACK backend as follows.
596+
597+
[TODO: fix config path in note where necessary]
598+
(Note: ExecuTorch LLM export script requires config.json have certain key names. The correct config to use for the LLM export script is located at examples/models/qwen3/config/4b_config.json within the ExecuTorch repo.)
597599
598600
[TODO: fix command below where necessary]
599601
```Shell
600602
python -m executorch.examples.models.llama.export_llama \
601-
--model "qwen3_4b" \
602-
--checkpoint pytorch_model_converted.bin \
603-
--params examples/models/qwen3/config/4b_config.json \
604-
--output_name="model.pte" \
605-
-kv \
606-
--use_sdpa_with_kv_cache \
607-
-X \
608-
--xnnpack-extended-ops \
609-
--max_context_length 1024 \
610-
--max_seq_length 1024 \
611-
--dtype fp32 \
612-
--metadata '{{"get_bos_id":199999, "get_eos_ids":[200020,199999]}}'
603+
--model "qwen3_4b" \
604+
--checkpoint pytorch_model_converted.bin \
605+
--params examples/models/qwen3/config/4b_config.json \
606+
--output_name model.pte \
607+
-kv \
608+
--use_sdpa_with_kv_cache \
609+
-X \
610+
--xnnpack-extended-ops \
611+
--max_context_length 1024 \
612+
--max_seq_length 1024 \
613+
--dtype fp32 \
614+
--metadata '{"get_bos_id":199999, "get_eos_ids":[200020,199999]}'
613615
```
614616
615617
After that you can run the model in a mobile app (see [Running in a mobile app](#running-in-a-mobile-app)).
618+
619+
(We try to keep these instructions up-to-date, but if you find they do not work, check out our [CI test in ExecuTorch](https://github.com/pytorch/executorch/blob/main/.ci/scripts/test_torchao_huggingface_checkpoints.sh) for the latest source of truth, and let us know we need to update our model card.)
616620
"""
617621

618622

619623
def quantize_and_upload(
620-
model_id, quant, tasks, calibration_limit, max_seq_length, push_to_hub
624+
model_id: str,
625+
quant: str,
626+
tasks: List[str],
627+
calibration_limit: int,
628+
max_seq_length: int,
629+
push_to_hub: bool,
630+
push_to_user_id: str,
631+
populate_model_card_template: bool,
621632
):
622633
_int8_int4_linear_config = Int8DynamicActivationIntxWeightConfig(
623634
weight_dtype=torch.int4,
624635
weight_granularity=PerGroup(32),
625-
version=2,
626636
)
627637
_int8_int4_embedding_config = IntxWeightOnlyConfig(
628638
weight_dtype=torch.int8,
629639
granularity=PerAxis(0),
630-
version=2,
631640
)
632641
quant_to_config = {
633642
"FP8": Float8DynamicActivationFloat8WeightConfig(granularity=PerRow()),
634643
"INT4": Int4WeightOnlyConfig(
635644
group_size=128,
636-
packing_format="tile_packed_to_4d",
645+
int4_packing_format="tile_packed_to_4d",
637646
int4_choose_qparams_algorithm="hqq",
638-
version=2,
639647
),
640648
"INT8-INT4": ModuleFqnToConfig(
641649
{
@@ -669,7 +677,7 @@ def quantize_and_upload(
         )
         tokenizer = AutoTokenizer.from_pretrained(model_id)

-        base_config = Int4WeightOnlyConfig(group_size=128, version=2)
+        base_config = Int4WeightOnlyConfig(group_size=128)
         quant_config = AWQConfig(base_config, step="prepare")
         quantize_(
             model,
@@ -713,7 +721,9 @@ def quantize_and_upload(
     username = _get_username()

     MODEL_NAME = model_id.split("/")[-1]
-    save_to = f"{username}/{MODEL_NAME}-{quant}"
+
+    save_to_user_id = username if push_to_user_id is None else push_to_user_id
+    save_to = f"{save_to_user_id}/{MODEL_NAME}-{quant}"
     untied_model_path = 'f"{{MODEL_NAME}}-untied-weights"'
     is_mobile = quant == "INT8-INT4"
     quantized_model_id = save_to
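As a concrete illustration of the new `save_to` logic above (values taken from the README example in this commit; the username is hypothetical):

```python
model_id = "microsoft/Phi-4-mini-instruct"
quant = "FP8"
username = "some-user"        # whoever is authenticated with the Hub
push_to_user_id = "pytorch"   # value passed via --push_to_user_id

MODEL_NAME = model_id.split("/")[-1]
save_to_user_id = username if push_to_user_id is None else push_to_user_id
save_to = f"{save_to_user_id}/{MODEL_NAME}-{quant}"
print(save_to)  # pytorch/Phi-4-mini-instruct-FP8
```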
@@ -759,7 +769,8 @@ def quantize_and_upload(
     if push_to_hub:
         quantized_model.push_to_hub(quantized_model_id, safe_serialization=False)
         tokenizer.push_to_hub(quantized_model_id)
-        card.push_to_hub(quantized_model_id)
+        if populate_model_card_template:
+            card.push_to_hub(quantized_model_id)
     else:
         quantized_model.save_pretrained(quantized_model_id, safe_serialization=False)
         tokenizer.save_pretrained(quantized_model_id)
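For context, `card` is a `huggingface_hub.ModelCard` (imported at the top of the script); a rough sketch of the calls involved, simplified from what the script actually does with its template:

```python
from huggingface_hub import ModelCard

# Load an existing card, tweak it, and push it to a (hypothetical) repo.
card = ModelCard.load("Qwen/Qwen3-8B")
card.text += "\n\nQuantized with torchao."  # placeholder edit, not the real template
card.push_to_hub("pytorch/Qwen3-8B-INT4")   # only runs when --populate_model_card_template is set
```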
@@ -828,6 +839,18 @@ def quantize_and_upload(
         default=False,
         help="Flag to indicate whether push to huggingface hub or not",
     )
+    parser.add_argument(
+        "--push_to_user_id",
+        type=str,
+        default=None,
+        help="The user_id to use for pushing the quantized model, only used when --push_to_hub is set",
+    )
+    parser.add_argument(
+        "--populate_model_card_template",
+        action="store_true",
+        default=False,
+        help="Flag to indicate whether push model card to huggingface hub or not",
+    )
     args = parser.parse_args()
     quantize_and_upload(
         args.model_id,
@@ -836,4 +859,6 @@ def quantize_and_upload(
         args.calibration_limit,
         args.max_seq_length,
         args.push_to_hub,
+        args.push_to_user_id,
+        args.populate_model_card_template,
     )
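Pulling the changes in this file together, a sketch of an equivalent direct call with the expanded signature (the script normally builds these arguments via argparse; the values below are illustrative, not the script's defaults):

```python
quantize_and_upload(
    model_id="Qwen/Qwen3-8B",
    quant="FP8",
    tasks=["bbh"],
    calibration_limit=2,
    max_seq_length=2048,
    push_to_hub=False,
    push_to_user_id=None,
    populate_model_card_template=False,
)
```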

.github/scripts/torchao_model_releases/release.sh

Lines changed: 14 additions & 8 deletions
@@ -6,15 +6,13 @@

 #!/bin/bash

-# Example uses
-# release with default quant options (FP8, INT4, INT8-INT4)
-# ./release.sh --model_id Qwen/Qwen3-8B
-# release a custom set of quant options
-# ./release.sh --model_id Qwen/Qwen3-8B --quants INT4 FP8
+# see README.md for instructions

 # Default quantization options
 default_quants=("FP8" "INT4" "INT8-INT4")
 push_to_hub=""
+push_to_user_id=""
+populate_model_card_template=""
 # Parse arguments
 while [[ $# -gt 0 ]]; do
   case "$1" in
@@ -34,6 +32,14 @@ while [[ $# -gt 0 ]]; do
       push_to_hub="--push_to_hub"
       shift
       ;;
+    --push_to_user_id)
+      push_to_user_id=("--push_to_user_id $2")
+      shift 2
+      ;;
+    --populate_model_card_template)
+      populate_model_card_template="--populate_model_card_template"
+      shift
+      ;;
     *)
       echo "Unknown option: $1"
       exit 1
@@ -43,14 +49,14 @@ done
 # Use default quants if none specified
 if [[ -z "$model_id" ]]; then
   echo "Error: --model_id is required"
-  echo "Usage: $0 --model_id <model_id> [--quants <quant1> [quant2 ...]] [--push_to_hub]"
+  echo "Usage: $0 --model_id <model_id> [--quants <quant1> [quant2 ...]] [--push_to_hub] [--push_to_user_id <push_to_user_id>] [--populate_model_card_template]"
   exit 1
 fi
 if [[ ${#quants[@]} -eq 0 ]]; then
   quants=("${default_quants[@]}")
 fi
 # Run the python command for each quantization option
 for quant in "${quants[@]}"; do
-  echo "Running: python quantize_and_upload.py --model_id $model_id --quant $quant $push_to_hub"
-  python quantize_and_upload.py --model_id "$model_id" --quant "$quant" $push_to_hub
+  echo "Running: python quantize_and_upload.py --model_id $model_id --quant $quant $push_to_hub $push_to_user_id $populate_model_card_template"
+  python quantize_and_upload.py --model_id "$model_id" --quant "$quant" $push_to_hub $push_to_user_id $populate_model_card_template
 done

.github/workflows/torchao_experimental_test.yml renamed to .github/workflows/regression_test_aarch64.yml

Lines changed: 7 additions & 4 deletions
@@ -1,4 +1,4 @@
-name: Run TorchAO Experimental Tests
+name: Run Regression Tests (aarch64)

 on:
   push:
@@ -44,17 +44,20 @@ jobs:
         if: runner.os == 'Linux'
         run: |
           conda activate venv
+          pip install coremltools
           pip install torch==2.7.0 --index-url https://download.pytorch.org/whl/cpu --force-reinstall
           pip install -r dev-requirements.txt
           BUILD_TORCHAO_EXPERIMENTAL=1 TORCHAO_BUILD_CPU_AARCH64=1 TORCHAO_BUILD_KLEIDIAI=1 TORCHAO_ENABLE_ARM_NEON_DOT=1 TORCHAO_PARALLEL_BACKEND=OPENMP pip install .
       - name: Run python tests
         run: |
           conda activate venv
           pytest -s test/quantization/test_int8_dynamic_activation_intx_weight_config_v1.py
-          pytest -s torchao/experimental/tests/test_embedding_xbit_quantizer.py
-          pytest -s torchao/experimental/tests/test_quant_passes.py
-          pytest -s test/prototype/test_dynamic_activation_lut.py
           pytest -s test/quantization/quantize_/workflows/intx/test_intx_opaque_tensor.py
+          pytest -s test/prototype/test_embedding.py
+          pytest -s test/prototype/test_int8_lut_tensor.py
+          pytest -s test/prototype/test_tensor_conversion.py
+          pytest -s test/prototype/test_groupwise_lowbit_weight_lut_quantizer.py
+          pytest -s test/prototype/test_parq.py
       - name: torchao/csrc/cpu - build and run C++ tests
         if: runner.os == 'macOS'
         run: |

benchmarks/benchmark_aq.py

Lines changed: 6 additions & 6 deletions
@@ -10,10 +10,10 @@
 import torch

 from torchao.quantization.quant_api import (
+    Int4WeightOnlyConfig,
+    Int8DynamicActivationInt8WeightConfig,
+    Int8WeightOnlyConfig,
     _replace_with_custom_fn_if_matches_filter,
-    int4_weight_only,
-    int8_dynamic_activation_int8_weight,
-    int8_weight_only,
     quantize_,
 )
 from torchao.quantization.subclass import (
@@ -23,13 +23,13 @@


 def _int8wo_api(mod, **kwargs):
-    quantize_(mod, int8_weight_only(**kwargs), set_inductor_config=False)
+    quantize_(mod, Int8WeightOnlyConfig(**kwargs), set_inductor_config=False)


 def _int8da_int8w_api(mod, **kwargs):
     quantize_(
         mod,
-        int8_dynamic_activation_int8_weight(**kwargs),
+        Int8DynamicActivationInt8WeightConfig(**kwargs),
         set_inductor_config=False,
     )

@@ -39,7 +39,7 @@ def _int4wo_api(mod, **kwargs):
     if "groupsize" in kwargs_copy:
         kwargs_copy["group_size"] = kwargs_copy["groupsize"]
         del kwargs_copy["groupsize"]
-    quantize_(mod, int4_weight_only(**kwargs_copy), set_inductor_config=False)
+    quantize_(mod, Int4WeightOnlyConfig(**kwargs_copy), set_inductor_config=False)


 class ToyLinearModel(torch.nn.Module):
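The renames in this file swap the deprecated lower-case helper functions for the corresponding config classes. A minimal standalone sketch of the config-class API (toy module and settings, not taken from the benchmark):

```python
import torch
from torchao.quantization.quant_api import Int8WeightOnlyConfig, quantize_

# Quantize the weights of a toy model in place using the config-object API.
model = torch.nn.Sequential(torch.nn.Linear(1024, 1024)).to(torch.bfloat16).to("cuda")
quantize_(model, Int8WeightOnlyConfig(), set_inductor_config=False)
print(type(model[0].weight))  # a torchao quantized tensor subclass
```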

benchmarks/float8/training/llama3.sh

Lines changed: 1 addition & 1 deletion
@@ -53,7 +53,7 @@ cd ${TORCHTITAN_ROOT}
 echo "float8 args: ${FLOAT8_ARGS}"

 # run the command with the specified arguments
-CONFIG_FILE="./torchtitan/models/llama3/train_configs/llama3_8b.toml" ${TORCHTITAN_ROOT}/run_train.sh --training.steps=${STEPS} --training.local-batch-size=${LOCAL_BATCH_SIZE} --training.compile ${FLOAT8_ARGS} ${EXTRA_ARGS} 2>&1 | tee ${LOG_FILE}
+CONFIG_FILE="./torchtitan/models/llama3/train_configs/llama3_8b.toml" ${TORCHTITAN_ROOT}/run_train.sh --training.steps=${STEPS} --training.local-batch-size=${LOCAL_BATCH_SIZE} --compile.enable ${FLOAT8_ARGS} ${EXTRA_ARGS} 2>&1 | tee ${LOG_FILE}

 # return to original working directory
 cd $original_dir

benchmarks/microbenchmarks/utils.py

Lines changed: 1 addition & 3 deletions
@@ -260,7 +260,6 @@ def string_to_config(
             "int8_dynamic_activation_intx_weight requires using high_precision_dtype=torch.float32"
         )

-        from torchao.dtypes import PackedLinearInt8DynamicActivationIntxWeightLayout
         from torchao.quantization.granularity import PerAxis, PerGroup
         from torchao.quantization.quant_api import (
             Int8DynamicActivationIntxWeightConfig,
@@ -278,8 +277,7 @@ def string_to_config(
             weight_mapping_type=MappingType.ASYMMETRIC
             if is_asymmetric
             else MappingType.SYMMETRIC,
-            weight_scale_dtype=torch.bfloat16,
-            layout=PackedLinearInt8DynamicActivationIntxWeightLayout(),
+            intx_packing_format="opaque_torchao_auto",
         )
     elif "float8wo" in quantization:
         return Float8WeightOnlyConfig()

benchmarks/prototype/moe_training/benchmark_2d_3d_grouped_gemms.py renamed to benchmarks/prototype/moe_training/bench_2d_3d_grouped_gemm.py

Lines changed: 3 additions & 3 deletions
@@ -18,7 +18,7 @@
 from torchao.float8.config import ScalingGranularity
 from torchao.float8.float8_utils import tensor_to_scale, to_fp8_saturated
 from torchao.prototype.moe_training.kernels.mxfp8_blocked_scales import (
-    torch_to_blocked_per_group_2d,
+    torch_to_blocked_2d_M_groups,
     torch_to_blocked_per_group_3d,
 )
 from torchao.prototype.moe_training.utils import generate_jagged_offs
@@ -230,8 +230,8 @@ def bench_mxfp8_grouped_mm(A, B_t, offs, block_size=32) -> float:

     # Convert scales for each group to blocked format.
     Mg, K = A_fp8.shape
-    A_scales_blocked, starting_row_after_padding = torch_to_blocked_per_group_2d(
-        A_scales, offs, Mg, K
+    A_scales_blocked, starting_row_after_padding = torch_to_blocked_2d_M_groups(
+        A_scales, offs, K
     )
     B_scales_blocked = torch_to_blocked_per_group_3d(B_scales)
