
Commit 164435e

Merge branch 'pytorch:main' into quant/int4/wo/0

2 parents: 68eea61 + c5e7c18


42 files changed: +1952 / -453 lines

.github/workflows/1xH100_tests.yml

Lines changed: 1 addition & 1 deletion
@@ -39,7 +39,7 @@ jobs:
  gpu-arch-version: ${{ matrix.gpu-arch-version }}
  submodules: recursive
  script: |
-   conda create -n venv python=3.9 -y
+   conda create -n venv python=3.10 -y
    conda activate venv
    export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH
    python -m pip install --upgrade pip

.github/workflows/1xL4_tests.yml

Lines changed: 1 addition & 1 deletion
@@ -39,7 +39,7 @@ jobs:
  gpu-arch-version: ${{ matrix.gpu-arch-version }}
  submodules: recursive
  script: |
-   conda create -n venv python=3.9 -y
+   conda create -n venv python=3.10 -y
    conda activate venv
    export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH
    python -m pip install --upgrade pip

.github/workflows/4xH100_tests.yml

Lines changed: 1 addition & 1 deletion
@@ -37,7 +37,7 @@ jobs:
  gpu-arch-version: ${{ matrix.gpu-arch-version }}
  submodules: recursive
  script: |
-   conda create -n venv python=3.9 -y
+   conda create -n venv python=3.10 -y
    conda activate venv
    export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH
    python -m pip install --upgrade pip

.github/workflows/build_wheels_linux.yml

Lines changed: 1 addition & 1 deletion
@@ -32,7 +32,7 @@ jobs:
  with-rocm: enable
  with-xpu: enable
  # Note: if free-threaded python is required add py3.13t here
- python-versions: '["3.9"]'
+ python-versions: '["3.10"]'

  build:
    needs: generate-matrix
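
Across these workflow updates the CI and wheel-build Python floor moves from 3.9 to 3.10 (the regression-test workflows below make the same change). As a hedged illustration only, not part of this commit, a runtime guard matching such a floor might look like:

```python
import sys

# Hypothetical guard for a Python >= 3.10 floor; the project's actual
# packaging metadata (python_requires/classifiers) is not shown in this diff.
if sys.version_info < (3, 10):
    raise RuntimeError(
        "Python 3.10+ is required, found "
        f"{sys.version_info.major}.{sys.version_info.minor}"
    )
```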

.github/workflows/doc_build.yml

Lines changed: 1 addition & 0 deletions
@@ -43,6 +43,7 @@ jobs:
  - name: Install dependencies
    run: |
      python -m pip install torch
+     python -m pip install setuptools==78.1.1 --force-reinstall
      python -m pip install -e .
      pip install -r dev-requirements.txt
      python -m pip install -r docs/requirements.txt
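
The doc build now pins setuptools to 78.1.1 with a force-reinstall before installing the package. A small hypothetical sanity check (not part of the commit) that the pin took effect could be:

```python
from importlib.metadata import version

# Hypothetical check: confirm the pinned setuptools version is the one installed.
expected = "78.1.1"
installed = version("setuptools")
assert installed == expected, f"expected setuptools {expected}, found {installed}"
```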

.github/workflows/regression_test.yml

Lines changed: 2 additions & 2 deletions
@@ -45,7 +45,7 @@ jobs:
  gpu-arch-version: ${{ matrix.gpu-arch-version }}
  submodules: recursive
  script: |
-   conda create -n venv python=3.9 -y
+   conda create -n venv python=3.10 -y
    conda activate venv
    python -m pip install --upgrade pip
    pip install ${{ matrix.torch-spec }}

@@ -105,7 +105,7 @@ jobs:
  gpu-arch-version: ${{ matrix.gpu-arch-version }}
  submodules: recursive
  script: |
-   conda create -n venv python=3.9 -y
+   conda create -n venv python=3.10 -y
    conda activate venv
    echo "::group::Install newer objcopy that supports --set-section-alignment"
    dnf install -y gcc-toolset-10-binutils

.github/workflows/regression_test_rocm.yml

Lines changed: 4 additions & 4 deletions
@@ -22,10 +22,10 @@ jobs:
  include:
    - name: ROCM Nightly
      runs-on: linux.rocm.gpu.gfx942.2
-     torch-spec: '--pre torch --index-url https://download.pytorch.org/whl/nightly/rocm6.3'
+     torch-spec: '--pre torch --index-url https://download.pytorch.org/whl/nightly/rocm7.0'
      gpu-arch-type: "rocm"
-     gpu-arch-version: "6.3"
-     docker-image: pytorch/manylinux2_28-builder:rocm6.3
+     gpu-arch-version: "7.0"
+     docker-image: pytorch/manylinux2_28-builder:rocm7.0

  permissions:
    id-token: write

@@ -40,7 +40,7 @@ jobs:
  docker-image: ${{ matrix.docker-image }}
  submodules: recursive
  script: |
-   conda create -n venv python=3.9 -y
+   conda create -n venv python=3.10 -y
    conda activate venv
    python -m pip install --upgrade pip
    pip install ${{ matrix.torch-spec }}
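
The ROCm job now pulls nightly wheels from the rocm7.0 index and builds in the rocm7.0 manylinux image. A minimal, hypothetical post-install check (not part of the commit) that a ROCm build of torch is actually active:

```python
import torch

# torch.version.hip is a version string on ROCm builds and None on CUDA/CPU builds.
assert torch.version.hip is not None, "expected a ROCm (HIP) build of torch"
print("HIP version:", torch.version.hip)
print("GPU visible:", torch.cuda.is_available())
```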

README.md

Lines changed: 15 additions & 12 deletions
@@ -28,13 +28,13 @@
  - [May 25] QAT is now integrated into [Axolotl](https://github.com/axolotl-ai-cloud/axolotl) for fine-tuning ([docs](https://docs.axolotl.ai/docs/qat.html))!
  - [Apr 25] Float8 rowwise training yielded [1.34-1.43x training speedup](https://pytorch.org/blog/accelerating-large-scale-training-and-convergence-with-pytorch-float8-rowwise-on-crusoe-2k-h200s/) at 2k H100 GPU scale
  - [Apr 25] TorchAO is added as a [quantization backend to vLLM](https://docs.vllm.ai/en/latest/features/quantization/torchao.html) ([docs](https://docs.vllm.ai/en/latest/features/quantization/torchao.html))!
- - [Mar 25] Our [2:4 Sparsity paper](https://openreview.net/pdf?id=O5feVk7p6Y) was accepted to SLLM @ ICLR 2025!
- - [Jan 25] Our [integration with GemLite and SGLang](https://pytorch.org/blog/accelerating-llm-inference/) yielded 1.1-2x faster inference with int4 and float8 quantization across different batch sizes and tensor parallel sizes
- - [Jan 25] We added [1-8 bit ARM CPU kernels](https://pytorch.org/blog/hi-po-low-bit-operators/) for linear and embedding ops

  <details>
  <summary>Older news</summary>

+ - [Mar 25] Our [2:4 Sparsity paper](https://openreview.net/pdf?id=O5feVk7p6Y) was accepted to SLLM @ ICLR 2025!
+ - [Jan 25] Our [integration with GemLite and SGLang](https://pytorch.org/blog/accelerating-llm-inference/) yielded 1.1-2x faster inference with int4 and float8 quantization across different batch sizes and tensor parallel sizes
+ - [Jan 25] We added [1-8 bit ARM CPU kernels](https://pytorch.org/blog/hi-po-low-bit-operators/) for linear and embedding ops
  - [Nov 24] We achieved [1.43-1.51x faster pre-training](https://pytorch.org/blog/training-using-float8-fsdp2/) on Llama-3.1-70B and 405B using float8 training
  - [Oct 24] TorchAO is added as a quantization backend to HF Transformers!
  - [Sep 24] We officially launched TorchAO. Check out our blog [here](https://pytorch.org/blog/pytorch-native-architecture-optimization/)!

@@ -47,8 +47,7 @@

  ## 🌅 Overview

- TorchAO is a PyTorch-native model optimization framework leveraging quantization and sparsity to provide an end-to-end, training-to-serving workflow
- for AI models. TorchAO works out-of-the-box with `torch.compile()` and `FSDP2` across most HuggingFace PyTorch models. Key features include:
+ TorchAO is an easy to use quantization library for native PyTorch. TorchAO works out-of-the-box with `torch.compile()` and `FSDP2` across most HuggingFace PyTorch models. Key features include:
  * Float8 [training](torchao/float8/README.md) and [inference](https://docs.pytorch.org/ao/main/generated/torchao.quantization.Float8DynamicActivationFloat8WeightConfig.html) for speedups without compromising accuracy
  * [MX training and inference](torchao/prototype/mx_formats/README.md), provides MX tensor formats based on native PyTorch MX dtypes (prototype)
  * [Quantization-Aware Training (QAT)](torchao/quantization/qat/README.md) for mitigating quantization degradation

@@ -67,17 +66,17 @@ From the team that brought you the fast series:
  ## 🚀 Quick Start

  First, install TorchAO. We recommend installing the latest stable version:
- ```
+ ```bash
  pip install torchao
  ```

  Quantize your model weights to int4!
- ```
+ ```python
  from torchao.quantization import Int4WeightOnlyConfig, quantize_
  quantize_(model, Int4WeightOnlyConfig(group_size=32, version=1))
  ```
  Compared to a `torch.compiled` bf16 baseline, your quantized model should be significantly smaller and faster on a single A100 GPU:
- ```
+ ```bash
  int4 model size: 1.25 MB
  bfloat16 model size: 4.00 MB
  compression ratio: 3.2

@@ -86,13 +85,13 @@ bf16 mean time: 30.393 ms
  int4 mean time: 4.410 ms
  speedup: 6.9x
  ```
- For the full model setup and benchmark details, check out our [quick start guide](https://docs.pytorch.org/ao/stable/quick_start.html). Alternatively, try quantizing your favorite model using our [HuggingFace space](https://huggingface.co/spaces/pytorch/torchao-my-repo)!
+ See our [quick start guide](https://docs.pytorch.org/ao/stable/quick_start.html) for more details. Alternatively, try quantizing your favorite model using our [HuggingFace space](https://huggingface.co/spaces/pytorch/torchao-my-repo)!


  ## 🛠 Installation

  To install the latest stable version:
- ```
+ ```bash
  pip install torchao
  ```

@@ -196,7 +195,7 @@ quantize_(my_model, QATConfig(base_config, step="convert"))
  Users can also combine LoRA + QAT to speed up training by [1.89x](https://dev-discuss.pytorch.org/t/speeding-up-qat-by-1-89x-with-lora/2700) compared to vanilla QAT using this [fine-tuning recipe](https://github.com/pytorch/torchtune/blob/main/recipes/qat_lora_finetune_distributed.py).


- ### Float8
+ ### Quantized training

  [torchao.float8](torchao/float8) implements training recipes with the scaled float8 dtypes, as laid out in https://arxiv.org/abs/2209.05433. With ``torch.compile`` on, current results show throughput speedups of up to **1.5x on up to 512 GPU / 405B parameter count scale** ([details](https://pytorch.org/blog/training-using-float8-fsdp2/)):

@@ -211,6 +210,8 @@ Our float8 training is integrated into [TorchTitan's pre-training flows](https:/
  * [Efficient Pre-training of Llama 3-like model architectures using torchtitan on Amazon SageMaker](https://aws.amazon.com/blogs/machine-learning/efficient-pre-training-of-llama-3-like-model-architectures-using-torchtitan-on-amazon-sagemaker/)
  * [Float8 in PyTorch](https://dev-discuss.pytorch.org/t/float8-in-pytorch-1-x/1815)

+ <details>
+ <summary>Other features (sparse training, memory efficient optimizers)</summary>

  ### Sparse Training

@@ -242,6 +243,8 @@ optim = CPUOffloadOptimizer(model.parameters(), torch.optim.AdamW, fused=True)
  optim.load_state_dict(ckpt["optim"])
  ```

+ </details>
+
  <!--
  ## For Developers

@@ -258,7 +261,7 @@ Our framework makes it straightforward to add tensor parallel support to your cu

  We've added support for authoring and releasing [custom ops](./torchao/csrc/) that do not graph break with `torch.compile()`. We have a few examples you can follow

- 1. [fp6](torchao/dtypes/floatx/README.md) for 2x faster inference over fp16 with an easy to use API `quantize_(model, fpx_weight_only(3, 2))`
+ 1. [fp6](torchao/dtypes/floatx/README.md) for 2x faster inference over fp16 with an easy to use API `quantize_(model, FPXWeightOnlyConfig(3, 2))`
  2. [2:4 Sparse Marlin GEMM](https://github.com/pytorch/ao/pull/733) 2x speedups for FP16xINT4 kernels even at batch sizes up to 256
  3. [int4 tinygemm unpacker](https://github.com/pytorch/ao/pull/415) which makes it easier to switch quantized backends for inference
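
The quick start now pins the snippet to `Int4WeightOnlyConfig(group_size=32, version=1)`. A minimal end-to-end sketch of that call on a hypothetical toy model (the README's benchmarked model and timings are not reproduced here; a CUDA GPU and a bf16 model are assumed):

```python
import torch
from torchao.quantization import Int4WeightOnlyConfig, quantize_

# Hypothetical toy model; the quick start applies the same call to any nn.Module.
model = torch.nn.Sequential(
    torch.nn.Linear(1024, 1024, bias=False),
).to(dtype=torch.bfloat16, device="cuda")

# Quantize weights to int4, as in the updated README snippet.
quantize_(model, Int4WeightOnlyConfig(group_size=32, version=1))

# The quantized model is used like any other module; compile for best performance.
model = torch.compile(model)
out = model(torch.randn(1, 1024, dtype=torch.bfloat16, device="cuda"))
```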

benchmarks/float8/float8_roofline.py

Lines changed: 7 additions & 1 deletion
@@ -245,8 +245,12 @@ def run(
  bf16_gemm_time_sympy = get_gemm_time_sympy(
      M, K, N, torch.bfloat16, None, None, None
  )
+ lowp_input_dtype = torch.float8_e4m3fn
+ if mx_recipe_name == "mxfp4_cutlass":
+     lowp_input_dtype = torch.float4_e2m1fn_x2
+
  fp8_gemm_time_sympy = get_gemm_time_sympy(
-     M, K, N, torch.float8_e4m3fn, float8_recipe_name, mx_recipe_name, None
+     M, K, N, lowp_input_dtype, float8_recipe_name, mx_recipe_name, None
  )
  print("bf16_gemm_time_sympy", bf16_gemm_time_sympy)
  print("fp8_gemm_time_sympy", fp8_gemm_time_sympy)

@@ -304,6 +308,8 @@ def run(
  rb_fp8_gemm_ratio = -1

  if do_benchmarks:
+     assert mx_recipe_name != "mxfp4_cutlass", "unsupported"
+
      # TODO(future): make the bf16 gemm times exactly match the e2e
      # benchmarks, there is a slight deviation, probably related to gemm
      # operand memory formats/transpositions below not exactly matching
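
The roofline script now picks the low-precision GEMM input dtype from the MX recipe name instead of hard-coding float8 e4m3. A small sketch of that selection in isolation (the helper name is hypothetical; in the commit the logic is inline in `run()`):

```python
import torch

def select_lowp_input_dtype(mx_recipe_name):
    # Default to float8 e4m3; switch to the packed fp4 dtype for the mxfp4
    # cutlass recipe, mirroring the new branch in benchmarks/float8/float8_roofline.py.
    if mx_recipe_name == "mxfp4_cutlass":
        return torch.float4_e2m1fn_x2
    return torch.float8_e4m3fn

print(select_lowp_input_dtype(None))             # torch.float8_e4m3fn
print(select_lowp_input_dtype("mxfp4_cutlass"))  # torch.float4_e2m1fn_x2
```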

benchmarks/float8/training/llama4.sh renamed to benchmarks/float8/training/bench.sh

Lines changed: 12 additions & 9 deletions
@@ -7,17 +7,20 @@
  # This script can be used to launch a torchtitan float8 training run
  # with the given parameters,

- # script arguments
- LOCAL_BATCH_SIZE=${LOCAL_BATCH_SIZE:-1}
- STEPS=${STEPS:-100}
-
  # temporary log file which is deleted after performance data is parsed out and metrics are calculated.
- LOG_FILE="/tmp/float8_training_log.txt"
+ LOG_FILE="/tmp/torchtitan_logs.txt"

- # validate user has specified torchtitan root directory
+ # validate user has specified required args
  if [ -z "${TORCHTITAN_ROOT}" ]; then
-     echo "Error: TORCHTITAN environment variable is not set. Please set it before running this script."
-     echo "Usage: TORCHTITAN_ROOT=<directory> ./torchtitan_llama4.sh"
+     echo "Error: TORCHTITAN_ROOT environment variable is not set. Please set it before running this script."
+     echo "Usage: TORCHTITAN_ROOT=<directory> CONFIG_FILE=<model toml> ./moe.sh"
+     echo " * EXTRA_ARGS: additional arguments to pass to the torchtitan training script."
+     exit 1
+ fi
+
+ if [ -z "${CONFIG_FILE}" ]; then
+     echo "Error: CONFIG_FILE environment variable is not set. Please set it before running this script."
+     echo "Usage: TORCHTITAN_ROOT=<directory> CONFIG_FILE=<model toml> ./moe.sh"
      echo " * EXTRA_ARGS: additional arguments to pass to the torchtitan training script."
      exit 1
  fi

@@ -29,7 +32,7 @@ original_dir=$(pwd)
  cd ${TORCHTITAN_ROOT}

  # run the command with the specified arguments
- CONFIG_FILE="./torchtitan/experiments/llama4/train_configs/debug_model.toml" ${TORCHTITAN_ROOT}/run_train.sh ${EXTRA_ARGS} 2>&1 | tee ${LOG_FILE}
+ ${TORCHTITAN_ROOT}/run_train.sh ${EXTRA_ARGS} 2>&1 | tee ${LOG_FILE}

  # return to original working directory
  cd $original_dir
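
After the rename, the benchmark script requires both TORCHTITAN_ROOT and CONFIG_FILE in the environment (the hard-coded llama4 debug config is gone). A hypothetical Python wrapper (not part of the commit; paths are placeholders) showing how it expects to be invoked:

```python
import os
import subprocess

# Placeholder paths; point at a real torchtitan checkout and model .toml config.
env = dict(os.environ)
env["TORCHTITAN_ROOT"] = "/path/to/torchtitan"
env["CONFIG_FILE"] = "/path/to/torchtitan/train_configs/model.toml"
# EXTRA_ARGS (optional) is forwarded to the torchtitan training script.

subprocess.run(
    ["bash", "benchmarks/float8/training/bench.sh"],
    env=env,
    check=True,
)
```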
