Applies int8 dynamic symmetric per-token activation and int8 per-channel weight
quantization + 2:4 sparsity to linear layers.
"""
warnings.warn("""int8_dynamic_activation_int8_semi_sparse_weight() will be deprecated at a later release. Please use the layout_type kwarg in int8_dynamic_activation_int8_weight instead.
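For reference, a minimal sketch of the replacement the warning points to, assuming a CUDA-resident `model`; it mirrors the `layout_type` usage shown in the sparsity README below rather than any code in this diff:

```py
from torchao.quantization.quant_api import quantize_, int8_dynamic_activation_int8_weight
from torchao.dtypes import SemiSparseLayoutType

model = model.cuda()
# Instead of the deprecated int8_dynamic_activation_int8_semi_sparse_weight(),
# pass the semi-sparse layout to the int8 dynamic-activation / int8-weight config.
quantize_(model, int8_dynamic_activation_int8_weight(layout_type=SemiSparseLayoutType()))
```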
torchao/sparsity/README.md (65 additions, 31 deletions)
Sparsity is the technique of removing parameters from a neural network in order to reduce its memory overhead or latency. By carefully choosing how the elements are pruned, one can achieve significant reduction in memory overhead and latency, while paying a reasonably low or no price in terms of model quality (accuracy / f1).
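As a concrete illustration of what pruning means here (a plain-PyTorch sketch, not a torchao API; the helper name is made up), the snippet below applies a 2:4 mask to a linear layer's weight, keeping the two largest-magnitude values in every group of four:

```py
import torch

# Illustrative 2:4 mask: keep the 2 largest-magnitude entries in each contiguous
# group of 4 along the last dimension, zero out the rest.
def two_four_mask(weight: torch.Tensor) -> torch.Tensor:
    groups = weight.abs().reshape(-1, 4)
    keep = groups.topk(2, dim=-1).indices
    mask = torch.zeros_like(groups, dtype=torch.bool)
    mask.scatter_(-1, keep, True)
    return mask.reshape(weight.shape)

linear = torch.nn.Linear(128, 128)
with torch.no_grad():
    linear.weight.mul_(two_four_mask(linear.weight))  # 50% of the weights are now zero
```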
## Benchmarks

### segment-anything-fast

We applied 2:4 sparsity to accelerate segment-anything, as part of [segment-anything-fast](https://github.com/pytorch-labs/segment-anything-fast). We were able to provide a **1.16x (22.7 -> 26.5 img/s) speedup over our dense baseline, while maintaining 97.5% (0.581 -> 0.567) of the evaluation accuracy (mIOU)**.

Overall, we found that accelerating the MLP linear layers (`lin1`, `lin2`) provided the most speedup while mitigating accuracy loss.
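A rough sketch of how one might restrict sparsity to just those MLP projections; this assumes `sparsify_` accepts a module filter callback the way `quantize_` does, and the `filter_fn` name and `lin1`/`lin2` matching are illustrative rather than taken from the benchmark code:

```py
import torch
from torchao.sparsity.sparse_api import sparsify_, semi_sparse_weight

# Only 2:4-sparsify the MLP projections (lin1 / lin2); leave the other linears dense.
def mlp_linear_only(module: torch.nn.Module, fqn: str) -> bool:
    return isinstance(module, torch.nn.Linear) and ("lin1" in fqn or "lin2" in fqn)

model = model.cuda()
sparsify_(model, semi_sparse_weight(), filter_fn=mlp_linear_only)
```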
These benchmarks were run for sam ViT-h on an NVIDIA A100-80GB; to reproduce them, please follow these [instructions](/torchao/_models/sam/README.md).
### LLama3

On Meta LLama3, we observe a 25% tok/s increase (180 -> 226) compared to our existing int4-wo implementation when using the sparse marlin kernel @Diogo-V added.

| Model | Technique | Tokens/Second | Memory Bandwidth (GB/s) | Peak Memory (GB) | Model Size (GB) |

### BERT

We were able to accelerate BERT 1.23x on an A100 with a negligible accuracy drop on SQuAD. For more information about accelerating BERT with semi-structured sparsity, please see our [tutorial](https://pytorch.org/tutorials/advanced/semi_structured_sparse.html?highlight=beta).

| Metrics | fp16 | 2:4 sparse | delta / speedup |
| --- | --- | --- | --- |
| Exact Match (%) | 78.53 | 78.44 | -0.09 |
| F1 (%) | 86.93 | 86.49 | -0.44 |
| Time (bs=16) | 19.35 | 15.74 | 1.23x |
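As a sketch of how this might look through the same quantization API, assuming the sparse marlin kernel is exposed as a layout type analogous to `SemiSparseLayoutType` (the `MarlinSparseLayoutType` name here is an assumption, not confirmed by this diff):

```py
from torchao.quantization.quant_api import quantize_, int4_weight_only
from torchao.dtypes import MarlinSparseLayoutType  # assumed name for the sparse marlin layout

model = model.cuda()
# int4 weight-only quantization routed through the 2:4 sparse marlin kernel (assumed API).
quantize_(model, int4_weight_only(layout_type=MarlinSparseLayoutType()))
```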
We support composing int8 dynamic quantization with 2:4 sparsity. We fuse one of the scalar dequant multiplications into our cuSPARSELt sparse mm in order to remain performant.

```py
from torchao.quantization.quant_api import quantize_, int8_dynamic_activation_int8_weight
from torchao.sparsity.sparse_api import sparsify_, semi_sparse_weight
from torchao.dtypes import SemiSparseLayoutType

model = model.cuda()

# Compose int8 dynamic activation / int8 weight quantization with 2:4 sparsity by
# passing the semi-sparse layout to the quantization API.
quantize_(model, int8_dynamic_activation_int8_weight(layout_type=SemiSparseLayoutType()))

# Or apply 2:4 (semi-structured) sparsity on its own, without quantization:
# sparsify_(model, semi_sparse_weight())
```
### Block sparsity (prototype)
We offer prototype support for accelerating block sparsity with our triton kernels for bfloat16/float16 workloads.

```py
from torchao.sparsity.sparse_api import sparsify_
from torchao.sparsity.prototype.superblock.blocksparse import block_sparse_weight

model = model.cuda()
sparsify_(model, block_sparse_weight())
```
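As with the 2:4 path, the block-sparsified model can then be compiled and run as usual; a minimal usage sketch, assuming an existing `model` and `example_input`:

```py
import torch

# Optional: compile the sparsified model so graph-level optimizations are applied on top
# of the block-sparse kernels; bfloat16/float16 inputs match the supported workloads above.
model = torch.compile(model, mode="max-autotune")

with torch.no_grad():
    out = model(example_input)
```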
# Goal
We feel that the main problem current sparsity researchers / users face is fragmentation. Researchers rightfully aim to show end-to-end results, but this means a lot of time is spent figuring out how to integrate with PyTorch and implementation questions like:

- *When should I mask?*
- *When/how should I store the compressed representation?*
- *Do I want in-place or out-of-place mask updates?*
- *How can I call sparse matmul instead of dense?*

We feel that the above problems can be solved once by `torchao`, letting researchers focus on what really matters - pushing sparse kernel performance or designing more accurate pruning algorithms.

More concretely, we hope to provide tutorials and APIs for both sparse kernels (tensor subclassing) and pruning algorithms (torch.ao.pruning.Sparsifier) that users can extend. We aim to provide modular building blocks that can be used to accelerate not only inference but training as well, and that compose nicely with `torchao` quantization workflows. Specifically, we want to enable users to:

1. Train sparse models from scratch with hardware acceleration, with minimal accuracy loss.
2. Recover the accuracy loss of a pruned model with a custom pruning algorithm.
3. Accelerate masked/pruned models on sparsity-supported hardware to realize performance improvements.
## Design
Sparsity, like quantization, is an accuracy/performance trade-off, where we care not only about the speedup but also about the accuracy degradation of our architecture optimization technique.