Skip to content

Commit ba67835

Browse files
authored
Update attention layer (mlc-ai#1153)
Existing dlight optimization only works for NT matmul, but not NN. As a result, the new `nn.Module`-based implementation, which uses NN matmul, fails compilation at HEAD for now. This PR fixes this issue by tweaking `k` to the preferred layout. The following commands now work with the new compilation pipeline:

```bash
python -m mlc_chat.cli.compile --config llama2_7b --quantization q4f16_1 -o /tmp/1.so
python -m mlc_chat.cli.compile --config llama2_13b --quantization q4f16_1 -o /tmp/1.so
python -m mlc_chat.cli.compile --config llama2_70b --quantization q4f16_1 -o /tmp/1.so
```

Note that the quantization algorithm per se, `q4f16_1`, has not been implemented yet, meaning this code path is not yet ready for use so far.
1 parent 1a79a53 commit ba67835

File tree

2 files changed

+10
-10
lines changed

2 files changed

+10
-10
lines changed

python/mlc_chat/compiler/model/llama_model.py

Lines changed: 8 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -95,14 +95,16 @@ def forward( # pylint: disable=too-many-locals
9595

9696
self.k_cache.append(op.squeeze(k, axis=0))
9797
self.v_cache.append(op.squeeze(v, axis=0))
98-
k = op.reshape(self.k_cache.view(total_seq_len), (t, b, h_kv, d))
99-
v = op.reshape(self.v_cache.view(total_seq_len), (t, b, h_kv, d))
98+
k = op.reshape(self.k_cache.view(total_seq_len), (b, t, h_kv, d))
99+
v = op.reshape(self.v_cache.view(total_seq_len), (b, t, h_kv, d))
100100
if h_kv != h_q:
101101
k = k.repeat(h_q // h_kv, axis=2)
102102
v = v.repeat(h_q // h_kv, axis=2)
103-
attn_weights = op.matmul( # [b, h, s, t]
104-
q.permute_dims([0, 2, 1, 3]), # [b, h, s, d]
105-
k.permute_dims([1, 2, 3, 0]), # [b, h, d, t]
103+
q = q.permute_dims([0, 2, 1, 3]) # [b, h, s, d]
104+
k = k.permute_dims([0, 2, 1, 3]) # [b, h, t, d]
105+
v = v.permute_dims([0, 2, 1, 3]) # [b, h, t, d]
106+
attn_weights = op.matmul(
107+
q, k.permute_dims([0, 1, 3, 2]) # [b, h, s, d] x [b, h, d, t] = [b, h, s, t]
106108
) / math.sqrt(d)
107109
dtype = attn_weights.dtype
108110
attn_weights = attn_weights.maximum(tir.min_value(dtype)).minimum(attention_mask)
@@ -111,10 +113,7 @@ def forward( # pylint: disable=too-many-locals
111113
else:
112114
attn_weights = op.softmax(attn_weights.astype("float32"), axis=-1).astype(dtype)
113115
return self.o_proj(
114-
op.matmul( # [b, h, s, d]
115-
attn_weights, # [b, h, s, t]
116-
v.permute_dims([1, 2, 0, 3]), # [b, h, t, d]
117-
)
116+
op.matmul(attn_weights, v) # [b, h, s, t] x [b, h, t, d] = [b, h, s, d]
118117
.permute_dims([0, 2, 1, 3]) # [b, s, h, d]
119118
.reshape((b, s, h_q * d))
120119
)

python/mlc_chat/support/auto_config.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,12 +35,13 @@ def detect_config(config: Union[str, Path]) -> Path:
3535
)
3636

3737
if isinstance(config, str) and config in MODEL_PRESETS:
38+
logger.info("%s preset model: %s", FOUND, config)
3839
content = MODEL_PRESETS[config]
3940
temp_file = tempfile.NamedTemporaryFile( # pylint: disable=consider-using-with
4041
suffix=".json",
4142
delete=False,
4243
)
43-
logger.info("%s preset model configuration: %s", FOUND, temp_file.name)
44+
logger.info("Dumping config to: %s", temp_file.name)
4445
config_path = Path(temp_file.name)
4546
with config_path.open("w", encoding="utf-8") as config_file:
4647
json.dump(content, config_file, indent=2)

0 commit comments

Comments (0)